diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 5e60f1f2b99fee..415c0fe9bef9ea 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -36,7 +36,7 @@ ENDIF() if(NOT DEFINED XPU_BASE_URL) SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220215") + SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220219") else() SET(XPU_BASE_URL "${XPU_BASE_URL}") endif() diff --git a/cmake/pten.cmake b/cmake/pten.cmake index 6049f6e21e5662..9a3552efce8e12 100644 --- a/cmake/pten.cmake +++ b/cmake/pten.cmake @@ -58,26 +58,26 @@ endfunction() function(kernel_declare TARGET_LIST) foreach(kernel_path ${TARGET_LIST}) file(READ ${kernel_path} kernel_impl) - # TODO(chenweihang): rename PT_REGISTER_KERNEL to PT_REGISTER_KERNEL + # TODO(chenweihang): rename PD_REGISTER_KERNEL to PD_REGISTER_KERNEL # NOTE(chenweihang): now we don't recommend to use digit in kernel name - string(REGEX MATCH "(PT_REGISTER_KERNEL|PT_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*," first_registry "${kernel_impl}") + string(REGEX MATCH "(PD_REGISTER_KERNEL|PD_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*," first_registry "${kernel_impl}") if (NOT first_registry STREQUAL "") # parse the first kernel name - string(REPLACE "PT_REGISTER_KERNEL(" "" kernel_name "${first_registry}") - string(REPLACE "PT_REGISTER_GENERAL_KERNEL(" "" kernel_name "${kernel_name}") + string(REPLACE "PD_REGISTER_KERNEL(" "" kernel_name "${first_registry}") + string(REPLACE "PD_REGISTER_GENERAL_KERNEL(" "" kernel_name "${kernel_name}") string(REPLACE "," "" kernel_name "${kernel_name}") string(REGEX REPLACE "[ \t\r\n]+" "" kernel_name "${kernel_name}") # append kernel declare into declarations.h # TODO(chenweihang): default declare ALL_LAYOUT for each kernel if (${kernel_path} MATCHES "./cpu\/") - file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n") + file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n") elseif (${kernel_path} MATCHES "./gpu\/") - file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, GPU, ALL_LAYOUT);\n") + file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, GPU, ALL_LAYOUT);\n") elseif (${kernel_path} MATCHES "./xpu\/") - file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n") + file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n") else () # deal with device independent kernel, now we use CPU temporaary - file(APPEND ${kernel_declare_file} "PT_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n") + file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n") endif() endif() endforeach() @@ -285,9 +285,9 @@ endfunction() function(append_op_util_declare TARGET) file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET} target_content) - string(REGEX MATCH "(PT_REGISTER_BASE_KERNEL_NAME|PT_REGISTER_ARG_MAPPING_FN)\\([ \t\r\n]*[a-z0-9_]*" util_registrar "${target_content}") - string(REPLACE "PT_REGISTER_ARG_MAPPING_FN" "PT_DECLARE_ARG_MAPPING_FN" util_declare "${util_registrar}") - string(REPLACE "PT_REGISTER_BASE_KERNEL_NAME" "PT_DECLARE_BASE_KERNEL_NAME" util_declare "${util_declare}") + string(REGEX MATCH "(PD_REGISTER_BASE_KERNEL_NAME|PD_REGISTER_ARG_MAPPING_FN)\\([ \t\r\n]*[a-z0-9_]*" util_registrar "${target_content}") + string(REPLACE "PD_REGISTER_ARG_MAPPING_FN" "PD_DECLARE_ARG_MAPPING_FN" util_declare 
"${util_registrar}") + string(REPLACE "PD_REGISTER_BASE_KERNEL_NAME" "PD_DECLARE_BASE_KERNEL_NAME" util_declare "${util_declare}") string(APPEND util_declare ");\n") file(APPEND ${op_utils_header} "${util_declare}") endfunction() diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc b/paddle/fluid/distributed/fleet_executor/dist_model.cc index e684d75bfb8320..c1408130b5e577 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model.cc +++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc @@ -52,6 +52,8 @@ bool LoadDataFromDistModelTensor(const DistModelTensor &input_data, input_tensor_ptr = input_tensor->mutable_data(dims, place); } else if (input_data.dtype == DistModelDataType::INT32) { input_tensor_ptr = input_tensor->mutable_data(dims, place); + } else if (input_data.dtype == DistModelDataType::FLOAT16) { + input_tensor_ptr = input_tensor->mutable_data(dims, place); } else { LOG(ERROR) << "unsupported feed type " << input_data.dtype; return false; @@ -412,6 +414,8 @@ bool DistModel::PrepareFeedAndFetch() { feeds_to_dtype_.insert({var_name, DistModelDataType::INT32}); } else if (real_var->GetDataType() == framework::proto::VarType::INT64) { feeds_to_dtype_.insert({var_name, DistModelDataType::INT64}); + } else if (real_var->GetDataType() == framework::proto::VarType::FP16) { + feeds_to_dtype_.insert({var_name, DistModelDataType::FLOAT16}); } else { LOG(ERROR) << "Don't support feed var dtype for: " << real_var->GetDataType(); @@ -503,9 +507,13 @@ bool DistModel::FetchResults(std::vector *output_data, } else if (type == framework::proto::VarType::INT32) { rst = FetchResult(fetch, output); output->dtype = DistModelDataType::INT32; + } else if (type == framework::proto::VarType::FP16) { + rst = FetchResult(fetch, output); + output->dtype = DistModelDataType::FLOAT16; } else { LOG(ERROR) << "DistModel meets unknown fetch data type. 
DistModel only " - "supports float32, int64 and int32 fetch type for now."; + "supports float32, float16, int64 and int32 fetch type " + "for now."; } if (!rst) { LOG(ERROR) << "DistModel fails to fetch result " << idx_to_fetches_[idx]; diff --git a/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h b/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h index 6bdd858d6cf9ed..dc8b2596803e07 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h +++ b/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h @@ -15,6 +15,7 @@ #pragma once #include #include +#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/macros.h" namespace paddle { @@ -40,6 +41,11 @@ constexpr DistModelDataType DistModelGetDtype() { return DistModelDataType::FLOAT32; } +template <> +constexpr DistModelDataType DistModelGetDtype() { + return DistModelDataType::FLOAT16; +} + class DistModelDataBuf { public: explicit DistModelDataBuf(size_t length) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 786bf21e8c8a13..ca02a3d39779de 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -127,6 +127,15 @@ def ReadBwdFile(filepath): ###################### ### Yaml Parsers ### ###################### +def ParseNoNeedBuffer(string): + # string: "x, y" + no_need_buffer_set = set() + for name in string.split(","): + no_need_buffer_set.add(name.strip()) + + return no_need_buffer_set + + def ParseYamlArgs(string): # Example: const Tensor& x, const Tensor& y, bool transpose_x, bool transpose_y @@ -397,7 +406,7 @@ def SlotNameMatching(backward_inputs_list, backward_returns_list, def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, - backward_attrs_list): + backward_attrs_list, no_need_buffer_set): # Inputs: # fwd_api_name = "" # backward_fwd_input_map = { "name" : [type, is_fwd_input, orig_position] ...} @@ -410,15 +419,20 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, set_tensor_wrapper_methods_str = "" tensor_wrapper_members_str = "" for tname, (ttype, is_fwd_input, _) in backward_fwd_input_map.items(): + if tname in no_need_buffer_set: + no_need_buffer = "true" + else: + no_need_buffer = "false" + tensor_wrapper_name = GetSavedName(tname) if IsPlainTensorType(ttype): SET_PLAIN_TENSOR_WRAPPER_TEMPLATE = """ void SetTensorWrapper{}(const paddle::experimental::Tensor& {}, bool full_reserved) {{ - {} = egr::TensorWrapper({}, full_reserved); + {} = egr::TensorWrapper({}, full_reserved, {}); }} """ set_tensor_wrapper_methods_str += SET_PLAIN_TENSOR_WRAPPER_TEMPLATE.format( - tname, tname, tensor_wrapper_name, tname) + tname, tname, tensor_wrapper_name, tname, no_need_buffer) PLAIN_TENSOR_MEMBER_TEMPLATE = """ egr::TensorWrapper {}; @@ -430,12 +444,12 @@ def GenerateNodeDeclaration(fwd_api_name, backward_fwd_input_map, SET_VECTOR_TENSOR_WRAPPER_TEMPLATE = """ void SetTensorWrapper{}(const std::vector& {}, bool full_reserved) {{ for(const auto& eager_tensor : {}) {{ - {}.emplace_back( egr::TensorWrapper(eager_tensor, full_reserved) ); + {}.emplace_back( egr::TensorWrapper(eager_tensor, full_reserved, {}) ); }}; }} """ set_tensor_wrapper_methods_str += SET_VECTOR_TENSOR_WRAPPER_TEMPLATE.format( - tname, tname, tname, tensor_wrapper_name) + tname, tname, tname, tensor_wrapper_name, 
no_need_buffer) VECTOR_TENSOR_MEMBER_TEMPLATE = """ std::vector {}; @@ -997,6 +1011,10 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str): assert 'output' in fwd_api.keys() assert 'backward' in fwd_api.keys() + no_need_buffer_set = set() + if 'no_need_buffer' in fwd_api.keys(): + no_need_buffer_set = ParseNoNeedBuffer(fwd_api['no_need_buffer']) + fwd_api_name = fwd_api['api'] fwd_args_str = fwd_api['args'] fwd_returns_str = fwd_api['output'] @@ -1062,7 +1080,8 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str): # Node Declaration Generation node_declaration_str += GenerateNodeDeclaration( - fwd_api_name, backward_fwd_input_map, backward_attrs_list) + fwd_api_name, backward_fwd_input_map, backward_attrs_list, + no_need_buffer_set) print("Generated Node Declaration: ", node_declaration_str) node_definition_str += GenerateNodeDefinition( diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index 6cc17b0a9c5faf..1732e0513d5244 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -34,7 +34,8 @@ class TensorWrapper { public: TensorWrapper() = default; explicit TensorWrapper(const paddle::experimental::Tensor& tensor, - bool full_reserved = false) { + bool full_reserved = false, + bool no_need_buffer = false) { /** * Normally, we should fully reserved all non-output or non-leaf fwd tensor * here. And for fwd output tensor, we should not reserve its autogradmeta, @@ -48,7 +49,22 @@ class TensorWrapper { } // shallow copy tensor_impl here - intermidiate_tensor_.set_impl(tensor.impl()); + if (no_need_buffer) { + if (phi::DenseTensor::classof(tensor.impl().get())) { + // Only Copy Meta + phi::DenseTensor* dense_tensor = + static_cast(tensor.impl().get()); + auto tw_dense_tensor = std::make_shared(); + tw_dense_tensor->set_meta(dense_tensor->meta()); + intermidiate_tensor_.set_impl(tw_dense_tensor); + } else { + PADDLE_THROW(paddle::platform::errors::Fatal( + "Unrecognized tensor type for no_need_buffer feature")); + } + } else { + intermidiate_tensor_.set_impl(tensor.impl()); + } + intermidiate_tensor_.set_name(tensor.name() + "@Saved"); PADDLE_ENFORCE_NOT_NULL( EagerUtils::unsafe_autograd_meta(tensor), diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc index b72b7cb87530e0..6c4bf9a4f17e6f 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cpu.cc @@ -176,4 +176,4 @@ TEST(Benchmark, EagerIntermediateMLPCPU) { USE_OP_ITSELF(scale); USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(matmul_v2); -USE_OP(reduce_sum); +USE_OP_ITSELF(reduce_sum); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc index 701a9b2cba195d..14e7ce8cfcfb4d 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_eager_cuda.cc @@ -185,7 +185,7 @@ TEST(Benchmark, EagerIntermediateMLPCUDA) { USE_OP_ITSELF(scale); USE_OP_ITSELF(matmul_v2); -USE_OP(reduce_sum); +USE_OP_ITSELF(reduce_sum); USE_OP(reduce_sum_grad); USE_OP_ITSELF(elementwise_add); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc index bea80809a3b17a..3292de9363696d 100644 --- 
a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cpu.cc @@ -214,4 +214,4 @@ TEST(Benchmark, FluidMLPCPU) { USE_OP_ITSELF(scale); USE_OP_ITSELF(elementwise_add); USE_OP_ITSELF(matmul_v2); -USE_OP(reduce_sum); +USE_OP_ITSELF(reduce_sum); diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc index 86c1ad7e23a6a3..e9b7d10070dbf2 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_fluid_cuda.cc @@ -247,7 +247,7 @@ TEST(Benchmark, FluidMLPCUDA) { USE_OP_ITSELF(scale); USE_OP_ITSELF(matmul_v2); -USE_OP(reduce_sum); +USE_OP_ITSELF(reduce_sum); USE_OP(reduce_sum_grad); USE_OP_ITSELF(elementwise_add); diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 78f5bb077aaf18..7d527e24a0079e 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -437,8 +437,7 @@ message(STATUS "branch: ${PADDLE_BRANCH}") configure_file(commit.h.in commit.h) cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper pten_tensor op_meta_info pten_api) -cc_library(custom_kernel SRCS custom_kernel.cc DEPS - tensor attribute framework_proto op_registry operator dynamic_loader string_helper pten_tensor op_kernel_info pten_api) +cc_library(custom_kernel SRCS custom_kernel.cc DEPS op_registry pten_custom_kernel pten_tensor_raw) #cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ) #cc_binary(new_executor SRCS new_exec_test.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler) @@ -459,4 +458,3 @@ else() cc_library(fluid_convert_utils SRCS convert_utils.cc DEPS data_type place) endif() cc_test(convert_utils_test SRCS convert_utils_test.cc DEPS fluid_convert_utils) -cc_test(custom_kernel_test SRCS custom_kernel_test.cc DEPS custom_kernel pten_tensor) diff --git a/paddle/fluid/framework/custom_kernel.cc b/paddle/fluid/framework/custom_kernel.cc index 3a00d9424646a5..49a1e0774a6b1a 100644 --- a/paddle/fluid/framework/custom_kernel.cc +++ b/paddle/fluid/framework/custom_kernel.cc @@ -18,355 +18,24 @@ limitations under the License. */ #endif #include "paddle/fluid/framework/custom_kernel.h" -#include -#include -#include -#include "paddle/fluid/framework/op_kernel_info_helper.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/phi/api/ext/op_kernel_info.h" -#include "paddle/phi/core/compat/convert_utils.h" -#include "paddle/phi/core/kernel_context.h" -#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/custom_kernel.h" namespace paddle { - namespace framework { -// set phi::Kernel args_def_ from op_kernel_info -// because we can not set directly to phi::Kernel without exposing -// phi::KernelArgsDef when parsing custom user function -static void ParseArgs(const OpKernelInfo& op_kernel_info, - phi::KernelArgsDef* args_def) { - auto& input_defs = OpKernelInfoHelper::GetInputDefs(op_kernel_info); - auto& output_defs = OpKernelInfoHelper::GetOutputDefs(op_kernel_info); - auto& attribute_defs = OpKernelInfoHelper::GetAttributeDefs(op_kernel_info); - - for (auto& input : input_defs) { - auto type_index = - input.is_vector - ? 
std::type_index(typeid(const std::vector&)) - : std::type_index(typeid(const phi::DenseTensor&)); - args_def->AppendInput(input.backend, input.layout, input.dtype, type_index); - } - for (auto& output : output_defs) { - auto type_index = - output.is_vector - ? std::type_index(typeid(const std::vector&)) - : std::type_index(typeid(const phi::DenseTensor&)); - args_def->AppendOutput(output.backend, output.layout, output.dtype, - type_index); - } - for (auto& attr : attribute_defs) { - args_def->AppendAttribute(attr.type_index); - } -} - -// custom pten kernel call function define -static void RunKernelFunc(phi::KernelContext* ctx, - const OpKernelInfo& op_kernel_info) { - VLOG(3) << "[CUSTOM KERNEL] RunKernelFunc begin..."; - - // input and output size is not params' num - // but actual Tensors' size - size_t input_size = ctx->InputsSize(); - size_t output_size = ctx->OutputsSize(); - size_t attr_size = ctx->AttrsSize(); - - // parameters' num of unified user kernel function - auto& input_defs = OpKernelInfoHelper::GetInputDefs(op_kernel_info); - auto& output_defs = OpKernelInfoHelper::GetOutputDefs(op_kernel_info); - auto& attribute_defs = OpKernelInfoHelper::GetAttributeDefs(op_kernel_info); - - PADDLE_ENFORCE_GE(input_size, input_defs.size(), - platform::errors::InvalidArgument( - "the size of ctx inputs size (%d) must be larger than " - "the size of kernel input_defs (%d).", - input_size, input_defs.size())); - - PADDLE_ENFORCE_GE(output_size, output_defs.size(), - platform::errors::InvalidArgument( - "the size of ctx outputs size (%d) must be larger than " - "the size of kernel output_defs (%d).", - output_size, output_defs.size())); - - PADDLE_ENFORCE_EQ(attr_size, attribute_defs.size(), - platform::errors::InvalidArgument( - "the size of ctx attribute size (%d) must be equal to " - "to the size of kernel attribute_defs (%d).", - attr_size, attribute_defs.size())); - - VLOG(3) << "[CUSTOM KERNEL] Input num: " << input_defs.size() - << "[tensor size:" << input_size << "]" - << " Attribute num: " << attribute_defs.size() - << " Output num: " << output_defs.size() - << "[tensor size:" << output_size << "]."; - - // Inputs mapping - std::vector custom_ins; - std::vector> custom_vec_ins; - for (size_t in_idx = 0; in_idx < input_defs.size(); ++in_idx) { - VLOG(3) << "Mapping Input[" << in_idx << "]"; - const std::pair range = ctx->InputRangeAt(in_idx); - - // is_vector tells if this Input is Tensor or std::vector - if (!input_defs.at(in_idx).is_vector) { - paddle::experimental::Tensor custom_t; - auto& ctx_tensor = ctx->InputAt(range.first); - custom_t.set_impl(std::make_shared(ctx_tensor)); - custom_ins.emplace_back(custom_t); - } else { - std::vector custom_vec_in; - auto ctx_tensor_vec = - ctx->MoveInputsBetween(range.first, range.second); - for (auto& ctx_tensor : ctx_tensor_vec) { - paddle::experimental::Tensor custom_t; - custom_t.set_impl(std::make_shared(ctx_tensor)); - custom_vec_in.emplace_back(custom_t); - } - custom_vec_ins.emplace_back(custom_vec_in); - } - VLOG(3) << "Mapped Input[" << in_idx << "] with range[" << range.first - << "," << range.second << ")."; - } - - // Attributes mapping - std::vector custom_attrs; - for (size_t attr_idx = 0; attr_idx < attribute_defs.size(); ++attr_idx) { - VLOG(3) << "Mapping Attribute[" << attr_idx << "]"; - if (attribute_defs[attr_idx].type_index == std::type_index(typeid(bool))) { - bool arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(int))) { - 
int arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(float))) { - float arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(double))) { - double arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(int64_t))) { - int64_t arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(phi::dtype::float16))) { - phi::dtype::float16 arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(DataType))) { - DataType arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(const Scalar&))) { - const Scalar& arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(const std::vector&))) { - const std::vector& arg = - ctx->AttrAt&>(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(const ScalarArray&))) { - const ScalarArray& arg = ctx->AttrAt(attr_idx); - custom_attrs.emplace_back(arg); - } else if (attribute_defs[attr_idx].type_index == - std::type_index(typeid(const std::vector&))) { - const std::vector& arg = - ctx->AttrAt&>(attr_idx); - custom_attrs.emplace_back(arg); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported attribute attribute_defs[%d].type_index", attr_idx)); - } - VLOG(3) << "Mapped Attribute[" << attr_idx << "]"; - } - - // Outputs mapping - std::vector custom_outs; - std::vector> custom_vec_outs; - std::vector> custom_outs_ptr; - std::vector>> - custom_vec_outs_ptr; - - for (size_t out_idx = 0; out_idx < output_defs.size(); ++out_idx) { - VLOG(3) << "Mapping Output[" << out_idx << "]"; - const std::pair range = ctx->OutputRangeAt(out_idx); - - // is_vector tells if this Output is Tensor or std::vector - if (!output_defs.at(out_idx).is_vector) { - auto* ctx_tensor = ctx->MutableOutputAt(range.first); - auto* custom_t = new paddle::experimental::Tensor(); - auto custom_t_ptr = std::make_shared(*ctx_tensor); - custom_t->set_impl(custom_t_ptr); - custom_outs.emplace_back(custom_t); - custom_outs_ptr.emplace_back(custom_t_ptr); - } else { - std::vector custom_vec_out; - std::vector> custom_vec_out_ptr; - auto ctx_tensor_vec = ctx->MutableOutputBetween( - range.first, range.second); - for (auto ctx_tensor : ctx_tensor_vec) { - auto* custom_t = new paddle::experimental::Tensor(); - auto custom_t_ptr = std::make_shared(*ctx_tensor); - custom_t->set_impl(custom_t_ptr); - custom_vec_out.emplace_back(custom_t); - custom_vec_out_ptr.emplace_back(custom_t_ptr); - } - custom_vec_outs.emplace_back(custom_vec_out); - custom_vec_outs_ptr.emplace_back(custom_vec_out_ptr); - } - VLOG(3) << "Mapped Output[" << out_idx << "] with range[" << range.first - << "," << range.second << ")."; - } - - // DeviceContext - // In pten, the first paramter XXContext is decided when registering - // through template param, but custom kernel function use unified - // DeviceContext as first parameter of user_kernel_fn, we use backend - // from OpKernelInfo to decide XXContext. 
In temporary simple - // DeviceContext, we just set necessary info to dev_ctx(such as stream - // in NPUContext), more related work should be done when - // phi::DeviceContext is exposed to outer. - DeviceContext dev_ctx; - auto& backend = OpKernelInfoHelper::GetBackend(op_kernel_info); - if (backend == phi::Backend::CPU) { - // do nothing - } else { -#ifdef PADDLE_WITH_CUSTOM_DEVICE - size_t device_type_id_ = static_cast(backend) - - static_cast(phi::Backend::ALL_BACKEND); - std::string device_type = phi::GetGlobalDeviceType(device_type_id_); - if (!device_type.empty()) { - auto custom_ctx = - ctx->GetDeviceContext(); - dev_ctx.set_stream(custom_ctx.stream()); - return; - } -#endif - LOG(ERROR) << "[CUSTOM KERNEL] Unsupported kernel backend: " << backend - << " with compiled Paddle."; - return; - } - - auto& user_kernel_fn = OpKernelInfoHelper::GetKernelFn(op_kernel_info); - // call user function - user_kernel_fn(dev_ctx, custom_ins, custom_vec_ins, custom_attrs, - &custom_outs, &custom_vec_outs); - - VLOG(3) << "[CUSTOM KERNEL] finished call user kernel function."; - - // NOTE: Map back the output tensors with stored shared_ptrs. - for (int out_idx = output_defs.size() - 1; out_idx >= 0; --out_idx) { - VLOG(3) << "Mapping Back Output[" << out_idx << "]"; - const std::pair range = ctx->OutputRangeAt(out_idx); - - // is_vector tells if this Output is Tensor or std::vector - if (!output_defs.at(out_idx).is_vector) { - auto* ctx_tensor = ctx->MutableOutputAt(range.first); - *ctx_tensor = *(custom_outs_ptr.back().get()); - custom_outs_ptr.pop_back(); - } else { - auto ctx_tensor_vec = ctx->MutableOutputBetween( - range.first, range.second); - auto custom_vec_ptr_out = custom_vec_outs_ptr.back(); - for (int idx = ctx_tensor_vec.size() - 1; idx >= 0; --idx) { - *(ctx_tensor_vec[idx]) = *(custom_vec_ptr_out.back().get()); - custom_vec_ptr_out.pop_back(); - } - custom_vec_outs_ptr.pop_back(); - } - VLOG(3) << "Mapped Output[" << out_idx << "] with range[" << range.first - << "," << range.second << "]."; - } - - // delete newed paddle::Tensor for outputs while calling user kernel function - for (size_t i = 0; i < custom_outs.size(); ++i) { - delete custom_outs[i]; - } - for (size_t i = 0; i < custom_vec_outs.size(); ++i) { - for (size_t j = 0; j < custom_vec_outs[i].size(); ++j) { - delete custom_vec_outs[i][j]; - } - } -} - -void RegisterKernelWithMetaInfo( - const std::vector& op_kernel_infos) { - for (size_t i = 0; i < op_kernel_infos.size(); ++i) { - auto& kernel_info = op_kernel_infos[i]; - auto op_type = OpKernelInfoHelper::GetOpName(kernel_info); - auto kernel_key = OpKernelInfoHelper::GetKernelKey(kernel_info); - - VLOG(3) << "[CUSTOM KERNEL] registering [" << op_type << "]" << kernel_key; - - // 1.Check whether this kernel is valid for a specific operator - PADDLE_ENFORCE_EQ( - phi::KernelFactory::Instance().HasCompatiblePtenKernel(op_type), true, - platform::errors::InvalidArgument( - "[CUSTOM KERNEL] %s is not ready for custom kernel registering.", - op_type)); - - // 2.Check whether kernel_key has been already registed - PADDLE_ENFORCE_EQ( - phi::KernelFactory::Instance().kernels()[op_type].find(kernel_key), - phi::KernelFactory::Instance().kernels()[op_type].end(), - platform::errors::InvalidArgument( - "[CUSTOM KERNEL] The operator <%s>'s kernel: %s has been " - "already existed in Paddle, please contribute PR if need " - "to optimize the kernel code. 
Custom kernel do NOT support " - "to replace existing kernel in Paddle.", - op_type, kernel_key)); - - // phi::KernelFn - phi::KernelFn kernel_fn = [kernel_info](phi::KernelContext* ctx) { - VLOG(3) << "[CUSTOM KERNEL] run custom PTEN kernel func in lambda."; - RunKernelFunc(ctx, kernel_info); - }; - // variadic_kernel_fn - void* variadic_kernel_fn = - OpKernelInfoHelper::GetVariadicKernelFn(kernel_info); - phi::Kernel kernel(kernel_fn, variadic_kernel_fn); - // args info - ParseArgs(kernel_info, kernel.mutable_args_def()); - // register custom kernel to phi::KernelFactory - phi::KernelFactory::Instance().kernels()[op_type][kernel_key] = kernel; - VLOG(3) << "[CUSTOM KERNEL] Successed in registering operator <" << op_type - << ">'s kernel " << kernel_key << " to Paddle. " - << "It will be used like native ones."; - } -} - -void RegisterKernelWithMetaInfoMap( - const paddle::OpKernelInfoMap& op_kernel_info_map) { - auto& kernel_info_map = op_kernel_info_map.GetMap(); - VLOG(3) << "[CUSTOM KERNEL] size of op_kernel_info_map: " - << kernel_info_map.size(); - - // pair: {op_type, OpKernelInfo} - for (auto& pair : kernel_info_map) { - VLOG(3) << "[CUSTOM KERNEL] pair first -> op name: " << pair.first; - RegisterKernelWithMetaInfo(pair.second); - } -} - void LoadCustomKernelLib(const std::string& dso_lib_path, void* dso_handle) { #ifdef _LINUX - typedef OpKernelInfoMap& get_op_kernel_info_map_t(); - auto* func = reinterpret_cast( - dlsym(dso_handle, "PD_GetOpKernelInfoMap")); + typedef phi::CustomKernelMap& get_custom_kernel_map_t(); + auto* func = reinterpret_cast( + dlsym(dso_handle, "PD_GetCustomKernelMap")); if (func == nullptr) { LOG(WARNING) << "Skipped lib [" << dso_lib_path << "]: fail to find " - << "PD_GetOpKernelInfoMap symbol in this lib."; + << "PD_GetCustomKernelMap symbol in this lib."; return; } - auto& op_kernel_info_map = func(); - RegisterKernelWithMetaInfoMap(op_kernel_info_map); + auto& custom_kernel_map = func(); + phi::RegisterCustomKernels(custom_kernel_map); LOG(INFO) << "Successed in loading custom kernels in lib: " << dso_lib_path; #else VLOG(3) << "Unsupported: Custom kernel is only implemented on Linux."; diff --git a/paddle/fluid/framework/custom_kernel.h b/paddle/fluid/framework/custom_kernel.h index 30bccc97000f88..31084a34413ea4 100644 --- a/paddle/fluid/framework/custom_kernel.h +++ b/paddle/fluid/framework/custom_kernel.h @@ -14,22 +14,13 @@ limitations under the License. */ #pragma once -#include "paddle/phi/api/ext/op_kernel_info.h" +#include namespace paddle { namespace framework { +// Load custom kernel lib and register void LoadCustomKernelLib(const std::string& dso_lib_path, void* dso_handle); -// Load custom kernel api: register kernel after user compiled -void LoadOpKernelInfoAndRegister(const std::string& dso_name); - -// Register custom kernel api: register kernel directly -void RegisterKernelWithMetaInfoMap( - const paddle::OpKernelInfoMap& op_kernel_info_map); - -// Interface for selective register custom kernel. -void RegisterKernelWithMetaInfo( - const std::vector& op_kernel_infos); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/scope_buffered_monitor.cc b/paddle/fluid/framework/details/scope_buffered_monitor.cc index 5881148190752d..57faf0e75ba99c 100644 --- a/paddle/fluid/framework/details/scope_buffered_monitor.cc +++ b/paddle/fluid/framework/details/scope_buffered_monitor.cc @@ -13,7 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/framework/details/scope_buffered_monitor.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace framework { @@ -91,7 +91,8 @@ void ScopeBufferedMonitor::Apply(const std::function &callback, bool has_fetch) { std::unique_ptr pre_local_exec_scopes_event( new platform::RecordEvent( - "ScopeBufferedMonitor::pre_local_exec_scopes_process")); + "ScopeBufferedMonitor::pre_local_exec_scopes_process", + platform::TracerEventType::UserDefined, 2)); for (size_t scope_id = 0; scope_id < local_exec_scopes_.size(); ++scope_id) { pre_local_exec_scopes_.at(scope_id).clear(); auto scopes = local_exec_scopes_.at(scope_id)->kids(); @@ -105,7 +106,8 @@ void ScopeBufferedMonitor::Apply(const std::function &callback, std::unique_ptr post_local_exec_scopes_event( new platform::RecordEvent( - "ScopeBufferedMonitor::post_local_exec_scopes_process")); + "ScopeBufferedMonitor::post_local_exec_scopes_process", + platform::TracerEventType::UserDefined, 2)); for (size_t scope_id = 0; scope_id < local_exec_scopes_.size(); ++scope_id) { post_local_exec_scopes_.at(scope_id).clear(); auto scopes = local_exec_scopes_.at(scope_id)->kids(); diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index 5d271d06b6922f..d49630129757b8 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -23,7 +23,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace paddle { namespace framework { @@ -75,7 +75,8 @@ FetchResultType ScopeBufferedSSAGraphExecutor::Run( #endif if (drop_scope_counter_ == 0) { - platform::RecordEvent e("InitLocalVars"); + platform::RecordEvent e("InitLocalVars", + platform::TracerEventType::UserDefined, 2); InitVariables(); } @@ -164,7 +165,8 @@ void ScopeBufferedSSAGraphExecutor::InitVariables() { } void ScopeBufferedSSAGraphExecutor::DropLocalExeScopes(bool need_wait) { - platform::RecordEvent drop_scope_event("DropLocalExeScopes"); + platform::RecordEvent drop_scope_event( + "DropLocalExeScopes", platform::TracerEventType::UserDefined, 2); drop_scope_counter_ = 0; if (need_wait) { for (auto &p : places_) { diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc index f0de723c20b740..d198eb1459288f 100644 --- a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc @@ -25,7 +25,7 @@ #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" DECLARE_bool(sync_nccl_allreduce); @@ -66,7 +66,8 @@ SparseAllReduceOpHandle::SparseAllReduceOpHandle( } void SparseAllReduceOpHandle::RunImplEncoded() { - platform::RecordEvent record_event(Name()); + platform::RecordEvent record_event(Name(), + platform::TracerEventType::UserDefined, 2); auto in_var_handles = DynamicCast(this->Inputs()); auto out_var_handles = DynamicCast(this->Outputs()); @@ -279,6 +280,8 @@ 
bool SparseAllReduceOpHandle::IsEncoded() { } void SparseAllReduceOpHandle::RunImpl() { + platform::RecordEvent record_event( + Name(), platform::TracerEventType::Communication, 1); if (!IsEncoded()) { AllReduceOpHandle::RunImpl(); return; diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index c8a6cd25f0fcbe..39683c9a0d8680 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -15,7 +15,7 @@ #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #if defined PADDLE_WITH_PSCORE #include "paddle/fluid/distributed/ps/service/communicator/communicator.h" @@ -56,7 +56,8 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( inline FetchResultType ThreadedSSAGraphExecutor::RunImpl( const std::vector &fetch_tensors, bool return_merged) { std::unique_ptr event( - new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare")); + new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", + platform::TracerEventType::UserDefined, 2)); std::unique_ptr op_deps = op_deps_futures_.get(); CopyOpDeps(); diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 4e6a4d5360860e..48850d4624a14c 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/fluid/operators/controlflow/while_op_helper.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index 189724a5455200..17346f5fd93932 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -10,6 +10,8 @@ IF(WITH_GPU) nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h mem_pool.h DEPS ${HETERPS_DEPS}) nv_test(test_heter_comm SRCS feature_value.h DEPS heter_comm) nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm) + nv_library(graph_gpu_ps SRCS graph_gpu_ps_table.h DEPS heter_comm) + nv_test(test_graph_comm SRCS test_graph.cu DEPS graph_gpu_ps) ENDIF() IF(WITH_ROCM) hip_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context) diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h new file mode 100644 index 00000000000000..a6508bf96c00f8 --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h @@ -0,0 +1,144 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "heter_comm.h" +#include "paddle/fluid/platform/enforce.h" +#ifdef PADDLE_WITH_HETERPS +namespace paddle { +namespace framework { +struct GpuPsGraphNode { + int64_t node_id; + int neighbor_size, neighbor_offset; + // this node's neighbor is stored on [neighbor_offset,neighbor_offset + + // neighbor_size) of int64_t *neighbor_list; +}; + +struct GpuPsCommGraph { + int64_t *neighbor_list; + GpuPsGraphNode *node_list; + int neighbor_size, node_size; + // the size of neighbor array and graph_node_list array + GpuPsCommGraph() + : neighbor_list(NULL), node_list(NULL), neighbor_size(0), node_size(0) {} + GpuPsCommGraph(int64_t *neighbor_list_, GpuPsGraphNode *node_list_, + int neighbor_size_, int node_size_) + : neighbor_list(neighbor_list_), + node_list(node_list_), + neighbor_size(neighbor_size_), + node_size(node_size_) {} +}; + +/* +suppose we have a graph like this + +0----3-----5----7 + \ |\ |\ + 17 8 9 1 2 + +we save the nodes in arbitrary order, +in this example,the order is +[0,5,1,2,7,3,8,9,17] +let us name this array u_id; +we record each node's neighbors: +0:3,17 +5:3,7 +1:7 +2:7 +7:1,2,5 +3:0,5,8,9 +8:3 +9:3 +17:0 + +by concatenating each node's neighbor_list in the order we save the node id. +we get [3,17,3,7,7,7,1,2,5,0,5,8,9,3,3,0] +this is the neighbor_list of GpuPsCommGraph +given this neighbor_list and the order to save node id, +we know, +node 0's neighbors are in the range [0,1] of neighbor_list +node 5's neighbors are in the range [2,3] of neighbor_list +node 1's neighbors are in the range [4,4] of neighbor_list +node 2:[5,5] +node 7:[6,6] +node 3:[9,12] +node 8:[13,13] +node 9:[14,14] +node 17:[15,15] +... 
+by the above information, +we generate a node_list:GpuPsGraphNode *graph_node_list in GpuPsCommGraph +of size 9, +where node_list[i].id = u_id[i] +then we have: +node_list[0]-> node_id:0, neighbor_size:2, neighbor_offset:0 +node_list[1]-> node_id:5, neighbor_size:2, neighbor_offset:2 +node_list[2]-> node_id:1, neighbor_size:1, neighbor_offset:4 +node_list[3]-> node_id:2, neighbor_size:1, neighbor_offset:5 +node_list[4]-> node_id:7, neighbor_size:3, neighbor_offset:6 +node_list[5]-> node_id:3, neighbor_size:4, neighbor_offset:9 +node_list[6]-> node_id:8, neighbor_size:1, neighbor_offset:13 +node_list[7]-> node_id:9, neighbor_size:1, neighbor_offset:14 +node_list[8]-> node_id:17, neighbor_size:1, neighbor_offset:15 +*/ +struct NeighborSampleResult { + int64_t *val; + int *actual_sample_size, sample_size, key_size; + NeighborSampleResult(int _sample_size, int _key_size) + : sample_size(_sample_size), key_size(_key_size) { + actual_sample_size = NULL; + val = NULL; + }; + ~NeighborSampleResult() { + if (val != NULL) cudaFree(val); + if (actual_sample_size != NULL) cudaFree(actual_sample_size); + } +}; + +struct NodeQueryResult { + int64_t *val; + int actual_sample_size; + NodeQueryResult() { + val = NULL; + actual_sample_size = 0; + }; + ~NodeQueryResult() { + if (val != NULL) cudaFree(val); + } +}; +class GpuPsGraphTable : public HeterComm { + public: + GpuPsGraphTable(std::shared_ptr resource) + : HeterComm(1, resource) { + load_factor_ = 0.25; + } + void build_graph_from_cpu(std::vector &cpu_node_list); + NodeQueryResult *graph_node_sample(int gpu_id, int sample_size); + NeighborSampleResult *graph_neighbor_sample(int gpu_id, int64_t *key, + int sample_size, int len); + NodeQueryResult *query_node_list(int gpu_id, int start, int query_size); + void clear_graph_info(); + void move_neighbor_sample_result_to_source_gpu(int gpu_id, int gpu_num, + int sample_size, int *h_left, + int *h_right, + int64_t *src_sample_res, + int *actual_sample_size); + + private: + std::vector gpu_graph_list; +}; +} +}; +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h" +#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h new file mode 100644 index 00000000000000..839c7e5468c6c6 --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.h @@ -0,0 +1,447 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#ifdef PADDLE_WITH_HETERPS +namespace paddle { +namespace framework { +/* +comment 0 +this kernel just serves as an example of how to sample nodes' neighbors. +feel free to modify it +index[0,len) saves the nodes' index +actual_size[0,len) is to save the sample size of each node. 
+for ith node in index, actual_size[i] = min(node i's neighbor size, sample size) +sample_result is to save the neighbor sampling result, its size is len * +sample_size; + +*/ + +__global__ void neighbor_sample_example(GpuPsCommGraph graph, int* index, + int* actual_size, + int64_t* sample_result, int sample_size, + int len) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + auto node_index = index[i]; + actual_size[i] = graph.node_list[node_index].neighbor_size < sample_size + ? graph.node_list[node_index].neighbor_size + : sample_size; + int offset = graph.node_list[node_index].neighbor_offset; + for (int j = 0; j < actual_size[i]; j++) { + sample_result[sample_size * i + j] = graph.neighbor_list[offset + j]; + } + } +} + +/* + comment 1 + + gpu i triggers a neighbor_sample task, + when this task is done, + this function is called to move the sample result on other gpu back + to gup i and aggragate the result. + the sample_result is saved on src_sample_res and the actual sample size for + each node is saved on actual_sample_size. + the number of actual sample_result for + key[x] (refer to comment 2 for definition of key) + is saved on actual_sample_size[x], since the neighbor size of key[x] might be + smaller than sample_size, + is saved on src_sample_res [x*sample_size, x*sample_size + + actual_sample_size[x]) + + since before each gpu runs the neighbor_sample task,the key array is shuffled, + but we have the idx array to save the original order. + when the gpu i gets all the sample results from other gpus, it relies on + idx array to recover the original order. + that's what fill_dvals does. + +*/ +void GpuPsGraphTable::move_neighbor_sample_result_to_source_gpu( + int gpu_id, int gpu_num, int sample_size, int* h_left, int* h_right, + int64_t* src_sample_res, int* actual_sample_size) { + for (int i = 0; i < gpu_num; i++) { + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } + auto shard_len = h_right[i] - h_left[i] + 1; + // int cur_step = path_[gpu_id][i].nodes_.size() - 1; + // auto& node = path_[gpu_id][i].nodes_[cur_step]; + auto& node = path_[gpu_id][i].nodes_.front(); + cudaMemcpyAsync( + reinterpret_cast(src_sample_res + h_left[i] * sample_size), + node.val_storage + sizeof(int64_t) * shard_len, + node.val_bytes_len - sizeof(int64_t) * shard_len, cudaMemcpyDefault, + node.out_stream); + cudaMemcpyAsync(reinterpret_cast(actual_sample_size + h_left[i]), + node.val_storage + sizeof(int) * shard_len, + sizeof(int) * shard_len, cudaMemcpyDefault, + node.out_stream); + } + for (int i = 0; i < gpu_num; ++i) { + if (h_left[i] == -1 || h_right[i] == -1) { + continue; + } + auto& node = path_[gpu_id][i].nodes_.front(); + cudaStreamSynchronize(node.out_stream); + } +} + +/* +TODO: +how to optimize it to eliminate the for loop +*/ +__global__ void fill_dvalues(int64_t* d_shard_vals, int64_t* d_vals, + int* d_shard_actual_sample_size, + int* d_actual_sample_size, int* idx, + int sample_size, int len) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < len) { + d_actual_sample_size[idx[i]] = d_shard_actual_sample_size[i]; + // d_vals[idx[i]] = d_shard_vals[i]; + for (int j = 0; j < sample_size; j++) { + d_vals[idx[i] * sample_size + j] = d_shard_vals[i * sample_size + j]; + } + } +} + +__global__ void node_query_example(GpuPsCommGraph graph, int start, int size, + int64_t* res) { + const size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < size) { + res[i] = graph.node_list[start + i].node_id; + } +} + +void 
GpuPsGraphTable::clear_graph_info() { + if (tables_.size()) { + for (auto table : tables_) delete table; + } + tables_.clear(); + for (auto graph : gpu_graph_list) { + if (graph.neighbor_list != NULL) { + cudaFree(graph.neighbor_list); + } + if (graph.node_list != NULL) { + cudaFree(graph.node_list); + } + } + gpu_graph_list.clear(); +} +/* +the parameter std::vector cpu_graph_list is generated by cpu. +it saves the graph to be saved on each gpu. + +for the ith GpuPsCommGraph, any the node's key satisfies that key % gpu_number +== i + +In this function, memory is allocated on each gpu to save the graphs, +gpu i saves the ith graph from cpu_graph_list +*/ + +void GpuPsGraphTable::build_graph_from_cpu( + std::vector& cpu_graph_list) { + PADDLE_ENFORCE_EQ( + cpu_graph_list.size(), resource_->total_gpu(), + platform::errors::InvalidArgument("the cpu node list size doesn't match " + "the number of gpu on your machine.")); + clear_graph_info(); + for (int i = 0; i < cpu_graph_list.size(); i++) { + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + gpu_graph_list.push_back(GpuPsCommGraph()); + auto table = + new Table(std::max(1, cpu_graph_list[i].node_size) / load_factor_); + tables_.push_back(table); + if (cpu_graph_list[i].node_size > 0) { + std::vector keys; + std::vector offset; + cudaMalloc((void**)&gpu_graph_list[i].node_list, + cpu_graph_list[i].node_size * sizeof(GpuPsGraphNode)); + cudaMemcpy(gpu_graph_list[i].node_list, cpu_graph_list[i].node_list, + cpu_graph_list[i].node_size * sizeof(GpuPsGraphNode), + cudaMemcpyHostToDevice); + for (int j = 0; j < cpu_graph_list[i].node_size; j++) { + keys.push_back(cpu_graph_list[i].node_list[j].node_id); + offset.push_back(j); + } + build_ps(i, keys.data(), offset.data(), keys.size(), 1024, 8); + gpu_graph_list[i].node_size = cpu_graph_list[i].node_size; + } else { + gpu_graph_list[i].node_list = NULL; + gpu_graph_list[i].node_size = 0; + } + if (cpu_graph_list[i].neighbor_size) { + cudaMalloc((void**)&gpu_graph_list[i].neighbor_list, + cpu_graph_list[i].neighbor_size * sizeof(int64_t)); + cudaMemcpy(gpu_graph_list[i].neighbor_list, + cpu_graph_list[i].neighbor_list, + cpu_graph_list[i].neighbor_size * sizeof(int64_t), + cudaMemcpyHostToDevice); + gpu_graph_list[i].neighbor_size = cpu_graph_list[i].neighbor_size; + } else { + gpu_graph_list[i].neighbor_list = NULL; + gpu_graph_list[i].neighbor_size = 0; + } + } + cudaDeviceSynchronize(); +} +NeighborSampleResult* GpuPsGraphTable::graph_neighbor_sample(int gpu_id, + int64_t* key, + int sample_size, + int len) { + /* + comment 2 + this function shares some kernels with heter_comm_inl.h + arguments definitions: + gpu_id:the id of gpu. + len:how many keys are used,(the length of array key) + sample_size:how many neighbors should be sampled for each node in key. + + the code below shuffle the key array to make the keys + that belong to a gpu-card stay together, + the shuffled result is saved on d_shard_keys, + if ith element in d_shard_keys_ptr is + from jth element in the original key array, then idx[i] = j, + idx could be used to recover the original array. 
+ if keys in range [a,b] belong to ith-gpu, then h_left[i] = a, h_right[i] = + b, + if no keys are allocated for ith-gpu, then h_left[i] == h_right[i] == -1 + + for example, suppose key = [0,1,2,3,4,5,6,7,8], gpu_num = 2 + when we run this neighbor_sample function, + the key is shuffled to [0,2,4,6,8,1,3,5,7] + the first part (0,2,4,6,8) % 2 == 0,thus should be handled by gpu 0, + the rest part should be handled by gpu1, because (1,3,5,7) % 2 == 1, + h_left = [0,5],h_right = [4,8] + + */ + NeighborSampleResult* result = new NeighborSampleResult(sample_size, len); + if (len == 0) { + return result; + } + cudaMalloc((void**)&result->val, len * sample_size * sizeof(int64_t)); + cudaMalloc((void**)&result->actual_sample_size, len * sizeof(int)); + int* actual_sample_size = result->actual_sample_size; + int64_t* val = result->val; + int total_gpu = resource_->total_gpu(); + int dev_id = resource_->dev_id(gpu_id); + platform::CUDAPlace place = platform::CUDAPlace(dev_id); + platform::CUDADeviceGuard guard(dev_id); + auto stream = resource_->local_stream(gpu_id, 0); + + int grid_size = (len - 1) / block_size_ + 1; + + int h_left[total_gpu]; // NOLINT + int h_right[total_gpu]; // NOLINT + + auto d_left = memory::Alloc(place, total_gpu * sizeof(int)); + auto d_right = memory::Alloc(place, total_gpu * sizeof(int)); + int* d_left_ptr = reinterpret_cast(d_left->ptr()); + int* d_right_ptr = reinterpret_cast(d_right->ptr()); + + cudaMemsetAsync(d_left_ptr, -1, total_gpu * sizeof(int), stream); + cudaMemsetAsync(d_right_ptr, -1, total_gpu * sizeof(int), stream); + // + auto d_idx = memory::Alloc(place, len * sizeof(int)); + int* d_idx_ptr = reinterpret_cast(d_idx->ptr()); + + auto d_shard_keys = memory::Alloc(place, len * sizeof(int64_t)); + int64_t* d_shard_keys_ptr = reinterpret_cast(d_shard_keys->ptr()); + auto d_shard_vals = memory::Alloc(place, len * sizeof(int64_t)); + int64_t* d_shard_vals_ptr = reinterpret_cast(d_shard_vals->ptr()); + auto d_shard_actual_sample_size = memory::Alloc(place, len * sizeof(int)); + int* d_shard_actual_sample_size_ptr = + reinterpret_cast(d_shard_actual_sample_size->ptr()); + + split_input_to_shard(key, d_idx_ptr, len, d_left_ptr, d_right_ptr, gpu_id); + + fill_shard_key<<>>(d_shard_keys_ptr, key, + d_idx_ptr, len); + + cudaStreamSynchronize(stream); + + cudaMemcpy(h_left, d_left_ptr, total_gpu * sizeof(int), + cudaMemcpyDeviceToHost); + cudaMemcpy(h_right, d_right_ptr, total_gpu * sizeof(int), + cudaMemcpyDeviceToHost); + + for (int i = 0; i < total_gpu; ++i) { + int shard_len = h_left[i] == -1 ? 0 : h_right[i] - h_left[i] + 1; + if (shard_len == 0) { + continue; + } + /* + comment 3 + shard_len denotes the size of keys on i-th gpu here, + when we sample on i-th gpu, we allocate shard_len * (1 + sample_size) + int64_t units + of memory, we use alloc_mem_i to denote it, the range [0,shard_len) is saved + for the respective nodes' indexes + and acutal sample_size. + with nodes' indexes we could get the nodes to sample. + since size of int64_t is 8 bits, while size of int is 4, + the range of [0,shard_len) contains shard_len * 2 int uinits; + The values of the first half of this range will be updated by + the k-v map on i-th-gpu. + The second half of this range is saved for actual sample size of each node. 
+ For node x, + its sampling result is saved on the range + [shard_len + sample_size * x,shard_len + sample_size * x + + actual_sample_size_of_x) + of alloc_mem_i, actual_sample_size_of_x equals ((int + *)alloc_mem_i)[shard_len + x] + */ + create_storage(gpu_id, i, shard_len * sizeof(int64_t), + shard_len * (1 + sample_size) * sizeof(int64_t)); + } + walk_to_dest(gpu_id, total_gpu, h_left, h_right, d_shard_keys_ptr, NULL); + + for (int i = 0; i < total_gpu; ++i) { + if (h_left[i] == -1) { + continue; + } + // auto& node = path_[gpu_id][i].nodes_.back(); + auto& node = path_[gpu_id][i].nodes_.front(); + cudaStreamSynchronize(node.in_stream); + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + // use the key-value map to update alloc_mem_i[0,shard_len) + tables_[i]->rwlock_->RDLock(); + tables_[i]->get(reinterpret_cast(node.key_storage), + reinterpret_cast(node.val_storage), + h_right[i] - h_left[i] + 1, + resource_->remote_stream(i, gpu_id)); + } + for (int i = 0; i < total_gpu; ++i) { + if (h_left[i] == -1) { + continue; + } + // cudaStreamSynchronize(resource_->remote_stream(i, num)); + // tables_[i]->rwlock_->UNLock(); + platform::CUDADeviceGuard guard(resource_->dev_id(i)); + auto& node = path_[gpu_id][i].nodes_.front(); + auto shard_len = h_right[i] - h_left[i] + 1; + auto graph = gpu_graph_list[i]; + int* res_array = reinterpret_cast(node.val_storage); + int* actual_size_array = res_array + shard_len; + int64_t* sample_array = (int64_t*)(res_array + shard_len * 2); + neighbor_sample_example<<remote_stream(i, gpu_id)>>>( + graph, res_array, actual_size_array, sample_array, sample_size, + shard_len); + } + + for (int i = 0; i < total_gpu; ++i) { + if (h_left[i] == -1) { + continue; + } + cudaStreamSynchronize(resource_->remote_stream(i, gpu_id)); + tables_[i]->rwlock_->UNLock(); + } + // walk_to_src(num, total_gpu, h_left, h_right, d_shard_vals_ptr); + move_neighbor_sample_result_to_source_gpu(gpu_id, total_gpu, sample_size, + h_left, h_right, d_shard_vals_ptr, + d_shard_actual_sample_size_ptr); + + fill_dvalues<<>>( + d_shard_vals_ptr, val, d_shard_actual_sample_size_ptr, actual_sample_size, + d_idx_ptr, sample_size, len); + cudaStreamSynchronize(stream); + for (int i = 0; i < total_gpu; ++i) { + int shard_len = h_left[i] == -1 ? 
0 : h_right[i] - h_left[i] + 1; + if (shard_len == 0) { + continue; + } + destroy_storage(gpu_id, i); + } + return result; +} + +NodeQueryResult* GpuPsGraphTable::graph_node_sample(int gpu_id, + int sample_size) {} + +NodeQueryResult* GpuPsGraphTable::query_node_list(int gpu_id, int start, + int query_size) { + NodeQueryResult* result = new NodeQueryResult(); + if (query_size <= 0) return result; + int& actual_size = result->actual_sample_size; + actual_size = 0; + cudaMalloc((void**)&result->val, query_size * sizeof(int64_t)); + int64_t* val = result->val; + int dev_id = resource_->dev_id(gpu_id); + platform::CUDADeviceGuard guard(dev_id); + std::vector idx, gpu_begin_pos, local_begin_pos, sample_size; + int size = 0; + /* + if idx[i] = a, gpu_begin_pos[i] = p1, + gpu_local_begin_pos[i] = p2; + sample_size[i] = s; + then on gpu a, the nodes of positions [p1,p1 + s) should be returned + and saved from the p2 position on the sample_result array + + for example: + suppose + gpu 0 saves [0,2,4,6,8], gpu1 saves [1,3,5,7] + start = 3, query_size = 5 + we know [6,8,1,3,5] should be returned; + idx = [0,1] + gpu_begin_pos = [3,0] + local_begin_pos = [0,3] + sample_size = [2,3] + + */ + for (int i = 0; i < gpu_graph_list.size() && query_size != 0; i++) { + auto graph = gpu_graph_list[i]; + if (graph.node_size == 0) { + continue; + } + if (graph.node_size + size > start) { + int cur_size = min(query_size, graph.node_size + size - start); + query_size -= cur_size; + idx.emplace_back(i); + gpu_begin_pos.emplace_back(start - size); + local_begin_pos.emplace_back(actual_size); + start += cur_size; + actual_size += cur_size; + sample_size.emplace_back(cur_size); + create_storage(gpu_id, i, 1, cur_size * sizeof(int64_t)); + } + size += graph.node_size; + } + for (int i = 0; i < idx.size(); i++) { + int dev_id_i = resource_->dev_id(idx[i]); + platform::CUDADeviceGuard guard(dev_id_i); + auto& node = path_[gpu_id][idx[i]].nodes_.front(); + int grid_size = (sample_size[i] - 1) / block_size_ + 1; + node_query_example<<remote_stream(idx[i], gpu_id)>>>( + gpu_graph_list[idx[i]], gpu_begin_pos[i], sample_size[i], + (int64_t*)node.val_storage); + } + + for (int i = 0; i < idx.size(); i++) { + cudaStreamSynchronize(resource_->remote_stream(idx[i], gpu_id)); + auto& node = path_[gpu_id][idx[i]].nodes_.front(); + cudaMemcpyAsync(reinterpret_cast(val + local_begin_pos[i]), + node.val_storage, node.val_bytes_len, cudaMemcpyDefault, + node.out_stream); + } + for (int i = 0; i < idx.size(); i++) { + auto& node = path_[gpu_id][idx[i]].nodes_.front(); + cudaStreamSynchronize(node.out_stream); + } + return result; +} +} +}; +#endif diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h index 7b43e68ff0151e..1fca8cdf8bb801 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h @@ -173,16 +173,18 @@ class HeterComm { void walk_to_src(int start_index, int gpu_num, int* h_left, int* h_right, ValType* src_val); - private: + protected: using Table = HashTable; - int block_size_{256}; - float load_factor_{0.75}; std::vector tables_; std::shared_ptr resource_; - CustomGradMerger merger_; - int topo_aware_{0}; std::vector> path_; + float load_factor_{0.75}; + int block_size_{256}; + + private: std::vector storage_; + CustomGradMerger merger_; + int topo_aware_{0}; int feanum_{1800 * 2048}; int multi_node_{0}; std::vector nccl_inner_comms_; diff --git a/paddle/fluid/framework/fleet/heter_ps/test_graph.cu 
b/paddle/fluid/framework/fleet/heter_ps/test_graph.cu new file mode 100644 index 00000000000000..697e0ba2cdf347 --- /dev/null +++ b/paddle/fluid/framework/fleet/heter_ps/test_graph.cu @@ -0,0 +1,112 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" +#include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h" +#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" +#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h" +#include "paddle/fluid/platform/cuda_device_guard.h" + +using namespace paddle::framework; +TEST(TEST_FLEET, graph_comm) { + int gpu_count = 3; + std::vector dev_ids; + dev_ids.push_back(0); + dev_ids.push_back(1); + dev_ids.push_back(2); + std::shared_ptr resource = + std::make_shared(dev_ids); + resource->enable_p2p(); + GpuPsGraphTable g(resource); + int node_count = 10; + std::vector> neighbors(node_count); + int ind = 0; + int64_t node_id = 0; + std::vector graph_list(gpu_count); + while (ind < node_count) { + int neighbor_size = ind + 1; + graph_list[ind % gpu_count].node_size++; + graph_list[ind % gpu_count].neighbor_size += neighbor_size; + while (neighbor_size--) { + neighbors[ind].push_back(node_id++); + } + ind++; + } + std::vector neighbor_offset(gpu_count, 0), node_index(gpu_count, 0); + for (int i = 0; i < graph_list.size(); i++) { + graph_list[i].node_list = new GpuPsGraphNode[graph_list[i].node_size]; + graph_list[i].neighbor_list = new int64_t[graph_list[i].neighbor_size]; + } + for (int i = 0; i < node_count; i++) { + ind = i % gpu_count; + graph_list[ind].node_list[node_index[ind]].node_id = i; + graph_list[ind].node_list[node_index[ind]].neighbor_offset = + neighbor_offset[ind]; + graph_list[ind].node_list[node_index[ind]].neighbor_size = + neighbors[i].size(); + for (auto x : neighbors[i]) { + graph_list[ind].neighbor_list[neighbor_offset[ind]++] = x; + } + node_index[ind]++; + } + g.build_graph_from_cpu(graph_list); + /* + gpu 0: + 0,3,6,9 + gpu 1: + 1,4,7 + gpu 2: + 2,5,8 + + query(2,6) returns nodes [6,9,1,4,7,2] + */ + int64_t answer[6] = {6, 9, 1, 4, 7, 2}; + int64_t *res = new int64_t[6]; + auto query_res = g.query_node_list(0, 2, 6); + cudaMemcpy(res, query_res->val, 48, cudaMemcpyDeviceToHost); + ASSERT_EQ(query_res->actual_sample_size, 6); + for (int i = 0; i < 6; i++) { + ASSERT_EQ(res[i], answer[i]); + } + delete[] res; + delete query_res; + /* + node x's neighbor list = [(1+x)*x/2,(1+x)*x/2 + 1,.....,(1+x)*x/2 + x] + so node 6's neighbors are [21,22...,27] + node 7's neighbors are [28,29,..35] + node 0's neighbors are [0] + query([7,0,6],sample_size=3) should return [28,29,30,0,x,x,21,22,23] + 6 --index-->2 + 0 --index--->0 + 7 --index-->2 + */ + int64_t cpu_key[3] = {7, 0, 6}; + void *key; + cudaMalloc((void **)&key, 3 * sizeof(int64_t)); + cudaMemcpy(key, cpu_key, 3 * sizeof(int64_t), cudaMemcpyHostToDevice); + auto 
neighbor_sample_res = g.graph_neighbor_sample(0, (int64_t *)key, 3, 3); + res = new int64_t[9]; + cudaMemcpy(res, neighbor_sample_res->val, 72, cudaMemcpyDeviceToHost); + int64_t expected_sample_val[] = {28, 29, 30, 0, -1, -1, 21, 22, 23}; + for (int i = 0; i < 9; i++) { + if (expected_sample_val[i] != -1) { + ASSERT_EQ(res[i], expected_sample_val[i]); + } + } + delete[] res; + delete neighbor_sample_res; +} diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index 4a625534909864..aae36cf455dfee 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/pten_utils.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/compat/arg_map_context.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/compat/op_utils.h" @@ -54,7 +55,12 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext { } size_t InputSize(const std::string& name) const override { - return ctx_.Inputs(name).size(); + if (ctx_.HasInputs(name)) { + return ctx_.Inputs(name).size(); + } else if (ctx_.HasInput(name)) { + return 1; + } + return 0; } size_t OutputSize(const std::string& name) const override { @@ -288,6 +294,16 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, auto& attr_names = std::get<1>(signature.args); auto& output_names = std::get<2>(signature.args); + auto kernels_map = + phi::KernelFactory::Instance().SelectKernelMap(signature.name); + if (kernels_map.size() == 0) { + PADDLE_THROW( + platform::errors::Unimplemented("Not find `%s` kernels when construct " + "InferMetaContext.", + signature.name)); + } + auto attr_defs = kernels_map.cbegin()->second.args_def().attribute_defs(); + // TODO(chenweihang): support multiple inputs and outputs later phi::InferMetaContext infer_mete_context; for (auto& in_name : input_names) { @@ -299,9 +315,70 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, } } + for (auto& out_name : output_names) { + if (ctx->HasOutput(out_name)) { + infer_meta_context.EmplaceBackOutput(std::make_shared( + ctx->GetOutputVarPtrs(out_name)[0], ctx->IsRuntime())); + } else { + infer_meta_context.EmplaceBackOutput({nullptr}); + } + } auto attr_reader = ctx->Attrs(); - for (auto& attr_name : attr_names) { - if (ctx->HasAttr(attr_name)) { + for (size_t i = 0; i < attr_names.size(); ++i) { + auto attr_name = attr_names[i]; + if (attr_defs[i].type_index == std::type_index(typeid(phi::ScalarArray))) { + // When attr is a vector_tensor or tensor, transform it to ScalarArray + if (ctx->HasInputs(attr_name) || ctx->HasInput(attr_name)) { + const auto& infershape_inputs = ctx->GetInputVarPtrs(attr_name); + if (ctx->IsRuntime()) { + // If is in runtime, we will get tensor's value for ScalarArray + // and push it into attrs + std::vector vars; + vars.reserve(infershape_inputs.size()); + for (size_t i = 0; i < infershape_inputs.size(); i++) { + vars.push_back(BOOST_GET_CONST(Variable*, infershape_inputs[i])); + } + if (infershape_inputs.size() != 1) { + infer_meta_context.EmplaceBackAttr( + std::move(experimental::MakePtenScalarArrayFromVarList(vars))); + } else { + infer_meta_context.EmplaceBackAttr( + std::move(experimental::MakePtenScalarArrayFromVar(*vars[0]))); + } + } else { + // If is not in runtime, we will set default value(-1) 
for ScalarArray + int64_t num_ele = 1; + std::vector vars; + vars.reserve(infershape_inputs.size()); + for (size_t i = 0; i < infershape_inputs.size(); i++) { + vars.push_back(BOOST_GET_CONST(VarDesc*, infershape_inputs[i])); + } + for (auto& var : vars) { + const auto& tensor_dims = var->GetShape(); + for (size_t i = 0; i < tensor_dims.size(); ++i) { + num_ele *= tensor_dims[i]; + } + } + phi::ScalarArray tensor_attr(std::vector(num_ele, -1)); + tensor_attr.SetFromTensor(true); + infer_meta_context.EmplaceBackAttr(std::move(tensor_attr)); + } + } else if (ctx->HasAttr(attr_name)) { + auto& attr = attr_reader.GetAttr(attr_name); + if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + infer_meta_context.EmplaceBackAttr(std::move( + phi::ScalarArray(BOOST_GET_CONST(std::vector, attr)))); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to ScalarArray when " + "construct KernelContext.", + attr_name)); + } + } + + } else if (ctx->HasAttr(attr_name)) { + // Emplace Back Attr according to the type of attr. auto& attr = attr_reader.GetAttr(attr_name); if (std::type_index(attr.type()) == std::type_index(typeid(bool))) { infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(bool, attr)); @@ -345,17 +422,6 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, "Unsupported attribute type is received when call " "InferShapeFunctor.")); } - } else { - // do nothing - } - } - - for (auto& out_name : output_names) { - if (ctx->HasOutput(out_name)) { - infer_meta_context.EmplaceBackOutput(std::make_shared( - ctx->GetOutputVarPtrs(out_name)[0], ctx->IsRuntime())); - } else { - infer_meta_context.EmplaceBackOutput({nullptr}); } } diff --git a/paddle/fluid/framework/infershape_utils_test.cc b/paddle/fluid/framework/infershape_utils_test.cc index 2554031a91859b..53dcc19fcbae88 100644 --- a/paddle/fluid/framework/infershape_utils_test.cc +++ b/paddle/fluid/framework/infershape_utils_test.cc @@ -23,8 +23,11 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/compat/op_utils.h" +#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/core/kernel_registry.h" namespace paddle { namespace framework { @@ -93,6 +96,17 @@ phi::KernelSignature InferShapeUtilsTestOpArgumentMapping( {}); } +template +void InferShapeUtilsTestKernel( + const Context& dev_ctx, const phi::DenseTensor& x, bool attr1, int attr2, + int64_t attr3, float attr4, const std::string& attr5, + const std::vector& attr6, const std::vector& attr7, + const std::vector& attr8, const std::vector& attr9, + const std::vector& attr10, const std::vector& attr11, + phi::DenseTensor* out) { + VLOG(6) << "Come into InferShapeUtilsTestKernel"; +} + } // namespace framework } // namespace paddle @@ -104,6 +118,9 @@ REGISTER_OPERATOR(infer_shape_utils_test, paddle::framework::InferShapeUtilsTestOpMaker, InferShapeUtilsTestInferShapeFunctor); +PD_REGISTER_KERNEL(infer_shape_utils_test, CPU, ALL_LAYOUT, + paddle::framework::InferShapeUtilsTestKernel, int) {} + TEST(InferShapeUtilsTest, ALL) { paddle::framework::ProgramDesc prog; paddle::framework::proto::BlockDesc proto_block; diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 0e1e572a51f7fc..dad5358590cb14 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -147,7 +147,7 @@ if(WITH_IPU) pass_library(ipu_runtime_replacer_pass base DIR ipu) pass_library(inference_process_pass base DIR ipu) pass_library(inference_postprocess_pass base DIR ipu) - pass_library(popart_canonicalization_pass base DIR ipu) + pass_library(popart_canonicalization_pass base DIR ipu DEPS paddle_ipu) pass_library(ipu_inplace_pass base DIR ipu) pass_library(infer_shape_pass base DIR ipu) pass_library(delete_scale_op_pass base DIR ipu) diff --git a/paddle/fluid/framework/ir/cost_model.h b/paddle/fluid/framework/ir/cost_model.h index 41567df2cb332a..20d9be7e95c39f 100644 --- a/paddle/fluid/framework/ir/cost_model.h +++ b/paddle/fluid/framework/ir/cost_model.h @@ -26,6 +26,7 @@ #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/variant.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc index 3d8d353cbf530e..9fe50deaf2d726 100644 --- a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc +++ b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc @@ -56,7 +56,7 @@ const bool is_regularization_op(const std::string& op_namescope) { } void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { - // 这里构建的 op 符合 popart 的定义, 涉及到的一些值需要在 LowerOptimier 时获得 + // optimizer values will be extracted when lowering optimizer in ipu_backend OpDesc new_op("popart_optimizer", {}, {}, {}); new_op.SetAttr("op_role", 0); new_op.SetAttr("with_lr_sched", false); @@ -86,7 +86,7 @@ void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { bool is_regularization = is_regularization_op(op_namescope); VLOG(10) << "found optimizer releated op: " << op_type; - // initial larning_rate will be set in LowerOptimier + // initial larning_rate will be set in 
ipu_backend set_ops.insert(op_type); if (op_type == "sgd") { auto type = std::string{"sgd"}; diff --git a/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc index 975a4b62cc7088..6806e44f095053 100644 --- a/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc +++ b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h" +#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" @@ -28,11 +29,8 @@ void PopartCanonicalizationPass::ApplyImpl(ir::Graph* graph) const { auto custom_ops = Get>("custom_ops"); std::vector missing_ops; - auto nodes = graph->Nodes(); - for (auto* node : nodes) { - if (!node->IsOp()) { - continue; - } + auto sorted_ops = TopologySortOperations(*graph); + for (auto* node : sorted_ops) { auto* op = node->Op(); auto op_type = op->Type(); diff --git a/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.cc index ca42a613411ba6..d6761d2e82ef30 100644 --- a/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/preln_embedding_eltwise_layernorm_fuse_pass.cc @@ -428,6 +428,19 @@ PrelnEmbeddingEltwiseLayerNormFusePass:: void PrelnEmbeddingEltwiseLayerNormFusePass::ApplyImpl(Graph* graph) const { FusePassBase::Init(name_scope_, graph); + + bool enable_int8 = Get("enable_int8"); + bool use_oss = Get("use_oss"); + bool with_interleaved = Get("with_interleaved"); + bool with_dynamic_shape = Get("with_dynamic_shape"); + if (!(enable_int8 && use_oss && with_interleaved && with_dynamic_shape)) { + VLOG(4) << "preln_embedding_eltwise_layernorm_fuse_pass need: use_trt, " + "enable_int8, " + "use_oss, with_interleaved, with_dynamic_shape. Stop this pass, " + "please reconfig."; + return; + } + int fusion_count = PrelnEmbeddingEltwiseLayerNormFusePass::BuildFusion(graph, name_scope_); if (fusion_count > 0) { diff --git a/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc index 1b7b82cbca9e86..978360d8f0a95b 100644 --- a/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/preln_skip_layernorm_fuse_pass.cc @@ -39,7 +39,6 @@ struct PrelnSkipLayerNorm : public PatternBase { void operator()(PDNode *x, PDNode *y); // declare operator node's name - PATTERN_DECL_NODE(fused_skipe_layernorm); PATTERN_DECL_NODE(elementwise); PATTERN_DECL_NODE(layer_norm); // declare variable node's name @@ -62,8 +61,13 @@ void PrelnSkipLayerNorm::operator()(PDNode *x, PDNode *y) { auto *elementwise_out_var = pattern->NewNode(elementwise_out_repr()) ->assert_is_op_output("elementwise_add") ->assert_is_op_input("layer_norm", "X") - ->assert_is_op_input("elementwise_add", "Y"); - + ->assert_more([](Node *x) { + if (x->outputs.size() == 2) { + return true; + } else { + return false; + } + }); // Add links for elementwise_add op. 
elementwise->LinksFrom({x, y}).LinksTo({elementwise_out_var}); @@ -104,6 +108,18 @@ void PrelnSkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::PreconditionNotMet("graph should not be null.")); FusePassBase::Init("preln_skip_layernorm_fuse", graph); + bool enable_int8 = Get("enable_int8"); + bool use_oss = Get("use_oss"); + bool with_interleaved = Get("with_interleaved"); + bool with_dynamic_shape = Get("with_dynamic_shape"); + if (!(enable_int8 && use_oss && with_interleaved && with_dynamic_shape)) { + VLOG(4) << "preln_skip_layernorm_fuse_pass need: use_trt, enable_int8, " + "use_oss, " + "with_interleaved, with_dynamic_shape. Stop this pass, please " + "reconfig. "; + return; + } + int found_subgraph_count = 0; GraphPatternDetector gpd; diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc index db194d59d37baf..bfa14d9296b26e 100644 --- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc @@ -39,7 +39,6 @@ struct SkipLayerNorm : public PatternBase { PDNode *operator()(PDNode *x, PDNode *y); // declare operator node's name - PATTERN_DECL_NODE(fused_skipe_layernorm); PATTERN_DECL_NODE(elementwise); PATTERN_DECL_NODE(layer_norm); // declare variable node's name @@ -59,9 +58,10 @@ PDNode *SkipLayerNorm::operator()(PDNode *x, PDNode *y) { y->assert_is_op_input("elementwise_add", "Y"); auto *elementwise = pattern->NewNode(elementwise_repr())->assert_is_op("elementwise_add"); - auto *elementwise_out_var = pattern->NewNode(elementwise_out_repr()) - ->AsOutput() - ->assert_is_op_output("elementwise_add"); + auto *elementwise_out_var = + pattern->NewNode(elementwise_out_repr()) + ->AsOutput() + ->assert_is_only_output_of_op("elementwise_add"); // Add links for elementwise_add op. elementwise->LinksFrom({x, y}).LinksTo({elementwise_out_var}); diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index a73aeddae87857..766a3b9e495d52 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -388,7 +388,9 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { : global_scope_->GetMutableScope(); auto op_with_kernel = dynamic_cast(op); { - platform::RecordEvent infershape_event("InferShape"); + platform::RecordEvent infershape_event( + "InferShape", platform::TracerEventType::OperatorInner, 1, + platform::EventRole::kInnerOp); // If it is OperatorBase, InferShape do nothing. 
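The RecordEvent changes in this file follow the pattern used throughout the patch: the event is a scoped object, so the profiled region is whatever executes before it goes out of scope, and the extra arguments classify the event for the new profiler. A minimal usage sketch, with the argument order taken from the call sites in this diff (the stage name is illustrative, not an existing event):

#include "paddle/fluid/platform/profiler/event_tracing.h"

{
  // Starts an event named "MyStage" of type UserDefined at verbosity level 2;
  // the event is closed when record_event is destroyed at the end of the scope.
  platform::RecordEvent record_event(
      "MyStage", platform::TracerEventType::UserDefined, 2);
  // ... code to be profiled ...
}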
if (op_with_kernel != nullptr) op_with_kernel->Info().infer_shape_( @@ -408,7 +410,9 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { } } { - platform::RecordEvent compute_event("Compute"); + platform::RecordEvent compute_event( + "Compute", platform::TracerEventType::OperatorInner, 1, + platform::EventRole::kInnerOp); if (op_with_kernel == nullptr) { instr_node.OpBase()->Run(*local_scope, place_); } else { diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index c1d449d30205e2..2c3359ffa8e46f 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -35,8 +35,8 @@ USE_OP(sigmoid); USE_OP(tanh); USE_OP(elementwise_mul); USE_OP(softmax_with_cross_entropy); -USE_OP(reduce_mean); -USE_OP(reduce_sum); +USE_OP_ITSELF(reduce_mean); +USE_OP_ITSELF(reduce_sum); USE_OP(reduce_sum_grad); USE_OP(reduce_mean_grad); USE_OP_ITSELF(reshape2_grad); diff --git a/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h index 2ad76562c15dd8..7b3916bafc93ed 100644 --- a/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h +++ b/paddle/fluid/framework/new_executor/workqueue/nonblocking_threadpool.h @@ -408,7 +408,8 @@ class ThreadPoolTempl { ec_.Notify(true); return false; } - platform::RecordEvent("SleepWaitForWork"); + platform::RecordEvent("SleepWaitForWork", + platform::TracerEventType::UserDefined, 2); ec_.CommitWait(waiter); blocked_--; return true; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 1a826f6bdd5e73..5b913ff2d21de5 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -39,6 +39,7 @@ limitations under the License. */ #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/fluid/platform/event.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" diff --git a/paddle/fluid/framework/ps_gpu_trainer.cc b/paddle/fluid/framework/ps_gpu_trainer.cc index 4d34ba85517e16..0705f658ff5fe1 100644 --- a/paddle/fluid/framework/ps_gpu_trainer.cc +++ b/paddle/fluid/framework/ps_gpu_trainer.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include #include #include @@ -20,6 +21,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/device_worker_factory.h" +#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/framework/trainer.h" #if (defined PADDLE_WITH_NCCL || defined PADDLE_WITH_RCCL) && \ (defined PADDLE_WITH_PSLIB) @@ -44,6 +46,164 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc, dense_grad_names_[table_id][j] = table.dense_grad_name(j); } } + // add for hbmps optimizer config + auto fleet_desc_str = trainer_desc.fleet_desc(); + google::protobuf::TextFormat::ParseFromString(fleet_desc_str, &_ps_param); + auto sparse_table = + _ps_param.server_param().downpour_server_param().downpour_table_param(0); + auto sparse_table_accessor = sparse_table.accessor(); + auto sparse_table_accessor_parameter = + sparse_table_accessor.downpour_accessor_param(); + auto accessor_class = sparse_table_accessor.accessor_class(); + // gpups' sparse table optimizer config + // now only support single sparse table + // auto sparse_table = param_.sparse_table(0); + std::unordered_map config; + if (accessor_class == "DownpourFeatureValueAccessor" || + accessor_class == "DownpourCtrAccessor" || + accessor_class == "DownpourCtrDoubleAccessor") { + config["nonclk_coeff"] = sparse_table_accessor_parameter.nonclk_coeff(); + config["clk_coeff"] = sparse_table_accessor_parameter.click_coeff(); + config["learning_rate"] = + sparse_table_accessor.sparse_sgd_param().learning_rate(); + config["initial_g2sum"] = + sparse_table_accessor.sparse_sgd_param().initial_g2sum(); + config["initial_range"] = + sparse_table_accessor.sparse_sgd_param().initial_range(); + if (sparse_table_accessor.sparse_sgd_param().weight_bounds_size() == 2) { + config["min_bound"] = + sparse_table_accessor.sparse_sgd_param().weight_bounds()[0]; + config["max_bound"] = + sparse_table_accessor.sparse_sgd_param().weight_bounds()[1]; + } + config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold(); + } else if (accessor_class == "DownpourSparseValueAccessor") { + auto optimizer_name = sparse_table_accessor.sparse_commonsgd_param().name(); + if (optimizer_name == "naive") { + config["learning_rate"] = sparse_table_accessor.sparse_commonsgd_param() + .naive() + .learning_rate(); + config["initial_range"] = sparse_table_accessor.sparse_commonsgd_param() + .naive() + .initial_range(); + if (sparse_table_accessor.sparse_commonsgd_param() + .naive() + .weight_bounds_size() == 2) { + config["min_bound"] = sparse_table_accessor.sparse_commonsgd_param() + .naive() + .weight_bounds()[0]; + config["max_bound"] = sparse_table_accessor.sparse_commonsgd_param() + .naive() + .weight_bounds()[1]; + } + } else if (optimizer_name == "adagrad") { + config["learning_rate"] = sparse_table_accessor.sparse_commonsgd_param() + .adagrad() + .learning_rate(); + config["initial_range"] = sparse_table_accessor.sparse_commonsgd_param() + .adagrad() + .initial_range(); + config["initial_g2sum"] = sparse_table_accessor.sparse_commonsgd_param() + .adagrad() + .initial_g2sum(); + if (sparse_table_accessor.sparse_commonsgd_param() + .adagrad() + .weight_bounds_size() == 2) { + config["min_bound"] = sparse_table_accessor.sparse_commonsgd_param() + .adagrad() + .weight_bounds()[0]; + config["max_bound"] = sparse_table_accessor.sparse_commonsgd_param() + .adagrad() + .weight_bounds()[1]; + } + } else if (optimizer_name == "adam") { + config["learning_rate"] = + 
sparse_table_accessor.sparse_commonsgd_param().adam().learning_rate(); + config["initial_range"] = + sparse_table_accessor.sparse_commonsgd_param().adam().initial_range(); + if (sparse_table_accessor.sparse_commonsgd_param() + .adam() + .weight_bounds_size() == 2) { + config["min_bound"] = sparse_table_accessor.sparse_commonsgd_param() + .adam() + .weight_bounds()[0]; + config["max_bound"] = sparse_table_accessor.sparse_commonsgd_param() + .adam() + .weight_bounds()[1]; + } + } + } else if (accessor_class == "DownpourUnitAccessor" || + accessor_class == "DownpourDoubleUnitAccessor") { + config["nonclk_coeff"] = sparse_table_accessor_parameter.nonclk_coeff(); + config["clk_coeff"] = sparse_table_accessor_parameter.click_coeff(); + auto optimizer_name = sparse_table_accessor.embedx_sgd_param().name(); + if (optimizer_name == "naive") { + config["mf_learning_rate"] = + sparse_table_accessor.embedx_sgd_param().naive().learning_rate(); + config["mf_initial_range"] = + sparse_table_accessor.embedx_sgd_param().naive().initial_range(); + if (sparse_table_accessor.embedx_sgd_param() + .naive() + .weight_bounds_size() == 2) { + config["mf_min_bound"] = + sparse_table_accessor.embedx_sgd_param().naive().weight_bounds()[0]; + config["mf_max_bound"] = + sparse_table_accessor.embedx_sgd_param().naive().weight_bounds()[1]; + } + } else if (optimizer_name == "adagrad") { + config["mf_learning_rate"] = + sparse_table_accessor.embedx_sgd_param().adagrad().learning_rate(); + config["mf_initial_range"] = + sparse_table_accessor.embedx_sgd_param().adagrad().initial_range(); + config["mf_initial_g2sum"] = + sparse_table_accessor.embedx_sgd_param().adagrad().initial_g2sum(); + if (sparse_table_accessor.embedx_sgd_param() + .adagrad() + .weight_bounds_size() == 2) { + config["mf_min_bound"] = sparse_table_accessor.embedx_sgd_param() + .adagrad() + .weight_bounds()[0]; + config["mf_max_bound"] = sparse_table_accessor.embedx_sgd_param() + .adagrad() + .weight_bounds()[1]; + } + } else if (optimizer_name == "std_adagrad") { + config["mf_learning_rate"] = + sparse_table_accessor.embedx_sgd_param().adagrad().learning_rate(); + config["mf_initial_range"] = + sparse_table_accessor.embedx_sgd_param().adagrad().initial_range(); + config["mf_initial_g2sum"] = + sparse_table_accessor.embedx_sgd_param().adagrad().initial_g2sum(); + if (sparse_table_accessor.embedx_sgd_param() + .adagrad() + .weight_bounds_size() == 2) { + config["mf_min_bound"] = sparse_table_accessor.embedx_sgd_param() + .adagrad() + .weight_bounds()[0]; + config["mf_max_bound"] = sparse_table_accessor.embedx_sgd_param() + .adagrad() + .weight_bounds()[1]; + } + } else if (optimizer_name == "adam") { + config["mf_learning_rate"] = + sparse_table_accessor.embedx_sgd_param().adam().learning_rate(); + config["mf_initial_range"] = + sparse_table_accessor.embedx_sgd_param().adam().initial_range(); + if (sparse_table_accessor.embedx_sgd_param() + .adam() + .weight_bounds_size() == 2) { + config["mf_min_bound"] = + sparse_table_accessor.embedx_sgd_param().adam().weight_bounds()[0]; + config["mf_max_bound"] = + sparse_table_accessor.embedx_sgd_param().adam().weight_bounds()[1]; + } + } + config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold(); + } + + auto ps_gpu_wrapper = paddle::framework::PSGPUWrapper::GetInstance(); + ps_gpu_wrapper->InitializeGPUServer(config); + scale_datanorm_ = trainer_desc.scale_datanorm(); int place_num = trainer_desc.worker_places_size(); const std::vector readers = diff --git a/paddle/fluid/framework/tensor_util.cc 
b/paddle/fluid/framework/tensor_util.cc index 83161fa679014a..1eb5727298c39a 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -23,7 +23,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/phi/core/dense_tensor.h" diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index c993895a9f0ea1..85eef89ee27f66 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -36,6 +36,10 @@ limitations under the License. */ #include "paddle/fluid/operators/reader/blocking_queue.h" #include "paddle/phi/backends/dynload/port.h" +#ifdef PADDLE_WITH_PSLIB +#include +#endif + namespace paddle { namespace framework { @@ -287,6 +291,9 @@ class PSGPUTrainer : public TrainerBase { int mpi_rank_; int mpi_size_; int dump_file_num_; + + // _ps_param for gpups optimizer config + ::paddle::PSParameter _ps_param; }; #endif diff --git a/paddle/fluid/framework/trainer_desc.proto b/paddle/fluid/framework/trainer_desc.proto index 96d312437b34cf..6fe33545aa22d3 100644 --- a/paddle/fluid/framework/trainer_desc.proto +++ b/paddle/fluid/framework/trainer_desc.proto @@ -66,6 +66,9 @@ message TrainerDesc { repeated int32 trainers = 35; optional int32 trainer_id = 36; + // add for gpu + optional string fleet_desc = 37; + // device worker parameters optional HogwildWorkerParameter hogwild_param = 101; optional DownpourWorkerParameter downpour_param = 103; diff --git a/paddle/fluid/framework/var_type_inference.h b/paddle/fluid/framework/var_type_inference.h index f649c9388f0f65..945b68438e1e70 100644 --- a/paddle/fluid/framework/var_type_inference.h +++ b/paddle/fluid/framework/var_type_inference.h @@ -69,6 +69,12 @@ class InferVarTypeContext { return op_->Inputs().at(name).size(); } + virtual size_t OutputSize(const std::string& name) const { + PADDLE_ENFORCE_NOT_NULL( + op_, platform::errors::PreconditionNotMet("op_ should not be null")); + return op_->Outputs().at(name).size(); + } + virtual const std::string& InputVarName(const std::string& name, const int index = 0) const { PADDLE_ENFORCE_NOT_NULL( diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index 13b2982b533904..97a188e5c9c271 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -410,7 +410,8 @@ void BasicEngine::Execute() { auto& inplace_grad_name_map = shared_cur_node->InplaceGradNameMap(); for (auto& cur_op : *shared_cur_node) { - platform::RecordEvent op_type_record_event(cur_op.Type()); + platform::RecordEvent op_type_record_event( + cur_op.Type(), platform::TracerEventType::Operator, 1); ++op_num; diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 34c46f79f57fd4..f3bd85ff29221a 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -233,7 +233,8 @@ void VarBase::ClearGradient(bool set_to_zero) { grad_t->mutable_value()->clear(); } } else { - platform::RecordEvent record_event("ClearGradient"); + platform::RecordEvent record_event( + "ClearGradient", platform::TracerEventType::UserDefined, 2); auto* grad_t = grad_var_->MutableVar()->GetMutable(); if (grad_t->IsInitialized()) { diff --git a/paddle/fluid/imperative/partial_grad_engine.cc 
b/paddle/fluid/imperative/partial_grad_engine.cc index e231d3c1801367..f1d0c8afdd50e3 100644 --- a/paddle/fluid/imperative/partial_grad_engine.cc +++ b/paddle/fluid/imperative/partial_grad_engine.cc @@ -30,7 +30,7 @@ #include "paddle/fluid/imperative/op_base.h" #include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/string/string_helper.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc index afe1f92ca03b3b..d05036f7a12ebd 100644 --- a/paddle/fluid/imperative/tests/test_tracer.cc +++ b/paddle/fluid/imperative/tests/test_tracer.cc @@ -590,6 +590,6 @@ TEST(test_tracer, eager_tracer) { USE_OP(mul); USE_OP(mul_grad); -USE_OP(reduce_sum); +USE_OP_ITSELF(reduce_sum); USE_OP(reduce_sum_grad); USE_OP_ITSELF(elementwise_add); diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index a669ff9d5a6910..03811ac778779c 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -22,6 +22,7 @@ #include "paddle/fluid/platform/denormal.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/string/string_helper.h" DECLARE_bool(use_mkldnn); @@ -31,6 +32,8 @@ DECLARE_string(tracer_mkldnn_ops_off); namespace paddle { namespace imperative { +thread_local bool Tracer::enable_program_desc_tracing_ = false; + thread_local bool Tracer::has_grad_ = true; thread_local AmpLevel Tracer::amp_level_ = AmpLevel::O0; @@ -171,7 +174,8 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, const std::map& inplace_map, paddle::framework::AttributeMap* passed_default_attrs_, bool use_default_attr_map) { - platform::RecordEvent op_type_record_event(type); + platform::RecordEvent op_type_record_event( + type, platform::TracerEventType::Operator, 2); platform::ScopedFlushDenormal flush; VLOG(1) << "Trace Op: " << type; if (FLAGS_use_mkldnn) { diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index b508126c367960..73ecbbe6143ca8 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -160,10 +160,11 @@ class Tracer { private: std::unique_ptr basic_engine_; std::unique_ptr program_desc_tracer_; - bool enable_program_desc_tracing_{false}; std::unique_ptr generator_; platform::Place expected_place_; GarbageCollectorMap gcs_; + + static thread_local bool enable_program_desc_tracing_; static thread_local bool has_grad_; static thread_local AmpLevel amp_level_; static thread_local phi::DataType amp_dtype_; diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 837b83004de84e..796c86a3ad1efe 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -54,6 +54,27 @@ void IRPassManager::CreatePasses(Argument *argument, int pass_num = 0; for (const std::string &pass_name : passes) { auto pass = framework::ir::PassRegistry::Instance().Get(pass_name); + pass->Set("use_oss", new bool(argument->tensorrt_use_oss())); + pass->Set("with_interleaved", + new bool(argument->tensorrt_with_interleaved())); + pass->Set("disable_logs", new bool(argument->disable_logs())); + auto 
precision_mode = argument->tensorrt_precision_mode(); + bool enable_int8 = precision_mode == AnalysisConfig::Precision::kInt8; + pass->Set("enable_int8", new bool(enable_int8)); + pass->Set("max_input_shape", new std::map>( + argument->max_input_shape())); + pass->Set("min_input_shape", new std::map>( + argument->min_input_shape())); + pass->Set("optim_input_shape", new std::map>( + argument->optim_input_shape())); + // tuned trt dynamic_shape + pass->Set("trt_tuned_dynamic_shape", + new bool(argument->tensorrt_tuned_dynamic_shape())); + bool with_dynamic_shape = (argument->max_input_shape().size() > 0 && + argument->min_input_shape().size() > 0 && + argument->optim_input_shape().size() > 0) || + argument->tensorrt_tuned_dynamic_shape(); + pass->Set("with_dynamic_shape", new bool(with_dynamic_shape)); if (pass_name == "graph_viz_pass") { std::string optim_cache_dir = argument->optim_cache_dir(); @@ -99,17 +120,9 @@ void IRPassManager::CreatePasses(Argument *argument, new int(argument->tensorrt_min_subgraph_size())); pass->Set("program", new framework::ProgramDesc *(&argument->main_program())); - - auto precision_mode = argument->tensorrt_precision_mode(); - bool enable_int8 = precision_mode == AnalysisConfig::Precision::kInt8; - pass->Set("predictor_id", new int(argument->predictor_id())); bool use_calib_mode = argument->tensorrt_use_calib_mode(); - pass->Set("enable_int8", new bool(enable_int8)); pass->Set("use_calib_mode", new bool(use_calib_mode)); - pass->Set("use_oss", new bool(argument->tensorrt_use_oss())); - pass->Set("with_interleaved", - new bool(argument->tensorrt_with_interleaved())); pass->Set("precision_mode", new AnalysisConfig::Precision(precision_mode)); @@ -161,22 +174,8 @@ void IRPassManager::CreatePasses(Argument *argument, // tuned trt dynamic_shape pass->Set("trt_shape_range_info_path", new std::string(argument->tensorrt_shape_range_info_path())); - pass->Set("trt_tuned_dynamic_shape", - new bool(argument->tensorrt_tuned_dynamic_shape())); pass->Set("trt_allow_build_at_runtime", new bool(argument->tensorrt_allow_build_at_runtime())); - pass->Set("max_input_shape", new std::map>( - argument->max_input_shape())); - pass->Set("min_input_shape", new std::map>( - argument->min_input_shape())); - pass->Set("optim_input_shape", - new std::map>( - argument->optim_input_shape())); - bool with_dynamic_shape = (argument->max_input_shape().size() > 0 && - argument->min_input_shape().size() > 0 && - argument->optim_input_shape().size() > 0) || - argument->tensorrt_tuned_dynamic_shape(); - pass->Set("with_dynamic_shape", new bool(with_dynamic_shape)); pass->Set("trt_disabled_ops", new std::vector( argument->tensorrt_disabled_ops())); pass->Set("trt_use_dla", new bool(argument->tensorrt_use_dla())); @@ -192,14 +191,15 @@ void IRPassManager::CreatePasses(Argument *argument, new framework::ProgramDesc *(&argument->main_program())); } if (pass_name == "lite_subgraph_pass") { - bool enable_int8 = + bool lite_enable_int8 = argument->lite_precision_mode() == AnalysisConfig::Precision::kInt8; pass->Set("program", new framework::ProgramDesc *(&argument->main_program())); pass->Set("lite_ops_filter", new std::vector(argument->lite_ops_filter())); pass->Set("predictor_id", new int(argument->predictor_id())); - pass->Set("enable_int8", new bool(enable_int8)); + pass->Erase("enable_int8"); + pass->Set("enable_int8", new bool(lite_enable_int8)); pass->Set("use_gpu", new bool(argument->use_gpu())); pass->Set("zero_copy", new bool(argument->lite_zero_copy())); pass->Set("use_xpu", new 
bool(argument->use_xpu())); @@ -236,7 +236,6 @@ void IRPassManager::CreatePasses(Argument *argument, new std::vector( argument->nnadapter_model_cache_token())); } - disable_logs_ = argument->disable_logs(); if (pass_name == "fc_fuse_pass") { pass->Set("use_gpu", new bool(argument->use_gpu())); bool fc_mkldnn_pass = 0; @@ -248,9 +247,6 @@ void IRPassManager::CreatePasses(Argument *argument, bool use_fc_padding = !fc_mkldnn_pass && argument->use_fc_padding(); pass->Set("use_fc_padding", new bool(use_fc_padding)); } - - pass->Set("disable_logs", new bool(disable_logs_)); - pre_pass = pass_name; passes_.emplace_back(std::move(pass)); diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index a3812244fbe224..6c005e4b2d6e4d 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -592,6 +592,14 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetModelParamsPath(config_.params_file()); } + argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_); + argument_.SetTensorRtUseOSS(config_.trt_use_oss_); + argument_.SetTensorRtWithInterleaved(config_.trt_with_interleaved_); + argument_.SetMinInputShape(config_.min_input_shape_); + argument_.SetMaxInputShape(config_.max_input_shape_); + argument_.SetOptimInputShape(config_.optim_input_shape_); + argument_.SetTensorRtTunedDynamicShape( + config_.tuned_tensorrt_dynamic_shape()); if (config_.use_gpu() && config_.tensorrt_engine_enabled()) { LOG(INFO) << "TensorRT subgraph engine is enabled"; argument_.SetUseTensorRT(true); @@ -601,18 +609,10 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetTensorRtDisabledOPs(config_.trt_disabled_ops_); argument_.SetTensorRtUseDLA(config_.trt_use_dla_); argument_.SetTensorRtDLACore(config_.trt_dla_core_); - argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_); argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_); argument_.SetTensorRtUseCalibMode(config_.trt_use_calib_mode_); - argument_.SetTensorRtUseOSS(config_.trt_use_oss_); - argument_.SetTensorRtWithInterleaved(config_.trt_with_interleaved_); - argument_.SetMinInputShape(config_.min_input_shape_); - argument_.SetMaxInputShape(config_.max_input_shape_); - argument_.SetOptimInputShape(config_.optim_input_shape_); argument_.SetCloseTrtPluginFp16(config_.disable_trt_plugin_fp16_); argument_.SetTensorRtShapeRangeInfoPath(config_.shape_range_info_path()); - argument_.SetTensorRtTunedDynamicShape( - config_.tuned_tensorrt_dynamic_shape()); argument_.SetTensorRtAllowBuildAtRuntime( config_.trt_allow_build_at_runtime()); argument_.SetTensorRtUseInspector(config_.trt_use_inspector_); diff --git a/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc index a58de101053b38..daa3b186ab4c4c 100644 --- a/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/preln_emb_eltwise_layernorm.cc @@ -51,21 +51,11 @@ class PrelnEmbEltwiseLayerNormOpConverter : public OpConverter { auto pos_emb_name = op_desc.Input("PosEmbedding").front(); auto sent_emb_name = op_desc.Input("SentEmbedding").front(); - std::vector id_names; std::vector emb_names; - - id_names = - std::vector{word_id_name, pos_id_name, sent_id_name}; emb_names = std::vector{word_emb_name, pos_emb_name, sent_emb_name}; - int input_num = id_names.size(); - - // Declare inputs - std::vector 
input_ids; - for (int i = 0; i < input_num; i++) { - input_ids.push_back(engine_->GetITensor(id_names[i])); - } + int input_num = emb_names.size(); // input_embs[0]: word_embedding // input_embs[1]: pos_embedding @@ -126,7 +116,7 @@ class PrelnEmbEltwiseLayerNormOpConverter : public OpConverter { {"bert_embeddings_position_embeddings", input_embs[1], nvinfer1::PluginFieldType::kFLOAT32, static_cast(emb_sizes[1])}, - {"output_int8", &output_int8, nvinfer1::PluginFieldType::kINT32, 1}, + {"output_fp16", &output_int8, nvinfer1::PluginFieldType::kINT32, 1}, }; nvinfer1::PluginFieldCollection* plugin_ptr = @@ -156,7 +146,7 @@ class PrelnEmbEltwiseLayerNormOpConverter : public OpConverter { shuffle_layer->setReshapeDimensions(shape_dim); shuffle_layer->setName( ("PrelnEmbeltwise_Shuffle_reshape (Output: max_seqlen " + - op_desc.Output("Out")[0] + ")") + op_desc.Output("Out_0")[0] + ")") .c_str()); engine_->SetTensorDynamicRange(shuffle_layer->getOutput(0), 1.0f); plugin_inputs.emplace_back( @@ -170,7 +160,7 @@ class PrelnEmbEltwiseLayerNormOpConverter : public OpConverter { auto plugin_layer = engine_->network()->addPluginV2( plugin_inputs.data(), plugin_inputs.size(), *plugin_obj); plugin_layer->setName(("CustomPrelnEmbLayerNormPluginDynamic_V3(Output: " + - op_desc.Output("Out")[0] + ")") + op_desc.Output("Out_0")[0] + ")") .c_str()); free(plugin_ptr); float out_0_scale = diff --git a/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc index 521e04b8974fd5..d9eca65fc45dcd 100644 --- a/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/preln_skip_layernorm.cc @@ -92,8 +92,10 @@ class PrelnSkipLayerNormOpConverter : public OpConverter { "fail to add CustomPrelnSkipLayerNormPluginDynamic layer")); layer = plugin_layer; - auto output_name = op_desc.Output("Out")[0]; - RreplenishLayerAndOutput(layer, "preln_skip_layernorm", {output_name}, + std::vector output_names; + output_names.push_back(op_desc.Output("Out_0")[0]); + output_names.push_back(op_desc.Output("Out_1")[0]); + RreplenishLayerAndOutput(layer, "preln_skip_layernorm", {output_names}, test_mode); #else PADDLE_THROW(platform::errors::Fatal( diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h index 6b2925a068bbd2..9210cd48d078b3 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h @@ -24,7 +24,7 @@ #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" namespace nvinfer1 { class ITensor; diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index cc168ed793ba2a..77fab0a86f8330 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -38,7 +38,7 @@ #include "paddle/fluid/inference/tests/api/config_printer.h" #include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/utils/benchmark.h" -#include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" DEFINE_string(model_name, "", "model name"); DEFINE_string(infer_model, "", "model path"); diff --git 
a/paddle/fluid/operators/abs_op.cc b/paddle/fluid/operators/abs_op.cc index 149a87fe32da16..c28026a4bd43aa 100644 --- a/paddle/fluid/operators/abs_op.cc +++ b/paddle/fluid/operators/abs_op.cc @@ -16,7 +16,10 @@ #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -27,16 +30,6 @@ namespace operators { class AbsOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "abs"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "abs"); - - auto in_dims = ctx->GetInputDim("X"); - - ctx->SetOutputDim("Out", in_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class AbsOpMaker : public framework::OpProtoAndCheckerMaker { @@ -148,11 +141,15 @@ class AbsDoubleGradOp : public framework::OperatorWithKernel { } // namespace operators } // namespace paddle +DELCARE_INFER_SHAPE_FUNCTOR(abs, AbsInferShapeFunctor, + PT_INFER_META(phi::UnchangedInferMeta)); + namespace ops = paddle::operators; REGISTER_OPERATOR(abs, ops::AbsOp, ops::AbsOpMaker, ops::AbsGradMaker, - ops::AbsGradMaker); + ops::AbsGradMaker, + AbsInferShapeFunctor); REGISTER_OPERATOR(abs_grad, ops::AbsGradOp, ops::AbsDoubleGradMaker, diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.cc b/paddle/fluid/operators/controlflow/conditional_block_op.cc index 31ed10a71201c6..6bf419c47a5669 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.cc +++ b/paddle/fluid/operators/controlflow/conditional_block_op.cc @@ -272,8 +272,18 @@ class ConditionalBlockGradInferVarType : public framework::VarTypeInference { // Input is {Tensor, LoDTensorArray}, we need synchronous the Input's // VarType into Input@GRAD to avoid generating {Tensor, Tensor} as // Input@GRAD. 
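The abs_op change above is one instance of the recurring migration in this patch: the hand-written InferShape method is removed and an InferShapeFunctor that forwards to a phi InferMeta function is registered with the operator instead. A generic sketch of the pattern for a hypothetical my_op (macro spellings are as they appear in this diff; huber_loss below follows the same steps with phi::HuberLossInferMeta):

#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/unary.h"

// Declare a functor that reuses an existing phi InferMeta function ...
DELCARE_INFER_SHAPE_FUNCTOR(my_op, MyOpInferShapeFunctor,
                            PT_INFER_META(phi::UnchangedInferMeta));

namespace ops = paddle::operators;
// ... and append it to the operator registration so shape inference is
// driven by phi instead of a per-op InferShape override.
REGISTER_OPERATOR(my_op, ops::MyOp, ops::MyOpMaker,
                  ops::MyOpGradMaker<paddle::framework::OpDesc>,
                  ops::MyOpGradMaker<paddle::imperative::OpBase>,
                  MyOpInferShapeFunctor);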
- ctx->SyncTypeAndDataType(ConditionalOp::kInputs, - framework::GradVarName(ConditionalOp::kInputs)); + auto input_size = ctx->InputSize(ConditionalOp::kInputs); + auto output_size = + ctx->OutputSize(framework::GradVarName(ConditionalOp::kInputs)); + PADDLE_ENFORCE_EQ(input_size, output_size, + platform::errors::InvalidArgument( + "input_size and output_size should be equal for " + "conditional_block_grad_op.")); + for (size_t i = 0; i < output_size; ++i) { + ctx->SyncTypeAndDataType(ConditionalOp::kInputs, + framework::GradVarName(ConditionalOp::kInputs), + i); + } } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cc b/paddle/fluid/operators/elementwise/elementwise_max_op.cc index cc27bab7200575..91da732ef0d3df 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cc @@ -124,13 +124,17 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseMaxKernel, ops::ElementwiseMaxKernel, ops::ElementwiseMaxKernel, - ops::ElementwiseMaxKernel); + ops::ElementwiseMaxKernel, + ops::ElementwiseMaxKernel); REGISTER_OP_CPU_KERNEL( elementwise_max_grad, ops::ElementwiseMaxGradKernel, ops::ElementwiseMaxGradKernel, ops::ElementwiseMaxGradKernel, - ops::ElementwiseMaxGradKernel); + ops::ElementwiseMaxGradKernel, + ops::ElementwiseMaxGradKernel); REGISTER_OP_VERSION(elementwise_max) .AddCheckpoint( diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cu b/paddle/fluid/operators/elementwise/elementwise_max_op.cu index 7433c505f472a2..123332a4a23de5 100644 --- a/paddle/fluid/operators/elementwise/elementwise_max_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cu @@ -69,6 +69,8 @@ REGISTER_OP_CUDA_KERNEL( elementwise_max, ops::ElementwiseMaxKernel, + ops::ElementwiseMaxKernel, ops::ElementwiseMaxKernel, ops::ElementwiseMaxKernel, ops::ElementwiseMaxKernel, @@ -77,6 +79,8 @@ REGISTER_OP_CUDA_KERNEL( elementwise_max_grad, ops::ElementwiseMaxGradKernel, + ops::ElementwiseMaxGradKernel, ops::ElementwiseMaxGradKernel, ops::ElementwiseMaxGradKernel, ops::ElementwiseMaxGradKernel, diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index fa9fe9d8602012..21d827c79200c4 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -21,6 +21,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/distribution_helper.h" #include "paddle/fluid/operators/fill_constant_op.h" +#include "paddle/fluid/operators/index_impl.cu.h" DECLARE_bool(use_curand); @@ -65,7 +66,6 @@ class GPUGaussianRandomKernel : public framework::OpKernel { } T mean = static_cast(context.Attr("mean")); T std = static_cast(context.Attr("std")); - thrust::counting_iterator index_sequence_begin(0); auto shape = GetShape(context); tensor->Resize(shape); @@ -88,15 +88,13 @@ class GPUGaussianRandomKernel : public framework::OpKernel { } else { auto seed_offset = gen_cuda->IncrementOffset(1); int64_t gen_offset = size * seed_offset.second; - thrust::transform( - index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - GaussianGenerator(mean, std, seed_offset.first, gen_offset)); + auto func = + GaussianGenerator(mean, std, seed_offset.first, gen_offset); + IndexKernel>(dev_cxt, tensor, func); } } else { - thrust::transform(index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - GaussianGenerator(mean, std, seed)); + auto func = GaussianGenerator(mean, std, seed); + IndexKernel>(dev_cxt, tensor, func); } } }; @@ -116,23 +114,22 @@ class GPUGaussianRandomBatchSizeLikeKernel : public framework::OpKernel { } T mean = static_cast(context.Attr("mean")); T std = static_cast(context.Attr("std")); - thrust::counting_iterator index_sequence_begin(0); int64_t size = tensor->numel(); int device_id = context.GetPlace().GetDeviceId(); auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); + auto& dev_cxt = + context.template device_context(); if (gen_cuda->GetIsInitPy() && seed_flag) { auto seed_offset = gen_cuda->IncrementOffset(1); int64_t gen_offset = size * seed_offset.second; - thrust::transform(index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - GaussianGenerator(mean, std, seed_offset.first, - seed_offset.second)); + auto func = GaussianGenerator(mean, std, seed_offset.first, + seed_offset.second); + IndexKernel>(dev_cxt, tensor, func); } else { - thrust::transform(index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - GaussianGenerator(mean, std, seed)); + auto func = GaussianGenerator(mean, std, seed); + IndexKernel>(dev_cxt, tensor, func); } } }; diff --git a/paddle/fluid/operators/huber_loss_op.cc b/paddle/fluid/operators/huber_loss_op.cc index 041f7487fd2575..3915ce5809c394 100644 --- a/paddle/fluid/operators/huber_loss_op.cc +++ b/paddle/fluid/operators/huber_loss_op.cc @@ -12,47 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/huber_loss_op.h" #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { class HuberLossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "HuberLoss"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "HuberLoss"); - - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - - PADDLE_ENFORCE_EQ(x_dims.size(), y_dims.size(), - platform::errors::InvalidArgument( - "Input(input) rank and Input(label) rank should be " - "same, but received input rank(%d) != label rank(%d)", - x_dims.size(), y_dims.size())); - - bool contain_unknown_dim = - phi::contain_unknown_dim(x_dims) || phi::contain_unknown_dim(y_dims); - if (ctx->IsRuntime() || !contain_unknown_dim) { - PADDLE_ENFORCE_EQ( - x_dims, y_dims, - platform::errors::InvalidArgument( - "The Input(input) and Input(label) should have the same " - "shape, but received input shape [%s] != label shape [%s]", - x_dims, y_dims)); - } - - auto out_dims = y_dims; - ctx->SetOutputDim("Residual", out_dims); - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", "Out"); - } }; template @@ -139,14 +112,11 @@ class HuberLossGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(huber_loss, HuberLossInferShapeFunctor, + PT_INFER_META(phi::HuberLossInferMeta)); + REGISTER_OPERATOR(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker, ops::HuberLossGradOpMaker, - ops::HuberLossGradOpMaker); + ops::HuberLossGradOpMaker, + HuberLossInferShapeFunctor); REGISTER_OPERATOR(huber_loss_grad, ops::HuberLossGradOp); -REGISTER_OP_CPU_KERNEL( - huber_loss, ops::HuberLossKernel, - ops::HuberLossKernel); -REGISTER_OP_CPU_KERNEL( - huber_loss_grad, - ops::HuberLossGradKernel, - ops::HuberLossGradKernel); diff --git a/paddle/fluid/operators/huber_loss_op.cu b/paddle/fluid/operators/huber_loss_op.cu deleted file mode 100644 index 4ce6856a7eade1..00000000000000 --- a/paddle/fluid/operators/huber_loss_op.cu +++ /dev/null @@ -1,24 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "paddle/fluid/operators/huber_loss_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - huber_loss, - ops::HuberLossKernel, - ops::HuberLossKernel); -REGISTER_OP_CUDA_KERNEL( - huber_loss_grad, - ops::HuberLossGradKernel, - ops::HuberLossGradKernel); diff --git a/paddle/fluid/operators/huber_loss_op.h b/paddle/fluid/operators/huber_loss_op.h deleted file mode 100644 index ebe26f05ab3e47..00000000000000 --- a/paddle/fluid/operators/huber_loss_op.h +++ /dev/null @@ -1,123 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -template -using EigenVector = framework::EigenVector; - -template -struct HuberLossForward { - HOSTDEVICE HuberLossForward(const T& delta) : delta(delta) {} - - HOSTDEVICE T operator()(const T& val) const { - T abs_val = std::abs(val); - if (abs_val <= delta) { - return static_cast(0.5) * val * val; - } else { - return delta * (abs_val - static_cast(0.5) * delta); - } - } - - T delta; -}; - -template -class HuberLossKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in0 = context.Input("X"); - auto* in1 = context.Input("Y"); - auto* out0 = context.Output("Residual"); - auto* out1 = context.Output("Out"); - auto delta = static_cast(context.Attr("delta")); - auto& place = - *context.template device_context().eigen_device(); - - auto x = EigenVector::Flatten(*in0); - auto y = EigenVector::Flatten(*in1); - out0->mutable_data(context.GetPlace()); - auto residual = EigenVector::Flatten(*out0); - residual.device(place) = y - x; - out1->mutable_data(context.GetPlace()); - auto loss = EigenVector::Flatten(*out1); - loss.device(place) = residual.unaryExpr(HuberLossForward(delta)); - } -}; - -template -struct HuberLossBackward { - HOSTDEVICE HuberLossBackward(const T& delta, T sign) - : sign(sign), delta(delta) {} - - HOSTDEVICE T operator()(const T& val) const { - T abs_val = std::abs(val); - if (abs_val <= delta) { - return sign * val; - } else { - if (val > 0) { - return sign * delta; - } else { - return -1 * sign * delta; - } - } - } - - T sign; - T delta; -}; - -template -class HuberLossGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in0 = context.Input("Residual"); - auto* in1 = context.Input(framework::GradVarName("Out")); - auto* out0 = context.Output(framework::GradVarName("X")); - auto* out1 = context.Output(framework::GradVarName("Y")); - auto delta = static_cast(context.Attr("delta")); - auto& place = - *context.template device_context().eigen_device(); - - auto residual = EigenVector::Flatten(*in0); - auto out_grad = EigenVector::Flatten(*in1); - - if (out0) { - out0->mutable_data(context.GetPlace()); - auto x_grad = EigenVector::Flatten(*out0); - x_grad.device(place) = - residual.unaryExpr(HuberLossBackward(delta, -1.0)); - x_grad.device(place) = out_grad * x_grad; - } - - if (out1) { - out1->mutable_data(context.GetPlace()); - auto y_grad = EigenVector::Flatten(*out1); - y_grad.device(place) = - residual.unaryExpr(HuberLossBackward(delta, 1.0)); - y_grad.device(place) = out_grad * y_grad; - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/huber_loss_op_npu.cc b/paddle/fluid/operators/huber_loss_op_npu.cc index 
19ced131c00a2a..6fc6960d3db565 100644 --- a/paddle/fluid/operators/huber_loss_op_npu.cc +++ b/paddle/fluid/operators/huber_loss_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/huber_loss_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/huber_loss_op_xpu.cc b/paddle/fluid/operators/huber_loss_op_xpu.cc index 767ce542736e83..ccddec2779515f 100644 --- a/paddle/fluid/operators/huber_loss_op_xpu.cc +++ b/paddle/fluid/operators/huber_loss_op_xpu.cc @@ -13,8 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU - -#include "paddle/fluid/operators/huber_loss_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/index_impl.cu.h b/paddle/fluid/operators/index_impl.cu.h new file mode 100644 index 00000000000000..bae0d3f569f5f4 --- /dev/null +++ b/paddle/fluid/operators/index_impl.cu.h @@ -0,0 +1,97 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/distribution_helper.h" +#include "paddle/fluid/operators/fill_constant_op.h" +#include "paddle/fluid/platform/aligned_vector.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/primitive/kernel_primitives.h" + +namespace paddle { +namespace operators { + +namespace kps = phi::kps; +template +__global__ void VectorizedIndexKernel(T *out, int numel, int main_offset, + Functor func) { + int data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; + int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; + int args[VecSize]; + T result[VecSize]; + for (; data_offset < main_offset; data_offset += stride) { + kps::InitWithDataIndex(&args[0], data_offset); + kps::ElementwiseUnary(&result[0], &args[0], + func); + kps::WriteData(out + data_offset, &result[0], + BLOCK_NUM_X * VecSize); + } + int num = numel - data_offset; + if (numel > 0) { + kps::InitWithDataIndex(&args[0], data_offset); + kps::ElementwiseUnary(&result[0], &args[0], + func); + kps::WriteData(out + data_offset, &result[0], num); + } +} + +template +void IndexKernel(const KPDevice &dev_ctx, Tensor *out, Functor func) { + int numel = out->numel(); + T *out_data = out->mutable_data(dev_ctx.GetPlace()); + if (numel <= 0) return; + int vec_size = paddle::platform::GetVectorizedSize((out->data())); +#ifdef PADDLE_WITH_XPU_KP + int block = 64; + int grid = 8; + auto stream = dev_ctx.x_context()->xpu_stream; +#else + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, vec_size); + int grid = config.block_per_grid.x; + int block = config.thread_per_block.x; + auto stream = dev_ctx.stream(); +#endif + + int main_offset = (numel / (vec_size * block)) * vec_size * block; + switch (vec_size) { + case 4: + VectorizedIndexKernel<<>>( + out_data, numel, main_offset, func); + break; + case 2: + VectorizedIndexKernel<<>>( + out_data, numel, main_offset, func); + break; + case 1: + VectorizedIndexKernel<<>>( + out_data, numel, main_offset, func); + break; + default: { + PADDLE_THROW(paddle::platform::errors::Unimplemented( + "Unsupported vectorized size: %d !", vec_size)); + break; + } + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc b/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc index ddc6287011bcff..b024fe76b0972a 100644 --- a/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc +++ b/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc @@ -17,6 +17,9 @@ limitations under the License. 
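The new index_impl.cu.h helper above is generic over an element type and an index functor, but in this rendering the angle-bracketed template arguments of calls such as IndexKernel>(dev_cxt, tensor, func) have been stripped. A minimal sketch of the presumed call pattern follows; the functor name is hypothetical and the template arguments are reconstructed by hand, not taken verbatim from the patch:

    // Assumes the header added by this patch is available:
    //   #include "paddle/fluid/operators/index_impl.cu.h"
    // PlusOne is a hypothetical functor; GaussianGenerator/UniformGenerator in
    // this patch have the same shape: map a flat element index to a value.
    template <typename T>
    struct PlusOne {
      __host__ __device__ T operator()(const unsigned int n) const {
        return static_cast<T>(n + 1);
      }
    };

    template <typename T>
    void FillIndexPlusOne(const platform::CUDADeviceContext &dev_ctx,
                          framework::Tensor *out) {
      auto func = PlusOne<T>();
      // Presumed form of the stripped calls: IndexKernel<T, Functor>(ctx, out, func)
      IndexKernel<T, PlusOne<T>>(dev_ctx, out, func);
    }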
*/ #include #include // NOLINT +#include +#include + #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps/service/heter_client.h" #include "paddle/fluid/distributed/ps/service/heter_server.h" @@ -36,6 +39,19 @@ DECLARE_double(eager_delete_tensor_gb); USE_OP_ITSELF(scale); USE_NO_KERNEL_OP(heter_listen_and_serv); +std::string get_ip_port() { + std::mt19937 rng; + rng.seed(std::random_device()()); + std::uniform_int_distribution dist(4444, 25000); + int port = dist(rng); + std::string ip_port; + std::stringstream temp_str; + temp_str << "127.0.0.1:"; + temp_str << port; + temp_str >> ip_port; + return ip_port; +} + framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) { framework::BlockDesc* block = program->AppendBlock(*(program->MutableBlock(0))); @@ -53,16 +69,13 @@ framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) { return block; } -void GetHeterListenAndServProgram(framework::ProgramDesc* program) { +void GetHeterListenAndServProgram(framework::ProgramDesc* program, + std::string endpoint) { auto root_block = program->MutableBlock(0); - auto* sub_block = AppendSendAndRecvBlock(program); std::vector optimize_blocks; optimize_blocks.push_back(sub_block); - std::vector message_to_block_id = {"x:1"}; - std::string endpoint = "127.0.0.1:19944"; - framework::OpDesc* op = root_block->AppendOp(); op->SetType("heter_listen_and_serv"); op->SetInput("X", {}); @@ -129,7 +142,7 @@ void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, CreateVarsOnScope(scope, place); } -void StartHeterServer() { +void StartHeterServer(std::string endpoint) { framework::ProgramDesc program; framework::Scope scope; platform::CPUPlace place; @@ -137,7 +150,7 @@ void StartHeterServer() { platform::CPUDeviceContext ctx(place); LOG(INFO) << "before GetHeterListenAndServProgram"; - GetHeterListenAndServProgram(&program); + GetHeterListenAndServProgram(&program, endpoint); auto prepared = exe.Prepare(program, 0); LOG(INFO) << "before InitTensorsOnServer"; @@ -150,13 +163,12 @@ void StartHeterServer() { TEST(HETER_LISTEN_AND_SERV, CPU) { setenv("http_proxy", "", 1); setenv("https_proxy", "", 1); - std::string endpoint = "127.0.0.1:19944"; - std::string previous_endpoint = "127.0.0.1:19944"; + std::string endpoint = get_ip_port(); + std::string previous_endpoint = endpoint; LOG(INFO) << "before StartSendAndRecvServer"; FLAGS_eager_delete_tensor_gb = -1; - std::thread server_thread(StartHeterServer); + std::thread server_thread(StartHeterServer, endpoint); sleep(1); - auto b_rpc_service = distributed::HeterServer::GetInstance(); b_rpc_service->WaitServerReady(); using MicroScope = diff --git a/paddle/fluid/operators/pscore/heter_server_test.cc b/paddle/fluid/operators/pscore/heter_server_test.cc index 776fe5d675ded9..6ab4204b2f9dfe 100644 --- a/paddle/fluid/operators/pscore/heter_server_test.cc +++ b/paddle/fluid/operators/pscore/heter_server_test.cc @@ -17,6 +17,9 @@ limitations under the License. 
*/ #include #include // NOLINT +#include +#include + #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps/service/heter_client.h" #include "paddle/fluid/distributed/ps/service/heter_server.h" @@ -33,6 +36,19 @@ USE_OP_ITSELF(scale); std::shared_ptr b_rpc_service; +std::string get_ip_port() { + std::mt19937 rng; + rng.seed(std::random_device()()); + std::uniform_int_distribution dist(4444, 25000); + int port = dist(rng); + std::string ip_port; + std::stringstream temp_str; + temp_str << "127.0.0.1:"; + temp_str << port; + temp_str >> ip_port; + return ip_port; +} + framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) { auto root_block = program->MutableBlock(0); auto* block = program->AppendBlock(*root_block); @@ -178,16 +194,17 @@ void StartSendAndRecvServer(std::string endpoint) { b_rpc_service->SetRequestHandler(b_req_handler); LOG(INFO) << "before HeterServer::RunServer"; - std::thread server_thread(std::bind(RunServer, b_rpc_service)); + RunServer(b_rpc_service); + // std::thread server_thread(std::bind(RunServer, b_rpc_service)); - server_thread.join(); + // server_thread.join(); } TEST(SENDANDRECV, CPU) { setenv("http_proxy", "", 1); setenv("https_proxy", "", 1); - std::string endpoint = "127.0.0.1:4444"; - std::string previous_endpoint = "127.0.0.1:4444"; + std::string endpoint = get_ip_port(); + std::string previous_endpoint = endpoint; LOG(INFO) << "before StartSendAndRecvServer"; b_rpc_service = distributed::HeterServer::GetInstance(); std::thread server_thread(StartSendAndRecvServer, endpoint); diff --git a/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc b/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc index 07c3f9ee50ea07..26da0d3696fdf5 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op_cpu_test.cc @@ -18,6 +18,8 @@ limitations under the License. 
*/ #include #include // NOLINT +#include +#include #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps/service/heter_client.h" #include "paddle/fluid/distributed/ps/service/heter_server.h" @@ -36,6 +38,19 @@ USE_OP(send_and_recv); std::shared_ptr b_rpc_service; +std::string get_ip_port() { + std::mt19937 rng; + rng.seed(std::random_device()()); + std::uniform_int_distribution dist(4444, 25000); + int port = dist(rng); + std::string ip_port; + std::stringstream temp_str; + temp_str << "127.0.0.1:"; + temp_str << port; + temp_str >> ip_port; + return ip_port; +} + framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) { auto root_block = program->MutableBlock(0); auto* block = program->AppendBlock(*root_block); @@ -151,16 +166,18 @@ void StartSendAndRecvServer(std::string endpoint) { b_rpc_service->SetRequestHandler(b_req_handler); LOG(INFO) << "before HeterServer::RunServer"; - std::thread server_thread(std::bind(RunServer, b_rpc_service)); - server_thread.join(); + RunServer(b_rpc_service); + + // std::thread server_thread(std::bind(RunServer, b_rpc_service)); + // server_thread.join(); } TEST(SENDANDRECV, CPU) { setenv("http_proxy", "", 1); setenv("https_proxy", "", 1); - std::string endpoint = "127.0.0.1:4444"; - std::string previous_endpoint = "127.0.0.1:4444"; + std::string endpoint = get_ip_port(); + std::string previous_endpoint = endpoint; LOG(INFO) << "before StartSendAndRecvServer"; b_rpc_service = distributed::HeterServer::GetInstance(); std::thread server_thread(StartSendAndRecvServer, endpoint); @@ -260,8 +277,10 @@ TEST(SENDANDRECV, CPU) { exe.RunPreparedContext(prepared.get(), scope, false); LOG(INFO) << "client wait for Pop"; + auto task = (*task_queue_)[0]->Pop(); LOG(INFO) << "client get from task queue"; + PADDLE_ENFORCE_EQ( task.first, "x", platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc index 17be7c64ff3121..a5e292a05e1ff6 100644 --- a/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc +++ b/paddle/fluid/operators/pscore/send_and_recv_op_gpu_test.cc @@ -19,6 +19,9 @@ limitations under the License. 
*/ #include #include // NOLINT +#include +#include + #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps/service/heter_client.h" #include "paddle/fluid/distributed/ps/service/heter_server.h" @@ -40,20 +43,30 @@ USE_OP(send_and_recv); std::shared_ptr b_rpc_service2; +std::string get_ip_port() { + std::mt19937 rng; + rng.seed(std::random_device()()); + std::uniform_int_distribution dist(4444, 25000); + int port = dist(rng); + std::string ip_port; + std::stringstream temp_str; + temp_str << "127.0.0.1:"; + temp_str << port; + temp_str >> ip_port; + return ip_port; +} + framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) { auto root_block = program->MutableBlock(0); auto* block = program->AppendBlock(*root_block); - framework::OpDesc* op = block->AppendOp(); op->SetType("scale"); op->SetInput("X", {"x"}); op->SetOutput("Out", {"res"}); op->SetAttr("scale", 0.5f); - auto& out = *root_block->Var("res"); out.SetType(framework::proto::VarType::LOD_TENSOR); out.SetShape({1, 10}); - return block; } @@ -172,15 +185,17 @@ void StartSendAndRecvServer(std::string endpoint) { b_rpc_service2->SetRequestHandler(b_req_handler); LOG(INFO) << "before HeterServer::RunServer"; - std::thread server_thread(std::bind(RunServer, b_rpc_service2)); - server_thread.join(); + + RunServer(b_rpc_service2); + // std::thread server_thread(std::bind(RunServer, b_rpc_service2)); + // server_thread.join(); } TEST(SENDANDRECV, GPU) { setenv("http_proxy", "", 1); setenv("https_proxy", "", 1); - std::string endpoint = "127.0.0.1:4445"; - std::string previous_endpoint = "127.0.0.1:4445"; + std::string endpoint = get_ip_port(); + std::string previous_endpoint = endpoint; LOG(INFO) << "before StartSendAndRecvServer"; b_rpc_service2 = distributed::HeterServer::GetInstance(); std::thread server_thread(StartSendAndRecvServer, endpoint); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc index c8d568c8c2cf73..e80df5f95bb4ab 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cc @@ -99,13 +99,6 @@ REGISTER_OPERATOR(reduce_mean_grad, ops::ReduceGradOp, ops::ReduceMeanDoubleGradDescMaker, ops::ReduceMeanDoubleGradOpBaseMaker, ops::ReduceMeanGradNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL(reduce_mean, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel); template using CPUReduceMeanGradKernel = diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu deleted file mode 100644 index 30a699e979efc4..00000000000000 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h" -#include "paddle/fluid/operators/reduce_ops/reduce_op.h" - -REGISTER_OP_CUDA_KERNEL( - reduce_mean, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index cfafc11739948b..bdab14a18a05ab 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -107,27 +107,6 @@ REGISTER_OPERATOR(reduce_sum_grad, ops::ReduceGradOp, ops::ReduceSumDoubleOpGradMaker, ops::ReduceSumGradNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL( - reduce_sum, ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, - ops::ReduceKernel, ops::SumFunctor>, - ops::ReduceKernel, - - ops::SumFunctor>); - template using CPUReduceSumGradKernel = ops::ReduceSumGradKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, - ops::ReduceCudaKernel, kps::AddFunctor, - kps::IdentityFunctor>, - ops::ReduceCudaKernel, kps::AddFunctor, - kps::IdentityFunctor>); diff --git a/paddle/fluid/operators/reduce_ops/unity_build_rule.cmake b/paddle/fluid/operators/reduce_ops/unity_build_rule.cmake index 74781ef6f0237a..c4f32a8d25764b 100644 --- a/paddle/fluid/operators/reduce_ops/unity_build_rule.cmake +++ b/paddle/fluid/operators/reduce_ops/unity_build_rule.cmake @@ -21,5 +21,4 @@ register_unity_group(cu register_unity_group(cu frobenius_norm_op.cu) register_unity_group(cu logsumexp_op.cu) register_unity_group(cu reduce_max_op.cu) -register_unity_group(cu reduce_mean_op.cu) register_unity_group(cu reduce_min_op.cu) diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 5620988545a0f1..ddb598f575f673 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/pten_utils.h" @@ -21,8 +22,11 @@ limitations under the License. 
*/ #include "paddle/phi/api/lib/utils/tensor_utils.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" #include "paddle/phi/kernels/reshape_grad_kernel.h" #include "paddle/phi/kernels/reshape_kernel.h" + namespace paddle { namespace framework { class InferShapeContext; @@ -472,22 +476,6 @@ class Reshape2Op : public ReshapeOp { const framework::VariableNameMap &outputs, const framework::AttributeMap &attrs) : ReshapeOp(type, inputs, outputs, attrs) {} - - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasOutput("XShape"), true, - platform::errors::InvalidArgument( - "Output(XShape) of ReshapeOp should not be null.")); - const auto &x_dims = ctx->GetInputDim("X"); - std::vector xshape_dims(x_dims.size() + 1); - xshape_dims[0] = 0; - for (int i = 0; i < x_dims.size(); ++i) { - xshape_dims[i + 1] = x_dims[i]; - } - ctx->SetOutputDim("XShape", phi::make_ddim(xshape_dims)); - ctx->ShareLoD("X", /*->*/ "XShape"); - - ReshapeOp::InferShape(ctx); - } }; class Reshape2OpMaker : public ReshapeOpMaker { @@ -647,10 +635,14 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel, ops::ReshapeGradKernel, int, ops::ReshapeGradKernel, int64_t, ops::ReshapeGradKernel); + +DELCARE_INFER_SHAPE_FUNCTOR(reshape2, ReshapeInferShapeFunctor, + PT_INFER_META(phi::ReshapeWithXShapeInferMeta)); + REGISTER_OPERATOR(reshape2, ops::Reshape2Op, ops::Reshape2OpMaker, ops::Reshape2GradMaker, ops::Reshape2GradMaker, - ops::ReshapeOpInplaceInferer); + ReshapeInferShapeFunctor, ops::ReshapeOpInplaceInferer); REGISTER_OPERATOR(reshape2_grad, ops::Reshape2GradOp, ops::Reshape2DoubleGradMaker, ops::Reshape2DoubleGradMaker, diff --git a/paddle/fluid/operators/uniform_random_inplace_op.cu b/paddle/fluid/operators/uniform_random_inplace_op.cu index a5231354eb47ea..1c7b9a27f86882 100644 --- a/paddle/fluid/operators/uniform_random_inplace_op.cu +++ b/paddle/fluid/operators/uniform_random_inplace_op.cu @@ -12,130 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include -#include -#include -#include -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/uniform_random_op.h" +#include "paddle/phi/kernels/full_kernel.h" namespace paddle { namespace operators { - -template -struct UniformGenerator { - T min_, max_; - unsigned int seed_; - T diag_val_; - unsigned int diag_num_; - unsigned int diag_step_; - __host__ __device__ UniformGenerator(T min, T max, int seed, int diag_num, - int diag_step, T diag_val) - : min_(min), - max_(max), - seed_(seed), - diag_num_(diag_num), - diag_step_(diag_step), - diag_val_(diag_val) {} - - __host__ __device__ T operator()(const unsigned int n) const { - thrust::minstd_rand rng; - rng.seed(seed_); - thrust::uniform_real_distribution dist(min_, max_); - rng.discard(n); - T out = dist(rng); - unsigned int remainder = n % (diag_step_ + 1); - if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { - out = diag_val_; - } - return out; - } -}; - -template -struct UniformGeneratorOffset { - T min_, max_; - unsigned int seed_; - T diag_val_; - unsigned int diag_num_; - unsigned int diag_step_; - int offset_; - __host__ __device__ UniformGeneratorOffset(T min, T max, int seed, - int diag_num, int diag_step, - T diag_val, int offset) - : min_(min), - max_(max), - seed_(seed), - diag_num_(diag_num), - diag_step_(diag_step), - diag_val_(diag_val), - offset_(offset) {} - - __host__ __device__ T operator()(const unsigned int n) const { - thrust::minstd_rand rng; - rng.seed(seed_); - thrust::uniform_real_distribution dist(min_, max_); - rng.discard(n + offset_); - T out = dist(rng); - unsigned int remainder = n % (diag_step_ + 1); - if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { - out = diag_val_; - } - return out; - } -}; - -template -__global__ void fill_value(int64_t size, T* data, float value) { - for (int idx = threadIdx.x; idx < size; idx += blockDim.x) { - data[idx] = static_cast(value); - } -} - -// It seems that Eigen::Tensor::random in GPU will SEGFAULT. -// Use std::random and thrust::random(thrust is a std library in CUDA) to -// implement uniform random as uniform_random_op.cu. 
template class GPUUniformRandomInplaceKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto out_var = ctx.OutputVar("Out"); - auto* tensor = out_var->GetMutable(); - T* data = tensor->mutable_data(ctx.GetPlace()); - unsigned int seed = static_cast(ctx.Attr("seed")); - bool seed_flag = false; - if (seed == 0) { - std::random_device rd; - seed = rd(); - seed_flag = true; - } - - T min = static_cast(ctx.Attr("min")); - T max = static_cast(ctx.Attr("max")); - unsigned int diag_num = - static_cast(ctx.Attr("diag_num")); - unsigned int diag_step = - static_cast(ctx.Attr("diag_step")); - T diag_val = static_cast(ctx.Attr("diag_val")); - thrust::counting_iterator index_sequence_begin(0); - int64_t size = tensor->numel(); - int device_id = ctx.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - if (gen_cuda->GetIsInitPy() && seed_flag) { - auto seed_offset = gen_cuda->IncrementOffset(1); - int64_t gen_offset = size * seed_offset.second; - thrust::transform( - index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - UniformGeneratorOffset(min, max, seed_offset.first, diag_num, - diag_step, diag_val, gen_offset)); - } else { - thrust::transform( - index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - UniformGenerator(min, max, seed, diag_num, diag_step, diag_val)); - } + void Compute(const framework::ExecutionContext& context) const override { + auto* tensor = context.Output("Out"); + UniformRandom(context, tensor); } }; @@ -143,17 +30,15 @@ template class GPUUniformRandomInplaceGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#ifdef __HIPCC__ - const int64_t kMaxBlockDim = 256; -#else - const int64_t kMaxBlockDim = 512; -#endif auto* dx = ctx.Output(framework::GradVarName("X")); - auto* data = dx->mutable_data(ctx.GetPlace()); - - auto size = dx->numel(); - int64_t kBlockDim = std::min(size, kMaxBlockDim); - fill_value<<<1, kBlockDim, 0>>>(size, data, static_cast(0)); + auto dims = vectorize(dx->dims()); + const auto& dev_cxt = + ctx.template device_context(); + float value = static_cast(0.0f); + phi::FullKernel( + static_cast::TYPE&>(dev_cxt), + dims, value, phi::DataType::UNDEFINED, dx); } }; diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu index 086c57527b48ff..fb38a6aded4cf1 100644 --- a/paddle/fluid/operators/uniform_random_op.cu +++ b/paddle/fluid/operators/uniform_random_op.cu @@ -11,88 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include -#include -#include -#include -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/distribution_helper.h" #include "paddle/fluid/operators/uniform_random_op.h" -DECLARE_bool(use_curand); - namespace paddle { namespace operators { -template -struct UniformGenerator { - T min_, max_; - unsigned int seed_; - T diag_val_; - unsigned int diag_num_; - unsigned int diag_step_; - __host__ __device__ UniformGenerator(T min, T max, int seed, int diag_num, - int diag_step, T diag_val) - : min_(min), - max_(max), - seed_(seed), - diag_num_(diag_num), - diag_step_(diag_step), - diag_val_(diag_val) {} - - __host__ __device__ T operator()(const unsigned int n) const { - thrust::minstd_rand rng; - rng.seed(seed_); - thrust::uniform_real_distribution dist(min_, max_); - rng.discard(n); - T out = dist(rng); - unsigned int remainder = n % (diag_step_ + 1); - if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { - out = diag_val_; - } - return out; - } -}; - -template -struct UniformGeneratorOffset { - T min_, max_; - unsigned int seed_; - T diag_val_; - unsigned int diag_num_; - unsigned int diag_step_; - int offset_; - __host__ __device__ UniformGeneratorOffset(T min, T max, int seed, - int diag_num, int diag_step, - T diag_val, int offset) - : min_(min), - max_(max), - seed_(seed), - diag_num_(diag_num), - diag_step_(diag_step), - diag_val_(diag_val), - offset_(offset) {} - - __host__ __device__ T operator()(const unsigned int n) const { - thrust::minstd_rand rng; - rng.seed(seed_); - thrust::uniform_real_distribution dist(min_, max_); - rng.discard(n + offset_); - T out = dist(rng); - unsigned int remainder = n % (diag_step_ + 1); - if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { - out = diag_val_; - } - return out; - } -}; - -// It seems that Eigen::Tensor::random in GPU will SEGFAULT. -// Use std::random and thrust::random(thrust is a std library in CUDA) to -// implement uniform random. 
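The per-file UniformGenerator structs and thrust::transform loops removed here (and in uniform_random_inplace_op.cu above) are consolidated into a single UniformRandom helper declared in uniform_random_op.h; the remaining calls lost their template argument in this rendering. A small sketch of the presumed call, with reconstructed template arguments and an assumed output type:

    // Presumed form of the stripped call used by both GPU kernels:
    //   UniformRandom<T>(context, tensor);
    // e.g. inside a Compute method of one of the GPU uniform random kernels:
    template <typename T>
    void ComputeUniform(const framework::ExecutionContext &context) {
      auto *tensor = context.Output<framework::Tensor>("Out");
      UniformRandom<T>(context, tensor);  // reads min/max/seed/diag_* attributes
    }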
template class GPUUniformRandomKernel : public framework::OpKernel { public: @@ -128,50 +51,7 @@ class GPUUniformRandomKernel : public framework::OpKernel { "unsupport type: %s.", framework::ToTypeName(out_var->Type()))); } - auto& dev_cxt = - context.template device_context(); - T* data = tensor->mutable_data(dev_cxt.GetPlace()); - unsigned int seed = static_cast(context.Attr("seed")); - bool seed_flag = false; - if (seed == 0) { - std::random_device rd; - seed = rd(); - seed_flag = true; - } - - T min = static_cast(context.Attr("min")); - T max = static_cast(context.Attr("max")); - unsigned int diag_num = - static_cast(context.Attr("diag_num")); - unsigned int diag_step = - static_cast(context.Attr("diag_step")); - T diag_val = static_cast(context.Attr("diag_val")); - thrust::counting_iterator index_sequence_begin(0); - int64_t size = tensor->numel(); - int device_id = context.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - if (gen_cuda->GetIsInitPy() && seed_flag) { - if (FLAGS_use_curand) { - using MT = typename details::MPTypeTrait::Type; - distribution::uniform_distribution dist; - distribution::uniform_transform trans(min, max); - distribution::distribution_and_transform(dev_cxt, tensor, dist, - trans); - } else { - auto seed_offset = gen_cuda->IncrementOffset(1); - int64_t gen_offset = size * seed_offset.second; - thrust::transform( - index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - UniformGeneratorOffset(min, max, seed_offset.first, diag_num, - diag_step, diag_val, gen_offset)); - } - } else { - thrust::transform( - index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - UniformGenerator(min, max, seed, diag_num, diag_step, diag_val)); - } + UniformRandom(context, tensor); } }; diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index be6c3c740e692c..a864c48ad75741 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -18,6 +18,16 @@ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#if defined(__NVCC__) || defined(__HIPCC__) +DECLARE_bool(use_curand); +#include +#include +#include +#include +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/operators/index_impl.cu.h" +#include "paddle/phi/kernels/full_kernel.h" +#endif namespace paddle { namespace operators { @@ -102,5 +112,117 @@ inline std::vector GetNewDataFromShapeTensorList( return vec_new_shape; } + +#if defined(__NVCC__) || defined(__HIPCC__) + +template +struct UniformGenerator { + T min_, max_; + unsigned int seed_; + T diag_val_; + unsigned int diag_num_; + unsigned int diag_step_; + __host__ __device__ UniformGenerator(T min, T max, int seed, int diag_num, + int diag_step, T diag_val) + : min_(min), + max_(max), + seed_(seed), + diag_num_(diag_num), + diag_step_(diag_step), + diag_val_(diag_val) {} + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed_); + thrust::uniform_real_distribution dist(min_, max_); + rng.discard(n); + T out = dist(rng); + unsigned int remainder = n % (diag_step_ + 1); + if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { + out = diag_val_; + } + return out; + } +}; + +template +struct UniformGeneratorOffset { + T min_, max_; + unsigned int seed_; + T diag_val_; + unsigned int diag_num_; + unsigned int diag_step_; + int offset_; + __host__ __device__ 
UniformGeneratorOffset(T min, T max, int seed, + int diag_num, int diag_step, + T diag_val, int offset) + : min_(min), + max_(max), + seed_(seed), + diag_num_(diag_num), + diag_step_(diag_step), + diag_val_(diag_val), + offset_(offset) {} + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed_); + thrust::uniform_real_distribution dist(min_, max_); + rng.discard(n + offset_); + T out = dist(rng); + unsigned int remainder = n % (diag_step_ + 1); + if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { + out = diag_val_; + } + return out; + } +}; + +template +void UniformRandom(const framework::ExecutionContext& context, + framework::Tensor* tensor) { + int64_t size = tensor->numel(); + auto& dev_cxt = + context.template device_context(); + T* data = tensor->mutable_data(dev_cxt.GetPlace()); + if (size <= 0) return; + unsigned int seed = static_cast(context.Attr("seed")); + bool seed_flag = false; + if (seed == 0) { + std::random_device rd; + seed = rd(); + seed_flag = true; + } + + T min = static_cast(context.Attr("min")); + T max = static_cast(context.Attr("max")); + unsigned int diag_num = + static_cast(context.Attr("diag_num")); + unsigned int diag_step = + static_cast(context.Attr("diag_step")); + T diag_val = static_cast(context.Attr("diag_val")); + int device_id = context.GetPlace().GetDeviceId(); + auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); + if (gen_cuda->GetIsInitPy() && seed_flag) { + if (FLAGS_use_curand) { + using MT = typename details::MPTypeTrait::Type; + distribution::uniform_distribution dist; + distribution::uniform_transform trans(min, max); + distribution::distribution_and_transform(dev_cxt, tensor, dist, trans); + } else { + auto seed_offset = gen_cuda->IncrementOffset(1); + int64_t gen_offset = size * seed_offset.second; + auto func = + UniformGeneratorOffset(min, max, seed_offset.first, diag_num, + diag_step, diag_val, gen_offset); + IndexKernel>(dev_cxt, tensor, func); + } + } else { + auto func = + UniformGenerator(min, max, seed, diag_num, diag_step, diag_val); + IndexKernel>(dev_cxt, tensor, func); + } +} +#endif } // namespace operators } // namespace paddle diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index be02bac1aa0ef7..b808e1561b24af 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -146,6 +146,10 @@ if(WITH_ASCEND_CL) target_link_libraries(device_context npu_resource_pool) endif() +if(WITH_CUSTOM_DEVICE) + target_link_libraries(device_context custom_context) +endif() + cc_test(init_test SRCS init_test.cc DEPS device_context) # Manage all device event library diff --git a/paddle/fluid/platform/device/ipu/CMakeLists.txt b/paddle/fluid/platform/device/ipu/CMakeLists.txt index d54c6a33ecbf53..acf914c5087d0f 100644 --- a/paddle/fluid/platform/device/ipu/CMakeLists.txt +++ b/paddle/fluid/platform/device/ipu/CMakeLists.txt @@ -13,9 +13,9 @@ IF(WITH_IPU) "ipu_device.cc" ) - cc_library(ipu_backend SRCS ${IPU_BACKEND_SRC} DEPS popart graph graph_helper) - cc_library(ipu_info SRCS ${IPU_INFO_SRC} DEPS popart enforce) - cc_library(paddle_ipu SHARED SRCS ${PADDLE_IPU_SRC} DEPS popart graph_helper) + cc_library(ipu_backend SRCS ${IPU_BACKEND_SRC} DEPS popart-only graph graph_helper) + cc_library(ipu_info SRCS ${IPU_INFO_SRC} DEPS popart-only enforce) + add_library(paddle_ipu SHARED ${PADDLE_IPU_SRC}) add_dependencies(paddle_ipu ipu_backend) set(PADDLE_IPU_LIB "${CMAKE_CURRENT_BINARY_DIR}/libpaddle_ipu.so" 
CACHE STRING "") set(PADDLE_IPU_LIB_DIR "${CMAKE_CURRENT_BINARY_DIR}" CACHE STRING "") diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.cc b/paddle/fluid/platform/device/ipu/ipu_backend.cc index 8f2a7ef78c9824..e0b3b08a2313d0 100644 --- a/paddle/fluid/platform/device/ipu/ipu_backend.cc +++ b/paddle/fluid/platform/device/ipu/ipu_backend.cc @@ -43,17 +43,17 @@ void IpuBackend::Compile(Graph* graph, const std::vector& feed_list, const std::vector& fetch_list) { VLOG(10) << "enter IpuBackend::Compile"; - compiler_->Prepare(); - executor_->SetCompilerResources(compiler_->GetResources()); - - compiler_->InitInputs(graph, feed_list); - compiler_->LowerConstants(graph, scope_); - compiler_->LowerWeights(graph, scope_); - compiler_->LowerBody(graph); + compiler_->Prepare(graph); + compiler_->InitInputs(feed_list); + compiler_->LowerConstants(scope_); + compiler_->LowerWeights(scope_); + compiler_->LowerBody(); compiler_->InitOutputs(fetch_list); if (ipu_strategy_->is_training) { - compiler_->LowerOptimier(graph, scope_); + compiler_->LowerOptimizer(scope_); } + executor_->SetCompilerResources(compiler_->GetResources()); + is_compiled_ = true; // when call compile, means a new graph is_prepared_ = false; @@ -95,11 +95,9 @@ void IpuBackend::SetIpuStrategy(const IpuStrategy& strategy) { ipu_strategy_ = &strategy; compiler_->SetIpuStrategy(strategy); executor_->SetIpuStrategy(strategy); -} - -void IpuBackend::SetCustomOps( - const std::vector& custom_ops) { - compiler_->SetCustomOps(custom_ops); + if (!strategy.custom_ops.empty()) { + compiler_->SetCustomOps(strategy.custom_ops); + } } void IpuBackend::SaveModelProto(const std::string& path) { diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.h b/paddle/fluid/platform/device/ipu/ipu_backend.h index b12e2539258dfe..1244192490c16c 100644 --- a/paddle/fluid/platform/device/ipu/ipu_backend.h +++ b/paddle/fluid/platform/device/ipu/ipu_backend.h @@ -71,7 +71,6 @@ class IpuBackend { const Scope *GetScope() { return scope_; } void SetIpuStrategy(const IpuStrategy &strategy); const IpuStrategy *GetIpuStrategy() { return ipu_strategy_; } - void SetCustomOps(const std::vector &custom_ops); // save compiled model to onnx void SaveModelProto(const std::string &path); diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.cc b/paddle/fluid/platform/device/ipu/ipu_compiler.cc index df2e456383e175..cdb3f6f9b3e285 100644 --- a/paddle/fluid/platform/device/ipu/ipu_compiler.cc +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.cc @@ -98,6 +98,19 @@ TO GetCastSigAttrAllowNull(std::string attr, OpDesc* op_desc) { } } +GraphHelper::GraphHelper(const Graph* g) { + graph = g; + sorted_ops = framework::ir::TopologySortOperations(*g); + for (auto* node : g->Nodes()) { + nodes_id_map[node->id()] = node; + if (node->IsVar()) { + vars_name_map[node->Name()] = node; + sorted_vars_id.push_back(node->id()); + } + } + std::sort(sorted_vars_id.begin(), sorted_vars_id.end()); +} + Compiler::Compiler() { RegisterOpFunc(); } Compiler::~Compiler() { @@ -105,9 +118,10 @@ Compiler::~Compiler() { resources_.reset(); } -void Compiler::Prepare() { +void Compiler::Prepare(const Graph* graph) { builder_ = popart::Builder::create(); resources_ = std::make_unique(); + graph_helper_ = std::make_unique(graph); } void Compiler::RegisterOpFunc() { @@ -171,93 +185,24 @@ void Compiler::RegisterOpFunc() { #undef INT_VEC } -void Compiler::LowerBody(const Graph* graph) { - VLOG(10) << "enter Compiler::LowerBody"; - auto nodes = framework::ir::TopologySortOperations(*graph); - for 
(auto* node : nodes) { - auto* op_desc = node->Op(); - auto op_type = op_desc->Type(); - VLOG(10) << "lowering op: " << op_type; - - if (op_type == "popart_constant") { - // pass - } else if (op_type == "popart_optimizer") { - // pass - } else if (op_type == "popart_checkpointoutput") { - auto inputs = GetOpInputs(op_desc); - auto outputs = GetOpOutputs(op_desc); - auto output_ids = builder_->checkpointOutput(inputs); - InsertTensors(outputs, output_ids); - } else if (op_type == "popart_custom_op") { - auto inputs = GetOpInputs(op_desc); - auto outputs = GetOpOutputs(op_desc); - auto debug_context = BuildDebugContext(op_desc); - auto attributes = std::map{}; - for (auto& attr : op_desc->GetAttrMap()) { - CustomOpAttrVisitor visitor(&attributes, attr.first); - boost::apply_visitor(visitor, attr.second); - } - auto __op_type = - BOOST_GET_CONST(std::string, op_desc->GetAttr("__op_type")); - VLOG(10) << "Build graph from custom op: " << __op_type; - auto it = custom_ops_.find(__op_type); - auto output_ids = - builder_->customOp(it->second.popart_op, it->second.popart_op.version, - inputs, outputs.size(), attributes, debug_context); - SetIpuIndexStage(output_ids, op_desc); - InsertTensors(outputs, output_ids); - } else if (op_type == "popart_printtensor") { - auto inputs = GetOpInputs(op_desc); - auto outputs = GetOpOutputs(op_desc); - auto debug_context = BuildDebugContext(op_desc); - auto print_gradient = - BOOST_GET_CONST(int64_t, op_desc->GetAttr("print_gradient")); - auto title = BOOST_GET_CONST(std::string, op_desc->GetAttr("title")); - auto output_ids = builder_->aiGraphcoreOpset1().printtensor( - inputs, print_gradient, debug_context, title); - SetIpuIndexStage(output_ids, op_desc); - InsertTensors(outputs, output_ids); - } else { - auto itr = name_function_.find(op_type); - if (itr != name_function_.end()) { - itr->second(node->Op()); - } else { - PADDLE_THROW(platform::errors::NotFound( - "%s is not registered, please check for unsupported operators for " - "running on IPU", - op_type)); - } - } - } - VLOG(10) << "leave Compiler::LowerBody"; -} - -void Compiler::InitInputs(Graph* graph, - const std::vector& feed_list) { +void Compiler::InitInputs(const std::vector& feed_list) { for (const auto& feed_name : feed_list) { - feed_list_.push_back(feed_name); - for (const Node* n : graph->Nodes()) { - if (n->IsVar()) { - auto* var_desc = n->Var(); - if (feed_name == var_desc->Name()) { - VLOG(10) << "feed_name= " << var_desc->Name(); - auto data_type = VarType2PopartType(var_desc->GetDataType()); - popart::TensorInfo input_info{data_type, var_desc->GetShape()}; - VLOG(10) << "popart input_info = " << input_info; - popart::TensorId tensor_id = - builder_->addInputTensor(input_info, feed_name); - VLOG(10) << "popart input tensor id = " << tensor_id; - resources_->inputs.push_back(tensor_id); - resources_->tensors.emplace(var_desc->Name(), tensor_id); - } - } - } + auto* node = graph_helper_->vars_name_map[feed_name]; + auto* var_desc = node->Var(); + VLOG(10) << "feed_name= " << var_desc->Name(); + auto data_type = VarType2PopartType(var_desc->GetDataType()); + popart::TensorInfo input_info{data_type, var_desc->GetShape()}; + VLOG(10) << "popart input_info = " << input_info; + popart::TensorId tensor_id = + builder_->addInputTensor(input_info, feed_name); + VLOG(10) << "popart input tensor id = " << tensor_id; + resources_->inputs.push_back(tensor_id); + resources_->tensors.emplace(var_desc->Name(), tensor_id); } } void Compiler::InitOutputs(const std::vector& fetch_list) { for (const auto& 
fetch_name : fetch_list) { - fetch_list_.push_back(fetch_name); auto tensor = resources_->tensors.find(fetch_name); PADDLE_ENFORCE_NE( tensor, resources_->tensors.end(), @@ -271,14 +216,10 @@ void Compiler::InitOutputs(const std::vector& fetch_list) { } } -void Compiler::LowerConstants(const Graph* graph, const Scope* scope) { +void Compiler::LowerConstants(const Scope* scope) { auto& kid_scope = scope->NewScope(); VLOG(10) << "enter Compiler::LowerConstants"; - for (auto* node : graph->Nodes()) { - if (!node->IsOp()) { - continue; - } - + for (auto* node : graph_helper_->sorted_ops) { auto* op_desc = node->Op(); auto op_type = op_desc->Type(); if (op_type == "popart_constant") { @@ -308,17 +249,16 @@ void Compiler::LowerConstants(const Graph* graph, const Scope* scope) { VLOG(10) << "leave Compiler::LowerConstants"; } -void Compiler::LowerWeights(const Graph* graph, const Scope* scope) { +void Compiler::LowerWeights(const Scope* scope) { VLOG(10) << "enter Compiler::LowerWeights"; - PADDLE_ENFORCE_NOT_NULL(scope, - platform::errors::PreconditionNotMet( - "You should call set_scope before LowerWeights")); // at this step, the graph doesn't contains optimizer related states - for (const auto* node : graph->Nodes()) { + for (auto id : graph_helper_->sorted_vars_id) { + auto* node = graph_helper_->nodes_id_map[id]; if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { if (node->Var()->Persistable() && node->inputs.empty()) { auto var_name = node->Var()->Name(); if (resources_->tensors.count(var_name) != 0) { + VLOG(10) << "found existed one, skip lowering Weight: " << var_name; continue; } VLOG(10) << "lowering weight: " << var_name; @@ -344,12 +284,68 @@ void Compiler::LowerWeights(const Graph* graph, const Scope* scope) { VLOG(10) << "leave Compiler::LowerWeights"; } -void Compiler::LowerOptimier(const Graph* graph, const Scope* scope) { - for (auto* node : graph->Nodes()) { - if (!node->IsOp()) { - continue; +void Compiler::LowerBody() { + VLOG(10) << "enter Compiler::LowerBody"; + for (auto* node : graph_helper_->sorted_ops) { + auto* op_desc = node->Op(); + auto op_type = op_desc->Type(); + VLOG(10) << "lowering op: " << op_type; + + if (op_type == "popart_constant") { + // pass + } else if (op_type == "popart_optimizer") { + // pass + } else if (op_type == "popart_checkpointoutput") { + auto inputs = GetOpInputs(op_desc); + auto outputs = GetOpOutputs(op_desc); + auto output_ids = builder_->checkpointOutput(inputs); + InsertTensors(outputs, output_ids); + } else if (op_type == "popart_custom_op") { + auto inputs = GetOpInputs(op_desc); + auto outputs = GetOpOutputs(op_desc); + auto debug_context = BuildDebugContext(op_desc); + auto attributes = std::map{}; + for (auto& attr : op_desc->GetAttrMap()) { + CustomOpAttrVisitor visitor(&attributes, attr.first); + boost::apply_visitor(visitor, attr.second); + } + auto __op_type = + BOOST_GET_CONST(std::string, op_desc->GetAttr("__op_type")); + VLOG(10) << "Build graph from custom op: " << __op_type; + auto it = custom_ops_.find(__op_type); + auto output_ids = + builder_->customOp(it->second.popart_op, it->second.popart_op.version, + inputs, outputs.size(), attributes, debug_context); + SetIpuIndexStage(output_ids, op_desc); + InsertTensors(outputs, output_ids); + } else if (op_type == "popart_printtensor") { + auto inputs = GetOpInputs(op_desc); + auto outputs = GetOpOutputs(op_desc); + auto debug_context = BuildDebugContext(op_desc); + auto print_gradient = + BOOST_GET_CONST(int64_t, op_desc->GetAttr("print_gradient")); + auto title = 
BOOST_GET_CONST(std::string, op_desc->GetAttr("title")); + auto output_ids = builder_->aiGraphcoreOpset1().printtensor( + inputs, print_gradient, debug_context, title); + SetIpuIndexStage(output_ids, op_desc); + InsertTensors(outputs, output_ids); + } else { + auto itr = name_function_.find(op_type); + if (itr != name_function_.end()) { + itr->second(node->Op()); + } else { + PADDLE_THROW(platform::errors::NotFound( + "%s is not registered, please check for unsupported operators for " + "running on IPU", + op_type)); + } } + } + VLOG(10) << "leave Compiler::LowerBody"; +} +void Compiler::LowerOptimizer(const Scope* scope) { + for (auto* node : graph_helper_->sorted_ops) { auto* op_desc = node->Op(); auto op_type = op_desc->Type(); if (op_type == "popart_optimizer") { diff --git a/paddle/fluid/platform/device/ipu/ipu_compiler.h b/paddle/fluid/platform/device/ipu/ipu_compiler.h index 5576266b1a7716..5d1e8c2727d8f9 100644 --- a/paddle/fluid/platform/device/ipu/ipu_compiler.h +++ b/paddle/fluid/platform/device/ipu/ipu_compiler.h @@ -68,34 +68,29 @@ struct CompilerResources { std::unique_ptr optimizer; }; +// helper for lowering graph +struct GraphHelper { + explicit GraphHelper(const Graph *); + + const Graph *graph; + std::map vars_name_map; + std::map nodes_id_map; + std::vector sorted_ops; + std::vector sorted_vars_id; +}; + class Compiler { public: Compiler(); ~Compiler(); - void RegisterOpFunc(); - void Prepare(); - void LowerBody(const Graph *graph); - void InitInputs(Graph *graph, const std::vector &feed_list); + void Prepare(const Graph *graph); + void InitInputs(const std::vector &feed_list); void InitOutputs(const std::vector &fetch_list); - void LowerConstants(const Graph *graph, const Scope *scope); - void LowerWeights(const Graph *graph, const Scope *scope); - void LowerOptimier(const Graph *graph, const Scope *scope); - - void InsertTensors(const std::vector &output_names, - const std::vector &tensor_ids); - void InsertTensors(const std::vector &output_names, - const std::string &tensor_id); - void SetIpuIndexStage(const std::vector &tensor_ids, - const OpDesc *op_desc); - void SetIpuIndexStage(const std::string &tensor_id, const OpDesc *op_desc); - void SetAMPAttributes(const std::vector &tensor_ids, - const OpDesc *op_desc); - void SetAMPAttributes(const std::string &tensor_id, const OpDesc *op_desc); - void SetSerializeAttributes(const std::vector &tensor_ids, - const OpDesc *op_desc); - void SetSerializeAttributes(const std::string &tensor_id, - const OpDesc *op_desc); + void LowerConstants(const Scope *scope); + void LowerWeights(const Scope *scope); + void LowerBody(); + void LowerOptimizer(const Scope *scope); void SetIpuStrategy(const IpuStrategy &strategy) { ipu_strategy_ = &strategy; @@ -112,21 +107,34 @@ class Compiler { void SaveModelProtoNoCheck(const std::string &path); private: + void RegisterOpFunc(); std::vector GetOpInputs(const OpDesc *op); const std::vector &GetOpOutputs(const OpDesc *op); popart::DebugContext BuildDebugContext(const OpDesc *op); + void InsertTensors(const std::vector &output_names, + const std::vector &tensor_ids); + void InsertTensors(const std::vector &output_names, + const std::string &tensor_id); + void SetIpuIndexStage(const std::vector &tensor_ids, + const OpDesc *op_desc); + void SetIpuIndexStage(const std::string &tensor_id, const OpDesc *op_desc); + void SetAMPAttributes(const std::vector &tensor_ids, + const OpDesc *op_desc); + void SetAMPAttributes(const std::string &tensor_id, const OpDesc *op_desc); + void 
SetSerializeAttributes(const std::vector &tensor_ids, + const OpDesc *op_desc); + void SetSerializeAttributes(const std::string &tensor_id, + const OpDesc *op_desc); + private: std::unique_ptr builder_; std::unique_ptr resources_; + std::unique_ptr graph_helper_; using OpFunc = std::function; std::unordered_map name_function_; - // feed_list_ & fetch_list save paddle tensor id - std::vector feed_list_; - std::vector fetch_list_; - const IpuStrategy *ipu_strategy_ = nullptr; std::map custom_ops_; }; diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.cc b/paddle/fluid/platform/device/ipu/ipu_strategy.cc index 4a9b9c00cb75cd..943dfcc6cffb87 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.cc +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.cc @@ -241,6 +241,15 @@ IpuStrategy::IpuStrategy() { #undef ADD_POPART_BOOL_OPTION_ALIAS #undef ADD_POPART_ENUM_OPTION_ALIAS + RegisterGetter(vector_options_getter, options_type, "custom_ops", "vector", + [&]() { + std::vector res; + for (auto x : custom_ops) { + res.push_back(x.repr()); + } + return res; + }); + RegisterSetter(bool_options, "enable_manual_shard", [&](bool value) { if (value) { popart_options.virtualGraphMode = popart::VirtualGraphMode::Manual; @@ -429,6 +438,14 @@ void IpuStrategy::SetTensorLocation(const std::string& tensor, } } +void IpuStrategy::AddCustomOp(const std::string& paddle_op, + const std::string& popart_op, + const std::string& domain, int version) { + LOG(INFO) << "IpuStrategy add custom op: " << paddle_op; + custom_ops.push_back( + IpuCustomOpIdentifier(paddle_op, popart_op, domain, version)); +} + std::string IpuStrategy::GetOption(const std::string& option) { return get(option, options_getter); } diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.h b/paddle/fluid/platform/device/ipu/ipu_strategy.h index 0e2af26454c401..64436dc14fec33 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.h +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.h @@ -17,6 +17,7 @@ limitations under the License. 
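AddCustomOp above records a paddle-op-to-popart-op mapping on the strategy object, and IpuBackend::SetIpuStrategy (see the ipu_backend.cc hunk earlier) now forwards strategy.custom_ops to the compiler. A minimal usage sketch, with hypothetical op names and an assumed namespace path:

    // Hypothetical names; domain/version follow popart's custom-op scheme.
    paddle::platform::ipu::IpuStrategy strategy;
    strategy.AddCustomOp("custom_relu", "Relu", "custom.ops", 1);
    // When the backend receives this strategy, Compiler::SetCustomOps gets the
    // recorded IpuCustomOpIdentifier entries, and popart_custom_op nodes are
    // lowered through builder_->customOp(...).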
*/ #include #include #include +#include "paddle/fluid/platform/device/ipu/ipu_utils.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -71,6 +72,9 @@ struct IpuStrategy { // popart pattern manager popart::Patterns popart_patterns; + // custom ops + std::vector custom_ops; + private: std::map> bool_options; std::map> uint64_options; @@ -123,6 +127,8 @@ struct IpuStrategy { const std::string &value); void SetTensorLocation(const std::string &tensor, const std::string &option, std::uint64_t value); + void AddCustomOp(const std::string &paddle_op, const std::string &popart_op, + const std::string &domain, int version); std::string GetOption(const std::string &); std::vector GetVectorOption(const std::string &); diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 6e7c98dd7156c5..e6b08ed7bc340b 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -134,7 +134,7 @@ XPUOpMap& get_kl2_ops() { XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT16, XPUPlace()), - pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), pOpKernelType(vartype::FP64, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace()), @@ -196,6 +196,7 @@ XPUOpMap& get_kl2_ops() { {"hard_swish_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), pOpKernelType(vartype::FP16, XPUPlace())})}, + {"hard_swish", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"huber_loss_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"huber_loss", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, diff --git a/paddle/fluid/pybind/bind_fleet_executor.cc b/paddle/fluid/pybind/bind_fleet_executor.cc index 7bb7f03983eb9e..b29cc10e8f56f5 100644 --- a/paddle/fluid/pybind/bind_fleet_executor.cc +++ b/paddle/fluid/pybind/bind_fleet_executor.cc @@ -24,10 +24,41 @@ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/place.h" +#include "pybind11/pybind11.h" namespace py = pybind11; +namespace pybind11 { +namespace detail { + +// Note: use same enum number of float16 in numpy. +// import numpy as np +// print np.dtype(np.float16).num # 23 +constexpr int NPY_FLOAT16_ = 23; + +// Note: Since float16 is not a builtin type in C++, we register +// paddle::platform::float16 as numpy.float16. +// Ref: https://github.com/pybind/pybind11/issues/1776 +template <> +struct npy_format_descriptor { + static py::dtype dtype() { + handle ptr = npy_api::get().PyArray_DescrFromType_(NPY_FLOAT16_); + return reinterpret_borrow(ptr); + } + static std::string format() { + // Note: "e" represents float16. + // Details at: + // https://docs.python.org/3/library/struct.html#format-characters. 
+ return "e"; + } + static constexpr auto name = _("float16"); +}; + +} // namespace detail +} // namespace pybind11 + namespace paddle { namespace pybind { @@ -175,6 +206,7 @@ void BindFleetExecutor(py::module* m) { .def(py::init(&DistModelDataBufCreate)) .def(py::init(&DistModelDataBufCreate)) .def(py::init(&DistModelDataBufCreate)) + .def(py::init(&DistModelDataBufCreate)) .def("reset", [](DistModelDataBuf& self, std::vector& data) { self.Resize(data.size() * sizeof(float)); @@ -183,29 +215,35 @@ void BindFleetExecutor(py::module* m) { .def("reset", &DistModelDataBufReset) .def("reset", &DistModelDataBufReset) .def("reset", &DistModelDataBufReset) + .def("reset", &DistModelDataBufReset) .def("length", &DistModelDataBuf::length) - .def("tolist", - [](DistModelDataBuf& self, const std::string& dtype) -> py::list { - py::list l; - if (dtype == "int32") { - auto* data = static_cast(self.data()); - auto size = self.length() / sizeof(int32_t); - l = py::cast(std::vector(data, data + size)); - } else if (dtype == "int64") { - auto* data = static_cast(self.data()); - auto size = self.length() / sizeof(int64_t); - l = py::cast(std::vector(data, data + size)); - } else if (dtype == "float32") { - auto* data = static_cast(self.data()); - auto size = self.length() / sizeof(float); - l = py::cast(std::vector(data, data + size)); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported data type. Now only supports INT32, INT64 and " - "FLOAT32.")); - } - return l; - }); + .def("tolist", [](DistModelDataBuf& self, + const std::string& dtype) -> py::list { + py::list l; + if (dtype == "int32") { + auto* data = static_cast(self.data()); + auto size = self.length() / sizeof(int32_t); + l = py::cast(std::vector(data, data + size)); + } else if (dtype == "int64") { + auto* data = static_cast(self.data()); + auto size = self.length() / sizeof(int64_t); + l = py::cast(std::vector(data, data + size)); + } else if (dtype == "float32") { + auto* data = static_cast(self.data()); + auto size = self.length() / sizeof(float); + l = py::cast(std::vector(data, data + size)); + } else if (dtype == "float16") { + auto* data = static_cast(self.data()); + auto size = self.length() / sizeof(paddle::platform::float16); + l = py::cast( + std::vector(data, data + size)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported data type. Now only supports INT32, INT64, " + "FLOAT16 and FLOAT32.")); + } + return l; + }); py::class_(*m, "DistModelTensor") .def(py::init<>()) @@ -221,6 +259,10 @@ void BindFleetExecutor(py::module* m) { py::arg("name") = "", py::arg("lod") = std::vector>(), py::arg("copy") = true) + .def(py::init(&DistModelTensorCreate), + py::arg("data"), py::arg("name") = "", + py::arg("lod") = std::vector>(), + py::arg("copy") = true) .def_readwrite("name", &DistModelTensor::name) .def_readwrite("shape", &DistModelTensor::shape) .def_readwrite("data", &DistModelTensor::data) @@ -231,7 +273,8 @@ void BindFleetExecutor(py::module* m) { py::enum_(*m, "DistModelDataType") .value("FLOAT32", DistModelDataType::FLOAT32) .value("INT64", DistModelDataType::INT64) - .value("INT32", DistModelDataType::INT32); + .value("INT32", DistModelDataType::INT32) + .value("FLOAT16", DistModelDataType::FLOAT16); } } // namespace pybind } // namespace paddle diff --git a/paddle/phi/api/all.h b/paddle/phi/api/all.h index 8d840214092ba9..06f3cd84476061 100644 --- a/paddle/phi/api/all.h +++ b/paddle/phi/api/all.h @@ -41,7 +41,6 @@ limitations under the License. 
*/ #include "paddle/phi/api/ext/dispatch.h" #include "paddle/phi/api/ext/dll_decl.h" #include "paddle/phi/api/ext/exception.h" -#include "paddle/phi/api/ext/op_kernel_info.h" #include "paddle/phi/api/ext/op_meta_info.h" #include "paddle/phi/api/ext/place.h" #include "paddle/phi/api/ext/tensor_compat.h" diff --git a/paddle/phi/api/ext/op_kernel_info.h b/paddle/phi/api/ext/op_kernel_info.h index b52b0abe9e745d..b3adbe9d18b966 100644 --- a/paddle/phi/api/ext/op_kernel_info.h +++ b/paddle/phi/api/ext/op_kernel_info.h @@ -630,16 +630,16 @@ class PADDLE_API OpKernelInfoBuilder { }; /////////////////////// Custom kernel register API ///////////////////////// // For inference: compile directly with framework -// Call after PD_REGISTER_KERNEL(...) +// Call after PD_REGISTER_BUILTIN_KERNEL(...) void RegisterAllCustomKernel(); //////////////// Custom kernel register macro ///////////////////// // Refer to paddle/phi/core/kernel_registry.h, we can not use -// PT_REGISTER_KERNEL directly, common macros and functions are +// PD_REGISTER_KERNEL directly, common macros and functions are // not ready for custom kernel now. // Difference: custom_kernel stores all kernels' info into global // g_custom_kernel_info_map before loading and registering into -// pten kernel management. Only providing PD_REGISTER_KERNEL which +// pten kernel management. Only providing PD_REGISTER_BUILTIN_KERNEL which // supports 2 template arguments. #define PD_BACKEND(arg__) phi::Backend::arg__ @@ -666,11 +666,12 @@ void RegisterAllCustomKernel(); #define PD_ID __LINE__ #endif -#define PD_REGISTER_KERNEL(kernel_name, backend, layout, func, cpp_dtype, ...) \ - STATIC_ASSERT_GLOBAL_NAMESPACE( \ - _reg_custom_kernel_ns_check_##kernel_name##_##backend##_##layout, \ - "PD_REGISTER_KERNEL must be called in global namespace."); \ - _PD_REGISTER_2TA_KERNEL( \ +#define PD_REGISTER_BUILTIN_KERNEL( \ + kernel_name, backend, layout, func, cpp_dtype, ...) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + _reg_custom_kernel_ns_check_##kernel_name##_##backend##_##layout, \ + "PD_REGISTER_BUILTIN_KERNEL must be called in global namespace."); \ + _PD_REGISTER_2TA_KERNEL( \ kernel_name, backend, layout, func, cpp_dtype, ##__VA_ARGS__) // WIN32 is not supported diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index 175bf34c0da66f..720c6f54bb075d 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -90,7 +90,6 @@ cc_library(manual_api SRCS manual_api.cc DEPS pten_tensor_raw pten kernel_dispat cc_library(pten_tensor SRCS tensor_method.cc DEPS pten_tensor_raw pten_function_api) cc_library(op_meta_info SRCS op_meta_info.cc DEPS pten_tensor) -cc_library(op_kernel_info SRCS op_kernel_info.cc DEPS pten_tensor_raw) cc_library(sparse_api SRCS sparse_api.cc DEPS pten_tensor pten kernel_dispatch pten_data_transform) cc_library(pten_function_api SRCS ${api_source_file} DEPS pten_tensor pten kernel_dispatch pten_data_transform) diff --git a/paddle/phi/api/lib/api_declare.h b/paddle/phi/api/lib/api_declare.h index 650161a933a8cb..26408290bd325e 100644 --- a/paddle/phi/api/lib/api_declare.h +++ b/paddle/phi/api/lib/api_declare.h @@ -17,6 +17,6 @@ limitations under the License. 
*/ // api symbols declare, remove in the future #include "paddle/phi/api/lib/api_registry.h" -PT_DECLARE_API(Math); -PT_DECLARE_API(Utils); -PT_DECLARE_API(SparseApi); +PD_DECLARE_API(Math); +PD_DECLARE_API(Utils); +PD_DECLARE_API(SparseApi); diff --git a/paddle/phi/api/lib/api_registry.h b/paddle/phi/api/lib/api_registry.h index 2812bede8e09ba..3783620ea449b4 100644 --- a/paddle/phi/api/lib/api_registry.h +++ b/paddle/phi/api/lib/api_registry.h @@ -36,10 +36,10 @@ namespace experimental { */ // use to declare symbol -#define PT_REGISTER_API(name) \ +#define PD_REGISTER_API(name) \ PADDLE_API int RegisterSymbolsFor##name() { return 0; } -#define PT_DECLARE_API(name) \ +#define PD_DECLARE_API(name) \ extern PADDLE_API int RegisterSymbolsFor##name(); \ UNUSED static int use_pten_api_##name = RegisterSymbolsFor##name() diff --git a/paddle/phi/api/lib/manual_api.cc b/paddle/phi/api/lib/manual_api.cc index e0da15eac39b79..7bd4711cc3f308 100644 --- a/paddle/phi/api/lib/manual_api.cc +++ b/paddle/phi/api/lib/manual_api.cc @@ -27,15 +27,15 @@ limitations under the License. */ #include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/infermeta/unary.h" -PT_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(split, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(split, CPU, ALL_LAYOUT); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT); #endif #ifdef PADDLE_WITH_XPU -PT_DECLARE_KERNEL(copy, XPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, XPU, ALL_LAYOUT); #endif namespace paddle { @@ -147,4 +147,4 @@ PADDLE_API std::vector split(const Tensor& x, } // namespace experimental } // namespace paddle -PT_REGISTER_API(Utils); +PD_REGISTER_API(Utils); diff --git a/paddle/phi/api/lib/op_kernel_info.cc b/paddle/phi/api/lib/op_kernel_info.cc index 78b4955f321da0..c2aef8288dae1a 100644 --- a/paddle/phi/api/lib/op_kernel_info.cc +++ b/paddle/phi/api/lib/op_kernel_info.cc @@ -86,7 +86,7 @@ OpKernelInfoBuilder& OpKernelInfoBuilder::ArgsDef(CustomKernelArgsDefFn func) { /////////////////////// Op register API ///////////////////////// // For inference: compile directly with framework -// Call after PD_REGISTER_KERNEL(...) +// Call after PD_REGISTER_BUILTIN_KERNEL(...) void RegisterAllCustomKernel() { auto& op_kernel_info_map = OpKernelInfoMap::Instance(); framework::RegisterKernelWithMetaInfoMap(op_kernel_info_map); diff --git a/paddle/phi/api/lib/sparse_api.cc b/paddle/phi/api/lib/sparse_api.cc index 5a22d617492d21..cc90c2b819daef 100644 --- a/paddle/phi/api/lib/sparse_api.cc +++ b/paddle/phi/api/lib/sparse_api.cc @@ -22,20 +22,20 @@ limitations under the License. 
*/ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/infermeta/unary.h" -PT_DECLARE_KERNEL(dense_to_sparse_coo, CPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(sparse_csr_to_coo, CPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(dense_to_sparse_csr, CPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(sparse_coo_to_csr, CPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(sparse_coo_to_dense, CPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(sparse_csr_to_dense, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(dense_to_sparse_coo, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_csr_to_coo, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(dense_to_sparse_csr, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_coo_to_csr, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_coo_to_dense, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_csr_to_dense, CPU, ALL_LAYOUT); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_DECLARE_KERNEL(dense_to_sparse_coo, GPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(sparse_csr_to_coo, GPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(dense_to_sparse_csr, GPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(sparse_coo_to_csr, GPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(sparse_coo_to_dense, GPU, ALL_LAYOUT); -PT_DECLARE_KERNEL(sparse_csr_to_dense, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(dense_to_sparse_coo, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_csr_to_coo, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(dense_to_sparse_csr, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_coo_to_csr, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_coo_to_dense, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(sparse_csr_to_dense, GPU, ALL_LAYOUT); #endif namespace paddle { @@ -228,4 +228,4 @@ PADDLE_API Tensor to_dense(const Tensor& x, Backend backend) { } // namespace experimental } // namespace paddle -PT_REGISTER_API(SparseApi); +PD_REGISTER_API(SparseApi); diff --git a/paddle/phi/api/lib/utils/tensor_utils.cc b/paddle/phi/api/lib/utils/tensor_utils.cc index 6231922fdbafac..fc56d201fe3ccc 100644 --- a/paddle/phi/api/lib/utils/tensor_utils.cc +++ b/paddle/phi/api/lib/utils/tensor_utils.cc @@ -131,7 +131,7 @@ phi::ScalarArray MakePtenScalarArrayFromVarList( } phi::ScalarArray result{vector_data}; - result.setInitByTensor(true); + result.SetFromTensor(true); return result; } diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index 441bd0a8c303b5..38366d57841b00 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ b/paddle/phi/backends/CMakeLists.txt @@ -21,3 +21,7 @@ endif() if(WITH_GPU) add_dependencies(pten_context gpu_context) endif() + +if(WITH_CUSTOM_DEVICE) + add_dependencies(pten_context custom_context) +endif() diff --git a/paddle/phi/backends/all_context.h b/paddle/phi/backends/all_context.h index b53c5ce5c780cb..3fe03905e42dd3 100644 --- a/paddle/phi/backends/all_context.h +++ b/paddle/phi/backends/all_context.h @@ -21,12 +21,15 @@ limitations under the License. */ // path replacement after implementing pten DeviceContext #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/custom/custom_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/xpu/xpu_context.h" +#ifndef PADDLE_WITH_CUSTOM_KERNEL // TODO(wilber): DeviceContextPool nees include fluid file. 
#include "paddle/fluid/platform/device_context.h" namespace phi { using DeviceContextPool = paddle::platform::DeviceContextPool; } // namespace phi +#endif diff --git a/paddle/phi/backends/custom/custom_context.cc b/paddle/phi/backends/custom/custom_context.cc index 445f550839160f..bde3b6a08539b5 100644 --- a/paddle/phi/backends/custom/custom_context.cc +++ b/paddle/phi/backends/custom/custom_context.cc @@ -32,8 +32,8 @@ struct CustomContext::Impl { const Place& GetPlace() const { return place_; } - C_Stream stream() const { - return reinterpret_cast(stream_->raw_stream()); + void* stream() const { + return reinterpret_cast(stream_->raw_stream()); } void Wait() const { stream_->Wait(); } @@ -47,7 +47,7 @@ void CustomContext::Init() { impl_->Init(); } const Place& CustomContext::GetPlace() const { return impl_->GetPlace(); } -C_Stream CustomContext::stream() const { return impl_->stream(); } +void* CustomContext::stream() const { return impl_->stream(); } void CustomContext::Wait() const { return impl_->Wait(); } diff --git a/paddle/phi/backends/custom/custom_context.h b/paddle/phi/backends/custom/custom_context.h index 109f5e53707f6e..37b0ee21219b59 100644 --- a/paddle/phi/backends/custom/custom_context.h +++ b/paddle/phi/backends/custom/custom_context.h @@ -15,7 +15,6 @@ limitations under the License. */ #pragma once #include -#include "paddle/fluid/platform/device/device_ext.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/device_context.h" @@ -30,7 +29,7 @@ class CustomContext : public DeviceContext { const Place& GetPlace() const override; /*! \brief Return stream in the device context. */ - C_Stream stream() const; + void* stream() const; // Wait for all operations completion in the stream. void Wait() const override; diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index f7c39eacae9bd1..9a2ec093119fdb 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -71,17 +71,17 @@ enum class Backend : uint8_t { * Of course, we have also considered solving this problem through different * named macros, for example, if we define * - * PT_REGISTER_KERNEL_FOR_ALL_BACKEND + * PD_REGISTER_KERNEL_FOR_ALL_BACKEND * * Based on this design pattern, the dtype and layout also have the same * requirements, this cause we need to define a series of macros * - * PT_REGISTER_KERNEL_FOR_ALL_DTYPE - * PT_REGISTER_KERNEL_FOR_ALL_LAYOUT - * PT_REGISTER_KERNEL_FOR_ALL_BACKEND_AND_LAYOUT - * PT_REGISTER_KERNEL_FOR_ALL_BACKEND_AND_DTYPE - * PT_REGISTER_KERNEL_FOR_ALL_LAYOUT_AND_DTYPE - * PT_REGISTER_KERNEL_FOR_ALL_BACKEND_AND_LAYOUT_AND_DTYPE + * PD_REGISTER_KERNEL_FOR_ALL_DTYPE + * PD_REGISTER_KERNEL_FOR_ALL_LAYOUT + * PD_REGISTER_KERNEL_FOR_ALL_BACKEND_AND_LAYOUT + * PD_REGISTER_KERNEL_FOR_ALL_BACKEND_AND_DTYPE + * PD_REGISTER_KERNEL_FOR_ALL_LAYOUT_AND_DTYPE + * PD_REGISTER_KERNEL_FOR_ALL_BACKEND_AND_LAYOUT_AND_DTYPE * * It makes the system of registering macros more complicated, we think * this is not a simple design, so we still adopt the design of providing @@ -130,6 +130,32 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) { return os; } +inline Backend StringToBackend(const char* backend_cstr) { + std::string s(backend_cstr); + if (s == std::string("Undefined")) { + return Backend::UNDEFINED; + } + for (size_t i = 0; i < s.size(); ++i) { + s[i] = toupper(s[i]); + } + if (s == std::string("CPU")) { + return Backend::CPU; + } else if (s == std::string("GPU")) { + return Backend::GPU; + } else if (s == std::string("XPU")) { + 
return Backend::XPU; + } else if (s == std::string("NPU")) { + return Backend::NPU; + } else if (s == std::string("MKLDNN")) { + return Backend::MKLDNN; + } else if (s == std::string("CUDNN")) { + return Backend::CUDNN; + } else { + return static_cast(static_cast(Backend::NUM_BACKENDS) + + phi::GetOrRegisterGlobalDeviceTypeId(s)); + } +} + } // namespace experimental } // namespace paddle diff --git a/paddle/phi/common/scalar.h b/paddle/phi/common/scalar.h index 092e05e95979a0..9a5a3fbf921d07 100644 --- a/paddle/phi/common/scalar.h +++ b/paddle/phi/common/scalar.h @@ -25,7 +25,7 @@ namespace experimental { template class ScalarBase { public: - bool IsInitByTensor() const { return is_init_by_tensor_; } + bool FromTensor() const { return is_from_tensor_; } // Constructor support implicit ScalarBase(double val) : dtype_(DataType::FLOAT64) { // NOLINT data_.f64 = val; @@ -104,7 +104,7 @@ class ScalarBase { // The Tensor must have one dim ScalarBase(const T& tensor) : dtype_(tensor.dtype()) { // NOLINT - is_init_by_tensor_ = true; + is_from_tensor_ = true; PD_CHECK( tensor.numel() == 1, "The Scalar only supports Tensor with 1 element, but now Tensor has `", @@ -191,12 +191,14 @@ class ScalarBase { } } + DataType dtype() const { return dtype_; } + private: template friend void CopyScalar(const ScalarBase& src, ScalarBase* dst); private: - bool is_init_by_tensor_{false}; + bool is_from_tensor_{false}; DataType dtype_; union data { bool b; diff --git a/paddle/phi/common/scalar_array.h b/paddle/phi/common/scalar_array.h index 522228ba99e0b5..39284095961a72 100644 --- a/paddle/phi/common/scalar_array.h +++ b/paddle/phi/common/scalar_array.h @@ -43,13 +43,13 @@ class ScalarArrayBase { AssignData(date_value, n); } - bool IsInitByTensor() const { return is_init_by_tensor_; } + bool FromTensor() const { return is_from_tensor_; } - void setInitByTensor(bool val) { is_init_by_tensor_ = val; } + void SetFromTensor(bool val) { is_from_tensor_ = val; } // The Tensor must have one dim ScalarArrayBase(const T& tensor) { // NOLINT - is_init_by_tensor_ = true; + is_from_tensor_ = true; size_t n = tensor.numel(); array_.reserve(n); switch (tensor.dtype()) { @@ -71,7 +71,7 @@ class ScalarArrayBase { // The Tensor in vec must have only one element ScalarArrayBase(const std::vector& tensor_list) { // NOLINT - is_init_by_tensor_ = true; + is_from_tensor_ = true; for (size_t i = 0; i < tensor_list.size(); ++i) { DataType data_type = tensor_list[i].dtype(); @@ -117,7 +117,7 @@ class ScalarArrayBase { // TODO(zhangyunfei) Replace std::vector with a more efficient container // structure. 
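StringToBackend, added to backend.h above, is what allows the kernel registrar below to take the backend as a stringified token rather than a Backend enum value: known names resolve case-insensitively to the enum, and any other name is treated as a registered custom device type. A rough illustration (not part of this patch):

using paddle::experimental::Backend;
using paddle::experimental::StringToBackend;

void StringToBackendExamples() {
  Backend a = StringToBackend("CPU");        // Backend::CPU
  Backend b = StringToBackend("gpu");        // case-insensitive: Backend::GPU
  Backend c = StringToBackend("Undefined");  // Backend::UNDEFINED
  // Unknown names fall through to NUM_BACKENDS plus
  // GetOrRegisterGlobalDeviceTypeId("MY_DEVICE"), i.e. they resolve to
  // custom device backends.
  Backend d = StringToBackend("my_device");
  (void)a; (void)b; (void)c; (void)d;
}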
std::vector array_; - bool is_init_by_tensor_{false}; + bool is_from_tensor_{false}; }; using ScalarArray = diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt index 18f209377bafc7..32b9b42f74f621 100644 --- a/paddle/phi/core/CMakeLists.txt +++ b/paddle/phi/core/CMakeLists.txt @@ -25,6 +25,8 @@ cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base tensor_meta dense_te cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor) cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor mixed_vector pten_enforce ddim) +cc_library(pten_custom_kernel SRCS custom_kernel.cc DEPS kernel_factory convert_utils) + # Will remove once we implemented MKLDNN_Tensor if(WITH_MKLDNN) add_dependencies(dense_tensor mkldnn) diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h index 5c0c440d8942c8..ec810d4e163408 100644 --- a/paddle/phi/core/compat/op_utils.h +++ b/paddle/phi/core/compat/op_utils.h @@ -164,34 +164,34 @@ struct ArgumentMappingFnRegistrar { } }; -#define PT_REGISTER_BASE_KERNEL_NAME(op_type, base_kernel_name) \ +#define PD_REGISTER_BASE_KERNEL_NAME(op_type, base_kernel_name) \ PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_register_base_kernel_name_ns_check_##op_type, \ - "PT_REGISTER_BASE_KERNEL_NAME must be called in global namespace."); \ + PD_REGISTER_base_kernel_name_ns_check_##op_type, \ + "PD_REGISTER_BASE_KERNEL_NAME must be called in global namespace."); \ static const ::phi::BaseKernelNameRegistrar \ __registrar_base_kernel_name_for_##op_type(#op_type, #base_kernel_name); \ int TouchBaseKernelNameSymbol_##op_type() { return 0; } -#define PT_DECLARE_BASE_KERNEL_NAME(op_type) \ +#define PD_DECLARE_BASE_KERNEL_NAME(op_type) \ PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_declare_ai_name_ns_check_##op_type, \ - "PT_DECLARE_BASE_KERNEL_NAME must be called in global namespace."); \ + PD_DECLARE_ai_name_ns_check_##op_type, \ + "PD_DECLARE_BASE_KERNEL_NAME must be called in global namespace."); \ extern int TouchBaseKernelNameSymbol_##op_type(); \ UNUSED static int __declare_base_kernel_name_symbol_for_##op_type = \ TouchBaseKernelNameSymbol_##op_type() -#define PT_REGISTER_ARG_MAPPING_FN(op_type, arg_mapping_fn) \ +#define PD_REGISTER_ARG_MAPPING_FN(op_type, arg_mapping_fn) \ PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_register_arg_map_fn_ns_check_##op_type, \ - "PT_REGISTER_ARG_MAPPING_FN must be called in global namespace."); \ + PD_REGISTER_arg_map_fn_ns_check_##op_type, \ + "PD_REGISTER_ARG_MAPPING_FN must be called in global namespace."); \ static const ::phi::ArgumentMappingFnRegistrar \ __registrar_arg_map_fn_for_##op_type(#op_type, arg_mapping_fn); \ int TouchArgumentMappingFnSymbol_##op_type() { return 0; } -#define PT_DECLARE_ARG_MAPPING_FN(op_type) \ +#define PD_DECLARE_ARG_MAPPING_FN(op_type) \ PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_declare_arg_map_fn_ns_check_##op_type, \ - "PT_DECLARE_ARG_MAPPING_FN must be called in global namespace."); \ + PD_DECLARE_arg_map_fn_ns_check_##op_type, \ + "PD_DECLARE_ARG_MAPPING_FN must be called in global namespace."); \ extern int TouchArgumentMappingFnSymbol_##op_type(); \ UNUSED static int __declare_arg_map_fn_symbol_for_##op_type = \ TouchArgumentMappingFnSymbol_##op_type() diff --git a/paddle/phi/core/custom_kernel.cc b/paddle/phi/core/custom_kernel.cc new file mode 100644 index 00000000000000..75ff9cc2860037 --- /dev/null +++ b/paddle/phi/core/custom_kernel.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/custom_kernel.h" + +namespace phi { + +void RegisterCustomKernels(const CustomKernelMap& custom_kernel_map) { + auto& kernel_info_map = custom_kernel_map.GetMap(); + VLOG(3) << "Size of custom_kernel_map: " << kernel_info_map.size(); + + for (auto& pair : kernel_info_map) { + PADDLE_ENFORCE_EQ( + KernelFactory::Instance().HasCompatiblePtenKernel(pair.first), + true, + phi::errors::InvalidArgument( + "The kernel %s is not ready for custom kernel registration.", + pair.first)); + + for (auto& info_pair : pair.second) { + auto& kernels = KernelFactory::Instance().kernels(); + PADDLE_ENFORCE_EQ( + kernels[pair.first].find(info_pair.first), + kernels[pair.first].end(), + phi::errors::InvalidArgument( + "The operator <%s>'s kernel: %s already exists in Paddle. " + "Please contribute a PR if it is necessary to optimize the " + "kernel code. Custom kernels do NOT support replacing " + "existing kernels in Paddle.", + pair.first, + info_pair.first)); + + kernels[pair.first][info_pair.first] = info_pair.second; + + VLOG(3) << "Succeeded in registering operator <" << pair.first + << ">'s kernel: " << info_pair.first + << " to Paddle. It will be used like a native one."; + } + } +} + +} // namespace phi + +#ifdef __cplusplus +extern "C" { +#endif + +// C-API to get the global CustomKernelMap. +phi::CustomKernelMap& PD_GetCustomKernelMap() { + return phi::CustomKernelMap::Instance(); +} + +#ifdef __cplusplus +} // end extern "C" +#endif diff --git a/paddle/phi/core/custom_kernel.h b/paddle/phi/core/custom_kernel.h new file mode 100644 index 00000000000000..20ae2b7bb7360a --- /dev/null +++ b/paddle/phi/core/custom_kernel.h @@ -0,0 +1,49 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/kernel_factory.h" +#include "paddle/phi/core/macros.h" + +namespace phi { +/** + * Note: + * Used to store kernels' info before they are registered to the KernelFactory.
+ */ +class CustomKernelMap { + public: + static CustomKernelMap& Instance() { + static CustomKernelMap g_custom_kernel_info_map; + return g_custom_kernel_info_map; + } + + KernelNameMap& Kernels() { return kernels_; } + + const KernelNameMap& GetMap() const { return kernels_; } + + private: + CustomKernelMap() = default; + DISABLE_COPY_AND_ASSIGN(CustomKernelMap); + + KernelNameMap kernels_; +}; + +/** + * Note: + * Used to register custom kernels to KernelFactory. + */ +void RegisterCustomKernels(const CustomKernelMap& custom_kernel_map); + +} // namespace phi diff --git a/paddle/phi/core/dense_tensor.h b/paddle/phi/core/dense_tensor.h index 622cedf1d7f91e..0dddd63099bbca 100644 --- a/paddle/phi/core/dense_tensor.h +++ b/paddle/phi/core/dense_tensor.h @@ -171,6 +171,9 @@ class DenseTensor : public TensorBase, DenseTensorMeta meta_; std::shared_ptr holder_; +#ifndef PADDLE_WITH_CUSTOM_KERNEL #include "paddle/phi/core/dense_tensor.inl" +#endif }; + } // namespace phi diff --git a/paddle/phi/core/infermeta_utils.h b/paddle/phi/core/infermeta_utils.h index 2b98ab22bcdbd4..1b8cfea130d490 100644 --- a/paddle/phi/core/infermeta_utils.h +++ b/paddle/phi/core/infermeta_utils.h @@ -282,10 +282,10 @@ struct InferMetaFnRegistrar { } }; -#define PT_REGISTER_INFER_META_FN(kernel_name_prefix, variadic_infer_meta_fn) \ +#define PD_REGISTER_INFER_META_FN(kernel_name_prefix, variadic_infer_meta_fn) \ PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_register_infer_meta_fn_ns_check_##kernel_name_prefix, \ - "PT_REGISTER_INFER_META_FN must be called in global namespace."); \ + PD_REGISTER_infer_meta_fn_ns_check_##kernel_name_prefix, \ + "PD_REGISTER_INFER_META_FN must be called in global namespace."); \ static const ::phi::InferMetaFnRegistrar \ __registrar_arg_map_fn_for_##kernel_name_prefix( \ #kernel_name_prefix, PT_INFER_META(variadic_infer_meta_fn)) diff --git a/paddle/phi/core/kernel_context.h b/paddle/phi/core/kernel_context.h index 0b960004fcb272..57e2db60c24cae 100644 --- a/paddle/phi/core/kernel_context.h +++ b/paddle/phi/core/kernel_context.h @@ -22,6 +22,7 @@ #include "paddle/phi/core/tensor_base.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/utils/any.h" +#include "paddle/utils/optional.h" #include "paddle/utils/small_vector.h" namespace phi { diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index 577e9e28cf3791..4603f4123acd02 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -21,6 +21,7 @@ #include #include +#include "paddle/phi/core/custom_kernel.h" #include "paddle/phi/core/kernel_factory.h" #include "paddle/phi/core/kernel_utils.h" #include "paddle/phi/core/macros.h" @@ -62,6 +63,9 @@ struct KernelArgsParseFunctor { #elif defined(PADDLE_WITH_XPU) || arg_type == std::type_index(typeid(const XPUContext&))) { +#elif defined(PADDLE_WITH_CUSTOM_DEVICE) + || + arg_type == std::type_index(typeid(const CustomContext&))) { #else ) { #endif @@ -83,11 +87,13 @@ struct KernelArgsParseFunctor { default_tensor_layout, default_key.dtype(), arg_type); +#ifndef PADDLE_WITH_CUSTOM_KERNEL } else if (arg_type == std::type_index(typeid(const SelectedRows&))) { args_def->AppendInput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); +#endif } else if (arg_type == std::type_index(typeid(DenseTensor*))) { args_def->AppendOutput(default_key.backend(), default_tensor_layout, @@ -99,11 +105,13 @@ struct KernelArgsParseFunctor { default_tensor_layout, default_key.dtype(), arg_type); +#ifndef 
PADDLE_WITH_CUSTOM_KERNEL } else if (arg_type == std::type_index(typeid(SelectedRows*))) { args_def->AppendOutput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); +#endif } else { // Attribute deal with // TODO(chenweihang): now here allow any types of attribute, maybe @@ -121,20 +129,28 @@ struct KernelArgsParseFunctor { } }; +// NOTE: used for making a difference between kernels compiled with phi or not. +enum class RegType : uint8_t { + BUILTIN = 0, // compiled with phi + PLUGIN, // separate compiled and registered +}; + // TODO(chenweihang): Polish the kernel selection logic, support the selection // of ALL_DTYPE kernel, and simplify the constructor struct KernelRegistrar { public: - KernelRegistrar(const char* kernel_name_cstr, - Backend backend, + KernelRegistrar(RegType reg_type, + const char* kernel_name_cstr, + const char* backend_cstr, DataLayout layout, DataType dtype, KernelArgsParseFn args_parse_fn, KernelArgsDefFn args_def_fn, KernelFn kernel_fn, void* variadic_kernel_fn) { - ConstructKernel(kernel_name_cstr, - backend, + ConstructKernel(reg_type, + kernel_name_cstr, + backend_cstr, layout, dtype, args_parse_fn, @@ -143,8 +159,9 @@ struct KernelRegistrar { variadic_kernel_fn); } - KernelRegistrar(const char* kernel_name_cstr, - Backend backend, + KernelRegistrar(RegType reg_type, + const char* kernel_name_cstr, + const char* backend_cstr, DataLayout layout, KernelArgsParseFn args_parse_fn, KernelArgsDefFn args_def_fn, @@ -160,8 +177,9 @@ struct KernelRegistrar { dtype == static_cast(DataType::UINT16)) { continue; } - ConstructKernel(kernel_name_cstr, - backend, + ConstructKernel(reg_type, + kernel_name_cstr, + backend_cstr, layout, static_cast(dtype), args_parse_fn, @@ -172,8 +190,9 @@ struct KernelRegistrar { } private: - void ConstructKernel(const char* kernel_name_cstr, - Backend backend, + void ConstructKernel(RegType reg_type, + const char* kernel_name_cstr, + const char* backend_cstr, DataLayout layout, DataType dtype, KernelArgsParseFn args_parse_fn, @@ -181,11 +200,16 @@ struct KernelRegistrar { KernelFn kernel_fn, void* variadic_kernel_fn) { std::string kernel_name(kernel_name_cstr); - KernelKey kernel_key(backend, layout, dtype); + KernelKey kernel_key( + paddle::experimental::StringToBackend(backend_cstr), layout, dtype); Kernel kernel(kernel_fn, variadic_kernel_fn); args_parse_fn(kernel_key, kernel.mutable_args_def()); args_def_fn(kernel_key, &kernel); - KernelFactory::Instance().kernels()[kernel_name][kernel_key] = kernel; + if (reg_type == RegType::BUILTIN) { + KernelFactory::Instance().kernels()[kernel_name][kernel_key] = kernel; + } else { + CustomKernelMap::Instance().Kernels()[kernel_name][kernel_key] = kernel; + } } }; @@ -210,7 +234,7 @@ struct KernelRegistrar { #define _PT_ARG_N(args) _PT_ARG_N_EXPAND args #define _PT_RESQ_N() 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 -/** PT_REGISTER_KERNEL +/** PD_REGISTER_KERNEL * * The most frequently used kernel registration macro, used for kernel * registration with only data type as template parameter, and the function @@ -219,22 +243,39 @@ struct KernelRegistrar { * * Note: `2TA` means `2 template argument` */ -#define PT_REGISTER_KERNEL(kernel_name, backend, layout, meta_kernel_fn, ...) 
\ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_register_tp_kernel_ns_check_##kernel_name##_##backend##_##layout, \ - "PT_REGISTER_KERNEL must be called in global namespace."); \ - PT_EXPAND(_PT_REGISTER_2TA_KERNEL( \ - kernel_name, backend, layout, meta_kernel_fn, __VA_ARGS__)) +#define PD_REGISTER_KERNEL(kernel_name, backend, layout, meta_kernel_fn, ...) \ + _PD_REGISTER_KERNEL(::phi::RegType::BUILTIN, \ + kernel_name, \ + backend, \ + ::phi::backend##Context, \ + layout, \ + meta_kernel_fn, \ + __VA_ARGS__) + +#define _PD_REGISTER_KERNEL( \ + reg_type, kernel_name, backend, context, layout, meta_kernel_fn, ...) \ + PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ + PD_REGISTER_tp_kernel_ns_check_##kernel_name##_##backend##_##layout, \ + "PD_REGISTER_KERNEL must be called in global namespace."); \ + PT_EXPAND(_PD_REGISTER_2TA_KERNEL(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + meta_kernel_fn, \ + __VA_ARGS__)) #ifndef _WIN32 -#define _PT_REGISTER_2TA_KERNEL( \ - kernel_name, backend, layout, meta_kernel_fn, ...) \ - PT_KERNEL_INSTANTIATION(meta_kernel_fn, backend, __VA_ARGS__); \ +#define _PD_REGISTER_2TA_KERNEL( \ + reg_type, kernel_name, backend, context, layout, meta_kernel_fn, ...) \ + PT_KERNEL_INSTANTIATION(meta_kernel_fn, backend, context, __VA_ARGS__); \ static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel); \ PT_KERNEL_REGISTRAR_INIT( \ + reg_type, \ kernel_name, \ backend, \ + context, \ layout, \ &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ meta_kernel_fn, \ @@ -254,13 +295,15 @@ struct KernelRegistrar { * * And msvc can work without template instantiation */ -#define _PT_REGISTER_2TA_KERNEL( \ - kernel_name, backend, layout, meta_kernel_fn, ...) \ +#define _PD_REGISTER_2TA_KERNEL( \ + reg_type, kernel_name, backend, context, layout, meta_kernel_fn, ...) \ static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel); \ PT_EXPAND(PT_KERNEL_REGISTRAR_INIT( \ + reg_type, \ kernel_name, \ backend, \ + context, \ layout, \ &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ meta_kernel_fn, \ @@ -269,82 +312,119 @@ struct KernelRegistrar { const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel) #endif -#define PT_KERNEL_INSTANTIATION(meta_kernel_fn, backend, ...) \ - _PT_KERNEL_INSTANTIATION( \ - PT_NARGS(__VA_ARGS__), meta_kernel_fn, backend, __VA_ARGS__) - -#define _PT_KERNEL_INSTANTIATION(N, meta_kernel_fn, backend, ...) \ - PT_CONCATENATE(_PT_KERNEL_INSTANTIATION_, N) \ - (meta_kernel_fn, backend, __VA_ARGS__) - -#define _PT_KERNEL_INSTANTIATION_1(meta_kernel_fn, backend, cpp_dtype) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn -#define _PT_KERNEL_INSTANTIATION_2(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_1(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_3(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_2(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_4(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_3(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_5(meta_kernel_fn, backend, cpp_dtype, ...) 
\ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_4(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_6(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_5(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_7(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_6(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_8(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_7(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_9(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_8(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_10(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_9(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_11(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_10(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_12(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_11(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_13(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_12(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_14(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_13(meta_kernel_fn, backend, __VA_ARGS__)) -#define _PT_KERNEL_INSTANTIATION_15(meta_kernel_fn, backend, cpp_dtype, ...) \ - template decltype(meta_kernel_fn) \ - meta_kernel_fn; \ - PT_EXPAND(_PT_KERNEL_INSTANTIATION_14(meta_kernel_fn, backend, __VA_ARGS__)) - -#define PT_KERNEL_REGISTRAR_INIT( \ - kernel_name, backend, layout, args_def_fn, meta_kernel_fn, ...) \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT(PT_NARGS(__VA_ARGS__), \ - kernel_name, \ - backend, \ - layout, \ - args_def_fn, \ - meta_kernel_fn, \ +#define PT_KERNEL_INSTANTIATION(meta_kernel_fn, backend, context, ...) \ + _PT_KERNEL_INSTANTIATION( \ + PT_NARGS(__VA_ARGS__), meta_kernel_fn, backend, context, __VA_ARGS__) + +#define _PT_KERNEL_INSTANTIATION(N, meta_kernel_fn, backend, context, ...) \ + PT_CONCATENATE(_PT_KERNEL_INSTANTIATION_, N) \ + (meta_kernel_fn, backend, context, __VA_ARGS__) + +#define _PT_KERNEL_INSTANTIATION_1( \ + meta_kernel_fn, backend, context, cpp_dtype) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn +#define _PT_KERNEL_INSTANTIATION_2( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_1( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_3( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) 
\ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_2( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_4( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_3( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_5( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_4( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_6( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_5( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_7( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_6( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_8( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_7( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_9( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_8( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_10( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_9( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_11( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_10( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_12( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_11( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_13( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_12( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_14( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_13( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) +#define _PT_KERNEL_INSTANTIATION_15( \ + meta_kernel_fn, backend, context, cpp_dtype, ...) \ + template decltype( \ + meta_kernel_fn) meta_kernel_fn; \ + PT_EXPAND(_PT_KERNEL_INSTANTIATION_14( \ + meta_kernel_fn, backend, context, __VA_ARGS__)) + +#define PT_KERNEL_REGISTRAR_INIT(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + args_def_fn, \ + meta_kernel_fn, \ + ...) 
\ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT(PT_NARGS(__VA_ARGS__), \ + reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) // clang-format off @@ -352,15 +432,19 @@ struct KernelRegistrar { /* The =pre-commit always treats this macro into the wrong format, and multi-line macros cannot be skipped with NOLINT.*/ #define _PT_KERNEL_REGISTRAR_INIT(N, \ + reg_type, \ kernel_name, \ backend, \ + context, \ layout, \ args_def_fn, \ meta_kernel_fn, \ ...) \ PT_EXPAND(PT_CONCATENATE(_PT_KERNEL_REGISTRAR_INIT_, N) ( \ + reg_type, \ kernel_name, \ backend, \ + context, \ layout, \ PT_ID, \ args_def_fn, \ @@ -369,413 +453,492 @@ struct KernelRegistrar { // clang-format on -#define _PT_KERNEL_REGISTRAR_INIT_1(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ +#define _PT_KERNEL_REGISTRAR_INIT_1(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ int TouchKernelSymbolFor_##kernel_name##_##backend##_##layout() { return 0; } -#define _PT_KERNEL_REGISTRAR_INIT_2(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_1(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_2(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_1(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_3(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) 
\ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_2(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_3(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_2(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_4(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_3(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_4(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_3(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_5(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_4(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_5(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) 
\ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_4(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_6(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_5(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_6(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_5(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_7(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_6(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_7(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_6(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_8(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) 
\ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_7(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_8(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_7(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_9(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_8(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_9(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_8(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_10(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_9(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_10(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) 
\ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_9(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_11(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_10(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_11(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_10(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_12(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_11(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_12(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_11(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_13(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) 
\ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_12(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_13(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_12(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_14(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_13(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_14(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) \ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_13(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) -#define _PT_KERNEL_REGISTRAR_INIT_15(kernel_name, \ - backend, \ - layout, \ - registrar_id, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - static const ::phi::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ - #kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - ::paddle::experimental::CppTypeToDataType::Type(), \ - ::phi::KernelArgsParseFunctor)>::Parse, \ - args_def_fn, \ - PT_KERNEL(meta_kernel_fn), \ - PT_VARIADIC_KERNEL(meta_kernel_fn)); \ - PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_14(kernel_name, \ - backend, \ - layout, \ - PT_ID, \ - args_def_fn, \ - meta_kernel_fn, \ +#define _PT_KERNEL_REGISTRAR_INIT_15(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + registrar_id, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ + ...) 
\ + static const ::phi::KernelRegistrar PT_CONCATENATE( \ + __reg_pt_kernel_##kernel_name##_##backend##_##layout##_, registrar_id)( \ + reg_type, \ + #kernel_name, \ + #backend, \ + DATALAYOUT(layout), \ + ::paddle::experimental::CppTypeToDataType::Type(), \ + ::phi::KernelArgsParseFunctor)>::Parse, \ + args_def_fn, \ + PT_KERNEL(meta_kernel_fn), \ + PT_VARIADIC_KERNEL(meta_kernel_fn)); \ + PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_14(reg_type, \ + kernel_name, \ + backend, \ + context, \ + layout, \ + PT_ID, \ + args_def_fn, \ + meta_kernel_fn, \ __VA_ARGS__)) - -/** PT_REGISTER_GENERAL_KERNEL +/** PD_REGISTER_GENERAL_KERNEL * * Basic Kernel register marco, used to register a instantiated kernel function * with one template argument. */ -#define PT_REGISTER_GENERAL_KERNEL( \ - kernel_name, backend, layout, kernel_fn, dtype) \ +#define PD_REGISTER_GENERAL_KERNEL( \ + kernel_name, backend, layout, kernel_fn, dtype) \ + _PD_REGISTER_GENERAL_KERNEL( \ + ::phi::RegType::BUILTIN, kernel_name, backend, layout, kernel_fn, dtype) + +#define _PD_REGISTER_GENERAL_KERNEL( \ + reg_type, kernel_name, backend, layout, kernel_fn, dtype) \ PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - pt_register_no_t_kernel_ns_check_##kernel_name##_##backend##_##layout, \ - "PT_REGISTER_NO_TEMPLATE_KERNEL must be called in global namespace."); \ - _PT_REGISTER_GENERAL_KERNEL(kernel_name, backend, layout, kernel_fn, dtype) + PD_REGISTER_no_t_kernel_ns_check_##kernel_name##_##backend##_##layout, \ + "PD_REGISTER_NO_TEMPLATE_KERNEL must be called in global namespace."); \ + __PD_REGISTER_GENERAL_KERNEL( \ + reg_type, kernel_name, backend, layout, kernel_fn, dtype) #ifndef _WIN32 -#define _PT_REGISTER_GENERAL_KERNEL( \ - kernel_name, backend, layout, kernel_fn, dtype) \ +#define __PD_REGISTER_GENERAL_KERNEL( \ + reg_type, kernel_name, backend, layout, kernel_fn, dtype) \ template decltype(kernel_fn) kernel_fn; \ static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel); \ static const ::phi::KernelRegistrar \ __reg_pt_kernel_##kernel_name##_##backend##_##layout( \ + reg_type, \ #kernel_name, \ - BACKEND(backend), \ + #backend, \ DATALAYOUT(layout), \ ::phi::KernelArgsParseFunctor::Parse, \ &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ @@ -787,14 +950,15 @@ struct KernelRegistrar { void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel) #else -#define _PT_REGISTER_GENERAL_KERNEL( \ - kernel_name, backend, layout, kernel_fn, dtype) \ +#define __PD_REGISTER_GENERAL_KERNEL( \ + reg_type, kernel_name, backend, layout, kernel_fn, dtype) \ static void __PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout( \ const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel); \ static const ::phi::KernelRegistrar \ __reg_pt_kernel_##kernel_name##_##backend##_##layout( \ + reg_type, \ #kernel_name, \ - BACKEND(backend), \ + #backend, \ DATALAYOUT(layout), \ ::phi::KernelArgsParseFunctor::Parse, \ &__PT_KERNEL_args_def_FN_##kernel_name##_##backend##_##layout, \ @@ -807,18 +971,48 @@ struct KernelRegistrar { const ::phi::KernelKey& kernel_key, ::phi::Kernel* kernel) #endif -/** PT_DECLARE_KERNEL +/** PD_DECLARE_KERNEL * * Used to export the symbols of the file where the kernel is located, * to avoid being removed by linker */ -#define PT_DECLARE_KERNEL(kernel_name, backend, layout) \ +#define PD_DECLARE_KERNEL(kernel_name, backend, layout) \ PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - 
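// Illustration (not part of the patch): each _PT_KERNEL_REGISTRAR_INIT_N step
// above peels one cpp_dtype off the variadic list, emits one static
// ::phi::KernelRegistrar carrying the reg_type, the stringified kernel name and
// backend, the layout, and the DataType mapped from that cpp_dtype, and then
// recurses into _PT_KERNEL_REGISTRAR_INIT_{N-1} with the remaining dtypes. A
// registration like the hypothetical one below therefore produces two registrar
// objects, one per dtype; the trailing braces become the body of the generated
// args-def function (see the sum_raw registration later in this patch for a
// non-empty body).
PD_REGISTER_KERNEL(my_demo_op,             // hypothetical kernel name
                   CPU,
                   ALL_LAYOUT,
                   phi::MyDemoKernel,      // hypothetical kernel function template
                   float,
                   double) {}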
pt_declare_tp_kernel_ns_check_##kernel_name##_##backend##_##layout, \ - "PT_DECLARE_KERNEL must be called in global namespace."); \ + PD_DECLARE_tp_kernel_ns_check_##kernel_name##_##backend##_##layout, \ + "PD_DECLARE_KERNEL must be called in global namespace."); \ extern int TouchKernelSymbolFor_##kernel_name##_##backend##_##layout(); \ UNUSED static int \ __declare_kernel_symbol_for_##kernel_name##_##backend##_##layout = \ TouchKernelSymbolFor_##kernel_name##_##backend##_##layout() +/** PD_REGISTER_BUILTIN_KERNEL + * + * Used to register kernels for built-in backends. + * Support CPU GPU XPU. + */ +#define PD_REGISTER_BUILTIN_KERNEL( \ + kernel_name, backend, layout, meta_kernel_fn, ...) \ + _PD_REGISTER_KERNEL(::phi::RegType::PLUGIN, \ + kernel_name, \ + backend, \ + ::phi::backend##Context, \ + layout, \ + meta_kernel_fn, \ + __VA_ARGS__) + +/** PD_REGISTER_PLUGIN_KERNEL + * + * Used to register kernels for plug-in backends. + * Support user-defined backend such as 'Ascend910'. + */ +#define PD_REGISTER_PLUGIN_KERNEL( \ + kernel_name, backend, layout, meta_kernel_fn, ...) \ + _PD_REGISTER_KERNEL(::phi::RegType::PLUGIN, \ + kernel_name, \ + backend, \ + ::phi::CustomContext, \ + layout, \ + meta_kernel_fn, \ + __VA_ARGS__) + } // namespace phi diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index a9c064f1b8896b..862f61b20400e6 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/custom/custom_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/xpu/xpu_context.h" #include "paddle/phi/common/scalar.h" @@ -22,7 +23,9 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_context.h" +#ifndef PADDLE_WITH_CUSTOM_KERNEL #include "paddle/phi/core/selected_rows.h" +#endif #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" #include "paddle/phi/core/type_defs.h" @@ -210,13 +213,18 @@ struct KernelImpl { #ifdef PADDLE_WITH_XPU PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(XPUContext); #endif +#ifdef PADDLE_WITH_CUSTOM_DEVICE + PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(CustomContext); +#endif /* Input Helpers */ PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(DenseTensor); PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(DenseTensor); PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(DenseTensor); +#ifndef PADDLE_WITH_CUSTOM_KERNEL PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SelectedRows); +#endif PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SparseCooTensor); PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SparseCooTensor); @@ -241,12 +249,18 @@ struct KernelImpl { PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const ScalarArray&); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::string&); + PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); + PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); + PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); + PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); /* Output Helpers */ PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(DenseTensor); PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(DenseTensor); +#ifndef PADDLE_WITH_CUSTOM_KERNEL PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SelectedRows); +#endif 
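// Usage sketch (illustrative, not part of the patch): a translation unit that
// depends on a registered kernel keeps its symbols alive with the declare
// macro, while a plug-in backend registers against ::phi::CustomContext.
PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
// For a user-defined backend (the 'Ascend910' example named in the comment
// above); custom_plugin::AbsKernel is a hypothetical plug-in kernel template:
PD_REGISTER_PLUGIN_KERNEL(
    abs, Ascend910, ALL_LAYOUT, custom_plugin::AbsKernel, float) {}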
PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SparseCooTensor); PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(SparseCooTensor); diff --git a/paddle/phi/core/lod_utils.h b/paddle/phi/core/lod_utils.h index 2b0be4d93429d2..a5f73b66fb99b6 100644 --- a/paddle/phi/core/lod_utils.h +++ b/paddle/phi/core/lod_utils.h @@ -15,10 +15,16 @@ #pragma once // See Note [ Why still include the fluid headers? ] +#ifndef PADDLE_WITH_CUSTOM_KERNEL #include "paddle/fluid/framework/mixed_vector.h" +#endif namespace phi { +#ifndef PADDLE_WITH_CUSTOM_KERNEL using LoD = std::vector>; +#else +using LoD = std::vector>; +#endif void AppendLoD(LoD* lod, const LoD& lod_length); diff --git a/paddle/phi/core/tensor_meta.h b/paddle/phi/core/tensor_meta.h index d5e5e2aa001fd4..ede9b43b1f382d 100644 --- a/paddle/phi/core/tensor_meta.h +++ b/paddle/phi/core/tensor_meta.h @@ -24,12 +24,18 @@ limitations under the License. */ // Note: mixed_vector include many header now, LoD will be // used on CUDA device? Can we use small_vector here? // @zhanlve: Rollback to original LoD for now +#ifndef PADDLE_WITH_CUSTOM_KERNEL #include "paddle/fluid/framework/mixed_vector.h" +#endif namespace phi { using DDim = phi::DDim; +#ifndef PADDLE_WITH_CUSTOM_KERNEL using LoD = std::vector>; +#else +using LoD = std::vector>; +#endif /// \brief The meta data of dense tensor. Take the structure type /// and use all default operations. /// diff --git a/paddle/phi/core/tensor_utils.h b/paddle/phi/core/tensor_utils.h index 04db7c0877ad81..676a590ecbce23 100644 --- a/paddle/phi/core/tensor_utils.h +++ b/paddle/phi/core/tensor_utils.h @@ -31,25 +31,25 @@ class DenseTensorUtils { size_t bytes = tensor.numel() * SizeOf(tensor.dtype()); PADDLE_ENFORCE_GE(tensor.capacity(), bytes, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The memory size %d should be enough to meet the " "volume required by metadata %d.", tensor.capacity(), bytes)); - PADDLE_ENFORCE_GE(begin_idx, - 0, - paddle::platform::errors::OutOfRange( - "The start row index must be greater than 0." - "But received the start index is d%.", - begin_idx)); - PADDLE_ENFORCE_LE(end_idx, - tensor.dims()[0], - paddle::platform::errors::OutOfRange( - "The end row index is out of bound.")); + PADDLE_ENFORCE_GE( + begin_idx, + 0, + phi::errors::OutOfRange("The start row index must be greater than 0." + "But received the start index is d%.", + begin_idx)); + PADDLE_ENFORCE_LE( + end_idx, + tensor.dims()[0], + phi::errors::OutOfRange("The end row index is out of bound.")); PADDLE_ENFORCE_LT( begin_idx, end_idx, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The start row index must be less than the end row index." "But received the start index = %d, the end index = %d.", begin_idx, diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index f79b5982f6194c..a964788b15e312 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
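// Sketch of a kernel signature the updated KernelImpl/KernelCallHelper can
// unpack (hypothetical kernel; the element types of the newly supported
// std::vector attributes are not visible in this rendering of the patch, so
// std::vector<int64_t> below is an assumption):
namespace phi {
template <typename T, typename Context>
void MyClipByAxesKernel(const Context& dev_ctx,             // CPU/GPU/XPU or CustomContext
                        const DenseTensor& x,               // input
                        const std::vector<int64_t>& axes,   // assumed vector attribute
                        float scale,                        // plain scalar attribute
                        DenseTensor* out);                  // output
}  // namespace phi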
*/ #include "paddle/phi/infermeta/binary.h" +#include "paddle/phi/core/ddim.h" #include "paddle/phi/kernels/funcs/common_shape.h" namespace phi { @@ -188,4 +189,40 @@ void ElementwiseRawInferMeta(const MetaTensor& x, out->share_lod(x); } +void HuberLossInferMeta(const MetaTensor& input, + const MetaTensor& label, + float delta, + MetaTensor* out, + MetaTensor* residual, + MetaConfig config) { + auto input_dims = input.dims(); + auto label_dims = label.dims(); + + PADDLE_ENFORCE_EQ(input_dims.size(), + label_dims.size(), + phi::errors::InvalidArgument( + "Input(input) rank and Input(label) rank should be " + "same, but received input rank(%d) != label rank(%d)", + input_dims.size(), + label_dims.size())); + + bool contain_unknown_dim = phi::contain_unknown_dim(input_dims) || + phi::contain_unknown_dim(label_dims); + if (config.is_runtime || !contain_unknown_dim) { + PADDLE_ENFORCE_EQ( + input_dims, + label_dims, + phi::errors::InvalidArgument( + "The Input(input) and Input(label) should have the same " + "shape, but received input shape [%s] != label shape [%s]", + input_dims, + label_dims)); + } + + auto out_dims = label_dims; + residual->set_dims(out_dims); + out->set_dims(out_dims); + out->share_lod(input); +} + } // namespace phi diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 5e3214127ee236..93ef9f5f35abba 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -45,4 +45,11 @@ void ElementwiseRawInferMeta(const MetaTensor& x_meta, const MetaTensor& y_meta, int axis, MetaTensor* out); + +void HuberLossInferMeta(const MetaTensor& input_meta, + const MetaTensor& label_meta, + float delta, + MetaTensor* out, + MetaTensor* residual, + MetaConfig config = MetaConfig()); } // namespace phi diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 2f01174dff9b34..66a91e0ca53e82 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -15,8 +15,8 @@ limitations under the License. */ #include "paddle/phi/infermeta/unary.h" #include - #include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/infermeta_utils.h" namespace phi { @@ -217,7 +217,7 @@ void InferMetaFromVecValue(const MetaTensor& x, MetaTensor* out) { PADDLE_ENFORCE_EQ(!shape.empty(), true, - paddle::platform::errors::InvalidArgument( + phi::errors::InvalidArgument( "The parameter 'shape' in ReshapeOp must be set. 
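// Behaviour sketch for the new InferMeta function (illustrative): with input
// and label both of shape [N, 1], HuberLossInferMeta sets residual and out to
// [N, 1] and shares input's LoD with out; a rank mismatch, or a shape mismatch
// once shapes are known at runtime, raises InvalidArgument. Assuming MetaTensor
// can wrap DenseTensor outputs as it does elsewhere in phi, and with the
// *_tensor variables below being hypothetical DenseTensors:
phi::MetaTensor meta_out(&out_tensor);
phi::MetaTensor meta_residual(&residual_tensor);
phi::HuberLossInferMeta(phi::MetaTensor(&input_tensor),
                        phi::MetaTensor(&label_tensor),
                        /*delta=*/1.0f,
                        &meta_out,
                        &meta_residual);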
" "But received 'shape' is empty.")); auto x_dims = x.dims(); @@ -234,8 +234,42 @@ void InferMetaFromVecValue(const MetaTensor& x, void ReshapeInferMeta(const MetaTensor& x, const ScalarArray& shape, - MetaTensor* out) { - InferMetaFromVecValue(x, shape.GetData(), out); + MetaTensor* out, + MetaConfig config) { + auto& shape_data = shape.GetData(); + PADDLE_ENFORCE_NOT_NULL(out, + phi::errors::InvalidArgument( + "Output(Out) of ReshapeOp should not be null.")); + if (!config.is_runtime && shape.FromTensor()) { + out->set_dims(phi::make_ddim(shape_data)); + out->share_lod(x); + return; + } + PADDLE_ENFORCE_GT(shape_data.size(), + 0, + phi::errors::InvalidArgument( + "The shape's size in ReshapeOp can't be zero.")); + InferMetaFromVecValue(x, shape_data, out); +} + +void ReshapeWithXShapeInferMeta(const MetaTensor& x, + const ScalarArray& shape, + MetaTensor* xshape, + MetaTensor* out, + MetaConfig config) { + PADDLE_ENFORCE_NOT_NULL( + xshape, + phi::errors::InvalidArgument( + "Output(XShape) of ReshapeOp should not be null.")); + const auto& x_dims = x.dims(); + std::vector xshape_dims(x_dims.size() + 1); + xshape_dims[0] = 0; + for (int i = 0; i < x_dims.size(); ++i) { + xshape_dims[i + 1] = x_dims[i]; + } + xshape->set_dims(phi::make_ddim(xshape_dims)); + xshape->share_lod(x); + ReshapeInferMeta(x, shape, out, config); } /* Why not use ReduceInferMeta directly? @@ -505,5 +539,5 @@ void TraceInferMeta( } // namespace phi -PT_REGISTER_INFER_META_FN(copy_to, phi::CopyToInferMeta); -PT_REGISTER_INFER_META_FN(split, phi::SplitInferMeta); +PD_REGISTER_INFER_META_FN(copy_to, phi::CopyToInferMeta); +PD_REGISTER_INFER_META_FN(split, phi::SplitInferMeta); diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 560ce0d2d4c489..2ab425d42cd33e 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -54,7 +54,14 @@ void InferMetaFromVecValue(const MetaTensor& x, void ReshapeInferMeta(const MetaTensor& x, const ScalarArray& shape, - MetaTensor* out); + MetaTensor* out, + MetaConfig config = MetaConfig()); + +void ReshapeWithXShapeInferMeta(const MetaTensor& x, + const ScalarArray& shape, + MetaTensor* xshape, + MetaTensor* out, + MetaConfig config = MetaConfig()); void ReduceInferMetaBase(const MetaTensor& x, const std::vector& axis, diff --git a/paddle/phi/kernels/cpu/abs_grad_kernel.cc b/paddle/phi/kernels/cpu/abs_grad_kernel.cc index 3c90a348d86a4c..ca42a5eb2976f6 100644 --- a/paddle/phi/kernels/cpu/abs_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/abs_grad_kernel.cc @@ -19,7 +19,7 @@ using phi::dtype::complex; -PT_REGISTER_KERNEL(abs_grad, +PD_REGISTER_KERNEL(abs_grad, CPU, ALL_LAYOUT, phi::AbsGradKernel, @@ -29,7 +29,7 @@ PT_REGISTER_KERNEL(abs_grad, int64_t, complex, complex) {} -PT_REGISTER_KERNEL(abs_double_grad, +PD_REGISTER_KERNEL(abs_double_grad, CPU, ALL_LAYOUT, phi::AbsDoubleGradKernel, diff --git a/paddle/phi/kernels/cpu/abs_kernel.cc b/paddle/phi/kernels/cpu/abs_kernel.cc index 97bd89832870cc..71d818c45e6f3f 100644 --- a/paddle/phi/kernels/cpu/abs_kernel.cc +++ b/paddle/phi/kernels/cpu/abs_kernel.cc @@ -36,7 +36,7 @@ void AbsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { } // namespace phi -PT_REGISTER_KERNEL(abs, +PD_REGISTER_KERNEL(abs, CPU, ALL_LAYOUT, phi::AbsKernel, diff --git a/paddle/phi/kernels/cpu/bernoulli_kernel.cc b/paddle/phi/kernels/cpu/bernoulli_kernel.cc index 4ba965a4e5f1d2..09c07d9ec9dea0 100644 --- a/paddle/phi/kernels/cpu/bernoulli_kernel.cc +++ b/paddle/phi/kernels/cpu/bernoulli_kernel.cc @@ -51,5 
+51,5 @@ void BernoulliKernel(const Context& ctx, } // namespace phi -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( bernoulli, CPU, ALL_LAYOUT, phi::BernoulliKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/cast_kernel.cc b/paddle/phi/kernels/cpu/cast_kernel.cc index 4e95a37270dd43..c2c207bfaf25e5 100644 --- a/paddle/phi/kernels/cpu/cast_kernel.cc +++ b/paddle/phi/kernels/cpu/cast_kernel.cc @@ -58,7 +58,7 @@ void CastKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(cast, +PD_REGISTER_KERNEL(cast, CPU, ALL_LAYOUT, phi::CastKernel, diff --git a/paddle/phi/kernels/cpu/complex_kernel.cc b/paddle/phi/kernels/cpu/complex_kernel.cc index 3a886c3378524c..ae09f2a5effe16 100644 --- a/paddle/phi/kernels/cpu/complex_kernel.cc +++ b/paddle/phi/kernels/cpu/complex_kernel.cc @@ -21,7 +21,7 @@ // See Note [ Why still include the fluid headers? ] #include "paddle/phi/common/complex.h" -PT_REGISTER_KERNEL(conj, +PD_REGISTER_KERNEL(conj, CPU, ALL_LAYOUT, phi::ConjKernel, diff --git a/paddle/phi/kernels/cpu/concat_kernel.cc b/paddle/phi/kernels/cpu/concat_kernel.cc index 7f4cce379e04d4..0cae2599f8d13f 100644 --- a/paddle/phi/kernels/cpu/concat_kernel.cc +++ b/paddle/phi/kernels/cpu/concat_kernel.cc @@ -110,7 +110,7 @@ void ConcatKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(concat, +PD_REGISTER_KERNEL(concat, CPU, ALL_LAYOUT, phi::ConcatKernel, diff --git a/paddle/phi/kernels/cpu/copy_kernel.cc b/paddle/phi/kernels/cpu/copy_kernel.cc index 8a79a5f6b1941e..7dcd75d39e4df5 100644 --- a/paddle/phi/kernels/cpu/copy_kernel.cc +++ b/paddle/phi/kernels/cpu/copy_kernel.cc @@ -56,5 +56,5 @@ void Copy(const Context& dev_ctx, } // namespace phi -PT_REGISTER_GENERAL_KERNEL( +PD_REGISTER_GENERAL_KERNEL( copy, CPU, ALL_LAYOUT, phi::Copy, ALL_DTYPE) {} diff --git a/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc b/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc index 351b2335386a8b..c3c290b4fe91ec 100644 --- a/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/diagonal_grad_kernel.cc @@ -82,7 +82,7 @@ void DiagonalGradKernel(const Context& dev_ctx, } } } // namespace phi -PT_REGISTER_KERNEL(diagonal_grad, +PD_REGISTER_KERNEL(diagonal_grad, CPU, ALL_LAYOUT, phi::DiagonalGradKernel, diff --git a/paddle/phi/kernels/cpu/diagonal_kernel.cc b/paddle/phi/kernels/cpu/diagonal_kernel.cc index 79f09008f3e2e4..df17b458e1166b 100644 --- a/paddle/phi/kernels/cpu/diagonal_kernel.cc +++ b/paddle/phi/kernels/cpu/diagonal_kernel.cc @@ -79,7 +79,7 @@ void DiagonalKernel(const Context& dev_ctx, } } } // namespace phi -PT_REGISTER_KERNEL(diagonal, +PD_REGISTER_KERNEL(diagonal, CPU, ALL_LAYOUT, phi::DiagonalKernel, diff --git a/paddle/phi/kernels/cpu/digamma_grad_kernel.cc b/paddle/phi/kernels/cpu/digamma_grad_kernel.cc index 5cb86eef498bd3..da1b5ae556609c 100644 --- a/paddle/phi/kernels/cpu/digamma_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/digamma_grad_kernel.cc @@ -19,5 +19,5 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/digamma_grad_kernel_impl.h" -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( digamma_grad, CPU, ALL_LAYOUT, phi::DigammaGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/digamma_kernel.cc b/paddle/phi/kernels/cpu/digamma_kernel.cc index 0013d8ee7740b8..ee120a29b6061e 100644 --- a/paddle/phi/kernels/cpu/digamma_kernel.cc +++ b/paddle/phi/kernels/cpu/digamma_kernel.cc @@ -19,5 +19,5 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/digamma_kernel_impl.h" -PT_REGISTER_KERNEL( 
+PD_REGISTER_KERNEL( digamma, CPU, ALL_LAYOUT, phi::DigammaKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/dot_grad_kernel.cc b/paddle/phi/kernels/cpu/dot_grad_kernel.cc index 729bc9aa3a3aca..a2abdb7c00900e 100644 --- a/paddle/phi/kernels/cpu/dot_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/dot_grad_kernel.cc @@ -20,7 +20,7 @@ #include "paddle/phi/common/complex.h" -PT_REGISTER_KERNEL(dot_grad, +PD_REGISTER_KERNEL(dot_grad, CPU, ALL_LAYOUT, phi::DotGradKernel, diff --git a/paddle/phi/kernels/cpu/dot_kernel.cc b/paddle/phi/kernels/cpu/dot_kernel.cc index f4f5d1ffeb544d..3518501a6b63d1 100644 --- a/paddle/phi/kernels/cpu/dot_kernel.cc +++ b/paddle/phi/kernels/cpu/dot_kernel.cc @@ -49,7 +49,7 @@ void DotKernel(const Context& dev_ctx, using complex64 = ::phi::dtype::complex; using complex128 = ::phi::dtype::complex; -PT_REGISTER_KERNEL(dot, +PD_REGISTER_KERNEL(dot, CPU, ALL_LAYOUT, phi::DotKernel, diff --git a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc index 2d1b2a3bd7c3fa..0b29091367c83a 100644 --- a/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_grad_kernel.cc @@ -125,7 +125,7 @@ void SubtractDoubleGradKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(add_grad, +PD_REGISTER_KERNEL(add_grad, CPU, ALL_LAYOUT, phi::AddGradKernel, @@ -137,7 +137,7 @@ PT_REGISTER_KERNEL(add_grad, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(add_double_grad, +PD_REGISTER_KERNEL(add_double_grad, CPU, ALL_LAYOUT, phi::AddDoubleGradKernel, @@ -149,7 +149,7 @@ PT_REGISTER_KERNEL(add_double_grad, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(add_triple_grad, +PD_REGISTER_KERNEL(add_triple_grad, CPU, ALL_LAYOUT, phi::AddTripleGradKernel, @@ -161,7 +161,7 @@ PT_REGISTER_KERNEL(add_triple_grad, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(subtract_grad, +PD_REGISTER_KERNEL(subtract_grad, CPU, ALL_LAYOUT, phi::SubtractGradKernel, @@ -173,7 +173,7 @@ PT_REGISTER_KERNEL(subtract_grad, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(subtract_double_grad, +PD_REGISTER_KERNEL(subtract_double_grad, CPU, ALL_LAYOUT, phi::SubtractDoubleGradKernel, diff --git a/paddle/phi/kernels/cpu/expand_grad_kernel.cc b/paddle/phi/kernels/cpu/expand_grad_kernel.cc index 427b6441b2d24c..4799a6aa7afdf8 100644 --- a/paddle/phi/kernels/cpu/expand_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/expand_grad_kernel.cc @@ -19,7 +19,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/expand_grad_kernel_impl.h" -PT_REGISTER_KERNEL(expand_grad, +PD_REGISTER_KERNEL(expand_grad, CPU, ALL_LAYOUT, phi::ExpandGradKernel, diff --git a/paddle/phi/kernels/cpu/expand_kernel.cc b/paddle/phi/kernels/cpu/expand_kernel.cc index cce367c8eb8324..077048976729fd 100644 --- a/paddle/phi/kernels/cpu/expand_kernel.cc +++ b/paddle/phi/kernels/cpu/expand_kernel.cc @@ -19,7 +19,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/expand_kernel_impl.h" -PT_REGISTER_KERNEL(expand, +PD_REGISTER_KERNEL(expand, CPU, ALL_LAYOUT, phi::ExpandKernel, diff --git a/paddle/phi/kernels/cpu/full_kernel.cc b/paddle/phi/kernels/cpu/full_kernel.cc index b55eb109f7de32..84d7f56d3361c6 100644 --- a/paddle/phi/kernels/cpu/full_kernel.cc +++ b/paddle/phi/kernels/cpu/full_kernel.cc @@ -73,7 +73,7 @@ void FullLikeKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(full, +PD_REGISTER_KERNEL(full, CPU, ALL_LAYOUT, 
phi::FullKernel, @@ -89,7 +89,7 @@ PT_REGISTER_KERNEL(full, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(full_like, +PD_REGISTER_KERNEL(full_like, CPU, ALL_LAYOUT, phi::FullLikeKernel, diff --git a/paddle/phi/kernels/cpu/histogram_kernel.cc b/paddle/phi/kernels/cpu/histogram_kernel.cc index fbcf47c3070e68..82b88f868d8a70 100644 --- a/paddle/phi/kernels/cpu/histogram_kernel.cc +++ b/paddle/phi/kernels/cpu/histogram_kernel.cc @@ -77,7 +77,7 @@ void HistogramKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(histogram, +PD_REGISTER_KERNEL(histogram, CPU, ALL_LAYOUT, phi::HistogramKernel, diff --git a/paddle/phi/kernels/cpu/huber_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/huber_loss_grad_kernel.cc new file mode 100644 index 00000000000000..654f2c9400af00 --- /dev/null +++ b/paddle/phi/kernels/cpu/huber_loss_grad_kernel.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/huber_loss_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/huber_loss_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + huber_loss_grad, CPU, ALL_LAYOUT, phi::HuberLossGradKernel, float, double) { +} diff --git a/paddle/phi/kernels/cpu/huber_loss_kernel.cc b/paddle/phi/kernels/cpu/huber_loss_kernel.cc new file mode 100644 index 00000000000000..702c0589057af7 --- /dev/null +++ b/paddle/phi/kernels/cpu/huber_loss_kernel.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/huber_loss_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/huber_loss_kernel_impl.h" + +PD_REGISTER_KERNEL( + huber_loss, CPU, ALL_LAYOUT, phi::HuberLossKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/lerp_grad_kernel.cc b/paddle/phi/kernels/cpu/lerp_grad_kernel.cc index 7cfb42dbcf96fa..d74919011ec5da 100644 --- a/paddle/phi/kernels/cpu/lerp_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/lerp_grad_kernel.cc @@ -17,5 +17,5 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/lerp_grad_kernel_impl.h" -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( lerp_grad, CPU, ALL_LAYOUT, phi::LerpGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/lerp_kernel.cc b/paddle/phi/kernels/cpu/lerp_kernel.cc index 97083c96464c30..7adfc35bfa321e 100644 --- a/paddle/phi/kernels/cpu/lerp_kernel.cc +++ b/paddle/phi/kernels/cpu/lerp_kernel.cc @@ -17,4 +17,4 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/lerp_kernel_impl.h" -PT_REGISTER_KERNEL(lerp, CPU, ALL_LAYOUT, phi::LerpKernel, float, double) {} +PD_REGISTER_KERNEL(lerp, CPU, ALL_LAYOUT, phi::LerpKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/masked_select_grad_kernel.cc b/paddle/phi/kernels/cpu/masked_select_grad_kernel.cc index 071bbba1975e40..7fe41e686af8c5 100644 --- a/paddle/phi/kernels/cpu/masked_select_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/masked_select_grad_kernel.cc @@ -43,7 +43,7 @@ void MaskedSelectGradKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(masked_select_grad, +PD_REGISTER_KERNEL(masked_select_grad, CPU, ALL_LAYOUT, phi::MaskedSelectGradKernel, diff --git a/paddle/phi/kernels/cpu/masked_select_kernel.cc b/paddle/phi/kernels/cpu/masked_select_kernel.cc index 08fc3f69f01e17..274863a863b799 100644 --- a/paddle/phi/kernels/cpu/masked_select_kernel.cc +++ b/paddle/phi/kernels/cpu/masked_select_kernel.cc @@ -61,7 +61,7 @@ void MaskedSelectKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(masked_select, +PD_REGISTER_KERNEL(masked_select, CPU, ALL_LAYOUT, phi::MaskedSelectKernel, diff --git a/paddle/phi/kernels/cpu/math_kernel.cc b/paddle/phi/kernels/cpu/math_kernel.cc index 862ee42296c924..581c5f90f35e5c 100644 --- a/paddle/phi/kernels/cpu/math_kernel.cc +++ b/paddle/phi/kernels/cpu/math_kernel.cc @@ -118,7 +118,7 @@ using complex128 = ::phi::dtype::complex; // NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 // using bfloat16 = ::phi::dtype::bfloat16; -PT_REGISTER_KERNEL(add_raw, +PD_REGISTER_KERNEL(add_raw, CPU, ALL_LAYOUT, phi::AddRawKernel, @@ -129,7 +129,7 @@ PT_REGISTER_KERNEL(add_raw, int64_t, complex64, complex128) {} -PT_REGISTER_KERNEL(subtract_raw, +PD_REGISTER_KERNEL(subtract_raw, CPU, ALL_LAYOUT, phi::SubtractRawKernel, @@ -140,7 +140,7 @@ PT_REGISTER_KERNEL(subtract_raw, int64_t, complex64, complex128) {} -PT_REGISTER_KERNEL(divide_raw, +PD_REGISTER_KERNEL(divide_raw, CPU, ALL_LAYOUT, phi::DivideRawKernel, @@ -150,7 +150,7 @@ PT_REGISTER_KERNEL(divide_raw, int64_t, complex64, complex128) {} -PT_REGISTER_KERNEL(multiply_raw, +PD_REGISTER_KERNEL(multiply_raw, CPU, ALL_LAYOUT, phi::MultiplyRawKernel, @@ -161,7 +161,7 @@ PT_REGISTER_KERNEL(multiply_raw, bool, complex64, complex128) {} -PT_REGISTER_KERNEL(sum_raw, +PD_REGISTER_KERNEL(sum_raw, CPU, ALL_LAYOUT, phi::SumRawKernel, @@ -176,5 +176,5 @@ PT_REGISTER_KERNEL(sum_raw, complex128) { 
kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); } -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( mean_raw, CPU, ALL_LAYOUT, phi::MeanRawKernel, float, double, bool) {} diff --git a/paddle/phi/kernels/cpu/matmul_grad_kernel.cc b/paddle/phi/kernels/cpu/matmul_grad_kernel.cc index 56a185e4ade064..c68e8115e898b3 100644 --- a/paddle/phi/kernels/cpu/matmul_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/matmul_grad_kernel.cc @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/phi/kernels/impl/matmul_grad_kernel_impl.h" -PT_REGISTER_KERNEL(matmul_grad, +PD_REGISTER_KERNEL(matmul_grad, CPU, ALL_LAYOUT, phi::MatmulGradKernel, @@ -28,7 +28,7 @@ PT_REGISTER_KERNEL(matmul_grad, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(matmul_double_grad, +PD_REGISTER_KERNEL(matmul_double_grad, CPU, ALL_LAYOUT, phi::MatmulDoubleGradKernel, @@ -37,7 +37,7 @@ PT_REGISTER_KERNEL(matmul_double_grad, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(matmul_triple_grad, +PD_REGISTER_KERNEL(matmul_triple_grad, CPU, ALL_LAYOUT, phi::MatmulTripleGradKernel, diff --git a/paddle/phi/kernels/cpu/matmul_kernel.cc b/paddle/phi/kernels/cpu/matmul_kernel.cc index 8676aec3eccb47..2bf56c07a5bc74 100644 --- a/paddle/phi/kernels/cpu/matmul_kernel.cc +++ b/paddle/phi/kernels/cpu/matmul_kernel.cc @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/phi/common/complex.h" #include "paddle/phi/kernels/impl/matmul_kernel_impl.h" -PT_REGISTER_KERNEL(matmul, +PD_REGISTER_KERNEL(matmul, CPU, ALL_LAYOUT, phi::MatmulKernel, diff --git a/paddle/phi/kernels/cpu/norm_grad_kernel.cc b/paddle/phi/kernels/cpu/norm_grad_kernel.cc index d2073c07244bd5..597207a05a226a 100644 --- a/paddle/phi/kernels/cpu/norm_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/norm_grad_kernel.cc @@ -83,5 +83,5 @@ void NormGradKernel(const Context& ctx, } // namespace phi -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( norm_grad, CPU, ALL_LAYOUT, phi::NormGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/norm_kernel.cc b/paddle/phi/kernels/cpu/norm_kernel.cc index e8f35b5fe7efd8..50906d9c3bb949 100644 --- a/paddle/phi/kernels/cpu/norm_kernel.cc +++ b/paddle/phi/kernels/cpu/norm_kernel.cc @@ -76,4 +76,4 @@ void NormKernel(const Context& ctx, } // namespace phi -PT_REGISTER_KERNEL(norm, CPU, ALL_LAYOUT, phi::NormKernel, float, double) {} +PD_REGISTER_KERNEL(norm, CPU, ALL_LAYOUT, phi::NormKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/scale_kernel.cc b/paddle/phi/kernels/cpu/scale_kernel.cc index 156afb8798de40..e929b5bd7219b6 100644 --- a/paddle/phi/kernels/cpu/scale_kernel.cc +++ b/paddle/phi/kernels/cpu/scale_kernel.cc @@ -51,7 +51,7 @@ void ScaleKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(scale, +PD_REGISTER_KERNEL(scale, CPU, ALL_LAYOUT, phi::ScaleKernel, diff --git a/paddle/phi/kernels/cpu/sign_kernel.cc b/paddle/phi/kernels/cpu/sign_kernel.cc index 6be931904d1331..5fe11ffbd6d5c0 100644 --- a/paddle/phi/kernels/cpu/sign_kernel.cc +++ b/paddle/phi/kernels/cpu/sign_kernel.cc @@ -21,4 +21,4 @@ limitations under the License. */ // See Note [ Why still include the fluid headers? 
] #include "paddle/phi/common/bfloat16.h" -PT_REGISTER_KERNEL(sign, CPU, ALL_LAYOUT, phi::SignKernel, float, double) {} +PD_REGISTER_KERNEL(sign, CPU, ALL_LAYOUT, phi::SignKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/split_kernel.cc b/paddle/phi/kernels/cpu/split_kernel.cc index 4df1e6e1629c02..259bf9e388c2c1 100644 --- a/paddle/phi/kernels/cpu/split_kernel.cc +++ b/paddle/phi/kernels/cpu/split_kernel.cc @@ -29,7 +29,7 @@ void SplitKernel(const Context& dev_ctx, const Scalar& axis_scalar, std::vector outs) { // need to infershape output - if (num_or_sections.IsInitByTensor() || axis_scalar.IsInitByTensor()) { + if (num_or_sections.FromTensor() || axis_scalar.FromTensor()) { std::vector out_metas; for (size_t i = 0; i < outs.size(); ++i) { out_metas.push_back(outs[i]); @@ -60,7 +60,7 @@ void SplitKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(split, +PD_REGISTER_KERNEL(split, CPU, ALL_LAYOUT, phi::SplitKernel, diff --git a/paddle/phi/kernels/cpu/trace_grad_kernel.cc b/paddle/phi/kernels/cpu/trace_grad_kernel.cc index e6ffd99bc53bd8..2167851b197d14 100644 --- a/paddle/phi/kernels/cpu/trace_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/trace_grad_kernel.cc @@ -18,7 +18,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/trace_grad_kernel_impl.h" -PT_REGISTER_KERNEL(trace_grad, +PD_REGISTER_KERNEL(trace_grad, CPU, ALL_LAYOUT, phi::TraceGradKernel, diff --git a/paddle/phi/kernels/cpu/trace_kernel.cc b/paddle/phi/kernels/cpu/trace_kernel.cc index 2b2cda6491d484..3646e226519139 100644 --- a/paddle/phi/kernels/cpu/trace_kernel.cc +++ b/paddle/phi/kernels/cpu/trace_kernel.cc @@ -45,7 +45,7 @@ void TraceKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(trace, +PD_REGISTER_KERNEL(trace, CPU, ALL_LAYOUT, phi::TraceKernel, diff --git a/paddle/phi/kernels/cpu/trunc_grad_kernel.cc b/paddle/phi/kernels/cpu/trunc_grad_kernel.cc index 7fc677c16ef739..4d85dd609e2d1f 100644 --- a/paddle/phi/kernels/cpu/trunc_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/trunc_grad_kernel.cc @@ -30,7 +30,7 @@ void TruncGradKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(trunc_grad, +PD_REGISTER_KERNEL(trunc_grad, CPU, ALL_LAYOUT, phi::TruncGradKernel, diff --git a/paddle/phi/kernels/cpu/trunc_kernel.cc b/paddle/phi/kernels/cpu/trunc_kernel.cc index 10e42196679fa5..babae6ce7c9318 100644 --- a/paddle/phi/kernels/cpu/trunc_kernel.cc +++ b/paddle/phi/kernels/cpu/trunc_kernel.cc @@ -35,5 +35,5 @@ void TruncKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( trunc, CPU, ALL_LAYOUT, phi::TruncKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/empty_kernel.cc b/paddle/phi/kernels/empty_kernel.cc index 6d9e733b2f5767..8109d3879cb21e 100644 --- a/paddle/phi/kernels/empty_kernel.cc +++ b/paddle/phi/kernels/empty_kernel.cc @@ -38,7 +38,7 @@ void EmptyLikeKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(empty, +PD_REGISTER_KERNEL(empty, CPU, ALL_LAYOUT, phi::EmptyKernel, @@ -54,7 +54,7 @@ PT_REGISTER_KERNEL(empty, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(empty_like, +PD_REGISTER_KERNEL(empty_like, CPU, ALL_LAYOUT, phi::EmptyLikeKernel, @@ -71,7 +71,7 @@ PT_REGISTER_KERNEL(empty_like, phi::dtype::complex) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_REGISTER_KERNEL(empty, +PD_REGISTER_KERNEL(empty, GPU, ALL_LAYOUT, phi::EmptyKernel, @@ -86,7 +86,7 @@ PT_REGISTER_KERNEL(empty, phi::dtype::complex, 
phi::dtype::complex) {} -PT_REGISTER_KERNEL(empty_like, +PD_REGISTER_KERNEL(empty_like, GPU, ALL_LAYOUT, phi::EmptyLikeKernel, diff --git a/paddle/phi/kernels/flatten_grad_kernel.cc b/paddle/phi/kernels/flatten_grad_kernel.cc index 33e6c2724982a7..7e8010a43f3d18 100644 --- a/paddle/phi/kernels/flatten_grad_kernel.cc +++ b/paddle/phi/kernels/flatten_grad_kernel.cc @@ -32,7 +32,7 @@ void FlattenGradKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(flatten_grad, +PD_REGISTER_KERNEL(flatten_grad, CPU, ALL_LAYOUT, phi::FlattenGradKernel, @@ -44,7 +44,7 @@ PT_REGISTER_KERNEL(flatten_grad, int64_t) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_REGISTER_KERNEL(flatten_grad, +PD_REGISTER_KERNEL(flatten_grad, GPU, ALL_LAYOUT, phi::FlattenGradKernel, @@ -59,7 +59,7 @@ PT_REGISTER_KERNEL(flatten_grad, #endif #ifdef PADDLE_WITH_XPU -PT_REGISTER_KERNEL(flatten_grad, +PD_REGISTER_KERNEL(flatten_grad, XPU, ALL_LAYOUT, phi::FlattenGradKernel, diff --git a/paddle/phi/kernels/flatten_kernel.cc b/paddle/phi/kernels/flatten_kernel.cc index 1ac444aa1792f4..12eaab92d5211c 100644 --- a/paddle/phi/kernels/flatten_kernel.cc +++ b/paddle/phi/kernels/flatten_kernel.cc @@ -48,7 +48,7 @@ void FlattenWithXShape(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(flatten, +PD_REGISTER_KERNEL(flatten, CPU, ALL_LAYOUT, phi::FlattenKernel, @@ -60,7 +60,7 @@ PT_REGISTER_KERNEL(flatten, int, int64_t) {} -PT_REGISTER_KERNEL(flatten_with_xshape, +PD_REGISTER_KERNEL(flatten_with_xshape, CPU, ALL_LAYOUT, phi::FlattenWithXShape, @@ -73,7 +73,7 @@ PT_REGISTER_KERNEL(flatten_with_xshape, int64_t) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_REGISTER_KERNEL(flatten, +PD_REGISTER_KERNEL(flatten, GPU, ALL_LAYOUT, phi::FlattenKernel, @@ -86,7 +86,7 @@ PT_REGISTER_KERNEL(flatten, int, int64_t) {} -PT_REGISTER_KERNEL(flatten_with_xshape, +PD_REGISTER_KERNEL(flatten_with_xshape, GPU, ALL_LAYOUT, phi::FlattenWithXShape, @@ -101,7 +101,7 @@ PT_REGISTER_KERNEL(flatten_with_xshape, #endif #ifdef PADDLE_WITH_XPU -PT_REGISTER_KERNEL(flatten, +PD_REGISTER_KERNEL(flatten, XPU, ALL_LAYOUT, phi::FlattenKernel, @@ -112,7 +112,7 @@ PT_REGISTER_KERNEL(flatten, int, int64_t) {} -PT_REGISTER_KERNEL(flatten_with_xshape, +PD_REGISTER_KERNEL(flatten_with_xshape, XPU, ALL_LAYOUT, phi::FlattenWithXShape, diff --git a/paddle/phi/kernels/gpu/abs_grad_kernel.cu b/paddle/phi/kernels/gpu/abs_grad_kernel.cu index 37b19278233a87..1ce6a1638b1a04 100644 --- a/paddle/phi/kernels/gpu/abs_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/abs_grad_kernel.cu @@ -20,7 +20,7 @@ using phi::dtype::complex; -PT_REGISTER_KERNEL(abs_grad, +PD_REGISTER_KERNEL(abs_grad, GPU, ALL_LAYOUT, phi::AbsGradKernel, @@ -31,7 +31,7 @@ PT_REGISTER_KERNEL(abs_grad, phi::dtype::float16, complex, complex) {} -PT_REGISTER_KERNEL(abs_double_grad, +PD_REGISTER_KERNEL(abs_double_grad, GPU, ALL_LAYOUT, phi::AbsDoubleGradKernel, diff --git a/paddle/phi/kernels/gpu/abs_kernel.cu b/paddle/phi/kernels/gpu/abs_kernel.cu index 5c191dfc992a52..e122e6b1e9c8ab 100644 --- a/paddle/phi/kernels/gpu/abs_kernel.cu +++ b/paddle/phi/kernels/gpu/abs_kernel.cu @@ -52,7 +52,7 @@ void AbsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { } // namespace phi -PT_REGISTER_KERNEL(abs, +PD_REGISTER_KERNEL(abs, GPU, ALL_LAYOUT, phi::AbsKernel, diff --git a/paddle/phi/kernels/gpu/bernoulli_kernel.cu b/paddle/phi/kernels/gpu/bernoulli_kernel.cu index b043a55e21b611..6127bceef509c9 100644 --- a/paddle/phi/kernels/gpu/bernoulli_kernel.cu +++ 
b/paddle/phi/kernels/gpu/bernoulli_kernel.cu @@ -73,5 +73,5 @@ void BernoulliKernel(const Context& ctx, } // namespace phi -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( bernoulli, GPU, ALL_LAYOUT, phi::BernoulliKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/cast_kernel.cu b/paddle/phi/kernels/gpu/cast_kernel.cu index c05cd15b4757a3..7a6c99c5fe15f6 100644 --- a/paddle/phi/kernels/gpu/cast_kernel.cu +++ b/paddle/phi/kernels/gpu/cast_kernel.cu @@ -61,7 +61,7 @@ void CastKernel(const Context& dev_ctx, } // namespace phi #define PTEN_REGISTER_CAST_CUDA_BASE_TYPE(op_name, ...) \ - PT_REGISTER_KERNEL(cast, \ + PD_REGISTER_KERNEL(cast, \ GPU, \ ALL_LAYOUT, \ phi::CastKernel, \ diff --git a/paddle/phi/kernels/gpu/complex_kernel.cu b/paddle/phi/kernels/gpu/complex_kernel.cu index 47a43ee9910b85..02fd408aba86f3 100644 --- a/paddle/phi/kernels/gpu/complex_kernel.cu +++ b/paddle/phi/kernels/gpu/complex_kernel.cu @@ -21,7 +21,7 @@ // See Note [ Why still include the fluid headers? ] #include "paddle/phi/common/complex.h" -PT_REGISTER_KERNEL(conj, +PD_REGISTER_KERNEL(conj, GPU, ALL_LAYOUT, phi::ConjKernel, diff --git a/paddle/phi/kernels/gpu/concat_kernel.cu b/paddle/phi/kernels/gpu/concat_kernel.cu index 22faeaf4197008..c80a873127708c 100644 --- a/paddle/phi/kernels/gpu/concat_kernel.cu +++ b/paddle/phi/kernels/gpu/concat_kernel.cu @@ -110,7 +110,7 @@ void ConcatKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(concat, +PD_REGISTER_KERNEL(concat, GPU, ALL_LAYOUT, phi::ConcatKernel, diff --git a/paddle/phi/kernels/gpu/copy_kernel.cu b/paddle/phi/kernels/gpu/copy_kernel.cu index 58b0a31d1d6d54..e88795b6173706 100644 --- a/paddle/phi/kernels/gpu/copy_kernel.cu +++ b/paddle/phi/kernels/gpu/copy_kernel.cu @@ -207,5 +207,5 @@ void Copy(const Context& dev_ctx, } // namespace phi -PT_REGISTER_GENERAL_KERNEL( +PD_REGISTER_GENERAL_KERNEL( copy, GPU, ALL_LAYOUT, phi::Copy, ALL_DTYPE) {} diff --git a/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu b/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu index 599fa2842a974e..423093728e9d62 100644 --- a/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/diagonal_grad_kernel.cu @@ -158,7 +158,7 @@ void DiagonalGradKernel(const Context& dev_ctx, } } } // namespace phi -PT_REGISTER_KERNEL(diagonal_grad, +PD_REGISTER_KERNEL(diagonal_grad, GPU, ALL_LAYOUT, phi::DiagonalGradKernel, diff --git a/paddle/phi/kernels/gpu/diagonal_kernel.cu b/paddle/phi/kernels/gpu/diagonal_kernel.cu index c4b61cf819f844..58da29b2224a61 100644 --- a/paddle/phi/kernels/gpu/diagonal_kernel.cu +++ b/paddle/phi/kernels/gpu/diagonal_kernel.cu @@ -154,7 +154,7 @@ void DiagonalKernel(const Context& dev_ctx, } } // namespace phi -PT_REGISTER_KERNEL(diagonal, +PD_REGISTER_KERNEL(diagonal, GPU, ALL_LAYOUT, phi::DiagonalKernel, diff --git a/paddle/phi/kernels/gpu/digamma_grad_kernel.cu b/paddle/phi/kernels/gpu/digamma_grad_kernel.cu index 54a618fe0421e4..695227bba0f71d 100644 --- a/paddle/phi/kernels/gpu/digamma_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/digamma_grad_kernel.cu @@ -18,5 +18,5 @@ #include "paddle/phi/kernels/digamma_grad_kernel.h" #include "paddle/phi/kernels/impl/digamma_grad_kernel_impl.h" -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( digamma_grad, GPU, ALL_LAYOUT, phi::DigammaGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/digamma_kernel.cu b/paddle/phi/kernels/gpu/digamma_kernel.cu index 91d63eeab8c83e..381c22a82e863d 100644 --- a/paddle/phi/kernels/gpu/digamma_kernel.cu +++ b/paddle/phi/kernels/gpu/digamma_kernel.cu @@ 
-19,5 +19,5 @@ #include "paddle/phi/kernels/digamma_kernel.h" #include "paddle/phi/kernels/impl/digamma_kernel_impl.h" -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( digamma, GPU, ALL_LAYOUT, phi::DigammaKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/dot_grad_kernel.cu b/paddle/phi/kernels/gpu/dot_grad_kernel.cu index 3290dba3d45b97..7defc0304e511e 100644 --- a/paddle/phi/kernels/gpu/dot_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/dot_grad_kernel.cu @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/phi/common/complex.h" -PT_REGISTER_KERNEL(dot_grad, +PD_REGISTER_KERNEL(dot_grad, GPU, ALL_LAYOUT, phi::DotGradKernel, diff --git a/paddle/phi/kernels/gpu/dot_kernel.cu b/paddle/phi/kernels/gpu/dot_kernel.cu index 9f3c3ff794abae..4442396f6c9dd7 100644 --- a/paddle/phi/kernels/gpu/dot_kernel.cu +++ b/paddle/phi/kernels/gpu/dot_kernel.cu @@ -52,7 +52,7 @@ void DotKernel(const Context& dev_ctx, using complex64 = ::phi::dtype::complex; using complex128 = ::phi::dtype::complex; -PT_REGISTER_KERNEL(dot, +PD_REGISTER_KERNEL(dot, GPU, ALL_LAYOUT, phi::DotKernel, diff --git a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu index fc78fe88c2e0e1..02dbb506c4eb57 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu @@ -119,7 +119,7 @@ void SubtractDoubleGradKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(add_grad, +PD_REGISTER_KERNEL(add_grad, GPU, ALL_LAYOUT, phi::AddGradKernel, @@ -131,7 +131,7 @@ PT_REGISTER_KERNEL(add_grad, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(add_double_grad, +PD_REGISTER_KERNEL(add_double_grad, GPU, ALL_LAYOUT, phi::AddDoubleGradKernel, @@ -143,7 +143,7 @@ PT_REGISTER_KERNEL(add_double_grad, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(add_triple_grad, +PD_REGISTER_KERNEL(add_triple_grad, GPU, ALL_LAYOUT, phi::AddTripleGradKernel, @@ -155,7 +155,7 @@ PT_REGISTER_KERNEL(add_triple_grad, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(subtract_grad, +PD_REGISTER_KERNEL(subtract_grad, GPU, ALL_LAYOUT, phi::SubtractGradKernel, @@ -167,7 +167,7 @@ PT_REGISTER_KERNEL(subtract_grad, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(subtract_double_grad, +PD_REGISTER_KERNEL(subtract_double_grad, GPU, ALL_LAYOUT, phi::SubtractDoubleGradKernel, diff --git a/paddle/phi/kernels/gpu/expand_grad_kernel.cu b/paddle/phi/kernels/gpu/expand_grad_kernel.cu index 9ee58ad6caf29c..8e2c3fde04a6a0 100644 --- a/paddle/phi/kernels/gpu/expand_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/expand_grad_kernel.cu @@ -18,7 +18,7 @@ #include "paddle/phi/kernels/expand_grad_kernel.h" #include "paddle/phi/kernels/impl/expand_grad_kernel_impl.h" -PT_REGISTER_KERNEL(expand_grad, +PD_REGISTER_KERNEL(expand_grad, GPU, ALL_LAYOUT, phi::ExpandGradKernel, diff --git a/paddle/phi/kernels/gpu/expand_kernel.cu b/paddle/phi/kernels/gpu/expand_kernel.cu index dc1b4717fcc4c8..d4275804b3db8f 100644 --- a/paddle/phi/kernels/gpu/expand_kernel.cu +++ b/paddle/phi/kernels/gpu/expand_kernel.cu @@ -19,7 +19,7 @@ #include "paddle/phi/kernels/expand_kernel.h" #include "paddle/phi/kernels/impl/expand_kernel_impl.h" -PT_REGISTER_KERNEL(expand, +PD_REGISTER_KERNEL(expand, GPU, ALL_LAYOUT, phi::ExpandKernel, diff --git a/paddle/phi/kernels/gpu/full_kernel.cu b/paddle/phi/kernels/gpu/full_kernel.cu index caa05514c4f0fa..d5cb1575b71817 100644 --- a/paddle/phi/kernels/gpu/full_kernel.cu +++ 
b/paddle/phi/kernels/gpu/full_kernel.cu @@ -98,7 +98,7 @@ void FullLikeKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(full, +PD_REGISTER_KERNEL(full, GPU, ALL_LAYOUT, phi::FullKernel, @@ -113,7 +113,7 @@ PT_REGISTER_KERNEL(full, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(full_like, +PD_REGISTER_KERNEL(full_like, GPU, ALL_LAYOUT, phi::FullLikeKernel, diff --git a/paddle/phi/kernels/gpu/histogram_kernel.cu b/paddle/phi/kernels/gpu/histogram_kernel.cu index 47dee820e2fbde..6db987e22fc6c2 100644 --- a/paddle/phi/kernels/gpu/histogram_kernel.cu +++ b/paddle/phi/kernels/gpu/histogram_kernel.cu @@ -149,7 +149,7 @@ void HistogramKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(histogram, +PD_REGISTER_KERNEL(histogram, GPU, ALL_LAYOUT, phi::HistogramKernel, diff --git a/paddle/phi/kernels/gpu/huber_loss_grad_kernel.cu b/paddle/phi/kernels/gpu/huber_loss_grad_kernel.cu new file mode 100644 index 00000000000000..20cc2ed669adf9 --- /dev/null +++ b/paddle/phi/kernels/gpu/huber_loss_grad_kernel.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/huber_loss_grad_kernel.h" +#include "paddle/phi/kernels/impl/huber_loss_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + huber_loss_grad, GPU, ALL_LAYOUT, phi::HuberLossGradKernel, float, double) { +} diff --git a/paddle/phi/kernels/gpu/huber_loss_kernel.cu b/paddle/phi/kernels/gpu/huber_loss_kernel.cu new file mode 100644 index 00000000000000..26648a260b99ec --- /dev/null +++ b/paddle/phi/kernels/gpu/huber_loss_kernel.cu @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/huber_loss_kernel.h" +#include "paddle/phi/kernels/impl/huber_loss_kernel_impl.h" + +PD_REGISTER_KERNEL( + huber_loss, GPU, ALL_LAYOUT, phi::HuberLossKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/lerp_grad_kernel.cu b/paddle/phi/kernels/gpu/lerp_grad_kernel.cu index 81bd69a5f12e04..0a5ac99fa8e458 100644 --- a/paddle/phi/kernels/gpu/lerp_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/lerp_grad_kernel.cu @@ -17,5 +17,5 @@ #include "paddle/phi/kernels/impl/lerp_grad_kernel_impl.h" #include "paddle/phi/kernels/lerp_grad_kernel.h" -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( lerp_grad, GPU, ALL_LAYOUT, phi::LerpGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/lerp_kernel.cu b/paddle/phi/kernels/gpu/lerp_kernel.cu index 190248c0cd077a..96010aff4e70c6 100644 --- a/paddle/phi/kernels/gpu/lerp_kernel.cu +++ b/paddle/phi/kernels/gpu/lerp_kernel.cu @@ -17,4 +17,4 @@ #include "paddle/phi/kernels/impl/lerp_kernel_impl.h" #include "paddle/phi/kernels/lerp_kernel.h" -PT_REGISTER_KERNEL(lerp, GPU, ALL_LAYOUT, phi::LerpKernel, float, double) {} +PD_REGISTER_KERNEL(lerp, GPU, ALL_LAYOUT, phi::LerpKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/masked_select_grad_kernel.cu b/paddle/phi/kernels/gpu/masked_select_grad_kernel.cu index c4f4b461f2aa04..71b7cd8750462f 100644 --- a/paddle/phi/kernels/gpu/masked_select_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/masked_select_grad_kernel.cu @@ -96,7 +96,7 @@ void MaskedSelectGradKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(masked_select_grad, +PD_REGISTER_KERNEL(masked_select_grad, GPU, ALL_LAYOUT, phi::MaskedSelectGradKernel, diff --git a/paddle/phi/kernels/gpu/masked_select_kernel.cu b/paddle/phi/kernels/gpu/masked_select_kernel.cu index 8254ce4be6356e..fc4adca2f42438 100644 --- a/paddle/phi/kernels/gpu/masked_select_kernel.cu +++ b/paddle/phi/kernels/gpu/masked_select_kernel.cu @@ -108,7 +108,7 @@ void MaskedSelectKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(masked_select, +PD_REGISTER_KERNEL(masked_select, GPU, ALL_LAYOUT, phi::MaskedSelectKernel, diff --git a/paddle/phi/kernels/gpu/math_kernel.cu b/paddle/phi/kernels/gpu/math_kernel.cu index 2ae40bd4b1923b..f7b1205cb593a2 100644 --- a/paddle/phi/kernels/gpu/math_kernel.cu +++ b/paddle/phi/kernels/gpu/math_kernel.cu @@ -95,7 +95,7 @@ using float16 = phi::dtype::float16; using complex64 = ::phi::dtype::complex; using complex128 = ::phi::dtype::complex; -PT_REGISTER_KERNEL(add_raw, +PD_REGISTER_KERNEL(add_raw, GPU, ALL_LAYOUT, phi::AddRawKernel, @@ -107,7 +107,7 @@ PT_REGISTER_KERNEL(add_raw, float16, complex64, complex128) {} -PT_REGISTER_KERNEL(subtract_raw, +PD_REGISTER_KERNEL(subtract_raw, GPU, ALL_LAYOUT, phi::SubtractRawKernel, @@ -119,7 +119,7 @@ PT_REGISTER_KERNEL(subtract_raw, float16, complex64, complex128) {} -PT_REGISTER_KERNEL(divide_raw, +PD_REGISTER_KERNEL(divide_raw, GPU, ALL_LAYOUT, phi::DivideRawKernel, @@ -130,7 +130,7 @@ PT_REGISTER_KERNEL(divide_raw, float16, complex64, complex128) {} -PT_REGISTER_KERNEL(multiply_raw, +PD_REGISTER_KERNEL(multiply_raw, GPU, ALL_LAYOUT, phi::MultiplyRawKernel, @@ -142,7 +142,7 @@ PT_REGISTER_KERNEL(multiply_raw, float16, complex64, complex128) {} -PT_REGISTER_KERNEL(sum_raw, +PD_REGISTER_KERNEL(sum_raw, GPU, ALL_LAYOUT, phi::SumRawKernel, @@ -158,11 +158,13 @@ PT_REGISTER_KERNEL(sum_raw, 
kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); } -PT_REGISTER_KERNEL(mean_raw, +PD_REGISTER_KERNEL(mean_raw, GPU, ALL_LAYOUT, phi::MeanRawKernel, float, double, bool, - float16) {} + float16, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/matmul_grad_kernel.cu b/paddle/phi/kernels/gpu/matmul_grad_kernel.cu index 7da5fb2c988180..ff23ebd05b5283 100644 --- a/paddle/phi/kernels/gpu/matmul_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/matmul_grad_kernel.cu @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/phi/kernels/impl/matmul_grad_kernel_impl.h" -PT_REGISTER_KERNEL(matmul_grad, +PD_REGISTER_KERNEL(matmul_grad, GPU, ALL_LAYOUT, phi::MatmulGradKernel, @@ -30,7 +30,7 @@ PT_REGISTER_KERNEL(matmul_grad, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(matmul_double_grad, +PD_REGISTER_KERNEL(matmul_double_grad, GPU, ALL_LAYOUT, phi::MatmulDoubleGradKernel, @@ -40,7 +40,7 @@ PT_REGISTER_KERNEL(matmul_double_grad, phi::dtype::complex, phi::dtype::complex) {} -PT_REGISTER_KERNEL(matmul_triple_grad, +PD_REGISTER_KERNEL(matmul_triple_grad, GPU, ALL_LAYOUT, phi::MatmulTripleGradKernel, diff --git a/paddle/phi/kernels/gpu/matmul_kernel.cu b/paddle/phi/kernels/gpu/matmul_kernel.cu index 3041784e93695f..98be79c5f9dab5 100644 --- a/paddle/phi/kernels/gpu/matmul_kernel.cu +++ b/paddle/phi/kernels/gpu/matmul_kernel.cu @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/phi/common/complex.h" #include "paddle/phi/kernels/impl/matmul_kernel_impl.h" -PT_REGISTER_KERNEL(matmul, +PD_REGISTER_KERNEL(matmul, GPU, ALL_LAYOUT, phi::MatmulKernel, diff --git a/paddle/phi/kernels/gpu/norm_grad_kernel.cu b/paddle/phi/kernels/gpu/norm_grad_kernel.cu index 3530de11d35e2a..ab38a82eceb1e7 100644 --- a/paddle/phi/kernels/gpu/norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/norm_grad_kernel.cu @@ -111,7 +111,7 @@ void NormGradKernel(const Context& ctx, } // namespace phi -PT_REGISTER_KERNEL(norm_grad, +PD_REGISTER_KERNEL(norm_grad, GPU, ALL_LAYOUT, phi::NormGradKernel, diff --git a/paddle/phi/kernels/gpu/norm_kernel.cu b/paddle/phi/kernels/gpu/norm_kernel.cu index 4ed3100918edf5..274f91b8dd6611 100644 --- a/paddle/phi/kernels/gpu/norm_kernel.cu +++ b/paddle/phi/kernels/gpu/norm_kernel.cu @@ -124,7 +124,7 @@ void NormKernel(const Context& ctx, } // namespace phi -PT_REGISTER_KERNEL(norm, +PD_REGISTER_KERNEL(norm, GPU, ALL_LAYOUT, phi::NormKernel, diff --git a/paddle/phi/kernels/gpu/scale_kernel.cu b/paddle/phi/kernels/gpu/scale_kernel.cu index 245605ed8a91b9..d9c8de21c5bc2d 100644 --- a/paddle/phi/kernels/gpu/scale_kernel.cu +++ b/paddle/phi/kernels/gpu/scale_kernel.cu @@ -63,7 +63,7 @@ void ScaleKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(scale, +PD_REGISTER_KERNEL(scale, GPU, ALL_LAYOUT, phi::ScaleKernel, diff --git a/paddle/phi/kernels/gpu/sign_kernel.cu.cc b/paddle/phi/kernels/gpu/sign_kernel.cu.cc index 950cf67d7cff5b..1fe17a7a227ecf 100644 --- a/paddle/phi/kernels/gpu/sign_kernel.cu.cc +++ b/paddle/phi/kernels/gpu/sign_kernel.cu.cc @@ -23,5 +23,5 @@ limitations under the License. 
*/ using float16 = phi::dtype::float16; -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( sign, GPU, ALL_LAYOUT, phi::SignKernel, float, double, float16) {} diff --git a/paddle/phi/kernels/gpu/split_kernel.cu b/paddle/phi/kernels/gpu/split_kernel.cu index 9d26b7361ff631..5222fce03ace6f 100644 --- a/paddle/phi/kernels/gpu/split_kernel.cu +++ b/paddle/phi/kernels/gpu/split_kernel.cu @@ -28,7 +28,7 @@ void SplitKernel(const Context& dev_ctx, const Scalar& axis_scalar, std::vector outs) { // need to infershape output - if (num_or_sections.IsInitByTensor() || axis_scalar.IsInitByTensor()) { + if (num_or_sections.FromTensor() || axis_scalar.FromTensor()) { std::vector out_metas; for (size_t i = 0; i < outs.size(); ++i) { out_metas.push_back(outs[i]); @@ -59,7 +59,7 @@ void SplitKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(split, +PD_REGISTER_KERNEL(split, GPU, ALL_LAYOUT, phi::SplitKernel, diff --git a/paddle/phi/kernels/gpu/trace_grad_kernel.cu b/paddle/phi/kernels/gpu/trace_grad_kernel.cu index a7e4b55b4ca221..6692c1e19b033c 100644 --- a/paddle/phi/kernels/gpu/trace_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/trace_grad_kernel.cu @@ -18,7 +18,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/trace_grad_kernel_impl.h" -PT_REGISTER_KERNEL(trace_grad, +PD_REGISTER_KERNEL(trace_grad, GPU, ALL_LAYOUT, phi::TraceGradKernel, diff --git a/paddle/phi/kernels/gpu/trace_kernel.cu b/paddle/phi/kernels/gpu/trace_kernel.cu index bc8b6bc922c917..7ac7c451b00542 100644 --- a/paddle/phi/kernels/gpu/trace_kernel.cu +++ b/paddle/phi/kernels/gpu/trace_kernel.cu @@ -44,7 +44,7 @@ void TraceKernel(const Context& ctx, } // namespace phi -PT_REGISTER_KERNEL(trace, +PD_REGISTER_KERNEL(trace, GPU, ALL_LAYOUT, phi::TraceKernel, diff --git a/paddle/phi/kernels/gpu/trunc_grad_kernel.cu b/paddle/phi/kernels/gpu/trunc_grad_kernel.cu index b5427d0b738676..92d95e7259bf0c 100644 --- a/paddle/phi/kernels/gpu/trunc_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/trunc_grad_kernel.cu @@ -44,7 +44,7 @@ void TruncGradKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(trunc_grad, +PD_REGISTER_KERNEL(trunc_grad, GPU, ALL_LAYOUT, phi::TruncGradKernel, diff --git a/paddle/phi/kernels/gpu/trunc_kernel.cu b/paddle/phi/kernels/gpu/trunc_kernel.cu index d9c0803de2832c..cc44602b657aab 100644 --- a/paddle/phi/kernels/gpu/trunc_kernel.cu +++ b/paddle/phi/kernels/gpu/trunc_kernel.cu @@ -77,5 +77,5 @@ void TruncKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( trunc, GPU, ALL_LAYOUT, phi::TruncKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/huber_loss_grad_kernel.h b/paddle/phi/kernels/huber_loss_grad_kernel.h new file mode 100644 index 00000000000000..c6246b15531979 --- /dev/null +++ b/paddle/phi/kernels/huber_loss_grad_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" + +namespace phi { + +template +void HuberLossGradKernel(const Context& dev_ctx, + const DenseTensor& residual, + const DenseTensor& out_grad, + float delta, + DenseTensor* input_grad, + DenseTensor* label_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/huber_loss_kernel.h b/paddle/phi/kernels/huber_loss_kernel.h new file mode 100644 index 00000000000000..3533a9ec6ded52 --- /dev/null +++ b/paddle/phi/kernels/huber_loss_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" + +namespace phi { + +template +void HuberLossKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& label, + float delta, + DenseTensor* out, + DenseTensor* residual); + +} // namespace phi diff --git a/paddle/phi/kernels/impl/huber_loss_grad_kernel_impl.h b/paddle/phi/kernels/impl/huber_loss_grad_kernel_impl.h new file mode 100644 index 00000000000000..b93578abba2b72 --- /dev/null +++ b/paddle/phi/kernels/impl/huber_loss_grad_kernel_impl.h @@ -0,0 +1,75 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/huber_loss_grad_kernel.h" + +namespace phi { + +template +struct HuberLossBackward { + HOSTDEVICE HuberLossBackward(const T& delta, T sign) + : sign(sign), delta(delta) {} + + HOSTDEVICE T operator()(const T& val) const { + T abs_val = std::abs(val); + if (abs_val <= delta) { + return sign * val; + } else { + if (val > 0) { + return sign * delta; + } else { + return -1 * sign * delta; + } + } + } + + T sign; + T delta; +}; + +template +void HuberLossGradKernel(const Context& dev_ctx, + const DenseTensor& residual, + const DenseTensor& out_grad, + float delta, + DenseTensor* input_grad, + DenseTensor* label_grad) { + T delta_ = static_cast(delta); + auto& place = *dev_ctx.eigen_device(); + + auto eigen_residual = EigenVector::Flatten(residual); + auto eigen_out_grad = EigenVector::Flatten(out_grad); + + if (input_grad) { + dev_ctx.template Alloc(input_grad); + auto eigen_input_grad = EigenVector::Flatten(*input_grad); + eigen_input_grad.device(place) = + eigen_residual.unaryExpr(HuberLossBackward(delta_, -1.0)); + eigen_input_grad.device(place) = eigen_out_grad * eigen_input_grad; + } + + if (label_grad) { + dev_ctx.template Alloc(label_grad); + auto eigen_label_grad = EigenVector::Flatten(*label_grad); + eigen_label_grad.device(place) = + eigen_residual.unaryExpr(HuberLossBackward(delta_, 1.0)); + eigen_label_grad.device(place) = eigen_out_grad * eigen_label_grad; + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/huber_loss_kernel_impl.h b/paddle/phi/kernels/impl/huber_loss_kernel_impl.h new file mode 100644 index 00000000000000..7fbdc80c3829bf --- /dev/null +++ b/paddle/phi/kernels/impl/huber_loss_kernel_impl.h @@ -0,0 +1,61 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/huber_loss_kernel.h" + +namespace phi { + +template +struct HuberLossForward { + HOSTDEVICE HuberLossForward(const T& delta) : delta(delta) {} + + HOSTDEVICE T operator()(const T& val) const { + T abs_val = std::abs(val); + if (abs_val <= delta) { + return static_cast(0.5) * val * val; + } else { + return delta * (abs_val - static_cast(0.5) * delta); + } + } + + T delta; +}; + +template +void HuberLossKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& label, + float delta, + DenseTensor* out, + DenseTensor* residual) { + T delta_ = static_cast(delta); + auto& place = *dev_ctx.eigen_device(); + + auto x = EigenVector::Flatten(input); + auto y = EigenVector::Flatten(label); + + dev_ctx.template Alloc(residual); + auto eigen_residual = EigenVector::Flatten(*residual); + eigen_residual.device(place) = y - x; + + dev_ctx.template Alloc(out); + auto loss = EigenVector::Flatten(*out); + loss.device(place) = eigen_residual.unaryExpr(HuberLossForward(delta_)); +} + +} // namespace phi diff --git a/paddle/phi/kernels/math_kernel.cc b/paddle/phi/kernels/math_kernel.cc index 7fb6cc0ba9cca5..db6c5e1ac35919 100644 --- a/paddle/phi/kernels/math_kernel.cc +++ b/paddle/phi/kernels/math_kernel.cc @@ -81,10 +81,10 @@ void MultiplyKernel(const Context& dev_ctx, using complex64 = ::phi::dtype::complex; using complex128 = ::phi::dtype::complex; -PT_REGISTER_KERNEL( +PD_REGISTER_KERNEL( mean, CPU, ALL_LAYOUT, phi::MeanKernel, float, double, bool) {} -PT_REGISTER_KERNEL(sum, +PD_REGISTER_KERNEL(sum, CPU, ALL_LAYOUT, phi::SumKernel, @@ -100,7 +100,7 @@ PT_REGISTER_KERNEL(sum, kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); } -PT_REGISTER_KERNEL(add, +PD_REGISTER_KERNEL(add, CPU, ALL_LAYOUT, phi::AddKernel, @@ -111,7 +111,7 @@ PT_REGISTER_KERNEL(add, int64_t, complex64, complex128) {} -PT_REGISTER_KERNEL(subtract, +PD_REGISTER_KERNEL(subtract, CPU, ALL_LAYOUT, phi::SubtractKernel, @@ -122,7 +122,7 @@ PT_REGISTER_KERNEL(subtract, int64_t, complex64, complex128) {} -PT_REGISTER_KERNEL(divide, +PD_REGISTER_KERNEL(divide, CPU, ALL_LAYOUT, phi::DivideKernel, @@ -132,7 +132,7 @@ PT_REGISTER_KERNEL(divide, int64_t, complex64, complex128) {} -PT_REGISTER_KERNEL(multiply, +PD_REGISTER_KERNEL(multiply, CPU, ALL_LAYOUT, phi::MultiplyKernel, @@ -145,15 +145,17 @@ PT_REGISTER_KERNEL(multiply, complex128) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_REGISTER_KERNEL(mean, +PD_REGISTER_KERNEL(mean, GPU, ALL_LAYOUT, phi::MeanKernel, float, double, bool, + int, + int64_t, phi::dtype::float16) {} -PT_REGISTER_KERNEL(sum, +PD_REGISTER_KERNEL(sum, GPU, ALL_LAYOUT, phi::SumKernel, @@ -168,7 +170,7 @@ PT_REGISTER_KERNEL(sum, complex128) { kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); } -PT_REGISTER_KERNEL(add, +PD_REGISTER_KERNEL(add, GPU, ALL_LAYOUT, phi::AddKernel, @@ -180,7 +182,7 @@ PT_REGISTER_KERNEL(add, phi::dtype::float16, complex64, complex128) {} -PT_REGISTER_KERNEL(subtract, +PD_REGISTER_KERNEL(subtract, GPU, ALL_LAYOUT, phi::SubtractKernel, @@ -192,7 +194,7 @@ PT_REGISTER_KERNEL(subtract, phi::dtype::float16, complex64, complex128) {} -PT_REGISTER_KERNEL(divide, +PD_REGISTER_KERNEL(divide, GPU, ALL_LAYOUT, phi::DivideKernel, @@ -203,7 +205,7 @@ PT_REGISTER_KERNEL(divide, phi::dtype::float16, complex64, complex128) {} -PT_REGISTER_KERNEL(multiply, 
+PD_REGISTER_KERNEL(multiply, GPU, ALL_LAYOUT, phi::MultiplyKernel, diff --git a/paddle/phi/kernels/primitive/compute_primitives.h b/paddle/phi/kernels/primitive/compute_primitives.h index 574b94fd35d7fb..4f3c069f3b2491 100644 --- a/paddle/phi/kernels/primitive/compute_primitives.h +++ b/paddle/phi/kernels/primitive/compute_primitives.h @@ -136,6 +136,40 @@ __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) { return shared_memory[threadIdx.x]; } +// Swap data +template +__device__ __forceinline__ void Swap(T* first_value, T* second_value) { + T t_value; + t_value = (*first_value); + (*first_value) = (*second_value); + (*second_value) = t_value; +} + +// swap with monotonic_type +template +__device__ __forceinline__ void Comparator(T* first_value, + T* second_value, + int monotonic_type) { + if (((*first_value) > (*second_value)) == monotonic_type) { + Swap(first_value, second_value); + } +} + +template +__device__ __forceinline__ void ComparatorWithIndex(T* first_value, + + T* second_value, + IndexType* first_index, + IndexType* second_index, + int monotonic_type) { + if ((*first_value > (*second_value)) == monotonic_type) { + // swap value + Swap(first_value, second_value); + // swap index + Swap(first_index, second_index); + } +} + } // namespace details /** @@ -485,5 +519,94 @@ __device__ __forceinline__ void Cumsum(OutT* out, static_cast(temp[tidx + shared_size + (tidx + shared_size) / 32]); } +#define SHARED_SIZE_LIMIT \ + 1024 // each thread load 2 data from global memory so SHARED_SIZE_LIMIT must + // larger than blockDim.x * 2 +// if monotonic_type = 1 then increase +// if gridDim.x > 1 please set monotonic_type = blockIdx.x & 1; blockIdx.x % 2 +// == 1 the increase +template +__device__ __forceinline__ void Sort(T* dst, + const T* src_data, + int num, + int monotonic_type) { + // todo: set num = Pow2(num) + // shareMem for value and index num must smaller than SHARED_SIZE_LIMIT / 2 + __shared__ T value[SHARED_SIZE_LIMIT]; // shareMem's size must larger than + // blockDim * 2 + // Copy value and index from src and src_index + value[threadIdx.x] = src_data[0]; + value[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = src_data[1]; + // make bitonicSort + for (int size = 2; size < num; size <<= 1) { + int bitonic_type = (threadIdx.x & (size / 2)) != 0; + for (int stride = size / 2; stride > 0; stride >>= 1) { + __syncthreads(); + int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + details::Comparator(&value[pos], &value[pos + stride], bitonic_type); + } + } + // last sort + for (int stride = SHARED_SIZE_LIMIT / 2; stride > 0; stride >>= 1) { + __syncthreads(); + int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + // last sort when monotonic_type = 1 then increase + details::Comparator(&value[pos], &value[pos + stride], monotonic_type); + } + __syncthreads(); + dst[0] = value[threadIdx.x]; + dst[1] = value[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; +} + +template +__device__ __forceinline__ void Sort(T* dst, + IndexType* dst_index, + const T* src_data, + IndexType* src_index, + int num, + int monotonic_type) { + // todo: set num = Pow2(num) + // shareMem for value and index num must smaller than SHARED_SIZE_LIMIT / 2 + __shared__ T value[SHARED_SIZE_LIMIT]; // shareMem's size must larger than + // blockDim * 2 + __shared__ IndexType index[SHARED_SIZE_LIMIT]; + // Copy value and index from src and src_index + value[threadIdx.x] = src_data[0]; + value[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = src_data[1]; + // index + index[threadIdx.x] = src_index[0]; + 
index[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = src_index[1]; + // make bitonicSort + for (int size = 2; size < num; size <<= 1) { + int bitonic_type = (threadIdx.x & (size / 2)) != 0; + for (int stride = size / 2; stride > 0; stride >>= 1) { + __syncthreads(); + int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + details::ComparatorWithIndex(&value[pos], + &value[pos + stride], + &index[pos], + &index[pos + stride], + bitonic_type); + } + } + + for (int stride = SHARED_SIZE_LIMIT / 2; stride > 0; stride >>= 1) { + __syncthreads(); + int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); + // last sort when monotonic_type = 1 then increase + details::ComparatorWithIndex(&value[pos], + &value[pos + stride], + &index[pos], + &index[pos + stride], + monotonic_type); + } + + __syncthreads(); + dst[0] = value[threadIdx.x]; + dst[1] = value[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; + dst_index[0] = index[threadIdx.x]; + dst_index[1] = index[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; +} + } // namespace kps } // namespace phi diff --git a/paddle/phi/kernels/primitive/datamover_primitives.h b/paddle/phi/kernels/primitive/datamover_primitives.h index 120be251db2c80..a6c4c40a7505e1 100644 --- a/paddle/phi/kernels/primitive/datamover_primitives.h +++ b/paddle/phi/kernels/primitive/datamover_primitives.h @@ -714,5 +714,14 @@ __device__ __forceinline__ void ReadDataBc( } } +template +__device__ __forceinline__ void InitWithDataIndex(T* dst, int block_offset) { + int thread_offset = block_offset + threadIdx.x * NX; +#pragma unroll + for (int nx = 0; nx < NX; ++nx) { + dst[nx] = static_cast(thread_offset + nx); + } +} + } // namespace kps } // namespace phi diff --git a/paddle/phi/kernels/reshape_grad_kernel.cc b/paddle/phi/kernels/reshape_grad_kernel.cc index 436813b53e6cd3..5361315bb611b0 100644 --- a/paddle/phi/kernels/reshape_grad_kernel.cc +++ b/paddle/phi/kernels/reshape_grad_kernel.cc @@ -37,24 +37,24 @@ void ReshapeDoubleGradKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_GENERAL_KERNEL(reshape_grad, +PD_REGISTER_GENERAL_KERNEL(reshape_grad, CPU, ALL_LAYOUT, phi::ReshapeGradKernel, ALL_DTYPE) {} -PT_REGISTER_GENERAL_KERNEL(reshape_double_grad, +PD_REGISTER_GENERAL_KERNEL(reshape_double_grad, CPU, ALL_LAYOUT, phi::ReshapeDoubleGradKernel, ALL_DTYPE) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_REGISTER_GENERAL_KERNEL(reshape_grad, +PD_REGISTER_GENERAL_KERNEL(reshape_grad, GPU, ALL_LAYOUT, phi::ReshapeGradKernel, ALL_DTYPE) {} -PT_REGISTER_GENERAL_KERNEL(reshape_double_grad, +PD_REGISTER_GENERAL_KERNEL(reshape_double_grad, GPU, ALL_LAYOUT, phi::ReshapeDoubleGradKernel, @@ -62,12 +62,12 @@ PT_REGISTER_GENERAL_KERNEL(reshape_double_grad, #endif #ifdef PADDLE_WITH_XPU -PT_REGISTER_GENERAL_KERNEL(reshape_grad, +PD_REGISTER_GENERAL_KERNEL(reshape_grad, XPU, ALL_LAYOUT, phi::ReshapeGradKernel, ALL_DTYPE) {} -PT_REGISTER_GENERAL_KERNEL(reshape_double_grad, +PD_REGISTER_GENERAL_KERNEL(reshape_double_grad, XPU, ALL_LAYOUT, phi::ReshapeDoubleGradKernel, diff --git a/paddle/phi/kernels/reshape_kernel.cc b/paddle/phi/kernels/reshape_kernel.cc index 0a6aeb030e28d9..570e70ea112271 100644 --- a/paddle/phi/kernels/reshape_kernel.cc +++ b/paddle/phi/kernels/reshape_kernel.cc @@ -47,24 +47,23 @@ void ReshapeWithXShape(const Context& dev_ctx, const ScalarArray& shape, DenseTensor* xshape, DenseTensor* out) { - funcs::SetXShape(x, xshape); ReshapeKernel(dev_ctx, x, shape, out); } } // namespace phi -PT_REGISTER_GENERAL_KERNEL( +PD_REGISTER_GENERAL_KERNEL( reshape, CPU, 
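Note: the Sort helpers added to compute_primitives.h are a block-wide bitonic sort. Each thread holds two elements of the tile in shared memory (hence the comment that SHARED_SIZE_LIMIT must be larger than blockDim.x * 2), the outer loop over size builds bitonic runs of growing length with the per-half direction flag bitonic_type = (threadIdx.x & (size / 2)) != 0, and the final stride loop merges the whole tile in the direction given by monotonic_type (1 for ascending); the ComparatorWithIndex overload additionally swaps a parallel index array so values and indices stay paired. The sequential sketch below shows the same compare-exchange schedule on a plain array of power-of-two length; it is host-side C++ for illustration only and is unrelated to the kps device code.

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // Sequential bitonic sort (ascending) over n = 2^k elements.
    void bitonic_sort(std::vector<int>& a) {
      const int n = static_cast<int>(a.size());  // must be a power of two
      for (int size = 2; size <= n; size <<= 1) {                // build bitonic runs
        for (int stride = size / 2; stride > 0; stride >>= 1) {  // merge them
          for (int i = 0; i < n; ++i) {
            const int partner = i ^ stride;              // compare-exchange pair
            if (partner > i) {
              const bool ascending = ((i & size) == 0);  // direction of this run
              const bool out_of_order =
                  ascending ? (a[i] > a[partner]) : (a[i] < a[partner]);
              if (out_of_order) std::swap(a[i], a[partner]);
            }
          }
        }
      }
    }

    int main() {
      std::vector<int> a = {9, 2, 7, 4, 3, 8, 1, 6};
      bitonic_sort(a);
      for (int v : a) std::printf("%d ", v);  // 1 2 3 4 6 7 8 9
      std::printf("\n");
      return 0;
    }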
ALL_LAYOUT, phi::ReshapeKernel, ALL_DTYPE) {} -PT_REGISTER_GENERAL_KERNEL(reshape_with_xshape, +PD_REGISTER_GENERAL_KERNEL(reshape_with_xshape, CPU, ALL_LAYOUT, phi::ReshapeWithXShape, ALL_DTYPE) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_REGISTER_GENERAL_KERNEL( +PD_REGISTER_GENERAL_KERNEL( reshape, GPU, ALL_LAYOUT, phi::ReshapeKernel, ALL_DTYPE) {} -PT_REGISTER_GENERAL_KERNEL(reshape_with_xshape, +PD_REGISTER_GENERAL_KERNEL(reshape_with_xshape, GPU, ALL_LAYOUT, phi::ReshapeWithXShape, @@ -72,9 +71,9 @@ PT_REGISTER_GENERAL_KERNEL(reshape_with_xshape, #endif #ifdef PADDLE_WITH_XPU -PT_REGISTER_GENERAL_KERNEL( +PD_REGISTER_GENERAL_KERNEL( reshape, XPU, ALL_LAYOUT, phi::ReshapeKernel, ALL_DTYPE) {} -PT_REGISTER_GENERAL_KERNEL(reshape_with_xshape, +PD_REGISTER_GENERAL_KERNEL(reshape_with_xshape, XPU, ALL_LAYOUT, phi::ReshapeWithXShape, diff --git a/paddle/phi/kernels/selected_rows/full_kernel.cc b/paddle/phi/kernels/selected_rows/full_kernel.cc index baedf899d2b53a..02231867fdd35c 100644 --- a/paddle/phi/kernels/selected_rows/full_kernel.cc +++ b/paddle/phi/kernels/selected_rows/full_kernel.cc @@ -36,7 +36,7 @@ void FullSR(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(full_sr, +PD_REGISTER_KERNEL(full_sr, CPU, ALL_LAYOUT, phi::FullSR, @@ -53,7 +53,7 @@ PT_REGISTER_KERNEL(full_sr, phi::dtype::complex) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_REGISTER_KERNEL(full_sr, +PD_REGISTER_KERNEL(full_sr, GPU, ALL_LAYOUT, phi::FullSR, diff --git a/paddle/phi/kernels/selected_rows/scale_kernel.cc b/paddle/phi/kernels/selected_rows/scale_kernel.cc index 67717ed469488d..094b6f4d12022b 100644 --- a/paddle/phi/kernels/selected_rows/scale_kernel.cc +++ b/paddle/phi/kernels/selected_rows/scale_kernel.cc @@ -38,7 +38,7 @@ void ScaleSR(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(scale_sr, +PD_REGISTER_KERNEL(scale_sr, CPU, ALL_LAYOUT, phi::ScaleSR, @@ -52,7 +52,7 @@ PT_REGISTER_KERNEL(scale_sr, int64_t) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_REGISTER_KERNEL(scale_sr, +PD_REGISTER_KERNEL(scale_sr, GPU, ALL_LAYOUT, phi::ScaleSR, diff --git a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc index 408240b90a9884..4374b5d7f1a1d9 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc @@ -284,7 +284,7 @@ void SparseCooToDenseKernel(const Context& dev_ctx, } // namespace sparse } // namespace phi -PT_REGISTER_KERNEL(dense_to_sparse_coo, +PD_REGISTER_KERNEL(dense_to_sparse_coo, CPU, ALL_LAYOUT, phi::sparse::DenseToSparseCooKernel, @@ -297,7 +297,7 @@ PT_REGISTER_KERNEL(dense_to_sparse_coo, int, int64_t) {} -PT_REGISTER_KERNEL(sparse_csr_to_coo, +PD_REGISTER_KERNEL(sparse_csr_to_coo, CPU, ALL_LAYOUT, phi::sparse::SparseCsrToCooKernel, @@ -310,7 +310,7 @@ PT_REGISTER_KERNEL(sparse_csr_to_coo, int, int64_t) {} -PT_REGISTER_KERNEL(sparse_coo_to_csr, +PD_REGISTER_KERNEL(sparse_coo_to_csr, CPU, ALL_LAYOUT, phi::sparse::SparseCooToCsrKernel, @@ -323,7 +323,7 @@ PT_REGISTER_KERNEL(sparse_coo_to_csr, int, int64_t) {} -PT_REGISTER_KERNEL(dense_to_sparse_csr, +PD_REGISTER_KERNEL(dense_to_sparse_csr, CPU, ALL_LAYOUT, phi::sparse::DenseToSparseCsrKernel, @@ -336,7 +336,7 @@ PT_REGISTER_KERNEL(dense_to_sparse_csr, int, int64_t) {} -PT_REGISTER_KERNEL(sparse_coo_to_dense, +PD_REGISTER_KERNEL(sparse_coo_to_dense, CPU, ALL_LAYOUT, phi::sparse::SparseCooToDenseKernel, @@ -349,7 +349,7 @@ 
PT_REGISTER_KERNEL(sparse_coo_to_dense, int, int64_t) {} -PT_REGISTER_KERNEL(sparse_csr_to_dense, +PD_REGISTER_KERNEL(sparse_csr_to_dense, CPU, ALL_LAYOUT, phi::sparse::SparseCsrToDenseKernel, diff --git a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu index ab2be13615e0e0..b7793e40554455 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu @@ -553,7 +553,7 @@ void SparseCooToDenseKernel(const Context& dev_ctx, } // namespace sparse } // namespace phi -PT_REGISTER_KERNEL(dense_to_sparse_coo, +PD_REGISTER_KERNEL(dense_to_sparse_coo, GPU, ALL_LAYOUT, phi::sparse::DenseToSparseCooKernel, @@ -566,7 +566,7 @@ PT_REGISTER_KERNEL(dense_to_sparse_coo, int, int64_t) {} -PT_REGISTER_KERNEL(sparse_csr_to_coo, +PD_REGISTER_KERNEL(sparse_csr_to_coo, GPU, ALL_LAYOUT, phi::sparse::SparseCsrToCooKernel, @@ -579,7 +579,7 @@ PT_REGISTER_KERNEL(sparse_csr_to_coo, int, int64_t) {} -PT_REGISTER_KERNEL(sparse_coo_to_csr, +PD_REGISTER_KERNEL(sparse_coo_to_csr, GPU, ALL_LAYOUT, phi::sparse::SparseCooToCsrKernel, @@ -592,7 +592,7 @@ PT_REGISTER_KERNEL(sparse_coo_to_csr, int, int64_t) {} -PT_REGISTER_KERNEL(dense_to_sparse_csr, +PD_REGISTER_KERNEL(dense_to_sparse_csr, GPU, ALL_LAYOUT, phi::sparse::DenseToSparseCsrKernel, @@ -605,7 +605,7 @@ PT_REGISTER_KERNEL(dense_to_sparse_csr, int, int64_t) {} -PT_REGISTER_KERNEL(sparse_coo_to_dense, +PD_REGISTER_KERNEL(sparse_coo_to_dense, GPU, ALL_LAYOUT, phi::sparse::SparseCooToDenseKernel, @@ -618,7 +618,7 @@ PT_REGISTER_KERNEL(sparse_coo_to_dense, int, int64_t) {} -PT_REGISTER_KERNEL(sparse_csr_to_dense, +PD_REGISTER_KERNEL(sparse_csr_to_dense, GPU, ALL_LAYOUT, phi::sparse::SparseCsrToDenseKernel, diff --git a/paddle/phi/kernels/transfer_layout_kernel.cc b/paddle/phi/kernels/transfer_layout_kernel.cc index eb7146487e38b2..c981ca11585070 100644 --- a/paddle/phi/kernels/transfer_layout_kernel.cc +++ b/paddle/phi/kernels/transfer_layout_kernel.cc @@ -69,7 +69,7 @@ void TransferLayoutKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_GENERAL_KERNEL(pten_transfer_layout, +PD_REGISTER_GENERAL_KERNEL(pten_transfer_layout, CPU, ALL_LAYOUT, phi::TransferLayoutKernel, diff --git a/paddle/phi/kernels/xpu/cast_kernel.cc b/paddle/phi/kernels/xpu/cast_kernel.cc index 0e50306a068c89..9aa503d58736de 100644 --- a/paddle/phi/kernels/xpu/cast_kernel.cc +++ b/paddle/phi/kernels/xpu/cast_kernel.cc @@ -86,7 +86,7 @@ void CastKernel(const Context& dev_ctx, } } // namespace phi -PT_REGISTER_KERNEL(cast, +PD_REGISTER_KERNEL(cast, XPU, ALL_LAYOUT, phi::CastKernel, diff --git a/paddle/phi/kernels/xpu/copy_kernel.cc b/paddle/phi/kernels/xpu/copy_kernel.cc index 559d110a9e8ad8..3bbedbbb346e42 100644 --- a/paddle/phi/kernels/xpu/copy_kernel.cc +++ b/paddle/phi/kernels/xpu/copy_kernel.cc @@ -69,5 +69,5 @@ void Copy(const Context& dev_ctx, } // namespace phi -PT_REGISTER_GENERAL_KERNEL( +PD_REGISTER_GENERAL_KERNEL( copy, XPU, ALL_LAYOUT, phi::Copy, ALL_DTYPE) {} diff --git a/paddle/phi/kernels/xpu/full_kernel.cc b/paddle/phi/kernels/xpu/full_kernel.cc index 98810fa9779a4a..b514425cc54da2 100644 --- a/paddle/phi/kernels/xpu/full_kernel.cc +++ b/paddle/phi/kernels/xpu/full_kernel.cc @@ -116,7 +116,7 @@ void FullLikeKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(full, +PD_REGISTER_KERNEL(full, XPU, ALL_LAYOUT, phi::FullKernel, @@ -132,7 +132,7 @@ PT_REGISTER_KERNEL(full, phi::dtype::complex, phi::dtype::complex) {} 
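Note: the sparse_utils registrations above cover the six conversions between dense tensors and the two sparse layouts, COO (an explicit list of nonzero coordinates plus their values) and CSR (per-row offsets into shared column-index and value arrays). The standalone sketch below encodes a small dense matrix both ways purely to illustrate the formats; it is not related to the phi sparse tensor classes.

    #include <cstdio>
    #include <vector>

    int main() {
      const int rows = 3, cols = 3;
      const float dense[3][3] = {{1, 0, 0}, {0, 0, 2}, {0, 3, 0}};

      std::vector<int> coo_rows, coo_cols;  // COO: one (row, col) per nonzero
      std::vector<float> values;            // shared by COO and CSR
      std::vector<int> crows = {0};         // CSR: row r spans [crows[r], crows[r+1])

      for (int r = 0; r < rows; ++r) {
        for (int c = 0; c < cols; ++c) {
          if (dense[r][c] != 0.0f) {
            coo_rows.push_back(r);
            coo_cols.push_back(c);          // doubles as the CSR column index
            values.push_back(dense[r][c]);
          }
        }
        crows.push_back(static_cast<int>(values.size()));
      }

      for (size_t i = 0; i < values.size(); ++i)
        std::printf("coo (%d, %d) = %g\n", coo_rows[i], coo_cols[i], values[i]);
      for (int r = 0; r <= rows; ++r)
        std::printf("crows[%d] = %d\n", r, crows[r]);  // 0 1 2 3 for this matrix
      return 0;
    }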
-PT_REGISTER_KERNEL(full_like, +PD_REGISTER_KERNEL(full_like, XPU, ALL_LAYOUT, phi::FullLikeKernel, diff --git a/paddle/phi/kernels/xpu/scale_kernel.cc b/paddle/phi/kernels/xpu/scale_kernel.cc index 0814e2d9b322fc..e103e5afdcf9be 100644 --- a/paddle/phi/kernels/xpu/scale_kernel.cc +++ b/paddle/phi/kernels/xpu/scale_kernel.cc @@ -56,7 +56,7 @@ void ScaleKernel(const Context& dev_ctx, } // namespace phi -PT_REGISTER_KERNEL(scale, +PD_REGISTER_KERNEL(scale, XPU, ALL_LAYOUT, phi::ScaleKernel, diff --git a/paddle/phi/ops/compat/abs_sig.cc b/paddle/phi/ops/compat/abs_sig.cc index 67319a18aafa1d..b4b94457e6be9f 100644 --- a/paddle/phi/ops/compat/abs_sig.cc +++ b/paddle/phi/ops/compat/abs_sig.cc @@ -32,7 +32,7 @@ KernelSignature AbsDoubleGradOpArgumentMapping( } // namespace phi -PT_REGISTER_ARG_MAPPING_FN(abs, phi::AbsOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(abs_grad, phi::AbsGradOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(abs_double_grad, +PD_REGISTER_ARG_MAPPING_FN(abs, phi::AbsOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(abs_grad, phi::AbsGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(abs_double_grad, phi::AbsDoubleGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/cast_sig.cc b/paddle/phi/ops/compat/cast_sig.cc index 79cf59f32990e9..3d970e92a7d688 100644 --- a/paddle/phi/ops/compat/cast_sig.cc +++ b/paddle/phi/ops/compat/cast_sig.cc @@ -22,4 +22,4 @@ KernelSignature CastOpArgumentMapping(const ArgumentMappingContext& ctx) { } // namespace phi -PT_REGISTER_ARG_MAPPING_FN(cast, phi::CastOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(cast, phi::CastOpArgumentMapping); diff --git a/paddle/phi/ops/compat/concat_sig.cc b/paddle/phi/ops/compat/concat_sig.cc index de37b973409e94..21e653ccfe90f8 100644 --- a/paddle/phi/ops/compat/concat_sig.cc +++ b/paddle/phi/ops/compat/concat_sig.cc @@ -25,4 +25,4 @@ KernelSignature ConcatOpArgumentMapping(const ArgumentMappingContext& ctx) { } // namespace phi -PT_REGISTER_ARG_MAPPING_FN(concat, phi::ConcatOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(concat, phi::ConcatOpArgumentMapping); diff --git a/paddle/phi/ops/compat/diagonal_sig.cc b/paddle/phi/ops/compat/diagonal_sig.cc index 430edea89bea2a..b4a424ec06bf2b 100644 --- a/paddle/phi/ops/compat/diagonal_sig.cc +++ b/paddle/phi/ops/compat/diagonal_sig.cc @@ -25,4 +25,4 @@ KernelSignature DiagonalGradOpArgumentMapping( } } // namespace phi -PT_REGISTER_ARG_MAPPING_FN(diagonal_grad, phi::DiagonalGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(diagonal_grad, phi::DiagonalGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/digamma_sig.cc b/paddle/phi/ops/compat/digamma_sig.cc index 555c16ef6b6bf1..fa693f92c6fe3a 100644 --- a/paddle/phi/ops/compat/digamma_sig.cc +++ b/paddle/phi/ops/compat/digamma_sig.cc @@ -24,4 +24,4 @@ KernelSignature DigammaGradOpArgumentMapping( } // namespace phi -PT_REGISTER_ARG_MAPPING_FN(digamma_grad, phi::DigammaGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(digamma_grad, phi::DigammaGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/dot_sig.cc b/paddle/phi/ops/compat/dot_sig.cc index 481bd3a4949d8c..2437ecc1ca7672 100644 --- a/paddle/phi/ops/compat/dot_sig.cc +++ b/paddle/phi/ops/compat/dot_sig.cc @@ -25,4 +25,4 @@ KernelSignature DotGradOpArgumentMapping(const ArgumentMappingContext& ctx) { } // namespace phi -PT_REGISTER_ARG_MAPPING_FN(dot_grad, phi::DotGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(dot_grad, phi::DotGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/elementwise_sig.cc 
b/paddle/phi/ops/compat/elementwise_sig.cc index dfffa034f1d1de..cddebcbce1273a 100644 --- a/paddle/phi/ops/compat/elementwise_sig.cc +++ b/paddle/phi/ops/compat/elementwise_sig.cc @@ -102,28 +102,28 @@ KernelSignature ElementwiseSubGradOpArgumentMapping( } // namespace phi -PT_REGISTER_BASE_KERNEL_NAME(elementwise_add, add); -PT_REGISTER_BASE_KERNEL_NAME(elementwise_sub, subtract); -PT_REGISTER_BASE_KERNEL_NAME(elementwise_mul, multiply); -PT_REGISTER_BASE_KERNEL_NAME(elementwise_div, divide); -PT_REGISTER_BASE_KERNEL_NAME(elementwise_add_grad, add_grad); -PT_REGISTER_BASE_KERNEL_NAME(elementwise_add_grad_grad, add_double_grad); -PT_REGISTER_BASE_KERNEL_NAME(elementwise_add_triple_grad, add_triple_grad); -PT_REGISTER_BASE_KERNEL_NAME(elementwise_sub_grad, subtract_grad); - -PT_REGISTER_ARG_MAPPING_FN(elementwise_add, +PD_REGISTER_BASE_KERNEL_NAME(elementwise_add, add); +PD_REGISTER_BASE_KERNEL_NAME(elementwise_sub, subtract); +PD_REGISTER_BASE_KERNEL_NAME(elementwise_mul, multiply); +PD_REGISTER_BASE_KERNEL_NAME(elementwise_div, divide); +PD_REGISTER_BASE_KERNEL_NAME(elementwise_add_grad, add_grad); +PD_REGISTER_BASE_KERNEL_NAME(elementwise_add_grad_grad, add_double_grad); +PD_REGISTER_BASE_KERNEL_NAME(elementwise_add_triple_grad, add_triple_grad); +PD_REGISTER_BASE_KERNEL_NAME(elementwise_sub_grad, subtract_grad); + +PD_REGISTER_ARG_MAPPING_FN(elementwise_add, phi::ElementwiseAddOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(elementwise_sub, +PD_REGISTER_ARG_MAPPING_FN(elementwise_sub, phi::ElementwiseSubOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(elementwise_mul, +PD_REGISTER_ARG_MAPPING_FN(elementwise_mul, phi::ElementwiseMulOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(elementwise_div, +PD_REGISTER_ARG_MAPPING_FN(elementwise_div, phi::ElementwiseDivOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(elementwise_add_grad, +PD_REGISTER_ARG_MAPPING_FN(elementwise_add_grad, phi::ElementwiseAddGradOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(elementwise_add_grad_grad, +PD_REGISTER_ARG_MAPPING_FN(elementwise_add_grad_grad, phi::ElementwiseAddDoubleGradOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(elementwise_add_triple_grad, +PD_REGISTER_ARG_MAPPING_FN(elementwise_add_triple_grad, phi::ElementwiseAddTripleGradOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(elementwise_sub_grad, +PD_REGISTER_ARG_MAPPING_FN(elementwise_sub_grad, phi::ElementwiseSubGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/empty_sig.cc b/paddle/phi/ops/compat/empty_sig.cc index 9315fdf827dcf0..42cd55bdc0cdab 100644 --- a/paddle/phi/ops/compat/empty_sig.cc +++ b/paddle/phi/ops/compat/empty_sig.cc @@ -28,4 +28,4 @@ KernelSignature EmptyOpArgumentMapping(const ArgumentMappingContext& ctx) { } // namespace phi -PT_REGISTER_ARG_MAPPING_FN(empty, phi::EmptyOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(empty, phi::EmptyOpArgumentMapping); diff --git a/paddle/phi/ops/compat/expand_sig.cc b/paddle/phi/ops/compat/expand_sig.cc index 3f7ff458296c7e..3b2e468267da03 100644 --- a/paddle/phi/ops/compat/expand_sig.cc +++ b/paddle/phi/ops/compat/expand_sig.cc @@ -47,8 +47,8 @@ KernelSignature ExpandGradOpArgumentMapping(const ArgumentMappingContext& ctx) { } // namespace phi -PT_REGISTER_BASE_KERNEL_NAME(expand_v2, expand); -PT_REGISTER_BASE_KERNEL_NAME(expand_v2_grad, expand_grad); +PD_REGISTER_BASE_KERNEL_NAME(expand_v2, expand); +PD_REGISTER_BASE_KERNEL_NAME(expand_v2_grad, expand_grad); -PT_REGISTER_ARG_MAPPING_FN(expand_v2, phi::ExpandOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(expand_v2_grad, 
phi::ExpandGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(expand_v2, phi::ExpandOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(expand_v2_grad, phi::ExpandGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/fill_any_like_sig.cc b/paddle/phi/ops/compat/fill_any_like_sig.cc index 3fbd022ca6a9a9..84af155d402d6b 100644 --- a/paddle/phi/ops/compat/fill_any_like_sig.cc +++ b/paddle/phi/ops/compat/fill_any_like_sig.cc @@ -23,6 +23,6 @@ KernelSignature FillAnyLikeOpArgumentMapping( } // namespace phi -PT_REGISTER_BASE_KERNEL_NAME(fill_any_like, full_like); +PD_REGISTER_BASE_KERNEL_NAME(fill_any_like, full_like); -PT_REGISTER_ARG_MAPPING_FN(fill_any_like, phi::FillAnyLikeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fill_any_like, phi::FillAnyLikeOpArgumentMapping); diff --git a/paddle/phi/ops/compat/fill_constant_sig.cc b/paddle/phi/ops/compat/fill_constant_sig.cc index 85dfdc3db3eaed..df28a7b81b61b5 100644 --- a/paddle/phi/ops/compat/fill_constant_sig.cc +++ b/paddle/phi/ops/compat/fill_constant_sig.cc @@ -123,6 +123,6 @@ KernelSignature FillConstantOpArgumentMapping( } // namespace phi -PT_REGISTER_BASE_KERNEL_NAME(fill_constant, full); +PD_REGISTER_BASE_KERNEL_NAME(fill_constant, full); -PT_REGISTER_ARG_MAPPING_FN(fill_constant, phi::FillConstantOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fill_constant, phi::FillConstantOpArgumentMapping); diff --git a/paddle/phi/ops/compat/flatten_sig.cc b/paddle/phi/ops/compat/flatten_sig.cc index ae5f438cafc248..b72ad05ea09d8d 100644 --- a/paddle/phi/ops/compat/flatten_sig.cc +++ b/paddle/phi/ops/compat/flatten_sig.cc @@ -36,10 +36,10 @@ KernelSignature FlattenGradOpArgumentMapping( } // namespace phi -PT_REGISTER_BASE_KERNEL_NAME(flatten_contiguous_range, flatten); -PT_REGISTER_BASE_KERNEL_NAME(flatten_contiguous_range_grad, flatten_grad); +PD_REGISTER_BASE_KERNEL_NAME(flatten_contiguous_range, flatten); +PD_REGISTER_BASE_KERNEL_NAME(flatten_contiguous_range_grad, flatten_grad); -PT_REGISTER_ARG_MAPPING_FN(flatten_contiguous_range, +PD_REGISTER_ARG_MAPPING_FN(flatten_contiguous_range, phi::FlattenOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(flatten_contiguous_range_grad, +PD_REGISTER_ARG_MAPPING_FN(flatten_contiguous_range_grad, phi::FlattenGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/histogram_sig.cc b/paddle/phi/ops/compat/histogram_sig.cc index 0fd1fdea764248..0cea146ea4e7fc 100644 --- a/paddle/phi/ops/compat/histogram_sig.cc +++ b/paddle/phi/ops/compat/histogram_sig.cc @@ -22,4 +22,4 @@ KernelSignature HistogramOpArgumentMapping(const ArgumentMappingContext& ctx) { } // namespace phi -PT_REGISTER_ARG_MAPPING_FN(histogram, phi::HistogramOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(histogram, phi::HistogramOpArgumentMapping); diff --git a/paddle/phi/ops/compat/huber_loss_sig.cc b/paddle/phi/ops/compat/huber_loss_sig.cc new file mode 100644 index 00000000000000..6f669a4a8b697a --- /dev/null +++ b/paddle/phi/ops/compat/huber_loss_sig.cc @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature HuberLossOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "huber_loss", {"X", "Y"}, {"delta"}, {"Out", "Residual"}); +} + +KernelSignature HuberLossGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("huber_loss_grad", + {"Residual", GradVarName("Out")}, + {"delta"}, + {GradVarName("X"), GradVarName("Y")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(huber_loss, phi::HuberLossOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(huber_loss_grad, + phi::HuberLossGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/lerp_sig.cc b/paddle/phi/ops/compat/lerp_sig.cc index d33a714048bd00..3a8b23ca4c4a4a 100644 --- a/paddle/phi/ops/compat/lerp_sig.cc +++ b/paddle/phi/ops/compat/lerp_sig.cc @@ -29,5 +29,5 @@ KernelSignature LerpGradOpArgumentMapping(const ArgumentMappingContext& ctx) { } // namespace phi -PT_REGISTER_ARG_MAPPING_FN(lerp, phi::LerpOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(lerp_grad, phi::LerpGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(lerp, phi::LerpOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(lerp_grad, phi::LerpGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/masked_select_sig.cc b/paddle/phi/ops/compat/masked_select_sig.cc index 77a97d103e8890..8083b123bcff53 100644 --- a/paddle/phi/ops/compat/masked_select_sig.cc +++ b/paddle/phi/ops/compat/masked_select_sig.cc @@ -31,6 +31,6 @@ KernelSignature MaskedSelectGradOpArgumentMapping( } // namespace phi -PT_REGISTER_ARG_MAPPING_FN(masked_select, phi::MaskedSelectOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(masked_select_grad, +PD_REGISTER_ARG_MAPPING_FN(masked_select, phi::MaskedSelectOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(masked_select_grad, phi::MaskedSelectGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/matmul_sig.cc b/paddle/phi/ops/compat/matmul_sig.cc index d4106cd39e3044..771a7c3acc39df 100644 --- a/paddle/phi/ops/compat/matmul_sig.cc +++ b/paddle/phi/ops/compat/matmul_sig.cc @@ -49,13 +49,13 @@ KernelSignature MatmulTripleGradOpArgumentMapping( } // namespace phi -PT_REGISTER_BASE_KERNEL_NAME(matmul_v2, matmul); -PT_REGISTER_BASE_KERNEL_NAME(matmul_v2_grad, matmul_grad); -PT_REGISTER_BASE_KERNEL_NAME(matmul_v2_grad_grad, matmul_double_grad); -PT_REGISTER_BASE_KERNEL_NAME(matmul_v2_triple_grad, matmul_triple_grad); +PD_REGISTER_BASE_KERNEL_NAME(matmul_v2, matmul); +PD_REGISTER_BASE_KERNEL_NAME(matmul_v2_grad, matmul_grad); +PD_REGISTER_BASE_KERNEL_NAME(matmul_v2_grad_grad, matmul_double_grad); +PD_REGISTER_BASE_KERNEL_NAME(matmul_v2_triple_grad, matmul_triple_grad); -PT_REGISTER_ARG_MAPPING_FN(matmul_v2_grad, phi::MatmulGradOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(matmul_v2_grad_grad, +PD_REGISTER_ARG_MAPPING_FN(matmul_v2_grad, phi::MatmulGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(matmul_v2_grad_grad, phi::MatmulDoubleGradOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(matmul_v2_triple_grad, +PD_REGISTER_ARG_MAPPING_FN(matmul_v2_triple_grad, phi::MatmulTripleGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/norm_sig.cc b/paddle/phi/ops/compat/norm_sig.cc index f67c22ba712c8c..81d294b8424857 100644 --- a/paddle/phi/ops/compat/norm_sig.cc +++ b/paddle/phi/ops/compat/norm_sig.cc @@ -30,5 +30,5 @@ KernelSignature NormGradOpArgumentMapping(const ArgumentMappingContext& ctx) { } // namespace 
phi -PT_REGISTER_ARG_MAPPING_FN(norm, phi::NormOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(norm_grad, phi::NormGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(norm, phi::NormOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(norm_grad, phi::NormGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/reduce_sig.cc b/paddle/phi/ops/compat/reduce_sig.cc index 2d16817ad886b6..74704671f8b5d2 100644 --- a/paddle/phi/ops/compat/reduce_sig.cc +++ b/paddle/phi/ops/compat/reduce_sig.cc @@ -45,8 +45,8 @@ KernelSignature ReduceMeanOpArgumentMapping(const ArgumentMappingContext& ctx) { } // namespace phi -PT_REGISTER_BASE_KERNEL_NAME(reduce_sum, sum); -PT_REGISTER_BASE_KERNEL_NAME(reduce_mean, mean); +PD_REGISTER_BASE_KERNEL_NAME(reduce_sum, sum); +PD_REGISTER_BASE_KERNEL_NAME(reduce_mean, mean); -PT_REGISTER_ARG_MAPPING_FN(reduce_sum, phi::ReduceSumOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(reduce_mean, phi::ReduceMeanOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_sum, phi::ReduceSumOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reduce_mean, phi::ReduceMeanOpArgumentMapping); diff --git a/paddle/phi/ops/compat/reshape_sig.cc b/paddle/phi/ops/compat/reshape_sig.cc index 353d364e0ce0b3..b6d10dabb1c7f6 100644 --- a/paddle/phi/ops/compat/reshape_sig.cc +++ b/paddle/phi/ops/compat/reshape_sig.cc @@ -17,13 +17,19 @@ limitations under the License. */ namespace phi { KernelSignature ReshapeOpArgumentMapping(const ArgumentMappingContext& ctx) { - if (ctx.InputSize("ShapeTensor") > 0) { - return KernelSignature("reshape", {"X"}, {"ShapeTensor"}, {"Out"}); - } else if (ctx.HasInput("Shape")) { - return KernelSignature("reshape", {"X"}, {"Shape"}, {"Out"}); - } else { - return KernelSignature("reshape", {"X"}, {"shape"}, {"Out"}); + if (ctx.HasOutput("XShape")) { + if (ctx.InputSize("ShapeTensor") > 0) { + return KernelSignature( + "reshape_with_xshape", {"X"}, {"ShapeTensor"}, {"XShape", "Out"}); + } else if (ctx.HasInput("Shape")) { + return KernelSignature( + "reshape_with_xshape", {"X"}, {"Shape"}, {"XShape", "Out"}); + } else { + return KernelSignature( + "reshape_with_xshape", {"X"}, {"shape"}, {"XShape", "Out"}); + } } + return KernelSignature("unregistered", {}, {}, {}); } KernelSignature ReshapeGradOpArgumentMapping( @@ -39,11 +45,11 @@ KernelSignature ReshapeDoubleGradOpArgumentMapping( } // namespace phi -PT_REGISTER_BASE_KERNEL_NAME(reshape2, reshape); -PT_REGISTER_BASE_KERNEL_NAME(reshape2_grad, reshape_grad); -PT_REGISTER_BASE_KERNEL_NAME(reshape2_grad_grad, reshape_double_grad); +PD_REGISTER_BASE_KERNEL_NAME(reshape2, reshape); +PD_REGISTER_BASE_KERNEL_NAME(reshape2_grad, reshape_grad); +PD_REGISTER_BASE_KERNEL_NAME(reshape2_grad_grad, reshape_double_grad); -PT_REGISTER_ARG_MAPPING_FN(reshape2, phi::ReshapeOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(reshape2_grad, phi::ReshapeGradOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(reshape2_grad_grad, +PD_REGISTER_ARG_MAPPING_FN(reshape2, phi::ReshapeOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reshape2_grad, phi::ReshapeGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(reshape2_grad_grad, phi::ReshapeDoubleGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/scale_sig.cc b/paddle/phi/ops/compat/scale_sig.cc index da8d028b2e39ad..915ea4ce302aea 100644 --- a/paddle/phi/ops/compat/scale_sig.cc +++ b/paddle/phi/ops/compat/scale_sig.cc @@ -72,4 +72,4 @@ KernelSignature ScaleOpArgumentMapping(const ArgumentMappingContext& ctx) { } // namespace phi // op_type, api_name, arg_mapping_fn -PT_REGISTER_ARG_MAPPING_FN(scale, 
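Note: the rewritten ReshapeOpArgumentMapping above now keys its dispatch on whether the op instance carries an XShape output. When it does, the op is mapped onto the reshape_with_xshape kernel, taking the target shape from the ShapeTensor input list first, then the Shape input, then the shape attribute; when it does not, the function returns an explicitly unregistered signature instead of a plain reshape mapping. The small helper below mirrors that decision order; PickReshapeKernel and its boolean parameters are hypothetical stand-ins for ctx.HasOutput, ctx.InputSize and ctx.HasInput, not Paddle API.

    #include <cstdio>
    #include <string>

    // Hypothetical mirror of the dispatch order in ReshapeOpArgumentMapping.
    std::string PickReshapeKernel(bool has_xshape_output,
                                  bool has_shape_tensor_input,
                                  bool has_shape_input) {
      if (!has_xshape_output) return "unregistered";
      if (has_shape_tensor_input) return "reshape_with_xshape (ShapeTensor)";
      if (has_shape_input) return "reshape_with_xshape (Shape)";
      return "reshape_with_xshape (shape attribute)";
    }

    int main() {
      // An op with an XShape output and a Shape input:
      std::printf("%s\n", PickReshapeKernel(true, false, true).c_str());
      return 0;
    }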
phi::ScaleOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(scale, phi::ScaleOpArgumentMapping); diff --git a/paddle/phi/ops/compat/split_sig.cc b/paddle/phi/ops/compat/split_sig.cc index 361a928e753948..b3a614aab00126 100644 --- a/paddle/phi/ops/compat/split_sig.cc +++ b/paddle/phi/ops/compat/split_sig.cc @@ -46,4 +46,4 @@ KernelSignature SplitOpArgumentMapping(const ArgumentMappingContext& ctx) { } // namespace phi -PT_REGISTER_ARG_MAPPING_FN(split, phi::SplitOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(split, phi::SplitOpArgumentMapping); diff --git a/paddle/phi/ops/compat/trace_sig.cc b/paddle/phi/ops/compat/trace_sig.cc index 774ac5a944f596..44fd53db98a3cf 100644 --- a/paddle/phi/ops/compat/trace_sig.cc +++ b/paddle/phi/ops/compat/trace_sig.cc @@ -30,5 +30,5 @@ KernelSignature TraceGradOpArgumentMapping(const ArgumentMappingContext& ctx) { } // namespace phi -PT_REGISTER_ARG_MAPPING_FN(trace, phi::TraceOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(trace_grad, phi::TraceGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(trace, phi::TraceOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(trace_grad, phi::TraceGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/trunc_sig.cc b/paddle/phi/ops/compat/trunc_sig.cc index 47fa5bc47b4b50..2d35439216da52 100644 --- a/paddle/phi/ops/compat/trunc_sig.cc +++ b/paddle/phi/ops/compat/trunc_sig.cc @@ -27,5 +27,5 @@ KernelSignature TruncGradOpArgumentMapping(const ArgumentMappingContext& ctx) { } // namespace phi -PT_REGISTER_ARG_MAPPING_FN(trunc, phi::TruncOpArgumentMapping); -PT_REGISTER_ARG_MAPPING_FN(trunc_grad, phi::TruncGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(trunc, phi::TruncOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(trunc_grad, phi::TruncGradOpArgumentMapping); diff --git a/paddle/phi/tests/common/test_backend.cc b/paddle/phi/tests/common/test_backend.cc index 941c00d9fea8b2..d74a35c9eae2e1 100644 --- a/paddle/phi/tests/common/test_backend.cc +++ b/paddle/phi/tests/common/test_backend.cc @@ -52,5 +52,19 @@ TEST(Backend, OStream) { } } +TEST(Backend, StringToBackend) { + namespace pexp = paddle::experimental; + EXPECT_EQ(phi::Backend::UNDEFINED, pexp::StringToBackend("Undefined")); + EXPECT_EQ(phi::Backend::CPU, pexp::StringToBackend("CPU")); + EXPECT_EQ(phi::Backend::GPU, pexp::StringToBackend("GPU")); + EXPECT_EQ(phi::Backend::XPU, pexp::StringToBackend("XPU")); + EXPECT_EQ(phi::Backend::NPU, pexp::StringToBackend("NPU")); + EXPECT_EQ(phi::Backend::MKLDNN, pexp::StringToBackend("MKLDNN")); + EXPECT_EQ(phi::Backend::CUDNN, pexp::StringToBackend("CUDNN")); + EXPECT_EQ(static_cast( + static_cast(phi::Backend::NUM_BACKENDS) + 1), + pexp::StringToBackend("CustomBackend")); +} + } // namespace tests } // namespace phi diff --git a/paddle/phi/tests/core/CMakeLists.txt b/paddle/phi/tests/core/CMakeLists.txt index 971d9112eead97..576ab7ffe6a666 100644 --- a/paddle/phi/tests/core/CMakeLists.txt +++ b/paddle/phi/tests/core/CMakeLists.txt @@ -1,3 +1,4 @@ +cc_test(test_custom_kernel SRCS test_custom_kernel.cc DEPS pten_custom_kernel) cc_test(test_dense_tensor SRCS test_dense_tensor.cc DEPS dense_tensor) cc_test(test_intrusive_ptr SRCS test_intrusive_ptr.cc) cc_test(test_type_info SRCS test_type_info.cc) diff --git a/paddle/fluid/framework/custom_kernel_test.cc b/paddle/phi/tests/core/test_custom_kernel.cc similarity index 70% rename from paddle/fluid/framework/custom_kernel_test.cc rename to paddle/phi/tests/core/test_custom_kernel.cc index fb3cc0a35f0e02..bc75e6ec45245e 100644 --- a/paddle/fluid/framework/custom_kernel_test.cc +++ 
b/paddle/phi/tests/core/test_custom_kernel.cc @@ -17,24 +17,21 @@ limitations under the License. */ #define _LINUX #endif -#include "paddle/fluid/framework/custom_kernel.h" - -#include #include -#include "paddle/extension.h" + +#ifdef _LINUX #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_kernel_info_helper.h" -#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/api/lib/utils/storage.h" -#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/common/scalar.h" +#include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/kernel_context.h" #include "paddle/phi/core/kernel_factory.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/infermeta/binary.h" -#include "paddle/utils/small_vector.h" -#ifdef _LINUX // user kernel function namespace custom_kernel { @@ -43,17 +40,23 @@ namespace custom_kernel { // attribute 11: fake_attributes // output 2: one Tensor* and one std::vector template -void FakeDot(const Context& dev_ctx, const paddle::Tensor& x, - const paddle::Tensor& y, - const std::vector& fake_input_vec, - bool fake_attr_bool, int fake_attr_int, float fake_attr_float, - double fake_attr_double, int64_t fake_attr_int64, - phi::dtype::float16 fake_attr_f16, phi::DataType fake_attr_dtype, +void FakeDot(const Context& dev_ctx, + const phi::DenseTensor& x, + const phi::DenseTensor& y, + const std::vector& fake_input_vec, + bool fake_attr_bool, + int fake_attr_int, + float fake_attr_float, + double fake_attr_double, + int64_t fake_attr_int64, + phi::dtype::float16 fake_attr_f16, + phi::DataType fake_attr_dtype, const phi::Scalar& fake_attr_scalar, const phi::ScalarArray& fake_attr_scalar_array, const std::vector& fake_attr_int64_vec, - const std::vector& fake_attr_int_vec, paddle::Tensor* out, - std::vector fake_out_vec) { + const std::vector& fake_attr_int_vec, + phi::DenseTensor* out, + std::vector fake_out_vec) { // print param info std::cout << "fake_input_vec.size: " << fake_input_vec.size() << std::endl; std::cout << "fake_attr_bool: " << fake_attr_bool << std::endl; @@ -83,10 +86,10 @@ void FakeDot(const Context& dev_ctx, const paddle::Tensor& x, auto const *x_ptr = x.data(), *x_ptr_ = &x_ptr[0]; auto const *y_ptr = y.data(), *y_ptr_ = &y_ptr[0]; - auto* z = out->mutable_data(paddle::PlaceType::kCPU); - auto shape = x.shape(); + T* z = dev_ctx.template Alloc(out); + auto&& d = x.dims(); auto const N = x.numel(); - auto const B = shape[shape.size() - 1]; + auto const B = d[d.size() - 1]; for (int j = 0; j < N / B; j++) { T ss = 0; for (int i = 0; i < B; i++) ss += (*x_ptr_++) * (*y_ptr_++); @@ -95,8 +98,19 @@ void FakeDot(const Context& dev_ctx, const paddle::Tensor& x, } } // namespace custom_kernel -PD_REGISTER_KERNEL(fake_dot, CPU, ALL_LAYOUT, custom_kernel::FakeDot, float, - double, int, int64_t, int8_t, uint8_t) {} +PD_REGISTER_BUILTIN_KERNEL(fake_dot, + CPU, + ALL_LAYOUT, + custom_kernel::FakeDot, + float, + double, + int, + int64_t, + int8_t, + uint8_t) {} + +namespace phi { +namespace tests { // Upper code will store dot kernels info into OpKernelInfoMap TEST(CustomKernel, custom_kernel_dot) { @@ -105,33 +119,38 @@ TEST(CustomKernel, custom_kernel_dot) { phi::DataLayout layout = phi::DataLayout::ALL_LAYOUT; // 1.custom kernel info parsed and store - EXPECT_TRUE(paddle::OpKernelInfoMap::Instance().GetMap().find(op_name) != - 
paddle::OpKernelInfoMap::Instance().GetMap().end()); + EXPECT_TRUE(phi::CustomKernelMap::Instance().GetMap().find(op_name) != + phi::CustomKernelMap::Instance().GetMap().end()); + auto& custom_kernels = phi::CustomKernelMap::Instance().Kernels(); // 2.info check - EXPECT_EQ( - 6, static_cast(paddle::OpKernelInfoMap::Instance()[op_name].size())); - // index 0 - EXPECT_TRUE(paddle::OpKernelInfoMap::Instance()[op_name][0].GetBackend() == - backend); - EXPECT_TRUE(paddle::OpKernelInfoMap::Instance()[op_name][0].GetDataLayout() == - layout); - EXPECT_TRUE(paddle::OpKernelInfoMap::Instance()[op_name][0].GetDataType() == - phi::DataType::FLOAT32); - // index 5 - EXPECT_TRUE(paddle::OpKernelInfoMap::Instance()[op_name][5].GetBackend() == - backend); - EXPECT_TRUE(paddle::OpKernelInfoMap::Instance()[op_name][5].GetDataLayout() == - layout); - EXPECT_TRUE(paddle::OpKernelInfoMap::Instance()[op_name][5].GetDataType() == - phi::DataType::UINT8); + EXPECT_EQ(6, static_cast(custom_kernels[op_name].size())); + auto& custom_fake_dot_kernels = custom_kernels[op_name]; + EXPECT_TRUE(custom_fake_dot_kernels.find( + phi::KernelKey(backend, layout, phi::DataType::FLOAT32)) != + custom_fake_dot_kernels.end()); + EXPECT_TRUE(custom_fake_dot_kernels.find( + phi::KernelKey(backend, layout, phi::DataType::FLOAT64)) != + custom_fake_dot_kernels.end()); + EXPECT_TRUE(custom_fake_dot_kernels.find( + phi::KernelKey(backend, layout, phi::DataType::INT32)) != + custom_fake_dot_kernels.end()); + EXPECT_TRUE(custom_fake_dot_kernels.find( + phi::KernelKey(backend, layout, phi::DataType::INT64)) != + custom_fake_dot_kernels.end()); + EXPECT_TRUE(custom_fake_dot_kernels.find( + phi::KernelKey(backend, layout, phi::DataType::INT8)) != + custom_fake_dot_kernels.end()); + EXPECT_TRUE(custom_fake_dot_kernels.find( + phi::KernelKey(backend, layout, phi::DataType::UINT8)) != + custom_fake_dot_kernels.end()); // 3.before register auto& kernel_factory_instance = phi::KernelFactory::Instance(); auto& kernels = phi::KernelFactory::Instance().kernels(); EXPECT_TRUE(!kernel_factory_instance.HasCompatiblePtenKernel(op_name)); - // mock fake_dot is supported by pten for HasCompatiblePtenKernel check while + // mock fake_dot is supported by phi for HasCompatiblePtenKernel check while // registering auto& fake_dot_kernels = kernels[op_name]; @@ -155,8 +174,7 @@ TEST(CustomKernel, custom_kernel_dot) { fake_dot_kernels.end()); // register - paddle::framework::RegisterKernelWithMetaInfoMap( - paddle::OpKernelInfoMap::Instance()); + phi::RegisterCustomKernels(phi::CustomKernelMap::Instance()); EXPECT_TRUE(fake_dot_kernels.find( phi::KernelKey(backend, layout, phi::DataType::FLOAT32)) != @@ -186,15 +204,15 @@ TEST(CustomKernel, custom_kernel_dot) { paddle::platform::CPUPlace()); auto dense_x = std::make_shared( alloc.get(), - phi::DenseTensorMeta(phi::DataType::UINT8, phi::make_ddim({2, 3}), - phi::DataLayout::NCHW)); + phi::DenseTensorMeta( + phi::DataType::UINT8, phi::make_ddim({2, 3}), phi::DataLayout::NCHW)); auto* dense_x_data = dense_x->mutable_data(paddle::platform::CPUPlace()); auto dense_y = std::make_shared( alloc.get(), - phi::DenseTensorMeta(phi::DataType::UINT8, phi::make_ddim({2, 3}), - phi::DataLayout::NCHW)); + phi::DenseTensorMeta( + phi::DataType::UINT8, phi::make_ddim({2, 3}), phi::DataLayout::NCHW)); auto* dense_y_data = dense_y->mutable_data(paddle::platform::CPUPlace()); @@ -288,38 +306,7 @@ TEST(CustomKernel, custom_kernel_dot) { ASSERT_EQ(expect_result[1], actual_result1); } -// test OpKernelInfoHelper 
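Note: the reworked test above exercises the custom-kernel flow end to end: FakeDot is now written against phi::DenseTensor and allocates its output through the device context, PD_REGISTER_BUILTIN_KERNEL parks it in phi::CustomKernelMap, and it only becomes visible to phi::KernelFactory after phi::RegisterCustomKernels runs; each supported data type is then checked by probing the per-op kernel map with a phi::KernelKey built from backend, layout and dtype. The helper below condenses that probing pattern, using only calls that appear in the test; the container behaviour (a map of per-op kernel maps) is assumed from the way the test indexes it.

    // Sketch only: the lookup the test repeats for each supported data type.
    #include "paddle/phi/core/kernel_factory.h"

    bool HasFakeDotFor(phi::DataType dtype) {
      auto& kernels = phi::KernelFactory::Instance().kernels();
      auto& fake_dot_kernels = kernels["fake_dot"];
      return fake_dot_kernels.find(phi::KernelKey(
                 phi::Backend::CPU, phi::DataLayout::ALL_LAYOUT, dtype)) !=
             fake_dot_kernels.end();
    }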
-TEST(OpKernelInfoHelper, op_kernel_info_help_getters) { - using OpKernelInfoHelper = paddle::framework::OpKernelInfoHelper; - std::string op_name = "fake_dot"; - phi::Backend backend = phi::Backend::CPU; - phi::DataLayout layout = phi::DataLayout::ANY; - phi::DataType dtype = phi::DataType::FLOAT32; - - auto op_kernel_info = paddle::OpKernelInfoMap::Instance()[op_name][0]; - - EXPECT_EQ(op_name, OpKernelInfoHelper::GetOpName(op_kernel_info)); - EXPECT_EQ(backend, OpKernelInfoHelper::GetBackend(op_kernel_info)); - EXPECT_EQ(layout, OpKernelInfoHelper::GetDataLayout(op_kernel_info)); - EXPECT_EQ(dtype, OpKernelInfoHelper::GetDataType(op_kernel_info)); - - EXPECT_EQ(phi::KernelKey(backend, layout, dtype), - OpKernelInfoHelper::GetKernelKey(op_kernel_info)); - - paddle::CustomKernelFunc kernel_fn = - PD_PT_KERNEL(custom_kernel::FakeDot); - EXPECT_EQ(kernel_fn, OpKernelInfoHelper::GetKernelFn(op_kernel_info)); - - void* variadic_func = - PD_PT_VARIADIC_KERNEL(custom_kernel::FakeDot); - EXPECT_EQ(variadic_func, - OpKernelInfoHelper::GetVariadicKernelFn(op_kernel_info)); - - auto& input_defs = OpKernelInfoHelper::GetInputDefs(op_kernel_info); - auto& output_defs = OpKernelInfoHelper::GetOutputDefs(op_kernel_info); - auto& attribute_defs = OpKernelInfoHelper::GetAttributeDefs(op_kernel_info); - EXPECT_EQ(3, static_cast(input_defs.size())); - EXPECT_EQ(2, static_cast(output_defs.size())); - EXPECT_EQ(11, static_cast(attribute_defs.size())); -} +} // namespace tests +} // namespace phi + #endif diff --git a/paddle/phi/tests/core/test_kernel_factory.cc b/paddle/phi/tests/core/test_kernel_factory.cc index c85485cb915136..cb4b50f5b6c3dc 100644 --- a/paddle/phi/tests/core/test_kernel_factory.cc +++ b/paddle/phi/tests/core/test_kernel_factory.cc @@ -22,7 +22,7 @@ limitations under the License. */ #include "gtest/gtest.h" -PT_DECLARE_KERNEL(scale, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(scale, CPU, ALL_LAYOUT); namespace phi { namespace tests { @@ -76,7 +76,7 @@ TEST(KernelRegistry, SetFP32Input) { } // namespace tests } // namespace phi -PT_REGISTER_KERNEL(test, +PD_REGISTER_KERNEL(test, CPU, ALL_LAYOUT, phi::tests::TestKernel, diff --git a/paddle/phi/tests/kernels/test_flatten_dev_api.cc b/paddle/phi/tests/kernels/test_flatten_dev_api.cc index b65720a4b4e241..dc283728ee5f76 100644 --- a/paddle/phi/tests/kernels/test_flatten_dev_api.cc +++ b/paddle/phi/tests/kernels/test_flatten_dev_api.cc @@ -23,14 +23,14 @@ limitations under the License. */ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" -PT_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT); #endif #ifdef PADDLE_WITH_XPU -PT_DECLARE_KERNEL(copy, XPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(copy, XPU, ALL_LAYOUT); #endif namespace phi { diff --git a/python/paddle/distributed/auto_parallel/dist_context.py b/python/paddle/distributed/auto_parallel/dist_context.py index e06811df88179a..caf220646bb609 100644 --- a/python/paddle/distributed/auto_parallel/dist_context.py +++ b/python/paddle/distributed/auto_parallel/dist_context.py @@ -45,9 +45,13 @@ class DistributedContext: One auto-parallel run should use its own DistributedContext to avoid interfering other run. 
""" - def __init__(self, program=None): + def __init__(self, + serial_main_prog=None, + serial_startup_prog=None, + dist_main_progs=None, + dist_startup_progs=None): # Program related data members - self._serial_program = program + self._serial_program = serial_main_prog self._is_initialized_for_program = False self._dist_tensors_for_program = {} self._dist_ops_for_program = {} @@ -65,8 +69,12 @@ def __init__(self, program=None): self._tensor_id_to_tensor_node_ids = {} # Distributed programs - self._dist_main_programs = {} - self._dist_startup_programs = {} + self._dist_main_programs = dist_main_progs + if not self._dist_main_programs: + self._dist_main_programs = {} + self._dist_startup_programs = dist_startup_progs + if not self._dist_startup_programs: + self._dist_startup_programs = {} @property def serial_program(self): @@ -78,8 +86,8 @@ def serial_graph(self): @serial_program.setter def serial_program(self, program): - assert self._serial_program is None, \ - "This distributed context has already been realted to a serial program" + # assert self._serial_program is None, \ + # "This distributed context has already been realted to a serial program" self._serial_program = program @property diff --git a/python/paddle/distributed/auto_parallel/dist_loader.py b/python/paddle/distributed/auto_parallel/dist_loader.py new file mode 100644 index 00000000000000..92deeffd2c9014 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/dist_loader.py @@ -0,0 +1,93 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License
+
+import abc
+import numpy as np
+import paddle
+from paddle.io import DataLoader, DistributedBatchSampler
+
+
+class DistributedDataLoader(metaclass=abc.ABCMeta):
+    def __init__(self,
+                 dataset,
+                 batch_size=1,
+                 epochs=1,
+                 data_parallel_world_size=None,
+                 data_parallel_rank=None,
+                 drop_last=False):
+        self.dataset = dataset
+        self.batch_size = batch_size
+        self.epochs = epochs
+        self.data_parallel_world_size = data_parallel_world_size
+        self.data_parallel_rank = data_parallel_rank
+        self.drop_last = drop_last
+        if data_parallel_world_size is not None:
+            assert batch_size % data_parallel_world_size == 0
+
+    @abc.abstractmethod
+    def __iter__(self):
+        raise NotImplementedError
+
+    @abc.abstractmethod
+    def __next__(self):
+        raise NotImplementedError
+
+
+class NonIterableGeneratorLoader(DistributedDataLoader):
+    def __init__(self,
+                 dataset,
+                 feed_list,
+                 places,
+                 batch_size=1,
+                 epochs=1,
+                 steps_per_epoch=1000,
+                 data_parallel_world_size=None,
+                 data_parallel_rank=None,
+                 drop_last=False):
+        self.feed_list = feed_list
+        self.places = places
+        self.steps_per_epoch = steps_per_epoch
+        super(NonIterableGeneratorLoader, self).__init__(
+            dataset, batch_size, epochs, data_parallel_world_size,
+            data_parallel_rank, drop_last)
+        self._inner_dataloader = self._create_inner_dataloader()
+
+    def __iter__(self):
+        self._cur_step = 0
+        self._inner_dataloader.start()
+        return self
+
+    def __next__(self):
+        if self._cur_step < self.steps_per_epoch:
+            self._cur_step += 1
+        else:
+            self._inner_dataloader.reset()
+            raise StopIteration
+
+    def _create_inner_dataloader(self):
+        def data_generator():
+            batch_data = None
+            for step, data in enumerate(self.dataset):
+                if batch_data is None:
+                    batch_data = [[] for i in range(len(data))]
+                for idx, data_item in enumerate(data):
+                    batch_data[idx].append(np.array(data_item))
+                if (step + 1) % self.batch_size == 0:
+                    yield batch_data[0], batch_data[1]
+                    batch_data = None
+
+        dataloader = paddle.fluid.io.DataLoader.from_generator(
+            feed_list=self.feed_list, capacity=70, iterable=False)
+        dataloader.set_batch_generator(data_generator, self.places)
+        return dataloader
diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py
new file mode 100644
index 00000000000000..98b76056a15a42
--- /dev/null
+++ b/python/paddle/distributed/auto_parallel/engine.py
@@ -0,0 +1,309 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +import copy +import logging +from collections import defaultdict + +import paddle +from paddle import fluid +from paddle.io import Dataset +from paddle.fluid.backward import append_backward +import paddle.fluid.core as core +from paddle.static import InputSpec +from paddle.fluid import program_guard +from paddle.fluid.framework import Operator +from paddle.fluid.framework import _current_expected_place as _get_device +from paddle.fluid.dygraph.parallel import ParallelEnv +from paddle.distributed.passes import new_pass, PassContext +from paddle.distributed.utils import get_logger + +from .dist_loader import NonIterableGeneratorLoader +from .dist_op import DistributedOperator +from .dist_tensor import DistributedTensor +from .dist_context import DistributedContext +from .dist_context import get_default_distributed_context +from .dist_context import set_default_distributed_context +from .process_group import get_all_process_groups +from .process_group import get_process_group +from .process_group import get_world_process_group +from .process_group import _g_process_group_map, ProcessGroup +from .completion import Completer +from .partitioner import Partitioner +from .reshard import reshard, HAS_SENT, HAS_RECV, HAS_ALLGATHER +from .cluster import Cluster +from .mapper import mapping +from .planner import Planner +from .utils import make_data_unshard +from .utils import set_grad_var_shape +from .utils import print_program_with_dist_attr +from .utils import SerialProgramInfo + +paddle.enable_static() + + +def to_list(value): + if value is None: + return value + if isinstance(value, (list, tuple)): + return list(value) + return [value] + + +class Engine: + def __init__(self, model=None, data_spec=None, cluster=None, strategy=None): + self.model = model + self.data_spec = data_spec + self.cluster = cluster + self.strategy = strategy + self._executor = None + self._orig_main_prog = fluid.default_main_program() + self._orig_startup_prog = fluid.default_startup_program() + self._serial_main_progs = {} + self._serial_startup_progs = {} + self._dist_main_progs = defaultdict(dict) + self._dist_startup_progs = defaultdict(dict) + self._orig_dist_context = get_default_distributed_context() + self._dist_contexts = {} + self._pass_contexts = {} + self._cur_rank = paddle.distributed.get_rank() + self._logger = get_logger(logging.INFO) + + def prepare(self, + optimizer=None, + loss=None, + metrics=None, + mode="train", + all_ranks=False): + self.optimizer = optimizer + self.loss = loss + self.metrics = metrics + self.mode = mode + self._build() + self._plan() + if not all_ranks: + self._parallel(self._cur_rank) + else: + world_process_group = get_world_process_group() + all_ranks = world_process_group.ranks + for rank in all_ranks: + self._parallel(rank) + place = _get_device() + if isinstance(place, fluid.CUDAPlace): + self._place = fluid.CUDAPlace(ParallelEnv().dev_id) + if self._executor is None: + self._executor = fluid.Executor(place) + + def _build(self): + serial_main_prog = self._serial_main_progs.get(self.mode, None) + if serial_main_prog is not None: + return + + serial_main_prog = self._orig_main_prog.clone() + serial_startup_prog = self._orig_startup_prog.clone() + with fluid.program_guard(serial_main_prog, serial_startup_prog): + inputs_spec = self.data_spec[0] + labels_spec = self.data_spec[1] + inputs = [s._create_feed_layer() for s in to_list(inputs_spec)] + labels = [s._create_feed_layer() for s in to_list(labels_spec)] + self._input_vars = inputs + self._label_vars = labels + feed_list = 
self._input_vars + self._label_vars + outputs = to_list(self.model(*inputs)) + if self.mode != "predict" and self.loss: + loss = self.loss(*(outputs + labels)) + self._loss_var = loss + + self._serial_main_progs[self.mode] = serial_main_prog + self._serial_startup_progs[self.mode] = serial_startup_prog + self._dist_contexts[self.mode] = DistributedContext( + serial_main_prog, serial_startup_prog, + self._dist_main_progs[self.mode], + self._dist_startup_progs[self.mode]) + self._pass_contexts[self.mode] = PassContext() + + def _plan(self): + # Complete the distributed annotation + serial_main_prog = self._serial_main_progs[self.mode] + self._completer = Completer(self._dist_contexts[self.mode]) + self._completer.complete_forward_annotation(serial_main_prog) + # TODO: add auto planner process + + def _parallel(self, rank): + serial_main_program = self._serial_main_progs[self.mode] + serial_startup_program = self._serial_startup_progs[self.mode] + dist_context = self._dist_contexts[self.mode] + if self.mode != "predict" and self.loss: + # Generate backward + serial_loss = self._loss_var + params_grads = self._generate_backward( + serial_main_program, serial_startup_program, serial_loss) + # Apply pre optimization passes + self._apply_pre_optimization(serial_main_program, + serial_startup_program, serial_loss, + params_grads) + # Do logical partition + partitioner = Partitioner(dist_context, rank) + dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition( + serial_main_program, serial_startup_program, params_grads) + # Generate optimizer + self._generate_optimizer(dist_main_prog, dist_startup_prog, + dist_params_grads) + # Do reshard process + set_grad_var_shape(dist_main_prog, dist_context) + make_data_unshard(dist_main_prog, dist_startup_prog, dist_context) + reshard(dist_main_prog, dist_startup_prog, rank, dist_context, + dist_params_grads) + # Apply post optimization passes + self._apply_post_optimization(dist_main_prog, dist_startup_prog, + rank, dist_params_grads) + self._dist_main_progs[self.mode][rank] = dist_main_prog + self._dist_startup_progs[self.mode][rank] = dist_startup_prog + + def _generate_backward(self, main_program, startup_program, loss): + with program_guard(main_program, startup_program): + params_grads = append_backward( + loss, + distop_context=self._dist_contexts[self.mode].dist_op_context) + self._completer.complete_backward_annotation(main_program) + return params_grads + + def _generate_optimizer(self, main_program, startup_program, params_grads): + with program_guard(main_program, startup_program): + optimizer_ops = copy.deepcopy(self.optimizer).apply_gradients( + params_grads) + self._completer.complete_update_annotation(main_program) + return optimizer_ops + + def _apply_pre_optimization(self, main_program, startup_program, loss, + params_grads): + # apply amp pass + if self.strategy.amp: + config = copy.deepcopy(self.strategy.amp_configs) + config["dist_context"] = self._dist_contexts[self.mode] + config["params_grads"] = params_grads + config["loss"] = loss + auto_parallel_amp_pass = new_pass("auto_parallel_amp", config) + auto_parallel_amp_pass.apply([main_program], [startup_program], + self._pass_contexts[self.mode]) + + # apply recompute pass + if self.strategy.recompute: + config = copy.deepcopy(self.strategy.recompute_configs) + config["dist_context"] = self._dist_contexts[self.mode] + config["no_grad_set"] = None + config["loss"] = loss + auto_parallel_recompute_pass = new_pass("auto_parallel_recompute", + config) + 
auto_parallel_recompute_pass.apply([main_program], + [startup_program], + self._pass_contexts[self.mode]) + + def _apply_post_optimization(self, main_program, startup_program, rank, + params_grads): + if self.strategy.sharding: + config = copy.deepcopy(self.strategy.sharding_configs) + config["dist_context"] = self._dist_contexts[self.mode] + config["params_grads"] = params_grads + config["global_rank"] = rank + auto_parallel_sharding_pass = new_pass("auto_parallel_sharding", + config) + auto_parallel_sharding_pass.apply([main_program], + [startup_program], + self._pass_contexts[self.mode]) + + if self.strategy.gradient_merge: + config = copy.deepcopy(self.strategy.gradient_merge_configs) + config["dist_context"] = self._dist_contexts[self.mode] + config["params_grads"] = params_grads + auto_parallel_gradient_merge_pass = new_pass( + "auto_parallel_gradient_merge_pass", config) + auto_parallel_gradient_merge_pass.apply( + [main_program], [startup_program], + self._pass_contexts[self.mode]) + + def fit(self, train_data, batch_size=1, epochs=1, steps_per_epoch=1000): + assert isinstance(train_data, Dataset) + assert steps_per_epoch is not None + train_dataloader = self._create_dataloader(train_data, batch_size, + epochs, steps_per_epoch) + self._init_communication() + dist_startup_prog = self._dist_startup_progs["train"][self._cur_rank] + self._executor.run(dist_startup_prog) + for epoch in range(epochs): + # train_dataloader.start() + # for step in range(steps_per_epoch): + # logs = self.train_step(None) + # self._logger.info(logs) + # train_dataloader.reset() + for step, data in enumerate(train_dataloader): + logs = self._train_step(data) + train_logs = { + "train_" + name: val + for name, val in logs.items() + } + self._logger.info(logs) + + def _train_step(self, data): + logs = {} + dist_main_prog = self._dist_main_progs["train"][self._cur_rank] + if self._loss_var.name not in dist_main_prog.global_block().vars: + loss = self._executor.run(dist_main_prog) + logs["loss"] = None + else: + fetch_list = self._loss_var + loss = self._executor.run(dist_main_prog, fetch_list=fetch_list) + logs["loss"] = loss + return logs + + def _create_dataloader(self, dataset, batch_size, epochs, steps_per_epoch): + feed_list = self._input_vars + self._label_vars + dist_main_prog = self._dist_main_progs[self.mode][self._cur_rank] + dist_startup_prog = self._dist_startup_progs[self.mode][self._cur_rank] + dist_context = self._dist_contexts[self.mode] + dist_main_block = dist_main_prog.global_block() + op_size = len(dist_main_block.ops) + places = paddle.static.cuda_places() + with fluid.program_guard(dist_main_prog, dist_startup_prog): + dataloader = NonIterableGeneratorLoader( + dataset, feed_list, places, batch_size, epochs, steps_per_epoch) + new_op_size = len(dist_main_block.ops) + for idx in range(new_op_size - 1, op_size - 1, -1): + op = dist_main_block.ops[new_op_size - 1] + new_op_desc = dist_main_block.desc._prepend_op() + new_op_desc.copy_from(op.desc) + new_op = Operator( + dist_main_block, new_op_desc, type=new_op_desc.type()) + dist_main_block.ops.insert(0, new_op) + dist_op = DistributedOperator(new_op) + dist_context.add_dist_op_for_program(dist_op) + for _ in range(new_op_size - op_size): + dist_main_block._remove_op(new_op_size, sync=False) + dist_main_block._sync_with_cpp() + return dataloader + + def _init_communication(self): + # Traverse different rank programs and traverse each op of them, + # instantiate communication by process_mapping. 
+ all_process_groups = get_all_process_groups() + for process_group in all_process_groups: + if self._cur_rank not in process_group.ranks: + continue + process_group.instantiate() + + # def save(self, path, training=True): + # pass + + # def load(self, path, strict=True, load_optimizer=True): + # pass diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py index 01d64550321d5e..37134764e9d1c8 100644 --- a/python/paddle/fluid/dygraph/amp/auto_cast.py +++ b/python/paddle/fluid/dygraph/amp/auto_cast.py @@ -149,7 +149,7 @@ def _is_gpu_bfloat16_supported(): """ prop = paddle.device.cuda.get_device_capability() cuda_version = paddle.version.cuda() - if cuda_version is not None: + if cuda_version is not None and cuda_version != 'False': cuda_version_check = int(cuda_version.split('.')[0]) >= 11 else: cuda_version_check = False diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py index 4d9ed5916adfd7..4a6d855a893f64 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/logging_utils.py @@ -143,9 +143,10 @@ def error(self, msg, *args, **kwargs): self._output_to_stdout('ERROR: ' + msg, *args) def warn(self, msg, *args, **kwargs): - self.logger.warning(msg, *args, **kwargs) - if self.need_to_echo_log_to_stdout: - self._output_to_stdout('WARNING: ' + msg, *args) + if self.verbosity_level != -1: + self.logger.warning(msg, *args, **kwargs) + if self.need_to_echo_log_to_stdout: + self._output_to_stdout('WARNING: ' + msg, *args) def log(self, level, msg, *args, **kwargs): if self.has_verbosity(level): diff --git a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot.cc b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot.cc index 3ae30c2f30577a..b0519138ca5404 100644 --- a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot.cc +++ b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/extension.h" +#include "paddle/phi/core/kernel_registry.h" namespace paddle { @@ -21,19 +21,19 @@ namespace custom_kernel { // Here we use dot for test // This test will fail when this kernel is supported in framework template -void Dot(const Context& dev_ctx, - const paddle::Tensor& x, - const paddle::Tensor& y, - paddle::Tensor* out) { +void DotKernel(const Context& dev_ctx, + const phi::DenseTensor& x, + const phi::DenseTensor& y, + phi::DenseTensor* out) { auto const *x_ptr = x.data(), *x_ptr_ = &x_ptr[0]; auto const *y_ptr = y.data(), *y_ptr_ = &y_ptr[0]; - auto* z = out->mutable_data(paddle::PlaceType::kCPU); + T* z = dev_ctx.template Alloc(out); // Loop over the total N elements of both operands while sum-reducing every // B pairs along the way where B is the dimension of the least ordered axis - auto shape = x.shape(); + auto&& d = x.dims(); auto const N = x.numel(); - auto const B = shape[shape.size() - 1]; + auto const B = d[d.size() - 1]; for (int j = 0; j < N / B; j++) { T ss = 0; @@ -45,6 +45,7 @@ void Dot(const Context& dev_ctx, } // namespace custom_kernel } // namespace paddle -PD_REGISTER_KERNEL(dot, CPU, ALL_LAYOUT, paddle::custom_kernel::Dot, int8_t) { +PD_REGISTER_BUILTIN_KERNEL( + dot, CPU, ALL_LAYOUT, paddle::custom_kernel::DotKernel, int8_t) { kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::INT8); } diff --git a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py index 5e3bd2f8ed98d1..3cef228d14d6eb 100644 --- a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py +++ b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py @@ -1,11 +1,11 @@ # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -16,9 +16,28 @@ from paddle.fluid import core from distutils.sysconfig import get_python_lib from distutils.core import setup, Extension +from setuptools.command.build_ext import build_ext + + +# refer: https://note.qidong.name/2018/03/setup-warning-strict-prototypes +# Avoid a gcc warning below: +# cc1plus: warning: command line option ‘-Wstrict-prototypes’ is valid +# for C/ObjC but not for C++ +class BuildExt(build_ext): + def build_extensions(self): + if '-Wstrict-prototypes' in self.compiler.compiler_so: + self.compiler.compiler_so.remove('-Wstrict-prototypes') + super(BuildExt, self).build_extensions() + # cc flags -paddle_extra_compile_args = ['-std=c++14', '-shared', '-fPIC'] +paddle_extra_compile_args = [ + '-std=c++14', + '-shared', + '-fPIC', + '-Wno-parentheses', + '-DPADDLE_WITH_CUSTOM_KERNEL', +] if core.is_compiled_with_npu(): paddle_extra_compile_args += ['-D_GLIBCXX_USE_CXX11_ABI=0'] @@ -27,6 +46,14 @@ paddle_custom_kernel_include = [ os.path.join(site_packages_path, 'paddle', 'include'), ] +# include path third_party +compile_third_party_path = os.path.join(os.environ['PADDLE_ROOT'], + 'build/third_party') +paddle_custom_kernel_include += [ + os.path.join(compile_third_party_path, 'boost/src/extern_boost'), # boost + os.path.join(compile_third_party_path, 'install/gflags/include'), # gflags + os.path.join(compile_third_party_path, 'install/glog/include'), # glog +] # libs path paddle_custom_kernel_library_dir = [ @@ -50,4 +77,5 @@ name='custom_kernel_dot', version='1.0', description='custom kernel fot compiling', + cmdclass={'build_ext': BuildExt}, ext_modules=[custom_kernel_dot_module]) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 0c81a0e9346d61..ca18416a7a1235 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -9,7 +9,16 @@ list(REMOVE_ITEM DIST_TEST_OPS "test_dist_op") if ((NOT WITH_NCCL) AND (NOT WITH_RCCL)) list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_dgc_nccl") endif() + string(REPLACE ".py" "" DIST_TEST_OPS "${DIST_TEST_OPS}") + +if ((NOT WITH_GPU) AND (NOT WITH_XPU) AND NOT (WITH_ASCEND OR WITH_ASCEND_CL)) + list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_grad_clip") + list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_heter_ctr") + list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_ps_gpu_ctr") + list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_batch_merge") +endif() + list(APPEND DIST_TEST_OPS test_parallel_dygraph_mnist) list(APPEND DIST_TEST_OPS test_pipeline) list(APPEND DIST_TEST_OPS test_ir_pass_pipeline) @@ -18,12 +27,14 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_se_resnext) list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding) list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding_over_height) list(APPEND DIST_TEST_OPS test_parallel_dygraph_transformer) -list(APPEND DIST_TEST_OPS test_fleet_pipeline_meta_optimizer) -list(APPEND DIST_TEST_OPS test_fleet_pipeline_meta_optimizer_with_recompute) -list(APPEND DIST_TEST_OPS test_fleet_raw_program_meta_optimizer) -list(APPEND DIST_TEST_OPS test_rnn_dp) -list(APPEND DIST_TEST_OPS test_fleet_graph_execution_meta_optimizer) -list(APPEND DIST_TEST_OPS test_gen_nccl_id_op) +if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL) + list(APPEND DIST_TEST_OPS test_fleet_graph_execution_meta_optimizer) + list(APPEND DIST_TEST_OPS test_fleet_pipeline_meta_optimizer) + list(APPEND DIST_TEST_OPS 
test_fleet_pipeline_meta_optimizer_with_recompute) + list(APPEND DIST_TEST_OPS test_fleet_raw_program_meta_optimizer) + list(APPEND DIST_TEST_OPS test_gen_nccl_id_op) + list(APPEND DIST_TEST_OPS test_rnn_dp) +endif() list(APPEND DIST_TEST_OPS test_parallel_dygraph_unused_variables) list(APPEND DIST_TEST_OPS test_parallel_dygraph_control_flow) list(APPEND DIST_TEST_OPS test_parallel_dygraph_no_sync) @@ -564,6 +575,15 @@ foreach(TEST_OP ${TEST_OPS_WITH_GC}) py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${GC_ENVS}) endforeach() +if ((NOT WITH_GPU) AND (NOT WITH_XPU) AND NOT (WITH_ASCEND OR WITH_ASCEND_CL)) + list(REMOVE_ITEM TEST_OPS "test_fleet_graph_execution_meta_optimizer") + list(REMOVE_ITEM TEST_OPS "test_gen_nccl_id_op") + list(REMOVE_ITEM TEST_OPS "test_dist_fleet_grad_clip") + list(REMOVE_ITEM TEST_OPS "test_dist_fleet_heter_ctr") + list(REMOVE_ITEM TEST_OPS "test_dist_fleet_ps_gpu_ctr") + list(REMOVE_ITEM TEST_OPS "test_dist_mnist_batch_merge") +endif() + foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) @@ -635,26 +655,25 @@ if(WITH_DISTRIBUTE) py_test_modules(test_communicator_sync MODULES test_communicator_sync ENVS ${dist_ENVS} FLAGS_communicator_send_queue_size=1 FLAGS_communicator_max_merge_var_num=1) py_test_modules(test_collective_optimizer MODULES test_collective_optimizer) if(NOT APPLE) - py_test_modules(test_fleet_base MODULES test_fleet_base ENVS ${dist_ENVS}) - py_test_modules(test_fleet_base_2 MODULES test_fleet_base_2 ENVS ${dist_ENVS}) - py_test_modules(test_fleet_base_3 MODULES test_fleet_base_3 ENVS ${dist_ENVS}) - py_test_modules(test_fleet_recompute_meta_optimizer MODULES test_fleet_recompute_meta_optimizer ENVS ${dist_ENVS}) - py_test_modules(test_fleet_graph_executor MODULES test_fleet_graph_executor ENVS ${dist_ENVS}) - py_test_modules(test_fleet_gradient_merge_meta_optimizer MODULES test_fleet_gradient_merge_meta_optimizer ENVS ${dist_ENVS}) - py_test_modules(test_fleet_sharding_meta_optimizer MODULES test_fleet_sharding_meta_optimizer ENVS ${dist_ENVS}) - py_test_modules(test_fleet_hybrid_meta_optimizer MODULES test_fleet_hybrid_meta_optimizer ENVS ${dist_ENVS}) - py_test_modules(test_fleet_amp_meta_optimizer MODULES test_fleet_amp_meta_optimizer ENVS ${dist_ENVS}) - py_test_modules(test_fleet_amp_init MODULES test_fleet_amp_init ENVS ${dist_ENVS}) - py_test_modules(test_fleet_fp16_allreduce_meta_optimizer MODULES test_fleet_fp16_allreduce_meta_optimizer ENVS ${dist_ENVS}) - py_test_modules(test_fleet_private_function MODULES test_fleet_private_function ENVS ${dist_ENVS}) - py_test_modules(test_fleet_meta_optimizer_base MODULES test_fleet_meta_optimizer_base ENVS ${dist_ENVS}) - py_test_modules(test_fleet_distributed_strategy MODULES test_fleet_distributed_strategy) - py_test_modules(test_fleet_static_mp_layers MODULES test_fleet_static_mp_layers) - #py_test_modules(test_fleet_auto MODULES test_fleet_auto ENVS ${dist_ENVS}) + py_test_modules(test_fleet_base MODULES test_fleet_base ENVS ${dist_ENVS}) + py_test_modules(test_fleet_base_2 MODULES test_fleet_base_2 ENVS ${dist_ENVS}) + py_test_modules(test_fleet_base_3 MODULES test_fleet_base_3 ENVS ${dist_ENVS}) + py_test_modules(test_fleet_amp_init MODULES test_fleet_amp_init ENVS ${dist_ENVS}) + py_test_modules(test_fleet_fp16_allreduce_meta_optimizer MODULES test_fleet_fp16_allreduce_meta_optimizer ENVS ${dist_ENVS}) + py_test_modules(test_fleet_private_function MODULES test_fleet_private_function ENVS ${dist_ENVS}) + 
py_test_modules(test_fleet_meta_optimizer_base MODULES test_fleet_meta_optimizer_base ENVS ${dist_ENVS}) + py_test_modules(test_fleet_distributed_strategy MODULES test_fleet_distributed_strategy) + py_test_modules(test_fleet_static_mp_layers MODULES test_fleet_static_mp_layers) + #py_test_modules(test_fleet_auto MODULES test_fleet_auto ENVS ${dist_ENVS}) + if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL) + py_test_modules(test_fleet_amp_meta_optimizer MODULES test_fleet_amp_meta_optimizer ENVS ${dist_ENVS}) + py_test_modules(test_fleet_gradient_merge_meta_optimizer MODULES test_fleet_gradient_merge_meta_optimizer ENVS ${dist_ENVS}) + py_test_modules(test_fleet_graph_executor MODULES test_fleet_graph_executor ENVS ${dist_ENVS}) + py_test_modules(test_fleet_hybrid_meta_optimizer MODULES test_fleet_hybrid_meta_optimizer ENVS ${dist_ENVS}) + py_test_modules(test_fleet_recompute_meta_optimizer MODULES test_fleet_recompute_meta_optimizer ENVS ${dist_ENVS}) + py_test_modules(test_fleet_sharding_meta_optimizer MODULES test_fleet_sharding_meta_optimizer ENVS ${dist_ENVS}) + endif() if(NOT WIN32) - py_test_modules(test_fleet_localsgd_meta_optimizer MODULES test_fleet_localsgd_meta_optimizer ENVS ${dist_ENVS}) - py_test_modules(test_fleet_lars_meta_optimizer MODULES test_fleet_lars_meta_optimizer ENVS ${dist_ENVS}) - py_test_modules(test_fleet_lamb_meta_optimizer MODULES test_fleet_lamb_meta_optimizer ENVS ${dist_ENVS}) py_test_modules(test_auto_parallel_partitioner MODULES test_auto_parallel_partitioner ENVS ${dist_ENVS}) py_test_modules(test_auto_parallel_partitioner_gpt MODULES test_auto_parallel_partitioner_gpt ENVS ${dist_ENVS}) py_test_modules(test_auto_parallel_searcher MODULES test_auto_parallel_searcher ENVS ${dist_ENVS}) @@ -664,6 +683,14 @@ if(WITH_DISTRIBUTE) py_test_modules(test_auto_parallel_reshard_mppp MODULES test_auto_parallel_reshard_mppp ENVS ${dist_ENVS}) py_test_modules(test_auto_parallel_reshard_dpmppp MODULES test_auto_parallel_reshard_dpmppp ENVS ${dist_ENVS}) py_test_modules(test_auto_parallel_cost_model MODULES test_auto_parallel_cost_model ENVS ${dist_ENVS}) + if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL) + py_test_modules(test_fleet_lamb_meta_optimizer MODULES test_fleet_lamb_meta_optimizer ENVS ${dist_ENVS}) + py_test_modules(test_fleet_lars_meta_optimizer MODULES test_fleet_lars_meta_optimizer ENVS ${dist_ENVS}) + py_test_modules(test_fleet_localsgd_meta_optimizer MODULES test_fleet_localsgd_meta_optimizer ENVS ${dist_ENVS}) + + + + endif() endif(NOT WIN32) endif(NOT APPLE) if(WITH_DGC) @@ -688,7 +715,9 @@ if(WITH_DISTRIBUTE) bash_test_modules(test_fleet_run_random_port START_BASH test_fleet_run_random_port.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) bash_test_modules(test_fleet_launch_async START_BASH test_fleet_launch_async.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) bash_test_modules(test_fleet_launch_cloud START_BASH test_fleet_launch_cloud.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) - bash_test_modules(test_fleet_launch_nproc START_BASH test_fleet_launch_nproc.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) + if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL) + bash_test_modules(test_fleet_launch_nproc START_BASH test_fleet_launch_nproc.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) + endif() if(WITH_ASCEND OR WITH_ASCEND_CL) bash_test_modules(test_fleet_launch_ascend START_BASH test_fleet_launch_ascend.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) bash_test_modules(test_ascend_group START_BASH test_ascend_group.sh ENVS 
PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) @@ -708,7 +737,9 @@ if(WITH_DISTRIBUTE) if (WITH_GLOO) bash_test_modules(test_cpuonly_launch START_BASH test_cpuonly_launch.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) endif() - bash_test_modules(test_new_group START_BASH test_new_group.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}+20" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) + if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL) + bash_test_modules(test_new_group START_BASH test_new_group.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}+20" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) + endif() endif(NOT APPLE) endif() diff --git a/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt b/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt index 5a58bd25d27e73..b6b313465ab20a 100644 --- a/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/asp/CMakeLists.txt @@ -11,9 +11,13 @@ endforeach(TEST_OP) if(WITH_DISTRIBUTE) py_test_modules(test_fleet_with_asp MODULES test_fleet_with_asp ENVS ${dist_ENVS}) - py_test_modules(test_fleet_with_asp_amp MODULES test_fleet_with_asp_amp ENVS ${dist_ENVS}) + if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL) + py_test_modules(test_fleet_with_asp_amp MODULES test_fleet_with_asp_amp ENVS ${dist_ENVS}) + endif() endif() if((WITH_DISTRIBUTE) AND (NOT WIN32) AND (NOT APPLE)) - py_test_modules(test_fleet_with_asp_sharding MODULES test_fleet_with_asp_sharding ENVS ${dist_ENVS}) + if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL) + py_test_modules(test_fleet_with_asp_sharding MODULES test_fleet_with_asp_sharding ENVS ${dist_ENVS}) + endif() endif() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index 220611be181446..0a9eaf34ba512b 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -7,4 +7,5 @@ if(WITH_DISTRIBUTE AND WITH_GPU) set_tests_properties(test_relaunch_with_planner PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 120) py_test_modules(test_relaunch_with_gpt_planner MODULES test_relaunch_with_planner ENVS ${dist_ENVS}) set_tests_properties(test_relaunch_with_gpt_planner PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 240) + py_test_modules(test_engine_api MODULES test_engine_api ENVS ${dist_ENVS}) endif() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py new file mode 100644 index 00000000000000..0fc1ea41033e00 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api.py @@ -0,0 +1,135 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import unittest
+import time
+import paddle.fluid as fluid
+import copy
+import os
+import numpy as np
+import subprocess
+import paddle
+import paddle.nn as nn
+import paddle.fluid as fluid
+import paddle.static as static
+import paddle.nn.functional as F
+import paddle.utils as utils
+from paddle.fluid import layers
+from paddle.io import Dataset, IterableDataset, DataLoader
+from paddle.static import InputSpec
+from paddle.distributed import fleet
+import paddle.distributed.auto_parallel as auto
+from paddle.distributed.auto_parallel.engine import Engine
+
+paddle.enable_static()
+global_process_mesh = auto.ProcessMesh(mesh=[0])
+batch_size = 1
+batch_num = 10
+hidden_size = 1024
+sequence_len = 512
+image_size = hidden_size
+class_num = 10
+
+paddle.seed(44)
+
+
+class MyDataset(Dataset):
+    def __init__(self, num_samples):
+        super(MyDataset, self).__init__()
+        self.num_samples = num_samples
+
+    def __getitem__(self, index):
+        input = np.random.uniform(size=image_size).astype("float32")
+        label = np.random.randint(0, class_num - 1, dtype="int64")
+        return input, label
+
+    def __len__(self):
+        return self.num_samples
+
+
+class MLPLayer(nn.Layer):
+    def __init__(self,
+                 hidden_size=1024,
+                 intermediate_size=4 * 1024,
+                 dropout_ratio=0.1,
+                 initializer_range=0.02):
+        super(MLPLayer, self).__init__()
+        d_model = hidden_size
+        dim_feedforward = intermediate_size
+        weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(
+            mean=0.0, std=initializer_range))
+        bias_attr = None
+
+        self.linear0 = nn.Linear(
+            d_model, dim_feedforward, weight_attr, bias_attr=bias_attr)
+        self.linear1 = nn.Linear(
+            dim_feedforward, d_model, weight_attr, bias_attr=bias_attr)
+        self.linear2 = nn.Linear(d_model, 1, weight_attr, bias_attr=bias_attr)
+        # self.norm = nn.LayerNorm(d_model, epsilon=1e-5)
+        # self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train")
+
+    def forward(self, input):
+        auto.shard_tensor(
+            input,
+            dist_attr={
+                "process_mesh": global_process_mesh,
+                "dims_mapping": [-1]
+            })
+        # out = self.norm(input)
+        out = self.linear0(input)
+        out = F.gelu(out, approximate=True)
+        out = self.linear1(out)
+        # out = self.dropout(out)
+        out = self.linear2(out)
+        return out
+
+
+class TestEngineAPI(unittest.TestCase):
+    def test_engine_api(self):
+        mlp = MLPLayer(
+            hidden_size=hidden_size,
+            intermediate_size=4 * hidden_size,
+            dropout_ratio=0.1,
+            initializer_range=0.02)
+        loss = paddle.nn.CrossEntropyLoss()
+        optimizer = paddle.fluid.optimizer.AdamOptimizer(
+            learning_rate=0.00001,
+            beta1=0.9,
+            beta2=0.999,
+            epsilon=1e-08,
+            grad_clip=None)
+
+        dataset = MyDataset(batch_num * batch_size)
+        data_spec = [
+            InputSpec([batch_size, hidden_size], 'float32', 'x'),
+            InputSpec([batch_size], 'int64', 'label')
+        ]
+
+        dist_strategy = fleet.DistributedStrategy()
+        dist_strategy.amp = False
+        dist_strategy.pipeline = False
+        dist_strategy.recompute = False
+        # init parallel optimizer
+        dist_strategy.semi_auto = True
+        fleet.init(is_collective=True, strategy=dist_strategy)
+
+        engine = Engine(mlp, data_spec, strategy=dist_strategy)
+        engine.prepare(optimizer, loss)
+        engine.fit(dataset,
+                   batch_size=batch_size,
+                   steps_per_epoch=batch_num * batch_size)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt b/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt
index b3ba7c80b32265..188b51ee16174c 100755
--- a/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt
+++ 
b/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt @@ -1,6 +1,17 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") +if ((NOT WITH_GPU) AND (NOT WITH_XPU) AND NOT (WITH_ASCEND OR WITH_ASCEND_CL)) + list(REMOVE_ITEM TEST_OPS "test_dist_fuse_adam_pass") + list(REMOVE_ITEM TEST_OPS "test_dist_fuse_all_reduce_pass") + list(REMOVE_ITEM TEST_OPS "test_dist_fuse_bn_act_pass") + list(REMOVE_ITEM TEST_OPS "test_dist_fuse_bn_add_act_pass") + list(REMOVE_ITEM TEST_OPS "test_dist_fuse_momentum_pass") + list(REMOVE_ITEM TEST_OPS "test_dist_fuse_relu_depthwise_conv_pass") + list(REMOVE_ITEM TEST_OPS "test_dist_fuse_sgd_pass") + list(REMOVE_ITEM TEST_OPS "test_dist_gradient_merge_pass") +endif() + foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) list(APPEND DIST_TEST_OPS ${TEST_OP}) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py index 567f266cd57b1e..ba1f5ed2b3ead7 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_list.py @@ -320,10 +320,12 @@ def forward(self, x, index): if index > 0: res = a[0] * a[0] + y = y + 1 else: res = a[-1] * a[-1] + y = y - 1 - z = a[-1] * res + z = a[-1] * res * y[0] return z @@ -333,7 +335,7 @@ def test_to_static(self): x = paddle.to_tensor([2, 3, 4], dtype='float32') index = paddle.to_tensor([1]) res = net(x, index) - self.assertEqual(res[0], 16.) + self.assertEqual(res[0], 48.) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_dropout.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_dropout.py index 57f5b5a0bb245c..f9bb4e66f2ab40 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_dropout.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_dropout.py @@ -112,7 +112,7 @@ def clear_dynamic_shape(): def generate_trt_nodes_num(attrs, dynamic_shape): if attrs[0]['dropout_implementation'] == "upscale_in_train": return 0, 2 - elif self.dims == 1: + elif self.dims == 1 and dynamic_shape == False: return 0, 3 else: return 1, 2 @@ -141,17 +141,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape): True), 1e-5 def add_skip_trt_case(self): - def teller1(program_config, predictor_config): - if len( - program_config.inputs['input_data'].shape - ) == 2 and not predictor_config.tensorrt_dynamic_shape_enabled(): - return True - return False - - self.add_skip_case( - teller1, SkipReasons.TRT_NOT_IMPLEMENTED, - "The output shape has diff, but we can add shuffle layer to resolve it." 
- ) + pass def test(self): self.add_skip_trt_case() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_matmul.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_matmul.py index 8913159b2c4dfc..c6f2fa205c713f 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_matmul.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_matmul.py @@ -172,11 +172,11 @@ def generate_dynamic_shape(attrs): } self.dynamic_shape.max_input_shape = { "input1_data": [16, 4, 4], - "input2_data": [16, 4, 128] + "input2_data": [16, 4, 4] } self.dynamic_shape.opt_input_shape = { "input1_data": [8, 4, 4], - "input2_data": [8, 4, 16] + "input2_data": [8, 4, 4] } attrs = [ @@ -192,17 +192,7 @@ def generate_dynamic_shape(attrs): yield self.create_inference_config(), (1, 3), 1e-5 def add_skip_trt_case(self): - def teller1(program_config, predictor_config): - if len( - self.dynamic_shape.min_input_shape - ) != 0 and self.trt_param.precision == paddle_infer.PrecisionType.Half: - return True - return False - - self.add_skip_case( - teller1, SkipReasons.TRT_NOT_IMPLEMENTED, - "Tensorrt MatrixMultiply layer will get error when dynamic shape fp16 mode." - ) + pass def test(self): self.add_skip_trt_case() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape_op.py index b23c7d9b493d0c..0522df3a9219d5 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape_op.py @@ -91,14 +91,10 @@ def setUp(self): with fluid.program_guard(self.main_program, self.startup_program): data = fluid.data( name='data', shape=self.data_shape, dtype='float32') - actual_reshape = fluid.data( - name='actual_reshape', shape=[4], dtype='int32') - reshape_out = fluid.layers.reshape( - x=data, shape=self.reshape, actual_shape=actual_reshape) + reshape_out = fluid.layers.reshape(x=data, shape=self.reshape) out = fluid.layers.batch_norm(reshape_out, is_test=True) self.feeds = { - 'data': np.random.random(self.data_shape).astype('float32'), - 'actual_reshape': np.array([2, 0, -1, 6]).astype('int32') + 'data': np.random.random(self.data_shape).astype('float32') } self.enable_trt = True self.trt_parameters = TRTReshapeTest.TensorRTParam( diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 85423df3d38283..ec3b68086b0659 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -168,8 +168,10 @@ def __get_elem__(tensor, i): elif tensor_to_check._dtype() == core.VarDesc.VarType.BF16: numpy_tensor = np.array(tensor).astype(np.uint16) numpy_tensor = numpy_tensor.flatten() - return struct.unpack(' $kernel_register_info_file @@ -38,7 +38,7 @@ python3 ${PADDLE_ROOT}/python/paddle/utils/code_gen/wrapped_infermeta_gen.py \ --wrapped_infermeta_header_path ${temp_path}/generate.h \ --wrapped_infermeta_source_path ${temp_path}/generate.cc -grep PT_REGISTER_INFER_META_FN ${temp_path}/generate.cc \ +grep PD_REGISTER_INFER_META_FN ${temp_path}/generate.cc \ | awk -F "\(|,|::|\)" '{print $2, $4}' > ${temp_path}/wrap_info.txt #step 3: merge all infos
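
For reference, the end-to-end call sequence of the new auto-parallel Engine API added above (engine.py, dist_loader.py) follows the new unit test test_engine_api.py. A minimal sketch, assuming the MLPLayer model and MyDataset dataset defined in that test are in scope:

    import paddle
    from paddle.static import InputSpec
    from paddle.distributed import fleet
    from paddle.distributed.auto_parallel.engine import Engine

    paddle.enable_static()

    # Semi-automatic parallelism: the Engine completes the distributed
    # annotation, partitions the program, and reshards per rank.
    strategy = fleet.DistributedStrategy()
    strategy.semi_auto = True
    fleet.init(is_collective=True, strategy=strategy)

    mlp = MLPLayer(hidden_size=1024, intermediate_size=4 * 1024)  # from the test above
    dataset = MyDataset(10)                                       # from the test above
    data_spec = [
        InputSpec([1, 1024], 'float32', 'x'),
        InputSpec([1], 'int64', 'label'),
    ]

    engine = Engine(mlp, data_spec, strategy=strategy)
    engine.prepare(
        optimizer=paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001),
        loss=paddle.nn.CrossEntropyLoss())
    engine.fit(dataset, batch_size=1, steps_per_epoch=10)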