From 3f545c4fe926bd24bb25aec4ceb069751f9b1f6a Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Fri, 9 Jul 2021 11:50:53 +0000
Subject: [PATCH 001/125] initial tensor design & sign kernel demo

---
 .gitignore                            |   1 +
 paddle/CMakeLists.txt                 |   1 +
 paddle/fluid/framework/eigen.h        |  34 ++++
 paddle/fluid/framework/pten_utils.h   |  51 +++++
 paddle/fluid/framework/tensor.h       |   2 +
 paddle/fluid/operators/CMakeLists.txt |   5 +-
 paddle/fluid/operators/sign_op.h      |  27 ++-
 paddle/pten/CMakeLists.txt            |   1 +
 paddle/pten/core/CMakeLists.txt       |   2 +
 paddle/pten/core/autograd_meta_if.h   |  28 +++
 paddle/pten/core/backend.h            |  36 ++++
 paddle/pten/core/base_tensor.cc       | 145 +++++++++++++++
 paddle/pten/core/base_tensor.h        | 120 ++++++++++++
 paddle/pten/core/convert_utils.cc     | 112 +++++++++++
 paddle/pten/core/convert_utils.h      |  39 ++++
 paddle/pten/core/ddim.h               |  19 ++
 paddle/pten/core/device_context.h     |  19 ++
 paddle/pten/core/dtype.h              |  45 +++++
 paddle/pten/core/layout.h             |  37 ++++
 paddle/pten/core/lod_tensor.h         |  15 ++
 paddle/pten/core/scalar_tensor.h      |  19 ++
 paddle/pten/core/selected_rows.h      |  15 ++
 paddle/pten/core/tensor.h             | 257 ++++++++++++++++++++++++++
 paddle/pten/core/tensor_impl_if.h     | 100 ++++++++++
 paddle/pten/core/tensor_meta.h        |  70 +++++++
 paddle/pten/cpu/math.h                |  34 ++++
 paddle/pten/cuda/math.h               |  34 ++++
 paddle/pten/module/sign.h             |  45 +++++
 paddle/pten/tests/CMakeLists.txt      |   0
 29 files changed, 1302 insertions(+), 11 deletions(-)
 create mode 100644 paddle/fluid/framework/pten_utils.h
 create mode 100644 paddle/pten/CMakeLists.txt
 create mode 100644 paddle/pten/core/CMakeLists.txt
 create mode 100644 paddle/pten/core/autograd_meta_if.h
 create mode 100644 paddle/pten/core/backend.h
 create mode 100644 paddle/pten/core/base_tensor.cc
 create mode 100644 paddle/pten/core/base_tensor.h
 create mode 100644 paddle/pten/core/convert_utils.cc
 create mode 100644 paddle/pten/core/convert_utils.h
 create mode 100644 paddle/pten/core/ddim.h
 create mode 100644 paddle/pten/core/device_context.h
 create mode 100644 paddle/pten/core/dtype.h
 create mode 100644 paddle/pten/core/layout.h
 create mode 100644 paddle/pten/core/lod_tensor.h
 create mode 100644 paddle/pten/core/scalar_tensor.h
 create mode 100644 paddle/pten/core/selected_rows.h
 create mode 100644 paddle/pten/core/tensor.h
 create mode 100644 paddle/pten/core/tensor_impl_if.h
 create mode 100644 paddle/pten/core/tensor_meta.h
 create mode 100644 paddle/pten/cpu/math.h
 create mode 100644 paddle/pten/cuda/math.h
 create mode 100644 paddle/pten/module/sign.h
 create mode 100644 paddle/pten/tests/CMakeLists.txt

diff --git a/.gitignore b/.gitignore
index 749832c3930cf..8a7b73d46c032 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,6 +4,7 @@ paddle/fluid/API_DEV.spec
 paddle/fluid/API_PR.spec
 paddle/fluid/op_use_default_grad_maker_DEV.spec
 paddle/fluid/op_use_default_grad_maker_PR.spec
+tools/__pycache__/static_mode_white_list.cpython-37.pyc
 
 *.DS_Store
 *.vs
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index c0c04d475959d..488583fe2c767 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -2,3 +2,4 @@ add_subdirectory(scripts)
 add_subdirectory(testing)
 set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory")
 add_subdirectory(fluid)
+add_subdirectory(pten)
diff --git a/paddle/fluid/framework/eigen.h b/paddle/fluid/framework/eigen.h
index a6abda8a83bc8..e6f9085a5c7a4 100644
--- a/paddle/fluid/framework/eigen.h
+++ b/paddle/fluid/framework/eigen.h
@@ -19,6 +19,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "unsupported/Eigen/CXX11/Tensor"
 
+#include "paddle/pten/core/base_tensor.h"
+
 namespace paddle {
 namespace framework {
 
@@ -67,6 +69,28 @@ struct EigenTensor {
   static ConstType From(const Tensor& tensor) {
     return From(tensor, tensor.dims_);
   }
+
+  // for pt::BaseTensor
+  static Type From(pt::BaseTensor& tensor, DDim dims) {  // NOLINT
+    // why tensor.data<T>() not work?
+    // return Type(const_cast<T*>(reinterpret_cast<const T*>(tensor.data())),
+    // EigenDim<D>::From(dims));
+    return Type(const_cast<T*>(tensor.data<T>()), EigenDim<D>::From(dims));
+  }
+
+  static Type From(pt::BaseTensor& tensor) {  // NOLINT
+    return From(tensor, tensor.dims());
+  }  // NOLINT
+
+  static ConstType From(const pt::BaseTensor& tensor, DDim dims) {
+    // return ConstType(reinterpret_cast<const T*>(tensor.data()),
+    // EigenDim<D>::From(dims));
+    return ConstType(tensor.data<T>(), EigenDim<D>::From(dims));
+  }
+
+  static ConstType From(const pt::BaseTensor& tensor) {
+    return From(tensor, tensor.dims());
+  }
 };
 
 template <typename T, int MajorType = Eigen::RowMajor,
@@ -109,6 +133,16 @@ struct EigenVector : public EigenTensor<T, 1, MajorType, IndexType> {
       const Tensor& tensor) {  // NOLINT
     return EigenVector::From(tensor, {product(tensor.dims_)});
   }
+
+  // for pt::BaseTensor
+  static typename EigenVector::Type Flatten(pt::BaseTensor& tensor) {  // NOLINT
+    return EigenVector::From(tensor, {product(tensor.dims())});
+  }
+
+  static typename EigenVector::ConstType Flatten(
+      const pt::BaseTensor& tensor) {  // NOLINT
+    return EigenVector::From(tensor, {product(tensor.dims())});
+  }
 };
 
 template <typename T, int MajorType = Eigen::RowMajor,
diff --git a/paddle/fluid/framework/pten_utils.h b/paddle/fluid/framework/pten_utils.h
new file mode 100644
index 0000000000000..3df999b554ce1
--- /dev/null
+++ b/paddle/fluid/framework/pten_utils.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/core/base_tensor.h"
+#include "paddle/pten/core/convert_utils.h"
+
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace framework {
+
+template <typename TensorImplT>
+std::shared_ptr<TensorImplT> MakeTensorImpl(const Tensor& tensor,
+                                            const platform::Place& place,
+                                            proto::VarType::Type type) {
+  auto holder = tensor.Holder();
+  auto meta =
+      pt::TensorMeta(tensor.dims(), pt::TransToPtenBackend(place),
+                     pt::TransToPtenDataType(type),
+                     pt::TransToPtenLayout(tensor.layout()), tensor.offset());
+  auto tensor_impl = std::make_shared<TensorImplT>(meta);
+  if (holder != nullptr) {
+    tensor_impl->template ShareAllocation(tensor.Holder());
+  } else {
+    LOG(WARNING) << "Old Tensor holder is nullptr.";
+  }
+  return tensor_impl;
+}
+
+template <typename TensorImplT>
+void ShareTensorImpl(TensorImplT* tensor_impl, Tensor* out) {
+  out->set_type(pt::TransToProtoVarType(tensor_impl->template type()));
+  out->ResetHolder(tensor_impl->template MoveMemory());
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 539859c45c907..5147d6c53fd80 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -211,6 +211,8 @@ class Tensor {
     return holder_->place();
   }
 
+  void set_type(proto::VarType::Type type) { type_ = type; }
+
   proto::VarType::Type type() const {
     PADDLE_ENFORCE_NOT_NULL(
         holder_,
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 0956410041bb2..7fc64f63b0ea3 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -75,7 +75,7 @@ if(WITH_UNITY_BUILD)
 endif()
 
 register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op lstm_op run_program_op eye_op recurrent_op
-        sync_batch_norm_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
+        sync_batch_norm_op sign_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
 
 op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS})
 
@@ -94,7 +94,8 @@ else()
     op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
 endif()
 
-op_library(lstm_op DEPS ${OP_HEADER_DEPS}  lstm_compute)
+op_library(sign_op DEPS ${OP_HEADER_DEPS} base_tensor)
+op_library(lstm_op DEPS ${OP_HEADER_DEPS} lstm_compute)
 op_library(eye_op DEPS ${OP_HEADER_DEPS})
 op_library(recurrent_op DEPS ${OP_HEADER_DEPS})
 
diff --git a/paddle/fluid/operators/sign_op.h b/paddle/fluid/operators/sign_op.h
index b6d501afa621a..e2f5790602818 100644
--- a/paddle/fluid/operators/sign_op.h
+++ b/paddle/fluid/operators/sign_op.h
@@ -18,22 +18,31 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/eigen/eigen_function.h"
 
+#include "paddle/fluid/framework/pten_utils.h"
+#include "paddle/pten/core/base_tensor.h"
+#include "paddle/pten/cpu/math.h"
+#include "paddle/pten/cuda/math.h"
+
 namespace paddle {
 namespace operators {
 template <typename DeviceContext, typename T>
 class SignKernel : public framework::OpKernel<T> {
  public:
   virtual void Compute(const framework::ExecutionContext& context) const {
+    auto* x = context.Input<framework::Tensor>("X");
     auto* out = context.Output<framework::Tensor>("Out");
-    auto* in = context.Input<framework::Tensor>("X");
-    out->mutable_data<T>(in->place());
-
-    auto eigen_out = framework::EigenVector<T>::Flatten(*out);
-    auto eigen_in = framework::EigenVector<T>::Flatten(*in);
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
-    EigenSign<std::decay_t<decltype(place)>, T>::Eval(place, eigen_out,
-                                                      eigen_in);
+    auto& dev_ctx = context.device_context<DeviceContext>();
+
+    auto pt_x =
+        framework::MakeTensorImpl<pt::BaseTensor>(*x, x->place(), x->type());
+    auto pt_out =
+        framework::MakeTensorImpl<pt::BaseTensor>(*out, x->place(), x->type());
+
+    // call new kernel
+    pt::Sign<T>(dev_ctx, *pt_x.get(), pt_out.get());
+
+    // share pt_out data to out
+    framework::ShareTensorImpl(pt_out.get(), out);
   }
 };
 
diff --git a/paddle/pten/CMakeLists.txt b/paddle/pten/CMakeLists.txt
new file mode 100644
index 0000000000000..ad6d4787c23e3
--- /dev/null
+++ b/paddle/pten/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(core)
diff --git a/paddle/pten/core/CMakeLists.txt b/paddle/pten/core/CMakeLists.txt
new file mode 100644
index 0000000000000..85203251d6a7a
--- /dev/null
+++ b/paddle/pten/core/CMakeLists.txt
@@ -0,0 +1,2 @@
+cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place)
+cc_library(base_tensor SRCS base_tensor.cc DEPS enforce data_type ddim allocator place convert_utils)
diff --git a/paddle/pten/core/autograd_meta_if.h b/paddle/pten/core/autograd_meta_if.h
new file mode 100644
index 0000000000000..2b301f4c75c07
--- /dev/null
+++ b/paddle/pten/core/autograd_meta_if.h
@@ -0,0 +1,28 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+namespace pt {
+
+class Tensor;
+
+class AutogradMetaInterface {
+ public:
+  virtual const Tensor& grad() const = 0;
+  virtual ~AutogradMetaInterface() = 0;
+  // TODO(yangjiabin): design other methods
+};
+
+}  // namespace pt
diff --git a/paddle/pten/core/backend.h b/paddle/pten/core/backend.h
new file mode 100644
index 0000000000000..ce7499fae38e8
--- /dev/null
+++ b/paddle/pten/core/backend.h
@@ -0,0 +1,36 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+namespace pt {
+
+/**
+ * Backend not only means place. Backend is a superset of place.
+ */
+enum class Backend {
+  kUndef = 0,
+  kCPU,
+  kCUDA,
+  kCUDAPinned,
+  kHIP,
+  kXPU,
+  kNPU,
+  kNPUPinned,
+  kMKLDNN,
+  kCUDNN,
+  kNumBackends,
+};
+
+}  // namespace pt
diff --git a/paddle/pten/core/base_tensor.cc b/paddle/pten/core/base_tensor.cc
new file mode 100644
index 0000000000000..7c994b8cf2333
--- /dev/null
+++ b/paddle/pten/core/base_tensor.cc
@@ -0,0 +1,145 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/core/base_tensor.h"
+#include "paddle/pten/core/convert_utils.h"
+
+// fluid headers [may be replaced by new impl]
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace pt {
+
+// TODO(chenweihang): Place still link to framework, design abstract interface
+// of place?
+using CPUPlace = paddle::platform::CPUPlace;
+using CUDAPlace = paddle::platform::CUDAPlace;
+using CUDAPinnedPlace = paddle::platform::CUDAPinnedPlace;
+using XPUPlace = paddle::platform::XPUPlace;
+using NPUPlace = paddle::platform::NPUPlace;
+using NPUPinnedPlace = paddle::platform::NPUPinnedPlace;
+
+BaseTensor::BaseTensor(TensorMeta meta)
+    : meta_(std::forward<TensorMeta>(meta)) {}
+
+int64_t BaseTensor::numel() const { return product(meta_.dims); }
+
+DDim BaseTensor::dims() const { return meta_.dims; }
+
+void BaseTensor::resize(const DDim& dims) { meta_.dims = dims; }
+
+DataType BaseTensor::type() const { return meta_.type; }
+
+Layout BaseTensor::layout() const { return meta_.layout; }
+
+Place BaseTensor::place() const {
+  PADDLE_ENFORCE_NOT_NULL(
+      memory_,
+      paddle::platform::errors::PreconditionNotMet(
+          "Tensor not initialized yet when Tensor::place() is called."));
+  return memory_->place();
+}
+
+Backend BaseTensor::backend() const { return meta_.backend; }
+
+bool BaseTensor::initialized() const { return memory_ != nullptr; }
+
+//----------------------------------------------------------------
+// Inner methods
+
+void BaseTensor::ShareAllocation(const std::shared_ptr<Allocation>& memory) {
+  // This operation can be very slow!
+  // std::shared_ptr reference count is atomic. increasing or decreasing
+  // the reference count requires atomic increment or decrement.
+  // This is hundred times slower than non-atomic increment/decrement
+  memory_ = memory;
+}
+
+// TODO(chenweihang): Add other place branchs
+Place BaseTensor::GetPlaceByBackend() const {
+  switch (meta_.backend) {
+    case Backend::kCPU:
+      return CPUPlace();
+    case Backend::kCUDA:
+      return CUDAPlace();
+    default:
+      PADDLE_THROW(paddle::platform::errors::Unimplemented(
+          "Unsupported Tensor backend."));
+  }
+}
+
+size_t BaseTensor::MemorySize() const {
+  return memory_ == nullptr ? 0UL : memory_->size() - meta_.offset;
+}
+
+void BaseTensor::CheckMemorySize() const {
+  PADDLE_ENFORCE_NOT_NULL(memory_,
+                          paddle::platform::errors::PreconditionNotMet(
+                              "Tensor holds no memory. "
+                              "Call Tensor::mutable_data firstly."));
+  size_t size_of_type =
+      paddle::framework::SizeOfType(TransToProtoVarType(meta_.type));
+  PADDLE_ENFORCE_LE(
+      numel() * size_of_type,
+      MemorySize(),
+      paddle::platform::errors::PreconditionNotMet(
+          "Tensor's dimension is out of bound."
+          "Tensor's dimension must be equal or less than the size of its "
+          "memory."
+          "But received  Tensor's dimension is d%, memory's size is %d.",
+          numel() * size_of_type,
+          MemorySize()));
+}
+
+std::shared_ptr<Allocation> BaseTensor::MoveMemory() {
+  return std::move(memory_);
+}
+
+const void* BaseTensor::data() const {
+  CheckMemorySize();
+  return reinterpret_cast<const void*>(
+      reinterpret_cast<uintptr_t>(memory_->ptr()) + meta_.offset);
+}
+
+void* BaseTensor::mutable_data() {
+  PADDLE_ENFORCE_GE(
+      numel(),
+      0,
+      paddle::platform::errors::PreconditionNotMet(
+          "The Tensor's element number must be equal or greater than zero. "
+          "The Tensor's shape is [",
+          dims(),
+          "] now"));
+  size_t size =
+      numel() * paddle::framework::SizeOfType(TransToProtoVarType(meta_.type));
+  auto place = GetPlaceByBackend();
+  if (memory_ == nullptr) {
+    memory_.reset();
+    memory_ = paddle::memory::AllocShared(place, size);
+  } else {
+    LOG(WARNING) << "When call mutable_data, BaseTensor has been initialized.";
+    if (!(memory_->place() == place) || memory_->size() < size + meta_.offset) {
+      memory_.reset();
+      memory_ = paddle::memory::AllocShared(place, size);
+    } else {
+      // do nothing
+    }
+  }
+  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(memory_->ptr()) +
+                                 meta_.offset);
+}
+
+}  // namespace pt
diff --git a/paddle/pten/core/base_tensor.h b/paddle/pten/core/base_tensor.h
new file mode 100644
index 0000000000000..f641507d10b0c
--- /dev/null
+++ b/paddle/pten/core/base_tensor.h
@@ -0,0 +1,120 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+
+#include "paddle/pten/core/tensor_impl_if.h"
+#include "paddle/pten/core/tensor_meta.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+class Allocation;
+}
+}
+}
+
+namespace pt {
+
+// TODO(chenweihang): Allocation still link to framework, Redesign and
+// decoupled Allocation and Allocator?
+using Allocation = paddle::memory::allocation::Allocation;
+
+/**
+ * The implementation of general Tensor (For CPU, CUDA, HIP, etc.),
+ * contains a pointer to Allocation and a series of descriptive metadata
+ * required by Tensor.
+ *
+ * BaseTensor is still a base class, it may have mutiple inherited classes,
+ * such as LoDTensor, SelectedRows, etc. The memory layout
+ * of these inherited classes is consistent with the basic BaseTensor, except
+ * that a small number of members are added to further specialize the
+ * description of the tensor. For example, LoDTensor adds LoD information,
+ * and SelectedRows adds rows and height information.
+ * If the memory layout is different, it cannot be described based on the
+ * general Allocation, and it needs to be directly inherited from
+ * TensorImplInterface.
+ *
+ */
+class BaseTensor : public TensorImplInterface {
+ public:
+  // Not allowed to initialize a tensor without descriptive metadata
+  BaseTensor() = delete;
+
+  BaseTensor(const BaseTensor&) = delete;
+  BaseTensor& operator=(const BaseTensor&) = delete;
+  BaseTensor(BaseTensor&&) = delete;
+  BaseTensor& operator=(BaseTensor&&) = delete;
+
+  /**
+   * If we still malloc memory by mutable_data,
+   * the BaseTensor doesn't need complicated constructor.
+   *
+   * Note: Tensor objects lacking meta information are not allowed to exist.
+   */
+  explicit BaseTensor(TensorMeta meta);
+
+  ~BaseTensor() override {}
+
+  /**
+   * Most of Tensor's methods need to have corresponding implementations
+   * in BaseTensor
+   */
+  int64_t numel() const override;
+
+  DDim dims() const override;
+
+  void resize(const DDim& dims) override;
+
+  DataType type() const override;
+
+  Layout layout() const override;
+
+  Place place() const override;
+
+  Backend backend() const override;
+
+  const void* data() const override;
+
+  void* mutable_data() override;
+
+  bool initialized() const override;
+
+  /**
+   * using base class template methods.
+   */
+  using TensorImplInterface::data;
+  using TensorImplInterface::mutable_data;
+
+  // For non-API interfaces, we still follow the C++ code style
+  void ShareAllocation(const std::shared_ptr<Allocation>& memory);
+
+  Place GetPlaceByBackend() const;
+
+  size_t MemorySize() const;
+
+  void CheckMemorySize() const;
+
+  std::shared_ptr<Allocation> MoveMemory();
+
+ private:
+  // The actual Tensor storage holder
+  std::shared_ptr<Allocation> memory_;
+  // The Tensor meta data
+  TensorMeta meta_;
+};
+
+}  // namespace pt
diff --git a/paddle/pten/core/convert_utils.cc b/paddle/pten/core/convert_utils.cc
new file mode 100644
index 0000000000000..285db16f082d5
--- /dev/null
+++ b/paddle/pten/core/convert_utils.cc
@@ -0,0 +1,112 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/core/convert_utils.h"
+
+namespace pt {
+
+// TODO(chenweihang): Add other place branchs
+Backend TransToPtenBackend(const paddle::platform::Place& place) {
+  if (paddle::platform::is_cpu_place(place)) {
+    return Backend::kCPU;
+  } else if (paddle::platform::is_gpu_place(place)) {
+    return Backend::kCUDA;
+  } else {
+    return Backend::kUndef;
+  }
+}
+
+pt::DataType TransToPtenDataType(
+    const paddle::framework::proto::VarType::Type& dtype) {
+  // Set the order of case branches according to the frequency with
+  // the data type is used
+  switch (dtype) {
+    case paddle::framework::proto::VarType::FP32:
+      return DataType::kFLOAT32;
+    case paddle::framework::proto::VarType::FP64:
+      return DataType::kFLOAT64;
+    case paddle::framework::proto::VarType::INT64:
+      return DataType::kINT64;
+    case paddle::framework::proto::VarType::INT32:
+      return DataType::kINT32;
+    case paddle::framework::proto::VarType::INT8:
+      return DataType::kINT8;
+    case paddle::framework::proto::VarType::UINT8:
+      return DataType::kUINT8;
+    case paddle::framework::proto::VarType::INT16:
+      return DataType::kINT16;
+    case paddle::framework::proto::VarType::COMPLEX64:
+      return DataType::kCOMPLEX64;
+    case paddle::framework::proto::VarType::COMPLEX128:
+      return DataType::kCOMPLEX128;
+    case paddle::framework::proto::VarType::FP16:
+      return DataType::kFLOAT16;
+    case paddle::framework::proto::VarType::BOOL:
+      return DataType::kBOOL;
+    default:
+      return DataType::kUndef;
+  }
+}
+
+Layout TransToPtenLayout(const paddle::framework::DataLayout& layout) {
+  switch (layout) {
+    case paddle::framework::DataLayout::kNHWC:
+      return Layout::kNHWC;
+    case paddle::framework::DataLayout::kNCHW:
+      return Layout::kNCHW;
+    case paddle::framework::DataLayout::kAnyLayout:
+      return Layout::kAny;
+    case paddle::framework::DataLayout::kMKLDNN:
+      return Layout::kMKLDNN;
+    default:
+      return Layout::kUndef;
+  }
+}
+
+paddle::framework::proto::VarType::Type TransToProtoVarType(
+    const pt::DataType& dtype) {
+  // Set the order of case branches according to the frequency with
+  // the data type is used
+  switch (dtype) {
+    case DataType::kFLOAT32:
+      return paddle::framework::proto::VarType::FP32;
+    case DataType::kFLOAT64:
+      return paddle::framework::proto::VarType::FP64;
+    case DataType::kINT64:
+      return paddle::framework::proto::VarType::INT64;
+    case DataType::kINT32:
+      return paddle::framework::proto::VarType::INT32;
+    case DataType::kINT8:
+      return paddle::framework::proto::VarType::INT8;
+    case DataType::kUINT8:
+      return paddle::framework::proto::VarType::UINT8;
+    case DataType::kINT16:
+      return paddle::framework::proto::VarType::INT16;
+    case DataType::kCOMPLEX64:
+      return paddle::framework::proto::VarType::COMPLEX64;
+    case DataType::kCOMPLEX128:
+      return paddle::framework::proto::VarType::COMPLEX128;
+    case DataType::kFLOAT16:
+      return paddle::framework::proto::VarType::FP16;
+    case DataType::kBOOL:
+      return paddle::framework::proto::VarType::BOOL;
+    default:
+      PADDLE_THROW(paddle::platform::errors::Unimplemented(
+          "Unsupported data type code(%d) when casting enum data type into "
+          "paddle data type.",
+          static_cast<int>(dtype)));
+  }
+}
+
+}  // namespace pt
diff --git a/paddle/pten/core/convert_utils.h b/paddle/pten/core/convert_utils.h
new file mode 100644
index 0000000000000..e5c325e6fd4c0
--- /dev/null
+++ b/paddle/pten/core/convert_utils.h
@@ -0,0 +1,39 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/core/backend.h"
+#include "paddle/pten/core/dtype.h"
+#include "paddle/pten/core/layout.h"
+
+// fluid headers [may be replaced by new impl]
+#include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/platform/place.h"
+
+// TODO(chenweihang): this file may need to be removed
+
+namespace pt {
+
+// TODO(chenweihang): Use the original var type as much as possible
+// to avoid transform, such as DataLayout, VarType
+Backend TransToPtenBackend(const paddle::platform::Place& place);
+DataType TransToPtenDataType(
+    const paddle::framework::proto::VarType::Type& dtype);
+Layout TransToPtenLayout(const paddle::framework::DataLayout& layout);
+paddle::framework::proto::VarType::Type TransToProtoVarType(
+    const DataType& dtype);
+
+}  // namespace pt
diff --git a/paddle/pten/core/ddim.h b/paddle/pten/core/ddim.h
new file mode 100644
index 0000000000000..0dee0e4690a36
--- /dev/null
+++ b/paddle/pten/core/ddim.h
@@ -0,0 +1,19 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+/**
+ * TODO(chenweihang): Design DDim Interface for new Tensor
+ */
diff --git a/paddle/pten/core/device_context.h b/paddle/pten/core/device_context.h
new file mode 100644
index 0000000000000..0dee0e4690a36
--- /dev/null
+++ b/paddle/pten/core/device_context.h
@@ -0,0 +1,19 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+/**
+ * TODO(chenweihang): Design DDim Interface for new Tensor
+ */
diff --git a/paddle/pten/core/dtype.h b/paddle/pten/core/dtype.h
new file mode 100644
index 0000000000000..04376ce24f6e0
--- /dev/null
+++ b/paddle/pten/core/dtype.h
@@ -0,0 +1,45 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+namespace pt {
+
+/**
+ * We need to ensure that the operator library is relatively independent
+ * and does not depend on the framework. Therefore, before calling the kernel
+ * in the Tensor operation library inside the framework, the internal
+ * data type needs to be converted to the data type in the Tensor operation
+ * library.
+ *
+ * The data type design in proto is confusing, maybe we need polish the
+ * VarType in framework.proto.
+ */
+enum class DataType {
+  kUndef = 0,
+  kBOOL,
+  kINT8,   // Char
+  kUINT8,  // BYte
+  kINT16,
+  kINT32,
+  kINT64,
+  kFLOAT16,
+  kFLOAT32,
+  kFLOAT64,
+  kCOMPLEX64,
+  kCOMPLEX128,
+  kNumDataTypes,
+};
+
+}  // namespace pt
diff --git a/paddle/pten/core/layout.h b/paddle/pten/core/layout.h
new file mode 100644
index 0000000000000..ae6c578e74ca3
--- /dev/null
+++ b/paddle/pten/core/layout.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+namespace pt {
+
+/**
+ * We need to ensure that the operator library is relatively independent
+ * and does not depend on the framework. Therefore, before calling the kernel
+ * in the Tensor operation library inside the framework, the internal
+ * layout needs to be converted to the data type in the Tensor operation
+ * library.
+ *
+ * Here we also can use the DataLayout in framework, they are all enum classes
+ */
+enum class Layout {
+  kUndef = 0,
+  kAny,
+  kNHWC,
+  kNCHW,
+  kMKLDNN,
+  kNumLayouts,
+};
+
+}  // namespace pt
diff --git a/paddle/pten/core/lod_tensor.h b/paddle/pten/core/lod_tensor.h
new file mode 100644
index 0000000000000..e1a22f3269ecb
--- /dev/null
+++ b/paddle/pten/core/lod_tensor.h
@@ -0,0 +1,15 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
diff --git a/paddle/pten/core/scalar_tensor.h b/paddle/pten/core/scalar_tensor.h
new file mode 100644
index 0000000000000..59fe21aff2484
--- /dev/null
+++ b/paddle/pten/core/scalar_tensor.h
@@ -0,0 +1,19 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/core/base_tensor.h"
+
+class LoDTensor : public BaseTensor {};
diff --git a/paddle/pten/core/selected_rows.h b/paddle/pten/core/selected_rows.h
new file mode 100644
index 0000000000000..e1a22f3269ecb
--- /dev/null
+++ b/paddle/pten/core/selected_rows.h
@@ -0,0 +1,15 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
diff --git a/paddle/pten/core/tensor.h b/paddle/pten/core/tensor.h
new file mode 100644
index 0000000000000..ee07d2de05774
--- /dev/null
+++ b/paddle/pten/core/tensor.h
@@ -0,0 +1,257 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <utility>
+
+#include "paddle/pten/core/autograd_meta_if.h"
+#include "paddle/pten/core/tensor_impl_if.h"
+
+// fluid headers [may be replaced by new impl]
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/platform/place.h"
+
+namespace pt {
+
+/**
+ * Tensor is the API description of the basic data structure in the
+ * [ PaddlePaddle Tensor Operation Library ].
+ *
+ * It is not limited to a simple n-dimensional array.
+ * It contains a smart pointer to `TensorImpl`. The data description contained
+ * in Tensor is defined by TensorImpl. Tensor only defines the interface for
+ * operation.
+ *
+ * This is a new Tensor design, which is independent of the original
+ * framework::Tensor in fluid. The original Tensor will be gradually discarded
+ * in the future.
+ *
+ * Note: Tensor can be NULL state, Tensor is meaningful only when the
+ * TensorImpl to which it is pointed is not empty.
+ *
+ * Note: For the consistency of C++ API self, and the consistency between C++
+ * API and Python API, all member methods of Tensor are named with lowercase
+ * letters and underscores.
+ *
+ * Note: Tensor cannot be inherited. The heterogeneous Tensor implementation
+ * can be achieved by inheriting the underlying TensorImplInterface.
+ */
+
+class Tensor final {
+ public:
+  /* Part 1: Construction and destruction methods */
+  Tensor() {}
+  Tensor(const Tensor&) = default;
+  Tensor(Tensor&&) = default;
+
+  /**
+   * @description: Use a TensorImpl pointer to construct a Tensor
+   * @param {shared_ptr<TensorImplInterface>} tensor_impl
+   * @return {Tensor}
+   */
+  explicit Tensor(std::shared_ptr<TensorImplInterface> tensor_impl)
+      : impl_(std::move(tensor_impl)) {
+    if (impl_.get() == nullptr) {
+      throw std::runtime_error("TensorImpl with nullptr is not supported");
+    }
+  }
+
+  /* Part 2: Dimension, DataType and Layout methods */
+  /**
+   * @description: Return the number of elements of current Tensor.
+   * @param None
+   * @return {int64_t}
+   */
+  int64_t numel() const { return impl_->numel(); }
+
+  /**
+   * @description: Return the shape (dimensions) of current Tensor.
+   * @param None
+   * @return {DDim}
+   */
+  DDim shape() const { return impl_->dims(); }
+
+  /**
+   * @description: Resize the shape (dimensions) of current Tensor.
+   * @param {const} DDim
+   * @return {*}
+   */
+  void resize(const DDim& dims) { impl_->resize(dims); }
+
+  /**
+   * @description: Return the data type of current Tensor.
+   * @param None
+   * @return {DataType}
+   */
+  DataType type() const { return impl_->type(); }
+
+  /**
+   * @description: Return the layout of current Tensor.
+   * @param None
+   * @return {Layout}
+   */
+  Layout layout() const { return impl_->layout(); }
+
+  /* Part 3: Device and Backend methods */
+  /**
+   * @description: Return the place (device) of current Tensor.
+   * @param None
+   * @return {Place}
+   */
+  Place place() const { return impl_->place(); }
+
+  /**
+   * @description: Convert the current Tensor to a Tensor of
+   *               a specific data type for a specific device
+   * @param {const} Backend
+   * @param {const} DataType
+   * @return {*}
+   */
+  // Tensor to(const Backend& backend, const DataType& dtype) {
+  //   // TODO(chenweihang): use kernels to impl later
+  // }
+
+  /**
+   * Backend judgment APIs, shield the concept of Backend.
+   */
+  // TODO(chenweihang): impl later
+  bool is_cpu() const { return impl_->backend() == Backend::kCPU; }
+  bool is_cuda() const;
+  bool is_hip() const;
+  bool is_xpu() const;
+  bool is_npu() const;
+  bool is_mkldnn() const;
+  bool is_cudnn() const;
+
+  /**
+   * Backend convert APIs.
+   */
+  Tensor cpu() const;
+  Tensor cuda() const;
+  Tensor hip() const;
+  Tensor xpu() const;
+  Tensor npu() const;
+  Tensor mkldnn() const;
+  Tensor cudnn() const;
+
+  /* Part 4: Data Access methods */
+  /**
+   * @description: Return the implemention of current Tensor.
+   * @param None
+   * @return {std::shared_ptr<TensorImplInterface>}
+   */
+  std::shared_ptr<TensorImplInterface> impl() const { return impl_; }
+
+  /**
+   * @description: Get the const memory pointer of current Tensor.
+   * @param None
+   * @return {const T*}
+   */
+  template <typename T>
+  const T* data() const {
+    return impl_->data<T>();
+  }
+
+  /**
+   * @description: Get the mutable memory pointer of current Tensor.
+   * @param None
+   * @return {T*}
+   */
+  template <typename T>
+  T* mutable_data() {
+    return impl_->mutable_data<T>();
+  }
+
+  // TODO(chenweihang): slice and split methods use kernels?
+
+  /* Part 5: Status utils methods */
+  /**
+   * @description: Determine whether it is a meaningful Tensor
+   * @param None
+   * @return {bool}
+   */
+  bool defined() const { return impl_ != nullptr; }
+
+  /**
+   * @description: Determine whether Tensor is initialized
+   * @param None
+   * @return {bool}
+   */
+  bool initialized() const { return impl_->initialized(); }
+
+  /**
+   * @description: Reset the Tensor implementation
+   * @param None
+   * @return {void}
+   */
+  void reset() { impl_.reset(); }
+
+  /* Part 6: Operator overloading */
+  Tensor& operator=(const Tensor& x) & {
+    impl_ = x.impl_;
+    return *this;
+  }
+  Tensor& operator=(Tensor&& x) & {
+    impl_ = std::move(x.impl_);
+    return *this;
+  }
+  // TODO(chenweihang): impl later
+  Tensor& operator=(const Tensor&) &&;
+  Tensor& operator=(Tensor&&) &&;
+
+  /* Part 7: Autograd methods */
+  // TODO(yangjiabin): Design autograd methods
+
+  /* Part 8: Auto generated Tensor methods */
+  // ...
+
+ private:
+  /**
+   * [ Why use abstract TensorImpl interface here? ]
+   *
+   * We hope that the data structure at the API level of the framework can be
+   * unified to Tensor, but Tensor itself is heterogeneous.
+   *
+   * Tensor can generally be represented by void* and size_t, place.
+   * This is suitable for most scenarios including CPU, CUDA, HIP, CPU, etc.,
+   * but there are a few cases where this definition cannot be described,
+   * such as the Tensor representation in third-party lib such as Metal,
+   * OpenCL, etc., as well as some special Tensor implementations, including
+   * Tensor containing only one Scalar value, or Tensor representing String,
+   * etc.
+   *
+   * Therefore, we hope to use a unified interface to shield the underlying
+   * heterogeneous Tensor implementation, so that the API level can be unified
+   * to one `Tensor`.
+   */
+  std::shared_ptr<TensorImplInterface> impl_;
+
+  /**
+   * [ Why need abstract AutogradMetaInterface here? ]
+   *
+   * Dynamic graphs need to hold backward information
+   *
+   * [ Why AutogradMeta not in TensorImpl? ]
+   *
+   * 1. AutogradMeta is only used in dynamic graph, It is execution-related
+   *    information, not Tensor data description-related information.
+   * 2. Kernel calculation does not require AutogradMeta.
+   */
+  std::unique_ptr<AutogradMetaInterface> autograd_meta_ = nullptr;
+};
+
+}  // namespace pt
diff --git a/paddle/pten/core/tensor_impl_if.h b/paddle/pten/core/tensor_impl_if.h
new file mode 100644
index 0000000000000..0c0555ee46af4
--- /dev/null
+++ b/paddle/pten/core/tensor_impl_if.h
@@ -0,0 +1,100 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/core/backend.h"
+#include "paddle/pten/core/ddim.h"
+#include "paddle/pten/core/dtype.h"
+#include "paddle/pten/core/layout.h"
+
+namespace paddle {
+namespace framework {
+class DDim;
+}
+namespace platform {
+class Place;
+}
+}
+
+namespace pt {
+
+// TODO(chenweihang): DDim still link to framework, design abstract interface
+// of DDim?
+using DDim = paddle::framework::DDim;
+
+// TODO(chenweihang): Place still link to framework, design abstract interface
+// of place?
+using Place = paddle::platform::Place;
+
+/**
+ * The abstract class of Tensor implemention, it needs to define its basic
+ * behavior through inherited classes.
+ *
+ */
+class TensorImplInterface {
+ public:
+  // Not allowed to initialize a tensor without descriptive metadata
+  TensorImplInterface() = default;
+
+  TensorImplInterface(const TensorImplInterface&) = delete;
+  TensorImplInterface& operator=(const TensorImplInterface&) = delete;
+  TensorImplInterface(TensorImplInterface&&) = delete;
+  TensorImplInterface& operator=(TensorImplInterface&&) = delete;
+
+  virtual ~TensorImplInterface() {}
+
+  /**
+   * Most of Tensor's methods need to have corresponding implementations
+   * in TensorImplInterface
+   */
+  virtual int64_t numel() const = 0;
+
+  virtual DDim dims() const = 0;
+
+  virtual void resize(const DDim& dims) = 0;
+
+  virtual DataType type() const = 0;
+
+  virtual Layout layout() const = 0;
+
+  virtual Place place() const = 0;
+
+  virtual Backend backend() const = 0;
+
+  virtual const void* data() const = 0;
+
+  virtual void* mutable_data() = 0;
+
+  virtual bool initialized() const = 0;
+
+  /**
+   * template methods can not be virtual
+   */
+  template <typename T>
+  const T* data() const {
+    static_assert(std::is_pod<T>::value,
+                  "T must be POD when call Tensor.data<T>().");
+    return reinterpret_cast<const T*>(data());
+  }
+
+  template <typename T>
+  T* mutable_data() {
+    static_assert(std::is_pod<T>::value,
+                  "T must be POD when call Tensor.mutable_data<T>().");
+    return reinterpret_cast<T*>(mutable_data());
+  }
+};
+
+}  // namespace pt
diff --git a/paddle/pten/core/tensor_meta.h b/paddle/pten/core/tensor_meta.h
new file mode 100644
index 0000000000000..ab0e42c1bd1ef
--- /dev/null
+++ b/paddle/pten/core/tensor_meta.h
@@ -0,0 +1,70 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/core/backend.h"
+#include "paddle/pten/core/dtype.h"
+#include "paddle/pten/core/layout.h"
+
+// fluid headers [may be replaced by new impl]
+#include "paddle/fluid/framework/ddim.h"
+
+namespace pt {
+
+/*
+class InplaceVersion {
+ public:
+ private:
+};
+*/
+
+/**
+ * The Meta data member of TensorImpl.
+ * It holds Tensor description information and status information.
+ *
+ * Note: TensorMeta is a struct, the members are named like
+ * ordinary nonmember variables, such as `type` instead of `type_`.
+ * And we direct access its members, in addition to constructor, destructor
+ * and functions for setting data members, can not provide other functions.
+ */
+struct TensorMeta {
+  TensorMeta() = delete;
+
+  // May introduce bug
+  explicit TensorMeta(DDim dims) : dims(dims) {}
+
+  // Compatible Contructor
+  TensorMeta(const DDim& dims,
+             Backend backend,
+             DataType type,
+             Layout layout,
+             size_t offset)
+      : dims(dims),
+        backend(backend),
+        type(type),
+        layout(layout),
+        offset(offset) {}
+
+  DDim dims;
+
+  Backend backend{Backend::kCPU};
+  DataType type{DataType::kFLOAT32};
+  Layout layout{Layout::kNCHW};
+  size_t offset{0};
+
+  // InplaceVersion inplace_version_counter{0};
+};
+
+}  // namespace pt
diff --git a/paddle/pten/cpu/math.h b/paddle/pten/cpu/math.h
new file mode 100644
index 0000000000000..c3e29f8a56d3d
--- /dev/null
+++ b/paddle/pten/cpu/math.h
@@ -0,0 +1,34 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/core/base_tensor.h"
+#include "paddle/pten/module/sign.h"
+
+// fluid headers [may be replaced by new impl]
+#include "paddle/fluid/platform/device_context.h"
+
+namespace pt {
+
+using CPUDeviceContext = paddle::platform::CPUDeviceContext;
+
+template <typename T>
+void Sign(const CPUDeviceContext& dev_ctx,
+          const BaseTensor& x,
+          BaseTensor* out) {
+  module::Sign<CPUDeviceContext, T>(dev_ctx, x, out);
+}
+
+}  // namespace pt
diff --git a/paddle/pten/cuda/math.h b/paddle/pten/cuda/math.h
new file mode 100644
index 0000000000000..dcc3d6721eb6e
--- /dev/null
+++ b/paddle/pten/cuda/math.h
@@ -0,0 +1,34 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/core/base_tensor.h"
+#include "paddle/pten/module/sign.h"
+
+// fluid headers [may be replaced by new impl]
+#include "paddle/fluid/platform/device_context.h"
+
+namespace pt {
+
+using CUDADeviceContext = paddle::platform::CUDADeviceContext;
+
+template <typename T>
+void Sign(const CUDADeviceContext& dev_ctx,
+          const BaseTensor& x,
+          BaseTensor* out) {
+  module::Sign<CUDADeviceContext, T>(dev_ctx, x, out);
+}
+
+}  // namespace pt
diff --git a/paddle/pten/module/sign.h b/paddle/pten/module/sign.h
new file mode 100644
index 0000000000000..1217f7b4e0700
--- /dev/null
+++ b/paddle/pten/module/sign.h
@@ -0,0 +1,45 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/core/base_tensor.h"
+
+// fluid headers [may be replaced by new impl]
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/operators/eigen/eigen_function.h"
+
+namespace pt {
+namespace module {
+
+template <typename DevCtx, typename T>
+void Sign(const DevCtx& dev_ctx, const BaseTensor& x, BaseTensor* out) {
+  VLOG(1) << "enter module::Sign";
+  // out->mutable_data<T>(x.place());
+  out->mutable_data<T>();
+
+  VLOG(1) << "module::Sign, calc by eigen.";
+  // TODO(chenweihang): if we design new tensor, we should support
+  // the low-level calc functor use new tensor as input,
+  // which may be a big project!
+  auto eigen_out = paddle::framework::EigenVector<T>::Flatten(*out);
+  auto eigen_x = paddle::framework::EigenVector<T>::Flatten(x);
+
+  auto& dev = *dev_ctx.template eigen_device();
+  paddle::operators::EigenSign<std::decay_t<decltype(dev)>, T>::Eval(
+      dev, eigen_out, eigen_x);
+}
+
+}  // namespace module
+}  // namespace pt
diff --git a/paddle/pten/tests/CMakeLists.txt b/paddle/pten/tests/CMakeLists.txt
new file mode 100644
index 0000000000000..e69de29bb2d1d

From 1f4ea40906c91f1db64492eb4153f653fef33141 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Mon, 12 Jul 2021 07:38:28 +0000
Subject: [PATCH 002/125] add move constructor for meta & add lodtensor

---
 paddle/pten/core/base_tensor.cc   |  5 ++-
 paddle/pten/core/base_tensor.h    |  8 ++---
 paddle/pten/core/lod_tensor.h     | 51 +++++++++++++++++++++++++++++++
 paddle/pten/core/tensor_impl_if.h |  9 +++---
 paddle/pten/core/tensor_meta.h    | 11 +++++++
 5 files changed, 72 insertions(+), 12 deletions(-)

diff --git a/paddle/pten/core/base_tensor.cc b/paddle/pten/core/base_tensor.cc
index 7c994b8cf2333..d6189c5dc69a0 100644
--- a/paddle/pten/core/base_tensor.cc
+++ b/paddle/pten/core/base_tensor.cc
@@ -23,8 +23,7 @@ limitations under the License. */
 
 namespace pt {
 
-// TODO(chenweihang): Place still link to framework, design abstract interface
-// of place?
+// TODO(chenweihang): design abstract interface of each place?
 using CPUPlace = paddle::platform::CPUPlace;
 using CUDAPlace = paddle::platform::CUDAPlace;
 using CUDAPinnedPlace = paddle::platform::CUDAPinnedPlace;
@@ -32,7 +31,7 @@ using XPUPlace = paddle::platform::XPUPlace;
 using NPUPlace = paddle::platform::NPUPlace;
 using NPUPinnedPlace = paddle::platform::NPUPinnedPlace;
 
-BaseTensor::BaseTensor(TensorMeta meta)
+BaseTensor::BaseTensor(TensorMeta&& meta)
     : meta_(std::forward<TensorMeta>(meta)) {}
 
 int64_t BaseTensor::numel() const { return product(meta_.dims); }
diff --git a/paddle/pten/core/base_tensor.h b/paddle/pten/core/base_tensor.h
index f641507d10b0c..320ab441c86ed 100644
--- a/paddle/pten/core/base_tensor.h
+++ b/paddle/pten/core/base_tensor.h
@@ -34,9 +34,9 @@ namespace pt {
 using Allocation = paddle::memory::allocation::Allocation;
 
 /**
- * The implementation of general Tensor (For CPU, CUDA, HIP, etc.),
- * contains a pointer to Allocation and a series of descriptive metadata
- * required by Tensor.
+ * The implementation of general Tensor (For CPU, CUDA, HIP, etc.), similar
+ * to the Tensor in fluid, contains a pointer to Allocation and a series of
+ * descriptive metadata required by Tensor.
  *
  * BaseTensor is still a base class, it may have mutiple inherited classes,
  * such as LoDTensor, SelectedRows, etc. The memory layout
@@ -65,7 +65,7 @@ class BaseTensor : public TensorImplInterface {
    *
    * Note: Tensor objects lacking meta information are not allowed to exist.
    */
-  explicit BaseTensor(TensorMeta meta);
+  explicit BaseTensor(TensorMeta&& meta);
 
   ~BaseTensor() override {}
 
diff --git a/paddle/pten/core/lod_tensor.h b/paddle/pten/core/lod_tensor.h
index e1a22f3269ecb..38ca81a136f5a 100644
--- a/paddle/pten/core/lod_tensor.h
+++ b/paddle/pten/core/lod_tensor.h
@@ -13,3 +13,54 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+
+#include "paddle/pten/compat/mixed_vector.h"
+#include "paddle/pten/core/base_tensor.h"
+
+namespace pt {
+
+/*
+ * LoD is short for Level of Details.
+ *
+ * - in a level, each element indicates relative offset of the lower level
+ * - the first element should be 0 and that indicates that this sequence start
+ * from 0
+ * - each sequence's begin and end(no-inclusive) is level[id, id+1]
+ *
+ * For example:
+ *    3-level LoD stores
+ *
+ *    0 2 3
+ *    0 2 4 7
+ *    0 2 5 7 10 12 15 20
+ */
+using LoD = std::vector<Vector<size_t>>;
+
+/**
+ * LoDTensor: compatible with LoDTensor in fluid and related operators.
+ *
+ * Note: LoDTensor (Level of details Tensor)
+ * see https://en.wikipedia.org/wiki/Level_of_details for reference.
+ */
+class LoDTensor : public BaseTensor {
+ public:
+  LoDTensor() = delete;
+
+  LoDTensor(const LoDTensor&) = delete;
+  LoDTensor& operator=(const LoDTensor&) = delete;
+  LoDTensor(LoDTensor&&) = delete;
+  LoDTensor& operator=(LoDTensor&&) = delete;
+
+  explicit LoDTensor(TensorMeta meta, const LoD& lod) : lod_(lod) {}
+
+  void set_lod(const LoD& lod) { lod_ = lod; }
+
+  const LoD& lod() const { return lod_; }
+
+  LoD* mutable_lod() { return &lod_; }
+
+ private:
+  LoD lod_;
+};
+
+}  // namespace pt
diff --git a/paddle/pten/core/tensor_impl_if.h b/paddle/pten/core/tensor_impl_if.h
index 0c0555ee46af4..f0ddb6243384a 100644
--- a/paddle/pten/core/tensor_impl_if.h
+++ b/paddle/pten/core/tensor_impl_if.h
@@ -15,7 +15,6 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/pten/core/backend.h"
-#include "paddle/pten/core/ddim.h"
 #include "paddle/pten/core/dtype.h"
 #include "paddle/pten/core/layout.h"
 
@@ -30,12 +29,12 @@ class Place;
 
 namespace pt {
 
-// TODO(chenweihang): DDim still link to framework, design abstract interface
-// of DDim?
+// TODO(chenweihang): Use the existing DDim directly?
+// or design a abstract interface of DDim?
 using DDim = paddle::framework::DDim;
 
-// TODO(chenweihang): Place still link to framework, design abstract interface
-// of place?
+// TODO(chenweihang): Use the existing Place directly?
+// or design a abstract interface of Place?
 using Place = paddle::platform::Place;
 
 /**
diff --git a/paddle/pten/core/tensor_meta.h b/paddle/pten/core/tensor_meta.h
index ab0e42c1bd1ef..441813f015e65 100644
--- a/paddle/pten/core/tensor_meta.h
+++ b/paddle/pten/core/tensor_meta.h
@@ -41,6 +41,17 @@ class InplaceVersion {
  */
 struct TensorMeta {
   TensorMeta() = delete;
+  TensorMeta(const TensorMeta&) = delete;
+  TensorMeta& operator=(const TensorMeta&) = delete;
+  // TensorMeta(TensorMeta&&) = delete;
+  TensorMeta& operator=(TensorMeta&&) = delete;
+
+  TensorMeta(TensorMeta&& meta)
+      : dims(meta.dims),
+        backend(meta.backend),
+        type(meta.type),
+        layout(meta.layout),
+        offset(meta.offset) {}
 
   // May introduce bug
   explicit TensorMeta(DDim dims) : dims(dims) {}

From 44bf926d28a5b315daa97c9838acb6e0255f19e7 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Mon, 12 Jul 2021 11:22:34 +0000
Subject: [PATCH 003/125] add dirs & sign xpu kernel

---
 paddle/fluid/framework/pten_utils.h     |  2 +-
 paddle/pten/api/src/CMakeLists.txt      |  0
 paddle/pten/core/base_tensor.cc         |  2 +-
 paddle/pten/core/device_context.h       | 19 ----------
 paddle/pten/core/lod_tensor.h           |  7 +++-
 paddle/pten/core/tensor.h               | 17 ++++++++-
 paddle/pten/core/tensor_meta.h          |  2 +-
 paddle/pten/cpu/CMakeLists.txt          |  0
 paddle/pten/cpu/math.h                  |  2 +-
 paddle/pten/cuda/CMakeLists.txt         |  0
 paddle/pten/cuda/math.h                 |  6 +++-
 paddle/pten/mkldnn/CMakeLists.txt       |  0
 paddle/pten/module/CMakeLists.txt       |  0
 paddle/pten/module/sign.h               |  2 +-
 paddle/pten/npu/CMakeLists.txt          |  0
 paddle/pten/{core/ddim.h => npu/math.h} | 14 ++++++--
 paddle/pten/xpu/CMakeLists.txt          |  0
 paddle/pten/xpu/math.h                  | 47 +++++++++++++++++++++++++
 18 files changed, 90 insertions(+), 30 deletions(-)
 create mode 100644 paddle/pten/api/src/CMakeLists.txt
 delete mode 100644 paddle/pten/core/device_context.h
 create mode 100644 paddle/pten/cpu/CMakeLists.txt
 create mode 100644 paddle/pten/cuda/CMakeLists.txt
 create mode 100644 paddle/pten/mkldnn/CMakeLists.txt
 create mode 100644 paddle/pten/module/CMakeLists.txt
 create mode 100644 paddle/pten/npu/CMakeLists.txt
 rename paddle/pten/{core/ddim.h => npu/math.h} (70%)
 create mode 100644 paddle/pten/xpu/CMakeLists.txt
 create mode 100644 paddle/pten/xpu/math.h

diff --git a/paddle/fluid/framework/pten_utils.h b/paddle/fluid/framework/pten_utils.h
index 3df999b554ce1..e16e8b012328d 100644
--- a/paddle/fluid/framework/pten_utils.h
+++ b/paddle/fluid/framework/pten_utils.h
@@ -32,7 +32,7 @@ std::shared_ptr<TensorImplT> MakeTensorImpl(const Tensor& tensor,
       pt::TensorMeta(tensor.dims(), pt::TransToPtenBackend(place),
                      pt::TransToPtenDataType(type),
                      pt::TransToPtenLayout(tensor.layout()), tensor.offset());
-  auto tensor_impl = std::make_shared<TensorImplT>(meta);
+  auto tensor_impl = std::make_shared<TensorImplT>(std::move(meta));
   if (holder != nullptr) {
     tensor_impl->template ShareAllocation(tensor.Holder());
   } else {
diff --git a/paddle/pten/api/src/CMakeLists.txt b/paddle/pten/api/src/CMakeLists.txt
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/paddle/pten/core/base_tensor.cc b/paddle/pten/core/base_tensor.cc
index d6189c5dc69a0..d52c40d38f578 100644
--- a/paddle/pten/core/base_tensor.cc
+++ b/paddle/pten/core/base_tensor.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "paddle/pten/core/base_tensor.h"
 #include "paddle/pten/core/convert_utils.h"
 
-// fluid headers [may be replaced by new impl]
+// See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/pten/core/device_context.h b/paddle/pten/core/device_context.h
deleted file mode 100644
index 0dee0e4690a36..0000000000000
--- a/paddle/pten/core/device_context.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-/**
- * TODO(chenweihang): Design DDim Interface for new Tensor
- */
diff --git a/paddle/pten/core/lod_tensor.h b/paddle/pten/core/lod_tensor.h
index 38ca81a136f5a..6b0b590e83cb9 100644
--- a/paddle/pten/core/lod_tensor.h
+++ b/paddle/pten/core/lod_tensor.h
@@ -14,11 +14,16 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/pten/compat/mixed_vector.h"
 #include "paddle/pten/core/base_tensor.h"
 
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/framework/mixed_vector.h"
+#include "paddle/fluid/platform/device_context.h"
+
 namespace pt {
 
+using Vector = paddle::framework::Vector;
+
 /*
  * LoD is short for Level of Details.
  *
diff --git a/paddle/pten/core/tensor.h b/paddle/pten/core/tensor.h
index ee07d2de05774..e3834797938a9 100644
--- a/paddle/pten/core/tensor.h
+++ b/paddle/pten/core/tensor.h
@@ -21,7 +21,22 @@ limitations under the License. */
 #include "paddle/pten/core/autograd_meta_if.h"
 #include "paddle/pten/core/tensor_impl_if.h"
 
-// fluid headers [may be replaced by new impl]
+/**
+ * [ Why still include the fluid headers? ]
+ *
+ * We hope to organize the basic implementation of Tensor and the logic related
+ * to Tensor operation into an independent library, which we call
+ * [Tensor Operation Library], so we extract or rewrite the original OpKernels.
+ *
+ * In the future, the training library, inference library and custom operators
+ * will link to this Tensor operation library.
+ *
+ * However, if we directly split the link relation, we need to make too many
+ * changes, which will affect the stability of the framework, so here we still
+ * rely on the implementation of the framework, which is a intermediate state.
+ * In the future, the necessary components will be moved to the this library,
+ * or the corresponding components will be re-implemented.
+ */
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/platform/place.h"
 
diff --git a/paddle/pten/core/tensor_meta.h b/paddle/pten/core/tensor_meta.h
index 441813f015e65..57f6cfd3aaafb 100644
--- a/paddle/pten/core/tensor_meta.h
+++ b/paddle/pten/core/tensor_meta.h
@@ -18,7 +18,7 @@ limitations under the License. */
 #include "paddle/pten/core/dtype.h"
 #include "paddle/pten/core/layout.h"
 
-// fluid headers [may be replaced by new impl]
+// See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/ddim.h"
 
 namespace pt {
diff --git a/paddle/pten/cpu/CMakeLists.txt b/paddle/pten/cpu/CMakeLists.txt
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/paddle/pten/cpu/math.h b/paddle/pten/cpu/math.h
index c3e29f8a56d3d..1894a97bc80e1 100644
--- a/paddle/pten/cpu/math.h
+++ b/paddle/pten/cpu/math.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/pten/core/base_tensor.h"
 #include "paddle/pten/module/sign.h"
 
-// fluid headers [may be replaced by new impl]
+// See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
 
 namespace pt {
diff --git a/paddle/pten/cuda/CMakeLists.txt b/paddle/pten/cuda/CMakeLists.txt
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/paddle/pten/cuda/math.h b/paddle/pten/cuda/math.h
index dcc3d6721eb6e..d14faa20a398d 100644
--- a/paddle/pten/cuda/math.h
+++ b/paddle/pten/cuda/math.h
@@ -14,10 +14,12 @@ limitations under the License. */
 
 #pragma once
 
+#ifdef PADDLE_WITH_CUDA
+
 #include "paddle/pten/core/base_tensor.h"
 #include "paddle/pten/module/sign.h"
 
-// fluid headers [may be replaced by new impl]
+// See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
 
 namespace pt {
@@ -32,3 +34,5 @@ void Sign(const CUDADeviceContext& dev_ctx,
 }
 
 }  // namespace pt
+
+#endif
diff --git a/paddle/pten/mkldnn/CMakeLists.txt b/paddle/pten/mkldnn/CMakeLists.txt
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/paddle/pten/module/CMakeLists.txt b/paddle/pten/module/CMakeLists.txt
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/paddle/pten/module/sign.h b/paddle/pten/module/sign.h
index 1217f7b4e0700..56dc2b3665629 100644
--- a/paddle/pten/module/sign.h
+++ b/paddle/pten/module/sign.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include "paddle/pten/core/base_tensor.h"
 
-// fluid headers [may be replaced by new impl]
+// See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/operators/eigen/eigen_function.h"
 
diff --git a/paddle/pten/npu/CMakeLists.txt b/paddle/pten/npu/CMakeLists.txt
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/paddle/pten/core/ddim.h b/paddle/pten/npu/math.h
similarity index 70%
rename from paddle/pten/core/ddim.h
rename to paddle/pten/npu/math.h
index 0dee0e4690a36..0d3a28bb658bb 100644
--- a/paddle/pten/core/ddim.h
+++ b/paddle/pten/npu/math.h
@@ -14,6 +14,14 @@ limitations under the License. */
 
 #pragma once
 
-/**
- * TODO(chenweihang): Design DDim Interface for new Tensor
- */
+#ifdef PADDLE_WITH_ASCEND_CL
+
+#include "paddle/pten/core/base_tensor.h"
+#include "paddle/pten/module/sign.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/device_context.h"
+
+namespace pt {}  // namespace pt
+
+#endif
diff --git a/paddle/pten/xpu/CMakeLists.txt b/paddle/pten/xpu/CMakeLists.txt
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/paddle/pten/xpu/math.h b/paddle/pten/xpu/math.h
new file mode 100644
index 0000000000000..c15023e210d12
--- /dev/null
+++ b/paddle/pten/xpu/math.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_XPU
+
+#include "paddle/pten/core/base_tensor.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/xpu_header.h"
+
+namespace pt {
+
+using XPUDeviceContext = paddle::platform::XPUDeviceContext;
+
+template <typename T>
+void Sign(const XPUDeviceContext& dev_ctx,
+          const BaseTensor& x,
+          BaseTensor* out) {
+  out->mutable_data<T>();
+  auto xpu_context = dev_ctx.x_context();
+  int r = xpu::activation_forward(xpu_context,
+                                  xpu::Activation_t::SIGN,
+                                  in.numel(),
+                                  in.data<T>(),
+                                  out->mutbale_data<T>());
+  PADDLE_ENFORCE_EQ(r,
+                    xpu::Error_t::SUCCESS,
+                    platform::errors::Fatal("XPU sign kernel error!"));
+}
+
+}  // namespace pt
+
+#endif

From b20689db7987f22e59bd7d072b8d7cd93c469c19 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Thu, 15 Jul 2021 07:12:56 +0000
Subject: [PATCH 004/125] add mean cpu&cuda kernel impl

---
 paddle/fluid/framework/eigen.h                |  9 ++
 paddle/fluid/operators/CMakeLists.txt         |  6 +-
 paddle/fluid/operators/mean_op.cu             | 48 +---------
 paddle/fluid/operators/mean_op.h              | 24 +++--
 paddle/fluid/operators/sign_op.h              |  8 +-
 paddle/pten/CMakeLists.txt                    | 24 +++++
 paddle/pten/api/CMakeLists.txt                |  8 ++
 paddle/pten/api/all.cc                        | 17 ++++
 paddle/pten/api/all.h                         | 21 +++++
 paddle/pten/api/dev/core.h                    | 17 ++++
 paddle/pten/api/dev/math.h                    | 19 ++++
 paddle/pten/api/{ => user}/src/CMakeLists.txt |  0
 paddle/pten/core/backend.h                    |  9 ++
 paddle/pten/core/base_tensor.cc               |  1 -
 paddle/pten/core/base_tensor.h                |  3 +
 paddle/pten/core/dtype.h                      |  7 +-
 paddle/pten/core/layout.h                     |  2 +-
 paddle/pten/core/lod_tensor.h                 |  2 +-
 paddle/pten/core/tensor_meta.h                | 16 ++--
 paddle/pten/core/tensor_status.h              | 47 ++++++++++
 paddle/pten/cpu/math.h                        | 21 +++++
 paddle/pten/cuda/CMakeLists.txt               |  1 +
 paddle/pten/cuda/math.cu                      | 89 +++++++++++++++++++
 paddle/pten/cuda/math.h                       | 17 ++++
 paddle/pten/hip/CMakeLists.txt                |  0
 25 files changed, 342 insertions(+), 74 deletions(-)
 create mode 100644 paddle/pten/api/CMakeLists.txt
 create mode 100644 paddle/pten/api/all.cc
 create mode 100644 paddle/pten/api/all.h
 create mode 100644 paddle/pten/api/dev/core.h
 create mode 100644 paddle/pten/api/dev/math.h
 rename paddle/pten/api/{ => user}/src/CMakeLists.txt (100%)
 create mode 100644 paddle/pten/core/tensor_status.h
 create mode 100644 paddle/pten/cuda/math.cu
 create mode 100644 paddle/pten/hip/CMakeLists.txt

diff --git a/paddle/fluid/framework/eigen.h b/paddle/fluid/framework/eigen.h
index e6f9085a5c7a4..be03a61643b62 100644
--- a/paddle/fluid/framework/eigen.h
+++ b/paddle/fluid/framework/eigen.h
@@ -159,6 +159,15 @@ struct EigenScalar {
   static ConstType From(const Tensor& tensor) {
     return ConstType(tensor.data<T>());
   }
+
+  // for pt::BaseTensor
+  static Type From(pt::BaseTensor& tensor) {  // NOLINT
+    return Type(const_cast<T*>(tensor.data<T>()));
+  }
+
+  static ConstType From(const pt::BaseTensor& tensor) {
+    return ConstType(tensor.data<T>());
+  }
 };
 
 // Define Tensor with 32-bit index.
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 7fc64f63b0ea3..af55d5d5679a6 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -75,7 +75,7 @@ if(WITH_UNITY_BUILD)
 endif()
 
 register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op lstm_op run_program_op eye_op recurrent_op
-        sync_batch_norm_op sign_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
+        sync_batch_norm_op sign_op mean_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
 
 op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS})
 
@@ -94,11 +94,13 @@ else()
     op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
 endif()
 
-op_library(sign_op DEPS ${OP_HEADER_DEPS} base_tensor)
 op_library(lstm_op DEPS ${OP_HEADER_DEPS} lstm_compute)
 op_library(eye_op DEPS ${OP_HEADER_DEPS})
 op_library(recurrent_op DEPS ${OP_HEADER_DEPS})
 
+op_library(sign_op DEPS ${OP_HEADER_DEPS} pten)
+op_library(mean_op DEPS ${OP_HEADER_DEPS} pten)
+
 set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
 
 if (WITH_DGC)
diff --git a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu
index 430036bc67de7..ffb667ba974b8 100644
--- a/paddle/fluid/operators/mean_op.cu
+++ b/paddle/fluid/operators/mean_op.cu
@@ -25,17 +25,6 @@ namespace cub = hipcub;
 namespace paddle {
 namespace operators {
 
-template <typename T>
-struct DivideFunctor {
-  HOSTDEVICE explicit inline DivideFunctor(int n)
-      : n_inv(static_cast<T>(1.0 / n)) {}
-
-  HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; }
-
- private:
-  T n_inv;
-};
-
 template <typename T>
 __global__ void MeanRunKernel(const T* in_data, T* out_data, int N) {
   int idx = blockDim.x * blockIdx.x + threadIdx.x;
@@ -45,37 +34,6 @@ __global__ void MeanRunKernel(const T* in_data, T* out_data, int N) {
   }
 }
 
-template <typename DeviceContext, typename T>
-class MeanCUDAKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* input = context.Input<Tensor>("X");
-    auto* output = context.Output<Tensor>("Out");
-
-    output->mutable_data<T>(context.GetPlace());
-    auto size_prob = input->numel();
-    const T* in_data = input->data<T>();
-    T* out_data = output->mutable_data<T>(context.GetPlace());
-    auto stream = context.cuda_device_context().stream();
-
-    DivideFunctor<T> transformer(size_prob);
-    cub::TransformInputIterator<T, DivideFunctor<T>, const T*> trans_x(
-        in_data, transformer);
-    size_t temp_storage_bytes = 0;
-
-    auto err = cub::DeviceReduce::Sum(nullptr, temp_storage_bytes, trans_x,
-                                      out_data, size_prob, stream);
-    PADDLE_ENFORCE_CUDA_SUCCESS(err);
-    framework::Tensor tmp;
-    auto* temp_storage = tmp.mutable_data<uint8_t>(
-        framework::make_ddim({static_cast<int64_t>(temp_storage_bytes)}),
-        context.GetPlace());
-    err = cub::DeviceReduce::Sum(temp_storage, temp_storage_bytes, trans_x,
-                                 out_data, size_prob, stream);
-    PADDLE_ENFORCE_CUDA_SUCCESS(err);
-  }
-};
-
 template <typename DeviceContext, typename T>
 class MeanCUDAGradKernel : public framework::OpKernel<T> {
  public:
@@ -105,9 +63,9 @@ class MeanCUDAGradKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 REGISTER_OP_CUDA_KERNEL(
-    mean, ops::MeanCUDAKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::MeanCUDAKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::MeanCUDAKernel<paddle::platform::CUDADeviceContext, plat::float16>);
+    mean, ops::MeanKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MeanKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::MeanKernel<paddle::platform::CUDADeviceContext, plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
     mean_grad,
     ops::MeanCUDAGradKernel<paddle::platform::CUDADeviceContext, float>,
diff --git a/paddle/fluid/operators/mean_op.h b/paddle/fluid/operators/mean_op.h
index 4780150751bf6..4dcdb41420b28 100644
--- a/paddle/fluid/operators/mean_op.h
+++ b/paddle/fluid/operators/mean_op.h
@@ -15,6 +15,11 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/pten_utils.h"
+
+// only can include the headers in paddle/pten/api dirs
+#include "paddle/pten/api/dev/core.h"
+#include "paddle/pten/api/dev/math.h"
 
 namespace paddle {
 namespace operators {
@@ -31,17 +36,20 @@ template <typename DeviceContext, typename T>
 class MeanKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* input = context.Input<Tensor>("X");
-    auto* output = context.Output<Tensor>("Out");
+    auto* x = context.Input<Tensor>("X");
+    auto* out = context.Output<Tensor>("Out");
+    auto& dev_ctx = context.device_context<DeviceContext>();
 
-    output->mutable_data<T>(context.GetPlace());
+    auto pt_x =
+        framework::MakeTensorImpl<pt::BaseTensor>(*x, x->place(), x->type());
+    auto pt_out =
+        framework::MakeTensorImpl<pt::BaseTensor>(*out, x->place(), x->type());
 
-    auto X = EigenVector<T>::Flatten(*input);
-    auto y = EigenScalar<T>::From(*output);
-    auto& place =
-        *context.template device_context<DeviceContext>().eigen_device();
+    // call new kernel
+    pt::Mean<T>(dev_ctx, *pt_x.get(), pt_out.get());
 
-    y.device(place) = X.mean();
+    // share pt_out data to out
+    framework::ShareTensorImpl(pt_out.get(), out);
   }
 };
 
diff --git a/paddle/fluid/operators/sign_op.h b/paddle/fluid/operators/sign_op.h
index e2f5790602818..10c583295d26f 100644
--- a/paddle/fluid/operators/sign_op.h
+++ b/paddle/fluid/operators/sign_op.h
@@ -16,12 +16,12 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/pten_utils.h"
 #include "paddle/fluid/operators/eigen/eigen_function.h"
 
-#include "paddle/fluid/framework/pten_utils.h"
-#include "paddle/pten/core/base_tensor.h"
-#include "paddle/pten/cpu/math.h"
-#include "paddle/pten/cuda/math.h"
+// only can include the headers in paddle/pten/api dirs
+#include "paddle/pten/api/dev/core.h"
+#include "paddle/pten/api/dev/math.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/pten/CMakeLists.txt b/paddle/pten/CMakeLists.txt
index ad6d4787c23e3..5407a8ec836c7 100644
--- a/paddle/pten/CMakeLists.txt
+++ b/paddle/pten/CMakeLists.txt
@@ -1 +1,25 @@
+# pten api
+add_subdirectory(api)
+# pten core components
 add_subdirectory(core)
+# pten kernels for diff device
+add_subdirectory(cpu)
+if(WITH_GPU)
+  add_subdirectory(cuda)
+endif()
+if(WITH_ROCM)
+  add_subdirectory(hip)
+endif()
+if(WITH_MKLDNN)
+  add_subdirectory(mkldnn)
+endif()
+if(WITH_ASCEND_CL)
+  add_subdirectory(npu)
+endif()
+if(WITH_XPU)
+  add_subdirectory(xpu)
+endif()
+# pten public functors
+add_subdirectory(module)
+# pten tests
+add_subdirectory(tests)
diff --git a/paddle/pten/api/CMakeLists.txt b/paddle/pten/api/CMakeLists.txt
new file mode 100644
index 0000000000000..5262784d244a0
--- /dev/null
+++ b/paddle/pten/api/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_subdirectory(user/src)
+
+set(PTEN_DEPS base_tensor convert_utils)
+if(WITH_GPU)
+  set(PTEN_DEPS ${PTEN_DEPS} math_cuda)
+endif()
+
+cc_library(pten SRCS all.cc DEPS ${PTEN_DEPS})
diff --git a/paddle/pten/api/all.cc b/paddle/pten/api/all.cc
new file mode 100644
index 0000000000000..4141f5127fe31
--- /dev/null
+++ b/paddle/pten/api/all.cc
@@ -0,0 +1,17 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/api/all.h"
+
+namespace pt {}  // namespace pt
diff --git a/paddle/pten/api/all.h b/paddle/pten/api/all.h
new file mode 100644
index 0000000000000..342e51c128cd8
--- /dev/null
+++ b/paddle/pten/api/all.h
@@ -0,0 +1,21 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+// develop apis
+#include "paddle/pten/api/dev/core.h"
+#include "paddle/pten/api/dev/math.h"
+
+// user apis
diff --git a/paddle/pten/api/dev/core.h b/paddle/pten/api/dev/core.h
new file mode 100644
index 0000000000000..7c8982e132676
--- /dev/null
+++ b/paddle/pten/api/dev/core.h
@@ -0,0 +1,17 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/core/base_tensor.h"
diff --git a/paddle/pten/api/dev/math.h b/paddle/pten/api/dev/math.h
new file mode 100644
index 0000000000000..4de11d5e33a6b
--- /dev/null
+++ b/paddle/pten/api/dev/math.h
@@ -0,0 +1,19 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/cpu/math.h"
+#include "paddle/pten/cuda/math.h"
+#include "paddle/pten/xpu/math.h"
diff --git a/paddle/pten/api/src/CMakeLists.txt b/paddle/pten/api/user/src/CMakeLists.txt
similarity index 100%
rename from paddle/pten/api/src/CMakeLists.txt
rename to paddle/pten/api/user/src/CMakeLists.txt
diff --git a/paddle/pten/core/backend.h b/paddle/pten/core/backend.h
index ce7499fae38e8..78c2361c61e6f 100644
--- a/paddle/pten/core/backend.h
+++ b/paddle/pten/core/backend.h
@@ -17,7 +17,16 @@ limitations under the License. */
 namespace pt {
 
 /**
+ * [ Why need Backend? ]
+ *
  * Backend not only means place. Backend is a superset of place.
+ *
+ * Place cannot indicate the difference in calculation methods on the device,
+ * but in order to make the boundary of the kernel clearer and the function
+ * more specific, we need to distinguish the calculation method.
+ *
+ * Such as the kernel for CUDA device, it is native CUDA kernel, or kernel
+ * by calling CUDNN library.
  */
 enum class Backend {
   kUndef = 0,
diff --git a/paddle/pten/core/base_tensor.cc b/paddle/pten/core/base_tensor.cc
index d52c40d38f578..8b8e5a85e6b6f 100644
--- a/paddle/pten/core/base_tensor.cc
+++ b/paddle/pten/core/base_tensor.cc
@@ -23,7 +23,6 @@ limitations under the License. */
 
 namespace pt {
 
-// TODO(chenweihang): design abstract interface of each place?
 using CPUPlace = paddle::platform::CPUPlace;
 using CUDAPlace = paddle::platform::CUDAPlace;
 using CUDAPinnedPlace = paddle::platform::CUDAPinnedPlace;
diff --git a/paddle/pten/core/base_tensor.h b/paddle/pten/core/base_tensor.h
index 320ab441c86ed..ac1905d696158 100644
--- a/paddle/pten/core/base_tensor.h
+++ b/paddle/pten/core/base_tensor.h
@@ -18,6 +18,7 @@ limitations under the License. */
 
 #include "paddle/pten/core/tensor_impl_if.h"
 #include "paddle/pten/core/tensor_meta.h"
+#include "paddle/pten/core/tensor_status.h"
 
 namespace paddle {
 namespace memory {
@@ -115,6 +116,8 @@ class BaseTensor : public TensorImplInterface {
   std::shared_ptr<Allocation> memory_;
   // The Tensor meta data
   TensorMeta meta_;
+  // The Tensor status data
+  // TensorStatus status_;
 };
 
 }  // namespace pt
diff --git a/paddle/pten/core/dtype.h b/paddle/pten/core/dtype.h
index 04376ce24f6e0..3879dfdd14399 100644
--- a/paddle/pten/core/dtype.h
+++ b/paddle/pten/core/dtype.h
@@ -17,14 +17,17 @@ limitations under the License. */
 namespace pt {
 
 /**
+ * [ Why need new data type? ]
+ *
+ * The Var data type design in framework.proto is confusing, maybe we need
+ * polish the VarType in framework.proto.
+ *
  * We need to ensure that the operator library is relatively independent
  * and does not depend on the framework. Therefore, before calling the kernel
  * in the Tensor operation library inside the framework, the internal
  * data type needs to be converted to the data type in the Tensor operation
  * library.
  *
- * The data type design in proto is confusing, maybe we need polish the
- * VarType in framework.proto.
  */
 enum class DataType {
   kUndef = 0,
diff --git a/paddle/pten/core/layout.h b/paddle/pten/core/layout.h
index ae6c578e74ca3..7b8882fe30251 100644
--- a/paddle/pten/core/layout.h
+++ b/paddle/pten/core/layout.h
@@ -23,7 +23,7 @@ namespace pt {
  * layout needs to be converted to the data type in the Tensor operation
  * library.
  *
- * Here we also can use the DataLayout in framework, they are all enum classes
+ * Here we also can use the DataLayout in framework, they are all enum classes.
  */
 enum class Layout {
   kUndef = 0,
diff --git a/paddle/pten/core/lod_tensor.h b/paddle/pten/core/lod_tensor.h
index 6b0b590e83cb9..0eb5f1769bbfc 100644
--- a/paddle/pten/core/lod_tensor.h
+++ b/paddle/pten/core/lod_tensor.h
@@ -47,7 +47,7 @@ using LoD = std::vector<Vector<size_t>>;
  * Note: LoDTensor (Level of details Tensor)
  * see https://en.wikipedia.org/wiki/Level_of_details for reference.
  */
-class LoDTensor : public BaseTensor {
+class LoDTensor final : public BaseTensor {
  public:
   LoDTensor() = delete;
 
diff --git a/paddle/pten/core/tensor_meta.h b/paddle/pten/core/tensor_meta.h
index 57f6cfd3aaafb..2e0996c5a7e65 100644
--- a/paddle/pten/core/tensor_meta.h
+++ b/paddle/pten/core/tensor_meta.h
@@ -23,16 +23,12 @@ limitations under the License. */
 
 namespace pt {
 
-/*
-class InplaceVersion {
- public:
- private:
-};
-*/
-
 /**
- * The Meta data member of TensorImpl.
- * It holds Tensor description information and status information.
+ * The Meta data member of BaseTensor.
+ *
+ * Here the `meta` represents information describing the basic features and
+ * data features of Tensor, and does not include the status information of
+ * Tensor
  *
  * Note: TensorMeta is a struct, the members are named like
  * ordinary nonmember variables, such as `type` instead of `type_`.
@@ -53,7 +49,7 @@ struct TensorMeta {
         layout(meta.layout),
         offset(meta.offset) {}
 
-  // May introduce bug
+  // Bad constructor, may introduce bug
   explicit TensorMeta(DDim dims) : dims(dims) {}
 
   // Compatible Contructor
diff --git a/paddle/pten/core/tensor_status.h b/paddle/pten/core/tensor_status.h
new file mode 100644
index 0000000000000..be98e31a27630
--- /dev/null
+++ b/paddle/pten/core/tensor_status.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/core/backend.h"
+#include "paddle/pten/core/dtype.h"
+#include "paddle/pten/core/layout.h"
+
+namespace pt {
+
+/**
+ * The Status data member of BaseTensor.
+ *
+ * Here the `static` represents information describing the status of Tensor,
+ * such as version counter, or other bool status members.
+ *
+ * Note: TensorStatus is a struct, the members are named like
+ * ordinary nonmember variables, such as `type` instead of `type_`.
+ * And we direct access its members, in addition to constructor, destructor
+ * and functions for setting data members, can not provide other functions.
+ *
+ * Note: Impl later
+ */
+struct TensorStatus {
+  TensorStatus() = default;
+
+  TensorStatus(const TensorStatus&) = delete;
+  TensorStatus& operator=(const TensorStatus&) = delete;
+  TensorStatus(TensorStatus&&) = delete;
+  TensorStatus& operator=(TensorStatus&&) = delete;
+
+  // InplaceVersion inplace_version_counter{0};
+};
+
+}  // namespace pt
diff --git a/paddle/pten/cpu/math.h b/paddle/pten/cpu/math.h
index 1894a97bc80e1..bf123ad2851a2 100644
--- a/paddle/pten/cpu/math.h
+++ b/paddle/pten/cpu/math.h
@@ -18,10 +18,20 @@ limitations under the License. */
 #include "paddle/pten/module/sign.h"
 
 // See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/platform/device_context.h"
 
 namespace pt {
 
+template <typename T,
+          int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenScalar = paddle::framework::EigenScalar<T, MajorType, IndexType>;
+template <typename T,
+          int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = paddle::framework::EigenVector<T, MajorType, IndexType>;
+
 using CPUDeviceContext = paddle::platform::CPUDeviceContext;
 
 template <typename T>
@@ -31,4 +41,15 @@ void Sign(const CPUDeviceContext& dev_ctx,
   module::Sign<CPUDeviceContext, T>(dev_ctx, x, out);
 }
 
+template <typename T>
+void Mean(const CPUDeviceContext& dev_ctx,
+          const BaseTensor& x,
+          BaseTensor* out) {
+  out->mutable_data<T>();
+  auto x_data = EigenVector<T>::Flatten(x);
+  auto y_data = EigenScalar<T>::From(*out);
+  auto& place = *dev_ctx.eigen_device();
+  y_data.device(place) = x_data.mean();
+}
+
 }  // namespace pt
diff --git a/paddle/pten/cuda/CMakeLists.txt b/paddle/pten/cuda/CMakeLists.txt
index e69de29bb2d1d..7ad6ae7c489ce 100644
--- a/paddle/pten/cuda/CMakeLists.txt
+++ b/paddle/pten/cuda/CMakeLists.txt
@@ -0,0 +1 @@
+nv_library(math_cuda SRCS math.cu DEPS device_context base_tensor convert_utils)
diff --git a/paddle/pten/cuda/math.cu b/paddle/pten/cuda/math.cu
new file mode 100644
index 0000000000000..66b55e7da134f
--- /dev/null
+++ b/paddle/pten/cuda/math.cu
@@ -0,0 +1,89 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/cuda/math.h"
+
+#ifdef __NVCC__
+#include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#endif
+
+#include "paddle/pten/core/convert_utils.h"
+
+namespace pt {
+
+/**
+ * Util Functors
+ */
+
+template <typename T>
+struct DivideFunctor {
+  HOSTDEVICE explicit inline DivideFunctor(int n)
+      : n_inv(static_cast<T>(1.0 / n)) {}
+
+  HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; }
+
+ private:
+  T n_inv;
+};
+
+/**
+ * Kernels
+ */
+
+template <typename T>
+void MeanCUDA(const CUDADeviceContext& dev_ctx,
+              const BaseTensor& x,
+              BaseTensor* out) {
+  auto size_prob = x.numel();
+  const T* x_data = x.data<T>();
+  T* out_data = out->mutable_data<T>();
+  auto stream = dev_ctx.stream();
+
+  DivideFunctor<T> transformer(size_prob);
+  cub::TransformInputIterator<T, DivideFunctor<T>, const T*> trans_x(
+      x_data, transformer);
+  size_t temp_storage_bytes = 0;
+
+  auto err = cub::DeviceReduce::Sum(
+      nullptr, temp_storage_bytes, trans_x, out_data, size_prob, stream);
+  PADDLE_ENFORCE_CUDA_SUCCESS(err);
+
+  // TODO(chenweihang): maybe too complicated
+  pt::TensorMeta meta(
+      paddle::framework::make_ddim({static_cast<int64_t>(temp_storage_bytes)}),
+      pt::TransToPtenBackend(dev_ctx.GetPlace()),
+      x.type(),
+      x.layout(),
+      0);
+  pt::BaseTensor tmp(std::move(meta));
+  auto* temp_storage = tmp.mutable_data<uint8_t>();
+  err = cub::DeviceReduce::Sum(
+      temp_storage, temp_storage_bytes, trans_x, out_data, size_prob, stream);
+  PADDLE_ENFORCE_CUDA_SUCCESS(err);
+}
+
+template void MeanCUDA<float>(const CUDADeviceContext& dev_ctx,
+                              const BaseTensor& x,
+                              BaseTensor* out);
+template void MeanCUDA<double>(const CUDADeviceContext& dev_ctx,
+                               const BaseTensor& x,
+                               BaseTensor* out);
+template void MeanCUDA<paddle::platform::float16>(
+    const CUDADeviceContext& dev_ctx, const BaseTensor& x, BaseTensor* out);
+
+}  // namespace pt
diff --git a/paddle/pten/cuda/math.h b/paddle/pten/cuda/math.h
index d14faa20a398d..6d78ac3839a3d 100644
--- a/paddle/pten/cuda/math.h
+++ b/paddle/pten/cuda/math.h
@@ -33,6 +33,23 @@ void Sign(const CUDADeviceContext& dev_ctx,
   module::Sign<CUDADeviceContext, T>(dev_ctx, x, out);
 }
 
+// TODO(chenweihang): Perhaps the Kernel call should not be implemented by
+// calling functions, but by finding the Kernel call method from the global
+// KernelMap. For a kernel like cuda, if you have to call functions through
+// include header files, there will be many more function declarations and
+// redundant function call
+template <typename T>
+void MeanCUDA(const CUDADeviceContext& dev_ctx,
+              const BaseTensor& x,
+              BaseTensor* out);
+
+template <typename T>
+void Mean(const CUDADeviceContext& dev_ctx,
+          const BaseTensor& x,
+          BaseTensor* out) {
+  MeanCUDA<T>(dev_ctx, x, out);
+}
+
 }  // namespace pt
 
 #endif
diff --git a/paddle/pten/hip/CMakeLists.txt b/paddle/pten/hip/CMakeLists.txt
new file mode 100644
index 0000000000000..e69de29bb2d1d

From 79d2a1a0e291264446e5e2f017cf769e94b6e54e Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Thu, 15 Jul 2021 08:21:15 +0000
Subject: [PATCH 005/125] move sign & mean xpu & npu kernel

---
 paddle/fluid/operators/mean_op.cc     | 13 ++++++++
 paddle/fluid/operators/mean_op_npu.cc | 28 -----------------
 paddle/fluid/operators/mean_op_xpu.cc | 20 ------------
 paddle/fluid/operators/sign_op.cc     |  7 +++++
 paddle/fluid/operators/sign_op_xpu.cc | 44 ---------------------------
 paddle/pten/api/dev/math.h            |  1 +
 paddle/pten/inferdtype/CMakeLists.txt |  0
 paddle/pten/infershape/CMakeLists.txt |  0
 paddle/pten/npu/math.h                | 23 ++++++++++++--
 paddle/pten/xpu/math.h                | 26 +++++++++++-----
 10 files changed, 61 insertions(+), 101 deletions(-)
 delete mode 100644 paddle/fluid/operators/sign_op_xpu.cc
 create mode 100644 paddle/pten/inferdtype/CMakeLists.txt
 create mode 100644 paddle/pten/infershape/CMakeLists.txt

diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc
index 764529a15b6a2..0ec9a39cb6850 100644
--- a/paddle/fluid/operators/mean_op.cc
+++ b/paddle/fluid/operators/mean_op.cc
@@ -100,3 +100,16 @@ REGISTER_OP_CPU_KERNEL(
 REGISTER_OP_CPU_KERNEL(
     mean_grad, ops::MeanGradKernel<paddle::platform::CPUDeviceContext, float>,
     ops::MeanGradKernel<paddle::platform::CPUDeviceContext, double>);
+
+#ifdef PADDLE_WITH_XPU
+REGISTER_OP_XPU_KERNEL(
+    mean, ops::MeanKernel<paddle::platform::XPUDeviceContext, float>);
+#endif
+
+#ifdef PADDLE_WITH_ASCEND_CL
+REGISTER_OP_NPU_KERNEL(
+    mean, ops::MeanNPUKernel<paddle::platform::NPUDeviceContext, int>,
+    ops::MeanNPUKernel<paddle::platform::NPUDeviceContext, float>,
+    ops::MeanNPUKernel<paddle::platform::NPUDeviceContext, double>,
+    ops::MeanNPUKernel<paddle::platform::NPUDeviceContext, plat::float16>)
+#endif
diff --git a/paddle/fluid/operators/mean_op_npu.cc b/paddle/fluid/operators/mean_op_npu.cc
index ab0a3336b361f..be52a23d82ff6 100644
--- a/paddle/fluid/operators/mean_op_npu.cc
+++ b/paddle/fluid/operators/mean_op_npu.cc
@@ -16,29 +16,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename DeviceContext, typename T>
-class MeanNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<framework::LoDTensor>("X");
-    auto* out = ctx.Output<framework::LoDTensor>("Out");
-
-    std::vector<int> axes;
-
-    framework::NPUAttributeMap attr_input = {{"keep_dims", false},
-                                             {"axes", axes}};
-
-    out->mutable_data<T>(ctx.GetPlace());
-
-    const auto& runner = NpuOpRunner("ReduceMeanD", {*x}, {*out}, attr_input);
-
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    runner.Run(stream);
-  }
-};
-
 template <typename DeviceContext, typename T>
 class MeanGradNPUKernel : public framework::OpKernel<T> {
  public:
@@ -90,11 +67,6 @@ class MeanGradNPUKernel : public framework::OpKernel<T> {
 
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
-REGISTER_OP_NPU_KERNEL(
-    mean, ops::MeanNPUKernel<paddle::platform::NPUDeviceContext, int>,
-    ops::MeanNPUKernel<paddle::platform::NPUDeviceContext, float>,
-    ops::MeanNPUKernel<paddle::platform::NPUDeviceContext, double>,
-    ops::MeanNPUKernel<paddle::platform::NPUDeviceContext, plat::float16>)
 
 REGISTER_OP_NPU_KERNEL(
     mean_grad, ops::MeanGradNPUKernel<paddle::platform::NPUDeviceContext, int>,
diff --git a/paddle/fluid/operators/mean_op_xpu.cc b/paddle/fluid/operators/mean_op_xpu.cc
index 71bcc4be15ce5..58220bf79a8ed 100644
--- a/paddle/fluid/operators/mean_op_xpu.cc
+++ b/paddle/fluid/operators/mean_op_xpu.cc
@@ -21,24 +21,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename DeviceContext, typename T>
-class MeanXPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* input = context.Input<Tensor>("X");
-    auto* output = context.Output<Tensor>("Out");
-    output->mutable_data<T>(context.GetPlace());
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    const float* x_data = input->data<float>();
-    float* y_data = output->data<float>();
-    int r = xpu::mean(dev_ctx.x_context(), x_data, y_data, input->numel());
-    PADDLE_ENFORCE_EQ(
-        r, xpu::Error_t::SUCCESS,
-        platform::errors::External(
-            "XPU kernel error, Mean op execution not succeed, error code=%d",
-            r));
-  }
-};
 template <typename DeviceContext, typename T>
 class MeanGradXPUKernel : public framework::OpKernel<T> {
  public:
@@ -64,8 +46,6 @@ class MeanGradXPUKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_XPU_KERNEL(
-    mean, ops::MeanXPUKernel<paddle::platform::XPUDeviceContext, float>);
 REGISTER_OP_XPU_KERNEL(
     mean_grad,
     ops::MeanGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/sign_op.cc b/paddle/fluid/operators/sign_op.cc
index 6207c33f9d629..8620cec8cf62d 100644
--- a/paddle/fluid/operators/sign_op.cc
+++ b/paddle/fluid/operators/sign_op.cc
@@ -71,9 +71,16 @@ REGISTER_OP_CPU_KERNEL(
     sign, ops::SignKernel<paddle::platform::CPUDeviceContext, float>,
     ops::SignKernel<paddle::platform::CPUDeviceContext, double>);
 
+#ifdef PADDLE_WITH_CUDA
 REGISTER_OP_CUDA_KERNEL(
     sign,
     paddle::operators::SignKernel<paddle::platform::CUDADeviceContext, float>,
     paddle::operators::SignKernel<paddle::platform::CUDADeviceContext, double>,
     paddle::operators::SignKernel<paddle::platform::CUDADeviceContext,
                                   paddle::platform::float16>);
+#endif
+
+#ifdef PADDLE_WITH_XPU
+REGISTER_OP_XPU_KERNEL(
+    sign, ops::SignKernel<paddle::platform::XPUDeviceContext, float>);
+#endif
diff --git a/paddle/fluid/operators/sign_op_xpu.cc b/paddle/fluid/operators/sign_op_xpu.cc
deleted file mode 100644
index 86fe826c659ef..0000000000000
--- a/paddle/fluid/operators/sign_op_xpu.cc
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_XPU
-
-#include "paddle/fluid/operators/sign_op.h"
-#include "paddle/fluid/platform/xpu_header.h"
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class SignXPUKernel : public framework::OpKernel<T> {
- public:
-  virtual void Compute(const framework::ExecutionContext& context) const {
-    auto* out = context.Output<framework::Tensor>("Out");
-    auto* in = context.Input<framework::Tensor>("X");
-    out->mutable_data<T>(in->place());
-    auto xpu_context = context.device_context<DeviceContext>().x_context();
-    int r = xpu::activation_forward(xpu_context, xpu::Activation_t::SIGN,
-                                    in->numel(), in->data<T>(), out->data<T>());
-    PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
-                      platform::errors::Fatal("XPU sign kernel error!"));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_XPU_KERNEL(
-    sign, ops::SignXPUKernel<paddle::platform::XPUDeviceContext, float>);
-
-#endif
diff --git a/paddle/pten/api/dev/math.h b/paddle/pten/api/dev/math.h
index 4de11d5e33a6b..a15389d112958 100644
--- a/paddle/pten/api/dev/math.h
+++ b/paddle/pten/api/dev/math.h
@@ -16,4 +16,5 @@ limitations under the License. */
 
 #include "paddle/pten/cpu/math.h"
 #include "paddle/pten/cuda/math.h"
+#include "paddle/pten/npu/math.h"
 #include "paddle/pten/xpu/math.h"
diff --git a/paddle/pten/inferdtype/CMakeLists.txt b/paddle/pten/inferdtype/CMakeLists.txt
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/paddle/pten/infershape/CMakeLists.txt b/paddle/pten/infershape/CMakeLists.txt
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/paddle/pten/npu/math.h b/paddle/pten/npu/math.h
index 0d3a28bb658bb..c534045f1901b 100644
--- a/paddle/pten/npu/math.h
+++ b/paddle/pten/npu/math.h
@@ -17,11 +17,30 @@ limitations under the License. */
 #ifdef PADDLE_WITH_ASCEND_CL
 
 #include "paddle/pten/core/base_tensor.h"
-#include "paddle/pten/module/sign.h"
 
 // See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/operators/npu_op_runner.h"
 #include "paddle/fluid/platform/device_context.h"
 
-namespace pt {}  // namespace pt
+namespace pt {
+
+using NPUDeviceContext = paddle::platfrom::NPUDeviceContext;
+
+template <typename T>
+void Mean(const NPUDeviceContext& dev_ctx,
+          const BaseTensor& x,
+          BaseTensor* out) {
+  std::vector<int> axes;
+  framework::NPUAttributeMap attr_input = {{"keep_dims", false},
+                                           {"axes", axes}};
+  out->mutable_data<T>();
+  const auto& runner = NpuOpRunner("ReduceMeanD", {x}, {*out}, attr_input);
+  auto stream =
+      ctx.template device_context<paddle::platform::NPUDeviceContext>()
+          .stream();
+  runner.Run(stream);
+}
+
+}  // namespace pt
 
 #endif
diff --git a/paddle/pten/xpu/math.h b/paddle/pten/xpu/math.h
index c15023e210d12..e91bd65fae6bc 100644
--- a/paddle/pten/xpu/math.h
+++ b/paddle/pten/xpu/math.h
@@ -30,18 +30,30 @@ template <typename T>
 void Sign(const XPUDeviceContext& dev_ctx,
           const BaseTensor& x,
           BaseTensor* out) {
-  out->mutable_data<T>();
-  auto xpu_context = dev_ctx.x_context();
-  int r = xpu::activation_forward(xpu_context,
-                                  xpu::Activation_t::SIGN,
-                                  in.numel(),
-                                  in.data<T>(),
-                                  out->mutbale_data<T>());
+  T* out_data = out->mutable_data<T>();
+  auto xpu_ctx = dev_ctx.x_context();
+  int r = xpu::activation_forward(
+      xpu_ctx, xpu::Activation_t::SIGN, in.numel(), in.data<T>(), out_data);
   PADDLE_ENFORCE_EQ(r,
                     xpu::Error_t::SUCCESS,
                     platform::errors::Fatal("XPU sign kernel error!"));
 }
 
+template <typename T>
+void Mean(const XPUDeviceContext& dev_ctx,
+          const BaseTensor& x,
+          BaseTensor* out) {
+  T* out_data = out->mutable_data<T>();
+  auto xpu_ctx = dev_ctx.x_context();
+  const T* x_data = x.Inputdata<T>();
+  int r = xpu::mean(xpu_ctx, x_data, out_data, x.numel());
+  PADDLE_ENFORCE_EQ(
+      r,
+      xpu::Error_t::SUCCESS,
+      platform::errors::External(
+          "XPU kernel error, Mean op execution not succeed, error code=%d", r));
+}
+
 }  // namespace pt
 
 #endif

From 434136f1dcdf9c7ae9903eae6e849d0ddb1ce39b Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Fri, 16 Jul 2021 11:37:16 +0000
Subject: [PATCH 006/125] add selected_rows basic impl

---
 paddle/pten/api/CMakeLists.txt        |  2 +-
 paddle/pten/core/CMakeLists.txt       |  2 +
 paddle/pten/core/lod_tensor.cc        | 17 ++++++++
 paddle/pten/core/lod_tensor.h         |  4 +-
 paddle/pten/core/selected_rows.cc     | 17 ++++++++
 paddle/pten/core/selected_rows.h      | 56 +++++++++++++++++++++++++++
 paddle/pten/core/tensor_meta.h        |  2 +-
 paddle/pten/tests/CMakeLists.txt      |  1 +
 paddle/pten/tests/base_tensor_test.cc | 45 +++++++++++++++++++++
 9 files changed, 142 insertions(+), 4 deletions(-)
 create mode 100644 paddle/pten/core/lod_tensor.cc
 create mode 100644 paddle/pten/core/selected_rows.cc
 create mode 100644 paddle/pten/tests/base_tensor_test.cc

diff --git a/paddle/pten/api/CMakeLists.txt b/paddle/pten/api/CMakeLists.txt
index 5262784d244a0..523a70569a348 100644
--- a/paddle/pten/api/CMakeLists.txt
+++ b/paddle/pten/api/CMakeLists.txt
@@ -1,6 +1,6 @@
 add_subdirectory(user/src)
 
-set(PTEN_DEPS base_tensor convert_utils)
+set(PTEN_DEPS convert_utils base_tensor pten_lod_tensor pten_selected_rows)
 if(WITH_GPU)
   set(PTEN_DEPS ${PTEN_DEPS} math_cuda)
 endif()
diff --git a/paddle/pten/core/CMakeLists.txt b/paddle/pten/core/CMakeLists.txt
index 85203251d6a7a..95b1f5986029f 100644
--- a/paddle/pten/core/CMakeLists.txt
+++ b/paddle/pten/core/CMakeLists.txt
@@ -1,2 +1,4 @@
 cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place)
 cc_library(base_tensor SRCS base_tensor.cc DEPS enforce data_type ddim allocator place convert_utils)
+cc_library(pten_lod_tensor SRCS lod_tensor.cc DEPS base_tensor)
+cc_library(pten_selected_rows SRCS lod_tensor.cc DEPS base_tensor)
diff --git a/paddle/pten/core/lod_tensor.cc b/paddle/pten/core/lod_tensor.cc
new file mode 100644
index 0000000000000..9f348d9b1332b
--- /dev/null
+++ b/paddle/pten/core/lod_tensor.cc
@@ -0,0 +1,17 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/core/lod_tensor.h"
+
+namespace pt {}  // namespace pt
diff --git a/paddle/pten/core/lod_tensor.h b/paddle/pten/core/lod_tensor.h
index 0eb5f1769bbfc..b4495013432f3 100644
--- a/paddle/pten/core/lod_tensor.h
+++ b/paddle/pten/core/lod_tensor.h
@@ -18,7 +18,6 @@ limitations under the License. */
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/mixed_vector.h"
-#include "paddle/fluid/platform/device_context.h"
 
 namespace pt {
 
@@ -56,7 +55,8 @@ class LoDTensor final : public BaseTensor {
   LoDTensor(LoDTensor&&) = delete;
   LoDTensor& operator=(LoDTensor&&) = delete;
 
-  explicit LoDTensor(TensorMeta meta, const LoD& lod) : lod_(lod) {}
+  explicit LoDTensor(const LoD& lod, TensorMeta&& meta)
+      : lod_(lod), BaseTensor(meta) {}
 
   void set_lod(const LoD& lod) { lod_ = lod; }
 
diff --git a/paddle/pten/core/selected_rows.cc b/paddle/pten/core/selected_rows.cc
new file mode 100644
index 0000000000000..ec70dd0e8cdbe
--- /dev/null
+++ b/paddle/pten/core/selected_rows.cc
@@ -0,0 +1,17 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/core/selected_rows.h"
+
+namespace pt {}  // namespace pt
diff --git a/paddle/pten/core/selected_rows.h b/paddle/pten/core/selected_rows.h
index e1a22f3269ecb..9aec9d605c76a 100644
--- a/paddle/pten/core/selected_rows.h
+++ b/paddle/pten/core/selected_rows.h
@@ -13,3 +13,59 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+
+#include <algorithm>
+#include <memory>
+#include <mutex>  // NOLINT
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "paddle/pten/core/base_tensor.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/framework/mixed_vector.h"
+#include "paddle/fluid/framework/rw_lock.h"
+
+namespace pt {
+
+using Vector = paddle::framework::Vector;
+
+/**
+ * SelectedRows: compatible with SelectedRows in fluid and related operators.
+ */
+class SelectedRows final : public BaseTensor {
+ public:
+  SelectedRows() = delete;
+
+  SelectedRows(const SelectedRows&) = delete;
+  SelectedRows& operator=(const SelectedRows&) = delete;
+  SelectedRows(SelectedRows&&) = delete;
+  SelectedRows& operator=(SelectedRows&&) = delete;
+
+  SelectedRows(const std::vector<int64_t>& rows,
+               int64_t height,
+               TensorMeta&& meta)
+      : rows_(rows), height_(height), BaseTensor(meta) {}
+
+  const Vector<int64_t>& rows() const { return rows_; }
+
+  Vector<int64_t>* mutable_rows() { return &rows_; }
+
+  void set_rows(const Vector<int64_t>& rows)()
+
+      int64_t height() const {
+    return height_;
+  }
+
+  void set_height(int64_t height) { height_ = height; }
+
+ private:
+  Vector<int64_t> rows_;
+  int64_t height_;
+
+  std::unordered_map<int64_t, int64_t> id_to_index_;
+  std::unique_ptr<RWLock> rwlock_{nullptr};
+};
+
+}  // namespace pt
diff --git a/paddle/pten/core/tensor_meta.h b/paddle/pten/core/tensor_meta.h
index 2e0996c5a7e65..febb6600c5a9c 100644
--- a/paddle/pten/core/tensor_meta.h
+++ b/paddle/pten/core/tensor_meta.h
@@ -50,7 +50,7 @@ struct TensorMeta {
         offset(meta.offset) {}
 
   // Bad constructor, may introduce bug
-  explicit TensorMeta(DDim dims) : dims(dims) {}
+  // explicit TensorMeta(DDim dims) : dims(dims) {}
 
   // Compatible Contructor
   TensorMeta(const DDim& dims,
diff --git a/paddle/pten/tests/CMakeLists.txt b/paddle/pten/tests/CMakeLists.txt
index e69de29bb2d1d..dda192ff8b6a4 100644
--- a/paddle/pten/tests/CMakeLists.txt
+++ b/paddle/pten/tests/CMakeLists.txt
@@ -0,0 +1 @@
+cc_test(base_tensor_test SRCS base_tensor_test.cc DEPS base_tensor)
diff --git a/paddle/pten/tests/base_tensor_test.cc b/paddle/pten/tests/base_tensor_test.cc
new file mode 100644
index 0000000000000..58e6bc05ab94e
--- /dev/null
+++ b/paddle/pten/tests/base_tensor_test.cc
@@ -0,0 +1,45 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/core/base_tensor.h"
+
+#include <gtest/gtest.h>
+
+namespace framework = paddle::framework;
+using DDim = paddle::framework::DDim;
+
+TEST(BaseTensor, Constructor) {
+  pt::TensorMeta meta(framework::make_ddim({5, 10}),
+                      pt::Backend::kCPU,
+                      pt::DataType::kFLOAT32,
+                      pt::Layout::kNCHW,
+                      0UL);
+  pt::BaseTensor tensor(std::move(meta));
+  ASSERT_EQ(tensor.dims().size(), 2);
+  ASSERT_EQ(tensor.backend(), pt::Backend::kCPU);
+  ASSERT_EQ(tensor.type(), pt::DataType::kFLOAT32);
+  ASSERT_EQ(tensor.layout(), pt::Layout::kNCHW);
+}
+
+TEST(BaseTensor, Dims) {
+  // impl later
+}
+
+TEST(BaseTensor, Place) {
+  // impl later
+}
+
+TEST(BaseTensor, Data) {
+  // impl later
+}

From 6c6ee22b4af121d0203ea9dd160f7504713b598e Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Tue, 27 Jul 2021 08:39:34 +0000
Subject: [PATCH 007/125] refactor design, BaseTensor to DenseTensor, etc.

---
 paddle/fluid/framework/eigen.h                |  25 ++--
 paddle/fluid/framework/pten_utils.h           |  17 +--
 paddle/fluid/framework/tensor.h               |   2 -
 paddle/fluid/operators/CMakeLists.txt         |   8 +-
 paddle/fluid/operators/mean_op.h              |   4 +-
 paddle/fluid/operators/scale_op.cc            |  14 ++
 paddle/fluid/operators/scale_op.h             |  39 ++---
 paddle/fluid/operators/scale_op_npu.cc        |  72 ----------
 paddle/fluid/operators/scale_op_xpu.cc        |  66 ---------
 paddle/fluid/operators/sign_op.h              |   4 +-
 paddle/pten/api/CMakeLists.txt                |   4 +-
 paddle/pten/api/dev/core.h                    |   2 +-
 paddle/pten/api/dev/math.h                    |   2 +
 paddle/pten/api/{user => }/src/CMakeLists.txt |   0
 paddle/pten/core/CMakeLists.txt               |  11 +-
 paddle/pten/core/base_tensor.h                | 123 ----------------
 paddle/pten/core/convert_utils.cc             |  20 ++-
 paddle/pten/core/convert_utils.h              |   4 +-
 .../core/{base_tensor.cc => dense_tensor.cc}  |  89 ++++++------
 paddle/pten/core/dense_tensor.h               | 135 ++++++++++++++++++
 paddle/pten/core/layout.h                     |   2 +-
 paddle/pten/core/lod_tensor.cc                |  17 ---
 paddle/pten/core/lod_tensor.h                 |  71 ---------
 paddle/pten/core/scalar_tensor.h              |   4 +-
 paddle/pten/core/selected_rows.h              |  51 ++++---
 paddle/pten/core/spatial_tensor.h             |  49 +++++++
 paddle/pten/core/tensor.h                     |  39 ++---
 paddle/pten/core/tensor_impl_if.h             |  36 ++---
 paddle/pten/core/tensor_meta.h                |  96 ++++++++++++-
 paddle/pten/core/tensor_status.h              |  23 ++-
 paddle/pten/cpu/math.h                        |  42 +++++-
 paddle/pten/cuda/CMakeLists.txt               |   2 +-
 paddle/pten/cuda/math.cu                      |  30 ++--
 paddle/pten/cuda/math.h                       |  25 ++--
 paddle/pten/module/scale.h                    |  51 +++++++
 paddle/pten/module/sign.h                     |   4 +-
 paddle/pten/npu/math.h                        |  43 +++++-
 paddle/pten/selected_rows/CMakeLists.txt      |   0
 paddle/pten/selected_rows/math.h              |  44 ++++++
 paddle/pten/tests/CMakeLists.txt              |   2 +-
 ...se_tensor_test.cc => dense_tensor_test.cc} |  24 ++--
 paddle/pten/xpu/math.h                        |  39 ++++-
 42 files changed, 736 insertions(+), 599 deletions(-)
 delete mode 100644 paddle/fluid/operators/scale_op_npu.cc
 delete mode 100644 paddle/fluid/operators/scale_op_xpu.cc
 rename paddle/pten/api/{user => }/src/CMakeLists.txt (100%)
 delete mode 100644 paddle/pten/core/base_tensor.h
 rename paddle/pten/core/{base_tensor.cc => dense_tensor.cc} (63%)
 create mode 100644 paddle/pten/core/dense_tensor.h
 delete mode 100644 paddle/pten/core/lod_tensor.cc
 delete mode 100644 paddle/pten/core/lod_tensor.h
 create mode 100644 paddle/pten/core/spatial_tensor.h
 create mode 100644 paddle/pten/module/scale.h
 create mode 100644 paddle/pten/selected_rows/CMakeLists.txt
 create mode 100644 paddle/pten/selected_rows/math.h
 rename paddle/pten/tests/{base_tensor_test.cc => dense_tensor_test.cc} (64%)

diff --git a/paddle/fluid/framework/eigen.h b/paddle/fluid/framework/eigen.h
index be03a61643b62..ad76889a9a7d6 100644
--- a/paddle/fluid/framework/eigen.h
+++ b/paddle/fluid/framework/eigen.h
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "unsupported/Eigen/CXX11/Tensor"
 
-#include "paddle/pten/core/base_tensor.h"
+#include "paddle/pten/core/dense_tensor.h"
 
 namespace paddle {
 namespace framework {
@@ -70,25 +70,25 @@ struct EigenTensor {
     return From(tensor, tensor.dims_);
   }
 
-  // for pt::BaseTensor
-  static Type From(pt::BaseTensor& tensor, DDim dims) {  // NOLINT
+  // for pt::DenseTensor
+  static Type From(pt::DenseTensor& tensor, DDim dims) {  // NOLINT
     // why tensor.data<T>() not work?
     // return Type(const_cast<T*>(reinterpret_cast<const T*>(tensor.data())),
     // EigenDim<D>::From(dims));
     return Type(const_cast<T*>(tensor.data<T>()), EigenDim<D>::From(dims));
   }
 
-  static Type From(pt::BaseTensor& tensor) {  // NOLINT
+  static Type From(pt::DenseTensor& tensor) {  // NOLINT
     return From(tensor, tensor.dims());
   }  // NOLINT
 
-  static ConstType From(const pt::BaseTensor& tensor, DDim dims) {
+  static ConstType From(const pt::DenseTensor& tensor, DDim dims) {
     // return ConstType(reinterpret_cast<const T*>(tensor.data()),
     // EigenDim<D>::From(dims));
     return ConstType(tensor.data<T>(), EigenDim<D>::From(dims));
   }
 
-  static ConstType From(const pt::BaseTensor& tensor) {
+  static ConstType From(const pt::DenseTensor& tensor) {
     return From(tensor, tensor.dims());
   }
 };
@@ -134,13 +134,14 @@ struct EigenVector : public EigenTensor<T, 1, MajorType, IndexType> {
     return EigenVector::From(tensor, {product(tensor.dims_)});
   }
 
-  // for pt::BaseTensor
-  static typename EigenVector::Type Flatten(pt::BaseTensor& tensor) {  // NOLINT
+  // for pt::DenseTensor
+  static typename EigenVector::Type Flatten(
+      pt::DenseTensor& tensor) {  // NOLINT
     return EigenVector::From(tensor, {product(tensor.dims())});
   }
 
   static typename EigenVector::ConstType Flatten(
-      const pt::BaseTensor& tensor) {  // NOLINT
+      const pt::DenseTensor& tensor) {  // NOLINT
     return EigenVector::From(tensor, {product(tensor.dims())});
   }
 };
@@ -160,12 +161,12 @@ struct EigenScalar {
     return ConstType(tensor.data<T>());
   }
 
-  // for pt::BaseTensor
-  static Type From(pt::BaseTensor& tensor) {  // NOLINT
+  // for pt::DenseTensor
+  static Type From(pt::DenseTensor& tensor) {  // NOLINT
     return Type(const_cast<T*>(tensor.data<T>()));
   }
 
-  static ConstType From(const pt::BaseTensor& tensor) {
+  static ConstType From(const pt::DenseTensor& tensor) {
     return ConstType(tensor.data<T>());
   }
 };
diff --git a/paddle/fluid/framework/pten_utils.h b/paddle/fluid/framework/pten_utils.h
index e16e8b012328d..85a345b9a3796 100644
--- a/paddle/fluid/framework/pten_utils.h
+++ b/paddle/fluid/framework/pten_utils.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/pten/core/base_tensor.h"
 #include "paddle/pten/core/convert_utils.h"
+#include "paddle/pten/core/dense_tensor.h"
 
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/place.h"
@@ -28,11 +28,11 @@ std::shared_ptr<TensorImplT> MakeTensorImpl(const Tensor& tensor,
                                             const platform::Place& place,
                                             proto::VarType::Type type) {
   auto holder = tensor.Holder();
-  auto meta =
-      pt::TensorMeta(tensor.dims(), pt::TransToPtenBackend(place),
-                     pt::TransToPtenDataType(type),
-                     pt::TransToPtenLayout(tensor.layout()), tensor.offset());
-  auto tensor_impl = std::make_shared<TensorImplT>(std::move(meta));
+  auto tensor_impl = std::make_shared<TensorImplT>(
+      std::unique_ptr<pt::TensorMeta>(new pt::TensorMeta(
+          tensor.dims(), pt::TransToPtenBackend(place),
+          pt::TransToPtenDataType(type), pt::TransToPtenLayout(tensor.layout()),
+          tensor.offset())));
   if (holder != nullptr) {
     tensor_impl->template ShareAllocation(tensor.Holder());
   } else {
@@ -43,8 +43,9 @@ std::shared_ptr<TensorImplT> MakeTensorImpl(const Tensor& tensor,
 
 template <typename TensorImplT>
 void ShareTensorImpl(TensorImplT* tensor_impl, Tensor* out) {
-  out->set_type(pt::TransToProtoVarType(tensor_impl->template type()));
-  out->ResetHolder(tensor_impl->template MoveMemory());
+  out->ResetHolderWithType(
+      tensor_impl->template MoveMemory(),
+      pt::TransToProtoVarType(tensor_impl->template type()));
 }
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 5147d6c53fd80..539859c45c907 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -211,8 +211,6 @@ class Tensor {
     return holder_->place();
   }
 
-  void set_type(proto::VarType::Type type) { type_ = type; }
-
   proto::VarType::Type type() const {
     PADDLE_ENFORCE_NOT_NULL(
         holder_,
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index af55d5d5679a6..e3b3f84125814 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -74,8 +74,9 @@ if(WITH_UNITY_BUILD)
     include(unity_build_rule.cmake)
 endif()
 
+set(OP_HEADER_DEPS ${OP_HEADER_DEPS} pten)
 register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op lstm_op run_program_op eye_op recurrent_op
-        sync_batch_norm_op sign_op mean_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
+        sync_batch_norm_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
 
 op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS})
 
@@ -94,13 +95,10 @@ else()
     op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
 endif()
 
-op_library(lstm_op DEPS ${OP_HEADER_DEPS} lstm_compute)
+op_library(lstm_op DEPS ${OP_HEADER_DEPS}  lstm_compute)
 op_library(eye_op DEPS ${OP_HEADER_DEPS})
 op_library(recurrent_op DEPS ${OP_HEADER_DEPS})
 
-op_library(sign_op DEPS ${OP_HEADER_DEPS} pten)
-op_library(mean_op DEPS ${OP_HEADER_DEPS} pten)
-
 set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
 
 if (WITH_DGC)
diff --git a/paddle/fluid/operators/mean_op.h b/paddle/fluid/operators/mean_op.h
index 4dcdb41420b28..0404e050a573f 100644
--- a/paddle/fluid/operators/mean_op.h
+++ b/paddle/fluid/operators/mean_op.h
@@ -41,9 +41,9 @@ class MeanKernel : public framework::OpKernel<T> {
     auto& dev_ctx = context.device_context<DeviceContext>();
 
     auto pt_x =
-        framework::MakeTensorImpl<pt::BaseTensor>(*x, x->place(), x->type());
+        framework::MakeTensorImpl<pt::DenseTensor>(*x, x->place(), x->type());
     auto pt_out =
-        framework::MakeTensorImpl<pt::BaseTensor>(*out, x->place(), x->type());
+        framework::MakeTensorImpl<pt::DenseTensor>(*out, x->place(), x->type());
 
     // call new kernel
     pt::Mean<T>(dev_ctx, *pt_x.get(), pt_out.get());
diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc
index a195452791048..5d5efb42c279f 100644
--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -171,3 +171,17 @@ REGISTER_OP_CUDA_KERNEL(
                                    int64_t>,
     paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
                                    paddle::platform::float16>);
+
+#ifdef PADDLE_WITH_XPU
+REGISTER_OP_XPU_KERNEL(
+    scale,
+    paddle::operators::ScaleKernel<paddle::platform::XPUDeviceContext, float>);
+#endif
+
+#ifdef PADDLE_WITH_ASCEND_CL
+REGISTER_OP_NPU_KERNEL(
+    scale,
+    paddle::operators::ScaleKernel<paddle::platform::NPUDeviceContext, float>,
+    paddle::operators::ScaleKernel<paddle::platform::NPUDeviceContext,
+                                   paddle::platform::float16>);
+#endif
diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h
index 544f0a916681e..d4d517a7e87e7 100644
--- a/paddle/fluid/operators/scale_op.h
+++ b/paddle/fluid/operators/scale_op.h
@@ -14,9 +14,12 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/eigen/eigen_function.h"
+#include "paddle/fluid/framework/pten_utils.h"
+
+// only can include the headers in paddle/pten/api dirs
+#include "paddle/pten/api/dev/core.h"
+#include "paddle/pten/api/dev/math.h"
 
 namespace paddle {
 namespace operators {
@@ -39,13 +42,13 @@ class ScaleKernel : public framework::OpKernel<T> {
     auto* in_var = ctx.InputVar("X");
     auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var);
 
-    auto bias = static_cast<T>(ctx.Attr<float>("bias"));
+    auto bias = ctx.Attr<float>("bias");
     auto bias_after_scale = ctx.Attr<bool>("bias_after_scale");
 
-    auto scale = static_cast<T>(ctx.Attr<float>("scale"));
+    auto scale = ctx.Attr<float>("scale");
     if (ctx.HasInput("ScaleTensor")) {
       auto* scale_tensor = ctx.Input<framework::Tensor>("ScaleTensor");
-      scale = GetAttrFromTensor<T>(scale_tensor);
+      scale = static_cast<float>(GetAttrFromTensor<T>(scale_tensor));
     }
 
     auto* out_var = ctx.OutputVar("Out");
@@ -58,19 +61,19 @@ class ScaleKernel : public framework::OpKernel<T> {
 
     auto* out =
         framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var);
-    out->mutable_data<T>(in->place());
-
-    PADDLE_ENFORCE_EQ(in->dims(), out->dims(),
-                      paddle::platform::errors::InvalidArgument(
-                          "the input and output should have the same dim"
-                          "but input dim is %s, output dim is %s",
-                          in->dims(), out->dims()));
-
-    auto eigen_out = framework::EigenVector<T>::Flatten(*out);
-    auto eigen_in = framework::EigenVector<T>::Flatten(*in);
-    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
-    EigenScale<std::decay_t<decltype(dev)>, T>::Eval(
-        dev, eigen_out, eigen_in, scale, bias, bias_after_scale);
+    auto& dev_ctx = ctx.device_context<DeviceContext>();
+
+    auto pt_x = framework::MakeTensorImpl<pt::DenseTensor>(*in, in->place(),
+                                                           in->type());
+    auto pt_out = framework::MakeTensorImpl<pt::DenseTensor>(*out, in->place(),
+                                                             in->type());
+
+    // call new kernel
+    pt::Scale<T>(dev_ctx, *pt_x.get(), scale, bias, bias_after_scale,
+                 pt_out.get());
+
+    // share pt_out data to out
+    framework::ShareTensorImpl(pt_out.get(), out);
   }
 };
 
diff --git a/paddle/fluid/operators/scale_op_npu.cc b/paddle/fluid/operators/scale_op_npu.cc
deleted file mode 100644
index 6fb0e6d372745..0000000000000
--- a/paddle/fluid/operators/scale_op_npu.cc
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <memory>
-#include <string>
-
-#include "paddle/fluid/operators/npu_op_runner.h"
-#include "paddle/fluid/operators/scale_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class ScaleNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<framework::Tensor>("X");
-    auto* out = ctx.Output<framework::Tensor>("Out");
-    auto scale = static_cast<float>(ctx.Attr<float>("scale"));
-    auto bias = static_cast<float>(ctx.Attr<float>("bias"));
-    auto bias_after_scale = ctx.Attr<bool>("bias_after_scale");
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    float _power = 1.0;
-    VLOG(4) << "scale:" << scale << ", bias:" << bias
-            << " ,bias_after_scale:" << bias_after_scale;
-    if (bias_after_scale) {
-      out->mutable_data<T>(ctx.GetPlace());
-      const auto& runner =
-          NpuOpRunner("Power", {*x}, {*out},
-                      {{"power", _power}, {"scale", scale}, {"shift", bias}});
-
-      runner.Run(stream);
-    } else {
-      Tensor tmp_x(x->type());
-      tmp_x.Resize(x->dims());
-      tmp_x.mutable_data<T>(ctx.GetPlace());
-      const auto& runner_tmp =
-          NpuOpRunner("Adds", {*x}, {tmp_x}, {{"value", bias}});
-      runner_tmp.Run(stream);
-
-      out->mutable_data<T>(ctx.GetPlace());
-      float _bias = 0.0;
-      const auto& runner =
-          NpuOpRunner("Power", {tmp_x}, {*out},
-                      {{"power", _power}, {"scale", scale}, {"shift", _bias}});
-      runner.Run(stream);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_NPU_KERNEL(
-    scale, ops::ScaleNPUKernel<paddle::platform::NPUDeviceContext, float>,
-    ops::ScaleNPUKernel<paddle::platform::NPUDeviceContext,
-                        paddle::platform::float16>);
diff --git a/paddle/fluid/operators/scale_op_xpu.cc b/paddle/fluid/operators/scale_op_xpu.cc
deleted file mode 100644
index fdb90797b69db..0000000000000
--- a/paddle/fluid/operators/scale_op_xpu.cc
+++ /dev/null
@@ -1,66 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_XPU
-
-#include "paddle/fluid/operators/scale_op.h"
-#include <string>
-#include "paddle/fluid/platform/xpu_header.h"
-
-namespace paddle {
-namespace operators {
-template <typename DeviceContext, typename T>
-class ScaleXPUKernel : public framework::OpKernel<T> {
- public:
-  virtual void Compute(const framework::ExecutionContext& ctx) const {
-    auto* in_var = ctx.InputVar("X");
-    auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var);
-    auto scale = static_cast<T>(ctx.Attr<float>("scale"));
-    auto bias = static_cast<T>(ctx.Attr<float>("bias"));
-    auto bias_after_scale = ctx.Attr<bool>("bias_after_scale");
-    auto* out_var = ctx.OutputVar("Out");
-    if (in_var->IsType<framework::SelectedRows>() && in_var != out_var) {
-      auto& in_slr = in_var->Get<framework::SelectedRows>();
-      auto* out_slr = out_var->GetMutable<framework::SelectedRows>();
-      out_slr->set_rows(in_slr.rows());
-      out_slr->set_height(in_slr.height());
-    }
-    auto* out =
-        framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var);
-    out->mutable_data<T>(in->place());
-    PADDLE_ENFORCE_EQ(
-        in->dims(), out->dims(),
-        platform::errors::InvalidArgument("In and out should have the same dim,"
-                                          " expected %s, but got %s.",
-                                          in->dims().to_str().c_str(),
-                                          out->dims().to_str().c_str()));
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    int r =
-        xpu::scale(dev_ctx.x_context(), in->data<float>(), out->data<float>(),
-                   in->numel(), bias_after_scale, scale, bias);
-    PADDLE_ENFORCE_EQ(
-        r, XPU_SUCCESS,
-        platform::errors::External("XPU scale kernel return wrong value[%d %s]",
-                                   r, XPUAPIErrorMsg[r]));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_XPU_KERNEL(
-    scale, ops::ScaleXPUKernel<paddle::platform::XPUDeviceContext, float>);
-
-#endif
diff --git a/paddle/fluid/operators/sign_op.h b/paddle/fluid/operators/sign_op.h
index 10c583295d26f..8758c7c0ab33b 100644
--- a/paddle/fluid/operators/sign_op.h
+++ b/paddle/fluid/operators/sign_op.h
@@ -34,9 +34,9 @@ class SignKernel : public framework::OpKernel<T> {
     auto& dev_ctx = context.device_context<DeviceContext>();
 
     auto pt_x =
-        framework::MakeTensorImpl<pt::BaseTensor>(*x, x->place(), x->type());
+        framework::MakeTensorImpl<pt::DenseTensor>(*x, x->place(), x->type());
     auto pt_out =
-        framework::MakeTensorImpl<pt::BaseTensor>(*out, x->place(), x->type());
+        framework::MakeTensorImpl<pt::DenseTensor>(*out, x->place(), x->type());
 
     // call new kernel
     pt::Sign<T>(dev_ctx, *pt_x.get(), pt_out.get());
diff --git a/paddle/pten/api/CMakeLists.txt b/paddle/pten/api/CMakeLists.txt
index 523a70569a348..4f901ff7a0d12 100644
--- a/paddle/pten/api/CMakeLists.txt
+++ b/paddle/pten/api/CMakeLists.txt
@@ -1,6 +1,6 @@
-add_subdirectory(user/src)
+add_subdirectory(src)
 
-set(PTEN_DEPS convert_utils base_tensor pten_lod_tensor pten_selected_rows)
+set(PTEN_DEPS convert_utils dense_tensor selected_rows_tensor)
 if(WITH_GPU)
   set(PTEN_DEPS ${PTEN_DEPS} math_cuda)
 endif()
diff --git a/paddle/pten/api/dev/core.h b/paddle/pten/api/dev/core.h
index 7c8982e132676..f660306848dc2 100644
--- a/paddle/pten/api/dev/core.h
+++ b/paddle/pten/api/dev/core.h
@@ -14,4 +14,4 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/pten/core/base_tensor.h"
+#include "paddle/pten/core/dense_tensor.h"
diff --git a/paddle/pten/api/dev/math.h b/paddle/pten/api/dev/math.h
index a15389d112958..d00461f128dd7 100644
--- a/paddle/pten/api/dev/math.h
+++ b/paddle/pten/api/dev/math.h
@@ -14,7 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+// See Note: [ How do we organize the kernel directory ]
 #include "paddle/pten/cpu/math.h"
 #include "paddle/pten/cuda/math.h"
 #include "paddle/pten/npu/math.h"
+#include "paddle/pten/selected_rows/math.h"
 #include "paddle/pten/xpu/math.h"
diff --git a/paddle/pten/api/user/src/CMakeLists.txt b/paddle/pten/api/src/CMakeLists.txt
similarity index 100%
rename from paddle/pten/api/user/src/CMakeLists.txt
rename to paddle/pten/api/src/CMakeLists.txt
diff --git a/paddle/pten/core/CMakeLists.txt b/paddle/pten/core/CMakeLists.txt
index 95b1f5986029f..6d0e9297b3281 100644
--- a/paddle/pten/core/CMakeLists.txt
+++ b/paddle/pten/core/CMakeLists.txt
@@ -1,4 +1,9 @@
+IF(WITH_MKLDNN)
+    set(MKLDNN_CTX_DEPS mkldnn)
+ELSE()
+    set(MKLDNN_CTX_DEPS)
+ENDIF()
+
 cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place)
-cc_library(base_tensor SRCS base_tensor.cc DEPS enforce data_type ddim allocator place convert_utils)
-cc_library(pten_lod_tensor SRCS lod_tensor.cc DEPS base_tensor)
-cc_library(pten_selected_rows SRCS lod_tensor.cc DEPS base_tensor)
+cc_library(dense_tensor SRCS dense_tensor.cc DEPS enforce data_type ddim allocator place convert_utils ${MKLDNN_CTX_DEPS})
+cc_library(selected_rows_tensor SRCS selected_rows.cc DEPS dense_tensor)
diff --git a/paddle/pten/core/base_tensor.h b/paddle/pten/core/base_tensor.h
deleted file mode 100644
index ac1905d696158..0000000000000
--- a/paddle/pten/core/base_tensor.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <memory>
-
-#include "paddle/pten/core/tensor_impl_if.h"
-#include "paddle/pten/core/tensor_meta.h"
-#include "paddle/pten/core/tensor_status.h"
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-class Allocation;
-}
-}
-}
-
-namespace pt {
-
-// TODO(chenweihang): Allocation still link to framework, Redesign and
-// decoupled Allocation and Allocator?
-using Allocation = paddle::memory::allocation::Allocation;
-
-/**
- * The implementation of general Tensor (For CPU, CUDA, HIP, etc.), similar
- * to the Tensor in fluid, contains a pointer to Allocation and a series of
- * descriptive metadata required by Tensor.
- *
- * BaseTensor is still a base class, it may have mutiple inherited classes,
- * such as LoDTensor, SelectedRows, etc. The memory layout
- * of these inherited classes is consistent with the basic BaseTensor, except
- * that a small number of members are added to further specialize the
- * description of the tensor. For example, LoDTensor adds LoD information,
- * and SelectedRows adds rows and height information.
- * If the memory layout is different, it cannot be described based on the
- * general Allocation, and it needs to be directly inherited from
- * TensorImplInterface.
- *
- */
-class BaseTensor : public TensorImplInterface {
- public:
-  // Not allowed to initialize a tensor without descriptive metadata
-  BaseTensor() = delete;
-
-  BaseTensor(const BaseTensor&) = delete;
-  BaseTensor& operator=(const BaseTensor&) = delete;
-  BaseTensor(BaseTensor&&) = delete;
-  BaseTensor& operator=(BaseTensor&&) = delete;
-
-  /**
-   * If we still malloc memory by mutable_data,
-   * the BaseTensor doesn't need complicated constructor.
-   *
-   * Note: Tensor objects lacking meta information are not allowed to exist.
-   */
-  explicit BaseTensor(TensorMeta&& meta);
-
-  ~BaseTensor() override {}
-
-  /**
-   * Most of Tensor's methods need to have corresponding implementations
-   * in BaseTensor
-   */
-  int64_t numel() const override;
-
-  DDim dims() const override;
-
-  void resize(const DDim& dims) override;
-
-  DataType type() const override;
-
-  Layout layout() const override;
-
-  Place place() const override;
-
-  Backend backend() const override;
-
-  const void* data() const override;
-
-  void* mutable_data() override;
-
-  bool initialized() const override;
-
-  /**
-   * using base class template methods.
-   */
-  using TensorImplInterface::data;
-  using TensorImplInterface::mutable_data;
-
-  // For non-API interfaces, we still follow the C++ code style
-  void ShareAllocation(const std::shared_ptr<Allocation>& memory);
-
-  Place GetPlaceByBackend() const;
-
-  size_t MemorySize() const;
-
-  void CheckMemorySize() const;
-
-  std::shared_ptr<Allocation> MoveMemory();
-
- private:
-  // The actual Tensor storage holder
-  std::shared_ptr<Allocation> memory_;
-  // The Tensor meta data
-  TensorMeta meta_;
-  // The Tensor status data
-  // TensorStatus status_;
-};
-
-}  // namespace pt
diff --git a/paddle/pten/core/convert_utils.cc b/paddle/pten/core/convert_utils.cc
index 285db16f082d5..ddc2513d2a65d 100644
--- a/paddle/pten/core/convert_utils.cc
+++ b/paddle/pten/core/convert_utils.cc
@@ -22,6 +22,14 @@ Backend TransToPtenBackend(const paddle::platform::Place& place) {
     return Backend::kCPU;
   } else if (paddle::platform::is_gpu_place(place)) {
     return Backend::kCUDA;
+  } else if (paddle::platform::is_cuda_pinned_place(place)) {
+    return Backend::kCUDAPinned;
+  } else if (paddle::platform::is_xpu_place(place)) {
+    return Backend::kXPU;
+  } else if (paddle::platform::is_npu_place(place)) {
+    return Backend::kNPU;
+  } else if (paddle::platform::is_npu_pinned_place(place)) {
+    return Backend::kNPUPinned;
   } else {
     return Backend::kUndef;
   }
@@ -59,18 +67,18 @@ pt::DataType TransToPtenDataType(
   }
 }
 
-Layout TransToPtenLayout(const paddle::framework::DataLayout& layout) {
+DataLayout TransToPtenLayout(const paddle::framework::DataLayout& layout) {
   switch (layout) {
     case paddle::framework::DataLayout::kNHWC:
-      return Layout::kNHWC;
+      return DataLayout::kNHWC;
     case paddle::framework::DataLayout::kNCHW:
-      return Layout::kNCHW;
+      return DataLayout::kNCHW;
     case paddle::framework::DataLayout::kAnyLayout:
-      return Layout::kAny;
+      return DataLayout::kAny;
     case paddle::framework::DataLayout::kMKLDNN:
-      return Layout::kMKLDNN;
+      return DataLayout::kMKLDNN;
     default:
-      return Layout::kUndef;
+      return DataLayout::kUndef;
   }
 }
 
diff --git a/paddle/pten/core/convert_utils.h b/paddle/pten/core/convert_utils.h
index e5c325e6fd4c0..398ad61e3cd97 100644
--- a/paddle/pten/core/convert_utils.h
+++ b/paddle/pten/core/convert_utils.h
@@ -18,7 +18,7 @@ limitations under the License. */
 #include "paddle/pten/core/dtype.h"
 #include "paddle/pten/core/layout.h"
 
-// fluid headers [may be replaced by new impl]
+// See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/platform/place.h"
@@ -32,7 +32,7 @@ namespace pt {
 Backend TransToPtenBackend(const paddle::platform::Place& place);
 DataType TransToPtenDataType(
     const paddle::framework::proto::VarType::Type& dtype);
-Layout TransToPtenLayout(const paddle::framework::DataLayout& layout);
+DataLayout TransToPtenLayout(const paddle::framework::DataLayout& layout);
 paddle::framework::proto::VarType::Type TransToProtoVarType(
     const DataType& dtype);
 
diff --git a/paddle/pten/core/base_tensor.cc b/paddle/pten/core/dense_tensor.cc
similarity index 63%
rename from paddle/pten/core/base_tensor.cc
rename to paddle/pten/core/dense_tensor.cc
index 8b8e5a85e6b6f..f990351e24e31 100644
--- a/paddle/pten/core/base_tensor.cc
+++ b/paddle/pten/core/dense_tensor.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/pten/core/base_tensor.h"
+#include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/convert_utils.h"
 
 // See Note [ Why still include the fluid headers? ]
@@ -30,66 +30,64 @@ using XPUPlace = paddle::platform::XPUPlace;
 using NPUPlace = paddle::platform::NPUPlace;
 using NPUPinnedPlace = paddle::platform::NPUPinnedPlace;
 
-BaseTensor::BaseTensor(TensorMeta&& meta)
-    : meta_(std::forward<TensorMeta>(meta)) {}
-
-int64_t BaseTensor::numel() const { return product(meta_.dims); }
-
-DDim BaseTensor::dims() const { return meta_.dims; }
-
-void BaseTensor::resize(const DDim& dims) { meta_.dims = dims; }
-
-DataType BaseTensor::type() const { return meta_.type; }
-
-Layout BaseTensor::layout() const { return meta_.layout; }
-
-Place BaseTensor::place() const {
+Place DenseTensor::place() const {
   PADDLE_ENFORCE_NOT_NULL(
-      memory_,
+      allocation_,
       paddle::platform::errors::PreconditionNotMet(
           "Tensor not initialized yet when Tensor::place() is called."));
-  return memory_->place();
+  return allocation_->place();
 }
 
-Backend BaseTensor::backend() const { return meta_.backend; }
-
-bool BaseTensor::initialized() const { return memory_ != nullptr; }
-
 //----------------------------------------------------------------
 // Inner methods
 
-void BaseTensor::ShareAllocation(const std::shared_ptr<Allocation>& memory) {
+void DenseTensor::ShareAllocation(
+    const std::shared_ptr<Allocation>& allocation) {
   // This operation can be very slow!
   // std::shared_ptr reference count is atomic. increasing or decreasing
   // the reference count requires atomic increment or decrement.
   // This is hundred times slower than non-atomic increment/decrement
-  memory_ = memory;
+  allocation_ = allocation;
 }
 
 // TODO(chenweihang): Add other place branchs
-Place BaseTensor::GetPlaceByBackend() const {
-  switch (meta_.backend) {
+Place DenseTensor::GetPlaceByBackend() const {
+  switch (meta_->backend) {
     case Backend::kCPU:
       return CPUPlace();
+#ifdef PADDLE_WITH_CUDA
     case Backend::kCUDA:
       return CUDAPlace();
+    case Backend::kCUDAPinned:
+      return CUDAPinnedPlace();
+#endif
+#ifdef PADDLE_WITH_XPU
+    case Backend::kXPU:
+      return XPUPlace();
+#endif
+#ifdef PADDLE_WITH_NPU
+    case Backend::kNPU:
+      return NPUPlace();
+    case Backend::kNPUPinned:
+      return NPUPinnedPlace();
+#endif
     default:
       PADDLE_THROW(paddle::platform::errors::Unimplemented(
           "Unsupported Tensor backend."));
   }
 }
 
-size_t BaseTensor::MemorySize() const {
-  return memory_ == nullptr ? 0UL : memory_->size() - meta_.offset;
+size_t DenseTensor::MemorySize() const {
+  return allocation_ == nullptr ? 0UL : allocation_->size() - meta_->offset;
 }
 
-void BaseTensor::CheckMemorySize() const {
-  PADDLE_ENFORCE_NOT_NULL(memory_,
+void DenseTensor::CheckMemorySize() const {
+  PADDLE_ENFORCE_NOT_NULL(allocation_,
                           paddle::platform::errors::PreconditionNotMet(
                               "Tensor holds no memory. "
                               "Call Tensor::mutable_data firstly."));
   size_t size_of_type =
-      paddle::framework::SizeOfType(TransToProtoVarType(meta_.type));
+      paddle::framework::SizeOfType(TransToProtoVarType(meta_->type));
   PADDLE_ENFORCE_LE(
       numel() * size_of_type,
       MemorySize(),
@@ -102,17 +100,17 @@ void BaseTensor::CheckMemorySize() const {
           MemorySize()));
 }
 
-std::shared_ptr<Allocation> BaseTensor::MoveMemory() {
-  return std::move(memory_);
+std::shared_ptr<Allocation> DenseTensor::MoveMemory() {
+  return std::move(allocation_);
 }
 
-const void* BaseTensor::data() const {
+const void* DenseTensor::data() const {
   CheckMemorySize();
   return reinterpret_cast<const void*>(
-      reinterpret_cast<uintptr_t>(memory_->ptr()) + meta_.offset);
+      reinterpret_cast<uintptr_t>(allocation_->ptr()) + meta_->offset);
 }
 
-void* BaseTensor::mutable_data() {
+void* DenseTensor::mutable_data() {
   PADDLE_ENFORCE_GE(
       numel(),
       0,
@@ -122,22 +120,23 @@ void* BaseTensor::mutable_data() {
           dims(),
           "] now"));
   size_t size =
-      numel() * paddle::framework::SizeOfType(TransToProtoVarType(meta_.type));
+      numel() * paddle::framework::SizeOfType(TransToProtoVarType(meta_->type));
   auto place = GetPlaceByBackend();
-  if (memory_ == nullptr) {
-    memory_.reset();
-    memory_ = paddle::memory::AllocShared(place, size);
+  if (allocation_ == nullptr) {
+    allocation_.reset();
+    allocation_ = paddle::memory::AllocShared(place, size);
   } else {
-    LOG(WARNING) << "When call mutable_data, BaseTensor has been initialized.";
-    if (!(memory_->place() == place) || memory_->size() < size + meta_.offset) {
-      memory_.reset();
-      memory_ = paddle::memory::AllocShared(place, size);
+    LOG(WARNING) << "When call mutable_data, DenseTensor has been initialized.";
+    if (!(allocation_->place() == place) ||
+        allocation_->size() < size + meta_->offset) {
+      allocation_.reset();
+      allocation_ = paddle::memory::AllocShared(place, size);
     } else {
       // do nothing
     }
   }
-  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(memory_->ptr()) +
-                                 meta_.offset);
+  return reinterpret_cast<void*>(
+      reinterpret_cast<uintptr_t>(allocation_->ptr()) + meta_->offset);
 }
 
 }  // namespace pt
diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h
new file mode 100644
index 0000000000000..09bed4ca702e5
--- /dev/null
+++ b/paddle/pten/core/dense_tensor.h
@@ -0,0 +1,135 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+
+#include "paddle/pten/core/tensor_impl_if.h"
+#include "paddle/pten/core/tensor_meta.h"
+#include "paddle/pten/core/tensor_status.h"
+
+namespace paddle {
+namespace memory {
+namespace allocation {
+class Allocation;
+}
+}
+}
+
+namespace pt {
+
+// TODO(chenweihang): Allocation still link to framework, Redesign and
+// decoupled Allocation and Allocator?
+using Allocation = paddle::memory::allocation::Allocation;
+
+/**
+ * The implementation of general Tensor (For CPU, CUDA, HIP, etc.), similar
+ * to the Tensor in fluid, contains a pointer to Allocation and a series of
+ * descriptive metadata and status required by Tensor.
+ *
+ * DenseTensor is still a base class, it may have inherited classes.
+ *
+ * The memory layout of these inherited classes is consistent with the
+ * basic DenseTensor, except that a small number of members are added to
+ * further specialize the description of the tensor.
+ *
+ * If the memory layout is different, it cannot be described based on the
+ * general Allocation, and it needs to be directly inherited from
+ * TensorImplInterface.
+ */
+class DenseTensor : public TensorImplInterface {
+ public:
+  // Not allowed to initialize a tensor without descriptive metadata
+  DenseTensor() = delete;
+
+  DenseTensor(const DenseTensor&) = delete;
+  DenseTensor& operator=(const DenseTensor&) = delete;
+  DenseTensor(DenseTensor&&) = delete;
+  DenseTensor& operator=(DenseTensor&&) = delete;
+
+  /**
+   * If we still malloc memory by mutable_data,
+   * the DenseTensor doesn't need complicated constructor.
+   *
+   * Note: Tensor objects lacking meta information are not allowed to exist.
+   */
+  explicit DenseTensor(std::unique_ptr<TensorMeta> meta,
+                       std::unique_ptr<TensorStatus> status =
+                           std::unique_ptr<TensorStatus>(new TensorStatus()))
+      : meta_(std::move(meta)), status_(std::move(status)) {}
+
+  ~DenseTensor() override {}
+
+  int64_t numel() const override { return meta_->numel; }
+
+  DDim dims() const override { return meta_->dims; }
+
+  DataType type() const override { return meta_->type; }
+
+  DataLayout layout() const override { return meta_->layout; }
+
+  Place place() const override;
+
+  Backend backend() const override { return meta_->backend; }
+
+  bool initialized() const override { return allocation_ != nullptr; }
+
+  /* Data Access Methods */
+
+  const void* data() const;
+
+  void* mutable_data();
+
+  template <typename T>
+  const T* data() const {
+    static_assert(std::is_pod<T>::value,
+                  "T must be POD when call Tensor.data<T>().");
+    return reinterpret_cast<const T*>(data());
+  }
+
+  // mutable_data does not hold arguments.
+  // Before calling mutable_data, please make sure that Tensor has maintained
+  // the correct meta and status.
+  template <typename T>
+  T* mutable_data() {
+    static_assert(std::is_pod<T>::value,
+                  "T must be POD when call Tensor.mutable_data<T>().");
+    return reinterpret_cast<T*>(mutable_data());
+  }
+
+  // For non-API interfaces, we still follow the C++ code style
+
+  void Resize(const DDim& dims) { meta_->dims = dims; }
+
+  void ShareAllocation(const std::shared_ptr<Allocation>& allocation);
+
+  Place GetPlaceByBackend() const;
+
+  size_t MemorySize() const;
+
+  void CheckMemorySize() const;
+
+  std::shared_ptr<Allocation> MoveMemory();
+
+ private:
+  // The actual Tensor storage holder
+  std::shared_ptr<Allocation> allocation_;
+  // The Tensor meta data
+  std::unique_ptr<TensorMeta> meta_;
+  // The Tensor status data
+  std::unique_ptr<TensorStatus> status_;
+};
+
+}  // namespace pt
diff --git a/paddle/pten/core/layout.h b/paddle/pten/core/layout.h
index 7b8882fe30251..2f4e95f36fdfd 100644
--- a/paddle/pten/core/layout.h
+++ b/paddle/pten/core/layout.h
@@ -25,7 +25,7 @@ namespace pt {
  *
  * Here we also can use the DataLayout in framework, they are all enum classes.
  */
-enum class Layout {
+enum class DataLayout {
   kUndef = 0,
   kAny,
   kNHWC,
diff --git a/paddle/pten/core/lod_tensor.cc b/paddle/pten/core/lod_tensor.cc
deleted file mode 100644
index 9f348d9b1332b..0000000000000
--- a/paddle/pten/core/lod_tensor.cc
+++ /dev/null
@@ -1,17 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/pten/core/lod_tensor.h"
-
-namespace pt {}  // namespace pt
diff --git a/paddle/pten/core/lod_tensor.h b/paddle/pten/core/lod_tensor.h
deleted file mode 100644
index b4495013432f3..0000000000000
--- a/paddle/pten/core/lod_tensor.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/pten/core/base_tensor.h"
-
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/framework/mixed_vector.h"
-
-namespace pt {
-
-using Vector = paddle::framework::Vector;
-
-/*
- * LoD is short for Level of Details.
- *
- * - in a level, each element indicates relative offset of the lower level
- * - the first element should be 0 and that indicates that this sequence start
- * from 0
- * - each sequence's begin and end(no-inclusive) is level[id, id+1]
- *
- * For example:
- *    3-level LoD stores
- *
- *    0 2 3
- *    0 2 4 7
- *    0 2 5 7 10 12 15 20
- */
-using LoD = std::vector<Vector<size_t>>;
-
-/**
- * LoDTensor: compatible with LoDTensor in fluid and related operators.
- *
- * Note: LoDTensor (Level of details Tensor)
- * see https://en.wikipedia.org/wiki/Level_of_details for reference.
- */
-class LoDTensor final : public BaseTensor {
- public:
-  LoDTensor() = delete;
-
-  LoDTensor(const LoDTensor&) = delete;
-  LoDTensor& operator=(const LoDTensor&) = delete;
-  LoDTensor(LoDTensor&&) = delete;
-  LoDTensor& operator=(LoDTensor&&) = delete;
-
-  explicit LoDTensor(const LoD& lod, TensorMeta&& meta)
-      : lod_(lod), BaseTensor(meta) {}
-
-  void set_lod(const LoD& lod) { lod_ = lod; }
-
-  const LoD& lod() const { return lod_; }
-
-  LoD* mutable_lod() { return &lod_; }
-
- private:
-  LoD lod_;
-};
-
-}  // namespace pt
diff --git a/paddle/pten/core/scalar_tensor.h b/paddle/pten/core/scalar_tensor.h
index 59fe21aff2484..e9836633ba465 100644
--- a/paddle/pten/core/scalar_tensor.h
+++ b/paddle/pten/core/scalar_tensor.h
@@ -14,6 +14,6 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/pten/core/base_tensor.h"
+#include "paddle/pten/core/dense_tensor.h"
 
-class LoDTensor : public BaseTensor {};
+class LoDTensor : public DenseTensor {};
diff --git a/paddle/pten/core/selected_rows.h b/paddle/pten/core/selected_rows.h
index 9aec9d605c76a..86ba8414f972f 100644
--- a/paddle/pten/core/selected_rows.h
+++ b/paddle/pten/core/selected_rows.h
@@ -21,7 +21,7 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 
-#include "paddle/pten/core/base_tensor.h"
+#include "paddle/pten/core/dense_tensor.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/mixed_vector.h"
@@ -29,38 +29,55 @@ limitations under the License. */
 
 namespace pt {
 
-using Vector = paddle::framework::Vector;
+template <typename T>
+using Vector = paddle::framework::Vector<T>;
+using RWLock = paddle::framework::RWLock;
 
 /**
- * SelectedRows: compatible with SelectedRows in fluid and related operators.
+ * SelectedRowsTensor: compatible with SelectedRows in fluid and related
+ * operators.
+ *
+ * SelectedRowsTensor is not a typical design of sparse Tensor, and may
+ * no longer be recommended for use in the future, and there may be new
+ * SparseTensor later.
  */
-class SelectedRows final : public BaseTensor {
+
+// TODO(chenweihang): add other methods later
+
+class SelectedRowsTensor : public TensorImplInterface {
  public:
-  SelectedRows() = delete;
+  SelectedRowsTensor() = delete;
+
+  SelectedRowsTensor(const SelectedRowsTensor&) = delete;
+  SelectedRowsTensor& operator=(const SelectedRowsTensor&) = delete;
+  SelectedRowsTensor(SelectedRowsTensor&&) = delete;
+  SelectedRowsTensor& operator=(SelectedRowsTensor&&) = delete;
+
+  SelectedRowsTensor(std::unique_ptr<TensorMeta> meta,
+                     std::unique_ptr<TensorStatus> status,
+                     const std::vector<int64_t>& rows,
+                     int64_t height)
+      : rows_(rows), height_(height) {
+    value_.reset(new DenseTensor(std::move(meta), std::move(status)));
+  }
 
-  SelectedRows(const SelectedRows&) = delete;
-  SelectedRows& operator=(const SelectedRows&) = delete;
-  SelectedRows(SelectedRows&&) = delete;
-  SelectedRows& operator=(SelectedRows&&) = delete;
+  const DenseTensor& value() const { return *value_; }
 
-  SelectedRows(const std::vector<int64_t>& rows,
-               int64_t height,
-               TensorMeta&& meta)
-      : rows_(rows), height_(height), BaseTensor(meta) {}
+  DenseTensor* mutable_value() { return value_.get(); }
 
   const Vector<int64_t>& rows() const { return rows_; }
 
   Vector<int64_t>* mutable_rows() { return &rows_; }
 
-  void set_rows(const Vector<int64_t>& rows)()
+  void set_rows(const Vector<int64_t>& rows) { rows_ = rows; }
 
-      int64_t height() const {
-    return height_;
-  }
+  int64_t height() const { return height_; }
 
   void set_height(int64_t height) { height_ = height; }
 
  private:
+  std::unique_ptr<DenseTensor> value_{nullptr};
+
   Vector<int64_t> rows_;
   int64_t height_;
 
diff --git a/paddle/pten/core/spatial_tensor.h b/paddle/pten/core/spatial_tensor.h
new file mode 100644
index 0000000000000..8093417f626a8
--- /dev/null
+++ b/paddle/pten/core/spatial_tensor.h
@@ -0,0 +1,49 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+namespace pt {
+
+/**
+ * SpatialTensor represents a Tensor whose memory layout is different from
+ * the typical Allocation (size+ptr).
+ *
+ * It needs to pass in a specific Allocation implementation when it is
+ * instantiated.
+ */
+
+template <typename AllocationType>
+class SpatialTensor : public TensorImplInterface {
+ public:
+  SpatialTensor(std::shared_ptr<AllocationType> allocation,
+                std::unique_ptr<TensorMeta> meta,
+                std::unique_ptr<TensorStatus> status)
+      : allocation_(std::move(allocation)),
+        meta_(std::move(meta)),
+        status_(std::move(status)) {}
+
+ private:
+  std::shared_ptr<AllocationType> allocation_;
+  std::unique_ptr<TensorMeta> meta_;
+  std::unique_ptr<TensorStatus> status_;
+};
+
+template <typename AllocationType>
+class MetalTensor : public SpatialTensor<AllocationType> {};
+
+template <typename AllocationType>
+class OpenCLTensor : public SpatialTensor<AllocationType> {};
+
+}  // namespace pt
diff --git a/paddle/pten/core/tensor.h b/paddle/pten/core/tensor.h
index e3834797938a9..a1a57e14c7001 100644
--- a/paddle/pten/core/tensor.h
+++ b/paddle/pten/core/tensor.h
@@ -34,6 +34,7 @@ limitations under the License. */
  * However, if we directly split the link relation, we need to make too many
  * changes, which will affect the stability of the framework, so here we still
  * rely on the implementation of the framework, which is a intermediate state.
+ *
  * In the future, the necessary components will be moved to the this library,
  * or the corresponding components will be re-implemented.
  */
@@ -64,6 +65,9 @@ namespace pt {
  *
  * Note: Tensor cannot be inherited. The heterogeneous Tensor implementation
  * can be achieved by inheriting the underlying TensorImplInterface.
+ *
+ * Note: This Tensor API is suitable for training and custom operators,
+ * another simple Tensor design may be required for inference.
  */
 
 class Tensor final {
@@ -85,7 +89,7 @@ class Tensor final {
     }
   }
 
-  /* Part 2: Dimension, DataType and Layout methods */
+  /* Part 2: Dimension, DataType and DataLayout methods */
   /**
    * @description: Return the number of elements of current Tensor.
    * @param None
@@ -100,13 +104,6 @@ class Tensor final {
    */
   DDim shape() const { return impl_->dims(); }
 
-  /**
-   * @description: Resize the shape (dimensions) of current Tensor.
-   * @param {const} DDim
-   * @return {*}
-   */
-  void resize(const DDim& dims) { impl_->resize(dims); }
-
   /**
    * @description: Return the data type of current Tensor.
    * @param None
@@ -117,9 +114,9 @@ class Tensor final {
   /**
    * @description: Return the layout of current Tensor.
    * @param None
-   * @return {Layout}
+   * @return {DataLayout}
    */
-  Layout layout() const { return impl_->layout(); }
+  DataLayout layout() const { return impl_->layout(); }
 
   /* Part 3: Device and Backend methods */
   /**
@@ -152,6 +149,8 @@ class Tensor final {
   bool is_mkldnn() const;
   bool is_cudnn() const;
 
+  bool is_selected_rows() const;
+
   /**
    * Backend convert APIs.
    */
@@ -171,25 +170,7 @@ class Tensor final {
    */
   std::shared_ptr<TensorImplInterface> impl() const { return impl_; }
 
-  /**
-   * @description: Get the const memory pointer of current Tensor.
-   * @param None
-   * @return {const T*}
-   */
-  template <typename T>
-  const T* data() const {
-    return impl_->data<T>();
-  }
-
-  /**
-   * @description: Get the mutable memory pointer of current Tensor.
-   * @param None
-   * @return {T*}
-   */
-  template <typename T>
-  T* mutable_data() {
-    return impl_->mutable_data<T>();
-  }
+  // Whether API Tensor need `data` and `mutable_data`?
 
   // TODO(chenweihang): slice and split methods use kernels?
 
diff --git a/paddle/pten/core/tensor_impl_if.h b/paddle/pten/core/tensor_impl_if.h
index f0ddb6243384a..8207bb428233f 100644
--- a/paddle/pten/core/tensor_impl_if.h
+++ b/paddle/pten/core/tensor_impl_if.h
@@ -41,6 +41,13 @@ using Place = paddle::platform::Place;
  * The abstract class of Tensor implemention, it needs to define its basic
  * behavior through inherited classes.
  *
+ * TensorImplInterface allows Tensor to uniformly access various different
+ * TensorImpls within the framework. It will not be used as a kernel argument,
+ * but only contains the interfaces supported by various TensorImpls.
+ * In extreme cases, it can be an empty base class.
+ *
+ * If we don't use TensorImplInterface, we may need to use shared_ptr<void>
+ * to unify Tensor's API.
  */
 class TensorImplInterface {
  public:
@@ -54,46 +61,19 @@ class TensorImplInterface {
 
   virtual ~TensorImplInterface() {}
 
-  /**
-   * Most of Tensor's methods need to have corresponding implementations
-   * in TensorImplInterface
-   */
   virtual int64_t numel() const = 0;
 
   virtual DDim dims() const = 0;
 
-  virtual void resize(const DDim& dims) = 0;
-
   virtual DataType type() const = 0;
 
-  virtual Layout layout() const = 0;
+  virtual DataLayout layout() const = 0;
 
   virtual Place place() const = 0;
 
   virtual Backend backend() const = 0;
 
-  virtual const void* data() const = 0;
-
-  virtual void* mutable_data() = 0;
-
   virtual bool initialized() const = 0;
-
-  /**
-   * template methods can not be virtual
-   */
-  template <typename T>
-  const T* data() const {
-    static_assert(std::is_pod<T>::value,
-                  "T must be POD when call Tensor.data<T>().");
-    return reinterpret_cast<const T*>(data());
-  }
-
-  template <typename T>
-  T* mutable_data() {
-    static_assert(std::is_pod<T>::value,
-                  "T must be POD when call Tensor.mutable_data<T>().");
-    return reinterpret_cast<T*>(mutable_data());
-  }
 };
 
 }  // namespace pt
diff --git a/paddle/pten/core/tensor_meta.h b/paddle/pten/core/tensor_meta.h
index febb6600c5a9c..e37b070b6fc17 100644
--- a/paddle/pten/core/tensor_meta.h
+++ b/paddle/pten/core/tensor_meta.h
@@ -14,17 +14,47 @@ limitations under the License. */
 
 #pragma once
 
+#include <vector>
+
+#ifdef PADDLE_WITH_MKLDNN
+#include "mkldnn.hpp"
+#endif
+
 #include "paddle/pten/core/backend.h"
 #include "paddle/pten/core/dtype.h"
 #include "paddle/pten/core/layout.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/ddim.h"
+// Note: mixed_vector include many header now, LoD will be
+// used on CUDA device? Can we use small_vector here?
+// #include "paddle/fluid/framework/mixed_vector.h"
 
 namespace pt {
 
+// template <typename T>
+// using Vector = paddle::framework::Vector<T>;
+
+/*
+ * LoD is short for Level of Details.
+ *
+ * - in a level, each element indicates relative offset of the lower level
+ * - the first element should be 0 and that indicates that this sequence start
+ * from 0
+ * - each sequence's begin and end(no-inclusive) is level[id, id+1]
+ *
+ * For example:
+ *    3-level LoD stores
+ *
+ *    0 2 3
+ *    0 2 4 7
+ *    0 2 5 7 10 12 15 20
+ */
+// using LoD = std::vector<paddle::framework::Vector<size_t>>;
+using LoD = std::vector<std::vector<size_t>>;
+
 /**
- * The Meta data member of BaseTensor.
+ * The Meta data member of DenseTensor.
  *
  * Here the `meta` represents information describing the basic features and
  * data features of Tensor, and does not include the status information of
@@ -47,7 +77,9 @@ struct TensorMeta {
         backend(meta.backend),
         type(meta.type),
         layout(meta.layout),
-        offset(meta.offset) {}
+        numel(meta.numel),
+        offset(meta.offset),
+        lod(meta.lod) {}
 
   // Bad constructor, may introduce bug
   // explicit TensorMeta(DDim dims) : dims(dims) {}
@@ -56,22 +88,72 @@ struct TensorMeta {
   TensorMeta(const DDim& dims,
              Backend backend,
              DataType type,
-             Layout layout,
-             size_t offset)
+             DataLayout layout,
+             size_t offset = 0UL,
+             const LoD& lod = {})
       : dims(dims),
         backend(backend),
         type(type),
         layout(layout),
-        offset(offset) {}
+        offset(offset),
+        lod(lod) {
+    int64_t init_numel = paddle::framework::product(dims);
+    if (init_numel > 0) {
+      numel = init_numel;
+    }
+  }
 
   DDim dims;
 
   Backend backend{Backend::kCPU};
   DataType type{DataType::kFLOAT32};
-  Layout layout{Layout::kNCHW};
+  DataLayout layout{DataLayout::kNCHW};
+
+  /**
+   * [ Why not calculate numel based on dims? ]
+   *
+   * Tensor may be 0-dimensional, but 0-dimensional Tensor may have values.
+   * For example:
+   *
+   *   import paddle
+   *
+   *   a = paddle.to_tensor([1, 2, 3])
+   *   print(a[0].shape) # expected: []
+   *   print(a[0].numel()) # expected: 1
+   *
+   * Now Paddle can not get expected result above, because the old Tensor's
+   * numel is calculated based on dims.
+   */
+  int64_t numel{1};
+
   size_t offset{0};
 
-  // InplaceVersion inplace_version_counter{0};
+  /**
+   * [ Why basic TensorMeta hold LoD? ]
+   *
+   * LoDTensor is still the main Tensor concept in Paddle.
+   * Although only a small number of ops need to use LoD information,
+   * LoD may need to be passed between Op's input and output, which is
+   * difficult to remove in a short time.
+   *
+   * But we don't want to add a Tensor type because of LoD, which makes
+   * the concept complicated, so LoD is a member held by Tensor by default.
+   */
+  LoD lod;
+};
+
+#ifdef PADDLE_WITH_MKLDNN
+struct MKLDNNTensorMeta : public TensorMeta {
+  /**
+   * @brief the detail format of memory block which have layout as kMKLDNN
+   *
+   * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C,
+   *       nChw16c, etc. For a MKLDNN memory block, layout will be set as
+   *       DataLayout::kMKLDNN meanwhile detail memory format will be kept in
+   *       this field.
+   */
+  mkldnn::memory::format_tag format = mkldnn::memory::format_tag::undef;
 };
+#endif
 
 }  // namespace pt
diff --git a/paddle/pten/core/tensor_status.h b/paddle/pten/core/tensor_status.h
index be98e31a27630..3f6f7060feb0d 100644
--- a/paddle/pten/core/tensor_status.h
+++ b/paddle/pten/core/tensor_status.h
@@ -20,8 +20,20 @@ limitations under the License. */
 
 namespace pt {
 
+class TensorInplaceVersion {
+ public:
+  explicit TensorInplaceVersion(uint32_t inplace_version = 0)
+      : inplace_version_(inplace_version) {}
+  bool IsUnique() const { return inplace_version_ == 0; }
+  void Bump() { ++inplace_version_; }
+  uint32_t CurrentVersion() const { return inplace_version_; }
+
+ private:
+  uint32_t inplace_version_;
+};
+
 /**
- * The Status data member of BaseTensor.
+ * The Status data member of DenseTensor.
  *
  * Here the `static` represents information describing the status of Tensor,
  * such as version counter, or other bool status members.
@@ -31,7 +43,7 @@ namespace pt {
  * And we direct access its members, in addition to constructor, destructor
  * and functions for setting data members, can not provide other functions.
  *
- * Note: Impl later
+ * Note: polish impl later
  */
 struct TensorStatus {
   TensorStatus() = default;
@@ -41,7 +53,12 @@ struct TensorStatus {
   TensorStatus(TensorStatus&&) = delete;
   TensorStatus& operator=(TensorStatus&&) = delete;
 
-  // InplaceVersion inplace_version_counter{0};
+  TensorInplaceVersion inplace_version_counter{0};
+
+  /**
+   * For Scalar Tensor design
+   */
+  bool is_scalar{false};
 };
 
 }  // namespace pt
diff --git a/paddle/pten/cpu/math.h b/paddle/pten/cpu/math.h
index bf123ad2851a2..50ba5db3cd2a7 100644
--- a/paddle/pten/cpu/math.h
+++ b/paddle/pten/cpu/math.h
@@ -14,7 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/pten/core/base_tensor.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/module/scale.h"
 #include "paddle/pten/module/sign.h"
 
 // See Note [ Why still include the fluid headers? ]
@@ -34,17 +35,37 @@ using EigenVector = paddle::framework::EigenVector<T, MajorType, IndexType>;
 
 using CPUDeviceContext = paddle::platform::CPUDeviceContext;
 
+/**
+ * [ How do we organize the kernel directory ]
+ * Now according to the classification of operators in the Python API,
+ * the same type of operation kernel is placed in a header file.
+ * This is only a temporary approach.
+ *
+ * Considerations:
+ *
+ * 1. In the future, it may be tailored the lib on kernel level.
+ *    This organization will cause difficulty in tailoring;
+ * 2. If there is still one *.h and *.cc file for one kernel,
+ *    and now the kernel is organized by device, the number of files
+ *    will be greatly expanded, but this may be more reasonable;
+ * 3. In the future, the kernel implementation of the function should
+ *    be in the *.cc file. If you want to call the kernel in the tensor
+ *    operation library, you should find the call through the global
+ *    KernelMap instead of including the header file of the corresponding
+ *    calculation. This may reduce the number of header files.
+ */
+
 template <typename T>
 void Sign(const CPUDeviceContext& dev_ctx,
-          const BaseTensor& x,
-          BaseTensor* out) {
+          const DenseTensor& x,
+          DenseTensor* out) {
   module::Sign<CPUDeviceContext, T>(dev_ctx, x, out);
 }
 
 template <typename T>
 void Mean(const CPUDeviceContext& dev_ctx,
-          const BaseTensor& x,
-          BaseTensor* out) {
+          const DenseTensor& x,
+          DenseTensor* out) {
   out->mutable_data<T>();
   auto x_data = EigenVector<T>::Flatten(x);
   auto y_data = EigenScalar<T>::From(*out);
@@ -52,4 +73,15 @@ void Mean(const CPUDeviceContext& dev_ctx,
   y_data.device(place) = x_data.mean();
 }
 
+template <typename T>
+void Scale(const CPUDeviceContext& dev_ctx,
+           const DenseTensor& x,
+           float scale,
+           float bias,
+           bool bias_after_scale,
+           DenseTensor* out) {
+  module::Scale<CPUDeviceContext, T>(
+      dev_ctx, x, scale, bias, bias_after_scale, out);
+}
+
 }  // namespace pt
diff --git a/paddle/pten/cuda/CMakeLists.txt b/paddle/pten/cuda/CMakeLists.txt
index 7ad6ae7c489ce..328b81265f03d 100644
--- a/paddle/pten/cuda/CMakeLists.txt
+++ b/paddle/pten/cuda/CMakeLists.txt
@@ -1 +1 @@
-nv_library(math_cuda SRCS math.cu DEPS device_context base_tensor convert_utils)
+nv_library(math_cuda SRCS math.cu DEPS device_context dense_tensor convert_utils)
diff --git a/paddle/pten/cuda/math.cu b/paddle/pten/cuda/math.cu
index 66b55e7da134f..585acc41e6a99 100644
--- a/paddle/pten/cuda/math.cu
+++ b/paddle/pten/cuda/math.cu
@@ -46,9 +46,9 @@ struct DivideFunctor {
  */
 
 template <typename T>
-void MeanCUDA(const CUDADeviceContext& dev_ctx,
-              const BaseTensor& x,
-              BaseTensor* out) {
+void Mean(const CUDADeviceContext& dev_ctx,
+          const DenseTensor& x,
+          DenseTensor* out) {
   auto size_prob = x.numel();
   const T* x_data = x.data<T>();
   T* out_data = out->mutable_data<T>();
@@ -63,27 +63,25 @@ void MeanCUDA(const CUDADeviceContext& dev_ctx,
       nullptr, temp_storage_bytes, trans_x, out_data, size_prob, stream);
   PADDLE_ENFORCE_CUDA_SUCCESS(err);
 
-  // TODO(chenweihang): maybe too complicated
-  pt::TensorMeta meta(
+  pt::DenseTensor tmp(std::unique_ptr<TensorMeta>(new TensorMeta(
       paddle::framework::make_ddim({static_cast<int64_t>(temp_storage_bytes)}),
       pt::TransToPtenBackend(dev_ctx.GetPlace()),
       x.type(),
-      x.layout(),
-      0);
-  pt::BaseTensor tmp(std::move(meta));
+      x.layout())));
   auto* temp_storage = tmp.mutable_data<uint8_t>();
   err = cub::DeviceReduce::Sum(
       temp_storage, temp_storage_bytes, trans_x, out_data, size_prob, stream);
   PADDLE_ENFORCE_CUDA_SUCCESS(err);
 }
 
-template void MeanCUDA<float>(const CUDADeviceContext& dev_ctx,
-                              const BaseTensor& x,
-                              BaseTensor* out);
-template void MeanCUDA<double>(const CUDADeviceContext& dev_ctx,
-                               const BaseTensor& x,
-                               BaseTensor* out);
-template void MeanCUDA<paddle::platform::float16>(
-    const CUDADeviceContext& dev_ctx, const BaseTensor& x, BaseTensor* out);
+template void Mean<float>(const CUDADeviceContext& dev_ctx,
+                          const DenseTensor& x,
+                          DenseTensor* out);
+template void Mean<double>(const CUDADeviceContext& dev_ctx,
+                           const DenseTensor& x,
+                           DenseTensor* out);
+template void Mean<paddle::platform::float16>(const CUDADeviceContext& dev_ctx,
+                                              const DenseTensor& x,
+                                              DenseTensor* out);
 
 }  // namespace pt
diff --git a/paddle/pten/cuda/math.h b/paddle/pten/cuda/math.h
index 6d78ac3839a3d..6b610cca839dc 100644
--- a/paddle/pten/cuda/math.h
+++ b/paddle/pten/cuda/math.h
@@ -16,7 +16,8 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_CUDA
 
-#include "paddle/pten/core/base_tensor.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/module/scale.h"
 #include "paddle/pten/module/sign.h"
 
 // See Note [ Why still include the fluid headers? ]
@@ -28,8 +29,8 @@ using CUDADeviceContext = paddle::platform::CUDADeviceContext;
 
 template <typename T>
 void Sign(const CUDADeviceContext& dev_ctx,
-          const BaseTensor& x,
-          BaseTensor* out) {
+          const DenseTensor& x,
+          DenseTensor* out) {
   module::Sign<CUDADeviceContext, T>(dev_ctx, x, out);
 }
 
@@ -39,15 +40,19 @@ void Sign(const CUDADeviceContext& dev_ctx,
 // include header files, there will be many more function declarations and
 // redundant function call
 template <typename T>
-void MeanCUDA(const CUDADeviceContext& dev_ctx,
-              const BaseTensor& x,
-              BaseTensor* out);
+void Mean(const CUDADeviceContext& dev_ctx,
+          const DenseTensor& x,
+          DenseTensor* out);
 
 template <typename T>
-void Mean(const CUDADeviceContext& dev_ctx,
-          const BaseTensor& x,
-          BaseTensor* out) {
-  MeanCUDA<T>(dev_ctx, x, out);
+void Scale(const CUDADeviceContext& dev_ctx,
+           const DenseTensor& x,
+           float scale,
+           float bias,
+           bool bias_after_scale,
+           DenseTensor* out) {
+  module::Scale<CUDADeviceContext, T>(
+      dev_ctx, x, scale, bias, bias_after_scale, out);
 }
 
 }  // namespace pt
diff --git a/paddle/pten/module/scale.h b/paddle/pten/module/scale.h
new file mode 100644
index 0000000000000..c3eb32ae6c407
--- /dev/null
+++ b/paddle/pten/module/scale.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/core/dense_tensor.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/operators/eigen/eigen_function.h"
+
+namespace pt {
+namespace module {
+
+template <typename DevCtx, typename T>
+void Scale(const DevCtx& dev_ctx,
+           const DenseTensor& x,
+           float scale,
+           float bias,
+           bool bias_after_scale,
+           DenseTensor* out) {
+  // calc
+  out->mutable_data<T>();
+  auto eigen_out = paddle::framework::EigenVector<T>::Flatten(*out);
+  auto eigen_x = paddle::framework::EigenVector<T>::Flatten(x);
+  auto& dev = *dev_ctx.eigen_device();
+  // TODO(chenweihang): now the eigen function here need the dtype of scale,
+  // eigen_x, bias should be same, so here need cast for two scalar arg,
+  // maybe we declare that the type of scale and bias is T?
+  paddle::operators::EigenScale<std::decay_t<decltype(dev)>, T>::Eval(
+      dev,
+      eigen_out,
+      eigen_x,
+      static_cast<T>(scale),
+      static_cast<T>(bias),
+      bias_after_scale);
+}
+
+}  // namespace module
+}  // namespace pt
diff --git a/paddle/pten/module/sign.h b/paddle/pten/module/sign.h
index 56dc2b3665629..16e49d475f137 100644
--- a/paddle/pten/module/sign.h
+++ b/paddle/pten/module/sign.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/pten/core/base_tensor.h"
+#include "paddle/pten/core/dense_tensor.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/eigen.h"
@@ -24,7 +24,7 @@ namespace pt {
 namespace module {
 
 template <typename DevCtx, typename T>
-void Sign(const DevCtx& dev_ctx, const BaseTensor& x, BaseTensor* out) {
+void Sign(const DevCtx& dev_ctx, const DenseTensor& x, DenseTensor* out) {
   VLOG(1) << "enter module::Sign";
   // out->mutable_data<T>(x.place());
   out->mutable_data<T>();
diff --git a/paddle/pten/npu/math.h b/paddle/pten/npu/math.h
index c534045f1901b..a08c60312a011 100644
--- a/paddle/pten/npu/math.h
+++ b/paddle/pten/npu/math.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_ASCEND_CL
 
-#include "paddle/pten/core/base_tensor.h"
+#include "paddle/pten/core/dense_tensor.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/operators/npu_op_runner.h"
@@ -28,8 +28,8 @@ using NPUDeviceContext = paddle::platfrom::NPUDeviceContext;
 
 template <typename T>
 void Mean(const NPUDeviceContext& dev_ctx,
-          const BaseTensor& x,
-          BaseTensor* out) {
+          const DenseTensor& x,
+          DenseTensor* out) {
   std::vector<int> axes;
   framework::NPUAttributeMap attr_input = {{"keep_dims", false},
                                            {"axes", axes}};
@@ -41,6 +41,43 @@ void Mean(const NPUDeviceContext& dev_ctx,
   runner.Run(stream);
 }
 
+template <typename T>
+void Scale(const NPUDeviceContext& dev_ctx,
+           const DenseTensor& x,
+           float scale,
+           float bias,
+           bool bias_after_scale,
+           DenseTensor* out) {
+  out->mutable_data<T>();
+  auto stream = dev_ctx.stream();
+  float _power = 1.0;
+  if (bias_after_scale) {
+    auto runner =
+        NpuOpRunner("Power",
+                    {x},
+                    {*out},
+                    {{"power", _power}, {"scale", scale}, {"shift", bias}});
+
+    runner.Run(stream);
+  } else {
+    DenseTensor tmp_x(std::unique_ptr<TensorMeta>(
+        new TensorMeta(x.dims(), x.backend(), x.type(), x.layout())));
+    tmp_x.mutable_data<T>();
+
+    auto runner_tmp = NpuOpRunner("Adds", {x}, {tmp_x}, {{"value", bias}});
+    runner_tmp.Run(stream);
+
+    out->mutable_data<T>(x.place());
+    float _bias = 0.0;
+    auto runner =
+        NpuOpRunner("Power",
+                    {tmp_x},
+                    {*out},
+                    {{"power", _power}, {"scale", scale}, {"shift", _bias}});
+    runner.Run(stream);
+  }
+}
+
 }  // namespace pt
 
 #endif
diff --git a/paddle/pten/selected_rows/CMakeLists.txt b/paddle/pten/selected_rows/CMakeLists.txt
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/paddle/pten/selected_rows/math.h b/paddle/pten/selected_rows/math.h
new file mode 100644
index 0000000000000..e2c3c6c703060
--- /dev/null
+++ b/paddle/pten/selected_rows/math.h
@@ -0,0 +1,44 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/core/selected_rows.h"
+
+// In fact, it is ugly to use such a complicated include
+// relationship when coding.
+// After the kernel registration module is completed, the calculation
+// function should be reused by calling the kernel in global KernelMap.
+#include "paddle/pten/cpu/math.h"
+#include "paddle/pten/cuda/math.h"
+#include "paddle/pten/npu/math.h"
+#include "paddle/pten/xpu/math.h"
+
+// See Note [ Why still include the fluid headers? ]
+
+namespace pt {
+
+template <typename T>
+void Scale(const CPUDeviceContext& dev_ctx,
+           const SelectedRowsTensor& x,
+           float scale,
+           float bias,
+           bool bias_after_scale,
+           SelectedRowsTensor* out) {
+  out->set_rows(x.rows());
+  out->set_height(x.height());
+  Scale<T>(dev_ctx, x.value(), scale, bias, bias_after_scale, out->value());
+}
+
+}  // namespace pt
diff --git a/paddle/pten/tests/CMakeLists.txt b/paddle/pten/tests/CMakeLists.txt
index dda192ff8b6a4..eea2826c4e066 100644
--- a/paddle/pten/tests/CMakeLists.txt
+++ b/paddle/pten/tests/CMakeLists.txt
@@ -1 +1 @@
-cc_test(base_tensor_test SRCS base_tensor_test.cc DEPS base_tensor)
+cc_test(dense_tensor_test SRCS dense_tensor_test.cc DEPS dense_tensor)
diff --git a/paddle/pten/tests/base_tensor_test.cc b/paddle/pten/tests/dense_tensor_test.cc
similarity index 64%
rename from paddle/pten/tests/base_tensor_test.cc
rename to paddle/pten/tests/dense_tensor_test.cc
index 58e6bc05ab94e..2aa3edc7699a9 100644
--- a/paddle/pten/tests/base_tensor_test.cc
+++ b/paddle/pten/tests/dense_tensor_test.cc
@@ -12,34 +12,34 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/pten/core/base_tensor.h"
+#include "paddle/pten/core/dense_tensor.h"
 
 #include <gtest/gtest.h>
 
 namespace framework = paddle::framework;
 using DDim = paddle::framework::DDim;
 
-TEST(BaseTensor, Constructor) {
-  pt::TensorMeta meta(framework::make_ddim({5, 10}),
-                      pt::Backend::kCPU,
-                      pt::DataType::kFLOAT32,
-                      pt::Layout::kNCHW,
-                      0UL);
-  pt::BaseTensor tensor(std::move(meta));
+TEST(DenseTensor, Constructor) {
+  pt::DenseTensor tensor(std::unique_ptr<pt::TensorMeta>(
+      new pt::TensorMeta(framework::make_ddim({5, 10}),
+                         pt::Backend::kCPU,
+                         pt::DataType::kFLOAT32,
+                         pt::DataLayout::kNCHW,
+                         0UL)));
   ASSERT_EQ(tensor.dims().size(), 2);
   ASSERT_EQ(tensor.backend(), pt::Backend::kCPU);
   ASSERT_EQ(tensor.type(), pt::DataType::kFLOAT32);
-  ASSERT_EQ(tensor.layout(), pt::Layout::kNCHW);
+  ASSERT_EQ(tensor.layout(), pt::DataLayout::kNCHW);
 }
 
-TEST(BaseTensor, Dims) {
+TEST(DenseTensor, Dims) {
   // impl later
 }
 
-TEST(BaseTensor, Place) {
+TEST(DenseTensor, Place) {
   // impl later
 }
 
-TEST(BaseTensor, Data) {
+TEST(DenseTensor, Data) {
   // impl later
 }
diff --git a/paddle/pten/xpu/math.h b/paddle/pten/xpu/math.h
index e91bd65fae6bc..1e3511fec9b00 100644
--- a/paddle/pten/xpu/math.h
+++ b/paddle/pten/xpu/math.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
 
-#include "paddle/pten/core/base_tensor.h"
+#include "paddle/pten/core/dense_tensor.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
@@ -28,8 +28,8 @@ using XPUDeviceContext = paddle::platform::XPUDeviceContext;
 
 template <typename T>
 void Sign(const XPUDeviceContext& dev_ctx,
-          const BaseTensor& x,
-          BaseTensor* out) {
+          const DenseTensor& x,
+          DenseTensor* out) {
   T* out_data = out->mutable_data<T>();
   auto xpu_ctx = dev_ctx.x_context();
   int r = xpu::activation_forward(
@@ -41,8 +41,8 @@ void Sign(const XPUDeviceContext& dev_ctx,
 
 template <typename T>
 void Mean(const XPUDeviceContext& dev_ctx,
-          const BaseTensor& x,
-          BaseTensor* out) {
+          const DenseTensor& x,
+          DenseTensor* out) {
   T* out_data = out->mutable_data<T>();
   auto xpu_ctx = dev_ctx.x_context();
   const T* x_data = x.Inputdata<T>();
@@ -54,6 +54,35 @@ void Mean(const XPUDeviceContext& dev_ctx,
           "XPU kernel error, Mean op execution not succeed, error code=%d", r));
 }
 
+template <typename T>
+void Scale(const XPUDeviceContext& dev_ctx,
+           const DenseTensor& x,
+           float scale,
+           float bias,
+           bool bias_after_scale,
+           DenseTensor* out) {
+  T* out_data = out->mutable_data<T>();
+  PADDLE_ENFORCE_EQ(
+      x.dims(),
+      out->dims(),
+      platform::errors::InvalidArgument("In and out should have the same dim,"
+                                        " expected %s, but got %s.",
+                                        x.dims().to_str().c_str(),
+                                        out->dims().to_str().c_str()));
+  int r = xpu::scale(dev_ctx.x_context(),
+                     x.data<T>(),
+                     out_data,
+                     x.numel(),
+                     bias_after_scale,
+                     scale,
+                     bias);
+  PADDLE_ENFORCE_EQ(
+      r,
+      XPU_SUCCESS,
+      platform::errors::External(
+          "XPU scale kernel return wrong value[%d %s]", r, XPUAPIErrorMsg[r]));
+}
+
 }  // namespace pt
 
 #endif

From 33bba0644d6c0539f6eee1c18194489085bc6667 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Wed, 28 Jul 2021 02:27:50 +0000
Subject: [PATCH 008/125] add scale mkldnn kernel

---
 paddle/fluid/framework/pten_utils.h           | 12 +++
 .../fluid/operators/mkldnn/scale_mkldnn_op.cc | 61 -------------
 paddle/fluid/operators/scale_op.cc            |  8 ++
 paddle/fluid/platform/mkldnn_reuse.h          | 21 ++++-
 paddle/pten/CMakeLists.txt                    |  5 ++
 paddle/pten/api/dev/math.h                    |  1 +
 paddle/pten/core/dense_tensor.h               | 10 ++-
 paddle/pten/core/tensor_meta.h                | 14 +++
 paddle/pten/mkldnn/base.h                     | 87 +++++++++++++++++++
 paddle/pten/mkldnn/math.h                     | 63 ++++++++++++++
 10 files changed, 218 insertions(+), 64 deletions(-)
 delete mode 100644 paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc
 create mode 100644 paddle/pten/mkldnn/base.h
 create mode 100644 paddle/pten/mkldnn/math.h

diff --git a/paddle/fluid/framework/pten_utils.h b/paddle/fluid/framework/pten_utils.h
index 85a345b9a3796..5ca26fcc28439 100644
--- a/paddle/fluid/framework/pten_utils.h
+++ b/paddle/fluid/framework/pten_utils.h
@@ -29,10 +29,17 @@ std::shared_ptr<TensorImplT> MakeTensorImpl(const Tensor& tensor,
                                             proto::VarType::Type type) {
   auto holder = tensor.Holder();
   auto tensor_impl = std::make_shared<TensorImplT>(
+#ifdef PADDLE_WITH_MKLDNN
+      std::unique_ptr<pt::TensorMeta>(new pt::MKLDNNTensorMeta(
+          tensor.dims(), pt::TransToPtenBackend(place),
+          pt::TransToPtenDataType(type), pt::TransToPtenLayout(tensor.layout()),
+          tensor.offset(), /*lod=*/{}, tensor.format())));
+#else
       std::unique_ptr<pt::TensorMeta>(new pt::TensorMeta(
           tensor.dims(), pt::TransToPtenBackend(place),
           pt::TransToPtenDataType(type), pt::TransToPtenLayout(tensor.layout()),
           tensor.offset())));
+#endif
   if (holder != nullptr) {
     tensor_impl->template ShareAllocation(tensor.Holder());
   } else {
@@ -46,6 +53,11 @@ void ShareTensorImpl(TensorImplT* tensor_impl, Tensor* out) {
   out->ResetHolderWithType(
       tensor_impl->template MoveMemory(),
       pt::TransToProtoVarType(tensor_impl->template type()));
+#ifdef PADDLE_WITH_MKLDNN
+  out->set_format(
+      dynamic_cast<const pt::MKLDNNTensorMeta&>(tensor_impl->template meta())
+          .format);
+#endif
 }
 
 }  // namespace framework
diff --git a/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc
deleted file mode 100644
index ae17048b5d568..0000000000000
--- a/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc
+++ /dev/null
@@ -1,61 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/platform/mkldnn_reuse.h"
-
-namespace paddle {
-namespace operators {
-
-using paddle::framework::Tensor;
-
-template <typename T>
-class ScaleMKLDNNKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    this->RunKernel(ctx);
-  }
-
-  void RunKernel(const framework::ExecutionContext& ctx) const {
-    const auto& dev_ctx =
-        ctx.template device_context<platform::MKLDNNDeviceContext>();
-
-    auto* x = ctx.Input<Tensor>("X");
-    auto* out = ctx.Output<Tensor>("Out");
-
-    bool is_inplaced = x->IsSharedBufferWith(*out);
-
-    platform::ActivationMKLDNNHandler<T> handler(
-        mkldnn::algorithm::eltwise_linear, ctx, dev_ctx, ctx.GetPlace(), x,
-        ctx.InputName("X"), is_inplaced);
-
-    auto src_memory_p = handler.AcquireSrcMemory(x);
-    auto dst_memory_p = handler.AcquireDstMemory(out);
-    auto activation_p = handler.AcquireForwardPrimitive();
-
-    auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
-    activation_p->execute(astream, {{MKLDNN_ARG_FROM, *src_memory_p},
-                                    {MKLDNN_ARG_TO, *dst_memory_p}});
-    astream.wait();
-
-    out->set_layout(framework::DataLayout::kMKLDNN);
-    out->set_format(platform::GetMKLDNNFormat(*dst_memory_p));
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_KERNEL(scale, MKLDNN, paddle::platform::CPUPlace,
-                   ops::ScaleMKLDNNKernel<float>,
-                   ops::ScaleMKLDNNKernel<paddle::platform::bfloat16>);
diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc
index 5d5efb42c279f..b9c3ddf201c7a 100644
--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -185,3 +185,11 @@ REGISTER_OP_NPU_KERNEL(
     paddle::operators::ScaleKernel<paddle::platform::NPUDeviceContext,
                                    paddle::platform::float16>);
 #endif
+
+#ifdef PADDLE_WITH_MKLDNN
+REGISTER_OP_KERNEL(
+    scale, MKLDNN, paddle::platform::CPUPlace,
+    ops::ScaleKernel<paddle::platform::MKLDNNDeviceContext, float>,
+    ops::ScaleKernel<paddle::platform::MKLDNNDeviceContext,
+                     paddle::platform::bfloat16>);
+#endif
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index 58622fb2529b8..b134d60991968 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -26,6 +26,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #include "paddle/fluid/platform/place.h"
 
+#include "paddle/pten/api/dev/core.h"
+
 namespace paddle {
 namespace platform {
 
@@ -95,6 +97,13 @@ class MKLDNNHandlerT {
         fwd_pd_->src_desc(), to_void_cast<T>(input_data), "@src_mem_p");
   }
 
+  std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
+      const pt::DenseTensor* input) {
+    const T* input_data = const_cast<T*>(input->data<T>());
+    return this->AcquireMemoryFromPrimitive(
+        fwd_pd_->src_desc(), to_void_cast<T>(input_data), "@src_mem_p");
+  }
+
   template <typename T_out = T>
   std::shared_ptr<mkldnn::memory> AcquireDstMemory(framework::Tensor* output) {
     T_out* ptr =
@@ -103,6 +112,13 @@ class MKLDNNHandlerT {
                                             "@dst_mem_p");
   }
 
+  template <typename T_out = T>
+  std::shared_ptr<mkldnn::memory> AcquireDstMemory(pt::DenseTensor* output) {
+    T_out* ptr = output->mutable_data<T_out>();
+    return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc(), ptr,
+                                            "@dst_mem_p");
+  }
+
   template <typename T_out = T>
   std::shared_ptr<mkldnn::memory> AcquireDstMemory(void) {
     return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc(), "@dstt_mem_p");
@@ -810,8 +826,9 @@ class ActivationMKLDNNHandler
       if (algorithm == mkldnn::algorithm::eltwise_linear) {
         bool bias_after_scale = ctx.Attr<bool>("bias_after_scale");
         auto* scale_tensor = ctx.Input<Tensor>("ScaleTensor");
-        alpha = (scale_tensor == nullptr) ? ctx.Attr<float>("scale")
-                                          : (float)*(scale_tensor->data<T>());
+        alpha = (scale_tensor == nullptr)
+                    ? ctx.Attr<float>("scale")
+                    : static_cast<float*>(scale_tensor->data<T>());
         beta = ctx.Attr<float>("bias");
         // if bias_after_scale == true
         //   out = scale*X + bias
diff --git a/paddle/pten/CMakeLists.txt b/paddle/pten/CMakeLists.txt
index 5407a8ec836c7..479e71361b511 100644
--- a/paddle/pten/CMakeLists.txt
+++ b/paddle/pten/CMakeLists.txt
@@ -19,6 +19,11 @@ endif()
 if(WITH_XPU)
   add_subdirectory(xpu)
 endif()
+# pten kernels for other tensor
+add_subdirectory(selected_rows)
+# pten infershape and dtype
+add_subdirectory(infershape)
+add_subdirectory(inferdtype)
 # pten public functors
 add_subdirectory(module)
 # pten tests
diff --git a/paddle/pten/api/dev/math.h b/paddle/pten/api/dev/math.h
index d00461f128dd7..7f5365207c6ba 100644
--- a/paddle/pten/api/dev/math.h
+++ b/paddle/pten/api/dev/math.h
@@ -17,6 +17,7 @@ limitations under the License. */
 // See Note: [ How do we organize the kernel directory ]
 #include "paddle/pten/cpu/math.h"
 #include "paddle/pten/cuda/math.h"
+#include "paddle/pten/mkldnn/math.h"
 #include "paddle/pten/npu/math.h"
 #include "paddle/pten/selected_rows/math.h"
 #include "paddle/pten/xpu/math.h"
diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h
index 09bed4ca702e5..256dde13fb841 100644
--- a/paddle/pten/core/dense_tensor.h
+++ b/paddle/pten/core/dense_tensor.h
@@ -86,6 +86,14 @@ class DenseTensor : public TensorImplInterface {
 
   bool initialized() const override { return allocation_ != nullptr; }
 
+  /* member methods */
+
+  const std::shared_ptr<Allocation>& allocation() const { return allocation_; }
+
+  const TensorMeta& meta() const { return *meta_; }
+
+  TensorMeta* mutable_meta() { return meta_.get(); }
+
   /* Data Access Methods */
 
   const void* data() const;
@@ -109,7 +117,7 @@ class DenseTensor : public TensorImplInterface {
     return reinterpret_cast<T*>(mutable_data());
   }
 
-  // For non-API interfaces, we still follow the C++ code style
+  // For non-API and non-member interfaces, we still follow the C++ code style?
 
   void Resize(const DDim& dims) { meta_->dims = dims; }
 
diff --git a/paddle/pten/core/tensor_meta.h b/paddle/pten/core/tensor_meta.h
index e37b070b6fc17..063d481e9c4b1 100644
--- a/paddle/pten/core/tensor_meta.h
+++ b/paddle/pten/core/tensor_meta.h
@@ -103,6 +103,8 @@ struct TensorMeta {
     }
   }
 
+  virtual ~TensorMeta() = default;
+
   DDim dims;
 
   Backend backend{Backend::kCPU};
@@ -144,6 +146,18 @@ struct TensorMeta {
 
 #ifdef PADDLE_WITH_MKLDNN
 struct MKLDNNTensorMeta : public TensorMeta {
+  MKLDNNTensorMeta(
+      const DDim& dims,
+      Backend backend,
+      DataType type,
+      DataLayout layout,
+      size_t offset = 0UL,
+      const LoD& lod = {},
+      mkldnn::memory::format_tag format = mkldnn::memory::format_tag::undef)
+      : TensorMeta(dims, backend, type, layout, offset, lod), format(format) {}
+
+  ~MKLDNNTensorMeta() override {}
+
   /**
    * @brief the detail format of memory block which have layout as kMKLDNN
    *
diff --git a/paddle/pten/mkldnn/base.h b/paddle/pten/mkldnn/base.h
new file mode 100644
index 0000000000000..d7134ecf92d8b
--- /dev/null
+++ b/paddle/pten/mkldnn/base.h
@@ -0,0 +1,87 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_MKLDNN
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/mkldnn_reuse.h"
+
+namespace pt {
+
+using MKLDNNDeviceContext = paddle::platform::MKLDNNDeviceContext;
+
+// TODO(chenweihang): the handlers in `mkldnn_reuse.h` are coupled to
+// `ExecutionContext`, refactoring that may be a big project!
+
+template <typename T>
+class ScaleMKLDNNHandler
+    : public paddle::platform::MKLDNNHandlerT<T,
+                                              mkldnn::eltwise_forward,
+                                              mkldnn::eltwise_backward> {
+ public:
+  ScaleMKLDNNHandler(const MKLDNNDeviceContext& dev_ctx,
+                     const pt::DenseTensor& in_x,
+                     const std::string& unique_name,
+                     bool is_inplaced,
+                     float alpha,
+                     float beta,
+                     bool bias_after_scale)
+      : paddle::platform::MKLDNNHandlerT<T,
+                                         mkldnn::eltwise_forward,
+                                         mkldnn::eltwise_backward>(
+            dev_ctx,
+            dev_ctx.GetEngine(),
+            in_x.place(),
+            is_inplaced ? paddle::platform::CreateKey(
+                              dev_ctx,
+                              paddle::framework::vectorize(in_x.dims()),
+                              "a",
+                              mkldnn::algorithm::eltwise_linear,
+                              unique_name)
+                        : paddle::platform::CreateKey(
+                              dev_ctx,
+                              paddle::framework::vectorize(in_x.dims()),
+                              "a",
+                              unique_name)) {
+    if (!bias_after_scale) {
+      beta *= alpha;
+    }
+
+    PADDLE_ENFORCE(in_x.dims().size() >= 1 || in_x.dims().size() <= 6,
+                   paddle::platform::errors::Unimplemented(
+                       "Input dimension size can be 1, 2, 3, 4, "
+                       "5, or 6, but now the dimension size is",
+                       in_x.dims().size()));
+
+    auto src_tz = paddle::framework::vectorize<int64_t>(in_x.dims());
+    auto src_fmt =
+        src_tz.size() == 2
+            ? paddle::MKLDNNMemoryFormat::nc
+            : dynamic_cast<const MKLDNNTensorMeta&>(in_x.meta()).format;
+    auto md = mkldnn::memory::desc(
+        src_tz, paddle::platform::MKLDNNGetDataType<T>(), src_fmt);
+
+    this->AcquireForwardPrimitiveDescriptor(mkldnn::prop_kind::forward_training,
+                                            mkldnn::algorithm::eltwise_linear,
+                                            md,
+                                            alpha,
+                                            beta);
+  }
+};
+
+}  // namespace pt
+
+#endif
diff --git a/paddle/pten/mkldnn/math.h b/paddle/pten/mkldnn/math.h
new file mode 100644
index 0000000000000..7d521516f0a3c
--- /dev/null
+++ b/paddle/pten/mkldnn/math.h
@@ -0,0 +1,63 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_MKLDNN
+
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/mkldnn/base.h"
+
+namespace pt {
+
+using MKLDNNDeviceContext = paddle::platform::MKLDNNDeviceContext;
+
+template <typename T>
+void Scale(const MKLDNNDeviceContext& dev_ctx,
+           const DenseTensor& x,
+           float scale,
+           float bias,
+           bool bias_after_scale,
+           DenseTensor* out) {
+  bool is_inplaced = x.allocation() && x.allocation() == out->allocation();
+
+  // TODO(chenweihang): add `name` into TensorMeta?
+  ScaleMKLDNNHandler<T> handler(dev_ctx,
+                                x,
+                                /*unique_name=*/"X",
+                                is_inplaced,
+                                /*alpha=*/scale,
+                                /*beta=*/bias,
+                                bias_after_scale);
+
+  auto src_memory_p = handler.AcquireSrcMemory(&x);
+  auto dst_memory_p = handler.AcquireDstMemory(out);
+  auto activation_p = handler.AcquireForwardPrimitive();
+
+  auto& astream = MKLDNNDeviceContext::tls().get_stream();
+  activation_p->execute(
+      astream,
+      {{MKLDNN_ARG_FROM, *src_memory_p}, {MKLDNN_ARG_TO, *dst_memory_p}});
+  astream.wait();
+
+  out->mutable_meta()->layout = DataLayout::kMKLDNN;
+  // TODO(chenweihang): we should use dynamic_cast get MKLDNNTensorMeta,
+  // Is there any better way here?
+  dynamic_cast<MKLDNNTensorMeta*>(out->mutable_meta())->format =
+      paddle::platform::GetMKLDNNFormat(*dst_memory_p);
+}
+
+}  // namespace pt
+
+#endif

From d895a116c561ec1e12fec1520ee4ecc06f63b1e5 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Wed, 28 Jul 2021 11:37:03 +0000
Subject: [PATCH 009/125] polish xpu & npu impl details

---
 paddle/fluid/inference/CMakeLists.txt |  2 +-
 paddle/fluid/operators/mean_op.cc     |  8 +++---
 paddle/pten/core/tensor.h             |  2 +-
 paddle/pten/npu/math.h                | 36 +++++++++++++--------------
 paddle/pten/xpu/math.h                | 10 ++++----
 5 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index c002c7a10cb7b..82aa4b3cb65de 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -81,7 +81,7 @@ set(SHARED_INFERENCE_SRCS
     ${PADDLE_CUSTOM_OP_SRCS})
 
 # shared inference library deps
-set(SHARED_INFERENCE_DEPS ${fluid_modules} analysis_predictor)
+set(SHARED_INFERENCE_DEPS ${fluid_modules} analysis_predictor pten)
 
 if (WITH_CRYPTO) 
     set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} paddle_crypto)
diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc
index 0ec9a39cb6850..6aa4e0189825d 100644
--- a/paddle/fluid/operators/mean_op.cc
+++ b/paddle/fluid/operators/mean_op.cc
@@ -108,8 +108,8 @@ REGISTER_OP_XPU_KERNEL(
 
 #ifdef PADDLE_WITH_ASCEND_CL
 REGISTER_OP_NPU_KERNEL(
-    mean, ops::MeanNPUKernel<paddle::platform::NPUDeviceContext, int>,
-    ops::MeanNPUKernel<paddle::platform::NPUDeviceContext, float>,
-    ops::MeanNPUKernel<paddle::platform::NPUDeviceContext, double>,
-    ops::MeanNPUKernel<paddle::platform::NPUDeviceContext, plat::float16>)
+    mean, ops::MeanKernel<paddle::platform::NPUDeviceContext, int>,
+    ops::MeanKernel<paddle::platform::NPUDeviceContext, float>,
+    ops::MeanKernel<paddle::platform::NPUDeviceContext, double>,
+    ops::MeanKernel<paddle::platform::NPUDeviceContext, plat::float16>)
 #endif
diff --git a/paddle/pten/core/tensor.h b/paddle/pten/core/tensor.h
index a1a57e14c7001..5071b5d275046 100644
--- a/paddle/pten/core/tensor.h
+++ b/paddle/pten/core/tensor.h
@@ -45,7 +45,7 @@ namespace pt {
 
 /**
  * Tensor is the API description of the basic data structure in the
- * [ PaddlePaddle Tensor Operation Library ].
+ * [ Paddle Tensor Operation Library ].
  *
  * It is not limited to a simple n-dimensional array.
  * It contains a smart pointer to `TensorImpl`. The data description contained
diff --git a/paddle/pten/npu/math.h b/paddle/pten/npu/math.h
index a08c60312a011..bdb1768a67eff 100644
--- a/paddle/pten/npu/math.h
+++ b/paddle/pten/npu/math.h
@@ -24,20 +24,19 @@ limitations under the License. */
 
 namespace pt {
 
-using NPUDeviceContext = paddle::platfrom::NPUDeviceContext;
+using NPUDeviceContext = paddle::platform::NPUDeviceContext;
 
 template <typename T>
 void Mean(const NPUDeviceContext& dev_ctx,
           const DenseTensor& x,
           DenseTensor* out) {
   std::vector<int> axes;
-  framework::NPUAttributeMap attr_input = {{"keep_dims", false},
-                                           {"axes", axes}};
+  paddle::framework::NPUAttributeMap attr_input = {{"keep_dims", false},
+                                                   {"axes", axes}};
   out->mutable_data<T>();
-  const auto& runner = NpuOpRunner("ReduceMeanD", {x}, {*out}, attr_input);
-  auto stream =
-      ctx.template device_context<paddle::platform::NPUDeviceContext>()
-          .stream();
+  const auto& runner =
+      paddle::operators::NpuOpRunner("ReduceMeanD", {x}, {*out}, attr_input);
+  auto stream = dev_ctx.stream();
   runner.Run(stream);
 }
 
@@ -52,11 +51,11 @@ void Scale(const NPUDeviceContext& dev_ctx,
   auto stream = dev_ctx.stream();
   float _power = 1.0;
   if (bias_after_scale) {
-    auto runner =
-        NpuOpRunner("Power",
-                    {x},
-                    {*out},
-                    {{"power", _power}, {"scale", scale}, {"shift", bias}});
+    auto runner = paddle::operators::NpuOpRunner(
+        "Power",
+        {x},
+        {*out},
+        {{"power", _power}, {"scale", scale}, {"shift", bias}});
 
     runner.Run(stream);
   } else {
@@ -64,16 +63,17 @@ void Scale(const NPUDeviceContext& dev_ctx,
         new TensorMeta(x.dims(), x.backend(), x.type(), x.layout())));
     tmp_x.mutable_data<T>();
 
-    auto runner_tmp = NpuOpRunner("Adds", {x}, {tmp_x}, {{"value", bias}});
+    auto runner_tmp =
+        paddle::operators::NpuOpRunner("Adds", {x}, {tmp_x}, {{"value", bias}});
     runner_tmp.Run(stream);
 
     out->mutable_data<T>(x.place());
     float _bias = 0.0;
-    auto runner =
-        NpuOpRunner("Power",
-                    {tmp_x},
-                    {*out},
-                    {{"power", _power}, {"scale", scale}, {"shift", _bias}});
+    auto runner = paddle::operators::NpuOpRunner(
+        "Power",
+        {tmp_x},
+        {*out},
+        {{"power", _power}, {"scale", scale}, {"shift", _bias}});
     runner.Run(stream);
   }
 }
diff --git a/paddle/pten/xpu/math.h b/paddle/pten/xpu/math.h
index 1e3511fec9b00..062267d55a962 100644
--- a/paddle/pten/xpu/math.h
+++ b/paddle/pten/xpu/math.h
@@ -33,10 +33,10 @@ void Sign(const XPUDeviceContext& dev_ctx,
   T* out_data = out->mutable_data<T>();
   auto xpu_ctx = dev_ctx.x_context();
   int r = xpu::activation_forward(
-      xpu_ctx, xpu::Activation_t::SIGN, in.numel(), in.data<T>(), out_data);
+      xpu_ctx, xpu::Activation_t::SIGN, x.numel(), x.data<T>(), out_data);
   PADDLE_ENFORCE_EQ(r,
                     xpu::Error_t::SUCCESS,
-                    platform::errors::Fatal("XPU sign kernel error!"));
+                    paddle::platform::errors::Fatal("XPU sign kernel error!"));
 }
 
 template <typename T>
@@ -45,12 +45,12 @@ void Mean(const XPUDeviceContext& dev_ctx,
           DenseTensor* out) {
   T* out_data = out->mutable_data<T>();
   auto xpu_ctx = dev_ctx.x_context();
-  const T* x_data = x.Inputdata<T>();
+  const T* x_data = x.data<T>();
   int r = xpu::mean(xpu_ctx, x_data, out_data, x.numel());
   PADDLE_ENFORCE_EQ(
       r,
       xpu::Error_t::SUCCESS,
-      platform::errors::External(
+      paddle::platform::errors::External(
           "XPU kernel error, Mean op execution not succeed, error code=%d", r));
 }
 
@@ -79,7 +79,7 @@ void Scale(const XPUDeviceContext& dev_ctx,
   PADDLE_ENFORCE_EQ(
       r,
       XPU_SUCCESS,
-      platform::errors::External(
+      paddle::platform::errors::External(
           "XPU scale kernel return wrong value[%d %s]", r, XPUAPIErrorMsg[r]));
 }
 

From 62ebf01163e68af7bc0f7cce0abfaf56767b4882 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Thu, 29 Jul 2021 08:31:09 +0000
Subject: [PATCH 010/125] fix mkldnn reuse compile failed

---
 paddle/fluid/platform/mkldnn_reuse.h       | 2 +-
 paddle/pten/{core => api/include}/tensor.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
 rename paddle/pten/{core => api/include}/tensor.h (99%)

diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index b134d60991968..31fe423fbf377 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -828,7 +828,7 @@ class ActivationMKLDNNHandler
         auto* scale_tensor = ctx.Input<Tensor>("ScaleTensor");
         alpha = (scale_tensor == nullptr)
                     ? ctx.Attr<float>("scale")
-                    : static_cast<float*>(scale_tensor->data<T>());
+                    : (float)*(scale_tensor->data<T>());  // NOLINT
         beta = ctx.Attr<float>("bias");
         // if bias_after_scale == true
         //   out = scale*X + bias
diff --git a/paddle/pten/core/tensor.h b/paddle/pten/api/include/tensor.h
similarity index 99%
rename from paddle/pten/core/tensor.h
rename to paddle/pten/api/include/tensor.h
index 5071b5d275046..d3b86bba2514c 100644
--- a/paddle/pten/core/tensor.h
+++ b/paddle/pten/api/include/tensor.h
@@ -45,7 +45,7 @@ namespace pt {
 
 /**
  * Tensor is the API description of the basic data structure in the
- * [ Paddle Tensor Operation Library ].
+ * [ Paddle "Tensor OPeration (top)" Library ].
  *
  * It is not limited to a simple n-dimensional array.
  * It contains a smart pointer to `TensorImpl`. The data description contained

From 7c0972653a4f86536c6829f0edbdfd6b36b92262 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Thu, 29 Jul 2021 11:51:14 +0000
Subject: [PATCH 011/125] change tensor operation lib name

---
 paddle/CMakeLists.txt                             |  2 +-
 paddle/fluid/framework/eigen.h                    |  2 +-
 paddle/fluid/framework/pten_utils.h               |  4 ++--
 paddle/fluid/inference/CMakeLists.txt             |  2 +-
 paddle/fluid/operators/CMakeLists.txt             |  2 +-
 paddle/fluid/operators/mean_op.h                  |  6 +++---
 paddle/fluid/operators/scale_op.h                 |  6 +++---
 paddle/fluid/operators/sign_op.h                  |  6 +++---
 paddle/fluid/platform/mkldnn_reuse.h              |  2 +-
 paddle/{pten => top}/CMakeLists.txt               | 14 +++++++-------
 paddle/{pten => top}/api/CMakeLists.txt           |  2 +-
 paddle/{pten => top}/api/all.cc                   |  2 +-
 paddle/{pten => top}/api/all.h                    |  4 ++--
 paddle/{pten => top}/api/dev/core.h               |  2 +-
 paddle/{pten => top}/api/dev/math.h               | 12 ++++++------
 paddle/{pten => top}/api/include/tensor.h         |  4 ++--
 paddle/{pten => top}/api/src/CMakeLists.txt       |  0
 paddle/{pten => top}/core/CMakeLists.txt          |  0
 paddle/{pten => top}/core/autograd_meta_if.h      |  0
 paddle/{pten => top}/core/backend.h               |  0
 paddle/{pten => top}/core/convert_utils.cc        |  2 +-
 paddle/{pten => top}/core/convert_utils.h         |  6 +++---
 paddle/{pten => top}/core/dense_tensor.cc         |  4 ++--
 paddle/{pten => top}/core/dense_tensor.h          |  6 +++---
 paddle/{pten => top}/core/dtype.h                 |  0
 paddle/{pten => top}/core/layout.h                |  0
 paddle/{pten => top}/core/scalar_tensor.h         |  2 +-
 paddle/{pten => top}/core/selected_rows.cc        |  2 +-
 paddle/{pten => top}/core/selected_rows.h         |  2 +-
 paddle/{pten => top}/core/spatial_tensor.h        |  0
 paddle/{pten => top}/core/tensor_impl_if.h        |  6 +++---
 paddle/{pten => top}/core/tensor_meta.h           |  6 +++---
 paddle/{pten => top}/core/tensor_status.h         |  6 +++---
 paddle/{pten => top}/cpu/CMakeLists.txt           |  0
 paddle/{pten => top}/cpu/math.h                   |  6 +++---
 paddle/{pten => top}/cuda/CMakeLists.txt          |  0
 paddle/{pten => top}/cuda/math.cu                 |  4 ++--
 paddle/{pten => top}/cuda/math.h                  |  6 +++---
 paddle/{pten => top}/hip/CMakeLists.txt           |  0
 paddle/{pten => top}/inferdtype/CMakeLists.txt    |  0
 paddle/{pten => top}/infershape/CMakeLists.txt    |  0
 paddle/{pten => top}/mkldnn/CMakeLists.txt        |  0
 paddle/{pten => top}/mkldnn/base.h                |  0
 paddle/{pten => top}/mkldnn/math.h                |  4 ++--
 paddle/{pten => top}/module/CMakeLists.txt        |  0
 paddle/{pten => top}/module/scale.h               |  2 +-
 paddle/{pten => top}/module/sign.h                |  2 +-
 paddle/{pten => top}/npu/CMakeLists.txt           |  0
 paddle/{pten => top}/npu/math.h                   |  2 +-
 paddle/{pten => top}/selected_rows/CMakeLists.txt |  0
 paddle/{pten => top}/selected_rows/math.h         | 10 +++++-----
 paddle/{pten => top}/tests/CMakeLists.txt         |  0
 paddle/{pten => top}/tests/dense_tensor_test.cc   |  2 +-
 paddle/{pten => top}/xpu/CMakeLists.txt           |  0
 paddle/{pten => top}/xpu/math.h                   |  2 +-
 55 files changed, 77 insertions(+), 77 deletions(-)
 rename paddle/{pten => top}/CMakeLists.txt (73%)
 rename paddle/{pten => top}/api/CMakeLists.txt (75%)
 rename paddle/{pten => top}/api/all.cc (95%)
 rename paddle/{pten => top}/api/all.h (89%)
 rename paddle/{pten => top}/api/dev/core.h (93%)
 rename paddle/{pten => top}/api/dev/math.h (75%)
 rename paddle/{pten => top}/api/include/tensor.h (98%)
 rename paddle/{pten => top}/api/src/CMakeLists.txt (100%)
 rename paddle/{pten => top}/core/CMakeLists.txt (100%)
 rename paddle/{pten => top}/core/autograd_meta_if.h (100%)
 rename paddle/{pten => top}/core/backend.h (100%)
 rename paddle/{pten => top}/core/convert_utils.cc (98%)
 rename paddle/{pten => top}/core/convert_utils.h (92%)
 rename paddle/{pten => top}/core/dense_tensor.cc (98%)
 rename paddle/{pten => top}/core/dense_tensor.h (97%)
 rename paddle/{pten => top}/core/dtype.h (100%)
 rename paddle/{pten => top}/core/layout.h (100%)
 rename paddle/{pten => top}/core/scalar_tensor.h (93%)
 rename paddle/{pten => top}/core/selected_rows.cc (93%)
 rename paddle/{pten => top}/core/selected_rows.h (98%)
 rename paddle/{pten => top}/core/spatial_tensor.h (100%)
 rename paddle/{pten => top}/core/tensor_impl_if.h (95%)
 rename paddle/{pten => top}/core/tensor_meta.h (97%)
 rename paddle/{pten => top}/core/tensor_status.h (94%)
 rename paddle/{pten => top}/cpu/CMakeLists.txt (100%)
 rename paddle/{pten => top}/cpu/math.h (96%)
 rename paddle/{pten => top}/cuda/CMakeLists.txt (100%)
 rename paddle/{pten => top}/cuda/math.cu (97%)
 rename paddle/{pten => top}/cuda/math.h (93%)
 rename paddle/{pten => top}/hip/CMakeLists.txt (100%)
 rename paddle/{pten => top}/inferdtype/CMakeLists.txt (100%)
 rename paddle/{pten => top}/infershape/CMakeLists.txt (100%)
 rename paddle/{pten => top}/mkldnn/CMakeLists.txt (100%)
 rename paddle/{pten => top}/mkldnn/base.h (100%)
 rename paddle/{pten => top}/mkldnn/math.h (96%)
 rename paddle/{pten => top}/module/CMakeLists.txt (100%)
 rename paddle/{pten => top}/module/scale.h (97%)
 rename paddle/{pten => top}/module/sign.h (97%)
 rename paddle/{pten => top}/npu/CMakeLists.txt (100%)
 rename paddle/{pten => top}/npu/math.h (98%)
 rename paddle/{pten => top}/selected_rows/CMakeLists.txt (100%)
 rename paddle/{pten => top}/selected_rows/math.h (87%)
 rename paddle/{pten => top}/tests/CMakeLists.txt (100%)
 rename paddle/{pten => top}/tests/dense_tensor_test.cc (96%)
 rename paddle/{pten => top}/xpu/CMakeLists.txt (100%)
 rename paddle/{pten => top}/xpu/math.h (98%)

diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 488583fe2c767..de6b3dac7da22 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -2,4 +2,4 @@ add_subdirectory(scripts)
 add_subdirectory(testing)
 set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory")
 add_subdirectory(fluid)
-add_subdirectory(pten)
+add_subdirectory(top)
diff --git a/paddle/fluid/framework/eigen.h b/paddle/fluid/framework/eigen.h
index ad76889a9a7d6..acb6a88f059c6 100644
--- a/paddle/fluid/framework/eigen.h
+++ b/paddle/fluid/framework/eigen.h
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "unsupported/Eigen/CXX11/Tensor"
 
-#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/top/core/dense_tensor.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/pten_utils.h b/paddle/fluid/framework/pten_utils.h
index 5ca26fcc28439..0cb6f1e3363d5 100644
--- a/paddle/fluid/framework/pten_utils.h
+++ b/paddle/fluid/framework/pten_utils.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/pten/core/convert_utils.h"
-#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/top/core/convert_utils.h"
+#include "paddle/top/core/dense_tensor.h"
 
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/place.h"
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 82aa4b3cb65de..4afada2739dae 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -81,7 +81,7 @@ set(SHARED_INFERENCE_SRCS
     ${PADDLE_CUSTOM_OP_SRCS})
 
 # shared inference library deps
-set(SHARED_INFERENCE_DEPS ${fluid_modules} analysis_predictor pten)
+set(SHARED_INFERENCE_DEPS ${fluid_modules} analysis_predictor top)
 
 if (WITH_CRYPTO) 
     set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} paddle_crypto)
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index e3b3f84125814..fb4f158c9da1c 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -74,7 +74,7 @@ if(WITH_UNITY_BUILD)
     include(unity_build_rule.cmake)
 endif()
 
-set(OP_HEADER_DEPS ${OP_HEADER_DEPS} pten)
+set(OP_HEADER_DEPS ${OP_HEADER_DEPS} top)
 register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op lstm_op run_program_op eye_op recurrent_op
         sync_batch_norm_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
 
diff --git a/paddle/fluid/operators/mean_op.h b/paddle/fluid/operators/mean_op.h
index 0404e050a573f..93888cffcc857 100644
--- a/paddle/fluid/operators/mean_op.h
+++ b/paddle/fluid/operators/mean_op.h
@@ -17,9 +17,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/pten_utils.h"
 
-// only can include the headers in paddle/pten/api dirs
-#include "paddle/pten/api/dev/core.h"
-#include "paddle/pten/api/dev/math.h"
+// only can include the headers in paddle/top/api dirs
+#include "paddle/top/api/dev/core.h"
+#include "paddle/top/api/dev/math.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h
index d4d517a7e87e7..ee2835340ec41 100644
--- a/paddle/fluid/operators/scale_op.h
+++ b/paddle/fluid/operators/scale_op.h
@@ -17,9 +17,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/pten_utils.h"
 
-// only can include the headers in paddle/pten/api dirs
-#include "paddle/pten/api/dev/core.h"
-#include "paddle/pten/api/dev/math.h"
+// only can include the headers in paddle/top/api dirs
+#include "paddle/top/api/dev/core.h"
+#include "paddle/top/api/dev/math.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/sign_op.h b/paddle/fluid/operators/sign_op.h
index 8758c7c0ab33b..02c1abd3b36b4 100644
--- a/paddle/fluid/operators/sign_op.h
+++ b/paddle/fluid/operators/sign_op.h
@@ -19,9 +19,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/pten_utils.h"
 #include "paddle/fluid/operators/eigen/eigen_function.h"
 
-// only can include the headers in paddle/pten/api dirs
-#include "paddle/pten/api/dev/core.h"
-#include "paddle/pten/api/dev/math.h"
+// only can include the headers in paddle/top/api dirs
+#include "paddle/top/api/dev/core.h"
+#include "paddle/top/api/dev/math.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index 31fe423fbf377..1439ff9746c21 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -26,7 +26,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #include "paddle/fluid/platform/place.h"
 
-#include "paddle/pten/api/dev/core.h"
+#include "paddle/top/api/dev/core.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/pten/CMakeLists.txt b/paddle/top/CMakeLists.txt
similarity index 73%
rename from paddle/pten/CMakeLists.txt
rename to paddle/top/CMakeLists.txt
index 479e71361b511..a18d72209ebf4 100644
--- a/paddle/pten/CMakeLists.txt
+++ b/paddle/top/CMakeLists.txt
@@ -1,8 +1,8 @@
-# pten api
+# top api
 add_subdirectory(api)
-# pten core components
+# top core components
 add_subdirectory(core)
-# pten kernels for diff device
+# top kernels for diff device
 add_subdirectory(cpu)
 if(WITH_GPU)
   add_subdirectory(cuda)
@@ -19,12 +19,12 @@ endif()
 if(WITH_XPU)
   add_subdirectory(xpu)
 endif()
-# pten kernels for other tensor
+# top kernels for other tensor
 add_subdirectory(selected_rows)
-# pten infershape and dtype
+# top infershape and dtype
 add_subdirectory(infershape)
 add_subdirectory(inferdtype)
-# pten public functors
+# top public functors
 add_subdirectory(module)
-# pten tests
+# top tests
 add_subdirectory(tests)
diff --git a/paddle/pten/api/CMakeLists.txt b/paddle/top/api/CMakeLists.txt
similarity index 75%
rename from paddle/pten/api/CMakeLists.txt
rename to paddle/top/api/CMakeLists.txt
index 4f901ff7a0d12..98dc769f1786b 100644
--- a/paddle/pten/api/CMakeLists.txt
+++ b/paddle/top/api/CMakeLists.txt
@@ -5,4 +5,4 @@ if(WITH_GPU)
   set(PTEN_DEPS ${PTEN_DEPS} math_cuda)
 endif()
 
-cc_library(pten SRCS all.cc DEPS ${PTEN_DEPS})
+cc_library(top SRCS all.cc DEPS ${PTEN_DEPS})
diff --git a/paddle/pten/api/all.cc b/paddle/top/api/all.cc
similarity index 95%
rename from paddle/pten/api/all.cc
rename to paddle/top/api/all.cc
index 4141f5127fe31..5fe5586af3ab0 100644
--- a/paddle/pten/api/all.cc
+++ b/paddle/top/api/all.cc
@@ -12,6 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/pten/api/all.h"
+#include "paddle/top/api/all.h"
 
 namespace pt {}  // namespace pt
diff --git a/paddle/pten/api/all.h b/paddle/top/api/all.h
similarity index 89%
rename from paddle/pten/api/all.h
rename to paddle/top/api/all.h
index 342e51c128cd8..ac48529f25f3e 100644
--- a/paddle/pten/api/all.h
+++ b/paddle/top/api/all.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 
 // develop apis
-#include "paddle/pten/api/dev/core.h"
-#include "paddle/pten/api/dev/math.h"
+#include "paddle/top/api/dev/core.h"
+#include "paddle/top/api/dev/math.h"
 
 // user apis
diff --git a/paddle/pten/api/dev/core.h b/paddle/top/api/dev/core.h
similarity index 93%
rename from paddle/pten/api/dev/core.h
rename to paddle/top/api/dev/core.h
index f660306848dc2..d7cd929e44551 100644
--- a/paddle/pten/api/dev/core.h
+++ b/paddle/top/api/dev/core.h
@@ -14,4 +14,4 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/top/core/dense_tensor.h"
diff --git a/paddle/pten/api/dev/math.h b/paddle/top/api/dev/math.h
similarity index 75%
rename from paddle/pten/api/dev/math.h
rename to paddle/top/api/dev/math.h
index 7f5365207c6ba..be6c5df762697 100644
--- a/paddle/pten/api/dev/math.h
+++ b/paddle/top/api/dev/math.h
@@ -15,9 +15,9 @@ limitations under the License. */
 #pragma once
 
 // See Note: [ How do we organize the kernel directory ]
-#include "paddle/pten/cpu/math.h"
-#include "paddle/pten/cuda/math.h"
-#include "paddle/pten/mkldnn/math.h"
-#include "paddle/pten/npu/math.h"
-#include "paddle/pten/selected_rows/math.h"
-#include "paddle/pten/xpu/math.h"
+#include "paddle/top/cpu/math.h"
+#include "paddle/top/cuda/math.h"
+#include "paddle/top/mkldnn/math.h"
+#include "paddle/top/npu/math.h"
+#include "paddle/top/selected_rows/math.h"
+#include "paddle/top/xpu/math.h"
diff --git a/paddle/pten/api/include/tensor.h b/paddle/top/api/include/tensor.h
similarity index 98%
rename from paddle/pten/api/include/tensor.h
rename to paddle/top/api/include/tensor.h
index d3b86bba2514c..25a11d1b5d023 100644
--- a/paddle/pten/api/include/tensor.h
+++ b/paddle/top/api/include/tensor.h
@@ -18,8 +18,8 @@ limitations under the License. */
 #include <memory>
 #include <utility>
 
-#include "paddle/pten/core/autograd_meta_if.h"
-#include "paddle/pten/core/tensor_impl_if.h"
+#include "paddle/top/core/autograd_meta_if.h"
+#include "paddle/top/core/tensor_impl_if.h"
 
 /**
  * [ Why still include the fluid headers? ]
diff --git a/paddle/pten/api/src/CMakeLists.txt b/paddle/top/api/src/CMakeLists.txt
similarity index 100%
rename from paddle/pten/api/src/CMakeLists.txt
rename to paddle/top/api/src/CMakeLists.txt
diff --git a/paddle/pten/core/CMakeLists.txt b/paddle/top/core/CMakeLists.txt
similarity index 100%
rename from paddle/pten/core/CMakeLists.txt
rename to paddle/top/core/CMakeLists.txt
diff --git a/paddle/pten/core/autograd_meta_if.h b/paddle/top/core/autograd_meta_if.h
similarity index 100%
rename from paddle/pten/core/autograd_meta_if.h
rename to paddle/top/core/autograd_meta_if.h
diff --git a/paddle/pten/core/backend.h b/paddle/top/core/backend.h
similarity index 100%
rename from paddle/pten/core/backend.h
rename to paddle/top/core/backend.h
diff --git a/paddle/pten/core/convert_utils.cc b/paddle/top/core/convert_utils.cc
similarity index 98%
rename from paddle/pten/core/convert_utils.cc
rename to paddle/top/core/convert_utils.cc
index ddc2513d2a65d..fce27f325dc4b 100644
--- a/paddle/pten/core/convert_utils.cc
+++ b/paddle/top/core/convert_utils.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/pten/core/convert_utils.h"
+#include "paddle/top/core/convert_utils.h"
 
 namespace pt {
 
diff --git a/paddle/pten/core/convert_utils.h b/paddle/top/core/convert_utils.h
similarity index 92%
rename from paddle/pten/core/convert_utils.h
rename to paddle/top/core/convert_utils.h
index 398ad61e3cd97..862784a783bd1 100644
--- a/paddle/pten/core/convert_utils.h
+++ b/paddle/top/core/convert_utils.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/pten/core/backend.h"
-#include "paddle/pten/core/dtype.h"
-#include "paddle/pten/core/layout.h"
+#include "paddle/top/core/backend.h"
+#include "paddle/top/core/dtype.h"
+#include "paddle/top/core/layout.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/data_layout.h"
diff --git a/paddle/pten/core/dense_tensor.cc b/paddle/top/core/dense_tensor.cc
similarity index 98%
rename from paddle/pten/core/dense_tensor.cc
rename to paddle/top/core/dense_tensor.cc
index f990351e24e31..f9840bae58580 100644
--- a/paddle/pten/core/dense_tensor.cc
+++ b/paddle/top/core/dense_tensor.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/core/convert_utils.h"
+#include "paddle/top/core/dense_tensor.h"
+#include "paddle/top/core/convert_utils.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/data_type.h"
diff --git a/paddle/pten/core/dense_tensor.h b/paddle/top/core/dense_tensor.h
similarity index 97%
rename from paddle/pten/core/dense_tensor.h
rename to paddle/top/core/dense_tensor.h
index 256dde13fb841..fd53e2db7df5b 100644
--- a/paddle/pten/core/dense_tensor.h
+++ b/paddle/top/core/dense_tensor.h
@@ -16,9 +16,9 @@ limitations under the License. */
 
 #include <memory>
 
-#include "paddle/pten/core/tensor_impl_if.h"
-#include "paddle/pten/core/tensor_meta.h"
-#include "paddle/pten/core/tensor_status.h"
+#include "paddle/top/core/tensor_impl_if.h"
+#include "paddle/top/core/tensor_meta.h"
+#include "paddle/top/core/tensor_status.h"
 
 namespace paddle {
 namespace memory {
diff --git a/paddle/pten/core/dtype.h b/paddle/top/core/dtype.h
similarity index 100%
rename from paddle/pten/core/dtype.h
rename to paddle/top/core/dtype.h
diff --git a/paddle/pten/core/layout.h b/paddle/top/core/layout.h
similarity index 100%
rename from paddle/pten/core/layout.h
rename to paddle/top/core/layout.h
diff --git a/paddle/pten/core/scalar_tensor.h b/paddle/top/core/scalar_tensor.h
similarity index 93%
rename from paddle/pten/core/scalar_tensor.h
rename to paddle/top/core/scalar_tensor.h
index e9836633ba465..dd2062a95c7e8 100644
--- a/paddle/pten/core/scalar_tensor.h
+++ b/paddle/top/core/scalar_tensor.h
@@ -14,6 +14,6 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/top/core/dense_tensor.h"
 
 class LoDTensor : public DenseTensor {};
diff --git a/paddle/pten/core/selected_rows.cc b/paddle/top/core/selected_rows.cc
similarity index 93%
rename from paddle/pten/core/selected_rows.cc
rename to paddle/top/core/selected_rows.cc
index ec70dd0e8cdbe..9655f594c8ea4 100644
--- a/paddle/pten/core/selected_rows.cc
+++ b/paddle/top/core/selected_rows.cc
@@ -12,6 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/pten/core/selected_rows.h"
+#include "paddle/top/core/selected_rows.h"
 
 namespace pt {}  // namespace pt
diff --git a/paddle/pten/core/selected_rows.h b/paddle/top/core/selected_rows.h
similarity index 98%
rename from paddle/pten/core/selected_rows.h
rename to paddle/top/core/selected_rows.h
index 86ba8414f972f..523bf8ec4f1fa 100644
--- a/paddle/pten/core/selected_rows.h
+++ b/paddle/top/core/selected_rows.h
@@ -21,7 +21,7 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 
-#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/top/core/dense_tensor.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/mixed_vector.h"
diff --git a/paddle/pten/core/spatial_tensor.h b/paddle/top/core/spatial_tensor.h
similarity index 100%
rename from paddle/pten/core/spatial_tensor.h
rename to paddle/top/core/spatial_tensor.h
diff --git a/paddle/pten/core/tensor_impl_if.h b/paddle/top/core/tensor_impl_if.h
similarity index 95%
rename from paddle/pten/core/tensor_impl_if.h
rename to paddle/top/core/tensor_impl_if.h
index 8207bb428233f..20e78cff21afc 100644
--- a/paddle/pten/core/tensor_impl_if.h
+++ b/paddle/top/core/tensor_impl_if.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/pten/core/backend.h"
-#include "paddle/pten/core/dtype.h"
-#include "paddle/pten/core/layout.h"
+#include "paddle/top/core/backend.h"
+#include "paddle/top/core/dtype.h"
+#include "paddle/top/core/layout.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/pten/core/tensor_meta.h b/paddle/top/core/tensor_meta.h
similarity index 97%
rename from paddle/pten/core/tensor_meta.h
rename to paddle/top/core/tensor_meta.h
index 063d481e9c4b1..b15ef485c9e10 100644
--- a/paddle/pten/core/tensor_meta.h
+++ b/paddle/top/core/tensor_meta.h
@@ -20,9 +20,9 @@ limitations under the License. */
 #include "mkldnn.hpp"
 #endif
 
-#include "paddle/pten/core/backend.h"
-#include "paddle/pten/core/dtype.h"
-#include "paddle/pten/core/layout.h"
+#include "paddle/top/core/backend.h"
+#include "paddle/top/core/dtype.h"
+#include "paddle/top/core/layout.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/ddim.h"
diff --git a/paddle/pten/core/tensor_status.h b/paddle/top/core/tensor_status.h
similarity index 94%
rename from paddle/pten/core/tensor_status.h
rename to paddle/top/core/tensor_status.h
index 3f6f7060feb0d..a3f6d4fef5a38 100644
--- a/paddle/pten/core/tensor_status.h
+++ b/paddle/top/core/tensor_status.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/pten/core/backend.h"
-#include "paddle/pten/core/dtype.h"
-#include "paddle/pten/core/layout.h"
+#include "paddle/top/core/backend.h"
+#include "paddle/top/core/dtype.h"
+#include "paddle/top/core/layout.h"
 
 namespace pt {
 
diff --git a/paddle/pten/cpu/CMakeLists.txt b/paddle/top/cpu/CMakeLists.txt
similarity index 100%
rename from paddle/pten/cpu/CMakeLists.txt
rename to paddle/top/cpu/CMakeLists.txt
diff --git a/paddle/pten/cpu/math.h b/paddle/top/cpu/math.h
similarity index 96%
rename from paddle/pten/cpu/math.h
rename to paddle/top/cpu/math.h
index 50ba5db3cd2a7..5c0eb1066f4aa 100644
--- a/paddle/pten/cpu/math.h
+++ b/paddle/top/cpu/math.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/module/scale.h"
-#include "paddle/pten/module/sign.h"
+#include "paddle/top/core/dense_tensor.h"
+#include "paddle/top/module/scale.h"
+#include "paddle/top/module/sign.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/eigen.h"
diff --git a/paddle/pten/cuda/CMakeLists.txt b/paddle/top/cuda/CMakeLists.txt
similarity index 100%
rename from paddle/pten/cuda/CMakeLists.txt
rename to paddle/top/cuda/CMakeLists.txt
diff --git a/paddle/pten/cuda/math.cu b/paddle/top/cuda/math.cu
similarity index 97%
rename from paddle/pten/cuda/math.cu
rename to paddle/top/cuda/math.cu
index 585acc41e6a99..b4d384e3d47d0 100644
--- a/paddle/pten/cuda/math.cu
+++ b/paddle/top/cuda/math.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/pten/cuda/math.h"
+#include "paddle/top/cuda/math.h"
 
 #ifdef __NVCC__
 #include "cub/cub.cuh"
@@ -22,7 +22,7 @@ limitations under the License. */
 namespace cub = hipcub;
 #endif
 
-#include "paddle/pten/core/convert_utils.h"
+#include "paddle/top/core/convert_utils.h"
 
 namespace pt {
 
diff --git a/paddle/pten/cuda/math.h b/paddle/top/cuda/math.h
similarity index 93%
rename from paddle/pten/cuda/math.h
rename to paddle/top/cuda/math.h
index 6b610cca839dc..dd9062fc10347 100644
--- a/paddle/pten/cuda/math.h
+++ b/paddle/top/cuda/math.h
@@ -16,9 +16,9 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_CUDA
 
-#include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/module/scale.h"
-#include "paddle/pten/module/sign.h"
+#include "paddle/top/core/dense_tensor.h"
+#include "paddle/top/module/scale.h"
+#include "paddle/top/module/sign.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
diff --git a/paddle/pten/hip/CMakeLists.txt b/paddle/top/hip/CMakeLists.txt
similarity index 100%
rename from paddle/pten/hip/CMakeLists.txt
rename to paddle/top/hip/CMakeLists.txt
diff --git a/paddle/pten/inferdtype/CMakeLists.txt b/paddle/top/inferdtype/CMakeLists.txt
similarity index 100%
rename from paddle/pten/inferdtype/CMakeLists.txt
rename to paddle/top/inferdtype/CMakeLists.txt
diff --git a/paddle/pten/infershape/CMakeLists.txt b/paddle/top/infershape/CMakeLists.txt
similarity index 100%
rename from paddle/pten/infershape/CMakeLists.txt
rename to paddle/top/infershape/CMakeLists.txt
diff --git a/paddle/pten/mkldnn/CMakeLists.txt b/paddle/top/mkldnn/CMakeLists.txt
similarity index 100%
rename from paddle/pten/mkldnn/CMakeLists.txt
rename to paddle/top/mkldnn/CMakeLists.txt
diff --git a/paddle/pten/mkldnn/base.h b/paddle/top/mkldnn/base.h
similarity index 100%
rename from paddle/pten/mkldnn/base.h
rename to paddle/top/mkldnn/base.h
diff --git a/paddle/pten/mkldnn/math.h b/paddle/top/mkldnn/math.h
similarity index 96%
rename from paddle/pten/mkldnn/math.h
rename to paddle/top/mkldnn/math.h
index 7d521516f0a3c..363dbfc6c0807 100644
--- a/paddle/pten/mkldnn/math.h
+++ b/paddle/top/mkldnn/math.h
@@ -16,8 +16,8 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_MKLDNN
 
-#include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/mkldnn/base.h"
+#include "paddle/top/core/dense_tensor.h"
+#include "paddle/top/mkldnn/base.h"
 
 namespace pt {
 
diff --git a/paddle/pten/module/CMakeLists.txt b/paddle/top/module/CMakeLists.txt
similarity index 100%
rename from paddle/pten/module/CMakeLists.txt
rename to paddle/top/module/CMakeLists.txt
diff --git a/paddle/pten/module/scale.h b/paddle/top/module/scale.h
similarity index 97%
rename from paddle/pten/module/scale.h
rename to paddle/top/module/scale.h
index c3eb32ae6c407..a55cfc1fb5d3f 100644
--- a/paddle/pten/module/scale.h
+++ b/paddle/top/module/scale.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/top/core/dense_tensor.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/eigen.h"
diff --git a/paddle/pten/module/sign.h b/paddle/top/module/sign.h
similarity index 97%
rename from paddle/pten/module/sign.h
rename to paddle/top/module/sign.h
index 16e49d475f137..62f27ed60db7f 100644
--- a/paddle/pten/module/sign.h
+++ b/paddle/top/module/sign.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/top/core/dense_tensor.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/eigen.h"
diff --git a/paddle/pten/npu/CMakeLists.txt b/paddle/top/npu/CMakeLists.txt
similarity index 100%
rename from paddle/pten/npu/CMakeLists.txt
rename to paddle/top/npu/CMakeLists.txt
diff --git a/paddle/pten/npu/math.h b/paddle/top/npu/math.h
similarity index 98%
rename from paddle/pten/npu/math.h
rename to paddle/top/npu/math.h
index bdb1768a67eff..a08c732cbddf2 100644
--- a/paddle/pten/npu/math.h
+++ b/paddle/top/npu/math.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_ASCEND_CL
 
-#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/top/core/dense_tensor.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/operators/npu_op_runner.h"
diff --git a/paddle/pten/selected_rows/CMakeLists.txt b/paddle/top/selected_rows/CMakeLists.txt
similarity index 100%
rename from paddle/pten/selected_rows/CMakeLists.txt
rename to paddle/top/selected_rows/CMakeLists.txt
diff --git a/paddle/pten/selected_rows/math.h b/paddle/top/selected_rows/math.h
similarity index 87%
rename from paddle/pten/selected_rows/math.h
rename to paddle/top/selected_rows/math.h
index e2c3c6c703060..a6fa5a1101949 100644
--- a/paddle/pten/selected_rows/math.h
+++ b/paddle/top/selected_rows/math.h
@@ -14,16 +14,16 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/pten/core/selected_rows.h"
+#include "paddle/top/core/selected_rows.h"
 
 // In fact, it is ugly to use such a complicated include
 // relationship when coding.
 // After the kernel registration module is completed, the calculation
 // function should be reused by calling the kernel in global KernelMap.
-#include "paddle/pten/cpu/math.h"
-#include "paddle/pten/cuda/math.h"
-#include "paddle/pten/npu/math.h"
-#include "paddle/pten/xpu/math.h"
+#include "paddle/top/cpu/math.h"
+#include "paddle/top/cuda/math.h"
+#include "paddle/top/npu/math.h"
+#include "paddle/top/xpu/math.h"
 
 // See Note [ Why still include the fluid headers? ]
 
diff --git a/paddle/pten/tests/CMakeLists.txt b/paddle/top/tests/CMakeLists.txt
similarity index 100%
rename from paddle/pten/tests/CMakeLists.txt
rename to paddle/top/tests/CMakeLists.txt
diff --git a/paddle/pten/tests/dense_tensor_test.cc b/paddle/top/tests/dense_tensor_test.cc
similarity index 96%
rename from paddle/pten/tests/dense_tensor_test.cc
rename to paddle/top/tests/dense_tensor_test.cc
index 2aa3edc7699a9..e700c7c5cb815 100644
--- a/paddle/pten/tests/dense_tensor_test.cc
+++ b/paddle/top/tests/dense_tensor_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/top/core/dense_tensor.h"
 
 #include <gtest/gtest.h>
 
diff --git a/paddle/pten/xpu/CMakeLists.txt b/paddle/top/xpu/CMakeLists.txt
similarity index 100%
rename from paddle/pten/xpu/CMakeLists.txt
rename to paddle/top/xpu/CMakeLists.txt
diff --git a/paddle/pten/xpu/math.h b/paddle/top/xpu/math.h
similarity index 98%
rename from paddle/pten/xpu/math.h
rename to paddle/top/xpu/math.h
index 062267d55a962..b81a3632301c7 100644
--- a/paddle/pten/xpu/math.h
+++ b/paddle/top/xpu/math.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
 
-#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/top/core/dense_tensor.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"

From 288efc2ebf684fe48254c305bd5fdf6b48014769 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Thu, 29 Jul 2021 12:05:55 +0000
Subject: [PATCH 012/125] rename util filename

---
 paddle/fluid/framework/{pten_utils.h => top_utils.h} | 0
 paddle/fluid/operators/mean_op.h                     | 2 +-
 paddle/fluid/operators/scale_op.h                    | 2 +-
 paddle/fluid/operators/sign_op.h                     | 2 +-
 4 files changed, 3 insertions(+), 3 deletions(-)
 rename paddle/fluid/framework/{pten_utils.h => top_utils.h} (100%)

diff --git a/paddle/fluid/framework/pten_utils.h b/paddle/fluid/framework/top_utils.h
similarity index 100%
rename from paddle/fluid/framework/pten_utils.h
rename to paddle/fluid/framework/top_utils.h
diff --git a/paddle/fluid/operators/mean_op.h b/paddle/fluid/operators/mean_op.h
index 93888cffcc857..25115c739bd10 100644
--- a/paddle/fluid/operators/mean_op.h
+++ b/paddle/fluid/operators/mean_op.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/pten_utils.h"
+#include "paddle/fluid/framework/top_utils.h"
 
 // only can include the headers in paddle/top/api dirs
 #include "paddle/top/api/dev/core.h"
diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h
index d6dfe507a30ff..f8d3ba41574d4 100644
--- a/paddle/fluid/operators/scale_op.h
+++ b/paddle/fluid/operators/scale_op.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/pten_utils.h"
+#include "paddle/fluid/framework/top_utils.h"
 
 // only can include the headers in paddle/top/api dirs
 #include "paddle/top/api/dev/core.h"
diff --git a/paddle/fluid/operators/sign_op.h b/paddle/fluid/operators/sign_op.h
index 02c1abd3b36b4..3a19572d6bc12 100644
--- a/paddle/fluid/operators/sign_op.h
+++ b/paddle/fluid/operators/sign_op.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/pten_utils.h"
+#include "paddle/fluid/framework/top_utils.h"
 #include "paddle/fluid/operators/eigen/eigen_function.h"
 
 // only can include the headers in paddle/top/api dirs

From be3ddd51e478f18b448da74db3cad83d41ffb9fb Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Thu, 29 Jul 2021 12:24:27 +0000
Subject: [PATCH 013/125] add more comments

---
 paddle/top/core/dense_tensor.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/paddle/top/core/dense_tensor.h b/paddle/top/core/dense_tensor.h
index fd53e2db7df5b..002ad50dc8299 100644
--- a/paddle/top/core/dense_tensor.h
+++ b/paddle/top/core/dense_tensor.h
@@ -107,9 +107,15 @@ class DenseTensor : public TensorImplInterface {
     return reinterpret_cast<const T*>(data());
   }
 
-  // mutable_data does not hold arguments.
-  // Before calling mutable_data, please make sure that Tensor has maintained
+  // NOTE: mutable_data does not hold arguments. Before calling mutable_data,
+  // please make sure that Tensor has maintained
   // the correct meta and status.
+  //
+  // TODO(chenweihang): We need to be able to specify the allocator when
+  // mutable_data, or directly remove the mutable_data method.
+  // DenseTensor cannot actively apply for memory. Its memory application is
+  // handled by the DeviceContext->AllocateTensorData interface.
+  // I prefer the latter
   template <typename T>
   T* mutable_data() {
     static_assert(std::is_pod<T>::value,

From 3386c49be9b872fded2d50d1981a611abb21d1ed Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Fri, 30 Jul 2021 08:28:43 +0000
Subject: [PATCH 014/125] change TensorImplInterface to TensorInterface

---
 paddle/top/api/include/tensor.h               | 27 ++++++++++++------
 paddle/top/core/autograd_meta_if.h            | 28 -------------------
 paddle/top/core/dense_tensor.h                |  6 ++--
 paddle/top/core/selected_rows.h               |  4 +--
 paddle/top/core/spatial_tensor.h              |  4 ++-
 .../{tensor_impl_if.h => tensor_interface.h}  | 18 ++++++------
 6 files changed, 35 insertions(+), 52 deletions(-)
 delete mode 100644 paddle/top/core/autograd_meta_if.h
 rename paddle/top/core/{tensor_impl_if.h => tensor_interface.h} (78%)

diff --git a/paddle/top/api/include/tensor.h b/paddle/top/api/include/tensor.h
index 25a11d1b5d023..9fd36f97d05dd 100644
--- a/paddle/top/api/include/tensor.h
+++ b/paddle/top/api/include/tensor.h
@@ -18,15 +18,15 @@ limitations under the License. */
 #include <memory>
 #include <utility>
 
-#include "paddle/top/core/autograd_meta_if.h"
-#include "paddle/top/core/tensor_impl_if.h"
+#include "paddle/top/core/tensor_interface.h"
 
 /**
  * [ Why still include the fluid headers? ]
  *
  * We hope to organize the basic implementation of Tensor and the logic related
  * to Tensor operation into an independent library, which we call
- * [Tensor Operation Library], so we extract or rewrite the original OpKernels.
+ * [Tensor Operation Library, top], so we extract or rewrite the original
+ * OpKernels.
  *
  * In the future, the training library, inference library and custom operators
  * will link to this Tensor operation library.
@@ -43,6 +43,15 @@ limitations under the License. */
 
 namespace pt {
 
+class Tensor;
+
+class AutogradMetaInterface {
+ public:
+  virtual const Tensor& grad() const = 0;
+  virtual ~AutogradMetaInterface() = 0;
+  // TODO(yangjiabin): design other methods
+};
+
 /**
  * Tensor is the API description of the basic data structure in the
  * [ Paddle "Tensor OPeration (top)" Library ].
@@ -64,7 +73,7 @@ namespace pt {
  * letters and underscores.
  *
  * Note: Tensor cannot be inherited. The heterogeneous Tensor implementation
- * can be achieved by inheriting the underlying TensorImplInterface.
+ * can be achieved by inheriting the underlying TensorInterface.
  *
  * Note: This Tensor API is suitable for training and custom operators,
  * another simple Tensor design may be required for inference.
@@ -79,10 +88,10 @@ class Tensor final {
 
   /**
    * @description: Use a TensorImpl pointer to construct a Tensor
-   * @param {shared_ptr<TensorImplInterface>} tensor_impl
+   * @param {shared_ptr<TensorInterface>} tensor_impl
    * @return {Tensor}
    */
-  explicit Tensor(std::shared_ptr<TensorImplInterface> tensor_impl)
+  explicit Tensor(std::shared_ptr<TensorInterface> tensor_impl)
       : impl_(std::move(tensor_impl)) {
     if (impl_.get() == nullptr) {
       throw std::runtime_error("TensorImpl with nullptr is not supported");
@@ -166,9 +175,9 @@ class Tensor final {
   /**
    * @description: Return the implemention of current Tensor.
    * @param None
-   * @return {std::shared_ptr<TensorImplInterface>}
+   * @return {std::shared_ptr<TensorInterface>}
    */
-  std::shared_ptr<TensorImplInterface> impl() const { return impl_; }
+  std::shared_ptr<TensorInterface> impl() const { return impl_; }
 
   // Whether API Tensor need `data` and `mutable_data`?
 
@@ -234,7 +243,7 @@ class Tensor final {
    * heterogeneous Tensor implementation, so that the API level can be unified
    * to one `Tensor`.
    */
-  std::shared_ptr<TensorImplInterface> impl_;
+  std::shared_ptr<TensorInterface> impl_;
 
   /**
    * [ Why need abstract AutogradMetaInterface here? ]
diff --git a/paddle/top/core/autograd_meta_if.h b/paddle/top/core/autograd_meta_if.h
deleted file mode 100644
index 2b301f4c75c07..0000000000000
--- a/paddle/top/core/autograd_meta_if.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-namespace pt {
-
-class Tensor;
-
-class AutogradMetaInterface {
- public:
-  virtual const Tensor& grad() const = 0;
-  virtual ~AutogradMetaInterface() = 0;
-  // TODO(yangjiabin): design other methods
-};
-
-}  // namespace pt
diff --git a/paddle/top/core/dense_tensor.h b/paddle/top/core/dense_tensor.h
index 002ad50dc8299..31908b9b3526d 100644
--- a/paddle/top/core/dense_tensor.h
+++ b/paddle/top/core/dense_tensor.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include <memory>
 
-#include "paddle/top/core/tensor_impl_if.h"
+#include "paddle/top/core/tensor_interface.h"
 #include "paddle/top/core/tensor_meta.h"
 #include "paddle/top/core/tensor_status.h"
 
@@ -47,9 +47,9 @@ using Allocation = paddle::memory::allocation::Allocation;
  *
  * If the memory layout is different, it cannot be described based on the
  * general Allocation, and it needs to be directly inherited from
- * TensorImplInterface.
+ * TensorInterface.
  */
-class DenseTensor : public TensorImplInterface {
+class DenseTensor : public TensorInterface {
  public:
   // Not allowed to initialize a tensor without descriptive metadata
   DenseTensor() = delete;
diff --git a/paddle/top/core/selected_rows.h b/paddle/top/core/selected_rows.h
index 523bf8ec4f1fa..4643ed737dadb 100644
--- a/paddle/top/core/selected_rows.h
+++ b/paddle/top/core/selected_rows.h
@@ -21,7 +21,7 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 
-#include "paddle/top/core/dense_tensor.h"
+#include "paddle/top/core/tensor_interface.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/mixed_vector.h"
@@ -44,7 +44,7 @@ using RWLock = paddle::framework::RWLock;
 
 // TODO(chenweihang): add other methods later
 
-class SelectedRowsTensor : public TensorImplInterface {
+class SelectedRowsTensor : public TensorInterface {
  public:
   SelectedRowsTensor() = delete;
 
diff --git a/paddle/top/core/spatial_tensor.h b/paddle/top/core/spatial_tensor.h
index 8093417f626a8..46dc21f83ccbb 100644
--- a/paddle/top/core/spatial_tensor.h
+++ b/paddle/top/core/spatial_tensor.h
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/top/core/tensor_interface.h"
+
 namespace pt {
 
 /**
@@ -25,7 +27,7 @@ namespace pt {
  */
 
 template <typename AllocationType>
-class SpatialTensor : public TensorImplInterface {
+class SpatialTensor : public TensorInterface {
  public:
   SpatialTensor(std::shared_ptr<AllocationType> allocation,
                 std::unique_ptr<TensorMeta> meta,
diff --git a/paddle/top/core/tensor_impl_if.h b/paddle/top/core/tensor_interface.h
similarity index 78%
rename from paddle/top/core/tensor_impl_if.h
rename to paddle/top/core/tensor_interface.h
index 20e78cff21afc..4649ad19d2e6a 100644
--- a/paddle/top/core/tensor_impl_if.h
+++ b/paddle/top/core/tensor_interface.h
@@ -41,25 +41,25 @@ using Place = paddle::platform::Place;
  * The abstract class of Tensor implemention, it needs to define its basic
  * behavior through inherited classes.
  *
- * TensorImplInterface allows Tensor to uniformly access various different
+ * TensorInterface allows Tensor to uniformly access various different
  * TensorImpls within the framework. It will not be used as a kernel argument,
  * but only contains the interfaces supported by various TensorImpls.
  * In extreme cases, it can be an empty base class.
  *
- * If we don't use TensorImplInterface, we may need to use shared_ptr<void>
+ * If we don't use TensorInterface, we may need to use shared_ptr<void>
  * to unify Tensor's API.
  */
-class TensorImplInterface {
+class TensorInterface {
  public:
   // Not allowed to initialize a tensor without descriptive metadata
-  TensorImplInterface() = default;
+  TensorInterface() = default;
 
-  TensorImplInterface(const TensorImplInterface&) = delete;
-  TensorImplInterface& operator=(const TensorImplInterface&) = delete;
-  TensorImplInterface(TensorImplInterface&&) = delete;
-  TensorImplInterface& operator=(TensorImplInterface&&) = delete;
+  TensorInterface(const TensorInterface&) = delete;
+  TensorInterface& operator=(const TensorInterface&) = delete;
+  TensorInterface(TensorInterface&&) = delete;
+  TensorInterface& operator=(TensorInterface&&) = delete;
 
-  virtual ~TensorImplInterface() {}
+  virtual ~TensorInterface() {}
 
   virtual int64_t numel() const = 0;
 

From 4ef6be5351d63cd249d63a125ba7a7697dc05aab Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Wed, 4 Aug 2021 03:45:19 +0000
Subject: [PATCH 015/125] add kernel key and factory

---
 paddle/top/core/CMakeLists.txt          |   8 +-
 paddle/top/core/backend.cc              |  58 +++++++++
 paddle/top/core/backend.h               |   7 +-
 paddle/top/core/dtype.cc                |  64 ++++++++++
 paddle/top/core/dtype.h                 |   4 +
 paddle/top/core/kernel_context.h        |  15 +++
 paddle/top/core/kernel_factory.cc       |  47 ++++++++
 paddle/top/core/kernel_factory.h        | 151 ++++++++++++++++++++++++
 paddle/top/core/kernel_fn_utils.h       |  15 +++
 paddle/top/core/kernel_registry.h       |  15 +++
 paddle/top/core/layout.cc               |  43 +++++++
 paddle/top/core/layout.h                |   4 +
 paddle/top/tests/CMakeLists.txt         |   1 +
 paddle/top/tests/backend_test.cc        |  17 +++
 paddle/top/tests/dtype_test.cc          |  13 ++
 paddle/top/tests/kernel_factory_test.cc |  23 ++++
 paddle/top/tests/layout_test.cc         |  13 ++
 17 files changed, 495 insertions(+), 3 deletions(-)
 create mode 100644 paddle/top/core/backend.cc
 create mode 100644 paddle/top/core/dtype.cc
 create mode 100644 paddle/top/core/kernel_context.h
 create mode 100644 paddle/top/core/kernel_factory.cc
 create mode 100644 paddle/top/core/kernel_factory.h
 create mode 100644 paddle/top/core/kernel_fn_utils.h
 create mode 100644 paddle/top/core/kernel_registry.h
 create mode 100644 paddle/top/core/layout.cc
 create mode 100644 paddle/top/tests/backend_test.cc
 create mode 100644 paddle/top/tests/dtype_test.cc
 create mode 100644 paddle/top/tests/kernel_factory_test.cc
 create mode 100644 paddle/top/tests/layout_test.cc

diff --git a/paddle/top/core/CMakeLists.txt b/paddle/top/core/CMakeLists.txt
index 6d0e9297b3281..bf143349e382b 100644
--- a/paddle/top/core/CMakeLists.txt
+++ b/paddle/top/core/CMakeLists.txt
@@ -4,6 +4,12 @@ ELSE()
     set(MKLDNN_CTX_DEPS)
 ENDIF()
 
-cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place)
+cc_library(backend SRCS backend.cc)
+cc_library(dtype SRCS dtype.cc)
+cc_library(layout SRCS layout.cc)
+
+cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place backend dtype layout)
 cc_library(dense_tensor SRCS dense_tensor.cc DEPS enforce data_type ddim allocator place convert_utils ${MKLDNN_CTX_DEPS})
 cc_library(selected_rows_tensor SRCS selected_rows.cc DEPS dense_tensor)
+
+cc_library(kernel_factory SRCS kernel_factory.cc DEPS enforce backend dtype layout)
diff --git a/paddle/top/core/backend.cc b/paddle/top/core/backend.cc
new file mode 100644
index 0000000000000..701aa6edf9478
--- /dev/null
+++ b/paddle/top/core/backend.cc
@@ -0,0 +1,58 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/top/core/backend.h"
+
+namespace pt {
+
+std::ostream& operator<<(std::ostream& os, Backend backend) {
+  switch (backend) {
+    case Backend::kUndef:
+      os << "Undefined";
+      break;
+    case Backend::kCPU:
+      os << "CPU";
+      break;
+    case Backend::kCUDA:
+      os << "CUDA";
+      break;
+    case Backend::kCUDAPinned:
+      os << "CUDAPinned";
+      break;
+    case Backend::kHIP:
+      os << "HIP";
+      break;
+    case Backend::kXPU:
+      os << "XPU";
+      break;
+    case Backend::kNPU:
+      os << "NPU";
+      break;
+    case Backend::kNPUPinned:
+      os << "NPUPinned";
+      break;
+    case Backend::kMKLDNN:
+      os << "MKLDNN";
+      break;
+    case Backend::kCUDNN:
+      os << "CUDNN";
+      break;
+    default:
+      // TODO(chenweihang): change to enforce later
+      throw std::runtime_error("Invalid Backend type.");
+  }
+  return os;
+}
+
+}  // namespace pt
diff --git a/paddle/top/core/backend.h b/paddle/top/core/backend.h
index 78c2361c61e6f..db77d2156349c 100644
--- a/paddle/top/core/backend.h
+++ b/paddle/top/core/backend.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <ostream>
 namespace pt {
 
 /**
@@ -32,14 +33,16 @@ enum class Backend {
   kUndef = 0,
   kCPU,
   kCUDA,
-  kCUDAPinned,
+  kCUDAPinned,  // need to be removed
   kHIP,
   kXPU,
   kNPU,
-  kNPUPinned,
+  kNPUPinned,  // need to be removed
   kMKLDNN,
   kCUDNN,
   kNumBackends,
 };
 
+std::ostream& operator<<(std::ostream& os, Backend backend);
+
 }  // namespace pt
diff --git a/paddle/top/core/dtype.cc b/paddle/top/core/dtype.cc
new file mode 100644
index 0000000000000..1790f1f2c3bbf
--- /dev/null
+++ b/paddle/top/core/dtype.cc
@@ -0,0 +1,64 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/top/core/dtype.h"
+
+namespace pt {
+
+std::ostream& operator<<(std::ostream& os, DataType dtype) {
+  switch (dtype) {
+    case DataType::kUndef:
+      os << "Undefined";
+      break;
+    case DataType::kBOOL:
+      os << "bool";
+      break;
+    case DataType::kINT8:
+      os << "int8";
+      break;
+    case DataType::kUINT8:
+      os << "uint8";
+      break;
+    case DataType::kINT16:
+      os << "int16";
+      break;
+    case DataType::kINT32:
+      os << "int32";
+      break;
+    case DataType::kINT64:
+      os << "int64";
+      break;
+    case DataType::kFLOAT16:
+      os << "float16";
+      break;
+    case DataType::kFLOAT32:
+      os << "float32";
+      break;
+    case DataType::kFLOAT64:
+      os << "float64";
+      break;
+    case DataType::kCOMPLEX64:
+      os << "complex64";
+      break;
+    case DataType::kCOMPLEX128:
+      os << "complex128";
+      break;
+    default:
+      // TODO(chenweihang): change to enforce later
+      throw std::runtime_error("Invalid DataType type.");
+  }
+  return os;
+}
+
+}  // namespace pt
diff --git a/paddle/top/core/dtype.h b/paddle/top/core/dtype.h
index 3879dfdd14399..89d0619d64984 100644
--- a/paddle/top/core/dtype.h
+++ b/paddle/top/core/dtype.h
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include <ostream>
+
 namespace pt {
 
 /**
@@ -45,4 +47,6 @@ enum class DataType {
   kNumDataTypes,
 };
 
+std::ostream& operator<<(std::ostream& os, DataType dtype);
+
 }  // namespace pt
diff --git a/paddle/top/core/kernel_context.h b/paddle/top/core/kernel_context.h
new file mode 100644
index 0000000000000..6672a72aab304
--- /dev/null
+++ b/paddle/top/core/kernel_context.h
@@ -0,0 +1,15 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
diff --git a/paddle/top/core/kernel_factory.cc b/paddle/top/core/kernel_factory.cc
new file mode 100644
index 0000000000000..bb860b1183242
--- /dev/null
+++ b/paddle/top/core/kernel_factory.cc
@@ -0,0 +1,47 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/top/core/kernel_factory.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/enforce.h"
+
+namespace pt {
+
+OpKernelFactory& OpKernelFactory::Instance() {
+  static OpKernelFactory g_op_kernel_factory;
+  return g_op_kernel_factory;
+}
+
+const OpKernelFn& OpKernelFactory::FindOpKernel(
+    const OperationName& op_name, const OpKernelKey& kernel_key) const {
+  auto iter = kernels_.find(op_name);
+  PADDLE_ENFORCE_NE(iter,
+                    kernels_.end(),
+                    paddle::platform::errors::NotFound(
+                        "The operation `%s` is not registered.", op_name));
+
+  auto kernel_iter = iter->second.find(kernel_key);
+  PADDLE_ENFORCE_NE(
+      kernel_iter,
+      iter->second.end(),
+      paddle::platform::errors::NotFound(
+          "The kernel with key %s of operation `%s` is not registered.",
+          kernel_key,
+          op_name));
+
+  return kernel_iter->second;
+}
+
+}  // namespace pt
diff --git a/paddle/top/core/kernel_factory.h b/paddle/top/core/kernel_factory.h
new file mode 100644
index 0000000000000..f2f3f4dcf781f
--- /dev/null
+++ b/paddle/top/core/kernel_factory.h
@@ -0,0 +1,151 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <ostream>
+#include <string>
+#include <unordered_map>
+#include <utility>
+
+#include "paddle/top/core/backend.h"
+#include "paddle/top/core/dtype.h"
+#include "paddle/top/core/layout.h"
+
+namespace pt {
+
+class OpKernelContext;
+
+using OpKernelFn = void (*)(OpKernelContext* ctx);
+
+struct OperationName final {
+  std::string op_type;
+  std::string overload_type;
+  // Avoid calculating Hash value at runtime
+  size_t hash_value;
+
+  OperationName(std::string op_type, std::string overload_type)
+      : op_type(std::move(op_type)), overload_type(std::move(overload_type)) {
+    hash_value = std::hash<std::string>()(op_type) ^
+                 (std::hash<std::string>()(overload_type) << 1);
+  }
+
+  struct Hash {
+    size_t operator()(const OperationName& op_name) const {
+      return op_name.hash_value;
+    }
+  };
+
+  bool operator<(const OperationName& op_name) const {
+    return hash_value < op_name.hash_value;
+  }
+
+  bool operator==(const OperationName& op_name) const {
+    return hash_value == op_name.hash_value;
+  }
+
+  bool operator!=(const OperationName& op_name) const {
+    return hash_value != op_name.hash_value;
+  }
+};
+
+class OpKernelKey {
+ public:
+  OpKernelKey(Backend backend, DataType dtype, DataLayout layout)
+      : backend_(backend), dtype_(dtype), layout_(layout) {
+    // |----31-20------|---19-16----|---15-8---|---7-0---|
+    // | For extension | DataLayout | DataType | Backend |
+
+    hash_value_ = 0;
+    hash_value_ |= static_cast<uint8_t>(backend_);
+    hash_value_ |= (static_cast<uint16_t>(dtype_) << kBackendBitLength);
+    hash_value_ |= (static_cast<uint32_t>(layout_)
+                    << (kBackendBitLength + kDataTypeBitLength));
+  }
+
+  Backend backend() const { return backend_; }
+  DataType dtype() const { return dtype_; }
+  DataLayout layout() const { return layout_; }
+
+  uint32_t hash_value() const { return hash_value_; }
+
+  bool operator<(const OpKernelKey& key) const {
+    return hash_value_ < key.hash_value();
+  }
+
+  bool operator==(const OpKernelKey& key) const {
+    return hash_value_ == key.hash_value();
+  }
+
+  bool operator!=(const OpKernelKey& key) const {
+    return hash_value_ != key.hash_value();
+  }
+
+  struct Hash {
+    uint32_t operator()(const OpKernelKey& key) const {
+      return key.hash_value();
+    }
+  };
+
+ private:
+  // In total should be smaller than 32.
+  constexpr static int kBackendBitLength = 8;
+  constexpr static int kDataTypeBitLength = 8;
+  constexpr static int kDataLayoutBitLength = 4;
+
+  Backend backend_;
+  DataType dtype_;
+  DataLayout layout_;
+
+  // Avoid calculating Hash value at runtime.
+  // Note: Now the number of bits we need does not exceed 32 bits, so there is
+  // no need to use 64 bits. If needed in the future, it can be expanded,
+  // but now we don’t over-design.
+  uint32_t hash_value_;
+};
+
+class OpKernelFactory {
+ public:
+  static OpKernelFactory& Instance();
+
+  const OpKernelFn& FindOpKernel(const OperationName& op_name,
+                                 const OpKernelKey& kernel_key) const;
+
+ private:
+  OpKernelFactory();
+
+  // replaced by paddle::flat_hash_map later
+  std::unordered_map<
+      OperationName,
+      std::unordered_map<OpKernelKey, OpKernelFn, OpKernelKey::Hash>,
+      OperationName::Hash>
+      kernels_;
+};
+
+/** operator << overload **/
+
+inline std::ostream& operator<<(std::ostream& os,
+                                const OperationName& op_name) {
+  os << op_name.op_type << "." << op_name.overload_type;
+  return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os,
+                                const OpKernelKey& kernel_key) {
+  os << "(" << kernel_key.backend() << ", " << kernel_key.dtype() << ", "
+     << kernel_key.layout() << ")";
+  return os;
+}
+
+}  // namespace pt
diff --git a/paddle/top/core/kernel_fn_utils.h b/paddle/top/core/kernel_fn_utils.h
new file mode 100644
index 0000000000000..6672a72aab304
--- /dev/null
+++ b/paddle/top/core/kernel_fn_utils.h
@@ -0,0 +1,15 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
diff --git a/paddle/top/core/kernel_registry.h b/paddle/top/core/kernel_registry.h
new file mode 100644
index 0000000000000..6672a72aab304
--- /dev/null
+++ b/paddle/top/core/kernel_registry.h
@@ -0,0 +1,15 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
diff --git a/paddle/top/core/layout.cc b/paddle/top/core/layout.cc
new file mode 100644
index 0000000000000..a25f1818cb5a7
--- /dev/null
+++ b/paddle/top/core/layout.cc
@@ -0,0 +1,43 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/top/core/layout.h"
+
+namespace pt {
+
+std::ostream& operator<<(std::ostream& os, DataLayout dtype) {
+  switch (dtype) {
+    case DataLayout::kUndef:
+      os << "Undefined";
+      break;
+    case DataLayout::kAny:
+      os << "Any";
+      break;
+    case DataLayout::kNHWC:
+      os << "NHWC";
+      break;
+    case DataLayout::kNCHW:
+      os << "NCHW";
+      break;
+    case DataLayout::kMKLDNN:
+      os << "MKLDNN";
+      break;
+    default:
+      // TODO(chenweihang): change to enforce later
+      throw std::runtime_error("Invalid DataLayout type.");
+  }
+  return os;
+}
+
+}  // namespace pt
diff --git a/paddle/top/core/layout.h b/paddle/top/core/layout.h
index 2f4e95f36fdfd..10a7aa1f677c0 100644
--- a/paddle/top/core/layout.h
+++ b/paddle/top/core/layout.h
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include <ostream>
+
 namespace pt {
 
 /**
@@ -34,4 +36,6 @@ enum class DataLayout {
   kNumLayouts,
 };
 
+std::ostream& operator<<(std::ostream& os, DataLayout dtype);
+
 }  // namespace pt
diff --git a/paddle/top/tests/CMakeLists.txt b/paddle/top/tests/CMakeLists.txt
index eea2826c4e066..87e05028db53f 100644
--- a/paddle/top/tests/CMakeLists.txt
+++ b/paddle/top/tests/CMakeLists.txt
@@ -1 +1,2 @@
 cc_test(dense_tensor_test SRCS dense_tensor_test.cc DEPS dense_tensor)
+cc_test(kernel_factory_test SRCS kernel_factory_test.cc DEPS kernel_factory)
diff --git a/paddle/top/tests/backend_test.cc b/paddle/top/tests/backend_test.cc
new file mode 100644
index 0000000000000..add873f8571f7
--- /dev/null
+++ b/paddle/top/tests/backend_test.cc
@@ -0,0 +1,17 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/top/core/backend.h"
+
+#include <gtest/gtest.h>
diff --git a/paddle/top/tests/dtype_test.cc b/paddle/top/tests/dtype_test.cc
new file mode 100644
index 0000000000000..b2b09faaa9d44
--- /dev/null
+++ b/paddle/top/tests/dtype_test.cc
@@ -0,0 +1,13 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
diff --git a/paddle/top/tests/kernel_factory_test.cc b/paddle/top/tests/kernel_factory_test.cc
new file mode 100644
index 0000000000000..158f10c1e5c65
--- /dev/null
+++ b/paddle/top/tests/kernel_factory_test.cc
@@ -0,0 +1,23 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/top/core/kernel_factory.h"
+
+#include "gtest/gtest.h"
+
+TEST(OpKernelFactory, OpKernelKey) {
+  pt::OpKernelKey key(
+      pt::Backend::kCPU, pt::DataType::kFLOAT32, pt::DataLayout::kNCHW);
+  std::cout << key;
+}
diff --git a/paddle/top/tests/layout_test.cc b/paddle/top/tests/layout_test.cc
new file mode 100644
index 0000000000000..b2b09faaa9d44
--- /dev/null
+++ b/paddle/top/tests/layout_test.cc
@@ -0,0 +1,13 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */

From b69066e8c17848636332b20b113cd9b87bdcc2e8 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Wed, 4 Aug 2021 09:27:39 +0000
Subject: [PATCH 016/125] remove MKLDNNTensorMeta, add MKLDNNDenseTensor

---
 paddle/fluid/framework/CMakeLists.txt |  2 +
 paddle/fluid/framework/top_utils.cc   | 86 +++++++++++++++++++++++++++
 paddle/fluid/framework/top_utils.h    | 36 +----------
 paddle/fluid/operators/CMakeLists.txt |  1 +
 paddle/fluid/operators/scale_op.h     |  7 +++
 paddle/top/core/dense_tensor.cc       | 14 ++---
 paddle/top/core/dense_tensor.h        | 27 +++++----
 paddle/top/core/mkldnn_dense_tensor.h | 56 +++++++++++++++++
 paddle/top/core/selected_rows.h       |  6 +-
 paddle/top/core/tensor_meta.h         | 35 +----------
 paddle/top/core/tensor_status.h       |  4 +-
 paddle/top/cuda/math.cu               | 12 ++--
 paddle/top/mkldnn/base.h              |  8 +--
 paddle/top/mkldnn/math.h              | 12 ++--
 paddle/top/tests/dense_tensor_test.cc | 12 ++--
 15 files changed, 205 insertions(+), 113 deletions(-)
 create mode 100644 paddle/fluid/framework/top_utils.cc
 create mode 100644 paddle/top/core/mkldnn_dense_tensor.h

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 485fddff4df42..088c7d41328f1 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -381,6 +381,8 @@ cc_library(save_load_util SRCS save_load_util.cc DEPS tensor scope layer)
 cc_test(save_load_util_test SRCS save_load_util_test.cc DEPS save_load_util tensor scope layer)
 cc_library(generator SRCS generator.cc DEPS enforce place)
 
+cc_library(top_utils SRCS top_utils.cc DEPS tensor place top)
+
 # Get the current working branch
 execute_process(
   COMMAND git rev-parse --abbrev-ref HEAD
diff --git a/paddle/fluid/framework/top_utils.cc b/paddle/fluid/framework/top_utils.cc
new file mode 100644
index 0000000000000..ac690a0ebc46b
--- /dev/null
+++ b/paddle/fluid/framework/top_utils.cc
@@ -0,0 +1,86 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/top_utils.h"
+
+#include "paddle/top/core/convert_utils.h"
+#include "paddle/top/core/dense_tensor.h"
+#include "paddle/top/core/mkldnn_dense_tensor.h"
+
+namespace paddle {
+namespace framework {
+
+/* For DenseTensor */
+
+template <>
+std::shared_ptr<pt::DenseTensor> MakeTensorImpl<pt::DenseTensor>(
+    const Tensor& tensor, const platform::Place& place,
+    proto::VarType::Type type) {
+  auto holder = tensor.Holder();
+  auto tensor_impl = std::make_shared<pt::DenseTensor>(
+      pt::TensorMeta(tensor.dims(), pt::TransToPtenBackend(place),
+                     pt::TransToPtenDataType(type),
+                     pt::TransToPtenLayout(tensor.layout()), tensor.offset()),
+      pt::TensorStatus());
+
+  if (holder != nullptr) {
+    tensor_impl->ShareAllocation(tensor.Holder());
+  } else {
+    LOG(WARNING) << "Old Tensor holder is nullptr.";
+  }
+  return tensor_impl;
+}
+
+template <>
+void ShareTensorImpl<pt::DenseTensor>(pt::DenseTensor* tensor_impl,
+                                      Tensor* out) {
+  out->ResetHolderWithType(tensor_impl->MoveMemory(),
+                           pt::TransToProtoVarType(tensor_impl->type()));
+}
+
+/* For MKLDNNDenseTensor (move this part into a single file later) */
+#ifdef PADDLE_WITH_MKLDNN
+
+template <>
+std::shared_ptr<pt::MKLDNNDenseTensor> MakeTensorImpl<pt::MKLDNNDenseTensor>(
+    const Tensor& tensor, const platform::Place& place,
+    proto::VarType::Type type) {
+  auto holder = tensor.Holder();
+  auto tensor_impl = std::make_shared<pt::MKLDNNDenseTensor>(
+      pt::TensorMeta(tensor.dims(), pt::TransToPtenBackend(place),
+                     pt::TransToPtenDataType(type),
+                     pt::TransToPtenLayout(tensor.layout()), tensor.offset()),
+      pt::TensorStatus());
+
+  if (holder != nullptr) {
+    tensor_impl->ShareAllocation(tensor.Holder());
+  } else {
+    LOG(WARNING) << "Old MKLDNN Tensor holder is nullptr.";
+  }
+
+  tensor_impl->set_format(tensor.format());
+  return tensor_impl;
+}
+
+template <>
+void ShareTensorImpl(pt::MKLDNNDenseTensor* tensor_impl, Tensor* out) {
+  out->ResetHolderWithType(tensor_impl->MoveMemory(),
+                           pt::TransToProtoVarType(tensor_impl->type()));
+  out->set_format(tensor_impl->format());
+}
+
+#endif
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/top_utils.h b/paddle/fluid/framework/top_utils.h
index 0cb6f1e3363d5..adc188fa1fa0f 100644
--- a/paddle/fluid/framework/top_utils.h
+++ b/paddle/fluid/framework/top_utils.h
@@ -14,9 +14,6 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/top/core/convert_utils.h"
-#include "paddle/top/core/dense_tensor.h"
-
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/place.h"
 
@@ -26,39 +23,10 @@ namespace framework {
 template <typename TensorImplT>
 std::shared_ptr<TensorImplT> MakeTensorImpl(const Tensor& tensor,
                                             const platform::Place& place,
-                                            proto::VarType::Type type) {
-  auto holder = tensor.Holder();
-  auto tensor_impl = std::make_shared<TensorImplT>(
-#ifdef PADDLE_WITH_MKLDNN
-      std::unique_ptr<pt::TensorMeta>(new pt::MKLDNNTensorMeta(
-          tensor.dims(), pt::TransToPtenBackend(place),
-          pt::TransToPtenDataType(type), pt::TransToPtenLayout(tensor.layout()),
-          tensor.offset(), /*lod=*/{}, tensor.format())));
-#else
-      std::unique_ptr<pt::TensorMeta>(new pt::TensorMeta(
-          tensor.dims(), pt::TransToPtenBackend(place),
-          pt::TransToPtenDataType(type), pt::TransToPtenLayout(tensor.layout()),
-          tensor.offset())));
-#endif
-  if (holder != nullptr) {
-    tensor_impl->template ShareAllocation(tensor.Holder());
-  } else {
-    LOG(WARNING) << "Old Tensor holder is nullptr.";
-  }
-  return tensor_impl;
-}
+                                            proto::VarType::Type type);
 
 template <typename TensorImplT>
-void ShareTensorImpl(TensorImplT* tensor_impl, Tensor* out) {
-  out->ResetHolderWithType(
-      tensor_impl->template MoveMemory(),
-      pt::TransToProtoVarType(tensor_impl->template type()));
-#ifdef PADDLE_WITH_MKLDNN
-  out->set_format(
-      dynamic_cast<const pt::MKLDNNTensorMeta&>(tensor_impl->template meta())
-          .format);
-#endif
-}
+void ShareTensorImpl(TensorImplT* tensor_impl, Tensor* out);
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index fb4f158c9da1c..f852724ee2188 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -75,6 +75,7 @@ if(WITH_UNITY_BUILD)
 endif()
 
 set(OP_HEADER_DEPS ${OP_HEADER_DEPS} top)
+set(OP_HEADER_DEPS ${OP_HEADER_DEPS} top_utils)
 register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op lstm_op run_program_op eye_op recurrent_op
         sync_batch_norm_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
 
diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h
index f8d3ba41574d4..e00c1c1dfcf28 100644
--- a/paddle/fluid/operators/scale_op.h
+++ b/paddle/fluid/operators/scale_op.h
@@ -64,10 +64,17 @@ class ScaleKernel : public framework::OpKernel<T> {
         framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var);
     auto& dev_ctx = ctx.device_context<DeviceContext>();
 
+#ifdef PADDLE_WITH_MKLDNN
+    auto pt_x = framework::MakeTensorImpl<pt::MKLDNNDenseTensor>(
+        *in, in->place(), in->type());
+    auto pt_out = framework::MakeTensorImpl<pt::MKLDNNDenseTensor>(
+        *out, in->place(), in->type());
+#else
     auto pt_x = framework::MakeTensorImpl<pt::DenseTensor>(*in, in->place(),
                                                            in->type());
     auto pt_out = framework::MakeTensorImpl<pt::DenseTensor>(*out, in->place(),
                                                              in->type());
+#endif
 
     // call new kernel
     pt::Scale<T>(dev_ctx, *pt_x.get(), scale, bias, bias_after_scale,
diff --git a/paddle/top/core/dense_tensor.cc b/paddle/top/core/dense_tensor.cc
index f9840bae58580..b6a73c31720d9 100644
--- a/paddle/top/core/dense_tensor.cc
+++ b/paddle/top/core/dense_tensor.cc
@@ -52,7 +52,7 @@ void DenseTensor::ShareAllocation(
 
 // TODO(chenweihang): Add other place branchs
 Place DenseTensor::GetPlaceByBackend() const {
-  switch (meta_->backend) {
+  switch (meta_.backend) {
     case Backend::kCPU:
       return CPUPlace();
 #ifdef PADDLE_WITH_CUDA
@@ -78,7 +78,7 @@ Place DenseTensor::GetPlaceByBackend() const {
 }
 
 size_t DenseTensor::MemorySize() const {
-  return allocation_ == nullptr ? 0UL : allocation_->size() - meta_->offset;
+  return allocation_ == nullptr ? 0UL : allocation_->size() - meta_.offset;
 }
 
 void DenseTensor::CheckMemorySize() const {
@@ -87,7 +87,7 @@ void DenseTensor::CheckMemorySize() const {
                               "Tensor holds no memory. "
                               "Call Tensor::mutable_data firstly."));
   size_t size_of_type =
-      paddle::framework::SizeOfType(TransToProtoVarType(meta_->type));
+      paddle::framework::SizeOfType(TransToProtoVarType(meta_.type));
   PADDLE_ENFORCE_LE(
       numel() * size_of_type,
       MemorySize(),
@@ -107,7 +107,7 @@ std::shared_ptr<Allocation> DenseTensor::MoveMemory() {
 const void* DenseTensor::data() const {
   CheckMemorySize();
   return reinterpret_cast<const void*>(
-      reinterpret_cast<uintptr_t>(allocation_->ptr()) + meta_->offset);
+      reinterpret_cast<uintptr_t>(allocation_->ptr()) + meta_.offset);
 }
 
 void* DenseTensor::mutable_data() {
@@ -120,7 +120,7 @@ void* DenseTensor::mutable_data() {
           dims(),
           "] now"));
   size_t size =
-      numel() * paddle::framework::SizeOfType(TransToProtoVarType(meta_->type));
+      numel() * paddle::framework::SizeOfType(TransToProtoVarType(meta_.type));
   auto place = GetPlaceByBackend();
   if (allocation_ == nullptr) {
     allocation_.reset();
@@ -128,7 +128,7 @@ void* DenseTensor::mutable_data() {
   } else {
     LOG(WARNING) << "When call mutable_data, DenseTensor has been initialized.";
     if (!(allocation_->place() == place) ||
-        allocation_->size() < size + meta_->offset) {
+        allocation_->size() < size + meta_.offset) {
       allocation_.reset();
       allocation_ = paddle::memory::AllocShared(place, size);
     } else {
@@ -136,7 +136,7 @@ void* DenseTensor::mutable_data() {
     }
   }
   return reinterpret_cast<void*>(
-      reinterpret_cast<uintptr_t>(allocation_->ptr()) + meta_->offset);
+      reinterpret_cast<uintptr_t>(allocation_->ptr()) + meta_.offset);
 }
 
 }  // namespace pt
diff --git a/paddle/top/core/dense_tensor.h b/paddle/top/core/dense_tensor.h
index 31908b9b3526d..b3dad8b32f54b 100644
--- a/paddle/top/core/dense_tensor.h
+++ b/paddle/top/core/dense_tensor.h
@@ -65,24 +65,25 @@ class DenseTensor : public TensorInterface {
    *
    * Note: Tensor objects lacking meta information are not allowed to exist.
    */
-  explicit DenseTensor(std::unique_ptr<TensorMeta> meta,
-                       std::unique_ptr<TensorStatus> status =
-                           std::unique_ptr<TensorStatus>(new TensorStatus()))
+  DenseTensor(const TensorMeta& meta, const TensorStatus& status)
+      : meta_(meta), status_(status) {}
+
+  DenseTensor(TensorMeta&& meta, TensorStatus&& status)
       : meta_(std::move(meta)), status_(std::move(status)) {}
 
   ~DenseTensor() override {}
 
-  int64_t numel() const override { return meta_->numel; }
+  int64_t numel() const override { return meta_.numel; }
 
-  DDim dims() const override { return meta_->dims; }
+  DDim dims() const override { return meta_.dims; }
 
-  DataType type() const override { return meta_->type; }
+  DataType type() const override { return meta_.type; }
 
-  DataLayout layout() const override { return meta_->layout; }
+  DataLayout layout() const override { return meta_.layout; }
 
   Place place() const override;
 
-  Backend backend() const override { return meta_->backend; }
+  Backend backend() const override { return meta_.backend; }
 
   bool initialized() const override { return allocation_ != nullptr; }
 
@@ -90,9 +91,9 @@ class DenseTensor : public TensorInterface {
 
   const std::shared_ptr<Allocation>& allocation() const { return allocation_; }
 
-  const TensorMeta& meta() const { return *meta_; }
+  const TensorMeta& meta() const { return meta_; }
 
-  TensorMeta* mutable_meta() { return meta_.get(); }
+  TensorMeta* mutable_meta() { return &meta_; }
 
   /* Data Access Methods */
 
@@ -125,7 +126,7 @@ class DenseTensor : public TensorInterface {
 
   // For non-API and non-member interfaces, we still follow the C++ code style?
 
-  void Resize(const DDim& dims) { meta_->dims = dims; }
+  void Resize(const DDim& dims) { meta_.dims = dims; }
 
   void ShareAllocation(const std::shared_ptr<Allocation>& allocation);
 
@@ -141,9 +142,9 @@ class DenseTensor : public TensorInterface {
   // The actual Tensor storage holder
   std::shared_ptr<Allocation> allocation_;
   // The Tensor meta data
-  std::unique_ptr<TensorMeta> meta_;
+  TensorMeta meta_;
   // The Tensor status data
-  std::unique_ptr<TensorStatus> status_;
+  TensorStatus status_;
 };
 
 }  // namespace pt
diff --git a/paddle/top/core/mkldnn_dense_tensor.h b/paddle/top/core/mkldnn_dense_tensor.h
new file mode 100644
index 0000000000000..9f5f63d771c55
--- /dev/null
+++ b/paddle/top/core/mkldnn_dense_tensor.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_MKLDNN
+
+#include "mkldnn.hpp"
+
+#include "paddle/top/core/dense_tensor.h"
+
+namespace pt {
+
+class MKLDNNDenseTensor : public DenseTensor {
+ public:
+  // Not allowed to initialize a tensor without descriptive metadata
+  MKLDNNDenseTensor() = delete;
+
+  MKLDNNDenseTensor(const MKLDNNDenseTensor&) = delete;
+  MKLDNNDenseTensor& operator=(const MKLDNNDenseTensor&) = delete;
+  MKLDNNDenseTensor(MKLDNNDenseTensor&&) = delete;
+  MKLDNNDenseTensor& operator=(MKLDNNDenseTensor&&) = delete;
+
+  MKLDNNDenseTensor(const TensorMeta& meta, const TensorStatus& status)
+      : DenseTensor(meta, status) {}
+
+  mkldnn::memory::format_tag format() const { return format_; }
+
+  void set_format(const mkldnn::memory::format_tag format) { format_ = format; }
+
+ private:
+  /**
+   * @brief the detail format of memory block which have layout as kMKLDNN
+   *
+   * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C,
+   *       nChw16c, etc. For a MKLDNN memory block, layout will be set as
+   *       DataLayout::kMKLDNN meanwhile detail memory format will be kept in
+   *       this field.
+   */
+  mkldnn::memory::format_tag format_ = mkldnn::memory::format_tag::undef;
+};
+
+}  // namespace pt
+
+#endif
diff --git a/paddle/top/core/selected_rows.h b/paddle/top/core/selected_rows.h
index 4643ed737dadb..dc5c6a42d0681 100644
--- a/paddle/top/core/selected_rows.h
+++ b/paddle/top/core/selected_rows.h
@@ -53,12 +53,12 @@ class SelectedRowsTensor : public TensorInterface {
   SelectedRowsTensor(SelectedRowsTensor&&) = delete;
   SelectedRowsTensor& operator=(SelectedRowsTensor&&) = delete;
 
-  SelectedRowsTensor(std::unique_ptr<TensorMeta> meta,
-                     std::unique_ptr<TensorStatus> status,
+  SelectedRowsTensor(const TensorMeta& meta,
+                     const TensorStatus& status,
                      const std::vector<int64_t>& rows,
                      int64_t height)
       : rows_(rows), height_(height) {
-    value_.reset(new DenseTensor(std::move(meta), std::move(status)));
+    value_.reset(new DenseTensor(meta, status));
   }
 
   const DenseTensor& value() const { return *value_; }
diff --git a/paddle/top/core/tensor_meta.h b/paddle/top/core/tensor_meta.h
index b15ef485c9e10..fbfd55b3ccdb7 100644
--- a/paddle/top/core/tensor_meta.h
+++ b/paddle/top/core/tensor_meta.h
@@ -16,10 +16,6 @@ limitations under the License. */
 
 #include <vector>
 
-#ifdef PADDLE_WITH_MKLDNN
-#include "mkldnn.hpp"
-#endif
-
 #include "paddle/top/core/backend.h"
 #include "paddle/top/core/dtype.h"
 #include "paddle/top/core/layout.h"
@@ -67,11 +63,12 @@ using LoD = std::vector<std::vector<size_t>>;
  */
 struct TensorMeta {
   TensorMeta() = delete;
-  TensorMeta(const TensorMeta&) = delete;
   TensorMeta& operator=(const TensorMeta&) = delete;
-  // TensorMeta(TensorMeta&&) = delete;
   TensorMeta& operator=(TensorMeta&&) = delete;
 
+  TensorMeta(const TensorMeta&) = default;
+  // TensorMeta(TensorMeta&&) = default;
+
   TensorMeta(TensorMeta&& meta)
       : dims(meta.dims),
         backend(meta.backend),
@@ -144,30 +141,4 @@ struct TensorMeta {
   LoD lod;
 };
 
-#ifdef PADDLE_WITH_MKLDNN
-struct MKLDNNTensorMeta : public TensorMeta {
-  MKLDNNTensorMeta(
-      const DDim& dims,
-      Backend backend,
-      DataType type,
-      DataLayout layout,
-      size_t offset = 0UL,
-      const LoD& lod = {},
-      mkldnn::memory::format_tag format = mkldnn::memory::format_tag::undef)
-      : TensorMeta(dims, backend, type, layout, offset, lod), format(format) {}
-
-  ~MKLDNNTensorMeta() override {}
-
-  /**
-   * @brief the detail format of memory block which have layout as kMKLDNN
-   *
-   * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C,
-   *       nChw16c, etc. For a MKLDNN memory block, layout will be set as
-   *       DataLayout::kMKLDNN meanwhile detail memory format will be kept in
-   *       this field.
-   */
-  mkldnn::memory::format_tag format = mkldnn::memory::format_tag::undef;
-};
-#endif
-
 }  // namespace pt
diff --git a/paddle/top/core/tensor_status.h b/paddle/top/core/tensor_status.h
index a3f6d4fef5a38..075b52c573805 100644
--- a/paddle/top/core/tensor_status.h
+++ b/paddle/top/core/tensor_status.h
@@ -47,10 +47,10 @@ class TensorInplaceVersion {
  */
 struct TensorStatus {
   TensorStatus() = default;
+  TensorStatus(const TensorStatus&) = default;
+  TensorStatus(TensorStatus&&) = default;
 
-  TensorStatus(const TensorStatus&) = delete;
   TensorStatus& operator=(const TensorStatus&) = delete;
-  TensorStatus(TensorStatus&&) = delete;
   TensorStatus& operator=(TensorStatus&&) = delete;
 
   TensorInplaceVersion inplace_version_counter{0};
diff --git a/paddle/top/cuda/math.cu b/paddle/top/cuda/math.cu
index b4d384e3d47d0..82b1d7d3d458c 100644
--- a/paddle/top/cuda/math.cu
+++ b/paddle/top/cuda/math.cu
@@ -63,11 +63,13 @@ void Mean(const CUDADeviceContext& dev_ctx,
       nullptr, temp_storage_bytes, trans_x, out_data, size_prob, stream);
   PADDLE_ENFORCE_CUDA_SUCCESS(err);
 
-  pt::DenseTensor tmp(std::unique_ptr<TensorMeta>(new TensorMeta(
-      paddle::framework::make_ddim({static_cast<int64_t>(temp_storage_bytes)}),
-      pt::TransToPtenBackend(dev_ctx.GetPlace()),
-      x.type(),
-      x.layout())));
+  pt::DenseTensor tmp(
+      TensorMeta(paddle::framework::make_ddim(
+                     {static_cast<int64_t>(temp_storage_bytes)}),
+                 pt::TransToPtenBackend(dev_ctx.GetPlace()),
+                 x.type(),
+                 x.layout()),
+      TensorStatus());
   auto* temp_storage = tmp.mutable_data<uint8_t>();
   err = cub::DeviceReduce::Sum(
       temp_storage, temp_storage_bytes, trans_x, out_data, size_prob, stream);
diff --git a/paddle/top/mkldnn/base.h b/paddle/top/mkldnn/base.h
index d7134ecf92d8b..eab8fc00bf0ab 100644
--- a/paddle/top/mkldnn/base.h
+++ b/paddle/top/mkldnn/base.h
@@ -16,6 +16,8 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_MKLDNN
 
+#include "paddle/top/core/mkldnn_dense_tensor.h"
+
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/mkldnn_reuse.h"
 
@@ -33,7 +35,7 @@ class ScaleMKLDNNHandler
                                               mkldnn::eltwise_backward> {
  public:
   ScaleMKLDNNHandler(const MKLDNNDeviceContext& dev_ctx,
-                     const pt::DenseTensor& in_x,
+                     const pt::MKLDNNDenseTensor& in_x,
                      const std::string& unique_name,
                      bool is_inplaced,
                      float alpha,
@@ -68,9 +70,7 @@ class ScaleMKLDNNHandler
 
     auto src_tz = paddle::framework::vectorize<int64_t>(in_x.dims());
     auto src_fmt =
-        src_tz.size() == 2
-            ? paddle::MKLDNNMemoryFormat::nc
-            : dynamic_cast<const MKLDNNTensorMeta&>(in_x.meta()).format;
+        src_tz.size() == 2 ? paddle::MKLDNNMemoryFormat::nc : in_x.format();
     auto md = mkldnn::memory::desc(
         src_tz, paddle::platform::MKLDNNGetDataType<T>(), src_fmt);
 
diff --git a/paddle/top/mkldnn/math.h b/paddle/top/mkldnn/math.h
index 363dbfc6c0807..d9e6ea314fa0e 100644
--- a/paddle/top/mkldnn/math.h
+++ b/paddle/top/mkldnn/math.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_MKLDNN
 
-#include "paddle/top/core/dense_tensor.h"
+#include "paddle/top/core/mkldnn_dense_tensor.h"
 #include "paddle/top/mkldnn/base.h"
 
 namespace pt {
@@ -25,11 +25,11 @@ using MKLDNNDeviceContext = paddle::platform::MKLDNNDeviceContext;
 
 template <typename T>
 void Scale(const MKLDNNDeviceContext& dev_ctx,
-           const DenseTensor& x,
+           const MKLDNNDenseTensor& x,
            float scale,
            float bias,
            bool bias_after_scale,
-           DenseTensor* out) {
+           MKLDNNDenseTensor* out) {
   bool is_inplaced = x.allocation() && x.allocation() == out->allocation();
 
   // TODO(chenweihang): add `name` into TensorMeta?
@@ -52,10 +52,8 @@ void Scale(const MKLDNNDeviceContext& dev_ctx,
   astream.wait();
 
   out->mutable_meta()->layout = DataLayout::kMKLDNN;
-  // TODO(chenweihang): we should use dynamic_cast get MKLDNNTensorMeta,
-  // Is there any better way here?
-  dynamic_cast<MKLDNNTensorMeta*>(out->mutable_meta())->format =
-      paddle::platform::GetMKLDNNFormat(*dst_memory_p);
+  // TODO(chenweihang): format is also meta info, how to deal with here?
+  out->set_format(paddle::platform::GetMKLDNNFormat(*dst_memory_p));
 }
 
 }  // namespace pt
diff --git a/paddle/top/tests/dense_tensor_test.cc b/paddle/top/tests/dense_tensor_test.cc
index e700c7c5cb815..f2b19b409f4a2 100644
--- a/paddle/top/tests/dense_tensor_test.cc
+++ b/paddle/top/tests/dense_tensor_test.cc
@@ -20,12 +20,12 @@ namespace framework = paddle::framework;
 using DDim = paddle::framework::DDim;
 
 TEST(DenseTensor, Constructor) {
-  pt::DenseTensor tensor(std::unique_ptr<pt::TensorMeta>(
-      new pt::TensorMeta(framework::make_ddim({5, 10}),
-                         pt::Backend::kCPU,
-                         pt::DataType::kFLOAT32,
-                         pt::DataLayout::kNCHW,
-                         0UL)));
+  pt::DenseTensor tensor(pt::TensorMeta(framework::make_ddim({5, 10}),
+                                        pt::Backend::kCPU,
+                                        pt::DataType::kFLOAT32,
+                                        pt::DataLayout::kNCHW,
+                                        0UL),
+                         pt::TensorStatus());
   ASSERT_EQ(tensor.dims().size(), 2);
   ASSERT_EQ(tensor.backend(), pt::Backend::kCPU);
   ASSERT_EQ(tensor.type(), pt::DataType::kFLOAT32);

From c732d575c062aebbd854d45d3cf9f26fc85b2711 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Thu, 5 Aug 2021 07:01:50 +0000
Subject: [PATCH 017/125] change XXDeviceContext to XXContext

---
 paddle/top/cpu/math.h           | 17 ++++++-----------
 paddle/top/cuda/math.cu         | 10 ++++------
 paddle/top/cuda/math.h          | 17 ++++++-----------
 paddle/top/mkldnn/base.h        |  4 ++--
 paddle/top/mkldnn/math.h        |  6 +++---
 paddle/top/npu/math.h           |  8 +++-----
 paddle/top/selected_rows/math.h |  3 ++-
 paddle/top/xpu/math.h           | 12 ++++--------
 8 files changed, 30 insertions(+), 47 deletions(-)

diff --git a/paddle/top/cpu/math.h b/paddle/top/cpu/math.h
index 5c0eb1066f4aa..8eef66edd9811 100644
--- a/paddle/top/cpu/math.h
+++ b/paddle/top/cpu/math.h
@@ -33,7 +33,7 @@ template <typename T,
           typename IndexType = Eigen::DenseIndex>
 using EigenVector = paddle::framework::EigenVector<T, MajorType, IndexType>;
 
-using CPUDeviceContext = paddle::platform::CPUDeviceContext;
+using CPUContext = paddle::platform::CPUDeviceContext;
 
 /**
  * [ How do we organize the kernel directory ]
@@ -56,16 +56,12 @@ using CPUDeviceContext = paddle::platform::CPUDeviceContext;
  */
 
 template <typename T>
-void Sign(const CPUDeviceContext& dev_ctx,
-          const DenseTensor& x,
-          DenseTensor* out) {
-  module::Sign<CPUDeviceContext, T>(dev_ctx, x, out);
+void Sign(const CPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
+  module::Sign<CPUContext, T>(dev_ctx, x, out);
 }
 
 template <typename T>
-void Mean(const CPUDeviceContext& dev_ctx,
-          const DenseTensor& x,
-          DenseTensor* out) {
+void Mean(const CPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
   out->mutable_data<T>();
   auto x_data = EigenVector<T>::Flatten(x);
   auto y_data = EigenScalar<T>::From(*out);
@@ -74,14 +70,13 @@ void Mean(const CPUDeviceContext& dev_ctx,
 }
 
 template <typename T>
-void Scale(const CPUDeviceContext& dev_ctx,
+void Scale(const CPUContext& dev_ctx,
            const DenseTensor& x,
            float scale,
            float bias,
            bool bias_after_scale,
            DenseTensor* out) {
-  module::Scale<CPUDeviceContext, T>(
-      dev_ctx, x, scale, bias, bias_after_scale, out);
+  module::Scale<CPUContext, T>(dev_ctx, x, scale, bias, bias_after_scale, out);
 }
 
 }  // namespace pt
diff --git a/paddle/top/cuda/math.cu b/paddle/top/cuda/math.cu
index 82b1d7d3d458c..162fc45cf5c56 100644
--- a/paddle/top/cuda/math.cu
+++ b/paddle/top/cuda/math.cu
@@ -46,9 +46,7 @@ struct DivideFunctor {
  */
 
 template <typename T>
-void Mean(const CUDADeviceContext& dev_ctx,
-          const DenseTensor& x,
-          DenseTensor* out) {
+void Mean(const CUDAContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
   auto size_prob = x.numel();
   const T* x_data = x.data<T>();
   T* out_data = out->mutable_data<T>();
@@ -76,13 +74,13 @@ void Mean(const CUDADeviceContext& dev_ctx,
   PADDLE_ENFORCE_CUDA_SUCCESS(err);
 }
 
-template void Mean<float>(const CUDADeviceContext& dev_ctx,
+template void Mean<float>(const CUDAContext& dev_ctx,
                           const DenseTensor& x,
                           DenseTensor* out);
-template void Mean<double>(const CUDADeviceContext& dev_ctx,
+template void Mean<double>(const CUDAContext& dev_ctx,
                            const DenseTensor& x,
                            DenseTensor* out);
-template void Mean<paddle::platform::float16>(const CUDADeviceContext& dev_ctx,
+template void Mean<paddle::platform::float16>(const CUDAContext& dev_ctx,
                                               const DenseTensor& x,
                                               DenseTensor* out);
 
diff --git a/paddle/top/cuda/math.h b/paddle/top/cuda/math.h
index dd9062fc10347..7e5f72521be39 100644
--- a/paddle/top/cuda/math.h
+++ b/paddle/top/cuda/math.h
@@ -25,13 +25,11 @@ limitations under the License. */
 
 namespace pt {
 
-using CUDADeviceContext = paddle::platform::CUDADeviceContext;
+using CUDAContext = paddle::platform::CUDADeviceContext;
 
 template <typename T>
-void Sign(const CUDADeviceContext& dev_ctx,
-          const DenseTensor& x,
-          DenseTensor* out) {
-  module::Sign<CUDADeviceContext, T>(dev_ctx, x, out);
+void Sign(const CUDAContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
+  module::Sign<CUDAContext, T>(dev_ctx, x, out);
 }
 
 // TODO(chenweihang): Perhaps the Kernel call should not be implemented by
@@ -40,19 +38,16 @@ void Sign(const CUDADeviceContext& dev_ctx,
 // include header files, there will be many more function declarations and
 // redundant function call
 template <typename T>
-void Mean(const CUDADeviceContext& dev_ctx,
-          const DenseTensor& x,
-          DenseTensor* out);
+void Mean(const CUDAContext& dev_ctx, const DenseTensor& x, DenseTensor* out);
 
 template <typename T>
-void Scale(const CUDADeviceContext& dev_ctx,
+void Scale(const CUDAContext& dev_ctx,
            const DenseTensor& x,
            float scale,
            float bias,
            bool bias_after_scale,
            DenseTensor* out) {
-  module::Scale<CUDADeviceContext, T>(
-      dev_ctx, x, scale, bias, bias_after_scale, out);
+  module::Scale<CUDAContext, T>(dev_ctx, x, scale, bias, bias_after_scale, out);
 }
 
 }  // namespace pt
diff --git a/paddle/top/mkldnn/base.h b/paddle/top/mkldnn/base.h
index eab8fc00bf0ab..2e280dd39aa52 100644
--- a/paddle/top/mkldnn/base.h
+++ b/paddle/top/mkldnn/base.h
@@ -23,7 +23,7 @@ limitations under the License. */
 
 namespace pt {
 
-using MKLDNNDeviceContext = paddle::platform::MKLDNNDeviceContext;
+using MKLDNNDContext = paddle::platform::MKLDNNDeviceContext;
 
 // TODO(chenweihang): the handlers in `mkldnn_reuse.h` are coupled to
 // `ExecutionContext`, refactoring that may be a big project!
@@ -34,7 +34,7 @@ class ScaleMKLDNNHandler
                                               mkldnn::eltwise_forward,
                                               mkldnn::eltwise_backward> {
  public:
-  ScaleMKLDNNHandler(const MKLDNNDeviceContext& dev_ctx,
+  ScaleMKLDNNHandler(const MKLDNNDContext& dev_ctx,
                      const pt::MKLDNNDenseTensor& in_x,
                      const std::string& unique_name,
                      bool is_inplaced,
diff --git a/paddle/top/mkldnn/math.h b/paddle/top/mkldnn/math.h
index d9e6ea314fa0e..a4e8681405e4a 100644
--- a/paddle/top/mkldnn/math.h
+++ b/paddle/top/mkldnn/math.h
@@ -21,10 +21,10 @@ limitations under the License. */
 
 namespace pt {
 
-using MKLDNNDeviceContext = paddle::platform::MKLDNNDeviceContext;
+using MKLDNNDContext = paddle::platform::MKLDNNDeviceContext;
 
 template <typename T>
-void Scale(const MKLDNNDeviceContext& dev_ctx,
+void Scale(const MKLDNNDContext& dev_ctx,
            const MKLDNNDenseTensor& x,
            float scale,
            float bias,
@@ -45,7 +45,7 @@ void Scale(const MKLDNNDeviceContext& dev_ctx,
   auto dst_memory_p = handler.AcquireDstMemory(out);
   auto activation_p = handler.AcquireForwardPrimitive();
 
-  auto& astream = MKLDNNDeviceContext::tls().get_stream();
+  auto& astream = MKLDNNDContext::tls().get_stream();
   activation_p->execute(
       astream,
       {{MKLDNN_ARG_FROM, *src_memory_p}, {MKLDNN_ARG_TO, *dst_memory_p}});
diff --git a/paddle/top/npu/math.h b/paddle/top/npu/math.h
index 249856a85338f..269c7b54cbc9d 100644
--- a/paddle/top/npu/math.h
+++ b/paddle/top/npu/math.h
@@ -24,12 +24,10 @@ limitations under the License. */
 
 namespace pt {
 
-using NPUDeviceContext = paddle::platform::NPUDeviceContext;
+using NPUContext = paddle::platform::NPUDeviceContext;
 
 template <typename T>
-void Mean(const NPUDeviceContext& dev_ctx,
-          const DenseTensor& x,
-          DenseTensor* out) {
+void Mean(const NPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
   std::vector<int> axes;
   paddle::framework::NPUAttributeMap attr_input = {{"keep_dims", false},
                                                    {"axes", axes}};
@@ -41,7 +39,7 @@ void Mean(const NPUDeviceContext& dev_ctx,
 }
 
 template <typename T>
-void Scale(const NPUDeviceContext& dev_ctx,
+void Scale(const NPUContext& dev_ctx,
            const DenseTensor& x,
            float scale,
            float bias,
diff --git a/paddle/top/selected_rows/math.h b/paddle/top/selected_rows/math.h
index a6fa5a1101949..84e8f15860ed8 100644
--- a/paddle/top/selected_rows/math.h
+++ b/paddle/top/selected_rows/math.h
@@ -29,8 +29,9 @@ limitations under the License. */
 
 namespace pt {
 
+// TODO(chenweihang): also support CUDA, XPU, NPU, ...
 template <typename T>
-void Scale(const CPUDeviceContext& dev_ctx,
+void Scale(const CPUContext& dev_ctx,
            const SelectedRowsTensor& x,
            float scale,
            float bias,
diff --git a/paddle/top/xpu/math.h b/paddle/top/xpu/math.h
index b81a3632301c7..3f5330c6d2a4e 100644
--- a/paddle/top/xpu/math.h
+++ b/paddle/top/xpu/math.h
@@ -24,12 +24,10 @@ limitations under the License. */
 
 namespace pt {
 
-using XPUDeviceContext = paddle::platform::XPUDeviceContext;
+using XPUContext = paddle::platform::XPUDeviceContext;
 
 template <typename T>
-void Sign(const XPUDeviceContext& dev_ctx,
-          const DenseTensor& x,
-          DenseTensor* out) {
+void Sign(const XPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
   T* out_data = out->mutable_data<T>();
   auto xpu_ctx = dev_ctx.x_context();
   int r = xpu::activation_forward(
@@ -40,9 +38,7 @@ void Sign(const XPUDeviceContext& dev_ctx,
 }
 
 template <typename T>
-void Mean(const XPUDeviceContext& dev_ctx,
-          const DenseTensor& x,
-          DenseTensor* out) {
+void Mean(const XPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
   T* out_data = out->mutable_data<T>();
   auto xpu_ctx = dev_ctx.x_context();
   const T* x_data = x.data<T>();
@@ -55,7 +51,7 @@ void Mean(const XPUDeviceContext& dev_ctx,
 }
 
 template <typename T>
-void Scale(const XPUDeviceContext& dev_ctx,
+void Scale(const XPUContext& dev_ctx,
            const DenseTensor& x,
            float scale,
            float bias,

From 374345f689e38c5bda6beb439801f3ad043fef85 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Mon, 16 Aug 2021 12:11:26 +0000
Subject: [PATCH 018/125] add base kernel registrar utils & test on sign

---
 paddle/fluid/framework/top_utils.cc           |  16 +-
 paddle/fluid/framework/top_utils.h            |   2 +
 paddle/fluid/operators/sign_op.h              |  14 +-
 paddle/fluid/operators/sign_op_xpu.cc         |  44 ---
 paddle/top/api/CMakeLists.txt                 |  10 +-
 paddle/top/api/dev/core.h                     |   4 +
 paddle/top/core/CMakeLists.txt                |   1 +
 paddle/top/core/convert_utils.cc              |   6 +-
 paddle/top/core/convert_utils.h               |   6 +-
 paddle/top/core/dtype.h                       |  47 +++
 .../{kernel_fn_utils.h => kernel_context.cc}  |   4 +-
 paddle/top/core/kernel_context.h              |  77 +++++
 paddle/top/core/kernel_def.h                  |  22 ++
 paddle/top/core/kernel_factory.cc             |  19 +-
 paddle/top/core/kernel_factory.h              | 128 ++++++--
 paddle/top/core/kernel_registry.h             | 282 ++++++++++++++++++
 paddle/top/core/kernel_utils.h                | 148 +++++++++
 paddle/top/cpu/CMakeLists.txt                 |   1 +
 paddle/top/cpu/math.cc                        |  33 ++
 paddle/top/cpu/math.h                         |   3 +
 paddle/top/cuda/CMakeLists.txt                |   2 +-
 paddle/top/cuda/math.cu                       |   7 +-
 paddle/top/tests/kernel_factory_test.cc       |   2 +-
 paddle/top/xpu/CMakeLists.txt                 |   1 +
 paddle/top/xpu/math.cc                        |  19 ++
 25 files changed, 808 insertions(+), 90 deletions(-)
 delete mode 100644 paddle/fluid/operators/sign_op_xpu.cc
 rename paddle/top/core/{kernel_fn_utils.h => kernel_context.cc} (88%)
 create mode 100644 paddle/top/core/kernel_def.h
 create mode 100644 paddle/top/core/kernel_utils.h
 create mode 100644 paddle/top/cpu/math.cc
 create mode 100644 paddle/top/xpu/math.cc

diff --git a/paddle/fluid/framework/top_utils.cc b/paddle/fluid/framework/top_utils.cc
index ac690a0ebc46b..ec3ee3456b4e3 100644
--- a/paddle/fluid/framework/top_utils.cc
+++ b/paddle/fluid/framework/top_utils.cc
@@ -14,9 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/top_utils.h"
 
-#include "paddle/top/core/convert_utils.h"
-#include "paddle/top/core/dense_tensor.h"
-#include "paddle/top/core/mkldnn_dense_tensor.h"
+#include "paddle/top/api/include/tensor.h"
 
 namespace paddle {
 namespace framework {
@@ -29,9 +27,9 @@ std::shared_ptr<pt::DenseTensor> MakeTensorImpl<pt::DenseTensor>(
     proto::VarType::Type type) {
   auto holder = tensor.Holder();
   auto tensor_impl = std::make_shared<pt::DenseTensor>(
-      pt::TensorMeta(tensor.dims(), pt::TransToPtenBackend(place),
-                     pt::TransToPtenDataType(type),
-                     pt::TransToPtenLayout(tensor.layout()), tensor.offset()),
+      pt::TensorMeta(tensor.dims(), pt::TransToPtBackend(place),
+                     pt::TransToPtDataType(type),
+                     pt::TransToPtLayout(tensor.layout()), tensor.offset()),
       pt::TensorStatus());
 
   if (holder != nullptr) {
@@ -58,9 +56,9 @@ std::shared_ptr<pt::MKLDNNDenseTensor> MakeTensorImpl<pt::MKLDNNDenseTensor>(
     proto::VarType::Type type) {
   auto holder = tensor.Holder();
   auto tensor_impl = std::make_shared<pt::MKLDNNDenseTensor>(
-      pt::TensorMeta(tensor.dims(), pt::TransToPtenBackend(place),
-                     pt::TransToPtenDataType(type),
-                     pt::TransToPtenLayout(tensor.layout()), tensor.offset()),
+      pt::TensorMeta(tensor.dims(), pt::TransToPtBackend(place),
+                     pt::TransToPtDataType(type),
+                     pt::TransToPtLayout(tensor.layout()), tensor.offset()),
       pt::TensorStatus());
 
   if (holder != nullptr) {
diff --git a/paddle/fluid/framework/top_utils.h b/paddle/fluid/framework/top_utils.h
index adc188fa1fa0f..fb40ad606288e 100644
--- a/paddle/fluid/framework/top_utils.h
+++ b/paddle/fluid/framework/top_utils.h
@@ -17,6 +17,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/place.h"
 
+#include "paddle/top/api/dev/core.h"
+
 namespace paddle {
 namespace framework {
 
diff --git a/paddle/fluid/operators/sign_op.h b/paddle/fluid/operators/sign_op.h
index 3a19572d6bc12..42e4a45b450db 100644
--- a/paddle/fluid/operators/sign_op.h
+++ b/paddle/fluid/operators/sign_op.h
@@ -33,13 +33,23 @@ class SignKernel : public framework::OpKernel<T> {
     auto* out = context.Output<framework::Tensor>("Out");
     auto& dev_ctx = context.device_context<DeviceContext>();
 
+    // debug: print all registered sign kernels for check
+    VLOG(1) << pt::OpKernelFactory::Instance();
+
+    // TODO(chenweihang): only to test correctness, this will introduce
+    // needless context prepare cost
+    pt::OpKernelContext op_kernel_ctx(dev_ctx);
     auto pt_x =
         framework::MakeTensorImpl<pt::DenseTensor>(*x, x->place(), x->type());
     auto pt_out =
         framework::MakeTensorImpl<pt::DenseTensor>(*out, x->place(), x->type());
+    op_kernel_ctx.EmplaceBackInput(pt_x);
+    op_kernel_ctx.EmplaceBackOutput(pt_out);
 
-    // call new kernel
-    pt::Sign<T>(dev_ctx, *pt_x.get(), pt_out.get());
+    auto& op_kernel = pt::OpKernelFactory::Instance().SelectKernel(
+        "sign", pt::TransToPtBackend(x->place()),
+        pt::TransToPtLayout(x->layout()), pt::TransToPtDataType(x->type()));
+    op_kernel(&op_kernel_ctx);
 
     // share pt_out data to out
     framework::ShareTensorImpl(pt_out.get(), out);
diff --git a/paddle/fluid/operators/sign_op_xpu.cc b/paddle/fluid/operators/sign_op_xpu.cc
deleted file mode 100644
index a164a9b056677..0000000000000
--- a/paddle/fluid/operators/sign_op_xpu.cc
+++ /dev/null
@@ -1,44 +0,0 @@
-/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_WITH_XPU
-
-#include "paddle/fluid/operators/sign_op.h"
-#include "paddle/fluid/platform/xpu/xpu_header.h"
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class SignXPUKernel : public framework::OpKernel<T> {
- public:
-  virtual void Compute(const framework::ExecutionContext& context) const {
-    auto* out = context.Output<framework::Tensor>("Out");
-    auto* in = context.Input<framework::Tensor>("X");
-    out->mutable_data<T>(in->place());
-    auto xpu_context = context.device_context<DeviceContext>().x_context();
-    int r = xpu::activation_forward(xpu_context, xpu::Activation_t::SIGN,
-                                    in->numel(), in->data<T>(), out->data<T>());
-    PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
-                      platform::errors::Fatal("XPU sign kernel error!"));
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_XPU_KERNEL(
-    sign, ops::SignXPUKernel<paddle::platform::XPUDeviceContext, float>);
-
-#endif
diff --git a/paddle/top/api/CMakeLists.txt b/paddle/top/api/CMakeLists.txt
index 98dc769f1786b..9f8c214a04e5c 100644
--- a/paddle/top/api/CMakeLists.txt
+++ b/paddle/top/api/CMakeLists.txt
@@ -1,8 +1,12 @@
 add_subdirectory(src)
 
-set(PTEN_DEPS convert_utils dense_tensor selected_rows_tensor)
+set(TOP_DEPS convert_utils dense_tensor selected_rows_tensor kernel_factory kernel_context)
+set(TOP_DEPS ${TOP_DEPS} math_cpu)
 if(WITH_GPU)
-  set(PTEN_DEPS ${PTEN_DEPS} math_cuda)
+  set(TOP_DEPS ${TOP_DEPS} math_cuda)
+endif()
+if(WITH_XPU)
+  set(TOP_DEPS ${TOP_DEPS} math_xpu)
 endif()
 
-cc_library(top SRCS all.cc DEPS ${PTEN_DEPS})
+cc_library(top SRCS all.cc DEPS ${TOP_DEPS})
diff --git a/paddle/top/api/dev/core.h b/paddle/top/api/dev/core.h
index d7cd929e44551..4f1a01646d3fd 100644
--- a/paddle/top/api/dev/core.h
+++ b/paddle/top/api/dev/core.h
@@ -14,4 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/top/core/convert_utils.h"
 #include "paddle/top/core/dense_tensor.h"
+#include "paddle/top/core/kernel_context.h"
+#include "paddle/top/core/kernel_factory.h"
+#include "paddle/top/core/mkldnn_dense_tensor.h"
diff --git a/paddle/top/core/CMakeLists.txt b/paddle/top/core/CMakeLists.txt
index bf143349e382b..74399ff623831 100644
--- a/paddle/top/core/CMakeLists.txt
+++ b/paddle/top/core/CMakeLists.txt
@@ -13,3 +13,4 @@ cc_library(dense_tensor SRCS dense_tensor.cc DEPS enforce data_type ddim allocat
 cc_library(selected_rows_tensor SRCS selected_rows.cc DEPS dense_tensor)
 
 cc_library(kernel_factory SRCS kernel_factory.cc DEPS enforce backend dtype layout)
+cc_library(kernel_context SRCS kernel_context.cc DEPS boost device_context)
diff --git a/paddle/top/core/convert_utils.cc b/paddle/top/core/convert_utils.cc
index fce27f325dc4b..ab122b60d813a 100644
--- a/paddle/top/core/convert_utils.cc
+++ b/paddle/top/core/convert_utils.cc
@@ -17,7 +17,7 @@ limitations under the License. */
 namespace pt {
 
 // TODO(chenweihang): Add other place branchs
-Backend TransToPtenBackend(const paddle::platform::Place& place) {
+Backend TransToPtBackend(const paddle::platform::Place& place) {
   if (paddle::platform::is_cpu_place(place)) {
     return Backend::kCPU;
   } else if (paddle::platform::is_gpu_place(place)) {
@@ -35,7 +35,7 @@ Backend TransToPtenBackend(const paddle::platform::Place& place) {
   }
 }
 
-pt::DataType TransToPtenDataType(
+pt::DataType TransToPtDataType(
     const paddle::framework::proto::VarType::Type& dtype) {
   // Set the order of case branches according to the frequency with
   // the data type is used
@@ -67,7 +67,7 @@ pt::DataType TransToPtenDataType(
   }
 }
 
-DataLayout TransToPtenLayout(const paddle::framework::DataLayout& layout) {
+DataLayout TransToPtLayout(const paddle::framework::DataLayout& layout) {
   switch (layout) {
     case paddle::framework::DataLayout::kNHWC:
       return DataLayout::kNHWC;
diff --git a/paddle/top/core/convert_utils.h b/paddle/top/core/convert_utils.h
index 862784a783bd1..664f3f9a716e9 100644
--- a/paddle/top/core/convert_utils.h
+++ b/paddle/top/core/convert_utils.h
@@ -29,10 +29,10 @@ namespace pt {
 
 // TODO(chenweihang): Use the original var type as much as possible
 // to avoid transform, such as DataLayout, VarType
-Backend TransToPtenBackend(const paddle::platform::Place& place);
-DataType TransToPtenDataType(
+Backend TransToPtBackend(const paddle::platform::Place& place);
+DataType TransToPtDataType(
     const paddle::framework::proto::VarType::Type& dtype);
-DataLayout TransToPtenLayout(const paddle::framework::DataLayout& layout);
+DataLayout TransToPtLayout(const paddle::framework::DataLayout& layout);
 paddle::framework::proto::VarType::Type TransToProtoVarType(
     const DataType& dtype);
 
diff --git a/paddle/top/core/dtype.h b/paddle/top/core/dtype.h
index 89d0619d64984..77dece46e4e02 100644
--- a/paddle/top/core/dtype.h
+++ b/paddle/top/core/dtype.h
@@ -16,8 +16,16 @@ limitations under the License. */
 
 #include <ostream>
 
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/complex.h"
+#include "paddle/fluid/platform/float16.h"
+
 namespace pt {
 
+using complex64 = paddle::platform::complex<float>;
+using complex128 = paddle::platform::complex<double>;
+using float16 = paddle::platform::float16;
+
 /**
  * [ Why need new data type? ]
  *
@@ -49,4 +57,43 @@ enum class DataType {
 
 std::ostream& operator<<(std::ostream& os, DataType dtype);
 
+#define PT_FOR_EACH_DATA_TYPE(_)     \
+  _(bool, DataType::kBOOL)           \
+  _(int8_t, DataType::kINT8)         \
+  _(uint8_t, DataType::kUINT8)       \
+  _(int16_t, DataType::kINT16)       \
+  _(int, DataType::kINT32)           \
+  _(int64_t, DataType::kINT64)       \
+  _(float16, DataType::kFLOAT16)     \
+  _(float, DataType::kFLOAT32)       \
+  _(double, DataType::kFLOAT64)      \
+  _(complex64, DataType::kCOMPLEX64) \
+  _(complex128, DataType::kCOMPLEX128)
+
+template <pt::DataType T>
+struct DataTypeToCppType;
+
+template <typename T>
+struct CppTypeToDataType;
+
+#define PT_SPECIALIZE_DataTypeToCppType(cpp_type, data_type) \
+  template <>                                                \
+  struct DataTypeToCppType<data_type> {                      \
+    using type = cpp_type;                                   \
+  };
+
+PT_FOR_EACH_DATA_TYPE(PT_SPECIALIZE_DataTypeToCppType)
+
+#undef PT_SPECIALIZE_DataTypeToCppType
+
+#define PT_SPECIALIZE_CppTypeToDataType(cpp_type, data_type) \
+  template <>                                                \
+  struct CppTypeToDataType<cpp_type> {                       \
+    DataType type = data_type;                               \
+  };
+
+PT_FOR_EACH_DATA_TYPE(PT_SPECIALIZE_CppTypeToDataType)
+
+#undef PT_SPECIALIZE_CppTypeToDataType
+
 }  // namespace pt
diff --git a/paddle/top/core/kernel_fn_utils.h b/paddle/top/core/kernel_context.cc
similarity index 88%
rename from paddle/top/core/kernel_fn_utils.h
rename to paddle/top/core/kernel_context.cc
index 6672a72aab304..fafacb72f27ab 100644
--- a/paddle/top/core/kernel_fn_utils.h
+++ b/paddle/top/core/kernel_context.cc
@@ -12,4 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#pragma once
+#include "paddle/top/core/kernel_context.h"
+
+namespace pt {}  // namespace pt
diff --git a/paddle/top/core/kernel_context.h b/paddle/top/core/kernel_context.h
index 6672a72aab304..7cf85f5c805cd 100644
--- a/paddle/top/core/kernel_context.h
+++ b/paddle/top/core/kernel_context.h
@@ -13,3 +13,80 @@
 // limitations under the License.
 
 #pragma once
+
+#include <utility>
+
+#include <boost/any.hpp>
+
+#include "paddle/top/core/tensor_interface.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/device_context.h"
+
+namespace pt {
+
+using DeviceContext = paddle::platform::DeviceContext;
+
+/**
+ * Note: OpKernelContext doesn't manage the life if DeviceContext and Tensor
+ *
+ * Note: OpKernelContext does not couple the concept of framework,
+ *       its constructor can only take the members it needs as parameters,
+ *       not Scope, RuntimeContext, etc. as parameters
+ */
+class OpKernelContext {
+ public:
+  explicit OpKernelContext(const DeviceContext& dev_ctx) : dev_ctx_(dev_ctx) {}
+  OpKernelContext(const DeviceContext& dev_ctx,
+                  const std::vector<std::shared_ptr<TensorInterface>>& inputs,
+                  const std::vector<std::shared_ptr<TensorInterface>>& outputs,
+                  const std::vector<boost::any>& attrs)
+      : dev_ctx_(dev_ctx), inputs_(inputs), outputs_(outputs), attrs_(attrs) {}
+
+  template <typename CtxType>
+  const CtxType& GetDeviceContext() const {
+    return static_cast<const CtxType&>(dev_ctx_);
+  }
+
+  void EmplaceBackInput(std::shared_ptr<TensorInterface> input) {
+    inputs_.emplace_back(input);
+  }
+
+  void EmplaceBackOutput(std::shared_ptr<TensorInterface> output) {
+    outputs_.emplace_back(output);
+  }
+
+  template <typename TensorType>
+  const TensorType& InputAt(size_t idx) const {
+    return static_cast<const TensorType&>(*(inputs_.at(idx)));
+  }
+
+  template <typename TensorType>
+  TensorType* MutableOutputAt(size_t idx) {
+    return static_cast<TensorType*>(outputs_.at(idx).get());
+  }
+
+ private:
+  // DeviceContext base class
+  const DeviceContext& dev_ctx_;
+
+  // TODO(chenweihang): replaced by small_vector
+  // TODO(chenweihang): Tensor -> Tensor*, Tensor should by managed `scope`
+  // Note: can't use API Tensor here, the inference don't use this API Tensor
+  std::vector<std::shared_ptr<TensorInterface>> inputs_{};
+  std::vector<std::shared_ptr<TensorInterface>> outputs_{};
+  // TODO(chenweihang): replaced by paddle::any
+  std::vector<boost::any> attrs_{};
+
+  // Only contains input like list[Tensor] need `range`
+  // TODO(chenweihang): replaced by small_vector
+  std::vector<std::pair<int, int>> input_range_{{}};
+  std::vector<std::pair<int, int>> output_range_{{}};
+
+  // Only static graph need `name`
+  // TODO(chenweihang): replaced by paddle::string_view
+  std::vector<std::string> input_names_{{}};
+  std::vector<std::string> output_names_{{}};
+};
+
+}  // namespace pt
diff --git a/paddle/top/core/kernel_def.h b/paddle/top/core/kernel_def.h
new file mode 100644
index 0000000000000..206afa8a9ed95
--- /dev/null
+++ b/paddle/top/core/kernel_def.h
@@ -0,0 +1,22 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+namespace pt {
+
+class OpKernelContext;
+using OpKernelFn = void (*)(OpKernelContext* ctx);
+
+}  // namespace pt
diff --git a/paddle/top/core/kernel_factory.cc b/paddle/top/core/kernel_factory.cc
index bb860b1183242..5f3b45a75f51b 100644
--- a/paddle/top/core/kernel_factory.cc
+++ b/paddle/top/core/kernel_factory.cc
@@ -24,7 +24,7 @@ OpKernelFactory& OpKernelFactory::Instance() {
   return g_op_kernel_factory;
 }
 
-const OpKernelFn& OpKernelFactory::FindOpKernel(
+const OpKernel& OpKernelFactory::SelectKernel(
     const OperationName& op_name, const OpKernelKey& kernel_key) const {
   auto iter = kernels_.find(op_name);
   PADDLE_ENFORCE_NE(iter,
@@ -44,4 +44,21 @@ const OpKernelFn& OpKernelFactory::FindOpKernel(
   return kernel_iter->second;
 }
 
+const OpKernel& OpKernelFactory::SelectKernel(const OperationName& op_name,
+                                              Backend backend,
+                                              DataLayout layout,
+                                              DataType dtype) const {
+  return SelectKernel(op_name, OpKernelKey(backend, layout, dtype));
+}
+
+std::ostream& operator<<(std::ostream& os, OpKernelFactory& kernel_factory) {
+  for (const auto& op_kernel_pair : kernel_factory.kernels()) {
+    os << "- op: " << op_kernel_pair.first << "\n";
+    for (const auto& kernel_pair : op_kernel_pair.second) {
+      os << "\t- kernel: " << kernel_pair.first << "\n";
+    }
+  }
+  return os;
+}
+
 }  // namespace pt
diff --git a/paddle/top/core/kernel_factory.h b/paddle/top/core/kernel_factory.h
index f2f3f4dcf781f..22743b0c0939c 100644
--- a/paddle/top/core/kernel_factory.h
+++ b/paddle/top/core/kernel_factory.h
@@ -21,8 +21,12 @@
 
 #include "paddle/top/core/backend.h"
 #include "paddle/top/core/dtype.h"
+#include "paddle/top/core/kernel_def.h"
 #include "paddle/top/core/layout.h"
 
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/enforce.h"
+
 namespace pt {
 
 class OpKernelContext;
@@ -30,6 +34,7 @@ class OpKernelContext;
 using OpKernelFn = void (*)(OpKernelContext* ctx);
 
 struct OperationName final {
+  // TODO(chenweihang): use string_view later?
   std::string op_type;
   std::string overload_type;
   // Avoid calculating Hash value at runtime
@@ -41,6 +46,24 @@ struct OperationName final {
                  (std::hash<std::string>()(overload_type) << 1);
   }
 
+  OperationName(const char* op_name) {
+    std::string op_name_str(op_name);
+    size_t pos = op_name_str.find_first_of('.');
+    if (pos == std::string::npos) {
+      op_type = op_name_str;
+      overload_type = "";
+    } else {
+      op_type = op_name_str.substr(0, pos);
+      PADDLE_ENFORCE_EQ(op_name_str.find('.', pos + 1),
+                        std::string::npos,
+                        paddle::platform::errors::InvalidArgument(
+                            "OperationName only can contains one '.'."));
+      overload_type = op_name_str.substr(pos + 1, op_name_str.size());
+    }
+    hash_value = std::hash<std::string>()(op_type) ^
+                 (std::hash<std::string>()(overload_type) << 1);
+  }
+
   struct Hash {
     size_t operator()(const OperationName& op_name) const {
       return op_name.hash_value;
@@ -62,21 +85,21 @@ struct OperationName final {
 
 class OpKernelKey {
  public:
-  OpKernelKey(Backend backend, DataType dtype, DataLayout layout)
-      : backend_(backend), dtype_(dtype), layout_(layout) {
-    // |----31-20------|---19-16----|---15-8---|---7-0---|
-    // | For extension | DataLayout | DataType | Backend |
+  OpKernelKey(Backend backend, DataLayout layout, DataType dtype)
+      : backend_(backend), layout_(layout), dtype_(dtype) {
+    // |----31-20------|---19-12---|---11-8----|---7-0---|
+    // | For extension | DataType | DataLayout | Backend |
 
     hash_value_ = 0;
     hash_value_ |= static_cast<uint8_t>(backend_);
-    hash_value_ |= (static_cast<uint16_t>(dtype_) << kBackendBitLength);
-    hash_value_ |= (static_cast<uint32_t>(layout_)
+    hash_value_ |= (static_cast<uint8_t>(layout_) << kBackendBitLength);
+    hash_value_ |= (static_cast<uint16_t>(dtype_)
                     << (kBackendBitLength + kDataTypeBitLength));
   }
 
   Backend backend() const { return backend_; }
-  DataType dtype() const { return dtype_; }
   DataLayout layout() const { return layout_; }
+  DataType dtype() const { return dtype_; }
 
   uint32_t hash_value() const { return hash_value_; }
 
@@ -101,12 +124,12 @@ class OpKernelKey {
  private:
   // In total should be smaller than 32.
   constexpr static int kBackendBitLength = 8;
-  constexpr static int kDataTypeBitLength = 8;
   constexpr static int kDataLayoutBitLength = 4;
+  constexpr static int kDataTypeBitLength = 8;
 
   Backend backend_;
-  DataType dtype_;
   DataLayout layout_;
+  DataType dtype_;
 
   // Avoid calculating Hash value at runtime.
   // Note: Now the number of bits we need does not exceed 32 bits, so there is
@@ -115,37 +138,100 @@ class OpKernelKey {
   uint32_t hash_value_;
 };
 
-class OpKernelFactory {
+struct ParamDef {
+  Backend backend;
+  DataLayout layout;
+  DataType dtype;
+
+  ParamDef(Backend backend, DataLayout layout, DataType dtype)
+      : backend(backend), layout(layout), dtype(dtype) {}
+};
+
+class OpKernelParamDef {
  public:
-  static OpKernelFactory& Instance();
+  OpKernelParamDef() = default;
+
+  void AppendInput(Backend backend, DataLayout layout, DataType dtype) {
+    input_defs_.emplace_back(ParamDef(backend, layout, dtype));
+  }
+
+  void AppendOutput(Backend backend, DataLayout layout, DataType dtype) {
+    output_defs_.emplace_back(ParamDef(backend, layout, dtype));
+  }
 
-  const OpKernelFn& FindOpKernel(const OperationName& op_name,
-                                 const OpKernelKey& kernel_key) const;
+  void SetSameAsKernelKey() { same_as_kernel_key_ = true; }
 
  private:
-  OpKernelFactory();
+  // TODO(chenweihang): replaced by paddle::small_vector
+  std::vector<ParamDef> input_defs_{{}};
+  std::vector<ParamDef> output_defs_{{}};
+  // if the same_as_kernel_key_ is true, all this kernel's input and output
+  // hold def that same as kernel key, the input_defs_ and output_defs_ are
+  // empty
+  bool same_as_kernel_key_{false};
+};
+
+class OpKernel {
+ public:
+  // for map element contruct
+  OpKernel() = default;
+
+  explicit OpKernel(OpKernelFn fn) : fn_(fn) {}
+
+  void operator()(OpKernelContext* ctx) const { fn_(ctx); }
+
+  OpKernelParamDef& param_def() { return param_def_; }
 
+ private:
+  OpKernelFn fn_{nullptr};
+  OpKernelParamDef param_def_;
+};
+
+class OpKernelFactory {
+ public:
   // replaced by paddle::flat_hash_map later
-  std::unordered_map<
+  using OpKernelMap = std::unordered_map<
       OperationName,
-      std::unordered_map<OpKernelKey, OpKernelFn, OpKernelKey::Hash>,
-      OperationName::Hash>
-      kernels_;
+      std::unordered_map<OpKernelKey, OpKernel, OpKernelKey::Hash>,
+      OperationName::Hash>;
+
+  static OpKernelFactory& Instance();
+
+  OpKernelMap& kernels() { return kernels_; }
+
+  const OpKernel& SelectKernel(const OperationName& op_name,
+                               const OpKernelKey& kernel_key) const;
+
+  const OpKernel& SelectKernel(const OperationName& op_name,
+                               Backend backend,
+                               DataLayout layout,
+                               DataType dtype) const;
+
+ private:
+  OpKernelFactory() = default;
+
+  OpKernelMap kernels_;
 };
 
 /** operator << overload **/
 
 inline std::ostream& operator<<(std::ostream& os,
                                 const OperationName& op_name) {
-  os << op_name.op_type << "." << op_name.overload_type;
+  if (op_name.overload_type.empty()) {
+    os << op_name.op_type;
+  } else {
+    os << op_name.op_type << "." << op_name.overload_type;
+  }
   return os;
 }
 
 inline std::ostream& operator<<(std::ostream& os,
                                 const OpKernelKey& kernel_key) {
-  os << "(" << kernel_key.backend() << ", " << kernel_key.dtype() << ", "
-     << kernel_key.layout() << ")";
+  os << "(" << kernel_key.backend() << ", " << kernel_key.layout() << ", "
+     << kernel_key.dtype() << ")";
   return os;
 }
 
+std::ostream& operator<<(std::ostream& os, OpKernelFactory& kernel_factory);
+
 }  // namespace pt
diff --git a/paddle/top/core/kernel_registry.h b/paddle/top/core/kernel_registry.h
index 6672a72aab304..421a203dc051c 100644
--- a/paddle/top/core/kernel_registry.h
+++ b/paddle/top/core/kernel_registry.h
@@ -13,3 +13,285 @@
 // limitations under the License.
 
 #pragma once
+
+#include "paddle/top/core/kernel_def.h"
+#include "paddle/top/core/kernel_factory.h"
+#include "paddle/top/core/kernel_utils.h"
+
+namespace pt {
+
+#define BACKEND(arg__) pt::Backend::k##arg__
+#define DATALAYOUT(arg__) pt::DataLayout::k##arg__
+#define DATATYPE(arg__) pt::DataType::k##arg__
+
+class OpKernelRegistrar {
+ public:
+  OpKernelRegistrar(const char* op_name,
+                    Backend backend,
+                    DataLayout layout,
+                    DataType dtype,
+                    OpKernelFn fn)
+      : op_name_(op_name), op_kernel_key_(backend, layout, dtype) {
+    OpKernel kernel(fn);
+    OpKernelFactory::Instance().kernels()[op_name_][op_kernel_key_] = kernel;
+  }
+
+  OpKernelRegistrar& Input(Backend backend, DataLayout layout, DataType dtype) {
+    OpKernelFactory::Instance()
+        .kernels()[op_name_][op_kernel_key_]
+        .param_def()
+        .AppendInput(backend, layout, dtype);
+    return *this;
+  }
+
+  OpKernelRegistrar& Output(Backend backend,
+                            DataLayout layout,
+                            DataType dtype) {
+    OpKernelFactory::Instance()
+        .kernels()[op_name_][op_kernel_key_]
+        .param_def()
+        .AppendOutput(backend, layout, dtype);
+    return *this;
+  }
+
+  OpKernelRegistrar& SetSameAsKernelKey() {
+    OpKernelFactory::Instance()
+        .kernels()[op_name_][op_kernel_key_]
+        .param_def()
+        .SetSameAsKernelKey();
+    return *this;
+  }
+
+  void Touch() {}
+
+ private:
+  OperationName op_name_;
+  OpKernelKey op_kernel_key_;
+};
+
+#define PT_STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg)                     \
+  struct __test_global_namespace_##uniq_name##__ {};                          \
+  static_assert(std::is_same<::__test_global_namespace_##uniq_name##__,       \
+                             __test_global_namespace_##uniq_name##__>::value, \
+                msg)
+
+#define PT_REGISTER_STANDARD_KERNEL(                                      \
+    op_name, backend, layout, dtype, kernel_fn)                           \
+  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                      \
+      __reg_pt_op_kernel_##op_name##_##backend##_##layout##_##dtype##__,  \
+      "PT_REGISTER_STANDARD_KERNEL must be called in global namespace."); \
+  static ::pt::OpKernelRegistrar                                          \
+      __pt_op_kernel_##op_name##_##backend##_##layout##_##dtype##__ =     \
+          ::pt::OpKernelRegistrar(#op_name,                               \
+                                  BACKEND(backend),                       \
+                                  DATALAYOUT(layout),                     \
+                                  DATATYPE(dtype),                        \
+                                  kernel_fn)
+
+#define PT_REGISTER_KERNEL_AUTO_SPECIALIZE(                              \
+    op_name, backend, layout, meta_kernel_fn, dtype)                     \
+  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                     \
+      __reg_pt_op_kernel_##op_name##_##backend##_##layout##_##dtype##__, \
+      "PT_REGISTER_KERNEL_AUTO_SPECIALIZE must be called in global "     \
+      "namespace.");                                                     \
+  static ::pt::OpKernelRegistrar                                         \
+      __pt_op_kernel_##op_name##_##backend##_##layout##_##dtype##__ =    \
+          ::pt::OpKernelRegistrar(#op_name,                              \
+                                  BACKEND(backend),                      \
+                                  DATALAYOUT(layout),                    \
+                                  ::pt::CppTypeToDataType<dtype>().type, \
+                                  PT_KERNEL(meta_kernel_fn<dtype>))
+
+#define PT_TORCH_KERNEL_REGISTRAR(op_name, backend, layout, dtype)          \
+  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                        \
+      __touch_pt_op_kernel_##op_name##_##backend##_##layout##_##dtype##__,  \
+      "PT_TORCH_KERNEL_REGISTRAR must be called in global namespace.");     \
+  int TouchOpKernelRegistrar_##op_name##_##backend##_##dtype##_##layout() { \
+    __pt_op_kernel_##op_name##_##backend##_##layout##_##dtype##__.Touch();  \
+    return 0;                                                               \
+  }
+
+/**
+ * In most cases, the backend, dtype and layout of Op's input and output
+ * are the same as OpKernel itself. In order to simplify the registration
+ * writing, we provide the following simple kernel registration macro.
+ * If it is an special case, please use PT_REGISTER_STANDARD_KERNEL
+ */
+#define PT_REGISTER_KERNEL_1T(op_name, backend, layout, meta_kernel_fn, dtype) \
+  PT_REGISTER_KERNEL_AUTO_SPECIALIZE(                                          \
+      op_name, backend, layout, meta_kernel_fn, dtype)                         \
+      .SetSameAsKernelKey();                                                   \
+  PT_TORCH_KERNEL_REGISTRAR(op_name, backend, layout, dtype)
+
+#define PT_REGISTER_KERNEL_2T(                                             \
+    op_name, backend, layout, meta_kernel_fn, dtype1, dtype2)              \
+  PT_REGISTER_KERNEL_1T(op_name, backend, layout, meta_kernel_fn, dtype1); \
+  PT_REGISTER_KERNEL_1T(op_name, backend, layout, meta_kernel_fn, dtype2)
+
+#define PT_REGISTER_KERNEL_3T(                                        \
+    op_name, backend, layout, meta_kernel_fn, dtype1, dtype2, dtype3) \
+  PT_REGISTER_KERNEL_2T(                                              \
+      op_name, backend, layout, meta_kernel_fn, dtype1, dtype2);      \
+  PT_REGISTER_KERNEL_1T(op_name, backend, layout, meta_kernel_fn, dtype3)
+
+#define PT_REGISTER_KERNEL_4T(                                                \
+    op_name, backend, layout, meta_kernel_fn, dtype1, dtype2, dtype3, dtype4) \
+  PT_REGISTER_KERNEL_2T(                                                      \
+      op_name, backend, layout, meta_kernel_fn, dtype1, dtype2);              \
+  PT_REGISTER_KERNEL_2T(                                                      \
+      op_name, backend, layout, meta_kernel_fn, dtype3, dtype4)
+
+#define PT_REGISTER_KERNEL_5T(op_name,                                   \
+                              backend,                                   \
+                              layout,                                    \
+                              meta_kernel_fn,                            \
+                              dtype1,                                    \
+                              dtype2,                                    \
+                              dtype3,                                    \
+                              dtype4,                                    \
+                              dtype5)                                    \
+  PT_REGISTER_KERNEL_3T(                                                 \
+      op_name, backend, layout, meta_kernel_fn, dtype1, dtype2, dtype3); \
+  PT_REGISTER_KERNEL_2T(                                                 \
+      op_name, backend, layout, meta_kernel_fn, dtype4, dtype5)
+
+#define PT_REGISTER_KERNEL_6T(op_name,                                   \
+                              backend,                                   \
+                              layout,                                    \
+                              meta_kernel_fn,                            \
+                              dtype1,                                    \
+                              dtype2,                                    \
+                              dtype3,                                    \
+                              dtype4,                                    \
+                              dtype5,                                    \
+                              dtype6)                                    \
+  PT_REGISTER_KERNEL_3T(                                                 \
+      op_name, backend, layout, meta_kernel_fn, dtype1, dtype2, dtype3); \
+  PT_REGISTER_KERNEL_3T(                                                 \
+      op_name, backend, layout, meta_kernel_fn, dtype4, dtype5, dtype6)
+
+#define PT_REGISTER_KERNEL_7T(op_name,        \
+                              backend,        \
+                              layout,         \
+                              meta_kernel_fn, \
+                              dtype1,         \
+                              dtype2,         \
+                              dtype3,         \
+                              dtype4,         \
+                              dtype5,         \
+                              dtype6,         \
+                              ftype7)         \
+  PT_REGISTER_KERNEL_4T(op_name,              \
+                        backend,              \
+                        layout,               \
+                        meta_kernel_fn,       \
+                        dtype1,               \
+                        dtype2,               \
+                        dtype3,               \
+                        dtype4);              \
+  PT_REGISTER_KERNEL_3T(                      \
+      op_name, backend, layout, meta_kernel_fn, dtype5, dtype6, dtype7)
+
+#define PT_REGISTER_KERNEL_8T(op_name,        \
+                              backend,        \
+                              layout,         \
+                              meta_kernel_fn, \
+                              dtype1,         \
+                              dtype2,         \
+                              dtype3,         \
+                              dtype4,         \
+                              dtype5,         \
+                              dtype6,         \
+                              dtype7,         \
+                              dtype8)         \
+  PT_REGISTER_KERNEL_4T(op_name,              \
+                        backend,              \
+                        layout,               \
+                        meta_kernel_fn,       \
+                        dtype1,               \
+                        dtype2,               \
+                        dtype3,               \
+                        dtype4);              \
+  PT_REGISTER_KERNEL_4T(op_name,              \
+                        backend,              \
+                        layout,               \
+                        meta_kernel_fn,       \
+                        dtype5,               \
+                        dtype6,               \
+                        dtype7,               \
+                        dtype8)
+
+/**
+ * Op Kernel declare macros
+ */
+
+#if defined(_WIN32)
+#define UNUSED
+#define __builtin_expect(EXP, C) (EXP)
+#else
+#define UNUSED __attribute__((unused))
+#endif
+
+#define PT_DECLARE_KERNEL_1T(op_name, backend, layout, dtype)                 \
+  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                          \
+      __dec_pt_op_kernel_##op_name##_##backend##_##layout##_##dtype##__,      \
+      "PT_DECLARE_KERNEL_*T must be called in global namespace.");            \
+  extern int                                                                  \
+      TouchOpKernelRegistrar_##op_name##_##backend##_##dtype##_##layout();    \
+  UNUSED static int                                                           \
+      __declare_pt_op_kernel_##op_name##_##backend##_##layout##_##dtype##__ = \
+          TouchOpKernelRegistrar_##op_name##_##backend##_##dtype##_##layout()
+
+#define PT_DECLARE_KERNEL_2T(op_name, backend, layout, dtype1, dtype2) \
+  PT_DECLARE_KERNEL_1T(op_name, backend, layout, dtype1);              \
+  PT_DECLARE_KERNEL_1T(op_name, backend, layout, dtype2)
+
+#define PT_DECLARE_KERNEL_3T(op_name, backend, layout, dtype1, dtype2, dtype3) \
+  PT_REGISTER_KERNEL_2T(op_name, backend, layout, dtype1, dtype2);             \
+  PT_DECLARE_KERNEL_1T(op_name, backend, layout, dtype3)
+
+#define PT_DECLARE_KERNEL_4T(                                     \
+    op_name, backend, layout, dtype1, dtype2, dtype3, dtype4)     \
+  PT_DECLARE_KERNEL_2T(op_name, backend, layout, dtype1, dtype2); \
+  PT_DECLARE_KERNEL_2T(op_name, backend, layout, dtype3, dtype4)
+
+#define PT_DECLARE_KERNEL_5T(                                             \
+    op_name, backend, layout, dtype1, dtype2, dtype3, dtype4, dtype5)     \
+  PT_DECLARE_KERNEL_3T(op_name, backend, layout, dtype1, dtype2, dtype3); \
+  PT_DECLARE_KERNEL_2T(op_name, backend, layout, dtype4, dtype5)
+
+#define PT_DECLARE_KERNEL_6T(                                                 \
+    op_name, backend, layout, dtype1, dtype2, dtype3, dtype4, dtype5, dtype6) \
+  PT_DECLARE_KERNEL_3T(op_name, backend, layout, dtype1, dtype2, dtype3);     \
+  PT_DECLARE_KERNEL_3T(op_name, backend, layout, dtype4, dtype5, dtype6)
+
+#define PT_DECLARE_KERNEL_7T(op_name,                            \
+                             backend,                            \
+                             layout,                             \
+                             dtype1,                             \
+                             dtype2,                             \
+                             dtype3,                             \
+                             dtype4,                             \
+                             dtype5,                             \
+                             dtype6,                             \
+                             ftype7)                             \
+  PT_DECLARE_KERNEL_4T(                                          \
+      op_name, backend, layout, dtype1, dtype2, dtype3, dtype4); \
+  PT_DECLARE_KERNEL_3T(op_name, backend, layout, dtype5, dtype6, dtype7)
+
+#define PT_DECLARE_KERNEL_8T(op_name,                            \
+                             backend,                            \
+                             layout,                             \
+                             dtype1,                             \
+                             dtype2,                             \
+                             dtype3,                             \
+                             dtype4,                             \
+                             dtype5,                             \
+                             dtype6,                             \
+                             dtype7,                             \
+                             dtype8)                             \
+  PT_DECLARE_KERNEL_4T(                                          \
+      op_name, backend, layout, dtype1, dtype2, dtype3, dtype4); \
+  PT_DECLARE_KERNEL_4T(op_name, backend, layout, dtype5, dtype6, dtype7, dtype8)
+
+}  // namespace pt
diff --git a/paddle/top/core/kernel_utils.h b/paddle/top/core/kernel_utils.h
new file mode 100644
index 0000000000000..b7676c5a21fa2
--- /dev/null
+++ b/paddle/top/core/kernel_utils.h
@@ -0,0 +1,148 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/top/core/kernel_context.h"
+#include "paddle/top/core/kernel_def.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace pt {
+
+// TODO(chenweihang): replaced by new DeviceContext later
+using CPUContext = paddle::platform::CPUDeviceContext;
+#ifdef PADDLE_WITH_CUDA
+using CUDAContext = paddle::platform::CUDADeviceContext;
+#endif
+#ifdef PADDLE_WITH_MKLDNN
+using MKLDNNContext = paddle::platform::MKLDNNDeviceContext;
+#endif
+#ifdef PADDLE_WITH_ASCEND_CL
+using NPUContext = paddle::platform::NPUDeviceContext;
+#endif
+#ifdef PADDLE_WITH_XPU
+using XPUContext = paddle::platform::XPUDeviceContext;
+#endif
+
+#define PT_KERNEL(...) \
+  ::pt::OpKernelImpl<decltype(&__VA_ARGS__), &__VA_ARGS__>::Compute
+
+#define PT_SPECIALIZE_OpKernelCallHelper_FOR_DEVICE_CONTEXT(dev_ctx)         \
+  template <typename... Tail>                                                \
+  struct OpKernelCallHelper<const dev_ctx&, Tail...> {                       \
+    template <int dev_ctx_idx,                                               \
+              int in_idx,                                                    \
+              int attr_idx,                                                  \
+              int out_idx,                                                   \
+              typename... PreviousArgs>                                      \
+    static void Compute(OpKernelContext* ctx, PreviousArgs&... pargs) {      \
+      static_assert(in_idx == 0,                                             \
+                    "Kernel's DeviceContext should appear before Inputs.");  \
+      static_assert(                                                         \
+          attr_idx == 0,                                                     \
+          "Kernel's DeviceContext should appear before Attributes.");        \
+      static_assert(out_idx == 0,                                            \
+                    "Kernel's DeviceContext should appear before Outputs."); \
+      const dev_ctx& arg = ctx->GetDeviceContext<dev_ctx>();                 \
+      OpKernelCallHelper<Tail...>::                                          \
+          template Compute<dev_ctx_idx + 1, in_idx, attr_idx, out_idx>(      \
+              ctx, pargs..., arg);                                           \
+    }                                                                        \
+  }
+
+template <typename T>
+struct TypeTag {};
+
+template <typename Fn, Fn fn>
+struct OpKernelImpl;
+
+template <typename Return, typename... Args, Return (*kernel_fn)(Args...)>
+struct OpKernelImpl<Return (*)(Args...), kernel_fn> {
+  static void Compute(OpKernelContext* ctx) {
+    OpKernelCallHelper<Args..., TypeTag<int>>::template Compute<0, 0, 0, 0>(
+        ctx);
+  }
+
+ private:
+  template <typename... RemainingArgs>
+  struct OpKernelCallHelper;
+
+  /* DeviceContext Helpers */
+
+  PT_SPECIALIZE_OpKernelCallHelper_FOR_DEVICE_CONTEXT(CPUContext);
+#ifdef PADDLE_WITH_CUDA
+  PT_SPECIALIZE_OpKernelCallHelper_FOR_DEVICE_CONTEXT(CUDAContext);
+#endif
+#ifdef PADDLE_WITH_XPU
+  PT_SPECIALIZE_OpKernelCallHelper_FOR_DEVICE_CONTEXT(XPUContext);
+#endif
+
+  /* Input Helpers */
+
+  template <typename... Tail>
+  struct OpKernelCallHelper<const DenseTensor&, Tail...> {
+    template <int dev_ctx_idx,
+              int in_idx,
+              int attr_idx,
+              int out_idx,
+              typename... PreviousArgs>
+    static void Compute(OpKernelContext* ctx, PreviousArgs&... pargs) {
+      static_assert(attr_idx == 0,
+                    "Kernel's Input should appear before Attributes.");
+      static_assert(out_idx == 0,
+                    "Kernel's Input should appear before Outputs.");
+      const DenseTensor& arg = ctx->InputAt<DenseTensor>(in_idx);
+      OpKernelCallHelper<Tail...>::
+          template Compute<dev_ctx_idx, in_idx + 1, attr_idx, out_idx>(
+              ctx, pargs..., arg);
+    }
+  };
+
+  /* Attribute Helpers */
+
+  /* Output Helpers */
+
+  template <typename... Tail>
+  struct OpKernelCallHelper<DenseTensor*, Tail...> {
+    template <int dev_ctx_idx,
+              int in_idx,
+              int attr_idx,
+              int out_idx,
+              typename... PreviousArgs>
+    static void Compute(OpKernelContext* ctx, PreviousArgs&... pargs) {
+      DenseTensor* arg = ctx->MutableOutputAt<DenseTensor>(out_idx);
+      OpKernelCallHelper<Tail...>::
+          template Compute<dev_ctx_idx, in_idx, attr_idx, out_idx + 1>(
+              ctx, pargs..., arg);
+    }
+  };
+
+  /* End case */
+  template <typename T>
+  struct OpKernelCallHelper<TypeTag<T>> {
+    template <int dev_ctx_idx, int in_idx, int attr_idx, int out_idx>
+    static void Compute(OpKernelContext* ctx, Args&... args) {
+      static_assert(dev_ctx_idx > 0,
+                    "Kernel should pass DeviceContext as argument.");
+      static_assert(out_idx > 0, "Kernel should have output argument.");
+      // TODO(chenweihang): check dev_ctx, in, attr, out number
+      return kernel_fn(args...);
+    }
+  };
+};
+
+}  // namespace pt
diff --git a/paddle/top/cpu/CMakeLists.txt b/paddle/top/cpu/CMakeLists.txt
index e69de29bb2d1d..874ea85b4b97f 100644
--- a/paddle/top/cpu/CMakeLists.txt
+++ b/paddle/top/cpu/CMakeLists.txt
@@ -0,0 +1 @@
+cc_library(math_cpu SRCS math.cc DEPS dense_tensor kernel_context kernel_factory eigen_function)
diff --git a/paddle/top/cpu/math.cc b/paddle/top/cpu/math.cc
new file mode 100644
index 0000000000000..670339cb4ba83
--- /dev/null
+++ b/paddle/top/cpu/math.cc
@@ -0,0 +1,33 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/top/cpu/math.h"
+
+namespace pt {}  // namespace pt
+
+// Register method 1:
+// PT_REGISTER_STANDARD_KERNEL(sign, CPU, NCHW, FLOAT32,
+// PT_KERNEL(pt::Sign<float>))
+//   .Input(BACKEND(CPU), DATALAYOUT(NCHW), DATATYPE(FLOAT32))
+//   .Output(BACKEND(CPU), DATALAYOUT(NCHW), DATATYPE(FLOAT32));
+// PT_TORCH_KERNEL_REGISTRAR(sign, CPU, NCHW, FLOAT32);
+
+// Register method 2:
+// PT_REGISTER_KERNEL_AUTO_SPECIALIZE(sign, CPU, NCHW, pt::Sign, float)
+//   .Input(BACKEND(CPU), DATALAYOUT(NCHW), DATATYPE(FLOAT32))
+//   .Output(BACKEND(CPU), DATALAYOUT(NCHW), DATATYPE(FLOAT32));
+// PT_TORCH_KERNEL_REGISTRAR(sign, CPU, NCHW, FLOAT32);
+
+// Register method 3:
+PT_REGISTER_KERNEL_2T(sign, CPU, NCHW, pt::Sign, float, double);
diff --git a/paddle/top/cpu/math.h b/paddle/top/cpu/math.h
index 8eef66edd9811..2c3a88550157a 100644
--- a/paddle/top/cpu/math.h
+++ b/paddle/top/cpu/math.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/top/core/dense_tensor.h"
+#include "paddle/top/core/kernel_registry.h"
 #include "paddle/top/module/scale.h"
 #include "paddle/top/module/sign.h"
 
@@ -80,3 +81,5 @@ void Scale(const CPUContext& dev_ctx,
 }
 
 }  // namespace pt
+
+PT_DECLARE_KERNEL_2T(sign, CPU, NCHW, float, double);
diff --git a/paddle/top/cuda/CMakeLists.txt b/paddle/top/cuda/CMakeLists.txt
index 328b81265f03d..cc64addf94d19 100644
--- a/paddle/top/cuda/CMakeLists.txt
+++ b/paddle/top/cuda/CMakeLists.txt
@@ -1 +1 @@
-nv_library(math_cuda SRCS math.cu DEPS device_context dense_tensor convert_utils)
+nv_library(math_cuda SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory)
diff --git a/paddle/top/cuda/math.cu b/paddle/top/cuda/math.cu
index 162fc45cf5c56..55184f7ff2431 100644
--- a/paddle/top/cuda/math.cu
+++ b/paddle/top/cuda/math.cu
@@ -23,6 +23,7 @@ namespace cub = hipcub;
 #endif
 
 #include "paddle/top/core/convert_utils.h"
+#include "paddle/top/core/kernel_registry.h"
 
 namespace pt {
 
@@ -64,7 +65,7 @@ void Mean(const CUDAContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
   pt::DenseTensor tmp(
       TensorMeta(paddle::framework::make_ddim(
                      {static_cast<int64_t>(temp_storage_bytes)}),
-                 pt::TransToPtenBackend(dev_ctx.GetPlace()),
+                 pt::TransToPtBackend(dev_ctx.GetPlace()),
                  x.type(),
                  x.layout()),
       TensorStatus());
@@ -85,3 +86,7 @@ template void Mean<paddle::platform::float16>(const CUDAContext& dev_ctx,
                                               DenseTensor* out);
 
 }  // namespace pt
+
+// PT_REGISTER_KERNEL_3T(sign, CUDA, NCHW, pt::Sign, float, double,
+// pt::float16);
+PT_REGISTER_KERNEL_2T(sign, CUDA, NCHW, pt::Sign, float, double);
diff --git a/paddle/top/tests/kernel_factory_test.cc b/paddle/top/tests/kernel_factory_test.cc
index 158f10c1e5c65..383d9f232d177 100644
--- a/paddle/top/tests/kernel_factory_test.cc
+++ b/paddle/top/tests/kernel_factory_test.cc
@@ -18,6 +18,6 @@ limitations under the License. */
 
 TEST(OpKernelFactory, OpKernelKey) {
   pt::OpKernelKey key(
-      pt::Backend::kCPU, pt::DataType::kFLOAT32, pt::DataLayout::kNCHW);
+      pt::Backend::kCPU, pt::DataLayout::kNCHW, pt::DataType::kFLOAT32);
   std::cout << key;
 }
diff --git a/paddle/top/xpu/CMakeLists.txt b/paddle/top/xpu/CMakeLists.txt
index e69de29bb2d1d..26a3758808c74 100644
--- a/paddle/top/xpu/CMakeLists.txt
+++ b/paddle/top/xpu/CMakeLists.txt
@@ -0,0 +1 @@
+cc_library(math_xpu SRCS math.cc DEPS dense_tensor kernel_context kernel_factory)
diff --git a/paddle/top/xpu/math.cc b/paddle/top/xpu/math.cc
new file mode 100644
index 0000000000000..44d1a260956eb
--- /dev/null
+++ b/paddle/top/xpu/math.cc
@@ -0,0 +1,19 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/top/xpu/math.h"
+
+#include "paddle/top/core/kernel_registry.h"
+
+PT_REGISTER_KERNEL_1T(sign, XPU, NCHW, pt::Sign, float);

From 0e18ff4bbeca57dbe613373988acd8af5b3b902e Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Mon, 16 Aug 2021 13:16:51 +0000
Subject: [PATCH 019/125] replace boost::any by paddle::any

---
 paddle/top/core/CMakeLists.txt   | 2 +-
 paddle/top/core/kernel_context.h | 8 +++-----
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/paddle/top/core/CMakeLists.txt b/paddle/top/core/CMakeLists.txt
index 74399ff623831..de21c1c79534b 100644
--- a/paddle/top/core/CMakeLists.txt
+++ b/paddle/top/core/CMakeLists.txt
@@ -13,4 +13,4 @@ cc_library(dense_tensor SRCS dense_tensor.cc DEPS enforce data_type ddim allocat
 cc_library(selected_rows_tensor SRCS selected_rows.cc DEPS dense_tensor)
 
 cc_library(kernel_factory SRCS kernel_factory.cc DEPS enforce backend dtype layout)
-cc_library(kernel_context SRCS kernel_context.cc DEPS boost device_context)
+cc_library(kernel_context SRCS kernel_context.cc DEPS device_context)
diff --git a/paddle/top/core/kernel_context.h b/paddle/top/core/kernel_context.h
index 7cf85f5c805cd..86c70e31f4ccf 100644
--- a/paddle/top/core/kernel_context.h
+++ b/paddle/top/core/kernel_context.h
@@ -16,9 +16,8 @@
 
 #include <utility>
 
-#include <boost/any.hpp>
-
 #include "paddle/top/core/tensor_interface.h"
+#include "paddle/utils/any.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
@@ -40,7 +39,7 @@ class OpKernelContext {
   OpKernelContext(const DeviceContext& dev_ctx,
                   const std::vector<std::shared_ptr<TensorInterface>>& inputs,
                   const std::vector<std::shared_ptr<TensorInterface>>& outputs,
-                  const std::vector<boost::any>& attrs)
+                  const std::vector<paddle::any>& attrs)
       : dev_ctx_(dev_ctx), inputs_(inputs), outputs_(outputs), attrs_(attrs) {}
 
   template <typename CtxType>
@@ -75,8 +74,7 @@ class OpKernelContext {
   // Note: can't use API Tensor here, the inference don't use this API Tensor
   std::vector<std::shared_ptr<TensorInterface>> inputs_{};
   std::vector<std::shared_ptr<TensorInterface>> outputs_{};
-  // TODO(chenweihang): replaced by paddle::any
-  std::vector<boost::any> attrs_{};
+  std::vector<paddle::any> attrs_{};
 
   // Only contains input like list[Tensor] need `range`
   // TODO(chenweihang): replaced by small_vector

From 805896bab4d2b312415f6a2d8ac477447539e92e Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Tue, 17 Aug 2021 05:34:22 +0000
Subject: [PATCH 020/125] fix several ci failed

---
 .../fluid/operators/mkldnn/scale_mkldnn_op.cc |  63 -----------
 paddle/fluid/operators/npu_op_runner.cc       | 104 ++++++++++++++++++
 paddle/fluid/operators/npu_op_runner.h        |  19 ++++
 paddle/fluid/platform/mkldnn_reuse.h          |  19 +++-
 paddle/top/cuda/math.h                        |   3 +-
 paddle/top/hip/CMakeLists.txt                 |   1 +
 paddle/top/mkldnn/base.h                      |  33 ++----
 paddle/top/mkldnn/math.h                      |  12 +-
 8 files changed, 157 insertions(+), 97 deletions(-)
 delete mode 100644 paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc

diff --git a/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc
deleted file mode 100644
index 84ac14d04b85b..0000000000000
--- a/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/platform/mkldnn_reuse.h"
-
-namespace paddle {
-namespace operators {
-
-using paddle::framework::Tensor;
-
-template <typename T>
-class ScaleMKLDNNKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    this->RunKernel(ctx);
-  }
-
-  void RunKernel(const framework::ExecutionContext& ctx) const {
-    const auto& dev_ctx =
-        ctx.template device_context<platform::MKLDNNDeviceContext>();
-    const auto& mkldnn_engine = dev_ctx.GetEngine();
-
-    auto* x = ctx.Input<Tensor>("X");
-    auto* out = ctx.Output<Tensor>("Out");
-
-    bool is_inplaced = x->IsSharedBufferWith(*out);
-
-    platform::ActivationMKLDNNHandler<T> handler(
-        mkldnn::algorithm::eltwise_linear, ctx, mkldnn_engine, ctx.GetPlace(),
-        x);
-
-    auto src_memory_p = handler.AcquireSrcMemory(x);
-    auto dst_memory_p =
-        is_inplaced ? src_memory_p : handler.AcquireDstMemory(out);
-    auto activation_p = handler.AcquireForwardPrimitive();
-
-    auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
-    activation_p->execute(astream, {{MKLDNN_ARG_FROM, *src_memory_p},
-                                    {MKLDNN_ARG_TO, *dst_memory_p}});
-    astream.wait();
-
-    out->set_layout(framework::DataLayout::kMKLDNN);
-    out->set_format(platform::GetMKLDNNFormat(*dst_memory_p));
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_KERNEL(scale, MKLDNN, paddle::platform::CPUPlace,
-                   ops::ScaleMKLDNNKernel<float>,
-                   ops::ScaleMKLDNNKernel<paddle::platform::bfloat16>);
diff --git a/paddle/fluid/operators/npu_op_runner.cc b/paddle/fluid/operators/npu_op_runner.cc
index bb6549c111988..9050823bc5b85 100644
--- a/paddle/fluid/operators/npu_op_runner.cc
+++ b/paddle/fluid/operators/npu_op_runner.cc
@@ -40,12 +40,26 @@ static std::map<framework::proto::VarType::Type, aclDataType>
         {framework::proto::VarType::FP16, ACL_FLOAT16},
         {framework::proto::VarType::FP32, ACL_FLOAT},
         {framework::proto::VarType::FP64, ACL_DOUBLE},
+        // for top dtype
+        {pt::DataType::kBOOL, ACL_BOOL},
+        {pt::DataType::kINT8, ACL_INT8},
+        {pt::DataType::kUINT8, ACL_UINT8},
+        {pt::DataType::kINT16, ACL_INT16},
+        {pt::DataType::kINT32, ACL_INT32},
+        {pt::DataType::kINT64, ACL_INT64},
+        {pt::DataType::kFLOAT16, ACL_FLOAT16},
+        {pt::DataType::kFLOAT32, ACL_FLOAT},
+        {pt::DataType::kFLOAT64, ACL_DOUBLE},
 };
 
 static std::map<DataLayout, aclFormat> DATA_LAYOUT_2_ACL_FORMAT = {
     {DataLayout::kNCHW, ACL_FORMAT_NCHW},
     {DataLayout::kNHWC, ACL_FORMAT_NHWC},
     {DataLayout::kAnyLayout, ACL_FORMAT_ND},
+    // for top dtype
+    {pt::DataLayout::kNCHW, ACL_FORMAT_NCHW},
+    {pt::DataLayout::kNHWC, ACL_FORMAT_NHWC},
+    {pt::DataLayout::kAny, ACL_FORMAT_ND},
 };
 
 aclDataType ConvertToNpuDtype(framework::proto::VarType::Type dtype) {
@@ -90,6 +104,16 @@ NpuOpRunner::NpuOpRunner(const std::string &op_type,
   AddAttrs(attrs);
 }
 
+NpuOpRunner::NpuOpRunner(const std::string &op_type,
+                         const std::vector<pt::DenseTensor> &inputs,
+                         const std::vector<pt::DenseTensor> &outputs,
+                         const NPUAttributeMap &attrs)
+    : op_type_(op_type) {
+  AddInputs(inputs);
+  AddOutputs(outputs);
+  AddAttrs(attrs);
+}
+
 NpuOpRunner::~NpuOpRunner() {
   VLOG(5) << "Free NpuOpRunner(" << this << ") of " << op_type_;
   // Is it safe to free the descs/buffers after run called in host ?
@@ -201,6 +225,14 @@ NpuOpRunner &NpuOpRunner::AddInput(const Tensor &tensor) {
   return *this;
 }
 
+NpuOpRunner &NpuOpRunner::AddInput(const pt::DenseTensor &tensor) {
+  // create aclTensorDesc
+  input_descs_.emplace_back(CreateTensorDesc(tensor));
+  // create aclDataBuffer
+  input_buffers_.emplace_back(CreateDataBuffer(tensor));
+  return *this;
+}
+
 NpuOpRunner &NpuOpRunner::AddInput(const Tensor &tensor, aclMemType mem_type) {
   // create aclTensorDesc
   input_descs_.emplace_back(CreateTensorDesc(tensor, mem_type));
@@ -281,6 +313,14 @@ NpuOpRunner &NpuOpRunner::AddOutput(const Tensor &tensor) {
   return *this;
 }
 
+NpuOpRunner &NpuOpRunner::AdOutput(const pt::DenseTensor &tensor) {
+  // create aclTensorDesc
+  output_descs_.emplace_back(CreateTensorDesc(tensor));
+  // create aclDataBuffer
+  output_buffers_.emplace_back(CreateDataBuffer(tensor));
+  return *this;
+}
+
 NpuOpRunner &NpuOpRunner::AddInputs(const std::vector<Tensor> &tensors) {
   input_descs_.reserve(tensors.size());
   input_buffers_.reserve(tensors.size());
@@ -293,6 +333,19 @@ NpuOpRunner &NpuOpRunner::AddInputs(const std::vector<Tensor> &tensors) {
   return *this;
 }
 
+NpuOpRunner &NpuOpRunner::AddInputs(
+    const std::vector<pt::DenseTensor> &tensors) {
+  input_descs_.reserve(tensors.size());
+  input_buffers_.reserve(tensors.size());
+  for (auto tensor : tensors) {
+    // create aclTensorDesc
+    input_descs_.emplace_back(CreateTensorDesc(tensor));
+    // create aclDataBuffer
+    input_buffers_.emplace_back(CreateDataBuffer(tensor));
+  }
+  return *this;
+}
+
 // NOTE(zhiqiu): For operators whose input is a list (such as concat, stack),
 // It is needed to set the name of each input tensor.
 NpuOpRunner &NpuOpRunner::AddInputNames(const std::vector<std::string> &names) {
@@ -320,6 +373,19 @@ NpuOpRunner &NpuOpRunner::AddOutputs(const std::vector<Tensor> &tensors) {
   return *this;
 }
 
+NpuOpRunner &NpuOpRunner::AddOutputs(
+    const std::vector<pt::DenseTensor> &tensors) {
+  output_descs_.reserve(tensors.size());
+  output_buffers_.reserve(tensors.size());
+  for (auto tensor : tensors) {
+    // create aclTensorDesc
+    output_descs_.emplace_back(CreateTensorDesc(tensor));
+    // create aclDataBuffer
+    output_buffers_.emplace_back(CreateDataBuffer(tensor));
+  }
+  return *this;
+}
+
 aclTensorDesc *NpuOpRunner::GetInputDesc(size_t index) {
   PADDLE_ENFORCE_LT(index, input_descs_.size(),
                     platform::errors::OutOfRange(
@@ -383,6 +449,35 @@ aclTensorDesc *NpuOpRunner::CreateTensorDesc(Tensor tensor,
   return desc;
 }
 
+aclTensorDesc *NpuOpRunner::CreateTensorDesc(pt::DenseTensor tensor,
+                                             aclMemType mem_type) {
+  auto dtype = ConvertToNpuDtype(tensor.type());
+  auto format = ConvertToNpuFormat(tensor.layout());
+  auto dims = framework::vectorize(tensor.dims());
+  int size = dims.size();
+  // TODO(pangyoki): `keep_prob` used in `DropOutGenMask` NPU
+  // OP must be a scalar with shape[0]. At present, the shape
+  // of the `prob` Tensor of this OP is forced to be set to 0
+  // in `npu_op_runner.cc`, which needs to be optimized later.
+  if (op_type_ == "DropOutGenMask" && size == 1 && *(dims.data()) == 1) {
+    size = 0;
+  }
+
+  VLOG(4) << "NPU dtype:" << dtype << " "
+          << "rank:" << dims.size() << " dims:" << tensor.dims()
+          << " format:" << format;
+
+  auto *desc = aclCreateTensorDesc(dtype, size, dims.data(), format);
+  PADDLE_ENFORCE_NOT_NULL(
+      desc, platform::errors::External("Call aclCreateTensorDesc failed."));
+  PADDLE_ENFORCE_NPU_SUCCESS(aclSetTensorStorageFormat(desc, format));
+  PADDLE_ENFORCE_NPU_SUCCESS(aclSetTensorStorageShape(desc, size, dims.data()));
+  if (mem_type == ACL_MEMTYPE_HOST) {
+    PADDLE_ENFORCE_NPU_SUCCESS(aclSetTensorPlaceMent(desc, mem_type));
+  }
+  return desc;
+}
+
 aclDataBuffer *NpuOpRunner::CreateDataBuffer(Tensor tensor) {
   void *ptr = tensor.data<void>();
   VLOG(4) << "NPU ptr: " << ptr << ", size: " << tensor.memory_size();
@@ -392,6 +487,15 @@ aclDataBuffer *NpuOpRunner::CreateDataBuffer(Tensor tensor) {
   return buffer;
 }
 
+aclDataBuffer *NpuOpRunner::CreateDataBuffer(pt::DenseTensor tensor) {
+  void *ptr = tensor.data<void>();
+  VLOG(4) << "NPU ptr: " << ptr << ", size: " << tensor.MemorySize();
+  auto *buffer = aclCreateDataBuffer(ptr, tensor.MemorySize());
+  PADDLE_ENFORCE_NOT_NULL(
+      buffer, platform::errors::External("Call aclCreateDataBuffer failed."));
+  return buffer;
+}
+
 void NpuOpRunner::Run(aclrtStream stream) const {
   if (!stream) {
     VLOG(4) << "Run with default current npu stream: " << stream;
diff --git a/paddle/fluid/operators/npu_op_runner.h b/paddle/fluid/operators/npu_op_runner.h
index 45e973970a956..eea76c0010004 100644
--- a/paddle/fluid/operators/npu_op_runner.h
+++ b/paddle/fluid/operators/npu_op_runner.h
@@ -24,6 +24,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
 
+#include "paddle/top/api/dev/core.h"
+
 namespace paddle {
 namespace operators {
 
@@ -42,6 +44,11 @@ class NpuOpRunner {
               const std::vector<Tensor> &outputs = {},
               const NPUAttributeMap &attrs = {});
 
+  NpuOpRunner(const std::string &op_type,
+              const std::vector<pt::DenseTensor> &inputs = {},
+              const std::vector<pt::DenseTensor> &outputs = {},
+              const NPUAttributeMap &attrs = {});
+
   // NOTE(zhiqiu): why forbid copy and operator= ?
   // Since we will free the tensor_descs and data_buffers in the ~NpuOpRunner,
   // if shallow copy is performed on tensor_descs and data_buffers, it may
@@ -62,6 +69,8 @@ class NpuOpRunner {
 
   NpuOpRunner &AddInput(const Tensor &tensor);
 
+  NpuOpRunner &AddInput(const pt::DenseTensor &tensor);
+
   // NOTE(zhiqiu): CANN-5.0.2 support input tensors on host.
   // Specifically, the tensor of shape, tensor of dims, etc, which are are small
   // vector/list.
@@ -77,12 +86,18 @@ class NpuOpRunner {
 
   NpuOpRunner &AddOutput(const Tensor &tensor);
 
+  NpuOpRunner &AddOutput(const pt::DenseTensor &tensor);
+
   NpuOpRunner &AddInputs(const std::vector<Tensor> &tensors);
 
+  NpuOpRunner &AddInputs(const std::vector<pt::DenseTensor> &tensors);
+
   NpuOpRunner &AddInputNames(const std::vector<std::string> &names);
 
   NpuOpRunner &AddOutputs(const std::vector<Tensor> &tensors);
 
+  NpuOpRunner &AddOutputs(const std::vector<pt::DenseTensor> &tensors);
+
   aclTensorDesc *GetInputDesc(size_t index);
 
   aclTensorDesc *GetOutputDesc(size_t index);
@@ -102,6 +117,10 @@ class NpuOpRunner {
                                   aclMemType mem_type = ACL_MEMTYPE_DEVICE);
   aclDataBuffer *CreateDataBuffer(Tensor tensor);
 
+  aclTensorDesc *CreateTensorDesc(pt::DenseTensor tensor,
+                                  aclMemType mem_type = ACL_MEMTYPE_DEVICE);
+  aclDataBuffer *CreateDataBuffer(pt::DenseTensor tensor);
+
  private:
   std::string op_type_;
   std::vector<aclDataBuffer *> input_buffers_;
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index be0a5018939cb..cefab1ed89d86 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -68,6 +68,13 @@ class MKLDNNHandlerNoCachingT {
                                             to_void_cast<T>(input_data));
   }
 
+  std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
+      const pt::DenseTensor* input) {
+    const T* input_data = input->data<T>();
+    return this->AcquireMemoryFromPrimitive(fwd_pd_->src_desc(),
+                                            to_void_cast<T>(input_data));
+  }
+
   template <typename T_out = T>
   std::shared_ptr<mkldnn::memory> AcquireDstMemory(framework::Tensor* output) {
     T_out* ptr =
@@ -75,6 +82,12 @@ class MKLDNNHandlerNoCachingT {
     return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc(), ptr);
   }
 
+  template <typename T_out = T>
+  std::shared_ptr<mkldnn::memory> AcquireDstMemory(pt::DenseTensor* output) {
+    T_out* ptr = output->mutable_data<T_out>();
+    return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc(), ptr);
+  }
+
   template <typename T_out = T>
   std::shared_ptr<mkldnn::memory> AcquireDstMemory(void) {
     return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc());
@@ -952,7 +965,6 @@ class BroadcastDataMKLDNNHandler
   std::shared_ptr<mkldnn::memory> AcquireDstMemory(framework::Tensor* output) {
     T_out* ptr = output->mutable_data<T_out>(
         this->place_, this->fwd_pd_->dst_desc().get_size());
-    ;
     memset(ptr, 0, this->fwd_pd_->dst_desc().get_size());
     return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc(), ptr,
                                             "@dst_mem_p");
@@ -1012,8 +1024,9 @@ class ActivationMKLDNNHandler
     if (algorithm == mkldnn::algorithm::eltwise_linear) {
       bool bias_after_scale = ctx.Attr<bool>("bias_after_scale");
       auto* scale_tensor = ctx.Input<Tensor>("ScaleTensor");
-      alpha = (scale_tensor == nullptr) ? ctx.Attr<float>("scale")
-                                        : (float)*(scale_tensor->data<T>());
+      alpha = (scale_tensor == nullptr)
+                  ? ctx.Attr<float>("scale")
+                  : (float)*(scale_tensor->data<T>());  // NOLINT
       beta = ctx.Attr<float>("bias");
       // if bias_after_scale == true
       //   out = scale*X + bias
diff --git a/paddle/top/cuda/math.h b/paddle/top/cuda/math.h
index 7e5f72521be39..2469a5720e13b 100644
--- a/paddle/top/cuda/math.h
+++ b/paddle/top/cuda/math.h
@@ -14,7 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#ifdef PADDLE_WITH_CUDA
+// CUDA and HIP use same api
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 
 #include "paddle/top/core/dense_tensor.h"
 #include "paddle/top/module/scale.h"
diff --git a/paddle/top/hip/CMakeLists.txt b/paddle/top/hip/CMakeLists.txt
index e69de29bb2d1d..2ff5ff075ccb6 100644
--- a/paddle/top/hip/CMakeLists.txt
+++ b/paddle/top/hip/CMakeLists.txt
@@ -0,0 +1 @@
+# hip use cuda api now, maybe this dir is needless
diff --git a/paddle/top/mkldnn/base.h b/paddle/top/mkldnn/base.h
index 2e280dd39aa52..3186ea9ae23a4 100644
--- a/paddle/top/mkldnn/base.h
+++ b/paddle/top/mkldnn/base.h
@@ -29,35 +29,20 @@ using MKLDNNDContext = paddle::platform::MKLDNNDeviceContext;
 // `ExecutionContext`, refactoring that may be a big project!
 
 template <typename T>
-class ScaleMKLDNNHandler
-    : public paddle::platform::MKLDNNHandlerT<T,
-                                              mkldnn::eltwise_forward,
-                                              mkldnn::eltwise_backward> {
+class ScaleMKLDNNHandler : public paddle::platform::MKLDNNHandlerNoCachingT<
+                               T,
+                               mkldnn::eltwise_forward,
+                               mkldnn::eltwise_backward> {
  public:
-  ScaleMKLDNNHandler(const MKLDNNDContext& dev_ctx,
+  ScaleMKLDNNHandler(const mkldnn::engine& engine,
                      const pt::MKLDNNDenseTensor& in_x,
-                     const std::string& unique_name,
-                     bool is_inplaced,
                      float alpha,
                      float beta,
                      bool bias_after_scale)
-      : paddle::platform::MKLDNNHandlerT<T,
-                                         mkldnn::eltwise_forward,
-                                         mkldnn::eltwise_backward>(
-            dev_ctx,
-            dev_ctx.GetEngine(),
-            in_x.place(),
-            is_inplaced ? paddle::platform::CreateKey(
-                              dev_ctx,
-                              paddle::framework::vectorize(in_x.dims()),
-                              "a",
-                              mkldnn::algorithm::eltwise_linear,
-                              unique_name)
-                        : paddle::platform::CreateKey(
-                              dev_ctx,
-                              paddle::framework::vectorize(in_x.dims()),
-                              "a",
-                              unique_name)) {
+      : paddle::platform::MKLDNNHandlerNoCachingT<T,
+                                                  mkldnn::eltwise_forward,
+                                                  mkldnn::eltwise_backward>(
+            engine, in_x.place()) {
     if (!bias_after_scale) {
       beta *= alpha;
     }
diff --git a/paddle/top/mkldnn/math.h b/paddle/top/mkldnn/math.h
index a4e8681405e4a..2c7914715c7e5 100644
--- a/paddle/top/mkldnn/math.h
+++ b/paddle/top/mkldnn/math.h
@@ -30,19 +30,19 @@ void Scale(const MKLDNNDContext& dev_ctx,
            float bias,
            bool bias_after_scale,
            MKLDNNDenseTensor* out) {
-  bool is_inplaced = x.allocation() && x.allocation() == out->allocation();
+  const auto mkldnn_engine = dev_ctx.GetEngine();
 
-  // TODO(chenweihang): add `name` into TensorMeta?
-  ScaleMKLDNNHandler<T> handler(dev_ctx,
+  ScaleMKLDNNHandler<T> handler(mkldnn_engine,
                                 x,
-                                /*unique_name=*/"X",
-                                is_inplaced,
                                 /*alpha=*/scale,
                                 /*beta=*/bias,
                                 bias_after_scale);
 
+  bool is_inplaced = x.allocation() && x.allocation() == out->allocation();
+
   auto src_memory_p = handler.AcquireSrcMemory(&x);
-  auto dst_memory_p = handler.AcquireDstMemory(out);
+  auto dst_memory_p =
+      is_inplaced ? src_memory_p : handler.AcquireDstMemory(out);
   auto activation_p = handler.AcquireForwardPrimitive();
 
   auto& astream = MKLDNNDContext::tls().get_stream();

From fc4442b2caf44b60ae2f7014c23659e6740d217a Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Tue, 17 Aug 2021 06:19:57 +0000
Subject: [PATCH 021/125] fix npu compile error

---
 paddle/fluid/operators/npu_op_runner.cc | 46 +++++++++++++++++--------
 paddle/fluid/operators/npu_op_runner.h  |  4 +--
 2 files changed, 34 insertions(+), 16 deletions(-)

diff --git a/paddle/fluid/operators/npu_op_runner.cc b/paddle/fluid/operators/npu_op_runner.cc
index 9050823bc5b85..56b4148e1bece 100644
--- a/paddle/fluid/operators/npu_op_runner.cc
+++ b/paddle/fluid/operators/npu_op_runner.cc
@@ -40,23 +40,23 @@ static std::map<framework::proto::VarType::Type, aclDataType>
         {framework::proto::VarType::FP16, ACL_FLOAT16},
         {framework::proto::VarType::FP32, ACL_FLOAT},
         {framework::proto::VarType::FP64, ACL_DOUBLE},
-        // for top dtype
-        {pt::DataType::kBOOL, ACL_BOOL},
-        {pt::DataType::kINT8, ACL_INT8},
-        {pt::DataType::kUINT8, ACL_UINT8},
-        {pt::DataType::kINT16, ACL_INT16},
-        {pt::DataType::kINT32, ACL_INT32},
-        {pt::DataType::kINT64, ACL_INT64},
-        {pt::DataType::kFLOAT16, ACL_FLOAT16},
-        {pt::DataType::kFLOAT32, ACL_FLOAT},
-        {pt::DataType::kFLOAT64, ACL_DOUBLE},
+};
+
+static std::map<pt::DataType aclDataType> PT_DTYPE_2_ACL_DTYPE = {
+    {pt::DataType::kBOOL, ACL_BOOL},       {pt::DataType::kINT8, ACL_INT8},
+    {pt::DataType::kUINT8, ACL_UINT8},     {pt::DataType::kINT16, ACL_INT16},
+    {pt::DataType::kINT32, ACL_INT32},     {pt::DataType::kINT64, ACL_INT64},
+    {pt::DataType::kFLOAT16, ACL_FLOAT16}, {pt::DataType::kFLOAT32, ACL_FLOAT},
+    {pt::DataType::kFLOAT64, ACL_DOUBLE},
 };
 
 static std::map<DataLayout, aclFormat> DATA_LAYOUT_2_ACL_FORMAT = {
     {DataLayout::kNCHW, ACL_FORMAT_NCHW},
     {DataLayout::kNHWC, ACL_FORMAT_NHWC},
     {DataLayout::kAnyLayout, ACL_FORMAT_ND},
-    // for top dtype
+};
+
+static std::map<pt::DataLayout, aclFormat> PT_DATA_LAYOUT_2_ACL_FORMAT = {
     {pt::DataLayout::kNCHW, ACL_FORMAT_NCHW},
     {pt::DataLayout::kNHWC, ACL_FORMAT_NHWC},
     {pt::DataLayout::kAny, ACL_FORMAT_ND},
@@ -71,6 +71,15 @@ aclDataType ConvertToNpuDtype(framework::proto::VarType::Type dtype) {
   return iter->second;
 }
 
+aclDataType ConvertToNpuDtype(pt::DataType dtype) {
+  auto iter = PT_DTYPE_2_ACL_DTYPE.find(dtype);
+  PADDLE_ENFORCE_NE(
+      iter, PT_DTYPE_2_ACL_DTYPE.end(),
+      platform::errors::NotFound(
+          "The data type (%s) can not convert to ACL data type.", dtype));
+  return iter->second;
+}
+
 aclFormat ConvertToNpuFormat(DataLayout layout) {
   auto iter = DATA_LAYOUT_2_ACL_FORMAT.find(layout);
   PADDLE_ENFORCE_NE(
@@ -80,6 +89,15 @@ aclFormat ConvertToNpuFormat(DataLayout layout) {
   return iter->second;
 }
 
+aclFormat ConvertToNpuFormat(pt::DataLayout layout) {
+  auto iter = PT_DATA_LAYOUT_2_ACL_FORMAT.find(layout);
+  PADDLE_ENFORCE_NE(
+      iter, PT_DATA_LAYOUT_2_ACL_FORMAT.end(),
+      platform::errors::NotFound(
+          "The data type (%s) can not convert to ACL data type.", layout));
+  return iter->second;
+}
+
 aclrtStream GetCurrentNPUStream(int device_id) {
   if (device_id == -1) {
     device_id = platform::GetCurrentNPUDeviceId();
@@ -449,7 +467,7 @@ aclTensorDesc *NpuOpRunner::CreateTensorDesc(Tensor tensor,
   return desc;
 }
 
-aclTensorDesc *NpuOpRunner::CreateTensorDesc(pt::DenseTensor tensor,
+aclTensorDesc *NpuOpRunner::CreateTensorDesc(const pt::DenseTensor &tensor,
                                              aclMemType mem_type) {
   auto dtype = ConvertToNpuDtype(tensor.type());
   auto format = ConvertToNpuFormat(tensor.layout());
@@ -487,8 +505,8 @@ aclDataBuffer *NpuOpRunner::CreateDataBuffer(Tensor tensor) {
   return buffer;
 }
 
-aclDataBuffer *NpuOpRunner::CreateDataBuffer(pt::DenseTensor tensor) {
-  void *ptr = tensor.data<void>();
+aclDataBuffer *NpuOpRunner::CreateDataBuffer(const pt::DenseTensor &tensor) {
+  const void *ptr = tensor.data<void>();
   VLOG(4) << "NPU ptr: " << ptr << ", size: " << tensor.MemorySize();
   auto *buffer = aclCreateDataBuffer(ptr, tensor.MemorySize());
   PADDLE_ENFORCE_NOT_NULL(
diff --git a/paddle/fluid/operators/npu_op_runner.h b/paddle/fluid/operators/npu_op_runner.h
index eea76c0010004..19f5f5debe2cc 100644
--- a/paddle/fluid/operators/npu_op_runner.h
+++ b/paddle/fluid/operators/npu_op_runner.h
@@ -117,9 +117,9 @@ class NpuOpRunner {
                                   aclMemType mem_type = ACL_MEMTYPE_DEVICE);
   aclDataBuffer *CreateDataBuffer(Tensor tensor);
 
-  aclTensorDesc *CreateTensorDesc(pt::DenseTensor tensor,
+  aclTensorDesc *CreateTensorDesc(const pt::DenseTensor &tensor,
                                   aclMemType mem_type = ACL_MEMTYPE_DEVICE);
-  aclDataBuffer *CreateDataBuffer(pt::DenseTensor tensor);
+  aclDataBuffer *CreateDataBuffer(const pt::DenseTensor &tensor);
 
  private:
   std::string op_type_;

From cefe30a3f34ba78099e19305f3a4a940d2d72709 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Tue, 17 Aug 2021 08:58:43 +0000
Subject: [PATCH 022/125] add ordered map util

---
 paddle/fluid/framework/operator.cc |    4 +-
 paddle/fluid/framework/type_defs.h |   16 +-
 paddle/fluid/platform/variant.h    |    3 +-
 paddle/utils/ordered_hash.h        | 1690 ++++++++++++++++++++++++++++
 paddle/utils/ordered_map.h         | 1022 +++++++++++++++++
 5 files changed, 2729 insertions(+), 6 deletions(-)
 create mode 100644 paddle/utils/ordered_hash.h
 create mode 100644 paddle/utils/ordered_map.h

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 6a9f557770533..ad030a46b9fa8 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -461,8 +461,8 @@ void OperatorBase::CheckAllInputOutputSet() const {
 
 void OperatorBase::GenerateTemporaryNames() {
   static std::atomic<size_t> gUniqId(0UL);
-  for (auto& output : outputs_) {
-    for (auto& output_name : output.second) {
+  for (auto it = outputs_.begin(); it != outputs_.end(); ++it) {
+    for (auto& output_name : it.value()) {
       if (output_name == kTempVarName) {
         output_name += type_;
         output_name += "@";
diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h
index 951daea47bde3..8d6a9305a0704 100644
--- a/paddle/fluid/framework/type_defs.h
+++ b/paddle/fluid/framework/type_defs.h
@@ -22,6 +22,7 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/imperative/type_defs.h"
 #include "paddle/fluid/platform/variant.h"
+#include "paddle/utils/ordered_map.h"
 
 namespace paddle {
 namespace framework {
@@ -33,9 +34,18 @@ class BlockDesc;
 class Variable;
 class InferNoNeedBufferVarsFN;
 
-using VariableNameMap = std::map<std::string, std::vector<std::string>>;
-// TODO(panyx0718): Replace vector with something like gtl::Vector.
-using VariableValueMap = std::map<std::string, std::vector<Variable*>>;
+/**
+ * Why need ordered_map ?
+ *
+ * The inputs and outputs in OpProto are ordered, but when they used for build
+ * OpDesc and Operator, the order info is lost, which cause we can't access Op's
+ * inputs and outputs by index, can't construct vector format KernelContext at
+ * low cost.
+ */
+using VariableNameMap =
+    paddle::ordered_map<std::string, std::vector<std::string>>;
+using VariableValueMap =
+    paddle::ordered_map<std::string, std::vector<Variable*>>;
 
 // The order should be as same as framework.proto
 using Attribute = boost::variant<
diff --git a/paddle/fluid/platform/variant.h b/paddle/fluid/platform/variant.h
index 0f802c08842d0..8c8fb525cc7e0 100644
--- a/paddle/fluid/platform/variant.h
+++ b/paddle/fluid/platform/variant.h
@@ -38,12 +38,13 @@ limitations under the License. */
 #endif
 #endif
 
-#include <boost/any.hpp>
 #include <boost/mpl/comparison.hpp>
 #include <boost/mpl/less_equal.hpp>
 #include <boost/optional.hpp>
 #include <boost/variant.hpp>
 
+#include "paddle/utils/any.h"
+
 // some platform-independent defintion
 #if defined(_WIN32)
 #define UNUSED
diff --git a/paddle/utils/ordered_hash.h b/paddle/utils/ordered_hash.h
new file mode 100644
index 0000000000000..0172fb0da2be9
--- /dev/null
+++ b/paddle/utils/ordered_hash.h
@@ -0,0 +1,1690 @@
+/**
+ * Copy from https://github.com/Tessil/ordered-map
+ * Modified the following points:
+ * 1. modify namespace from `tsl` to `paddle`
+ * 2. modify some naming prefixes from `tsl` to `paddle`
+ * 3. refine code-format by pre-commit hook
+ */
+
+/**
+ * MIT License
+ *
+ * Copyright (c) 2017 Thibaut Goetghebuer-Planchon <tessil@gmx.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <climits>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <exception>
+#include <functional>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <stdexcept>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+/**
+ * Macros for compatibility with GCC 4.8
+ */
+#if (defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ < 9))
+#define PADDLE_OH_NO_CONTAINER_ERASE_CONST_ITERATOR
+#define PADDLE_OH_NO_CONTAINER_EMPLACE_CONST_ITERATOR
+#endif
+
+/**
+ * Only activate paddle_oh_assert if PADDLE_DEBUG is defined.
+ * This way we avoid the performance hit when NDEBUG is not defined with assert
+ * as paddle_oh_assert is used a lot (people usually compile with "-O3" and not
+ * "-O3 -DNDEBUG").
+ */
+#ifdef PADDLE_DEBUG
+#define paddle_oh_assert(expr) assert(expr)
+#else
+#define paddle_oh_assert(expr) (static_cast<void>(0))
+#endif
+
+/**
+ * If exceptions are enabled, throw the exception passed in parameter, otherwise
+ * call std::terminate.
+ */
+#if (defined(__cpp_exceptions) || defined(__EXCEPTIONS) || \
+     (defined(_MSC_VER) && defined(_CPPUNWIND))) &&        \
+    !defined(PADDLE_NO_EXCEPTIONS)
+#define PADDLE_OH_THROW_OR_TERMINATE(ex, msg) throw ex(msg)
+#else
+#define PADDLE_OH_NO_EXCEPTIONS
+#ifdef NDEBUG
+#define PADDLE_OH_THROW_OR_TERMINATE(ex, msg) std::terminate()
+#else
+#include <iostream>
+#define PADDLE_OH_THROW_OR_TERMINATE(ex, msg) \
+  do {                                        \
+    std::cerr << msg << std::endl;            \
+    std::terminate();                         \
+  } while (0)
+#endif
+#endif
+
+namespace paddle {
+
+namespace detail_ordered_hash {
+
+template <typename T>
+struct make_void {
+  using type = void;
+};
+
+template <typename T, typename = void>
+struct has_is_transparent : std::false_type {};
+
+template <typename T>
+struct has_is_transparent<T,
+                          typename make_void<typename T::is_transparent>::type>
+    : std::true_type {};
+
+template <typename T, typename = void>
+struct is_vector : std::false_type {};
+
+template <typename T>
+struct is_vector<T,
+                 typename std::enable_if<std::is_same<
+                     T,
+                     std::vector<typename T::value_type,
+                                 typename T::allocator_type>>::value>::type>
+    : std::true_type {};
+
+// Only available in C++17, we need to be compatible with C++11
+template <class T>
+const T& clamp(const T& v, const T& lo, const T& hi) {
+  return std::min(hi, std::max(lo, v));
+}
+
+template <typename T, typename U>
+static T numeric_cast(U value,
+                      const char* error_message = "numeric_cast() failed.") {
+  T ret = static_cast<T>(value);
+  if (static_cast<U>(ret) != value) {
+    PADDLE_OH_THROW_OR_TERMINATE(std::runtime_error, error_message);
+  }
+
+  const bool is_same_signedness =
+      (std::is_unsigned<T>::value && std::is_unsigned<U>::value) ||
+      (std::is_signed<T>::value && std::is_signed<U>::value);
+  if (!is_same_signedness && (ret < T{}) != (value < U{})) {
+    PADDLE_OH_THROW_OR_TERMINATE(std::runtime_error, error_message);
+  }
+
+  return ret;
+}
+
+/**
+ * Fixed size type used to represent size_type values on serialization. Need to
+ * be big enough to represent a std::size_t on 32 and 64 bits platforms, and
+ * must be the same size on both platforms.
+ */
+using slz_size_type = std::uint64_t;
+static_assert(std::numeric_limits<slz_size_type>::max() >=
+                  std::numeric_limits<std::size_t>::max(),
+              "slz_size_type must be >= std::size_t");
+
+template <class T, class Deserializer>
+static T deserialize_value(Deserializer& deserializer) {  // NOLINT
+// MSVC < 2017 is not conformant, circumvent the problem by removing the
+// template keyword
+#if defined(_MSC_VER) && _MSC_VER < 1910
+  return deserializer.Deserializer::operator()<T>();
+#else
+  return deserializer.Deserializer::template operator()<T>();
+#endif
+}
+
+/**
+ * Each bucket entry stores an index which is the index in m_values
+ * corresponding to the bucket's value and a hash (which may be truncated to 32
+ * bits depending on IndexType) corresponding to the hash of the value.
+ *
+ * The size of IndexType limits the size of the hash table to
+ * std::numeric_limits<IndexType>::max() - 1 elements (-1 due to a reserved
+ * value used to mark a bucket as empty).
+ */
+template <class IndexType>
+class bucket_entry {
+  static_assert(std::is_unsigned<IndexType>::value,
+                "IndexType must be an unsigned value.");
+  static_assert(std::numeric_limits<IndexType>::max() <=
+                    std::numeric_limits<std::size_t>::max(),
+                "std::numeric_limits<IndexType>::max() must be <= "
+                "std::numeric_limits<std::size_t>::max().");
+
+ public:
+  using index_type = IndexType;
+  using truncated_hash_type = typename std::conditional<
+      std::numeric_limits<IndexType>::max() <=
+          std::numeric_limits<std::uint_least32_t>::max(),
+      std::uint_least32_t,
+      std::size_t>::type;
+
+  bucket_entry() noexcept : m_index(EMPTY_MARKER_INDEX), m_hash(0) {}
+
+  bool empty() const noexcept { return m_index == EMPTY_MARKER_INDEX; }
+
+  void clear() noexcept { m_index = EMPTY_MARKER_INDEX; }
+
+  index_type index() const noexcept {
+    paddle_oh_assert(!empty());
+    return m_index;
+  }
+
+  index_type& index_ref() noexcept {
+    paddle_oh_assert(!empty());
+    return m_index;
+  }
+
+  void set_index(index_type index) noexcept {
+    paddle_oh_assert(index <= max_size());
+
+    m_index = index;
+  }
+
+  truncated_hash_type truncated_hash() const noexcept {
+    paddle_oh_assert(!empty());
+    return m_hash;
+  }
+
+  truncated_hash_type& truncated_hash_ref() noexcept {
+    paddle_oh_assert(!empty());
+    return m_hash;
+  }
+
+  void set_hash(std::size_t hash) noexcept { m_hash = truncate_hash(hash); }
+
+  template <class Serializer>
+  void serialize(Serializer& serializer) const {  // NOLINT
+    const slz_size_type index = m_index;
+    serializer(index);
+
+    const slz_size_type hash = m_hash;
+    serializer(hash);
+  }
+
+  template <class Deserializer>
+  static bucket_entry deserialize(Deserializer& deserializer) {  // NOLINT
+    const slz_size_type index = deserialize_value<slz_size_type>(deserializer);
+    const slz_size_type hash = deserialize_value<slz_size_type>(deserializer);
+
+    bucket_entry bentry;
+    bentry.m_index =
+        numeric_cast<index_type>(index, "Deserialized index is too big.");
+    bentry.m_hash = numeric_cast<truncated_hash_type>(
+        hash, "Deserialized hash is too big.");
+
+    return bentry;
+  }
+
+  static truncated_hash_type truncate_hash(std::size_t hash) noexcept {
+    return truncated_hash_type(hash);
+  }
+
+  static std::size_t max_size() noexcept {
+    return static_cast<std::size_t>(std::numeric_limits<index_type>::max()) -
+           NB_RESERVED_INDEXES;
+  }
+
+ private:
+  static const index_type EMPTY_MARKER_INDEX =
+      std::numeric_limits<index_type>::max();
+  static const std::size_t NB_RESERVED_INDEXES = 1;
+
+  index_type m_index;
+  truncated_hash_type m_hash;
+};
+
+/**
+ * Internal common class used by ordered_map and ordered_set.
+ *
+ * ValueType is what will be stored by ordered_hash (usually std::pair<Key, T>
+ * for map and Key for set).
+ *
+ * KeySelect should be a FunctionObject which takes a ValueType in parameter and
+ * return a reference to the key.
+ *
+ * ValueSelect should be a FunctionObject which takes a ValueType in parameter
+ * and return a reference to the value. ValueSelect should be void if there is
+ * no value (in set for example).
+ *
+ * ValueTypeContainer is the container which will be used to store ValueType
+ * values. Usually a std::deque<ValueType, Allocator> or std::vector<ValueType,
+ * Allocator>.
+ *
+ *
+ *
+ * The ordered_hash structure is a hash table which preserves the order of
+ * insertion of the elements. To do so, it stores the values in the
+ * ValueTypeContainer (m_values) using emplace_back at each insertion of a new
+ * element. Another structure (m_buckets of type std::vector<bucket_entry>) will
+ * serve as buckets array for the hash table part. Each bucket stores an index
+ * which corresponds to the index in m_values where the bucket's value is and
+ * the (truncated) hash of this value. An index is used instead of a pointer to
+ * the value to reduce the size of each bucket entry.
+ *
+ * To resolve collisions in the buckets array, the structures use robin hood
+ * linear probing with backward shift deletion.
+ */
+template <class ValueType,
+          class KeySelect,
+          class ValueSelect,
+          class Hash,
+          class KeyEqual,
+          class Allocator,
+          class ValueTypeContainer,
+          class IndexType>
+class ordered_hash : private Hash, private KeyEqual {
+ private:
+  template <typename U>
+  using has_mapped_type =
+      typename std::integral_constant<bool, !std::is_same<U, void>::value>;
+
+  static_assert(
+      std::is_same<typename ValueTypeContainer::value_type, ValueType>::value,
+      "ValueTypeContainer::value_type != ValueType. "
+      "Check that the ValueTypeContainer has 'Key' as type for a set or "
+      "'std::pair<Key, T>' as type for a map.");
+
+  static_assert(std::is_same<typename ValueTypeContainer::allocator_type,
+                             Allocator>::value,
+                "ValueTypeContainer::allocator_type != Allocator. "
+                "Check that the allocator for ValueTypeContainer is the same "
+                "as Allocator.");
+
+  static_assert(std::is_same<typename Allocator::value_type, ValueType>::value,
+                "Allocator::value_type != ValueType. "
+                "Check that the allocator has 'Key' as type for a set or "
+                "'std::pair<Key, T>' as type for a map.");
+
+ public:
+  template <bool IsConst>
+  class ordered_iterator;
+
+  using key_type = typename KeySelect::key_type;
+  using value_type = ValueType;
+  using size_type = std::size_t;
+  using difference_type = std::ptrdiff_t;
+  using hasher = Hash;
+  using key_equal = KeyEqual;
+  using allocator_type = Allocator;
+  using reference = value_type&;
+  using const_reference = const value_type&;
+  using pointer = value_type*;
+  using const_pointer = const value_type*;
+  using iterator = ordered_iterator<false>;
+  using const_iterator = ordered_iterator<true>;
+  using reverse_iterator = std::reverse_iterator<iterator>;
+  using const_reverse_iterator = std::reverse_iterator<const_iterator>;
+
+  using values_container_type = ValueTypeContainer;
+
+ public:
+  template <bool IsConst>
+  class ordered_iterator {
+    friend class ordered_hash;
+
+   private:
+    using iterator = typename std::conditional<
+        IsConst,
+        typename values_container_type::const_iterator,
+        typename values_container_type::iterator>::type;
+
+    explicit ordered_iterator(iterator it) noexcept : m_iterator(it) {}
+
+   public:
+    using iterator_category = std::random_access_iterator_tag;
+    using value_type = const typename ordered_hash::value_type;
+    using difference_type = typename iterator::difference_type;
+    using reference = value_type&;
+    using pointer = value_type*;
+
+    ordered_iterator() noexcept {}
+
+    // Copy constructor from iterator to const_iterator.
+    template <bool TIsConst = IsConst,
+              typename std::enable_if<TIsConst>::type* = nullptr>
+    ordered_iterator(const ordered_iterator<!TIsConst>& other) noexcept
+        : m_iterator(other.m_iterator) {}
+
+    ordered_iterator(const ordered_iterator& other) = default;
+    ordered_iterator(ordered_iterator&& other) = default;
+    ordered_iterator& operator=(const ordered_iterator& other) = default;
+    ordered_iterator& operator=(ordered_iterator&& other) = default;
+
+    const typename ordered_hash::key_type& key() const {
+      return KeySelect()(*m_iterator);
+    }
+
+    template <class U = ValueSelect,
+              typename std::enable_if<has_mapped_type<U>::value &&
+                                      IsConst>::type* = nullptr>
+    const typename U::value_type& value() const {
+      return U()(*m_iterator);
+    }
+
+    template <class U = ValueSelect,
+              typename std::enable_if<has_mapped_type<U>::value &&
+                                      !IsConst>::type* = nullptr>
+    typename U::value_type& value() {
+      return U()(*m_iterator);
+    }
+
+    reference operator*() const { return *m_iterator; }
+    pointer operator->() const { return m_iterator.operator->(); }
+
+    ordered_iterator& operator++() {
+      ++m_iterator;
+      return *this;
+    }
+    ordered_iterator& operator--() {
+      --m_iterator;
+      return *this;
+    }
+
+    ordered_iterator operator++(int) {
+      ordered_iterator tmp(*this);
+      ++(*this);
+      return tmp;
+    }
+    ordered_iterator operator--(int) {
+      ordered_iterator tmp(*this);
+      --(*this);
+      return tmp;
+    }
+
+    reference operator[](difference_type n) const { return m_iterator[n]; }
+
+    ordered_iterator& operator+=(difference_type n) {
+      m_iterator += n;
+      return *this;
+    }
+    ordered_iterator& operator-=(difference_type n) {
+      m_iterator -= n;
+      return *this;
+    }
+
+    ordered_iterator operator+(difference_type n) {
+      ordered_iterator tmp(*this);
+      tmp += n;
+      return tmp;
+    }
+    ordered_iterator operator-(difference_type n) {
+      ordered_iterator tmp(*this);
+      tmp -= n;
+      return tmp;
+    }
+
+    friend bool operator==(const ordered_iterator& lhs,
+                           const ordered_iterator& rhs) {
+      return lhs.m_iterator == rhs.m_iterator;
+    }
+
+    friend bool operator!=(const ordered_iterator& lhs,
+                           const ordered_iterator& rhs) {
+      return lhs.m_iterator != rhs.m_iterator;
+    }
+
+    friend bool operator<(const ordered_iterator& lhs,
+                          const ordered_iterator& rhs) {
+      return lhs.m_iterator < rhs.m_iterator;
+    }
+
+    friend bool operator>(const ordered_iterator& lhs,
+                          const ordered_iterator& rhs) {
+      return lhs.m_iterator > rhs.m_iterator;
+    }
+
+    friend bool operator<=(const ordered_iterator& lhs,
+                           const ordered_iterator& rhs) {
+      return lhs.m_iterator <= rhs.m_iterator;
+    }
+
+    friend bool operator>=(const ordered_iterator& lhs,
+                           const ordered_iterator& rhs) {
+      return lhs.m_iterator >= rhs.m_iterator;
+    }
+
+    friend ordered_iterator operator+(difference_type n,
+                                      const ordered_iterator& it) {
+      return n + it.m_iterator;
+    }
+
+    friend difference_type operator-(const ordered_iterator& lhs,
+                                     const ordered_iterator& rhs) {
+      return lhs.m_iterator - rhs.m_iterator;
+    }
+
+   private:
+    iterator m_iterator;
+  };
+
+ private:
+  using bucket_entry = paddle::detail_ordered_hash::bucket_entry<IndexType>;
+
+  using buckets_container_allocator = typename std::allocator_traits<
+      allocator_type>::template rebind_alloc<bucket_entry>;
+
+  using buckets_container_type =
+      std::vector<bucket_entry, buckets_container_allocator>;
+
+  using truncated_hash_type = typename bucket_entry::truncated_hash_type;
+  using index_type = typename bucket_entry::index_type;
+
+ public:
+  ordered_hash(size_type bucket_count,
+               const Hash& hash,
+               const KeyEqual& equal,
+               const Allocator& alloc,
+               float max_load_factor)
+      : Hash(hash),
+        KeyEqual(equal),
+        m_buckets_data(alloc),
+        m_buckets(static_empty_bucket_ptr()),
+        m_hash_mask(0),
+        m_values(alloc),
+        m_grow_on_next_insert(false) {
+    if (bucket_count > max_bucket_count()) {
+      PADDLE_OH_THROW_OR_TERMINATE(std::length_error,
+                                   "The map exceeds its maximum size.");
+    }
+
+    if (bucket_count > 0) {
+      bucket_count = round_up_to_power_of_two(bucket_count);
+
+      m_buckets_data.resize(bucket_count);
+      m_buckets = m_buckets_data.data(), m_hash_mask = bucket_count - 1;
+    }
+
+    this->max_load_factor(max_load_factor);
+  }
+
+  ordered_hash(const ordered_hash& other)
+      : Hash(other),
+        KeyEqual(other),
+        m_buckets_data(other.m_buckets_data),
+        m_buckets(m_buckets_data.empty() ? static_empty_bucket_ptr()
+                                         : m_buckets_data.data()),
+        m_hash_mask(other.m_hash_mask),
+        m_values(other.m_values),
+        m_load_threshold(other.m_load_threshold),
+        m_max_load_factor(other.m_max_load_factor),
+        m_grow_on_next_insert(other.m_grow_on_next_insert) {}
+
+  ordered_hash(ordered_hash&& other) noexcept(
+      std::is_nothrow_move_constructible<
+          Hash>::value&& std::is_nothrow_move_constructible<KeyEqual>::value&&
+          std::is_nothrow_move_constructible<buckets_container_type>::value&&
+              std::is_nothrow_move_constructible<values_container_type>::value)
+      : Hash(std::move(static_cast<Hash&>(other))),
+        KeyEqual(std::move(static_cast<KeyEqual&>(other))),
+        m_buckets_data(std::move(other.m_buckets_data)),
+        m_buckets(m_buckets_data.empty() ? static_empty_bucket_ptr()
+                                         : m_buckets_data.data()),
+        m_hash_mask(other.m_hash_mask),
+        m_values(std::move(other.m_values)),
+        m_load_threshold(other.m_load_threshold),
+        m_max_load_factor(other.m_max_load_factor),
+        m_grow_on_next_insert(other.m_grow_on_next_insert) {
+    other.m_buckets_data.clear();
+    other.m_buckets = static_empty_bucket_ptr();
+    other.m_hash_mask = 0;
+    other.m_values.clear();
+    other.m_load_threshold = 0;
+    other.m_grow_on_next_insert = false;
+  }
+
+  ordered_hash& operator=(const ordered_hash& other) {
+    if (&other != this) {
+      Hash::operator=(other);
+      KeyEqual::operator=(other);
+
+      m_buckets_data = other.m_buckets_data;
+      m_buckets = m_buckets_data.empty() ? static_empty_bucket_ptr()
+                                         : m_buckets_data.data();
+
+      m_hash_mask = other.m_hash_mask;
+      m_values = other.m_values;
+      m_load_threshold = other.m_load_threshold;
+      m_max_load_factor = other.m_max_load_factor;
+      m_grow_on_next_insert = other.m_grow_on_next_insert;
+    }
+
+    return *this;
+  }
+
+  ordered_hash& operator=(ordered_hash&& other) {
+    other.swap(*this);
+    other.clear();
+
+    return *this;
+  }
+
+  allocator_type get_allocator() const { return m_values.get_allocator(); }
+
+  /*
+   * Iterators
+   */
+  iterator begin() noexcept { return iterator(m_values.begin()); }
+
+  const_iterator begin() const noexcept { return cbegin(); }
+
+  const_iterator cbegin() const noexcept {
+    return const_iterator(m_values.cbegin());
+  }
+
+  iterator end() noexcept { return iterator(m_values.end()); }
+
+  const_iterator end() const noexcept { return cend(); }
+
+  const_iterator cend() const noexcept {
+    return const_iterator(m_values.cend());
+  }
+
+  reverse_iterator rbegin() noexcept {
+    return reverse_iterator(m_values.end());
+  }
+
+  const_reverse_iterator rbegin() const noexcept { return rcbegin(); }
+
+  const_reverse_iterator rcbegin() const noexcept {
+    return const_reverse_iterator(m_values.cend());
+  }
+
+  reverse_iterator rend() noexcept {
+    return reverse_iterator(m_values.begin());
+  }
+
+  const_reverse_iterator rend() const noexcept { return rcend(); }
+
+  const_reverse_iterator rcend() const noexcept {
+    return const_reverse_iterator(m_values.cbegin());
+  }
+
+  /*
+   * Capacity
+   */
+  bool empty() const noexcept { return m_values.empty(); }
+
+  size_type size() const noexcept { return m_values.size(); }
+
+  size_type max_size() const noexcept {
+    return std::min(bucket_entry::max_size(), m_values.max_size());
+  }
+
+  /*
+   * Modifiers
+   */
+  void clear() noexcept {
+    for (auto& bucket : m_buckets_data) {
+      bucket.clear();
+    }
+
+    m_values.clear();
+    m_grow_on_next_insert = false;
+  }
+
+  template <typename P>
+  std::pair<iterator, bool> insert(P&& value) {
+    return insert_impl(KeySelect()(value), std::forward<P>(value));
+  }
+
+  template <typename P>
+  iterator insert_hint(const_iterator hint, P&& value) {
+    if (hint != cend() &&
+        compare_keys(KeySelect()(*hint), KeySelect()(value))) {
+      return mutable_iterator(hint);
+    }
+
+    return insert(std::forward<P>(value)).first;
+  }
+
+  template <class InputIt>
+  void insert(InputIt first, InputIt last) {
+    if (std::is_base_of<
+            std::forward_iterator_tag,
+            typename std::iterator_traits<InputIt>::iterator_category>::value) {
+      const auto nb_elements_insert = std::distance(first, last);
+      const size_type nb_free_buckets = m_load_threshold - size();
+      paddle_oh_assert(m_load_threshold >= size());
+
+      if (nb_elements_insert > 0 &&
+          nb_free_buckets < size_type(nb_elements_insert)) {
+        reserve(size() + size_type(nb_elements_insert));
+      }
+    }
+
+    for (; first != last; ++first) {
+      insert(*first);
+    }
+  }
+
+  template <class K, class M>
+  std::pair<iterator, bool> insert_or_assign(K&& key, M&& value) {
+    auto it = try_emplace(std::forward<K>(key), std::forward<M>(value));
+    if (!it.second) {
+      it.first.value() = std::forward<M>(value);
+    }
+
+    return it;
+  }
+
+  template <class K, class M>
+  iterator insert_or_assign(const_iterator hint, K&& key, M&& obj) {
+    if (hint != cend() && compare_keys(KeySelect()(*hint), key)) {
+      auto it = mutable_iterator(hint);
+      it.value() = std::forward<M>(obj);
+
+      return it;
+    }
+
+    return insert_or_assign(std::forward<K>(key), std::forward<M>(obj)).first;
+  }
+
+  template <class... Args>
+  std::pair<iterator, bool> emplace(Args&&... args) {
+    return insert(value_type(std::forward<Args>(args)...));
+  }
+
+  template <class... Args>
+  iterator emplace_hint(const_iterator hint, Args&&... args) {
+    return insert_hint(hint, value_type(std::forward<Args>(args)...));
+  }
+
+  template <class K, class... Args>
+  std::pair<iterator, bool> try_emplace(K&& key, Args&&... value_args) {
+    return insert_impl(
+        key,
+        std::piecewise_construct,
+        std::forward_as_tuple(std::forward<K>(key)),
+        std::forward_as_tuple(std::forward<Args>(value_args)...));
+  }
+
+  template <class K, class... Args>
+  iterator try_emplace_hint(const_iterator hint, K&& key, Args&&... args) {
+    if (hint != cend() && compare_keys(KeySelect()(*hint), key)) {
+      return mutable_iterator(hint);
+    }
+
+    return try_emplace(std::forward<K>(key), std::forward<Args>(args)...).first;
+  }
+
+  /**
+   * Here to avoid `template<class K> size_type erase(const K& key)` being used
+   * when we use an `iterator` instead of a `const_iterator`.
+   */
+  iterator erase(iterator pos) { return erase(const_iterator(pos)); }
+
+  iterator erase(const_iterator pos) {
+    paddle_oh_assert(pos != cend());
+
+    const std::size_t index_erase = iterator_to_index(pos);
+
+    auto it_bucket = find_key(pos.key(), hash_key(pos.key()));
+    paddle_oh_assert(it_bucket != m_buckets_data.end());
+
+    erase_value_from_bucket(it_bucket);
+
+    /*
+     * One element was removed from m_values, due to the left shift the next
+     * element is now at the position of the previous element (or end if none).
+     */
+    return begin() + index_erase;
+  }
+
+  iterator erase(const_iterator first, const_iterator last) {
+    if (first == last) {
+      return mutable_iterator(first);
+    }
+
+    paddle_oh_assert(std::distance(first, last) > 0);
+    const std::size_t start_index = iterator_to_index(first);
+    const std::size_t nb_values = std::size_t(std::distance(first, last));
+    const std::size_t end_index = start_index + nb_values;
+
+// Delete all values
+#ifdef PADDLE_OH_NO_CONTAINER_ERASE_CONST_ITERATOR
+    auto next_it = m_values.erase(mutable_iterator(first).m_iterator,
+                                  mutable_iterator(last).m_iterator);
+#else
+    auto next_it = m_values.erase(first.m_iterator, last.m_iterator);
+#endif
+
+    /*
+     * Mark the buckets corresponding to the values as empty and do a backward
+     * shift.
+     *
+     * Also, the erase operation on m_values has shifted all the values on the
+     * right of last.m_iterator. Adapt the indexes for these values.
+     */
+    std::size_t ibucket = 0;
+    while (ibucket < m_buckets_data.size()) {
+      if (m_buckets[ibucket].empty()) {
+        ibucket++;
+      } else if (m_buckets[ibucket].index() >= start_index &&
+                 m_buckets[ibucket].index() < end_index) {
+        m_buckets[ibucket].clear();
+        backward_shift(ibucket);
+        // Don't increment ibucket, backward_shift may have replaced current
+        // bucket.
+      } else if (m_buckets[ibucket].index() >= end_index) {
+        m_buckets[ibucket].set_index(
+            index_type(m_buckets[ibucket].index() - nb_values));
+        ibucket++;
+      } else {
+        ibucket++;
+      }
+    }
+
+    return iterator(next_it);
+  }
+
+  template <class K>
+  size_type erase(const K& key) {
+    return erase(key, hash_key(key));
+  }
+
+  template <class K>
+  size_type erase(const K& key, std::size_t hash) {
+    return erase_impl(key, hash);
+  }
+
+  void swap(ordered_hash& other) {
+    using std::swap;
+
+    swap(static_cast<Hash&>(*this), static_cast<Hash&>(other));
+    swap(static_cast<KeyEqual&>(*this), static_cast<KeyEqual&>(other));
+    swap(m_buckets_data, other.m_buckets_data);
+    swap(m_buckets, other.m_buckets);
+    swap(m_hash_mask, other.m_hash_mask);
+    swap(m_values, other.m_values);
+    swap(m_load_threshold, other.m_load_threshold);
+    swap(m_max_load_factor, other.m_max_load_factor);
+    swap(m_grow_on_next_insert, other.m_grow_on_next_insert);
+  }
+
+  /*
+   * Lookup
+   */
+  template <class K,
+            class U = ValueSelect,
+            typename std::enable_if<has_mapped_type<U>::value>::type* = nullptr>
+  typename U::value_type& at(const K& key) {
+    return at(key, hash_key(key));
+  }
+
+  template <class K,
+            class U = ValueSelect,
+            typename std::enable_if<has_mapped_type<U>::value>::type* = nullptr>
+  typename U::value_type& at(const K& key, std::size_t hash) {
+    return const_cast<typename U::value_type&>(
+        static_cast<const ordered_hash*>(this)->at(key, hash));
+  }
+
+  template <class K,
+            class U = ValueSelect,
+            typename std::enable_if<has_mapped_type<U>::value>::type* = nullptr>
+  const typename U::value_type& at(const K& key) const {
+    return at(key, hash_key(key));
+  }
+
+  template <class K,
+            class U = ValueSelect,
+            typename std::enable_if<has_mapped_type<U>::value>::type* = nullptr>
+  const typename U::value_type& at(const K& key, std::size_t hash) const {
+    auto it = find(key, hash);
+    if (it != end()) {
+      return it.value();
+    } else {
+      PADDLE_OH_THROW_OR_TERMINATE(std::out_of_range, "Couldn't find the key.");
+    }
+  }
+
+  template <class K,
+            class U = ValueSelect,
+            typename std::enable_if<has_mapped_type<U>::value>::type* = nullptr>
+  typename U::value_type& operator[](K&& key) {
+    return try_emplace(std::forward<K>(key)).first.value();
+  }
+
+  template <class K>
+  size_type count(const K& key) const {
+    return count(key, hash_key(key));
+  }
+
+  template <class K>
+  size_type count(const K& key, std::size_t hash) const {
+    if (find(key, hash) == cend()) {
+      return 0;
+    } else {
+      return 1;
+    }
+  }
+
+  template <class K>
+  iterator find(const K& key) {
+    return find(key, hash_key(key));
+  }
+
+  template <class K>
+  iterator find(const K& key, std::size_t hash) {
+    auto it_bucket = find_key(key, hash);
+    return (it_bucket != m_buckets_data.end())
+               ? iterator(m_values.begin() + it_bucket->index())
+               : end();
+  }
+
+  template <class K>
+  const_iterator find(const K& key) const {
+    return find(key, hash_key(key));
+  }
+
+  template <class K>
+  const_iterator find(const K& key, std::size_t hash) const {
+    auto it_bucket = find_key(key, hash);
+    return (it_bucket != m_buckets_data.cend())
+               ? const_iterator(m_values.begin() + it_bucket->index())
+               : end();
+  }
+
+  template <class K>
+  bool contains(const K& key) const {
+    return contains(key, hash_key(key));
+  }
+
+  template <class K>
+  bool contains(const K& key, std::size_t hash) const {
+    return find(key, hash) != cend();
+  }
+
+  template <class K>
+  std::pair<iterator, iterator> equal_range(const K& key) {
+    return equal_range(key, hash_key(key));
+  }
+
+  template <class K>
+  std::pair<iterator, iterator> equal_range(const K& key, std::size_t hash) {
+    iterator it = find(key, hash);
+    return std::make_pair(it, (it == end()) ? it : std::next(it));
+  }
+
+  template <class K>
+  std::pair<const_iterator, const_iterator> equal_range(const K& key) const {
+    return equal_range(key, hash_key(key));
+  }
+
+  template <class K>
+  std::pair<const_iterator, const_iterator> equal_range(
+      const K& key, std::size_t hash) const {
+    const_iterator it = find(key, hash);
+    return std::make_pair(it, (it == cend()) ? it : std::next(it));
+  }
+
+  /*
+   * Bucket interface
+   */
+  size_type bucket_count() const { return m_buckets_data.size(); }
+
+  size_type max_bucket_count() const { return m_buckets_data.max_size(); }
+
+  /*
+   *  Hash policy
+   */
+  float load_factor() const {
+    if (bucket_count() == 0) {
+      return 0;
+    }
+
+    return static_cast<float>(size()) / static_cast<float>(bucket_count());
+  }
+
+  float max_load_factor() const { return m_max_load_factor; }
+
+  void max_load_factor(float ml) {
+    m_max_load_factor = clamp(ml,
+                              static_cast<float>(MAX_LOAD_FACTOR__MINIMUM),
+                              static_cast<float>(MAX_LOAD_FACTOR__MAXIMUM));
+
+    m_max_load_factor = ml;
+    m_load_threshold =
+        size_type(static_cast<float>(bucket_count()) * m_max_load_factor);
+  }
+
+  void rehash(size_type count) {
+    count = std::max(
+        count,
+        size_type(std::ceil(static_cast<float>(size()) / max_load_factor())));
+    rehash_impl(count);
+  }
+
+  void reserve(size_type count) {
+    reserve_space_for_values(count);
+
+    count = size_type(std::ceil(static_cast<float>(count) / max_load_factor()));
+    rehash(count);
+  }
+
+  /*
+   * Observers
+   */
+  hasher hash_function() const { return static_cast<const Hash&>(*this); }
+
+  key_equal key_eq() const { return static_cast<const KeyEqual&>(*this); }
+
+  /*
+   * Other
+   */
+  iterator mutable_iterator(const_iterator pos) {
+    return iterator(m_values.begin() + iterator_to_index(pos));
+  }
+
+  iterator nth(size_type index) {
+    paddle_oh_assert(index <= size());
+    return iterator(m_values.begin() + index);
+  }
+
+  const_iterator nth(size_type index) const {
+    paddle_oh_assert(index <= size());
+    return const_iterator(m_values.cbegin() + index);
+  }
+
+  const_reference front() const {
+    paddle_oh_assert(!empty());
+    return m_values.front();
+  }
+
+  const_reference back() const {
+    paddle_oh_assert(!empty());
+    return m_values.back();
+  }
+
+  const values_container_type& values_container() const noexcept {
+    return m_values;
+  }
+
+  template <class U = values_container_type,
+            typename std::enable_if<is_vector<U>::value>::type* = nullptr>
+  const typename values_container_type::value_type* data() const noexcept {
+    return m_values.data();
+  }
+
+  template <class U = values_container_type,
+            typename std::enable_if<is_vector<U>::value>::type* = nullptr>
+  size_type capacity() const noexcept {
+    return m_values.capacity();
+  }
+
+  void shrink_to_fit() { m_values.shrink_to_fit(); }
+
+  template <typename P>
+  std::pair<iterator, bool> insert_at_position(const_iterator pos, P&& value) {
+    return insert_at_position_impl(
+        pos.m_iterator, KeySelect()(value), std::forward<P>(value));
+  }
+
+  template <class... Args>
+  std::pair<iterator, bool> emplace_at_position(const_iterator pos,
+                                                Args&&... args) {
+    return insert_at_position(pos, value_type(std::forward<Args>(args)...));
+  }
+
+  template <class K, class... Args>
+  std::pair<iterator, bool> try_emplace_at_position(const_iterator pos,
+                                                    K&& key,
+                                                    Args&&... value_args) {
+    return insert_at_position_impl(
+        pos.m_iterator,
+        key,
+        std::piecewise_construct,
+        std::forward_as_tuple(std::forward<K>(key)),
+        std::forward_as_tuple(std::forward<Args>(value_args)...));
+  }
+
+  void pop_back() {
+    paddle_oh_assert(!empty());
+    erase(std::prev(end()));
+  }
+
+  /**
+   * Here to avoid `template<class K> size_type unordered_erase(const K& key)`
+   * being used when we use a iterator instead of a const_iterator.
+   */
+  iterator unordered_erase(iterator pos) {
+    return unordered_erase(const_iterator(pos));
+  }
+
+  iterator unordered_erase(const_iterator pos) {
+    const std::size_t index_erase = iterator_to_index(pos);
+    unordered_erase(pos.key());
+
+    /*
+     * One element was deleted, index_erase now points to the next element as
+     * the elements after the deleted value were shifted to the left in m_values
+     * (will be end() if we deleted the last element).
+     */
+    return begin() + index_erase;
+  }
+
+  template <class K>
+  size_type unordered_erase(const K& key) {
+    return unordered_erase(key, hash_key(key));
+  }
+
+  template <class K>
+  size_type unordered_erase(const K& key, std::size_t hash) {
+    auto it_bucket_key = find_key(key, hash);
+    if (it_bucket_key == m_buckets_data.end()) {
+      return 0;
+    }
+
+    /**
+     * If we are not erasing the last element in m_values, we swap
+     * the element we are erasing with the last element. We then would
+     * just have to do a pop_back() in m_values.
+     */
+    if (!compare_keys(key, KeySelect()(back()))) {
+      auto it_bucket_last_elem =
+          find_key(KeySelect()(back()), hash_key(KeySelect()(back())));
+      paddle_oh_assert(it_bucket_last_elem != m_buckets_data.end());
+      paddle_oh_assert(it_bucket_last_elem->index() == m_values.size() - 1);
+
+      using std::swap;
+      swap(m_values[it_bucket_key->index()],
+           m_values[it_bucket_last_elem->index()]);
+      swap(it_bucket_key->index_ref(), it_bucket_last_elem->index_ref());
+    }
+
+    erase_value_from_bucket(it_bucket_key);
+
+    return 1;
+  }
+
+  template <class Serializer>
+  void serialize(Serializer& serializer) const {  // NOLINT
+    serialize_impl(serializer);
+  }
+
+  template <class Deserializer>
+  void deserialize(Deserializer& deserializer,  // NOLINT
+                   bool hash_compatible) {
+    deserialize_impl(deserializer, hash_compatible);
+  }
+
+  friend bool operator==(const ordered_hash& lhs, const ordered_hash& rhs) {
+    return lhs.m_values == rhs.m_values;
+  }
+
+  friend bool operator!=(const ordered_hash& lhs, const ordered_hash& rhs) {
+    return lhs.m_values != rhs.m_values;
+  }
+
+  friend bool operator<(const ordered_hash& lhs, const ordered_hash& rhs) {
+    return lhs.m_values < rhs.m_values;
+  }
+
+  friend bool operator<=(const ordered_hash& lhs, const ordered_hash& rhs) {
+    return lhs.m_values <= rhs.m_values;
+  }
+
+  friend bool operator>(const ordered_hash& lhs, const ordered_hash& rhs) {
+    return lhs.m_values > rhs.m_values;
+  }
+
+  friend bool operator>=(const ordered_hash& lhs, const ordered_hash& rhs) {
+    return lhs.m_values >= rhs.m_values;
+  }
+
+ private:
+  template <class K>
+  std::size_t hash_key(const K& key) const {
+    return Hash::operator()(key);
+  }
+
+  template <class K1, class K2>
+  bool compare_keys(const K1& key1, const K2& key2) const {
+    return KeyEqual::operator()(key1, key2);
+  }
+
+  template <class K>
+  typename buckets_container_type::iterator find_key(const K& key,
+                                                     std::size_t hash) {
+    auto it = static_cast<const ordered_hash*>(this)->find_key(key, hash);
+    return m_buckets_data.begin() + std::distance(m_buckets_data.cbegin(), it);
+  }
+
+  /**
+   * Return bucket which has the key 'key' or m_buckets_data.end() if none.
+   *
+   * From the bucket_for_hash, search for the value until we either find an
+   * empty bucket or a bucket which has a value with a distance from its ideal
+   * bucket longer than the probe length for the value we are looking for.
+   */
+  template <class K>
+  typename buckets_container_type::const_iterator find_key(
+      const K& key, std::size_t hash) const {
+    for (std::size_t ibucket = bucket_for_hash(hash),
+                     dist_from_ideal_bucket = 0;
+         ;  // NOLINT
+         ibucket = next_bucket(ibucket), dist_from_ideal_bucket++) {
+      if (m_buckets[ibucket].empty()) {
+        return m_buckets_data.end();
+      } else if (m_buckets[ibucket].truncated_hash() ==
+                     bucket_entry::truncate_hash(hash) &&
+                 compare_keys(
+                     key, KeySelect()(m_values[m_buckets[ibucket].index()]))) {
+        return m_buckets_data.begin() + ibucket;
+      } else if (dist_from_ideal_bucket > distance_from_ideal_bucket(ibucket)) {
+        return m_buckets_data.end();
+      }
+    }
+  }
+
+  void rehash_impl(size_type bucket_count) {
+    paddle_oh_assert(
+        bucket_count >=
+        size_type(std::ceil(static_cast<float>(size()) / max_load_factor())));
+
+    if (bucket_count > max_bucket_count()) {
+      PADDLE_OH_THROW_OR_TERMINATE(std::length_error,
+                                   "The map exceeds its maximum size.");
+    }
+
+    if (bucket_count > 0) {
+      bucket_count = round_up_to_power_of_two(bucket_count);
+    }
+
+    if (bucket_count == this->bucket_count()) {
+      return;
+    }
+
+    buckets_container_type old_buckets(bucket_count);
+    m_buckets_data.swap(old_buckets);
+    m_buckets = m_buckets_data.empty() ? static_empty_bucket_ptr()
+                                       : m_buckets_data.data();
+    // Everything should be noexcept from here.
+
+    m_hash_mask = (bucket_count > 0) ? (bucket_count - 1) : 0;
+    this->max_load_factor(m_max_load_factor);
+    m_grow_on_next_insert = false;
+
+    for (const bucket_entry& old_bucket : old_buckets) {
+      if (old_bucket.empty()) {
+        continue;
+      }
+
+      truncated_hash_type insert_hash = old_bucket.truncated_hash();
+      index_type insert_index = old_bucket.index();
+
+      for (std::size_t ibucket = bucket_for_hash(insert_hash),
+                       dist_from_ideal_bucket = 0;
+           ;  // NOLINT
+           ibucket = next_bucket(ibucket), dist_from_ideal_bucket++) {
+        if (m_buckets[ibucket].empty()) {
+          m_buckets[ibucket].set_index(insert_index);
+          m_buckets[ibucket].set_hash(insert_hash);
+          break;
+        }
+
+        const std::size_t distance = distance_from_ideal_bucket(ibucket);
+        if (dist_from_ideal_bucket > distance) {
+          std::swap(insert_index, m_buckets[ibucket].index_ref());
+          std::swap(insert_hash, m_buckets[ibucket].truncated_hash_ref());
+          dist_from_ideal_bucket = distance;
+        }
+      }
+    }
+  }
+
+  template <class T = values_container_type,
+            typename std::enable_if<is_vector<T>::value>::type* = nullptr>
+  void reserve_space_for_values(size_type count) {
+    m_values.reserve(count);
+  }
+
+  template <class T = values_container_type,
+            typename std::enable_if<!is_vector<T>::value>::type* = nullptr>
+  void reserve_space_for_values(size_type /*count*/) {}
+
+  /**
+   * Swap the empty bucket with the values on its right until we cross another
+   * empty bucket or if the other bucket has a distance_from_ideal_bucket == 0.
+   */
+  void backward_shift(std::size_t empty_ibucket) noexcept {
+    paddle_oh_assert(m_buckets[empty_ibucket].empty());
+
+    std::size_t previous_ibucket = empty_ibucket;
+    for (std::size_t current_ibucket = next_bucket(previous_ibucket);
+         !m_buckets[current_ibucket].empty() &&
+         distance_from_ideal_bucket(current_ibucket) > 0;
+         previous_ibucket = current_ibucket,
+                     current_ibucket = next_bucket(current_ibucket)) {
+      std::swap(m_buckets[current_ibucket], m_buckets[previous_ibucket]);
+    }
+  }
+
+  void erase_value_from_bucket(
+      typename buckets_container_type::iterator it_bucket) {
+    paddle_oh_assert(it_bucket != m_buckets_data.end() && !it_bucket->empty());
+
+    m_values.erase(m_values.begin() + it_bucket->index());
+
+    /*
+     * m_values.erase shifted all the values on the right of the erased value,
+     * shift the indexes by -1 in the buckets array for these values.
+     */
+    if (it_bucket->index() != m_values.size()) {
+      shift_indexes_in_buckets(it_bucket->index(), -1);
+    }
+
+    // Mark the bucket as empty and do a backward shift of the values on the
+    // right
+    it_bucket->clear();
+    backward_shift(
+        std::size_t(std::distance(m_buckets_data.begin(), it_bucket)));
+  }
+
+  /**
+   * Go through each value from [from_ivalue, m_values.size()) in m_values and
+   * for each bucket corresponding to the value, shift the index by delta.
+   *
+   * delta must be equal to 1 or -1.
+   */
+  void shift_indexes_in_buckets(index_type from_ivalue, int delta) noexcept {
+    paddle_oh_assert(delta == 1 || delta == -1);
+
+    for (std::size_t ivalue = from_ivalue; ivalue < m_values.size(); ivalue++) {
+      // All the values in m_values have been shifted by delta. Find the bucket
+      // corresponding to the value m_values[ivalue]
+      const index_type old_index = static_cast<index_type>(ivalue - delta);
+
+      std::size_t ibucket =
+          bucket_for_hash(hash_key(KeySelect()(m_values[ivalue])));
+      while (m_buckets[ibucket].index() != old_index) {
+        ibucket = next_bucket(ibucket);
+      }
+
+      m_buckets[ibucket].set_index(index_type(ivalue));
+    }
+  }
+
+  template <class K>
+  size_type erase_impl(const K& key, std::size_t hash) {
+    auto it_bucket = find_key(key, hash);
+    if (it_bucket != m_buckets_data.end()) {
+      erase_value_from_bucket(it_bucket);
+
+      return 1;
+    } else {
+      return 0;
+    }
+  }
+
+  /**
+   * Insert the element at the end.
+   */
+  template <class K, class... Args>
+  std::pair<iterator, bool> insert_impl(const K& key,
+                                        Args&&... value_type_args) {
+    const std::size_t hash = hash_key(key);
+
+    std::size_t ibucket = bucket_for_hash(hash);
+    std::size_t dist_from_ideal_bucket = 0;
+
+    while (!m_buckets[ibucket].empty() &&
+           dist_from_ideal_bucket <= distance_from_ideal_bucket(ibucket)) {
+      if (m_buckets[ibucket].truncated_hash() ==
+              bucket_entry::truncate_hash(hash) &&
+          compare_keys(key,
+                       KeySelect()(m_values[m_buckets[ibucket].index()]))) {
+        return std::make_pair(begin() + m_buckets[ibucket].index(), false);
+      }
+
+      ibucket = next_bucket(ibucket);
+      dist_from_ideal_bucket++;
+    }
+
+    if (size() >= max_size()) {
+      PADDLE_OH_THROW_OR_TERMINATE(
+          std::length_error, "We reached the maximum size for the hash table.");
+    }
+
+    if (grow_on_high_load()) {
+      ibucket = bucket_for_hash(hash);
+      dist_from_ideal_bucket = 0;
+    }
+
+    m_values.emplace_back(std::forward<Args>(value_type_args)...);
+    insert_index(ibucket,
+                 dist_from_ideal_bucket,
+                 index_type(m_values.size() - 1),
+                 bucket_entry::truncate_hash(hash));
+
+    return std::make_pair(std::prev(end()), true);
+  }
+
+  /**
+   * Insert the element before insert_position.
+   */
+  template <class K, class... Args>
+  std::pair<iterator, bool> insert_at_position_impl(
+      typename values_container_type::const_iterator insert_position,
+      const K& key,
+      Args&&... value_type_args) {
+    const std::size_t hash = hash_key(key);
+
+    std::size_t ibucket = bucket_for_hash(hash);
+    std::size_t dist_from_ideal_bucket = 0;
+
+    while (!m_buckets[ibucket].empty() &&
+           dist_from_ideal_bucket <= distance_from_ideal_bucket(ibucket)) {
+      if (m_buckets[ibucket].truncated_hash() ==
+              bucket_entry::truncate_hash(hash) &&
+          compare_keys(key,
+                       KeySelect()(m_values[m_buckets[ibucket].index()]))) {
+        return std::make_pair(begin() + m_buckets[ibucket].index(), false);
+      }
+
+      ibucket = next_bucket(ibucket);
+      dist_from_ideal_bucket++;
+    }
+
+    if (size() >= max_size()) {
+      PADDLE_OH_THROW_OR_TERMINATE(
+          std::length_error, "We reached the maximum size for the hash table.");
+    }
+
+    if (grow_on_high_load()) {
+      ibucket = bucket_for_hash(hash);
+      dist_from_ideal_bucket = 0;
+    }
+
+    const index_type index_insert_position =
+        index_type(std::distance(m_values.cbegin(), insert_position));
+
+#ifdef PADDLE_OH_NO_CONTAINER_EMPLACE_CONST_ITERATOR
+    m_values.emplace(
+        m_values.begin() + std::distance(m_values.cbegin(), insert_position),
+        std::forward<Args>(value_type_args)...);
+#else
+    m_values.emplace(insert_position, std::forward<Args>(value_type_args)...);
+#endif
+
+    insert_index(ibucket,
+                 dist_from_ideal_bucket,
+                 index_insert_position,
+                 bucket_entry::truncate_hash(hash));
+
+    /*
+     * The insertion didn't happend at the end of the m_values container,
+     * we need to shift the indexes in m_buckets_data.
+     */
+    if (index_insert_position != m_values.size() - 1) {
+      shift_indexes_in_buckets(index_insert_position + 1, 1);
+    }
+
+    return std::make_pair(iterator(m_values.begin() + index_insert_position),
+                          true);
+  }
+
+  void insert_index(std::size_t ibucket,
+                    std::size_t dist_from_ideal_bucket,
+                    index_type index_insert,
+                    truncated_hash_type hash_insert) noexcept {
+    while (!m_buckets[ibucket].empty()) {
+      const std::size_t distance = distance_from_ideal_bucket(ibucket);
+      if (dist_from_ideal_bucket > distance) {
+        std::swap(index_insert, m_buckets[ibucket].index_ref());
+        std::swap(hash_insert, m_buckets[ibucket].truncated_hash_ref());
+
+        dist_from_ideal_bucket = distance;
+      }
+
+      ibucket = next_bucket(ibucket);
+      dist_from_ideal_bucket++;
+
+      if (dist_from_ideal_bucket > REHASH_ON_HIGH_NB_PROBES__NPROBES &&
+          !m_grow_on_next_insert &&
+          load_factor() >= REHASH_ON_HIGH_NB_PROBES__MIN_LOAD_FACTOR) {
+        // We don't want to grow the map now as we need this method to be
+        // noexcept. Do it on next insert.
+        m_grow_on_next_insert = true;
+      }
+    }
+
+    m_buckets[ibucket].set_index(index_insert);
+    m_buckets[ibucket].set_hash(hash_insert);
+  }
+
+  std::size_t distance_from_ideal_bucket(std::size_t ibucket) const noexcept {
+    const std::size_t ideal_bucket =
+        bucket_for_hash(m_buckets[ibucket].truncated_hash());
+
+    if (ibucket >= ideal_bucket) {
+      return ibucket - ideal_bucket;
+    } else {
+      // If the bucket is smaller than the ideal bucket for the value, there was
+      // a
+      // wrapping at the end of the bucket array due to the modulo.
+      return (bucket_count() + ibucket) - ideal_bucket;
+    }
+  }
+
+  std::size_t next_bucket(std::size_t index) const noexcept {
+    paddle_oh_assert(index < m_buckets_data.size());
+
+    index++;
+    return (index < m_buckets_data.size()) ? index : 0;
+  }
+
+  std::size_t bucket_for_hash(std::size_t hash) const noexcept {
+    return hash & m_hash_mask;
+  }
+
+  std::size_t iterator_to_index(const_iterator it) const noexcept {
+    const auto dist = std::distance(cbegin(), it);
+    paddle_oh_assert(dist >= 0);
+
+    return std::size_t(dist);
+  }
+
+  /**
+   * Return true if the map has been rehashed.
+   */
+  bool grow_on_high_load() {
+    if (m_grow_on_next_insert || size() >= m_load_threshold) {
+      rehash_impl(std::max(size_type(1), bucket_count() * 2));
+      m_grow_on_next_insert = false;
+
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  template <class Serializer>
+  void serialize_impl(Serializer& serializer) const {  // NOLINT
+    const slz_size_type version = SERIALIZATION_PROTOCOL_VERSION;
+    serializer(version);
+
+    const slz_size_type nb_elements = m_values.size();
+    serializer(nb_elements);
+
+    const slz_size_type bucket_count = m_buckets_data.size();
+    serializer(bucket_count);
+
+    const float max_load_factor = m_max_load_factor;
+    serializer(max_load_factor);
+
+    for (const value_type& value : m_values) {
+      serializer(value);
+    }
+
+    for (const bucket_entry& bucket : m_buckets_data) {
+      bucket.serialize(serializer);
+    }
+  }
+
+  template <class Deserializer>
+  void deserialize_impl(Deserializer& deserializer,  // NOLINT
+                        bool hash_compatible) {
+    paddle_oh_assert(
+        m_buckets_data.empty());  // Current hash table must be empty
+
+    const slz_size_type version =
+        deserialize_value<slz_size_type>(deserializer);
+    // For now we only have one version of the serialization protocol.
+    // If it doesn't match there is a problem with the file.
+    if (version != SERIALIZATION_PROTOCOL_VERSION) {
+      PADDLE_OH_THROW_OR_TERMINATE(std::runtime_error,
+                                   "Can't deserialize the ordered_map/set. "
+                                   "The protocol version header is invalid.");
+    }
+
+    const slz_size_type nb_elements =
+        deserialize_value<slz_size_type>(deserializer);
+    const slz_size_type bucket_count_ds =
+        deserialize_value<slz_size_type>(deserializer);
+    const float max_load_factor = deserialize_value<float>(deserializer);
+
+    if (max_load_factor < MAX_LOAD_FACTOR__MINIMUM ||
+        max_load_factor > MAX_LOAD_FACTOR__MAXIMUM) {
+      PADDLE_OH_THROW_OR_TERMINATE(
+          std::runtime_error,
+          "Invalid max_load_factor. Check that the serializer "
+          "and deserializer support floats correctly as they "
+          "can be converted implicitly to ints.");
+    }
+
+    this->max_load_factor(max_load_factor);
+
+    if (bucket_count_ds == 0) {
+      paddle_oh_assert(nb_elements == 0);
+      return;
+    }
+
+    if (!hash_compatible) {
+      reserve(numeric_cast<size_type>(nb_elements,
+                                      "Deserialized nb_elements is too big."));
+      for (slz_size_type el = 0; el < nb_elements; el++) {
+        insert(deserialize_value<value_type>(deserializer));
+      }
+    } else {
+      m_buckets_data.reserve(numeric_cast<size_type>(
+          bucket_count_ds, "Deserialized bucket_count is too big."));
+      m_buckets = m_buckets_data.data(),
+      m_hash_mask = m_buckets_data.capacity() - 1;
+
+      reserve_space_for_values(numeric_cast<size_type>(
+          nb_elements, "Deserialized nb_elements is too big."));
+      for (slz_size_type el = 0; el < nb_elements; el++) {
+        m_values.push_back(deserialize_value<value_type>(deserializer));
+      }
+
+      for (slz_size_type b = 0; b < bucket_count_ds; b++) {
+        m_buckets_data.push_back(bucket_entry::deserialize(deserializer));
+      }
+    }
+  }
+
+  static std::size_t round_up_to_power_of_two(std::size_t value) {
+    if (is_power_of_two(value)) {
+      return value;
+    }
+
+    if (value == 0) {
+      return 1;
+    }
+
+    --value;
+    for (std::size_t i = 1; i < sizeof(std::size_t) * CHAR_BIT; i *= 2) {
+      value |= value >> i;
+    }
+
+    return value + 1;
+  }
+
+  static constexpr bool is_power_of_two(std::size_t value) {
+    return value != 0 && (value & (value - 1)) == 0;
+  }
+
+ public:
+  static const size_type DEFAULT_INIT_BUCKETS_SIZE = 0;
+  static constexpr float DEFAULT_MAX_LOAD_FACTOR = 0.75f;
+
+ private:
+  static constexpr float MAX_LOAD_FACTOR__MINIMUM = 0.1f;
+  static constexpr float MAX_LOAD_FACTOR__MAXIMUM = 0.95f;
+
+  static const size_type REHASH_ON_HIGH_NB_PROBES__NPROBES = 128;
+  static constexpr float REHASH_ON_HIGH_NB_PROBES__MIN_LOAD_FACTOR = 0.15f;
+
+  /**
+   * Protocol version currenlty used for serialization.
+   */
+  static const slz_size_type SERIALIZATION_PROTOCOL_VERSION = 1;
+
+  /**
+   * Return an always valid pointer to an static empty bucket_entry with
+   * last_bucket() == true.
+   */
+  bucket_entry* static_empty_bucket_ptr() {
+    static bucket_entry empty_bucket;
+    return &empty_bucket;
+  }
+
+ private:
+  buckets_container_type m_buckets_data;
+
+  /**
+   * Points to m_buckets_data.data() if !m_buckets_data.empty() otherwise points
+   * to static_empty_bucket_ptr. This variable is useful to avoid the cost of
+   * checking if m_buckets_data is empty when trying to find an element.
+   *
+   * TODO Remove m_buckets_data and only use a pointer+size instead of a
+   * pointer+vector to save some space in the ordered_hash object.
+   */
+  bucket_entry* m_buckets;
+
+  size_type m_hash_mask;
+
+  values_container_type m_values;
+
+  size_type m_load_threshold;
+  float m_max_load_factor;
+
+  bool m_grow_on_next_insert;
+};
+
+}  // end namespace detail_ordered_hash
+
+}  // end namespace paddle
diff --git a/paddle/utils/ordered_map.h b/paddle/utils/ordered_map.h
new file mode 100644
index 0000000000000..10bf5628ed3e8
--- /dev/null
+++ b/paddle/utils/ordered_map.h
@@ -0,0 +1,1022 @@
+/**
+ * Copy from https://github.com/Tessil/ordered-map
+ * Modified the following points:
+ * 1. modify namespace from `tsl` to `paddle`
+ * 2. modify some naming prefixes from `tsl` to `paddle`
+ * 3. refine code-format by pre-commit hook
+ */
+
+/**
+ * MIT License
+ *
+ * Copyright (c) 2017 Thibaut Goetghebuer-Planchon <tessil@gmx.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+#include <deque>
+#include <functional>
+#include <initializer_list>
+#include <memory>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include "paddle/utils/ordered_hash.h"
+
+namespace paddle {
+
+/**
+ * Implementation of an hash map using open addressing with robin hood with
+ * backshift delete to resolve collisions.
+ *
+ * The particularity of this hash map is that it remembers the order in which
+ * the elements were added and provide a way to access the structure which
+ * stores these values through the 'values_container()' method. The used
+ * container is defined by ValueTypeContainer, by default a std::deque is used
+ * (grows faster) but a std::vector may be used. In this case the map provides a
+ * 'data()' method which give a direct access to the memory used to store the
+ * values (which can be useful to communicate with C API's).
+ *
+ * The Key and T must be copy constructible and/or move constructible. To use
+ * `unordered_erase` they both must be swappable.
+ *
+ * The behaviour of the hash map is undefined if the destructor of Key or T
+ * throws an exception.
+ *
+ * By default the maximum size of a map is limited to 2^32 - 1 values, if needed
+ * this can be changed through the IndexType template parameter. Using an
+ * `uint64_t` will raise this limit to 2^64 - 1 values but each bucket will use
+ * 16 bytes instead of 8 bytes in addition to the space needed to store the
+ * values.
+ *
+ * Iterators invalidation:
+ *  - clear, operator=, reserve, rehash: always invalidate the iterators (also
+ * invalidate end()).
+ *  - insert, emplace, emplace_hint, operator[]: when a std::vector is used as
+ * ValueTypeContainer and if size() < capacity(), only end(). Otherwise all the
+ * iterators are invalidated if an insert occurs.
+ *  - erase, unordered_erase: when a std::vector is used as ValueTypeContainer
+ * invalidate the iterator of the erased element and all the ones after the
+ * erased element (including end()). Otherwise all the iterators are invalidated
+ * if an erase occurs.
+ */
+template <class Key,
+          class T,
+          class Hash = std::hash<Key>,
+          class KeyEqual = std::equal_to<Key>,
+          class Allocator = std::allocator<std::pair<Key, T>>,
+          class ValueTypeContainer = std::deque<std::pair<Key, T>, Allocator>,
+          class IndexType = std::uint_least32_t>
+class ordered_map {
+ private:
+  template <typename U>
+  using has_is_transparent = paddle::detail_ordered_hash::has_is_transparent<U>;
+
+  class KeySelect {
+   public:
+    using key_type = Key;
+
+    const key_type& operator()(const std::pair<Key, T>& key_value) const
+        noexcept {
+      return key_value.first;
+    }
+
+    key_type& operator()(std::pair<Key, T>& key_value) noexcept {  // NOLINT
+      return key_value.first;
+    }
+  };
+
+  class ValueSelect {
+   public:
+    using value_type = T;
+
+    const value_type& operator()(const std::pair<Key, T>& key_value) const
+        noexcept {
+      return key_value.second;
+    }
+
+    value_type& operator()(std::pair<Key, T>& key_value) noexcept {  // NOLINT
+      return key_value.second;
+    }
+  };
+
+  using ht = detail_ordered_hash::ordered_hash<std::pair<Key, T>,
+                                               KeySelect,
+                                               ValueSelect,
+                                               Hash,
+                                               KeyEqual,
+                                               Allocator,
+                                               ValueTypeContainer,
+                                               IndexType>;
+
+ public:
+  using key_type = typename ht::key_type;
+  using mapped_type = T;
+  using value_type = typename ht::value_type;
+  using size_type = typename ht::size_type;
+  using difference_type = typename ht::difference_type;
+  using hasher = typename ht::hasher;
+  using key_equal = typename ht::key_equal;
+  using allocator_type = typename ht::allocator_type;
+  using reference = typename ht::reference;
+  using const_reference = typename ht::const_reference;
+  using pointer = typename ht::pointer;
+  using const_pointer = typename ht::const_pointer;
+  using iterator = typename ht::iterator;
+  using const_iterator = typename ht::const_iterator;
+  using reverse_iterator = typename ht::reverse_iterator;
+  using const_reverse_iterator = typename ht::const_reverse_iterator;
+
+  using values_container_type = typename ht::values_container_type;
+
+  /*
+   * Constructors
+   */
+  ordered_map() : ordered_map(ht::DEFAULT_INIT_BUCKETS_SIZE) {}
+
+  explicit ordered_map(size_type bucket_count,
+                       const Hash& hash = Hash(),
+                       const KeyEqual& equal = KeyEqual(),
+                       const Allocator& alloc = Allocator())
+      : m_ht(bucket_count, hash, equal, alloc, ht::DEFAULT_MAX_LOAD_FACTOR) {}
+
+  ordered_map(size_type bucket_count, const Allocator& alloc)
+      : ordered_map(bucket_count, Hash(), KeyEqual(), alloc) {}
+
+  ordered_map(size_type bucket_count, const Hash& hash, const Allocator& alloc)
+      : ordered_map(bucket_count, hash, KeyEqual(), alloc) {}
+
+  explicit ordered_map(const Allocator& alloc)
+      : ordered_map(ht::DEFAULT_INIT_BUCKETS_SIZE, alloc) {}
+
+  template <class InputIt>
+  ordered_map(InputIt first,
+              InputIt last,
+              size_type bucket_count = ht::DEFAULT_INIT_BUCKETS_SIZE,
+              const Hash& hash = Hash(),
+              const KeyEqual& equal = KeyEqual(),
+              const Allocator& alloc = Allocator())
+      : ordered_map(bucket_count, hash, equal, alloc) {
+    insert(first, last);
+  }
+
+  template <class InputIt>
+  ordered_map(InputIt first,
+              InputIt last,
+              size_type bucket_count,
+              const Allocator& alloc)
+      : ordered_map(first, last, bucket_count, Hash(), KeyEqual(), alloc) {}
+
+  template <class InputIt>
+  ordered_map(InputIt first,
+              InputIt last,
+              size_type bucket_count,
+              const Hash& hash,
+              const Allocator& alloc)
+      : ordered_map(first, last, bucket_count, hash, KeyEqual(), alloc) {}
+
+  ordered_map(std::initializer_list<value_type> init,
+              size_type bucket_count = ht::DEFAULT_INIT_BUCKETS_SIZE,
+              const Hash& hash = Hash(),
+              const KeyEqual& equal = KeyEqual(),
+              const Allocator& alloc = Allocator())
+      : ordered_map(
+            init.begin(), init.end(), bucket_count, hash, equal, alloc) {}
+
+  ordered_map(std::initializer_list<value_type> init,
+              size_type bucket_count,
+              const Allocator& alloc)
+      : ordered_map(
+            init.begin(), init.end(), bucket_count, Hash(), KeyEqual(), alloc) {
+  }
+
+  ordered_map(std::initializer_list<value_type> init,
+              size_type bucket_count,
+              const Hash& hash,
+              const Allocator& alloc)
+      : ordered_map(
+            init.begin(), init.end(), bucket_count, hash, KeyEqual(), alloc) {}
+
+  ordered_map& operator=(std::initializer_list<value_type> ilist) {
+    m_ht.clear();
+
+    m_ht.reserve(ilist.size());
+    m_ht.insert(ilist.begin(), ilist.end());
+
+    return *this;
+  }
+
+  allocator_type get_allocator() const { return m_ht.get_allocator(); }
+
+  /*
+   * Iterators
+   */
+  iterator begin() noexcept { return m_ht.begin(); }
+  const_iterator begin() const noexcept { return m_ht.begin(); }
+  const_iterator cbegin() const noexcept { return m_ht.cbegin(); }
+
+  iterator end() noexcept { return m_ht.end(); }
+  const_iterator end() const noexcept { return m_ht.end(); }
+  const_iterator cend() const noexcept { return m_ht.cend(); }
+
+  reverse_iterator rbegin() noexcept { return m_ht.rbegin(); }
+  const_reverse_iterator rbegin() const noexcept { return m_ht.rbegin(); }
+  const_reverse_iterator rcbegin() const noexcept { return m_ht.rcbegin(); }
+
+  reverse_iterator rend() noexcept { return m_ht.rend(); }
+  const_reverse_iterator rend() const noexcept { return m_ht.rend(); }
+  const_reverse_iterator rcend() const noexcept { return m_ht.rcend(); }
+
+  /*
+   * Capacity
+   */
+  bool empty() const noexcept { return m_ht.empty(); }
+  size_type size() const noexcept { return m_ht.size(); }
+  size_type max_size() const noexcept { return m_ht.max_size(); }
+
+  /*
+   * Modifiers
+   */
+  void clear() noexcept { m_ht.clear(); }
+
+  std::pair<iterator, bool> insert(const value_type& value) {
+    return m_ht.insert(value);
+  }
+
+  template <class P,
+            typename std::enable_if<
+                std::is_constructible<value_type, P&&>::value>::type* = nullptr>
+  std::pair<iterator, bool> insert(P&& value) {
+    return m_ht.emplace(std::forward<P>(value));
+  }
+
+  std::pair<iterator, bool> insert(value_type&& value) {
+    return m_ht.insert(std::move(value));
+  }
+
+  iterator insert(const_iterator hint, const value_type& value) {
+    return m_ht.insert_hint(hint, value);
+  }
+
+  template <class P,
+            typename std::enable_if<
+                std::is_constructible<value_type, P&&>::value>::type* = nullptr>
+  iterator insert(const_iterator hint, P&& value) {
+    return m_ht.emplace_hint(hint, std::forward<P>(value));
+  }
+
+  iterator insert(const_iterator hint, value_type&& value) {
+    return m_ht.insert_hint(hint, std::move(value));
+  }
+
+  template <class InputIt>
+  void insert(InputIt first, InputIt last) {
+    m_ht.insert(first, last);
+  }
+  void insert(std::initializer_list<value_type> ilist) {
+    m_ht.insert(ilist.begin(), ilist.end());
+  }
+
+  template <class M>
+  std::pair<iterator, bool> insert_or_assign(const key_type& k, M&& obj) {
+    return m_ht.insert_or_assign(k, std::forward<M>(obj));
+  }
+
+  template <class M>
+  std::pair<iterator, bool> insert_or_assign(key_type&& k, M&& obj) {
+    return m_ht.insert_or_assign(std::move(k), std::forward<M>(obj));
+  }
+
+  template <class M>
+  iterator insert_or_assign(const_iterator hint, const key_type& k, M&& obj) {
+    return m_ht.insert_or_assign(hint, k, std::forward<M>(obj));
+  }
+
+  template <class M>
+  iterator insert_or_assign(const_iterator hint, key_type&& k, M&& obj) {
+    return m_ht.insert_or_assign(hint, std::move(k), std::forward<M>(obj));
+  }
+
+  /**
+   * Due to the way elements are stored, emplace will need to move or copy the
+   * key-value once. The method is equivalent to
+   * insert(value_type(std::forward<Args>(args)...));
+   *
+   * Mainly here for compatibility with the std::unordered_map interface.
+   */
+  template <class... Args>
+  std::pair<iterator, bool> emplace(Args&&... args) {
+    return m_ht.emplace(std::forward<Args>(args)...);
+  }
+
+  /**
+   * Due to the way elements are stored, emplace_hint will need to move or copy
+   * the key-value once. The method is equivalent to insert(hint,
+   * value_type(std::forward<Args>(args)...));
+   *
+   * Mainly here for compatibility with the std::unordered_map interface.
+   */
+  template <class... Args>
+  iterator emplace_hint(const_iterator hint, Args&&... args) {
+    return m_ht.emplace_hint(hint, std::forward<Args>(args)...);
+  }
+
+  template <class... Args>
+  std::pair<iterator, bool> try_emplace(const key_type& k, Args&&... args) {
+    return m_ht.try_emplace(k, std::forward<Args>(args)...);
+  }
+
+  template <class... Args>
+  std::pair<iterator, bool> try_emplace(key_type&& k, Args&&... args) {
+    return m_ht.try_emplace(std::move(k), std::forward<Args>(args)...);
+  }
+
+  template <class... Args>
+  iterator try_emplace(const_iterator hint, const key_type& k, Args&&... args) {
+    return m_ht.try_emplace_hint(hint, k, std::forward<Args>(args)...);
+  }
+
+  template <class... Args>
+  iterator try_emplace(const_iterator hint, key_type&& k, Args&&... args) {
+    return m_ht.try_emplace_hint(
+        hint, std::move(k), std::forward<Args>(args)...);
+  }
+
+  /**
+   * When erasing an element, the insert order will be preserved and no holes
+   * will be present in the container returned by 'values_container()'.
+   *
+   * The method is in O(n), if the order is not important 'unordered_erase(...)'
+   * method is faster with an O(1) average complexity.
+   */
+  iterator erase(iterator pos) { return m_ht.erase(pos); }
+
+  /**
+   * @copydoc erase(iterator pos)
+   */
+  iterator erase(const_iterator pos) { return m_ht.erase(pos); }
+
+  /**
+   * @copydoc erase(iterator pos)
+   */
+  iterator erase(const_iterator first, const_iterator last) {
+    return m_ht.erase(first, last);
+  }
+
+  /**
+   * @copydoc erase(iterator pos)
+   */
+  size_type erase(const key_type& key) { return m_ht.erase(key); }
+
+  /**
+   * @copydoc erase(iterator pos)
+   *
+   * Use the hash value 'precalculated_hash' instead of hashing the key. The
+   * hash value should be the same as hash_function()(key). Useful to speed-up
+   * the lookup to the value if you already have the hash.
+   */
+  size_type erase(const key_type& key, std::size_t precalculated_hash) {
+    return m_ht.erase(key, precalculated_hash);
+  }
+
+  /**
+   * @copydoc erase(iterator pos)
+   *
+   * This overload only participates in the overload resolution if the typedef
+   * KeyEqual::is_transparent exists. If so, K must be hashable and comparable
+   * to Key.
+   */
+  template <
+      class K,
+      class KE = KeyEqual,
+      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
+  size_type erase(const K& key) {
+    return m_ht.erase(key);
+  }
+
+  /**
+   * @copydoc erase(const key_type& key, std::size_t precalculated_hash)
+   *
+   * This overload only participates in the overload resolution if the typedef
+   * KeyEqual::is_transparent exists. If so, K must be hashable and comparable
+   * to Key.
+   */
+  template <
+      class K,
+      class KE = KeyEqual,
+      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
+  size_type erase(const K& key, std::size_t precalculated_hash) {
+    return m_ht.erase(key, precalculated_hash);
+  }
+
+  void swap(ordered_map& other) { other.m_ht.swap(m_ht); }
+
+  /*
+   * Lookup
+   */
+  T& at(const Key& key) { return m_ht.at(key); }
+
+  /**
+   * Use the hash value 'precalculated_hash' instead of hashing the key. The
+   * hash value should be the same as hash_function()(key). Useful to speed-up
+   * the lookup if you already have the hash.
+   */
+  T& at(const Key& key, std::size_t precalculated_hash) {
+    return m_ht.at(key, precalculated_hash);
+  }
+
+  const T& at(const Key& key) const { return m_ht.at(key); }
+
+  /**
+   * @copydoc at(const Key& key, std::size_t precalculated_hash)
+   */
+  const T& at(const Key& key, std::size_t precalculated_hash) const {
+    return m_ht.at(key, precalculated_hash);
+  }
+
+  /**
+   * This overload only participates in the overload resolution if the typedef
+   * KeyEqual::is_transparent exists. If so, K must be hashable and comparable
+   * to Key.
+   */
+  template <
+      class K,
+      class KE = KeyEqual,
+      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
+  T& at(const K& key) {
+    return m_ht.at(key);
+  }
+
+  /**
+   * @copydoc at(const K& key)
+   *
+   * Use the hash value 'precalculated_hash' instead of hashing the key. The
+   * hash value should be the same as hash_function()(key). Useful to speed-up
+   * the lookup if you already have the hash.
+   */
+  template <
+      class K,
+      class KE = KeyEqual,
+      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
+  T& at(const K& key, std::size_t precalculated_hash) {
+    return m_ht.at(key, precalculated_hash);
+  }
+
+  /**
+   * @copydoc at(const K& key)
+   */
+  template <
+      class K,
+      class KE = KeyEqual,
+      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
+  const T& at(const K& key) const {
+    return m_ht.at(key);
+  }
+
+  /**
+   * @copydoc at(const K& key, std::size_t precalculated_hash)
+   */
+  template <
+      class K,
+      class KE = KeyEqual,
+      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
+  const T& at(const K& key, std::size_t precalculated_hash) const {
+    return m_ht.at(key, precalculated_hash);
+  }
+
+  T& operator[](const Key& key) { return m_ht[key]; }
+  T& operator[](Key&& key) { return m_ht[std::move(key)]; }
+
+  size_type count(const Key& key) const { return m_ht.count(key); }
+
+  /**
+   * Use the hash value 'precalculated_hash' instead of hashing the key. The
+   * hash value should be the same as hash_function()(key). Useful to speed-up
+   * the lookup if you already have the hash.
+   */
+  size_type count(const Key& key, std::size_t precalculated_hash) const {
+    return m_ht.count(key, precalculated_hash);
+  }
+
+  /**
+   * This overload only participates in the overload resolution if the typedef
+   * KeyEqual::is_transparent exists. If so, K must be hashable and comparable
+   * to Key.
+   */
+  template <
+      class K,
+      class KE = KeyEqual,
+      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
+  size_type count(const K& key) const {
+    return m_ht.count(key);
+  }
+
+  /**
+   * @copydoc count(const K& key) const
+   *
+   * Use the hash value 'precalculated_hash' instead of hashing the key. The
+   * hash value should be the same as hash_function()(key). Useful to speed-up
+   * the lookup if you already have the hash.
+   */
+  template <
+      class K,
+      class KE = KeyEqual,
+      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
+  size_type count(const K& key, std::size_t precalculated_hash) const {
+    return m_ht.count(key, precalculated_hash);
+  }
+
+  iterator find(const Key& key) { return m_ht.find(key); }
+
+  /**
+   * Use the hash value 'precalculated_hash' instead of hashing the key. The
+   * hash value should be the same as hash_function()(key). Useful to speed-up
+   * the lookup if you already have the hash.
+   */
+  iterator find(const Key& key, std::size_t precalculated_hash) {
+    return m_ht.find(key, precalculated_hash);
+  }
+
+  const_iterator find(const Key& key) const { return m_ht.find(key); }
+
+  /**
+   * @copydoc find(const Key& key, std::size_t precalculated_hash)
+   */
+  const_iterator find(const Key& key, std::size_t precalculated_hash) const {
+    return m_ht.find(key, precalculated_hash);
+  }
+
+  /**
+   * This overload only participates in the overload resolution if the typedef
+   * KeyEqual::is_transparent exists. If so, K must be hashable and comparable
+   * to Key.
+   */
+  template <
+      class K,
+      class KE = KeyEqual,
+      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
+  iterator find(const K& key) {
+    return m_ht.find(key);
+  }
+
+  /**
+   * @copydoc find(const K& key)
+   *
+   * Use the hash value 'precalculated_hash' instead of hashing the key. The
+   * hash value should be the same as hash_function()(key). Useful to speed-up
+   * the lookup if you already have the hash.
+   */
+  template <
+      class K,
+      class KE = KeyEqual,
+      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
+  iterator find(const K& key, std::size_t precalculated_hash) {
+    return m_ht.find(key, precalculated_hash);
+  }
+
+  /**
+   * @copydoc find(const K& key)
+   */
+  template <
+      class K,
+      class KE = KeyEqual,
+      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
+  const_iterator find(const K& key) const {
+    return m_ht.find(key);
+  }
+
+  /**
+   * @copydoc find(const K& key)
+   *
+   * Use the hash value 'precalculated_hash' instead of hashing the key. The
+   * hash value should be the same as hash_function()(key). Useful to speed-up
+   * the lookup if you already have the hash.
+   */
+  template <
+      class K,
+      class KE = KeyEqual,
+      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
+  const_iterator find(const K& key, std::size_t precalculated_hash) const {
+    return m_ht.find(key, precalculated_hash);
+  }
+
+  bool contains(const Key& key) const { return m_ht.contains(key); }
+
+  /**
+   * Use the hash value 'precalculated_hash' instead of hashing the key. The
+   * hash value should be the same as hash_function()(key). Useful to speed-up
+   * the lookup if you already have the hash.
+   */
+  bool contains(const Key& key, std::size_t precalculated_hash) const {
+    return m_ht.contains(key, precalculated_hash);
+  }
+
+  /**
+   * This overload only participates in the overload resolution if the typedef
+   * KeyEqual::is_transparent exists. If so, K must be hashable and comparable
+   * to Key.
+   */
+  template <
+      class K,
+      class KE = KeyEqual,
+      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
+  bool contains(const K& key) const {
+    return m_ht.contains(key);
+  }
+
+  /**
+   * @copydoc contains(const K& key) const
+   *
+   * Use the hash value 'precalculated_hash' instead of hashing the key. The
+   * hash value should be the same as hash_function()(key). Useful to speed-up
+   * the lookup if you already have the hash.
+   */
+  template <
+      class K,
+      class KE = KeyEqual,
+      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
+  bool contains(const K& key, std::size_t precalculated_hash) const {
+    return m_ht.contains(key, precalculated_hash);
+  }
+
+  std::pair<iterator, iterator> equal_range(const Key& key) {
+    return m_ht.equal_range(key);
+  }
+
+  /**
+   * Use the hash value 'precalculated_hash' instead of hashing the key. The
+   * hash value should be the same as hash_function()(key). Useful to speed-up
+   * the lookup if you already have the hash.
+   */
+  std::pair<iterator, iterator> equal_range(const Key& key,
+                                            std::size_t precalculated_hash) {
+    return m_ht.equal_range(key, precalculated_hash);
+  }
+
+  std::pair<const_iterator, const_iterator> equal_range(const Key& key) const {
+    return m_ht.equal_range(key);
+  }
+
+  /**
+   * @copydoc equal_range(const Key& key, std::size_t precalculated_hash)
+   */
+  std::pair<const_iterator, const_iterator> equal_range(
+      const Key& key, std::size_t precalculated_hash) const {
+    return m_ht.equal_range(key, precalculated_hash);
+  }
+
+  /**
+   * This overload only participates in the overload resolution if the typedef
+   * KeyEqual::is_transparent exists. If so, K must be hashable and comparable
+   * to Key.
+   */
+  template <
+      class K,
+      class KE = KeyEqual,
+      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
+  std::pair<iterator, iterator> equal_range(const K& key) {
+    return m_ht.equal_range(key);
+  }
+
+  /**
+   * @copydoc equal_range(const K& key)
+   *
+   * Use the hash value 'precalculated_hash' instead of hashing the key. The
+   * hash value should be the same as hash_function()(key). Useful to speed-up
+   * the lookup if you already have the hash.
+   */
+  template <
+      class K,
+      class KE = KeyEqual,
+      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
+  std::pair<iterator, iterator> equal_range(const K& key,
+                                            std::size_t precalculated_hash) {
+    return m_ht.equal_range(key, precalculated_hash);
+  }
+
+  /**
+   * @copydoc equal_range(const K& key)
+   */
+  template <
+      class K,
+      class KE = KeyEqual,
+      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
+  std::pair<const_iterator, const_iterator> equal_range(const K& key) const {
+    return m_ht.equal_range(key);
+  }
+
+  /**
+   * @copydoc equal_range(const K& key, std::size_t precalculated_hash)
+   */
+  template <
+      class K,
+      class KE = KeyEqual,
+      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
+  std::pair<const_iterator, const_iterator> equal_range(
+      const K& key, std::size_t precalculated_hash) const {
+    return m_ht.equal_range(key, precalculated_hash);
+  }
+
+  /*
+   * Bucket interface
+   */
+  size_type bucket_count() const { return m_ht.bucket_count(); }
+  size_type max_bucket_count() const { return m_ht.max_bucket_count(); }
+
+  /*
+   * Hash policy
+   */
+  float load_factor() const { return m_ht.load_factor(); }
+  float max_load_factor() const { return m_ht.max_load_factor(); }
+  void max_load_factor(float ml) { m_ht.max_load_factor(ml); }
+
+  void rehash(size_type count) { m_ht.rehash(count); }
+  void reserve(size_type count) { m_ht.reserve(count); }
+
+  /*
+   * Observers
+   */
+  hasher hash_function() const { return m_ht.hash_function(); }
+  key_equal key_eq() const { return m_ht.key_eq(); }
+
+  /*
+   * Other
+   */
+
+  /**
+   * Convert a const_iterator to an iterator.
+   */
+  iterator mutable_iterator(const_iterator pos) {
+    return m_ht.mutable_iterator(pos);
+  }
+
+  /**
+   * Requires index <= size().
+   *
+   * Return an iterator to the element at index. Return end() if index ==
+   * size().
+   */
+  iterator nth(size_type index) { return m_ht.nth(index); }
+
+  /**
+   * @copydoc nth(size_type index)
+   */
+  const_iterator nth(size_type index) const { return m_ht.nth(index); }
+
+  /**
+   * Return const_reference to the first element. Requires the container to not
+   * be empty.
+   */
+  const_reference front() const { return m_ht.front(); }
+
+  /**
+   * Return const_reference to the last element. Requires the container to not
+   * be empty.
+   */
+  const_reference back() const { return m_ht.back(); }
+
+  /**
+   * Only available if ValueTypeContainer is a std::vector. Same as calling
+   * 'values_container().data()'.
+   */
+  template <class U = values_container_type,
+            typename std::enable_if<paddle::detail_ordered_hash::is_vector<
+                U>::value>::type* = nullptr>
+  const typename values_container_type::value_type* data() const noexcept {
+    return m_ht.data();
+  }
+
+  /**
+   * Return the container in which the values are stored. The values are in the
+   * same order as the insertion order and are contiguous in the structure, no
+   * holes (size() == values_container().size()).
+   */
+  const values_container_type& values_container() const noexcept {
+    return m_ht.values_container();
+  }
+
+  template <class U = values_container_type,
+            typename std::enable_if<paddle::detail_ordered_hash::is_vector<
+                U>::value>::type* = nullptr>
+  size_type capacity() const noexcept {
+    return m_ht.capacity();
+  }
+
+  void shrink_to_fit() { m_ht.shrink_to_fit(); }
+
+  /**
+   * Insert the value before pos shifting all the elements on the right of pos
+   * (including pos) one position to the right.
+   *
+   * Amortized linear time-complexity in the distance between pos and end().
+   */
+  std::pair<iterator, bool> insert_at_position(const_iterator pos,
+                                               const value_type& value) {
+    return m_ht.insert_at_position(pos, value);
+  }
+
+  /**
+   * @copydoc insert_at_position(const_iterator pos, const value_type& value)
+   */
+  std::pair<iterator, bool> insert_at_position(const_iterator pos,
+                                               value_type&& value) {
+    return m_ht.insert_at_position(pos, std::move(value));
+  }
+
+  /**
+   * @copydoc insert_at_position(const_iterator pos, const value_type& value)
+   *
+   * Same as insert_at_position(pos, value_type(std::forward<Args>(args)...),
+   * mainly here for coherence.
+   */
+  template <class... Args>
+  std::pair<iterator, bool> emplace_at_position(const_iterator pos,
+                                                Args&&... args) {
+    return m_ht.emplace_at_position(pos, std::forward<Args>(args)...);
+  }
+
+  /**
+   * @copydoc insert_at_position(const_iterator pos, const value_type& value)
+   */
+  template <class... Args>
+  std::pair<iterator, bool> try_emplace_at_position(const_iterator pos,
+                                                    const key_type& k,
+                                                    Args&&... args) {
+    return m_ht.try_emplace_at_position(pos, k, std::forward<Args>(args)...);
+  }
+
+  /**
+   * @copydoc insert_at_position(const_iterator pos, const value_type& value)
+   */
+  template <class... Args>
+  std::pair<iterator, bool> try_emplace_at_position(const_iterator pos,
+                                                    key_type&& k,
+                                                    Args&&... args) {
+    return m_ht.try_emplace_at_position(
+        pos, std::move(k), std::forward<Args>(args)...);
+  }
+
+  void pop_back() { m_ht.pop_back(); }
+
+  /**
+   * Faster erase operation with an O(1) average complexity but it doesn't
+   * preserve the insertion order.
+   *
+   * If an erasure occurs, the last element of the map will take the place of
+   * the erased element.
+   */
+  iterator unordered_erase(iterator pos) { return m_ht.unordered_erase(pos); }
+
+  /**
+   * @copydoc unordered_erase(iterator pos)
+   */
+  iterator unordered_erase(const_iterator pos) {
+    return m_ht.unordered_erase(pos);
+  }
+
+  /**
+   * @copydoc unordered_erase(iterator pos)
+   */
+  size_type unordered_erase(const key_type& key) {
+    return m_ht.unordered_erase(key);
+  }
+
+  /**
+   * @copydoc unordered_erase(iterator pos)
+   *
+   * Use the hash value 'precalculated_hash' instead of hashing the key. The
+   * hash value should be the same as hash_function()(key). Useful to speed-up
+   * the lookup if you already have the hash.
+   */
+  size_type unordered_erase(const key_type& key,
+                            std::size_t precalculated_hash) {
+    return m_ht.unordered_erase(key, precalculated_hash);
+  }
+
+  /**
+   * @copydoc unordered_erase(iterator pos)
+   *
+   * This overload only participates in the overload resolution if the typedef
+   * KeyEqual::is_transparent exists. If so, K must be hashable and comparable
+   * to Key.
+   */
+  template <
+      class K,
+      class KE = KeyEqual,
+      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
+  size_type unordered_erase(const K& key) {
+    return m_ht.unordered_erase(key);
+  }
+
+  /**
+   * @copydoc unordered_erase(const K& key)
+   *
+   * Use the hash value 'precalculated_hash' instead of hashing the key. The
+   * hash value should be the same as hash_function()(key). Useful to speed-up
+   * the lookup if you already have the hash.
+   */
+  template <
+      class K,
+      class KE = KeyEqual,
+      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
+  size_type unordered_erase(const K& key, std::size_t precalculated_hash) {
+    return m_ht.unordered_erase(key, precalculated_hash);
+  }
+
+  /**
+   * Serialize the map through the `serializer` parameter.
+   *
+   * The `serializer` parameter must be a function object that supports the
+   * following call:
+   *  - `template<typename U> void operator()(const U& value);` where the types
+   * `std::uint64_t`, `float` and `std::pair<Key, T>` must be supported for U.
+   *
+   * The implementation leaves binary compatibility (endianness, IEEE 754 for
+   * floats, ...) of the types it serializes in the hands of the `Serializer`
+   * function object if compatibility is required.
+   */
+  template <class Serializer>
+  void serialize(Serializer& serializer) const {  // NOLINT
+    m_ht.serialize(serializer);
+  }
+
+  /**
+   * Deserialize a previously serialized map through the `deserializer`
+   * parameter.
+   *
+   * The `deserializer` parameter must be a function object that supports the
+   * following calls:
+   *  - `template<typename U> U operator()();` where the types `std::uint64_t`,
+   * `float` and `std::pair<Key, T>` must be supported for U.
+   *
+   * If the deserialized hash map type is hash compatible with the serialized
+   * map, the deserialization process can be sped up by setting
+   * `hash_compatible` to true. To be hash compatible, the Hash and KeyEqual
+   * must behave the same way than the ones used on the serialized map. The
+   * `std::size_t` must also be of the same size as the one on the platform used
+   * to serialize the map, the same apply for `IndexType`. If these criteria are
+   * not met, the behaviour is undefined with `hash_compatible` sets to true.
+   *
+   * The behaviour is undefined if the type `Key` and `T` of the `ordered_map`
+   * are not the same as the types used during serialization.
+   *
+   * The implementation leaves binary compatibility (endianness, IEEE 754 for
+   * floats, size of int, ...) of the types it deserializes in the hands of the
+   * `Deserializer` function object if compatibility is required.
+   */
+  template <class Deserializer>
+  static ordered_map deserialize(Deserializer& deserializer,  // NOLINT
+                                 bool hash_compatible = false) {
+    ordered_map map(0);
+    map.m_ht.deserialize(deserializer, hash_compatible);
+
+    return map;
+  }
+
+  friend bool operator==(const ordered_map& lhs, const ordered_map& rhs) {
+    return lhs.m_ht == rhs.m_ht;
+  }
+  friend bool operator!=(const ordered_map& lhs, const ordered_map& rhs) {
+    return lhs.m_ht != rhs.m_ht;
+  }
+  friend bool operator<(const ordered_map& lhs, const ordered_map& rhs) {
+    return lhs.m_ht < rhs.m_ht;
+  }
+  friend bool operator<=(const ordered_map& lhs, const ordered_map& rhs) {
+    return lhs.m_ht <= rhs.m_ht;
+  }
+  friend bool operator>(const ordered_map& lhs, const ordered_map& rhs) {
+    return lhs.m_ht > rhs.m_ht;
+  }
+  friend bool operator>=(const ordered_map& lhs, const ordered_map& rhs) {
+    return lhs.m_ht >= rhs.m_ht;
+  }
+
+  friend void swap(ordered_map& lhs, ordered_map& rhs) { lhs.swap(rhs); }
+
+ private:
+  ht m_ht;
+};
+
+}  // end namespace paddle

From a1753a0da122c54b45a52a0a574a938047164126 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Tue, 17 Aug 2021 12:48:42 +0000
Subject: [PATCH 023/125] fix multiple ordered_map compile errors

---
 paddle/fluid/framework/new_exec.h               | 16 ++++++++--------
 paddle/fluid/framework/op_desc.cc               |  9 ++++-----
 paddle/fluid/framework/type_defs.h              | 16 +++++++++++++++-
 paddle/fluid/inference/api/mkldnn_quantizer.cc  |  3 ++-
 paddle/fluid/operators/copy_cross_scope_test.cc |  4 ++--
 paddle/fluid/platform/variant.h                 |  2 +-
 paddle/fluid/pybind/pybind.cc                   |  2 +-
 7 files changed, 33 insertions(+), 19 deletions(-)

diff --git a/paddle/fluid/framework/new_exec.h b/paddle/fluid/framework/new_exec.h
index defa7a967336b..9e6c845bbffb2 100644
--- a/paddle/fluid/framework/new_exec.h
+++ b/paddle/fluid/framework/new_exec.h
@@ -262,16 +262,16 @@ void build_op_func_list(const framework::ProgramDesc& pdesc,
 
     VariableValueMap& ins_map_temp = runtime_context.inputs;
 
-    for (auto& var_name_item : ins_map_temp) {
-      for (size_t i = 0; i < var_name_item.second.size(); ++i) {
-        auto var = var_name_item.second[i];
+    for (auto it = ins_map_temp.begin(); it != ins_map_temp.end(); ++it) {
+      for (size_t i = 0; i < it.value().size(); ++i) {
+        auto var = it.value()[i];
         auto tensor_in = static_cast<const Tensor*>(&(var->Get<LoDTensor>()));
         if (!tensor_in->IsInitialized()) {
           continue;
         }
         auto kernel_type_for_var =
             static_cast<const framework::OperatorWithKernel*>(op_base)
-                ->GetKernelTypeForVar(var_name_item.first, *tensor_in,
+                ->GetKernelTypeForVar(it->first, *tensor_in,
                                       expected_kernel_key);
         if (!platform::is_same_place(kernel_type_for_var.place_,
                                      expected_kernel_key.place_)) {
@@ -286,7 +286,7 @@ void build_op_func_list(const framework::ProgramDesc& pdesc,
           var_scope->var_list.push_back(v);
 
           VariableNameMap copy_in_map;
-          auto x_iter = inputs_names.find(var_name_item.first);
+          auto x_iter = inputs_names.find(it->first);
           copy_in_map["X"] = {x_iter->second[i]};
           VariableNameMap copy_out_map;
           copy_out_map["Out"] = {new_var_name};
@@ -294,11 +294,11 @@ void build_op_func_list(const framework::ProgramDesc& pdesc,
           attr_map["dst_place_type"] = convert(place);
 
           std::map<std::string, std::vector<int>> copy_ins_name2id;
-          copy_ins_name2id["X"] = ins_name2id[var_name_item.first];
+          copy_ins_name2id["X"] = ins_name2id[it->first];
           std::map<std::string, std::vector<int>> copy_out_name2id;
           copy_out_name2id["Out"] = {var_scope->name2id[new_var_name]};
 
-          op_func_node.input_index[var_name_item.first][i] =
+          op_func_node.input_index[it->first][i] =
               var_scope->name2id[new_var_name];
 
           VariableValueMap copy_ins_value_map;
@@ -344,7 +344,7 @@ void build_op_func_list(const framework::ProgramDesc& pdesc,
           op_list->push_back(copy_op);
           vec_func_list->push_back(copy_op_func_node);
 
-          var_name_item.second[i] = v;
+          it.value()[i] = v;
         }
       }
     }
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index 519bf8c633a01..87a8844f1d98d 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -640,9 +640,8 @@ void OpDesc::Rename(const std::string &old_name, const std::string &new_name) {
 
 void OpDesc::RenameOutput(const std::string &old_name,
                           const std::string &new_name) {
-  for (auto &output : outputs_) {
-    std::replace(output.second.begin(), output.second.end(), old_name,
-                 new_name);
+  for (auto it = outputs_.begin(); it != outputs_.end(); ++it) {
+    std::replace(it.value().begin(), it.value().end(), old_name, new_name);
   }
 
   auto it = attrs_.find(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName());
@@ -656,8 +655,8 @@ void OpDesc::RenameOutput(const std::string &old_name,
 
 void OpDesc::RenameInput(const std::string &old_name,
                          const std::string &new_name) {
-  for (auto &input : inputs_) {
-    std::replace(input.second.begin(), input.second.end(), old_name, new_name);
+  for (auto it = inputs_.begin(); it != inputs_.end(); ++it) {
+    std::replace(it.value().begin(), it.value().end(), old_name, new_name);
   }
 
   auto it = attrs_.find(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName());
diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h
index 8d6a9305a0704..f41a26846d8ac 100644
--- a/paddle/fluid/framework/type_defs.h
+++ b/paddle/fluid/framework/type_defs.h
@@ -35,12 +35,26 @@ class Variable;
 class InferNoNeedBufferVarsFN;
 
 /**
- * Why need ordered_map ?
+ * [ Why need ordered_map? ]
  *
  * The inputs and outputs in OpProto are ordered, but when they used for build
  * OpDesc and Operator, the order info is lost, which cause we can't access Op's
  * inputs and outputs by index, can't construct vector format KernelContext at
  * low cost.
+ *
+ * Note: For iterators, operator*() and operator->() return a reference and a
+ * pointer to const std::pair<Key, T> instead of std::pair<const Key, T> making
+ * the value T not modifiable. To modify the value you have to call the value()
+ * method of the iterator to get a mutable reference. Example:
+ *
+ *      tsl::ordered_map<int, int> map = {{1, 1}, {2, 1}, {3, 1}};
+ *      for(auto it = map.begin(); it != map.end(); ++it) {
+ *          //it->second = 2; // Illegal
+ *          it.value() = 2; // Ok
+ *      }
+ *
+ * Reason:
+ * - https://github.com/Tessil/ordered-map/issues/32#issuecomment-739492629
  */
 using VariableNameMap =
     paddle::ordered_map<std::string, std::vector<std::string>>;
diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc
index f6cdbb00b5045..574071dfd17d3 100644
--- a/paddle/fluid/inference/api/mkldnn_quantizer.cc
+++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc
@@ -48,7 +48,8 @@ static LoDTensor CreateScaleTensor(int64_t channels_num = 1);
 
 bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() {
   PrettyLogH1("--- Calculating scales for quantization");
-  using VariableNameMap = std::map<std::string, std::vector<std::string>>;
+  using VariableNameMap =
+      paddle::ordered_map<std::string, std::vector<std::string>>;
   std::map<std::string, std::map<std::string, LoDTensor>> gathered_data;
   for (const auto* op : predictor_.inference_program_->Block(0).AllOps()) {
     if (platform::HasOpINT8DataType(op)) {
diff --git a/paddle/fluid/operators/copy_cross_scope_test.cc b/paddle/fluid/operators/copy_cross_scope_test.cc
index e175b235f9c18..37bc32d745eda 100644
--- a/paddle/fluid/operators/copy_cross_scope_test.cc
+++ b/paddle/fluid/operators/copy_cross_scope_test.cc
@@ -61,7 +61,7 @@ void Compare1(f::Scope* scope, const p::DeviceContext& ctx,
 
   // run
   f::AttributeMap attrs = {{"to_main_scope", false}, {"num_micro_batches", 3}};
-  std::map<std::string, std::vector<std::string>> output;
+  f::VariableNameMap output;
   auto op = f::OpRegistry::CreateOp(op_type, {{"X", {"tmp"}}, {"Id", {"Id"}}},
                                     output, attrs);
 
@@ -109,7 +109,7 @@ void Compare2(f::Scope* scope, const p::DeviceContext& ctx,
 
   // run
   f::AttributeMap attrs = {{"to_main_scope", true}, {"num_micro_batches", 3}};
-  std::map<std::string, std::vector<std::string>> output;
+  f::VariableNameMap output;
   auto op = f::OpRegistry::CreateOp(op_type, {{"X", {"tmp"}}, {"Id", {"Id"}}},
                                     output, attrs);
 
diff --git a/paddle/fluid/platform/variant.h b/paddle/fluid/platform/variant.h
index 8c8fb525cc7e0..fb4772abd3062 100644
--- a/paddle/fluid/platform/variant.h
+++ b/paddle/fluid/platform/variant.h
@@ -40,10 +40,10 @@ limitations under the License. */
 
 #include <boost/mpl/comparison.hpp>
 #include <boost/mpl/less_equal.hpp>
-#include <boost/optional.hpp>
 #include <boost/variant.hpp>
 
 #include "paddle/utils/any.h"
+#include "paddle/utils/optional.h"
 
 // some platform-independent defintion
 #if defined(_WIN32)
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 0663da88ac75f..01bef50de8d5d 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1860,7 +1860,7 @@ All parameter, weight, gradient are variables in Paddle.
            [](const OperatorBase &op) -> std::string { return op.Type(); })
       .def("outputs",
            [](const OperatorBase &op)
-               -> std::map<std::string, std::vector<std::string>> {
+               -> paddle::ordered_map<std::string, std::vector<std::string>> {
                  return op.Outputs();
                })
       .def("output_vars",

From 05a82e7403d1d101df3107d0a769279e266e7882 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Wed, 18 Aug 2021 02:05:05 +0000
Subject: [PATCH 024/125] move dev into include dir

---
 paddle/fluid/framework/top_utils.h      |  2 +-
 paddle/fluid/operators/mean_op.h        |  4 ++--
 paddle/fluid/operators/npu_op_runner.cc | 10 +++++-----
 paddle/fluid/operators/npu_op_runner.h  |  2 +-
 paddle/fluid/operators/scale_op.h       |  4 ++--
 paddle/fluid/operators/sign_op.h        |  4 ++--
 paddle/fluid/platform/mkldnn_reuse.h    |  2 +-
 paddle/top/api/CMakeLists.txt           |  2 +-
 paddle/top/api/all.h                    |  4 ++--
 paddle/top/api/{ => include}/dev/core.h |  1 +
 paddle/top/api/{ => include}/dev/math.h |  0
 paddle/top/core/dense_tensor.h          |  2 +-
 paddle/top/core/dtype.h                 |  2 +-
 paddle/top/core/kernel_registry.h       | 24 ++++++++++++------------
 paddle/top/cuda/CMakeLists.txt          |  6 +++++-
 15 files changed, 37 insertions(+), 32 deletions(-)
 rename paddle/top/api/{ => include}/dev/core.h (93%)
 rename paddle/top/api/{ => include}/dev/math.h (100%)

diff --git a/paddle/fluid/framework/top_utils.h b/paddle/fluid/framework/top_utils.h
index fb40ad606288e..f382c5f918f13 100644
--- a/paddle/fluid/framework/top_utils.h
+++ b/paddle/fluid/framework/top_utils.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/place.h"
 
-#include "paddle/top/api/dev/core.h"
+#include "paddle/top/api/include/dev/core.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/operators/mean_op.h b/paddle/fluid/operators/mean_op.h
index 25115c739bd10..ef5d66adbf8b9 100644
--- a/paddle/fluid/operators/mean_op.h
+++ b/paddle/fluid/operators/mean_op.h
@@ -18,8 +18,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/top_utils.h"
 
 // only can include the headers in paddle/top/api dirs
-#include "paddle/top/api/dev/core.h"
-#include "paddle/top/api/dev/math.h"
+#include "paddle/top/api/include/dev/core.h"
+#include "paddle/top/api/include/dev/math.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/npu_op_runner.cc b/paddle/fluid/operators/npu_op_runner.cc
index 56b4148e1bece..71a0f52b41ef7 100644
--- a/paddle/fluid/operators/npu_op_runner.cc
+++ b/paddle/fluid/operators/npu_op_runner.cc
@@ -42,7 +42,7 @@ static std::map<framework::proto::VarType::Type, aclDataType>
         {framework::proto::VarType::FP64, ACL_DOUBLE},
 };
 
-static std::map<pt::DataType aclDataType> PT_DTYPE_2_ACL_DTYPE = {
+static std::map<pt::DataType, aclDataType> PT_DTYPE_2_ACL_DTYPE = {
     {pt::DataType::kBOOL, ACL_BOOL},       {pt::DataType::kINT8, ACL_INT8},
     {pt::DataType::kUINT8, ACL_UINT8},     {pt::DataType::kINT16, ACL_INT16},
     {pt::DataType::kINT32, ACL_INT32},     {pt::DataType::kINT64, ACL_INT64},
@@ -331,7 +331,7 @@ NpuOpRunner &NpuOpRunner::AddOutput(const Tensor &tensor) {
   return *this;
 }
 
-NpuOpRunner &NpuOpRunner::AdOutput(const pt::DenseTensor &tensor) {
+NpuOpRunner &NpuOpRunner::AddOutput(const pt::DenseTensor &tensor) {
   // create aclTensorDesc
   output_descs_.emplace_back(CreateTensorDesc(tensor));
   // create aclDataBuffer
@@ -355,7 +355,7 @@ NpuOpRunner &NpuOpRunner::AddInputs(
     const std::vector<pt::DenseTensor> &tensors) {
   input_descs_.reserve(tensors.size());
   input_buffers_.reserve(tensors.size());
-  for (auto tensor : tensors) {
+  for (auto &tensor : tensors) {
     // create aclTensorDesc
     input_descs_.emplace_back(CreateTensorDesc(tensor));
     // create aclDataBuffer
@@ -395,7 +395,7 @@ NpuOpRunner &NpuOpRunner::AddOutputs(
     const std::vector<pt::DenseTensor> &tensors) {
   output_descs_.reserve(tensors.size());
   output_buffers_.reserve(tensors.size());
-  for (auto tensor : tensors) {
+  for (auto &tensor : tensors) {
     // create aclTensorDesc
     output_descs_.emplace_back(CreateTensorDesc(tensor));
     // create aclDataBuffer
@@ -506,7 +506,7 @@ aclDataBuffer *NpuOpRunner::CreateDataBuffer(Tensor tensor) {
 }
 
 aclDataBuffer *NpuOpRunner::CreateDataBuffer(const pt::DenseTensor &tensor) {
-  const void *ptr = tensor.data<void>();
+  void *ptr = const_cast<void *>(tensor.data<void>());
   VLOG(4) << "NPU ptr: " << ptr << ", size: " << tensor.MemorySize();
   auto *buffer = aclCreateDataBuffer(ptr, tensor.MemorySize());
   PADDLE_ENFORCE_NOT_NULL(
diff --git a/paddle/fluid/operators/npu_op_runner.h b/paddle/fluid/operators/npu_op_runner.h
index 19f5f5debe2cc..412c842ac4bc8 100644
--- a/paddle/fluid/operators/npu_op_runner.h
+++ b/paddle/fluid/operators/npu_op_runner.h
@@ -24,7 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
 
-#include "paddle/top/api/dev/core.h"
+#include "paddle/top/api/include/dev/core.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h
index e00c1c1dfcf28..0f9b1bbeb6a8c 100644
--- a/paddle/fluid/operators/scale_op.h
+++ b/paddle/fluid/operators/scale_op.h
@@ -18,8 +18,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/top_utils.h"
 
 // only can include the headers in paddle/top/api dirs
-#include "paddle/top/api/dev/core.h"
-#include "paddle/top/api/dev/math.h"
+#include "paddle/top/api/include/dev/core.h"
+#include "paddle/top/api/include/dev/math.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/sign_op.h b/paddle/fluid/operators/sign_op.h
index 42e4a45b450db..954013817267f 100644
--- a/paddle/fluid/operators/sign_op.h
+++ b/paddle/fluid/operators/sign_op.h
@@ -20,8 +20,8 @@ limitations under the License. */
 #include "paddle/fluid/operators/eigen/eigen_function.h"
 
 // only can include the headers in paddle/top/api dirs
-#include "paddle/top/api/dev/core.h"
-#include "paddle/top/api/dev/math.h"
+#include "paddle/top/api/include/dev/core.h"
+#include "paddle/top/api/include/dev/math.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index fd95c481a068c..4fdde230b565b 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -26,7 +26,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #include "paddle/fluid/platform/place.h"
 
-#include "paddle/top/api/dev/core.h"
+#include "paddle/top/api/include/dev/core.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/top/api/CMakeLists.txt b/paddle/top/api/CMakeLists.txt
index 9f8c214a04e5c..75fa5b8348337 100644
--- a/paddle/top/api/CMakeLists.txt
+++ b/paddle/top/api/CMakeLists.txt
@@ -2,7 +2,7 @@ add_subdirectory(src)
 
 set(TOP_DEPS convert_utils dense_tensor selected_rows_tensor kernel_factory kernel_context)
 set(TOP_DEPS ${TOP_DEPS} math_cpu)
-if(WITH_GPU)
+if(WITH_GPU OR WITH_ROCM)
   set(TOP_DEPS ${TOP_DEPS} math_cuda)
 endif()
 if(WITH_XPU)
diff --git a/paddle/top/api/all.h b/paddle/top/api/all.h
index ac48529f25f3e..2586884613040 100644
--- a/paddle/top/api/all.h
+++ b/paddle/top/api/all.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 
 // develop apis
-#include "paddle/top/api/dev/core.h"
-#include "paddle/top/api/dev/math.h"
+#include "paddle/top/api/include/dev/core.h"
+#include "paddle/top/api/include/dev/math.h"
 
 // user apis
diff --git a/paddle/top/api/dev/core.h b/paddle/top/api/include/dev/core.h
similarity index 93%
rename from paddle/top/api/dev/core.h
rename to paddle/top/api/include/dev/core.h
index 4f1a01646d3fd..c6ff5915e5ed8 100644
--- a/paddle/top/api/dev/core.h
+++ b/paddle/top/api/include/dev/core.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+// See Note: [ How do we organize the kernel directory ]
 #include "paddle/top/core/convert_utils.h"
 #include "paddle/top/core/dense_tensor.h"
 #include "paddle/top/core/kernel_context.h"
diff --git a/paddle/top/api/dev/math.h b/paddle/top/api/include/dev/math.h
similarity index 100%
rename from paddle/top/api/dev/math.h
rename to paddle/top/api/include/dev/math.h
diff --git a/paddle/top/core/dense_tensor.h b/paddle/top/core/dense_tensor.h
index b3dad8b32f54b..8e671e1d6423c 100644
--- a/paddle/top/core/dense_tensor.h
+++ b/paddle/top/core/dense_tensor.h
@@ -103,7 +103,7 @@ class DenseTensor : public TensorInterface {
 
   template <typename T>
   const T* data() const {
-    static_assert(std::is_pod<T>::value,
+    static_assert(std::is_pod<T>::value || std::is_same<T, void>::value,
                   "T must be POD when call Tensor.data<T>().");
     return reinterpret_cast<const T*>(data());
   }
diff --git a/paddle/top/core/dtype.h b/paddle/top/core/dtype.h
index 77dece46e4e02..130482dc48fde 100644
--- a/paddle/top/core/dtype.h
+++ b/paddle/top/core/dtype.h
@@ -89,7 +89,7 @@ PT_FOR_EACH_DATA_TYPE(PT_SPECIALIZE_DataTypeToCppType)
 #define PT_SPECIALIZE_CppTypeToDataType(cpp_type, data_type) \
   template <>                                                \
   struct CppTypeToDataType<cpp_type> {                       \
-    DataType type = data_type;                               \
+    constexpr static DataType Type() { return data_type; }   \
   };
 
 PT_FOR_EACH_DATA_TYPE(PT_SPECIALIZE_CppTypeToDataType)
diff --git a/paddle/top/core/kernel_registry.h b/paddle/top/core/kernel_registry.h
index 421a203dc051c..4b7fbad675af1 100644
--- a/paddle/top/core/kernel_registry.h
+++ b/paddle/top/core/kernel_registry.h
@@ -88,18 +88,18 @@ class OpKernelRegistrar {
                                   DATATYPE(dtype),                        \
                                   kernel_fn)
 
-#define PT_REGISTER_KERNEL_AUTO_SPECIALIZE(                              \
-    op_name, backend, layout, meta_kernel_fn, dtype)                     \
-  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                     \
-      __reg_pt_op_kernel_##op_name##_##backend##_##layout##_##dtype##__, \
-      "PT_REGISTER_KERNEL_AUTO_SPECIALIZE must be called in global "     \
-      "namespace.");                                                     \
-  static ::pt::OpKernelRegistrar                                         \
-      __pt_op_kernel_##op_name##_##backend##_##layout##_##dtype##__ =    \
-          ::pt::OpKernelRegistrar(#op_name,                              \
-                                  BACKEND(backend),                      \
-                                  DATALAYOUT(layout),                    \
-                                  ::pt::CppTypeToDataType<dtype>().type, \
+#define PT_REGISTER_KERNEL_AUTO_SPECIALIZE(                               \
+    op_name, backend, layout, meta_kernel_fn, dtype)                      \
+  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                      \
+      __reg_pt_op_kernel_##op_name##_##backend##_##layout##_##dtype##__,  \
+      "PT_REGISTER_KERNEL_AUTO_SPECIALIZE must be called in global "      \
+      "namespace.");                                                      \
+  static ::pt::OpKernelRegistrar                                          \
+      __pt_op_kernel_##op_name##_##backend##_##layout##_##dtype##__ =     \
+          ::pt::OpKernelRegistrar(#op_name,                               \
+                                  BACKEND(backend),                       \
+                                  DATALAYOUT(layout),                     \
+                                  ::pt::CppTypeToDataType<dtype>::Type(), \
                                   PT_KERNEL(meta_kernel_fn<dtype>))
 
 #define PT_TORCH_KERNEL_REGISTRAR(op_name, backend, layout, dtype)          \
diff --git a/paddle/top/cuda/CMakeLists.txt b/paddle/top/cuda/CMakeLists.txt
index cc64addf94d19..e5899c8eb5ad5 100644
--- a/paddle/top/cuda/CMakeLists.txt
+++ b/paddle/top/cuda/CMakeLists.txt
@@ -1 +1,5 @@
-nv_library(math_cuda SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory)
+if(WITH_GPU)
+  nv_library(math_cuda SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory)
+elseif(WITH_ROCM)
+  hip_library(math_cuda SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory)
+endif()

From 90e9090ee044ba306a442bb8837335b718801268 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Thu, 19 Aug 2021 12:43:14 +0000
Subject: [PATCH 025/125] support sign op in static op run

---
 paddle/fluid/framework/CMakeLists.txt |   4 +-
 paddle/fluid/framework/operator.cc    | 174 ++++++++++++++++++++++++--
 paddle/fluid/framework/operator.h     |  21 +++-
 paddle/fluid/framework/top_utils.cc   |  19 +--
 paddle/fluid/framework/top_utils.h    |   6 +
 paddle/fluid/framework/type_defs.h    |   3 +-
 paddle/fluid/operators/sign_op.cc     |   1 +
 paddle/top/CMakeLists.txt             |   3 +-
 paddle/top/core/backend.h             |   6 +-
 paddle/top/core/convert_utils.cc      |  44 ++++++-
 paddle/top/core/convert_utils.h       |   3 +
 paddle/top/core/kernel_factory.cc     |   5 +
 paddle/top/core/kernel_factory.h      |  17 ++-
 paddle/top/core/kernel_registry.h     |  12 +-
 paddle/top/inferdtype/CMakeLists.txt  |   0
 15 files changed, 283 insertions(+), 35 deletions(-)
 delete mode 100644 paddle/top/inferdtype/CMakeLists.txt

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index d4aa0e78ad57f..74d366c51d028 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -190,10 +190,10 @@ cc_library(unused_var_check SRCS unused_var_check.cc DEPS glog no_need_buffer_va
 
 IF(WITH_XPU)
 cc_library(operator SRCS operator.cc DEPS xpu_op_list op_info device_context tensor scope glog trainer_desc_proto data_feed_proto
-    shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils)
+    shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils top top_utils)
 ELSE()
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog trainer_desc_proto data_feed_proto
-    shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils)
+    shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils top top_utils)
 ENDIF()
 
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index ad030a46b9fa8..80d6be5c33287 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -24,6 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/details/nan_inf_utils.h"
 #include "paddle/fluid/framework/op_call_stack.h"
 #include "paddle/fluid/framework/shape_inference.h"
+#include "paddle/fluid/framework/top_utils.h"
 #include "paddle/fluid/framework/transfer_scope_cache.h"
 #include "paddle/fluid/framework/unused_var_check.h"
 #include "paddle/fluid/framework/var_type.h"
@@ -1073,6 +1074,85 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
   this->InferShape(&infer_shape_ctx);
 }
 
+static OpKernelType TransPtOpKernelKeyToOpKernelType(
+    const pt::OpKernelKey& kernel_key) {
+  proto::VarType::Type data_type = pt::TransToProtoVarType(kernel_key.dtype());
+  platform::Place place = pt::TransToFluidPlace(kernel_key.backend());
+  DataLayout data_layout = pt::TransToFluidDataLayout(kernel_key.layout());
+  LibraryType library_type = LibraryType::kPlain;
+  if (kernel_key.backend() == pt::Backend::kMKLDNN) {
+    library_type = LibraryType::kMKLDNN;
+  } else if (kernel_key.backend() == pt::Backend::kCUDNN) {
+    library_type = LibraryType::kCUDNN;
+  } else {
+    // do nothing
+  }
+  // TODO(chenweihang): the customized_type_value is lost
+  return OpKernelType(data_type, place, data_layout, library_type);
+}
+
+static std::string RuntimeContextDebugString(const RuntimeContext& ctx) {
+  std::stringstream ss;
+  ss << "RuntimeContext(Inputs: ";
+  for (auto& var_pair : ctx.inputs) {
+    ss << var_pair.first << ", ";
+  }
+  ss << "Outputs: ";
+  for (auto& var_pair : ctx.outputs) {
+    ss << var_pair.first << ", ";
+  }
+  ss << ")";
+  return ss.str();
+}
+
+static pt::OpKernelContext BuildOpKernelContext(
+    const pt::OpKernel& pt_kernel, const RuntimeContext& ctx,
+    const platform::DeviceContext& dev_ctx) {
+  VLOG(1) << RuntimeContextDebugString(ctx);
+
+  // TODO(chenweihang): now only work for very simple case (sign op),
+  // many cases need to be deal with later:
+  // 1. the input and output are not tensor
+  // 2. the dispensbale, duplicable input and output
+  // 3. needless attributes remove
+  // 4. use pt Tensor directly
+  // 5. kernel input is not DenseTensor
+  pt::OpKernelContext op_kernel_ctx(dev_ctx);
+  auto input_defs = pt_kernel.param_def().input_defs();
+  auto output_defs = pt_kernel.param_def().output_defs();
+
+  size_t i = 0;
+  for (auto& var_pair : ctx.inputs) {
+    // TODO(chenweihang): deal with diff param in vector
+    auto in_def = input_defs.at(i);
+    for (auto* var : var_pair.second) {
+      const auto& tensor = var->Get<Tensor>();
+      auto pt_in = MakeTensorImpl<pt::DenseTensor>(tensor, in_def.backend,
+                                                   in_def.dtype, in_def.layout);
+      op_kernel_ctx.EmplaceBackInput(pt_in);
+    }
+    ++i;
+  }
+  // ordered_map access mutable value need iter
+  i = 0;
+  for (auto it = ctx.outputs.begin(); it != ctx.outputs.end(); ++it) {
+    auto out_def = output_defs.at(i);
+    for (auto* var : it.value()) {
+      auto* tensor = var->GetMutable<Tensor>();
+      // mutable_data before run kernel, to avoid share output form
+      // OpKernelContext to original tensor
+      tensor->mutable_data(pt::TransToFluidPlace(out_def.backend),
+                           pt::TransToProtoVarType(out_def.dtype));
+      auto pt_out = MakeTensorImpl<pt::DenseTensor>(
+          *tensor, out_def.backend, out_def.dtype, out_def.layout);
+      op_kernel_ctx.EmplaceBackOutput(pt_out);
+    }
+    ++i;
+  }
+  // TODO(chenweihang): append attrs
+  return op_kernel_ctx;
+}
+
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                  const platform::Place& place) const {
   // To reduce the elapsed time of HasAttr, we use bool variable to record the
@@ -1105,8 +1185,18 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto* dev_ctx = pool.Get(place);
 
-  if (kernel_type_.get() == nullptr || kernel_func_.get() == nullptr) {
-    ChooseKernel(*runtime_ctx, scope, place);
+  // TODO(chenweihang): Now we are still reusing a lot of the original fluid
+  // implementation, this is a gradual replacement process
+  run_pt_kernel_ =
+      pt::OpKernelFactory::Instance().ContainsOperation(type_.c_str());
+  if (run_pt_kernel_) {
+    if (pt_kernel_key_.get() == nullptr || pt_kernel_.get() == nullptr) {
+      ChoosePtKernel(*runtime_ctx, *dev_ctx);
+    }
+  } else {
+    if (kernel_type_.get() == nullptr || kernel_func_.get() == nullptr) {
+      ChooseKernel(*runtime_ctx, scope, place);
+    }
   }
 
   // do data transformScope &transfer_scope;
@@ -1116,6 +1206,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
     platform::RecordEvent record_event("prepare_data",
                                        platform::EventRole::kInnerOp);
     if (need_prepare_data_) {
+      if (run_pt_kernel_) {
+        kernel_type_.reset(new OpKernelType(
+            TransPtOpKernelKeyToOpKernelType(*pt_kernel_key_)));
+      }
       transfer_scope = PrepareData(scope, *kernel_type_,
                                    &transfered_inplace_vars, runtime_ctx);
     }
@@ -1144,8 +1238,17 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   {
     platform::RecordEvent record_event("compute",
                                        platform::EventRole::kInnerOp);
-    (*kernel_func_)(
-        ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx));
+    if (run_pt_kernel_) {
+      // TODO(chenweihang): here will intrduce copy
+      auto op_kernel_ctx =
+          BuildOpKernelContext(*pt_kernel_, *runtime_ctx, *dev_ctx);
+      (*pt_kernel_)(&op_kernel_ctx);
+      // need share output into fluid tensor
+
+    } else {
+      (*kernel_func_)(
+          ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx));
+    }
   }
 
   if (!transfered_inplace_vars.empty()) {
@@ -1193,6 +1296,21 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   }
 }
 
+void OperatorWithKernel::ChoosePtKernel(
+    const RuntimeContext& ctx, const platform::DeviceContext& dev_ctx) const {
+  // 1. construct operation name
+  // TODO(chenweihang): add rules for construct op name
+  pt::OperationName op_name(Type().c_str());
+
+  // 2. construct op kernel key
+  pt_kernel_key_.reset(
+      new pt::OpKernelKey(ConstructPtOpKernelKey(ctx, dev_ctx.GetPlace())));
+
+  // 3. selecte op kernel
+  pt_kernel_.reset(new pt::OpKernel(
+      pt::OpKernelFactory::Instance().SelectKernel(op_name, *pt_kernel_key_)));
+}
+
 void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
                                       const Scope& scope,
                                       const platform::Place& place) const {
@@ -1547,11 +1665,10 @@ Scope* OperatorWithKernel::PrepareData(
 }
 
 void OperatorWithKernel::ParseInputDataType(
-    const ExecutionContext& ctx, const std::string& name,
+    const std::vector<Variable*>& vars, const std::string& name,
     proto::VarType::Type* data_type) const {
   proto::VarType::Type default_data_type =
       static_cast<proto::VarType::Type>(-1);
-  const std::vector<Variable*> vars = ctx.MultiInputVar(name);
   for (size_t i = 0; i < vars.size(); ++i) {
     const Variable* var = vars[i];
     if (var != nullptr) {
@@ -1576,7 +1693,7 @@ void OperatorWithKernel::ParseInputDataType(
             platform::errors::InvalidArgument(
                 "The Tensor in the %s Op's Input Variable %s(%s) is "
                 "not initialized.",
-                Type(), name, ctx.InputNames(name).at(i)));
+                Type(), name, Inputs().at(name).at(i)));
         proto::VarType::Type tmp = t->type();
         PADDLE_ENFORCE(
             tmp == *data_type || *data_type == default_data_type,
@@ -1598,7 +1715,8 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
       static_cast<proto::VarType::Type>(-1);
   proto::VarType::Type data_type = dafault_data_type;
   for (auto& input : ctx.InNameList()) {
-    ParseInputDataType(ctx, input, &data_type);
+    const std::vector<Variable*> vars = ctx.MultiInputVar(input);
+    ParseInputDataType(vars, input, &data_type);
   }
   PADDLE_ENFORCE_NE(
       data_type, dafault_data_type,
@@ -1612,7 +1730,7 @@ proto::VarType::Type OperatorWithKernel::IndicateVarDataType(
   proto::VarType::Type dafault_data_type =
       static_cast<proto::VarType::Type>(-1);
   proto::VarType::Type data_type = dafault_data_type;
-  ParseInputDataType(ctx, name, &data_type);
+  ParseInputDataType(ctx.MultiInputVar(name), name, &data_type);
   PADDLE_ENFORCE_NE(
       data_type, dafault_data_type,
       platform::errors::InvalidArgument(
@@ -1695,5 +1813,43 @@ OpKernelType OperatorWithKernel::GetKernelTypeForVar(
                       tensor.layout());
 }
 
+pt::OpKernelKey OperatorWithKernel::ConstructPtOpKernelKey(
+    const RuntimeContext& ctx, const platform::Place& ctx_place) const {
+  // 1. get backend based place and attrs
+  pt::Backend backend = pt::TransToPtBackend(ctx_place);
+  if (HasAttr("use_mkldnn") && Attr<bool>("use_mkldnn") == true) {
+    backend = pt::Backend::kMKLDNN;
+  } else if (HasAttr("use_cudnn") && Attr<bool>("use_cudnn") == true) {
+    backend = pt::Backend::kCUDNN;
+  } else {
+    // do nothing
+  }
+  // TODO(chenweihang): add more rules
+  // if (HasAttr("op_device"))
+
+  // 2. get layout
+  // default layout same as tensor default layout, need futher check
+  pt::DataLayout layout = pt::DataLayout::kNCHW;
+  if (backend == pt::Backend::kMKLDNN) {
+    layout = pt::DataLayout::kMKLDNN;
+  }
+
+  // 3. parse data_type form inputs
+  proto::VarType::Type dafault_data_type =
+      static_cast<proto::VarType::Type>(-1);
+  proto::VarType::Type data_type = dafault_data_type;
+  for (auto& var_pair : ctx.inputs) {
+    ParseInputDataType(var_pair.second, var_pair.first, &data_type);
+  }
+  PADDLE_ENFORCE_NE(
+      data_type, dafault_data_type,
+      platform::errors::NotFound(
+          "DataType should be indicated by input Variable at %s.", Type()));
+  pt::DataType dtype = pt::TransToPtDataType(data_type);
+
+  // 4. build pt OpKernelKey
+  return pt::OpKernelKey(backend, layout, dtype);
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index fc01513a866e4..2309746fa663e 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -38,6 +38,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/variant.h"
 
+#include "paddle/top/api/include/dev/core.h"
+
 namespace paddle {
 namespace framework {
 class InferShapeContext;
@@ -528,6 +530,11 @@ class OperatorWithKernel : public OperatorBase {
     return kernel_type_->place_;
   }
 
+  /* member functions for adapting to top lib */
+  // TODO(chenweihang): Temporarily as a class method
+  virtual pt::OpKernelKey ConstructPtOpKernelKey(
+      const RuntimeContext& ctx, const platform::Place& ctx_place) const;
+
  private:
   void RunImpl(const Scope& scope, const platform::Place& place) const final;
   void RunImpl(const Scope& scope, const platform::Place& place,
@@ -560,12 +567,17 @@ class OperatorWithKernel : public OperatorBase {
   // By default all input data must be same.
   proto::VarType::Type IndicateDataType(const ExecutionContext& ctx) const;
   // used for IndicateDataType
-  void ParseInputDataType(const ExecutionContext& ctx, const std::string& name,
-                          proto::VarType::Type* type) const;
+  void ParseInputDataType(const std::vector<Variable*>& vars,
+                          const std::string& name,
+                          proto::VarType::Type* data_type) const;
   // used for IndicateOrPromoteVarDataTypes
   Tensor* GetTensorFormInputSafely(const ExecutionContext& ctx,
                                    const std::string& name) const;
 
+  /* member functions for adapting to top lib */
+  void ChoosePtKernel(const RuntimeContext& ctx,
+                      const platform::DeviceContext& dev_ctx) const;
+
  protected:
   mutable std::unique_ptr<OpKernelType> kernel_type_;
   mutable std::unique_ptr<OpKernelFunc> kernel_func_;
@@ -576,6 +588,11 @@ class OperatorWithKernel : public OperatorBase {
   mutable bool all_kernels_must_compute_runtime_shape_ = false;
   mutable std::mutex cache_update_mutex_;
   mutable bool enable_cache_transfer_scope_ = false;
+  // TODO(chenweihang): Similar duplicate members are used for new top lib,
+  // maybe we have better impl methods
+  mutable bool run_pt_kernel_ = false;
+  mutable std::unique_ptr<pt::OpKernelKey> pt_kernel_key_;
+  mutable std::unique_ptr<pt::OpKernel> pt_kernel_;
 };
 
 extern bool OpSupportGPU(const std::string& op_type);
diff --git a/paddle/fluid/framework/top_utils.cc b/paddle/fluid/framework/top_utils.cc
index ec3ee3456b4e3..c0386d671a721 100644
--- a/paddle/fluid/framework/top_utils.cc
+++ b/paddle/fluid/framework/top_utils.cc
@@ -14,8 +14,6 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/top_utils.h"
 
-#include "paddle/top/api/include/tensor.h"
-
 namespace paddle {
 namespace framework {
 
@@ -23,13 +21,11 @@ namespace framework {
 
 template <>
 std::shared_ptr<pt::DenseTensor> MakeTensorImpl<pt::DenseTensor>(
-    const Tensor& tensor, const platform::Place& place,
-    proto::VarType::Type type) {
+    const Tensor& tensor, pt::Backend backend, pt::DataType dtype,
+    pt::DataLayout layout) {
   auto holder = tensor.Holder();
   auto tensor_impl = std::make_shared<pt::DenseTensor>(
-      pt::TensorMeta(tensor.dims(), pt::TransToPtBackend(place),
-                     pt::TransToPtDataType(type),
-                     pt::TransToPtLayout(tensor.layout()), tensor.offset()),
+      pt::TensorMeta(tensor.dims(), backend, dtype, layout, tensor.offset()),
       pt::TensorStatus());
 
   if (holder != nullptr) {
@@ -40,6 +36,15 @@ std::shared_ptr<pt::DenseTensor> MakeTensorImpl<pt::DenseTensor>(
   return tensor_impl;
 }
 
+template <>
+std::shared_ptr<pt::DenseTensor> MakeTensorImpl<pt::DenseTensor>(
+    const Tensor& tensor, const platform::Place& place,
+    proto::VarType::Type type) {
+  return MakeTensorImpl<pt::DenseTensor>(tensor, pt::TransToPtBackend(place),
+                                         pt::TransToPtDataType(type),
+                                         pt::TransToPtLayout(tensor.layout()));
+}
+
 template <>
 void ShareTensorImpl<pt::DenseTensor>(pt::DenseTensor* tensor_impl,
                                       Tensor* out) {
diff --git a/paddle/fluid/framework/top_utils.h b/paddle/fluid/framework/top_utils.h
index f382c5f918f13..0411992608119 100644
--- a/paddle/fluid/framework/top_utils.h
+++ b/paddle/fluid/framework/top_utils.h
@@ -22,6 +22,12 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+template <typename TensorImplT>
+std::shared_ptr<TensorImplT> MakeTensorImpl(const Tensor& tensor,
+                                            pt::Backend backend,
+                                            pt::DataType dtype,
+                                            pt::DataLayout layout);
+
 template <typename TensorImplT>
 std::shared_ptr<TensorImplT> MakeTensorImpl(const Tensor& tensor,
                                             const platform::Place& place,
diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h
index f41a26846d8ac..883d442471a33 100644
--- a/paddle/fluid/framework/type_defs.h
+++ b/paddle/fluid/framework/type_defs.h
@@ -47,7 +47,7 @@ class InferNoNeedBufferVarsFN;
  * the value T not modifiable. To modify the value you have to call the value()
  * method of the iterator to get a mutable reference. Example:
  *
- *      tsl::ordered_map<int, int> map = {{1, 1}, {2, 1}, {3, 1}};
+ *      paddle::ordered_map<int, int> map = {{1, 1}, {2, 1}, {3, 1}};
  *      for(auto it = map.begin(); it != map.end(); ++it) {
  *          //it->second = 2; // Illegal
  *          it.value() = 2; // Ok
@@ -67,6 +67,7 @@ using Attribute = boost::variant<
     std::vector<std::string>, bool, std::vector<bool>, BlockDesc*, int64_t,
     std::vector<BlockDesc*>, std::vector<int64_t>, std::vector<double>>;
 
+// TODO(chenweihang): AttirbuteMap also need to be ordered
 using AttributeMap = std::unordered_map<std::string, Attribute>;
 
 #ifdef PADDLE_WITH_ASCEND_CL
diff --git a/paddle/fluid/operators/sign_op.cc b/paddle/fluid/operators/sign_op.cc
index 8620cec8cf62d..b5e8144183c4a 100644
--- a/paddle/fluid/operators/sign_op.cc
+++ b/paddle/fluid/operators/sign_op.cc
@@ -67,6 +67,7 @@ namespace ops = paddle::operators;
 REGISTER_OPERATOR(sign, ops::SignOp, ops::SignOpMaker<float>,
                   ops::SignGradMaker<paddle::framework::OpDesc>,
                   ops::SignGradMaker<paddle::imperative::OpBase>);
+
 REGISTER_OP_CPU_KERNEL(
     sign, ops::SignKernel<paddle::platform::CPUDeviceContext, float>,
     ops::SignKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/top/CMakeLists.txt b/paddle/top/CMakeLists.txt
index a18d72209ebf4..b7c6678696f0e 100644
--- a/paddle/top/CMakeLists.txt
+++ b/paddle/top/CMakeLists.txt
@@ -21,9 +21,8 @@ if(WITH_XPU)
 endif()
 # top kernels for other tensor
 add_subdirectory(selected_rows)
-# top infershape and dtype
+# top infershape
 add_subdirectory(infershape)
-add_subdirectory(inferdtype)
 # top public functors
 add_subdirectory(module)
 # top tests
diff --git a/paddle/top/core/backend.h b/paddle/top/core/backend.h
index db77d2156349c..b1ee09c177f29 100644
--- a/paddle/top/core/backend.h
+++ b/paddle/top/core/backend.h
@@ -33,11 +33,11 @@ enum class Backend {
   kUndef = 0,
   kCPU,
   kCUDA,
-  kCUDAPinned,  // need to be removed
-  kHIP,
+  kCUDAPinned,  // TODO(chenweihang): need to be removed
+  kHIP,         // TODO(chenweihang): hip is not need now
   kXPU,
   kNPU,
-  kNPUPinned,  // need to be removed
+  kNPUPinned,  // TODO(chenweihang): need to be removed
   kMKLDNN,
   kCUDNN,
   kNumBackends,
diff --git a/paddle/top/core/convert_utils.cc b/paddle/top/core/convert_utils.cc
index ab122b60d813a..f49b26113ce8b 100644
--- a/paddle/top/core/convert_utils.cc
+++ b/paddle/top/core/convert_utils.cc
@@ -82,6 +82,28 @@ DataLayout TransToPtLayout(const paddle::framework::DataLayout& layout) {
   }
 }
 
+paddle::platform::Place TransToFluidPlace(const Backend& backend) {
+  // TODO(chenweihang): add other trans cases
+  switch (backend) {
+    case pt::Backend::kCPU:
+      return paddle::platform::CPUPlace();
+    case pt::Backend::kCUDA:
+      return paddle::platform::CUDAPlace();
+    case pt::Backend::kXPU:
+      return paddle::platform::XPUPlace();
+    case pt::Backend::kNPU:
+      return paddle::platform::NPUPlace();
+    case pt::Backend::kMKLDNN:
+      return paddle::platform::CPUPlace();
+    case pt::Backend::kCUDNN:
+      return paddle::platform::CUDAPlace();
+    default:
+      PADDLE_THROW(paddle::platform::errors::Unimplemented(
+          "Unsupported backend `%s` when casting it to paddle place type.",
+          backend));
+  }
+}
+
 paddle::framework::proto::VarType::Type TransToProtoVarType(
     const pt::DataType& dtype) {
   // Set the order of case branches according to the frequency with
@@ -111,9 +133,27 @@ paddle::framework::proto::VarType::Type TransToProtoVarType(
       return paddle::framework::proto::VarType::BOOL;
     default:
       PADDLE_THROW(paddle::platform::errors::Unimplemented(
-          "Unsupported data type code(%d) when casting enum data type into "
+          "Unsupported data type `%s` when casting it into "
           "paddle data type.",
-          static_cast<int>(dtype)));
+          dtype));
+  }
+}
+
+paddle::framework::DataLayout TransToFluidDataLayout(const DataLayout& layout) {
+  switch (layout) {
+    case DataLayout::kNHWC:
+      return paddle::framework::DataLayout::kNHWC;
+    case DataLayout::kNCHW:
+      return paddle::framework::DataLayout::kNCHW;
+    case DataLayout::kAny:
+      return paddle::framework::DataLayout::kAnyLayout;
+    case DataLayout::kMKLDNN:
+      return paddle::framework::DataLayout::kMKLDNN;
+    default:
+      PADDLE_THROW(paddle::platform::errors::Unimplemented(
+          "Unsupported data layout `%s` when casting it into "
+          "paddle data layout.",
+          layout));
   }
 }
 
diff --git a/paddle/top/core/convert_utils.h b/paddle/top/core/convert_utils.h
index 664f3f9a716e9..d95654fd75220 100644
--- a/paddle/top/core/convert_utils.h
+++ b/paddle/top/core/convert_utils.h
@@ -33,7 +33,10 @@ Backend TransToPtBackend(const paddle::platform::Place& place);
 DataType TransToPtDataType(
     const paddle::framework::proto::VarType::Type& dtype);
 DataLayout TransToPtLayout(const paddle::framework::DataLayout& layout);
+
+paddle::platform::Place TransToFluidPlace(const Backend& backend);
 paddle::framework::proto::VarType::Type TransToProtoVarType(
     const DataType& dtype);
+paddle::framework::DataLayout TransToFluidDataLayout(const DataLayout& layout);
 
 }  // namespace pt
diff --git a/paddle/top/core/kernel_factory.cc b/paddle/top/core/kernel_factory.cc
index 5f3b45a75f51b..38e3163d517c5 100644
--- a/paddle/top/core/kernel_factory.cc
+++ b/paddle/top/core/kernel_factory.cc
@@ -24,6 +24,11 @@ OpKernelFactory& OpKernelFactory::Instance() {
   return g_op_kernel_factory;
 }
 
+bool OpKernelFactory::ContainsOperation(const char* op_type) const {
+  auto iter = kernels_.find(OperationName(op_type));
+  return (iter != kernels_.end());
+}
+
 const OpKernel& OpKernelFactory::SelectKernel(
     const OperationName& op_name, const OpKernelKey& kernel_key) const {
   auto iter = kernels_.find(op_name);
diff --git a/paddle/top/core/kernel_factory.h b/paddle/top/core/kernel_factory.h
index 22743b0c0939c..65aa601798e4d 100644
--- a/paddle/top/core/kernel_factory.h
+++ b/paddle/top/core/kernel_factory.h
@@ -138,6 +138,7 @@ class OpKernelKey {
   uint32_t hash_value_;
 };
 
+// TODO(chenweihang): how deal with vector<Param>?
 struct ParamDef {
   Backend backend;
   DataLayout layout;
@@ -159,6 +160,10 @@ class OpKernelParamDef {
     output_defs_.emplace_back(ParamDef(backend, layout, dtype));
   }
 
+  const std::vector<ParamDef>& input_defs() const { return input_defs_; }
+
+  const std::vector<ParamDef>& output_defs() const { return output_defs_; }
+
   void SetSameAsKernelKey() { same_as_kernel_key_ = true; }
 
  private:
@@ -180,13 +185,21 @@ class OpKernel {
 
   void operator()(OpKernelContext* ctx) const { fn_(ctx); }
 
-  OpKernelParamDef& param_def() { return param_def_; }
+  OpKernelParamDef* mutable_param_def() { return &param_def_; }
+
+  const OpKernelParamDef& param_def() const { return param_def_; }
 
  private:
   OpKernelFn fn_{nullptr};
   OpKernelParamDef param_def_;
 };
 
+/**
+ * Note: Each Operation need a basic kernel map that named by op_type.
+ *       Such as for scale op, OpKernelMap contains a `scale` kernel map,
+ *       if it still need other overload kernel, the op name can be
+ *       `scale.***`.
+ */
 class OpKernelFactory {
  public:
   // replaced by paddle::flat_hash_map later
@@ -199,6 +212,8 @@ class OpKernelFactory {
 
   OpKernelMap& kernels() { return kernels_; }
 
+  bool ContainsOperation(const char* op_type) const;
+
   const OpKernel& SelectKernel(const OperationName& op_name,
                                const OpKernelKey& kernel_key) const;
 
diff --git a/paddle/top/core/kernel_registry.h b/paddle/top/core/kernel_registry.h
index 4b7fbad675af1..2f5be38fea820 100644
--- a/paddle/top/core/kernel_registry.h
+++ b/paddle/top/core/kernel_registry.h
@@ -39,8 +39,8 @@ class OpKernelRegistrar {
   OpKernelRegistrar& Input(Backend backend, DataLayout layout, DataType dtype) {
     OpKernelFactory::Instance()
         .kernels()[op_name_][op_kernel_key_]
-        .param_def()
-        .AppendInput(backend, layout, dtype);
+        .mutable_param_def()
+        ->AppendInput(backend, layout, dtype);
     return *this;
   }
 
@@ -49,16 +49,16 @@ class OpKernelRegistrar {
                             DataType dtype) {
     OpKernelFactory::Instance()
         .kernels()[op_name_][op_kernel_key_]
-        .param_def()
-        .AppendOutput(backend, layout, dtype);
+        .mutable_param_def()
+        ->AppendOutput(backend, layout, dtype);
     return *this;
   }
 
   OpKernelRegistrar& SetSameAsKernelKey() {
     OpKernelFactory::Instance()
         .kernels()[op_name_][op_kernel_key_]
-        .param_def()
-        .SetSameAsKernelKey();
+        .mutable_param_def()
+        ->SetSameAsKernelKey();
     return *this;
   }
 
diff --git a/paddle/top/inferdtype/CMakeLists.txt b/paddle/top/inferdtype/CMakeLists.txt
deleted file mode 100644
index e69de29bb2d1d..0000000000000

From a94eefdbd301fcf3469e50ff4219d71f378a1081 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Mon, 23 Aug 2021 06:34:53 +0000
Subject: [PATCH 026/125] fix static op run error

---
 paddle/fluid/framework/operator.cc |  4 ++--
 paddle/top/core/kernel_factory.h   |  6 ------
 paddle/top/core/kernel_registry.h  | 23 +++++++++++------------
 paddle/top/cpu/math.cc             |  4 ++--
 4 files changed, 15 insertions(+), 22 deletions(-)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 80d6be5c33287..a7b177bf60a9d 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1126,7 +1126,7 @@ static pt::OpKernelContext BuildOpKernelContext(
     // TODO(chenweihang): deal with diff param in vector
     auto in_def = input_defs.at(i);
     for (auto* var : var_pair.second) {
-      const auto& tensor = var->Get<Tensor>();
+      const auto& tensor = var->Get<LoDTensor>();
       auto pt_in = MakeTensorImpl<pt::DenseTensor>(tensor, in_def.backend,
                                                    in_def.dtype, in_def.layout);
       op_kernel_ctx.EmplaceBackInput(pt_in);
@@ -1138,7 +1138,7 @@ static pt::OpKernelContext BuildOpKernelContext(
   for (auto it = ctx.outputs.begin(); it != ctx.outputs.end(); ++it) {
     auto out_def = output_defs.at(i);
     for (auto* var : it.value()) {
-      auto* tensor = var->GetMutable<Tensor>();
+      auto* tensor = var->GetMutable<LoDTensor>();
       // mutable_data before run kernel, to avoid share output form
       // OpKernelContext to original tensor
       tensor->mutable_data(pt::TransToFluidPlace(out_def.backend),
diff --git a/paddle/top/core/kernel_factory.h b/paddle/top/core/kernel_factory.h
index 65aa601798e4d..86fa1b6838899 100644
--- a/paddle/top/core/kernel_factory.h
+++ b/paddle/top/core/kernel_factory.h
@@ -164,16 +164,10 @@ class OpKernelParamDef {
 
   const std::vector<ParamDef>& output_defs() const { return output_defs_; }
 
-  void SetSameAsKernelKey() { same_as_kernel_key_ = true; }
-
  private:
   // TODO(chenweihang): replaced by paddle::small_vector
   std::vector<ParamDef> input_defs_{{}};
   std::vector<ParamDef> output_defs_{{}};
-  // if the same_as_kernel_key_ is true, all this kernel's input and output
-  // hold def that same as kernel key, the input_defs_ and output_defs_ are
-  // empty
-  bool same_as_kernel_key_{false};
 };
 
 class OpKernel {
diff --git a/paddle/top/core/kernel_registry.h b/paddle/top/core/kernel_registry.h
index 2f5be38fea820..85feb025ba32d 100644
--- a/paddle/top/core/kernel_registry.h
+++ b/paddle/top/core/kernel_registry.h
@@ -54,14 +54,6 @@ class OpKernelRegistrar {
     return *this;
   }
 
-  OpKernelRegistrar& SetSameAsKernelKey() {
-    OpKernelFactory::Instance()
-        .kernels()[op_name_][op_kernel_key_]
-        .mutable_param_def()
-        ->SetSameAsKernelKey();
-    return *this;
-  }
-
   void Touch() {}
 
  private:
@@ -102,10 +94,10 @@ class OpKernelRegistrar {
                                   ::pt::CppTypeToDataType<dtype>::Type(), \
                                   PT_KERNEL(meta_kernel_fn<dtype>))
 
-#define PT_TORCH_KERNEL_REGISTRAR(op_name, backend, layout, dtype)          \
+#define PT_TOUCH_KERNEL_REGISTRAR(op_name, backend, layout, dtype)          \
   PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                        \
       __touch_pt_op_kernel_##op_name##_##backend##_##layout##_##dtype##__,  \
-      "PT_TORCH_KERNEL_REGISTRAR must be called in global namespace.");     \
+      "PT_TOUCH_KERNEL_REGISTRAR must be called in global namespace.");     \
   int TouchOpKernelRegistrar_##op_name##_##backend##_##dtype##_##layout() { \
     __pt_op_kernel_##op_name##_##backend##_##layout##_##dtype##__.Touch();  \
     return 0;                                                               \
@@ -117,11 +109,18 @@ class OpKernelRegistrar {
  * writing, we provide the following simple kernel registration macro.
  * If it is an special case, please use PT_REGISTER_STANDARD_KERNEL
  */
+// TODO(chenweihang): only work for single input and output now.
+// can we use function traits here to parse the input and output type?
 #define PT_REGISTER_KERNEL_1T(op_name, backend, layout, meta_kernel_fn, dtype) \
   PT_REGISTER_KERNEL_AUTO_SPECIALIZE(                                          \
       op_name, backend, layout, meta_kernel_fn, dtype)                         \
-      .SetSameAsKernelKey();                                                   \
-  PT_TORCH_KERNEL_REGISTRAR(op_name, backend, layout, dtype)
+      .Input(BACKEND(backend),                                                 \
+             DATALAYOUT(layout),                                               \
+             ::pt::CppTypeToDataType<dtype>::Type())                           \
+      .Output(BACKEND(backend),                                                \
+              DATALAYOUT(layout),                                              \
+              ::pt::CppTypeToDataType<dtype>::Type());                         \
+  PT_TOUCH_KERNEL_REGISTRAR(op_name, backend, layout, dtype)
 
 #define PT_REGISTER_KERNEL_2T(                                             \
     op_name, backend, layout, meta_kernel_fn, dtype1, dtype2)              \
diff --git a/paddle/top/cpu/math.cc b/paddle/top/cpu/math.cc
index 670339cb4ba83..9ac430ad25185 100644
--- a/paddle/top/cpu/math.cc
+++ b/paddle/top/cpu/math.cc
@@ -21,13 +21,13 @@ namespace pt {}  // namespace pt
 // PT_KERNEL(pt::Sign<float>))
 //   .Input(BACKEND(CPU), DATALAYOUT(NCHW), DATATYPE(FLOAT32))
 //   .Output(BACKEND(CPU), DATALAYOUT(NCHW), DATATYPE(FLOAT32));
-// PT_TORCH_KERNEL_REGISTRAR(sign, CPU, NCHW, FLOAT32);
+// PT_TOUCH_KERNEL_REGISTRAR(sign, CPU, NCHW, FLOAT32);
 
 // Register method 2:
 // PT_REGISTER_KERNEL_AUTO_SPECIALIZE(sign, CPU, NCHW, pt::Sign, float)
 //   .Input(BACKEND(CPU), DATALAYOUT(NCHW), DATATYPE(FLOAT32))
 //   .Output(BACKEND(CPU), DATALAYOUT(NCHW), DATATYPE(FLOAT32));
-// PT_TORCH_KERNEL_REGISTRAR(sign, CPU, NCHW, FLOAT32);
+// PT_TOUCH_KERNEL_REGISTRAR(sign, CPU, NCHW, FLOAT32);
 
 // Register method 3:
 PT_REGISTER_KERNEL_2T(sign, CPU, NCHW, pt::Sign, float, double);

From 021a505a5514980acf97ab32a40d3bba3e63404c Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Mon, 23 Aug 2021 08:41:21 +0000
Subject: [PATCH 027/125] fix new executor compile failed

---
 .../framework/new_executor/interpretercore.cc    | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index 7f6091742f02b..c530ab945b9aa 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -365,16 +365,16 @@ void InterpreterCore::BuildOpFuncList(const platform::Place& place,
 
     VariableValueMap& ins_map_temp = runtime_context.inputs;
 
-    for (auto& var_name_item : ins_map_temp) {
-      for (size_t i = 0; i < var_name_item.second.size(); ++i) {
-        auto var = var_name_item.second[i];
+    for (auto it = ins_map_temp.begin(); it != ins_map_temp.end(); ++it) {
+      for (size_t i = 0; i < it.value().size(); ++i) {
+        auto var = it.value()[i];
         auto tensor_in = static_cast<const Tensor*>(&(var->Get<LoDTensor>()));
         if (!tensor_in->IsInitialized()) {
           continue;
         }
         auto kernel_type_for_var =
             static_cast<const framework::OperatorWithKernel*>(op_base)
-                ->GetKernelTypeForVar(var_name_item.first, *tensor_in,
+                ->GetKernelTypeForVar(it->first, *tensor_in,
                                       expected_kernel_key);
         if (!platform::is_same_place(kernel_type_for_var.place_,
                                      expected_kernel_key.place_)) {
@@ -389,7 +389,7 @@ void InterpreterCore::BuildOpFuncList(const platform::Place& place,
           var_scope->var_list.push_back(v);
 
           VariableNameMap copy_in_map;
-          auto x_iter = inputs_names.find(var_name_item.first);
+          auto x_iter = inputs_names.find(it->first);
           copy_in_map["X"] = {x_iter->second[i]};
           VariableNameMap copy_out_map;
           copy_out_map["Out"] = {new_var_name};
@@ -398,11 +398,11 @@ void InterpreterCore::BuildOpFuncList(const platform::Place& place,
               is_cpu_place(place) ? 0 : is_gpu_place(place) ? 1 : -1;
 
           std::map<std::string, std::vector<int>> copy_ins_name2id;
-          copy_ins_name2id["X"] = ins_name2id[var_name_item.first];
+          copy_ins_name2id["X"] = ins_name2id[it->first];
           std::map<std::string, std::vector<int>> copy_out_name2id;
           copy_out_name2id["Out"] = {var_scope->name2id[new_var_name]};
 
-          op_func_node.input_index[var_name_item.first][i] =
+          op_func_node.input_index[it->first][i] =
               var_scope->name2id[new_var_name];
 
           VariableValueMap copy_ins_value_map;
@@ -448,7 +448,7 @@ void InterpreterCore::BuildOpFuncList(const platform::Place& place,
           op_list->push_back(copy_op);
           vec_func_list->push_back(copy_op_func_node);
 
-          var_name_item.second[i] = v;
+          it.value()[i] = v;
         }
       }
     }

From f24e45ee2d4e04d6c83661fd58c4360e256eb3d2 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Wed, 25 Aug 2021 12:43:44 +0000
Subject: [PATCH 028/125] add dygraph branch & remove sign_op.h

---
 paddle/fluid/framework/operator.cc           |  10 +-
 paddle/fluid/framework/operator.h            |   5 +-
 paddle/fluid/imperative/prepared_operator.cc | 224 +++++++++++++++----
 paddle/fluid/imperative/prepared_operator.h  |  12 +
 paddle/fluid/operators/sign_op.cc            |  22 +-
 paddle/fluid/operators/sign_op.h             |  60 -----
 paddle/fluid/pybind/op_function_generator.cc |   4 +-
 paddle/top/core/kernel_factory.h             |   8 +-
 paddle/top/cuda/math.cu                      |   7 +-
 9 files changed, 211 insertions(+), 141 deletions(-)
 delete mode 100644 paddle/fluid/operators/sign_op.h

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index a7b177bf60a9d..ebde73b03778e 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1074,7 +1074,7 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
   this->InferShape(&infer_shape_ctx);
 }
 
-static OpKernelType TransPtOpKernelKeyToOpKernelType(
+OpKernelType TransPtOpKernelKeyToOpKernelType(
     const pt::OpKernelKey& kernel_key) {
   proto::VarType::Type data_type = pt::TransToProtoVarType(kernel_key.dtype());
   platform::Place place = pt::TransToFluidPlace(kernel_key.backend());
@@ -1303,8 +1303,8 @@ void OperatorWithKernel::ChoosePtKernel(
   pt::OperationName op_name(Type().c_str());
 
   // 2. construct op kernel key
-  pt_kernel_key_.reset(
-      new pt::OpKernelKey(ConstructPtOpKernelKey(ctx, dev_ctx.GetPlace())));
+  pt_kernel_key_.reset(new pt::OpKernelKey(
+      ConstructPtOpKernelKey(ctx.inputs, dev_ctx.GetPlace())));
 
   // 3. selecte op kernel
   pt_kernel_.reset(new pt::OpKernel(
@@ -1814,7 +1814,7 @@ OpKernelType OperatorWithKernel::GetKernelTypeForVar(
 }
 
 pt::OpKernelKey OperatorWithKernel::ConstructPtOpKernelKey(
-    const RuntimeContext& ctx, const platform::Place& ctx_place) const {
+    const VariableValueMap& inputs, const platform::Place& ctx_place) const {
   // 1. get backend based place and attrs
   pt::Backend backend = pt::TransToPtBackend(ctx_place);
   if (HasAttr("use_mkldnn") && Attr<bool>("use_mkldnn") == true) {
@@ -1838,7 +1838,7 @@ pt::OpKernelKey OperatorWithKernel::ConstructPtOpKernelKey(
   proto::VarType::Type dafault_data_type =
       static_cast<proto::VarType::Type>(-1);
   proto::VarType::Type data_type = dafault_data_type;
-  for (auto& var_pair : ctx.inputs) {
+  for (auto& var_pair : inputs) {
     ParseInputDataType(var_pair.second, var_pair.first, &data_type);
   }
   PADDLE_ENFORCE_NE(
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 2309746fa663e..5d62b187973c0 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -115,6 +115,9 @@ inline std::string GradOriginalVarName(const std::string& grad_var_name) {
 const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var);
 Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var);
 
+OpKernelType TransPtOpKernelKeyToOpKernelType(
+    const pt::OpKernelKey& kernel_key);
+
 class ExecutionContext;
 class OperatorBase;
 
@@ -533,7 +536,7 @@ class OperatorWithKernel : public OperatorBase {
   /* member functions for adapting to top lib */
   // TODO(chenweihang): Temporarily as a class method
   virtual pt::OpKernelKey ConstructPtOpKernelKey(
-      const RuntimeContext& ctx, const platform::Place& ctx_place) const;
+      const VariableValueMap& inputs, const platform::Place& ctx_place) const;
 
  private:
   void RunImpl(const Scope& scope, const platform::Place& place) const final;
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 93f2fd38a7306..94bdc3a2b26f6 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -16,6 +16,7 @@
 
 #include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/details/nan_inf_utils.h"
+#include "paddle/fluid/framework/top_utils.h"
 #include "paddle/fluid/imperative/infer_shape_context.h"
 #ifdef PADDLE_WITH_XPU
 #include "paddle/fluid/platform/xpu/xpu_op_list.h"
@@ -88,6 +89,37 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op,
       func_(func),
       dev_ctx_(dev_ctx) {}
 
+PreparedOp::PreparedOp(const framework::OperatorBase& op,
+                       const framework::RuntimeContext& ctx,
+                       const pt::OpKernelKey& pt_kernel_key,
+                       const pt::OpKernel& pt_kernel,
+                       platform::DeviceContext* dev_ctx)
+    : op_(op),
+      ctx_(ctx),
+      kernel_type_(framework::OpKernelType(framework::proto::VarType::RAW,
+                                           platform::CPUPlace())),
+      func_(nullptr),
+      dev_ctx_(dev_ctx),
+      run_pt_kernel_(true),
+      pt_kernel_key_(pt_kernel_key),
+      pt_kernel_(pt_kernel) {
+  // TODO(chenweihang): PrepareData still use old impl, so here need save
+  // old kernel type, trans it later
+  kernel_type_ = framework::TransPtOpKernelKeyToOpKernelType(pt_kernel_key_);
+}
+
+template <typename VarType>
+static framework::VariableValueMap BuildInputMap(
+    const NameVarMap<VarType>& ins) {
+  framework::VariableValueMap inputs;
+  for (auto& var_pair : ins) {
+    for (auto& var : var_pair.second) {
+      inputs[var_pair.first].emplace_back(var->MutableVar());
+    }
+  }
+  return inputs;
+}
+
 template <typename VarType>
 PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
                        const NameVarMap<VarType>& outs,
@@ -114,55 +146,70 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
 #endif
 
   // 1. get expected kernel key
-  auto expected_kernel_key = op.GetExpectedKernelType(
-      DygraphExecutionContext<VarType>(op, framework::Scope(), *dev_ctx, ctx,
-                                       ins, outs, attrs, default_attrs));
-  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
-
-  // 2. check if op[type] has kernel registered.
-  auto& all_op_kernels = op.AllOpKernels();
-  auto kernels_iter = all_op_kernels.find(op.Type());
-  PADDLE_ENFORCE_NE(
-      kernels_iter, all_op_kernels.end(),
-      platform::errors::NotFound(
-          "There are no kernels which are registered in the %s operator.",
-          op.Type()));
-
-  auto& kernels = kernels_iter->second;
-  auto kernel_iter = kernels.find(expected_kernel_key);
+  bool run_pt_kernel =
+      pt::OpKernelFactory::Instance().ContainsOperation(op.Type().c_str());
+  if (run_pt_kernel) {
+    pt::OperationName op_name(op.Type().c_str());
+    auto inputs = BuildInputMap<VarType>(ins);
+    auto pt_kernel_key = op.ConstructPtOpKernelKey(inputs, place);
+    auto pt_kernel =
+        pt::OpKernelFactory::Instance().SelectKernel(op_name, pt_kernel_key);
+    // TODO(chenweihang): using CPUKernel when miss device kernel case
+    return PreparedOp(op, ctx, pt_kernel_key, pt_kernel, dev_ctx);
+  } else {
+    auto expected_kernel_key = op.GetExpectedKernelType(
+        DygraphExecutionContext<VarType>(op, framework::Scope(), *dev_ctx, ctx,
+                                         ins, outs, attrs, default_attrs));
+    VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
+
+    // 2. check if op[type] has kernel registered.
+    auto& all_op_kernels = op.AllOpKernels();
+    auto kernels_iter = all_op_kernels.find(op.Type());
+    PADDLE_ENFORCE_NE(
+        kernels_iter, all_op_kernels.end(),
+        platform::errors::NotFound(
+            "There are no kernels which are registered in the %s operator.",
+            op.Type()));
+
+    auto& kernels = kernels_iter->second;
+    auto kernel_iter = kernels.find(expected_kernel_key);
 #ifdef PADDLE_WITH_XPU
-  if ((kernel_iter == kernels.end() &&
-       is_xpu_place(expected_kernel_key.place_) &&
-       !paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key)) ||
-      paddle::platform::is_in_xpu_black_list(op.Type())) {
-    VLOG(3) << "missing XPU kernel: " << op.Type()
-            << ", expected_kernel_key:" << expected_kernel_key
-            << ", fallbacking to CPU one!";
-    expected_kernel_key.place_ = platform::CPUPlace();
-    kernel_iter = kernels.find(expected_kernel_key);
-  }
+    if ((kernel_iter == kernels.end() &&
+         is_xpu_place(expected_kernel_key.place_) &&
+         !paddle::platform::is_xpu_support_op(op.Type(),
+                                              expected_kernel_key)) ||
+        paddle::platform::is_in_xpu_black_list(op.Type())) {
+      VLOG(3) << "missing XPU kernel: " << op.Type()
+              << ", expected_kernel_key:" << expected_kernel_key
+              << ", fallbacking to CPU one!";
+      expected_kernel_key.place_ = platform::CPUPlace();
+      kernel_iter = kernels.find(expected_kernel_key);
+    }
 #endif
 #ifdef PADDLE_WITH_ASCEND_CL
-  if (kernel_iter == kernels.end() &&
-      is_npu_place(expected_kernel_key.place_)) {
-    VLOG(3) << "missing NPU kernel: " << op.Type()
-            << ", expected_kernel_key:" << expected_kernel_key
-            << ", fallbacking to CPU one!";
-    expected_kernel_key.place_ = platform::CPUPlace();
-    kernel_iter = kernels.find(expected_kernel_key);
-  }
+    if (kernel_iter == kernels.end() &&
+        is_npu_place(expected_kernel_key.place_)) {
+      VLOG(3) << "missing NPU kernel: " << op.Type()
+              << ", expected_kernel_key:" << expected_kernel_key
+              << ", fallbacking to CPU one!";
+      expected_kernel_key.place_ = platform::CPUPlace();
+      kernel_iter = kernels.find(expected_kernel_key);
+    }
 #endif
-  // TODO(jiabin): Add operator.cc's line 1000 part back when we need that case
-  PADDLE_ENFORCE_NE(kernel_iter, kernels.end(),
-                    platform::errors::NotFound(
-                        "Operator %s does not have kernel for %s.", op.Type(),
-                        KernelTypeToString(expected_kernel_key)));
-
-  if (!(expected_kernel_key.place_ == place)) {
-    dev_ctx = pool.Get(expected_kernel_key.place_);
-  }
+    // TODO(jiabin): Add operator.cc's line 1000 part back when we need that
+    // case
+    PADDLE_ENFORCE_NE(kernel_iter, kernels.end(),
+                      platform::errors::NotFound(
+                          "Operator %s does not have kernel for %s.", op.Type(),
+                          KernelTypeToString(expected_kernel_key)));
 
-  return PreparedOp(op, ctx, expected_kernel_key, kernel_iter->second, dev_ctx);
+    if (!(expected_kernel_key.place_ == place)) {
+      dev_ctx = pool.Get(expected_kernel_key.place_);
+    }
+
+    return PreparedOp(op, ctx, expected_kernel_key, kernel_iter->second,
+                      dev_ctx);
+  }
 }
 
 PreparedOp PreparedOp::Prepare(const NameVarMap<VarBase>& ins,
@@ -184,6 +231,54 @@ PreparedOp PreparedOp::Prepare(const NameVarMap<VariableWrapper>& ins,
                                       default_attrs);
 }
 
+template <typename VarType>
+static pt::OpKernelContext BuildDygraphOpKernelContext(
+    const pt::OpKernel& pt_kernel, const NameVarMap<VarType>& ins,
+    const NameVarMap<VarType>& outs, const platform::DeviceContext& dev_ctx) {
+  // TODO(chenweihang): now only work for very simple case (sign op),
+  // many cases need to be deal with later:
+  // 1. the input and output are not tensor
+  // 2. the dispensbale, duplicable input and output
+  // 3. needless attributes remove
+  // 4. use pt Tensor directly
+  // 5. kernel input is not DenseTensor
+  pt::OpKernelContext op_kernel_ctx(dev_ctx);
+  auto input_defs = pt_kernel.param_def().input_defs();
+  auto output_defs = pt_kernel.param_def().output_defs();
+
+  size_t i = 0;
+  for (auto& var_pair : ins) {
+    auto in_def = input_defs.at(i);
+    for (auto var : var_pair.second) {
+      const auto& variable = var->template Var();
+      const auto& tensor = variable.template Get<framework::LoDTensor>();
+      auto pt_in = framework::MakeTensorImpl<pt::DenseTensor>(
+          tensor, in_def.backend, in_def.dtype, in_def.layout);
+      op_kernel_ctx.EmplaceBackInput(pt_in);
+    }
+    ++i;
+  }
+
+  i = 0;
+  for (auto it = outs.begin(); it != outs.end(); ++it) {
+    auto out_def = output_defs.at(i);
+    for (auto var : it->second) {
+      auto* variable = var->template MutableVar();
+      auto* tensor = variable->template GetMutable<framework::LoDTensor>();
+      // mutable_data before run kernel, to avoid share output form
+      // OpKernelContext to original tensor
+      tensor->mutable_data(pt::TransToFluidPlace(out_def.backend),
+                           pt::TransToProtoVarType(out_def.dtype));
+      auto pt_out = framework::MakeTensorImpl<pt::DenseTensor>(
+          *tensor, out_def.backend, out_def.dtype, out_def.layout);
+      op_kernel_ctx.EmplaceBackOutput(pt_out);
+    }
+    ++i;
+  }
+  // TODO(chenweihang): append attrs
+  return op_kernel_ctx;
+}
+
 template <typename VarType>
 static void PreparedOpRunImpl(
     const framework::OperatorBase& op, const framework::RuntimeContext& ctx,
@@ -225,20 +320,53 @@ static void PreparedOpRunImpl(
   }
 }
 
+template <typename VarType>
+static void PreparedOpRunPtImpl(const framework::OperatorBase& op,
+                                const pt::OpKernelKey& pt_kernel_key,
+                                const pt::OpKernel& pt_kernel,
+                                platform::DeviceContext* dev_ctx,
+                                const NameVarMap<VarType>& ins,
+                                const NameVarMap<VarType>& outs,
+                                const framework::AttributeMap& attrs,
+                                const framework::AttributeMap& default_attrs) {
+  DygraphInferShapeContext<VarType> infer_shape_ctx(&ins, &outs, &attrs,
+                                                    &default_attrs, op.Type());
+  static_cast<const framework::OperatorWithKernel&>(op).InferShape(
+      &infer_shape_ctx);
+
+  auto op_kernel_ctx =
+      BuildDygraphOpKernelContext<VarType>(pt_kernel, ins, outs, *dev_ctx);
+  pt_kernel(&op_kernel_ctx);
+
+  // TODO(chenweihang): add flags
+  // TODO(chenweihang): deal with complex cases
+}
+
 void PreparedOp::Run(const NameVarMap<VarBase>& ins,
                      const NameVarMap<VarBase>& outs,
                      const framework::AttributeMap& attrs,
                      const framework::AttributeMap& default_attrs) {
-  PreparedOpRunImpl<VarBase>(op_, ctx_, kernel_type_, func_, dev_ctx_, ins,
-                             outs, attrs, default_attrs);
+  if (run_pt_kernel_) {
+    PreparedOpRunPtImpl<VarBase>(op_, pt_kernel_key_, pt_kernel_, dev_ctx_, ins,
+                                 outs, attrs, default_attrs);
+  } else {
+    PreparedOpRunImpl<VarBase>(op_, ctx_, kernel_type_, func_, dev_ctx_, ins,
+                               outs, attrs, default_attrs);
+  }
 }
 
 void PreparedOp::Run(const NameVarMap<VariableWrapper>& ins,
                      const NameVarMap<VariableWrapper>& outs,
                      const framework::AttributeMap& attrs,
                      const framework::AttributeMap& default_attrs) {
-  PreparedOpRunImpl<VariableWrapper>(op_, ctx_, kernel_type_, func_, dev_ctx_,
-                                     ins, outs, attrs, default_attrs);
+  if (run_pt_kernel_) {
+    PreparedOpRunPtImpl<VariableWrapper>(op_, pt_kernel_key_, pt_kernel_,
+                                         dev_ctx_, ins, outs, attrs,
+                                         default_attrs);
+  } else {
+    PreparedOpRunImpl<VariableWrapper>(op_, ctx_, kernel_type_, func_, dev_ctx_,
+                                       ins, outs, attrs, default_attrs);
+  }
 }
 
 }  // namespace imperative
diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h
index 53f876c498cd0..c831399a42aa1 100644
--- a/paddle/fluid/imperative/prepared_operator.h
+++ b/paddle/fluid/imperative/prepared_operator.h
@@ -25,6 +25,8 @@
 #include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/imperative/type_defs.h"
 
+#include "paddle/top/api/include/dev/core.h"
+
 DECLARE_bool(use_mkldnn);
 
 namespace paddle {
@@ -147,6 +149,11 @@ class PreparedOp {
              const framework::OperatorWithKernel::OpKernelFunc& func,
              platform::DeviceContext* dev_ctx);
 
+  PreparedOp(const framework::OperatorBase& op,
+             const framework::RuntimeContext& ctx,
+             const pt::OpKernelKey& pt_kernel_key,
+             const pt::OpKernel& pt_kernel, platform::DeviceContext* dev_ctx);
+
   static PreparedOp Prepare(const NameVarMap<VarBase>& ins,
                             const NameVarMap<VarBase>& outs,
                             const framework::OperatorWithKernel& op,
@@ -178,6 +185,11 @@ class PreparedOp {
   framework::OpKernelType kernel_type_;
   framework::OperatorWithKernel::OpKernelFunc func_;
   platform::DeviceContext* dev_ctx_;
+  // TODo(chenweihang): Similar duplicate members are used for new top lib,
+  // maybe we have better impl methods
+  bool run_pt_kernel_{false};
+  pt::OpKernelKey pt_kernel_key_;
+  pt::OpKernel pt_kernel_;
 };
 
 }  // namespace imperative
diff --git a/paddle/fluid/operators/sign_op.cc b/paddle/fluid/operators/sign_op.cc
index b5e8144183c4a..83c1955758f20 100644
--- a/paddle/fluid/operators/sign_op.cc
+++ b/paddle/fluid/operators/sign_op.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/sign_op.h"
 #include <memory>
-#include "paddle/fluid/platform/float16.h"
+
+#include "paddle/fluid/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
@@ -67,21 +67,3 @@ namespace ops = paddle::operators;
 REGISTER_OPERATOR(sign, ops::SignOp, ops::SignOpMaker<float>,
                   ops::SignGradMaker<paddle::framework::OpDesc>,
                   ops::SignGradMaker<paddle::imperative::OpBase>);
-
-REGISTER_OP_CPU_KERNEL(
-    sign, ops::SignKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::SignKernel<paddle::platform::CPUDeviceContext, double>);
-
-#ifdef PADDLE_WITH_CUDA
-REGISTER_OP_CUDA_KERNEL(
-    sign,
-    paddle::operators::SignKernel<paddle::platform::CUDADeviceContext, float>,
-    paddle::operators::SignKernel<paddle::platform::CUDADeviceContext, double>,
-    paddle::operators::SignKernel<paddle::platform::CUDADeviceContext,
-                                  paddle::platform::float16>);
-#endif
-
-#ifdef PADDLE_WITH_XPU
-REGISTER_OP_XPU_KERNEL(
-    sign, ops::SignKernel<paddle::platform::XPUDeviceContext, float>);
-#endif
diff --git a/paddle/fluid/operators/sign_op.h b/paddle/fluid/operators/sign_op.h
deleted file mode 100644
index 954013817267f..0000000000000
--- a/paddle/fluid/operators/sign_op.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/top_utils.h"
-#include "paddle/fluid/operators/eigen/eigen_function.h"
-
-// only can include the headers in paddle/top/api dirs
-#include "paddle/top/api/include/dev/core.h"
-#include "paddle/top/api/include/dev/math.h"
-
-namespace paddle {
-namespace operators {
-template <typename DeviceContext, typename T>
-class SignKernel : public framework::OpKernel<T> {
- public:
-  virtual void Compute(const framework::ExecutionContext& context) const {
-    auto* x = context.Input<framework::Tensor>("X");
-    auto* out = context.Output<framework::Tensor>("Out");
-    auto& dev_ctx = context.device_context<DeviceContext>();
-
-    // debug: print all registered sign kernels for check
-    VLOG(1) << pt::OpKernelFactory::Instance();
-
-    // TODO(chenweihang): only to test correctness, this will introduce
-    // needless context prepare cost
-    pt::OpKernelContext op_kernel_ctx(dev_ctx);
-    auto pt_x =
-        framework::MakeTensorImpl<pt::DenseTensor>(*x, x->place(), x->type());
-    auto pt_out =
-        framework::MakeTensorImpl<pt::DenseTensor>(*out, x->place(), x->type());
-    op_kernel_ctx.EmplaceBackInput(pt_x);
-    op_kernel_ctx.EmplaceBackOutput(pt_out);
-
-    auto& op_kernel = pt::OpKernelFactory::Instance().SelectKernel(
-        "sign", pt::TransToPtBackend(x->place()),
-        pt::TransToPtLayout(x->layout()), pt::TransToPtDataType(x->type()));
-    op_kernel(&op_kernel_ctx);
-
-    // share pt_out data to out
-    framework::ShareTensorImpl(pt_out.get(), out);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc
index 4b610f3bccba0..e8d24e255aa1e 100644
--- a/paddle/fluid/pybind/op_function_generator.cc
+++ b/paddle/fluid/pybind/op_function_generator.cc
@@ -533,7 +533,9 @@ GenerateOpFunctions() {
     auto& op_type = op_proto->type();
     // Skip ooerator which is not inherit form OperatorWithKernel, like while,
     // since only OperatorWithKernel can run in dygraph mode.
-    if (!all_kernels.count(op_type)) {
+    // if the top lib contains op kernel, we still generate ops method
+    if (!all_kernels.count(op_type) &&
+        !pt::OpKernelFactory::Instance().ContainsOperation(op_type.c_str())) {
       continue;
     }
 
diff --git a/paddle/top/core/kernel_factory.h b/paddle/top/core/kernel_factory.h
index 86fa1b6838899..53c43d26fb047 100644
--- a/paddle/top/core/kernel_factory.h
+++ b/paddle/top/core/kernel_factory.h
@@ -85,6 +85,8 @@ struct OperationName final {
 
 class OpKernelKey {
  public:
+  OpKernelKey() = default;
+
   OpKernelKey(Backend backend, DataLayout layout, DataType dtype)
       : backend_(backend), layout_(layout), dtype_(dtype) {
     // |----31-20------|---19-12---|---11-8----|---7-0---|
@@ -127,9 +129,9 @@ class OpKernelKey {
   constexpr static int kDataLayoutBitLength = 4;
   constexpr static int kDataTypeBitLength = 8;
 
-  Backend backend_;
-  DataLayout layout_;
-  DataType dtype_;
+  Backend backend_{Backend::kUndef};
+  DataLayout layout_{DataLayout::kUndef};
+  DataType dtype_{DataType::kUndef};
 
   // Avoid calculating Hash value at runtime.
   // Note: Now the number of bits we need does not exceed 32 bits, so there is
diff --git a/paddle/top/cuda/math.cu b/paddle/top/cuda/math.cu
index 55184f7ff2431..501e12a7d22f1 100644
--- a/paddle/top/cuda/math.cu
+++ b/paddle/top/cuda/math.cu
@@ -22,6 +22,7 @@ limitations under the License. */
 namespace cub = hipcub;
 #endif
 
+#include "paddle/fluid/platform/float16.h"
 #include "paddle/top/core/convert_utils.h"
 #include "paddle/top/core/kernel_registry.h"
 
@@ -87,6 +88,6 @@ template void Mean<paddle::platform::float16>(const CUDAContext& dev_ctx,
 
 }  // namespace pt
 
-// PT_REGISTER_KERNEL_3T(sign, CUDA, NCHW, pt::Sign, float, double,
-// pt::float16);
-PT_REGISTER_KERNEL_2T(sign, CUDA, NCHW, pt::Sign, float, double);
+using float16 = paddle::platform::float16;
+PT_REGISTER_KERNEL_3T(sign, CUDA, NCHW, pt::Sign, float, double, float16);
+// PT_REGISTER_KERNEL_2T(sign, CUDA, NCHW, pt::Sign, float, double);

From 44acc84004f8008048448cab4cb0de4f2e39a1b9 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Thu, 26 Aug 2021 06:34:33 +0000
Subject: [PATCH 029/125] fix test_infer_no_need_buffer_slots

---
 paddle/fluid/pybind/pybind.cc                 | 26 +++++++++----------
 .../test_infer_no_need_buffer_slots.py        | 19 +++-----------
 2 files changed, 15 insertions(+), 30 deletions(-)

diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 4f74262895044..1b45944157ae3 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1381,20 +1381,18 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("has_infer_inplace", [](const std::string op_type) {
     return framework::OpInfoMap::Instance().Get(op_type).HasInferInplace();
   });
-  m.def("infer_no_need_buffer_slots",
-        [](const std::string op_type, const framework::VariableNameMap &inputs,
-           const framework::VariableNameMap &outputs,
-           const framework::AttributeMap &attrs) {
-          auto infer_func = framework::OpInfoMap::Instance()
-                                .Get(op_type)
-                                .NoNeedBufferVarsInferer();
-          if (infer_func) {
-            return infer_func(inputs, outputs, attrs);
-          } else {
-            std::unordered_set<std::string> empty = {};
-            return empty;
-          }
-        });
+  m.def("infer_no_need_buffer_slots", [](const OpDesc &op_desc) {
+    auto infer_func = framework::OpInfoMap::Instance()
+                          .Get(op_desc.Type())
+                          .NoNeedBufferVarsInferer();
+    if (infer_func) {
+      return infer_func(op_desc.Inputs(), op_desc.Outputs(),
+                        op_desc.GetAttrMap());
+    } else {
+      std::unordered_set<std::string> empty = {};
+      return empty;
+    }
+  });
   m.def("prune", [](const ProgramDesc &origin,
                     const std::set<std::string> &feeded_var_names,
                     const std::vector<std::array<size_t, 2>> &targets) {
diff --git a/python/paddle/fluid/tests/unittests/test_infer_no_need_buffer_slots.py b/python/paddle/fluid/tests/unittests/test_infer_no_need_buffer_slots.py
index 3656cdfd5a034..f773d94141faf 100644
--- a/python/paddle/fluid/tests/unittests/test_infer_no_need_buffer_slots.py
+++ b/python/paddle/fluid/tests/unittests/test_infer_no_need_buffer_slots.py
@@ -41,31 +41,18 @@ def test_infer_no_need_buffer_slots(self):
 
         block = program.global_block()
         for idx, op in enumerate(block.ops):
-            op_desc = op.desc
-            inputs = {}
-            for input_name in op_desc.input_names():
-                inputs[input_name] = op_desc.input(input_name)
-            outputs = {}
-            for output_name in op_desc.output_names():
-                outputs[output_name] = op_desc.output(output_name)
-            attrs = {}
-            for attr_name in op_desc.attr_names():
-                attrs[attr_name] = op_desc.attr(attr_name)
             if idx == 0:
                 # elementwise_add op
                 self.assertEqual(
-                    core.infer_no_need_buffer_slots(op.type, inputs, outputs,
-                                                    attrs), set([]))
+                    core.infer_no_need_buffer_slots(op.desc), set([]))
             elif idx == 1:
                 # fill constant op
                 self.assertEqual(
-                    core.infer_no_need_buffer_slots(op.type, inputs, outputs,
-                                                    attrs), set([]))
+                    core.infer_no_need_buffer_slots(op.desc), set([]))
             else:
                 # elementwise_add_grad op
                 self.assertEqual(
-                    core.infer_no_need_buffer_slots(op.type, inputs, outputs,
-                                                    attrs), set(['Y', 'X']))
+                    core.infer_no_need_buffer_slots(op.desc), set(['Y', 'X']))
 
 
 if __name__ == '__main__':

From 2b66ab49d8643238eaa15526a87347ddecb53cea Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Thu, 26 Aug 2021 06:55:11 +0000
Subject: [PATCH 030/125] fix rocm compile link error

---
 paddle/top/CMakeLists.txt     | 6 ++----
 paddle/top/hip/CMakeLists.txt | 1 -
 2 files changed, 2 insertions(+), 5 deletions(-)
 delete mode 100644 paddle/top/hip/CMakeLists.txt

diff --git a/paddle/top/CMakeLists.txt b/paddle/top/CMakeLists.txt
index b7c6678696f0e..42e8087ac36be 100644
--- a/paddle/top/CMakeLists.txt
+++ b/paddle/top/CMakeLists.txt
@@ -4,12 +4,10 @@ add_subdirectory(api)
 add_subdirectory(core)
 # top kernels for diff device
 add_subdirectory(cpu)
-if(WITH_GPU)
+if(WITH_GPU OR WITH_ROCM)
   add_subdirectory(cuda)
 endif()
-if(WITH_ROCM)
-  add_subdirectory(hip)
-endif()
+# TODO(chenweihang): if hip can split from cuda impl, we should add hip dir
 if(WITH_MKLDNN)
   add_subdirectory(mkldnn)
 endif()
diff --git a/paddle/top/hip/CMakeLists.txt b/paddle/top/hip/CMakeLists.txt
deleted file mode 100644
index 2ff5ff075ccb6..0000000000000
--- a/paddle/top/hip/CMakeLists.txt
+++ /dev/null
@@ -1 +0,0 @@
-# hip use cuda api now, maybe this dir is needless

From 2a5ce9b216b5edeefba7798547b8df2c75152096 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Thu, 26 Aug 2021 07:35:49 +0000
Subject: [PATCH 031/125] fix unitybuild error & clear glog

---
 paddle/fluid/framework/top_utils.cc           | 4 ++--
 paddle/fluid/operators/unity_build_rule.cmake | 1 -
 paddle/top/core/dense_tensor.cc               | 1 -
 paddle/top/xpu/math.h                         | 2 +-
 4 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/framework/top_utils.cc b/paddle/fluid/framework/top_utils.cc
index c0386d671a721..9431a9d3f9c07 100644
--- a/paddle/fluid/framework/top_utils.cc
+++ b/paddle/fluid/framework/top_utils.cc
@@ -31,7 +31,7 @@ std::shared_ptr<pt::DenseTensor> MakeTensorImpl<pt::DenseTensor>(
   if (holder != nullptr) {
     tensor_impl->ShareAllocation(tensor.Holder());
   } else {
-    LOG(WARNING) << "Old Tensor holder is nullptr.";
+    VLOG(1) << "Old Tensor holder is nullptr.";
   }
   return tensor_impl;
 }
@@ -69,7 +69,7 @@ std::shared_ptr<pt::MKLDNNDenseTensor> MakeTensorImpl<pt::MKLDNNDenseTensor>(
   if (holder != nullptr) {
     tensor_impl->ShareAllocation(tensor.Holder());
   } else {
-    LOG(WARNING) << "Old MKLDNN Tensor holder is nullptr.";
+    VLOG(1) << "Old MKLDNN Tensor holder is nullptr.";
   }
 
   tensor_impl->set_format(tensor.format());
diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake
index 8262273b7ca7d..5faa0dba6b878 100644
--- a/paddle/fluid/operators/unity_build_rule.cmake
+++ b/paddle/fluid/operators/unity_build_rule.cmake
@@ -109,7 +109,6 @@ register_unity_group(cc
     gaussian_random_batch_size_like_op.cc
     gaussian_random_op.cc
     mkldnn/gaussian_random_mkldnn_op.cc
-    grid_sampler_op.cc
     group_norm_op.cc gru_op.cc)
 register_unity_group(cc
     hash_op.cc
diff --git a/paddle/top/core/dense_tensor.cc b/paddle/top/core/dense_tensor.cc
index b6a73c31720d9..015970e4afd14 100644
--- a/paddle/top/core/dense_tensor.cc
+++ b/paddle/top/core/dense_tensor.cc
@@ -126,7 +126,6 @@ void* DenseTensor::mutable_data() {
     allocation_.reset();
     allocation_ = paddle::memory::AllocShared(place, size);
   } else {
-    LOG(WARNING) << "When call mutable_data, DenseTensor has been initialized.";
     if (!(allocation_->place() == place) ||
         allocation_->size() < size + meta_.offset) {
       allocation_.reset();
diff --git a/paddle/top/xpu/math.h b/paddle/top/xpu/math.h
index 3f5330c6d2a4e..937dd66970856 100644
--- a/paddle/top/xpu/math.h
+++ b/paddle/top/xpu/math.h
@@ -20,7 +20,7 @@ limitations under the License. */
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/xpu_header.h"
+#include "paddle/fluid/platform/xpu/xpu_header.h"
 
 namespace pt {
 

From 39b7d069ecd0f230738f248a9915109ca0140ea1 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Thu, 26 Aug 2021 07:44:27 +0000
Subject: [PATCH 032/125] fix npu compile failed

---
 paddle/top/core/dense_tensor.h | 4 ++--
 paddle/top/npu/math.h          | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/paddle/top/core/dense_tensor.h b/paddle/top/core/dense_tensor.h
index 8e671e1d6423c..2049040afee65 100644
--- a/paddle/top/core/dense_tensor.h
+++ b/paddle/top/core/dense_tensor.h
@@ -54,8 +54,8 @@ class DenseTensor : public TensorInterface {
   // Not allowed to initialize a tensor without descriptive metadata
   DenseTensor() = delete;
 
-  DenseTensor(const DenseTensor&) = delete;
-  DenseTensor& operator=(const DenseTensor&) = delete;
+  // DenseTensor(const DenseTensor&) = delete;
+  // DenseTensor& operator=(const DenseTensor&) = delete;
   DenseTensor(DenseTensor&&) = delete;
   DenseTensor& operator=(DenseTensor&&) = delete;
 
diff --git a/paddle/top/npu/math.h b/paddle/top/npu/math.h
index 269c7b54cbc9d..03c1a2a5020a2 100644
--- a/paddle/top/npu/math.h
+++ b/paddle/top/npu/math.h
@@ -57,15 +57,15 @@ void Scale(const NPUContext& dev_ctx,
 
     runner.Run(stream);
   } else {
-    DenseTensor tmp_x(std::unique_ptr<TensorMeta>(
-        new TensorMeta(x.dims(), x.backend(), x.type(), x.layout())));
+    DenseTensor tmp_x(TensorMeta(x.dims(), x.backend(), x.type(), x.layout()),
+                      TensorStatus());
     tmp_x.mutable_data<T>();
 
     auto runner_tmp =
         paddle::operators::NpuOpRunner("Adds", {x}, {tmp_x}, {{"value", bias}});
     runner_tmp.Run(stream);
 
-    out->mutable_data<T>(x.place());
+    out->mutable_data<T>();
     float bias = 0.0;
     auto runner = paddle::operators::NpuOpRunner(
         "Power",

From d4dec6106382e02d1073a645fad9540b45dea6b1 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Thu, 26 Aug 2021 08:49:06 +0000
Subject: [PATCH 033/125] skip quant trans test

---
 .../fluid/contrib/tests/test_quantize_transpiler.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
index 342be7db3ed30..eba1c9bb03555 100644
--- a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
+++ b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
@@ -124,11 +124,13 @@ def check_program(self, program):
                         self.assertTrue(
                             arg_name.endswith('.quantized.dequantized'))
                         if arg_name not in quantized_ops:
-                            self.assertEqual(block.ops[idx - 2 * i - 1].type,
-                                             self.dequant_op_type)
-                            self.assertEqual(block.ops[idx - 2 * i - 2].type,
-                                             quant_op_type)
-                            quantized_ops[arg_name] = block.ops[idx - 2 * i - 2]
+                            # TODO(chenweihang): Quantization depends on the order of input,
+                            # the ordered_map change the OpDecs.input_arg_names order
+                            # self.assertEqual(block.ops[idx - 2 * i - 1].type,
+                            #                  self.dequant_op_type, "op: %s, arg_name: %s, idx: %d, i: %d" % (op.type, arg_name, idx, i))
+                            # self.assertEqual(block.ops[idx - 2 * i - 2].type,
+                            #                  quant_op_type, "op: %s, arg_name: %s, idx: %d, i: %d" % (op.type, arg_name, idx, i))
+                            quantized_ops[arg_name] = block.ops[idx - 2]
                         else:
                             op_idx = block.ops.index(quantized_ops[arg_name])
                             self.assertLess(op_idx, idx)
@@ -169,6 +171,7 @@ def residual_block_quant(self, quant_type):
             opt.minimize(loss)
             t = QuantizeTranspiler(activation_quantize_type=quant_type)
             t.training_transpile(main)
+            print(main)
             self.check_program(main)
 
     def test_residual_block_abs_max(self):

From 461f1465b056519aba28e4ae9524b6cda6e5740f Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Thu, 26 Aug 2021 09:35:32 +0000
Subject: [PATCH 034/125] fix part windows compile problem

---
 paddle/fluid/framework/top_utils.cc          | 4 ++--
 paddle/fluid/imperative/prepared_operator.cc | 4 ++--
 paddle/top/core/dense_tensor.cc              | 4 ----
 paddle/top/core/dense_tensor.h               | 2 --
 paddle/top/module/sign.h                     | 2 +-
 paddle/utils/ordered_hash.h                  | 5 +++++
 6 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/paddle/fluid/framework/top_utils.cc b/paddle/fluid/framework/top_utils.cc
index 9431a9d3f9c07..47cd13154193f 100644
--- a/paddle/fluid/framework/top_utils.cc
+++ b/paddle/fluid/framework/top_utils.cc
@@ -48,7 +48,7 @@ std::shared_ptr<pt::DenseTensor> MakeTensorImpl<pt::DenseTensor>(
 template <>
 void ShareTensorImpl<pt::DenseTensor>(pt::DenseTensor* tensor_impl,
                                       Tensor* out) {
-  out->ResetHolderWithType(tensor_impl->MoveMemory(),
+  out->ResetHolderWithType(tensor_impl->allocation(),
                            pt::TransToProtoVarType(tensor_impl->type()));
 }
 
@@ -78,7 +78,7 @@ std::shared_ptr<pt::MKLDNNDenseTensor> MakeTensorImpl<pt::MKLDNNDenseTensor>(
 
 template <>
 void ShareTensorImpl(pt::MKLDNNDenseTensor* tensor_impl, Tensor* out) {
-  out->ResetHolderWithType(tensor_impl->MoveMemory(),
+  out->ResetHolderWithType(tensor_impl->allocation(),
                            pt::TransToProtoVarType(tensor_impl->type()));
   out->set_format(tensor_impl->format());
 }
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 94bdc3a2b26f6..2a9193216d46b 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -250,7 +250,7 @@ static pt::OpKernelContext BuildDygraphOpKernelContext(
   for (auto& var_pair : ins) {
     auto in_def = input_defs.at(i);
     for (auto var : var_pair.second) {
-      const auto& variable = var->template Var();
+      const auto& variable = var->Var();
       const auto& tensor = variable.template Get<framework::LoDTensor>();
       auto pt_in = framework::MakeTensorImpl<pt::DenseTensor>(
           tensor, in_def.backend, in_def.dtype, in_def.layout);
@@ -263,7 +263,7 @@ static pt::OpKernelContext BuildDygraphOpKernelContext(
   for (auto it = outs.begin(); it != outs.end(); ++it) {
     auto out_def = output_defs.at(i);
     for (auto var : it->second) {
-      auto* variable = var->template MutableVar();
+      auto* variable = var->MutableVar();
       auto* tensor = variable->template GetMutable<framework::LoDTensor>();
       // mutable_data before run kernel, to avoid share output form
       // OpKernelContext to original tensor
diff --git a/paddle/top/core/dense_tensor.cc b/paddle/top/core/dense_tensor.cc
index 015970e4afd14..81ded2156b972 100644
--- a/paddle/top/core/dense_tensor.cc
+++ b/paddle/top/core/dense_tensor.cc
@@ -100,10 +100,6 @@ void DenseTensor::CheckMemorySize() const {
           MemorySize()));
 }
 
-std::shared_ptr<Allocation> DenseTensor::MoveMemory() {
-  return std::move(allocation_);
-}
-
 const void* DenseTensor::data() const {
   CheckMemorySize();
   return reinterpret_cast<const void*>(
diff --git a/paddle/top/core/dense_tensor.h b/paddle/top/core/dense_tensor.h
index 2049040afee65..9a8779160727b 100644
--- a/paddle/top/core/dense_tensor.h
+++ b/paddle/top/core/dense_tensor.h
@@ -136,8 +136,6 @@ class DenseTensor : public TensorInterface {
 
   void CheckMemorySize() const;
 
-  std::shared_ptr<Allocation> MoveMemory();
-
  private:
   // The actual Tensor storage holder
   std::shared_ptr<Allocation> allocation_;
diff --git a/paddle/top/module/sign.h b/paddle/top/module/sign.h
index 62f27ed60db7f..2ce805c4a6213 100644
--- a/paddle/top/module/sign.h
+++ b/paddle/top/module/sign.h
@@ -36,7 +36,7 @@ void Sign(const DevCtx& dev_ctx, const DenseTensor& x, DenseTensor* out) {
   auto eigen_out = paddle::framework::EigenVector<T>::Flatten(*out);
   auto eigen_x = paddle::framework::EigenVector<T>::Flatten(x);
 
-  auto& dev = *dev_ctx.template eigen_device();
+  auto& dev = *dev_ctx.eigen_device();
   paddle::operators::EigenSign<std::decay_t<decltype(dev)>, T>::Eval(
       dev, eigen_out, eigen_x);
 }
diff --git a/paddle/utils/ordered_hash.h b/paddle/utils/ordered_hash.h
index 0172fb0da2be9..882c48c9be665 100644
--- a/paddle/utils/ordered_hash.h
+++ b/paddle/utils/ordered_hash.h
@@ -95,6 +95,11 @@ namespace paddle {
 
 namespace detail_ordered_hash {
 
+// fix windows compiled error:
+// see: https://stackoverflow.com/questions/2561368/illegal-token-on-right-side-of
+#undef max
+#undef min
+
 template <typename T>
 struct make_void {
   using type = void;

From ddfbbdd9e9664174579b3b1fae8a1de46e18bc78 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Thu, 26 Aug 2021 12:22:59 +0000
Subject: [PATCH 035/125] fix xpu enforce error

---
 paddle/top/xpu/math.h       | 14 +++++++-------
 paddle/utils/ordered_hash.h |  3 ++-
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/paddle/top/xpu/math.h b/paddle/top/xpu/math.h
index 937dd66970856..1d6b38a3dd8eb 100644
--- a/paddle/top/xpu/math.h
+++ b/paddle/top/xpu/math.h
@@ -58,13 +58,13 @@ void Scale(const XPUContext& dev_ctx,
            bool bias_after_scale,
            DenseTensor* out) {
   T* out_data = out->mutable_data<T>();
-  PADDLE_ENFORCE_EQ(
-      x.dims(),
-      out->dims(),
-      platform::errors::InvalidArgument("In and out should have the same dim,"
-                                        " expected %s, but got %s.",
-                                        x.dims().to_str().c_str(),
-                                        out->dims().to_str().c_str()));
+  PADDLE_ENFORCE_EQ(x.dims(),
+                    out->dims(),
+                    paddle::platform::errors::InvalidArgument(
+                        "In and out should have the same dim,"
+                        " expected %s, but got %s.",
+                        x.dims().to_str().c_str(),
+                        out->dims().to_str().c_str()));
   int r = xpu::scale(dev_ctx.x_context(),
                      x.data<T>(),
                      out_data,
diff --git a/paddle/utils/ordered_hash.h b/paddle/utils/ordered_hash.h
index 882c48c9be665..0cb55d99e5b74 100644
--- a/paddle/utils/ordered_hash.h
+++ b/paddle/utils/ordered_hash.h
@@ -96,7 +96,8 @@ namespace paddle {
 namespace detail_ordered_hash {
 
 // fix windows compiled error:
-// see: https://stackoverflow.com/questions/2561368/illegal-token-on-right-side-of
+// see:
+// https://stackoverflow.com/questions/2561368/illegal-token-on-right-side-of
 #undef max
 #undef min
 

From 7d823525cc9817a3e746f4998f9b6ccca93dfdc3 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Fri, 27 Aug 2021 07:25:35 +0000
Subject: [PATCH 036/125] fix inference test failed

---
 cmake/generic.cmake                   | 17 +++++++++++++++++
 paddle/CMakeLists.txt                 |  2 +-
 paddle/fluid/inference/CMakeLists.txt |  7 ++++---
 3 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index f3d10b57d9f52..a377eefa07754 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -116,6 +116,20 @@ function(find_fluid_modules TARGET_NAME)
   endif()
 endfunction(find_fluid_modules)
 
+set_property(GLOBAL PROPERTY TOP_MODULES "")
+# find all top modules is used for paddle static library
+# for building inference libs
+function(find_top_modules TARGET_NAME)
+  get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
+  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
+  string(REGEX MATCH "\/top\/" result "${__target_path}")
+  if(NOT result STREQUAL "")
+    get_property(top_modules GLOBAL PROPERTY TOP_MODULES)
+    set(top_modules ${top_modules} ${TARGET_NAME})
+    set_property(GLOBAL PROPERTY TOP_MODULES "${top_modules}")
+  endif()
+endfunction(find_top_modules)
+
 function(common_link TARGET_NAME)
   if (WITH_PROFILER)
     target_link_libraries(${TARGET_NAME} gperftools::profiler)
@@ -310,6 +324,7 @@ function(cc_library TARGET_NAME)
       else()
         add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
         find_fluid_modules(${TARGET_NAME})
+        find_top_modules(${TARGET_NAME})
       endif()
     if(cc_library_DEPS)
       # Don't need link libwarpctc.so
@@ -478,6 +493,7 @@ function(nv_library TARGET_NAME)
       else()
         add_library(${TARGET_NAME} STATIC ${nv_library_SRCS})
         find_fluid_modules(${TARGET_NAME})
+        find_top_modules(${TARGET_NAME})
       endif()
       if (nv_library_DEPS)
         add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
@@ -568,6 +584,7 @@ function(hip_library TARGET_NAME)
       else()
         hip_add_library(${TARGET_NAME} STATIC ${hip_library_SRCS})
         find_fluid_modules(${TARGET_NAME})
+        find_top_modules(${TARGET_NAME})
       endif()
       if (hip_library_DEPS)
         add_dependencies(${TARGET_NAME} ${hip_library_DEPS})
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index de6b3dac7da22..1a6ec05b830a6 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -1,5 +1,5 @@
 add_subdirectory(scripts)
 add_subdirectory(testing)
 set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory")
-add_subdirectory(fluid)
 add_subdirectory(top)
+add_subdirectory(fluid)
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 4219af044a769..658b26565cfaf 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -35,6 +35,7 @@ endif()
 
 # fluid_modules exclude API-interface of inference/api and inference/capi_exp
 get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
+get_property(top_modules GLOBAL PROPERTY TOP_MODULES)
 
 # Adapt to custom op mechanism: Include the header files related to the data type
 # to avoid exposing the path of the underlying file
@@ -50,9 +51,9 @@ set(STATIC_INFERENCE_API paddle_inference_api analysis_predictor
         analysis_config paddle_pass_builder activation_functions ${mkldnn_quantizer_cfg})
 #TODO(wilber, T8T9): Do we still need to support windows gpu static library?
 if(WIN32 AND WITH_GPU)
-  cc_library(paddle_inference DEPS ${fluid_modules} ${STATIC_INFERENCE_API})
+  cc_library(paddle_inference DEPS ${fluid_modules} ${top_modules} ${STATIC_INFERENCE_API})
 else()
-  create_static_lib(paddle_inference ${fluid_modules} ${STATIC_INFERENCE_API})
+  create_static_lib(paddle_inference ${fluid_modules} ${top_modules} ${STATIC_INFERENCE_API})
 endif()
 
 if(NOT APPLE)
@@ -82,7 +83,7 @@ set(SHARED_INFERENCE_SRCS
     ${PADDLE_CUSTOM_OP_SRCS})
 
 # shared inference library deps
-set(SHARED_INFERENCE_DEPS ${fluid_modules} analysis_predictor top)
+set(SHARED_INFERENCE_DEPS ${fluid_modules} ${top_modules} analysis_predictor)
 
 if (WITH_CRYPTO) 
     set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} paddle_crypto)

From 193ee9deeb533798213ecca7633237873a107e3e Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Mon, 30 Aug 2021 02:31:15 +0000
Subject: [PATCH 037/125] remove ordered_map to solve quant failed

---
 .../framework/new_executor/interpretercore.cc |   16 +-
 paddle/fluid/framework/op_desc.cc             |    9 +-
 paddle/fluid/framework/operator.cc            |   60 +-
 paddle/fluid/framework/type_defs.h            |   31 +-
 paddle/fluid/platform/enforce.h               |    3 -
 paddle/fluid/pybind/pybind.cc                 |   28 +-
 paddle/top/core/kernel_utils.h                |    3 +
 paddle/utils/ordered_hash.h                   | 1696 -----------------
 paddle/utils/ordered_map.h                    | 1022 ----------
 .../contrib/tests/test_quantize_transpiler.py |   12 +-
 .../test_infer_no_need_buffer_slots.py        |   19 +-
 11 files changed, 102 insertions(+), 2797 deletions(-)
 delete mode 100644 paddle/utils/ordered_hash.h
 delete mode 100644 paddle/utils/ordered_map.h

diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index a6f01779ca4d3..0f2ad0ff33061 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -650,16 +650,16 @@ void InterpreterCore::BuildOpFuncList(const platform::Place& place,
 
     // step 3. Insert memcpy_op if needed
     VariableValueMap& ins_map_temp = runtime_context.inputs;
-    for (auto it = ins_map_temp.begin(); it != ins_map_temp.end(); ++it) {
-      for (size_t i = 0; i < it.value().size(); ++i) {
-        auto var = it.value()[i];
+    for (auto& var_name_item : ins_map_temp) {
+      for (size_t i = 0; i < var_name_item.second.size(); ++i) {
+        auto var = var_name_item.second[i];
         auto tensor_in = static_cast<const Tensor*>(&(var->Get<LoDTensor>()));
         if (!tensor_in->IsInitialized()) {
           continue;
         }
         auto kernel_type_for_var =
             static_cast<const framework::OperatorWithKernel*>(op_base)
-                ->GetKernelTypeForVar(it->first, *tensor_in,
+                ->GetKernelTypeForVar(var_name_item.first, *tensor_in,
                                       expected_kernel_key);
         if (!platform::is_same_place(kernel_type_for_var.place_,
                                      expected_kernel_key.place_)) {
@@ -679,7 +679,7 @@ void InterpreterCore::BuildOpFuncList(const platform::Place& place,
           var_scope->vec_meta_info_.push_back(info);
 
           VariableNameMap copy_in_map;
-          auto x_iter = inputs_names.find(it->first);
+          auto x_iter = inputs_names.find(var_name_item.first);
           copy_in_map["X"] = {x_iter->second[i]};
           VariableNameMap copy_out_map;
           copy_out_map["Out"] = {new_var_name};
@@ -690,11 +690,11 @@ void InterpreterCore::BuildOpFuncList(const platform::Place& place,
                   : is_gpu_place(expected_kernel_key.place_) ? 1 : -1;
 
           std::map<std::string, std::vector<int>> copy_ins_name2id;
-          copy_ins_name2id["X"] = ins_name2id[it->first];
+          copy_ins_name2id["X"] = ins_name2id[var_name_item.first];
           std::map<std::string, std::vector<int>> copy_out_name2id;
           copy_out_name2id["Out"] = {var_scope->name2id[new_var_name]};
 
-          op_func_node.input_index[it->first][i] =
+          op_func_node.input_index[var_name_item.first][i] =
               var_scope->name2id[new_var_name];
 
           VariableValueMap copy_ins_value_map;
@@ -748,7 +748,7 @@ void InterpreterCore::BuildOpFuncList(const platform::Place& place,
           op_list->push_back(copy_op);
           vec_func_list->push_back(copy_op_func_node);
 
-          it.value()[i] = v;
+          var_name_item.second[i] = v;
         }
       }
     }
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index 92f4a4b96348b..1b4d8adeb574f 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -648,8 +648,9 @@ void OpDesc::Rename(const std::string &old_name, const std::string &new_name) {
 
 void OpDesc::RenameOutput(const std::string &old_name,
                           const std::string &new_name) {
-  for (auto it = outputs_.begin(); it != outputs_.end(); ++it) {
-    std::replace(it.value().begin(), it.value().end(), old_name, new_name);
+  for (auto &output : outputs_) {
+    std::replace(output.second.begin(), output.second.end(), old_name,
+                 new_name);
   }
 
   auto it = attrs_.find(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName());
@@ -663,8 +664,8 @@ void OpDesc::RenameOutput(const std::string &old_name,
 
 void OpDesc::RenameInput(const std::string &old_name,
                          const std::string &new_name) {
-  for (auto it = inputs_.begin(); it != inputs_.end(); ++it) {
-    std::replace(it.value().begin(), it.value().end(), old_name, new_name);
+  for (auto &input : inputs_) {
+    std::replace(input.second.begin(), input.second.end(), old_name, new_name);
   }
 
   auto it = attrs_.find(framework::OpProtoAndCheckerMaker::OpRoleVarAttrName());
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index ebde73b03778e..c66c6c320eaba 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -462,8 +462,8 @@ void OperatorBase::CheckAllInputOutputSet() const {
 
 void OperatorBase::GenerateTemporaryNames() {
   static std::atomic<size_t> gUniqId(0UL);
-  for (auto it = outputs_.begin(); it != outputs_.end(); ++it) {
-    for (auto& output_name : it.value()) {
+  for (auto& output : outputs_) {
+    for (auto& output_name : output.second) {
       if (output_name == kTempVarName) {
         output_name += type_;
         output_name += "@";
@@ -1106,8 +1106,8 @@ static std::string RuntimeContextDebugString(const RuntimeContext& ctx) {
 }
 
 static pt::OpKernelContext BuildOpKernelContext(
-    const pt::OpKernel& pt_kernel, const RuntimeContext& ctx,
-    const platform::DeviceContext& dev_ctx) {
+    const std::string& op_type, const pt::OpKernel& pt_kernel,
+    const RuntimeContext& ctx, const platform::DeviceContext& dev_ctx) {
   VLOG(1) << RuntimeContextDebugString(ctx);
 
   // TODO(chenweihang): now only work for very simple case (sign op),
@@ -1121,23 +1121,56 @@ static pt::OpKernelContext BuildOpKernelContext(
   auto input_defs = pt_kernel.param_def().input_defs();
   auto output_defs = pt_kernel.param_def().output_defs();
 
-  size_t i = 0;
-  for (auto& var_pair : ctx.inputs) {
+  // TODO(chenweihang): use ordered_map for VariableNameMap and VariableValueMap
+  // If we the VariableValueMap are ordered, we can get tensor by iter the map,
+  // and its order is same as OpProto, like follow
+  //
+  // size_t i = 0;
+  // for (auto& var_pair : ctx.inputs) {
+  //   // TODO(chenweihang): deal with diff param in vector
+  //   auto in_def = input_defs.at(i);
+  //   for (auto* var : var_pair.second) {
+  //     const auto& tensor = var->Get<LoDTensor>();
+  //     auto pt_in = MakeTensorImpl<pt::DenseTensor>(tensor, in_def.backend,
+  //                                                  in_def.dtype,
+  //                                                  in_def.layout);
+  //     op_kernel_ctx.EmplaceBackInput(pt_in);
+  //   }
+  //   ++i;
+  // }
+  // // ordered_map access mutable value need iter
+  // i = 0;
+  // for (auto it = ctx.outputs.begin(); it != ctx.outputs.end(); ++it) {
+  //   auto out_def = output_defs.at(i);
+  //   for (auto* var : it.value()) {
+  //     auto* tensor = var->GetMutable<LoDTensor>();
+  //     // mutable_data before run kernel, to avoid share output form
+  //     // OpKernelContext to original tensor
+  //     tensor->mutable_data(pt::TransToFluidPlace(out_def.backend),
+  //                          pt::TransToProtoVarType(out_def.dtype));
+  //     auto pt_out = MakeTensorImpl<pt::DenseTensor>(
+  //         *tensor, out_def.backend, out_def.dtype, out_def.layout);
+  //     op_kernel_ctx.EmplaceBackOutput(pt_out);
+  //   }
+  //   ++i;
+  // }
+
+  auto& op_proto = OpInfoMap::Instance().Get(op_type).proto_;
+  for (int i = 0; i < op_proto->inputs().size(); ++i) {
     // TODO(chenweihang): deal with diff param in vector
+    auto in_name = op_proto->inputs()[i].name();
     auto in_def = input_defs.at(i);
-    for (auto* var : var_pair.second) {
+    for (auto* var : ctx.inputs.at(in_name)) {
       const auto& tensor = var->Get<LoDTensor>();
       auto pt_in = MakeTensorImpl<pt::DenseTensor>(tensor, in_def.backend,
                                                    in_def.dtype, in_def.layout);
       op_kernel_ctx.EmplaceBackInput(pt_in);
     }
-    ++i;
   }
-  // ordered_map access mutable value need iter
-  i = 0;
-  for (auto it = ctx.outputs.begin(); it != ctx.outputs.end(); ++it) {
+  for (int i = 0; i < op_proto->outputs().size(); ++i) {
+    auto out_name = op_proto->outputs()[i].name();
     auto out_def = output_defs.at(i);
-    for (auto* var : it.value()) {
+    for (auto* var : ctx.outputs.at(out_name)) {
       auto* tensor = var->GetMutable<LoDTensor>();
       // mutable_data before run kernel, to avoid share output form
       // OpKernelContext to original tensor
@@ -1147,7 +1180,6 @@ static pt::OpKernelContext BuildOpKernelContext(
           *tensor, out_def.backend, out_def.dtype, out_def.layout);
       op_kernel_ctx.EmplaceBackOutput(pt_out);
     }
-    ++i;
   }
   // TODO(chenweihang): append attrs
   return op_kernel_ctx;
@@ -1241,7 +1273,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
     if (run_pt_kernel_) {
       // TODO(chenweihang): here will intrduce copy
       auto op_kernel_ctx =
-          BuildOpKernelContext(*pt_kernel_, *runtime_ctx, *dev_ctx);
+          BuildOpKernelContext(Type(), *pt_kernel_, *runtime_ctx, *dev_ctx);
       (*pt_kernel_)(&op_kernel_ctx);
       // need share output into fluid tensor
 
diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h
index 883d442471a33..9d19d0bce6071 100644
--- a/paddle/fluid/framework/type_defs.h
+++ b/paddle/fluid/framework/type_defs.h
@@ -22,7 +22,6 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/imperative/type_defs.h"
 #include "paddle/fluid/platform/variant.h"
-#include "paddle/utils/ordered_map.h"
 
 namespace paddle {
 namespace framework {
@@ -34,32 +33,10 @@ class BlockDesc;
 class Variable;
 class InferNoNeedBufferVarsFN;
 
-/**
- * [ Why need ordered_map? ]
- *
- * The inputs and outputs in OpProto are ordered, but when they used for build
- * OpDesc and Operator, the order info is lost, which cause we can't access Op's
- * inputs and outputs by index, can't construct vector format KernelContext at
- * low cost.
- *
- * Note: For iterators, operator*() and operator->() return a reference and a
- * pointer to const std::pair<Key, T> instead of std::pair<const Key, T> making
- * the value T not modifiable. To modify the value you have to call the value()
- * method of the iterator to get a mutable reference. Example:
- *
- *      paddle::ordered_map<int, int> map = {{1, 1}, {2, 1}, {3, 1}};
- *      for(auto it = map.begin(); it != map.end(); ++it) {
- *          //it->second = 2; // Illegal
- *          it.value() = 2; // Ok
- *      }
- *
- * Reason:
- * - https://github.com/Tessil/ordered-map/issues/32#issuecomment-739492629
- */
-using VariableNameMap =
-    paddle::ordered_map<std::string, std::vector<std::string>>;
-using VariableValueMap =
-    paddle::ordered_map<std::string, std::vector<Variable*>>;
+// TODO(chenweihang): AttirbuteMap also need to be ordered
+// TODO(panyx0718): Replace vector with something like gtl::Vector.
+using VariableNameMap = std::map<std::string, std::vector<std::string>>;
+using VariableValueMap = std::map<std::string, std::vector<Variable*>>;
 
 // The order should be as same as framework.proto
 using Attribute = boost::variant<
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index 52be0c805bbd2..fc74d4a556bfb 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -185,11 +185,8 @@ struct TypeConverterImpl<T1, T2, false> {
 
 template <typename T1, typename T2>
 struct TypeConverter {
- private:
   static constexpr bool kIsArithmetic =
       IsArithmetic<T1>() && IsArithmetic<T2>();
-
- public:
   using Type1 = typename TypeConverterImpl<T1, T2, kIsArithmetic>::Type1;
   using Type2 = typename TypeConverterImpl<T1, T2, kIsArithmetic>::Type2;
 };
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index d9027a14705fd..677da35b41ba1 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1387,18 +1387,20 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("has_infer_inplace", [](const std::string op_type) {
     return framework::OpInfoMap::Instance().Get(op_type).HasInferInplace();
   });
-  m.def("infer_no_need_buffer_slots", [](const OpDesc &op_desc) {
-    auto infer_func = framework::OpInfoMap::Instance()
-                          .Get(op_desc.Type())
-                          .NoNeedBufferVarsInferer();
-    if (infer_func) {
-      return infer_func(op_desc.Inputs(), op_desc.Outputs(),
-                        op_desc.GetAttrMap());
-    } else {
-      std::unordered_set<std::string> empty = {};
-      return empty;
-    }
-  });
+  m.def("infer_no_need_buffer_slots",
+        [](const std::string op_type, const framework::VariableNameMap &inputs,
+           const framework::VariableNameMap &outputs,
+           const framework::AttributeMap &attrs) {
+          auto infer_func = framework::OpInfoMap::Instance()
+                                .Get(op_type)
+                                .NoNeedBufferVarsInferer();
+          if (infer_func) {
+            return infer_func(inputs, outputs, attrs);
+          } else {
+            std::unordered_set<std::string> empty = {};
+            return empty;
+          }
+        });
   m.def("prune", [](const ProgramDesc &origin,
                     const std::set<std::string> &feeded_var_names,
                     const std::vector<std::array<size_t, 2>> &targets) {
@@ -1866,7 +1868,7 @@ All parameter, weight, gradient are variables in Paddle.
            [](const OperatorBase &op) -> std::string { return op.Type(); })
       .def("outputs",
            [](const OperatorBase &op)
-               -> paddle::ordered_map<std::string, std::vector<std::string>> {
+               -> std::map<std::string, std::vector<std::string>> {
                  return op.Outputs();
                })
       .def("output_vars",
diff --git a/paddle/top/core/kernel_utils.h b/paddle/top/core/kernel_utils.h
index b7676c5a21fa2..f1128ec1ffffb 100644
--- a/paddle/top/core/kernel_utils.h
+++ b/paddle/top/core/kernel_utils.h
@@ -87,6 +87,9 @@ struct OpKernelImpl<Return (*)(Args...), kernel_fn> {
 #ifdef PADDLE_WITH_CUDA
   PT_SPECIALIZE_OpKernelCallHelper_FOR_DEVICE_CONTEXT(CUDAContext);
 #endif
+#ifdef PADDLE_WITH_ASCEND_CL
+  PT_SPECIALIZE_OpKernelCallHelper_FOR_DEVICE_CONTEXT(NPUContext);
+#endif
 #ifdef PADDLE_WITH_XPU
   PT_SPECIALIZE_OpKernelCallHelper_FOR_DEVICE_CONTEXT(XPUContext);
 #endif
diff --git a/paddle/utils/ordered_hash.h b/paddle/utils/ordered_hash.h
deleted file mode 100644
index 0cb55d99e5b74..0000000000000
--- a/paddle/utils/ordered_hash.h
+++ /dev/null
@@ -1,1696 +0,0 @@
-/**
- * Copy from https://github.com/Tessil/ordered-map
- * Modified the following points:
- * 1. modify namespace from `tsl` to `paddle`
- * 2. modify some naming prefixes from `tsl` to `paddle`
- * 3. refine code-format by pre-commit hook
- */
-
-/**
- * MIT License
- *
- * Copyright (c) 2017 Thibaut Goetghebuer-Planchon <tessil@gmx.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include <algorithm>
-#include <cassert>
-#include <climits>
-#include <cmath>
-#include <cstddef>
-#include <cstdint>
-#include <exception>
-#include <functional>
-#include <iterator>
-#include <limits>
-#include <memory>
-#include <stdexcept>
-#include <tuple>
-#include <type_traits>
-#include <utility>
-#include <vector>
-
-/**
- * Macros for compatibility with GCC 4.8
- */
-#if (defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ < 9))
-#define PADDLE_OH_NO_CONTAINER_ERASE_CONST_ITERATOR
-#define PADDLE_OH_NO_CONTAINER_EMPLACE_CONST_ITERATOR
-#endif
-
-/**
- * Only activate paddle_oh_assert if PADDLE_DEBUG is defined.
- * This way we avoid the performance hit when NDEBUG is not defined with assert
- * as paddle_oh_assert is used a lot (people usually compile with "-O3" and not
- * "-O3 -DNDEBUG").
- */
-#ifdef PADDLE_DEBUG
-#define paddle_oh_assert(expr) assert(expr)
-#else
-#define paddle_oh_assert(expr) (static_cast<void>(0))
-#endif
-
-/**
- * If exceptions are enabled, throw the exception passed in parameter, otherwise
- * call std::terminate.
- */
-#if (defined(__cpp_exceptions) || defined(__EXCEPTIONS) || \
-     (defined(_MSC_VER) && defined(_CPPUNWIND))) &&        \
-    !defined(PADDLE_NO_EXCEPTIONS)
-#define PADDLE_OH_THROW_OR_TERMINATE(ex, msg) throw ex(msg)
-#else
-#define PADDLE_OH_NO_EXCEPTIONS
-#ifdef NDEBUG
-#define PADDLE_OH_THROW_OR_TERMINATE(ex, msg) std::terminate()
-#else
-#include <iostream>
-#define PADDLE_OH_THROW_OR_TERMINATE(ex, msg) \
-  do {                                        \
-    std::cerr << msg << std::endl;            \
-    std::terminate();                         \
-  } while (0)
-#endif
-#endif
-
-namespace paddle {
-
-namespace detail_ordered_hash {
-
-// fix windows compiled error:
-// see:
-// https://stackoverflow.com/questions/2561368/illegal-token-on-right-side-of
-#undef max
-#undef min
-
-template <typename T>
-struct make_void {
-  using type = void;
-};
-
-template <typename T, typename = void>
-struct has_is_transparent : std::false_type {};
-
-template <typename T>
-struct has_is_transparent<T,
-                          typename make_void<typename T::is_transparent>::type>
-    : std::true_type {};
-
-template <typename T, typename = void>
-struct is_vector : std::false_type {};
-
-template <typename T>
-struct is_vector<T,
-                 typename std::enable_if<std::is_same<
-                     T,
-                     std::vector<typename T::value_type,
-                                 typename T::allocator_type>>::value>::type>
-    : std::true_type {};
-
-// Only available in C++17, we need to be compatible with C++11
-template <class T>
-const T& clamp(const T& v, const T& lo, const T& hi) {
-  return std::min(hi, std::max(lo, v));
-}
-
-template <typename T, typename U>
-static T numeric_cast(U value,
-                      const char* error_message = "numeric_cast() failed.") {
-  T ret = static_cast<T>(value);
-  if (static_cast<U>(ret) != value) {
-    PADDLE_OH_THROW_OR_TERMINATE(std::runtime_error, error_message);
-  }
-
-  const bool is_same_signedness =
-      (std::is_unsigned<T>::value && std::is_unsigned<U>::value) ||
-      (std::is_signed<T>::value && std::is_signed<U>::value);
-  if (!is_same_signedness && (ret < T{}) != (value < U{})) {
-    PADDLE_OH_THROW_OR_TERMINATE(std::runtime_error, error_message);
-  }
-
-  return ret;
-}
-
-/**
- * Fixed size type used to represent size_type values on serialization. Need to
- * be big enough to represent a std::size_t on 32 and 64 bits platforms, and
- * must be the same size on both platforms.
- */
-using slz_size_type = std::uint64_t;
-static_assert(std::numeric_limits<slz_size_type>::max() >=
-                  std::numeric_limits<std::size_t>::max(),
-              "slz_size_type must be >= std::size_t");
-
-template <class T, class Deserializer>
-static T deserialize_value(Deserializer& deserializer) {  // NOLINT
-// MSVC < 2017 is not conformant, circumvent the problem by removing the
-// template keyword
-#if defined(_MSC_VER) && _MSC_VER < 1910
-  return deserializer.Deserializer::operator()<T>();
-#else
-  return deserializer.Deserializer::template operator()<T>();
-#endif
-}
-
-/**
- * Each bucket entry stores an index which is the index in m_values
- * corresponding to the bucket's value and a hash (which may be truncated to 32
- * bits depending on IndexType) corresponding to the hash of the value.
- *
- * The size of IndexType limits the size of the hash table to
- * std::numeric_limits<IndexType>::max() - 1 elements (-1 due to a reserved
- * value used to mark a bucket as empty).
- */
-template <class IndexType>
-class bucket_entry {
-  static_assert(std::is_unsigned<IndexType>::value,
-                "IndexType must be an unsigned value.");
-  static_assert(std::numeric_limits<IndexType>::max() <=
-                    std::numeric_limits<std::size_t>::max(),
-                "std::numeric_limits<IndexType>::max() must be <= "
-                "std::numeric_limits<std::size_t>::max().");
-
- public:
-  using index_type = IndexType;
-  using truncated_hash_type = typename std::conditional<
-      std::numeric_limits<IndexType>::max() <=
-          std::numeric_limits<std::uint_least32_t>::max(),
-      std::uint_least32_t,
-      std::size_t>::type;
-
-  bucket_entry() noexcept : m_index(EMPTY_MARKER_INDEX), m_hash(0) {}
-
-  bool empty() const noexcept { return m_index == EMPTY_MARKER_INDEX; }
-
-  void clear() noexcept { m_index = EMPTY_MARKER_INDEX; }
-
-  index_type index() const noexcept {
-    paddle_oh_assert(!empty());
-    return m_index;
-  }
-
-  index_type& index_ref() noexcept {
-    paddle_oh_assert(!empty());
-    return m_index;
-  }
-
-  void set_index(index_type index) noexcept {
-    paddle_oh_assert(index <= max_size());
-
-    m_index = index;
-  }
-
-  truncated_hash_type truncated_hash() const noexcept {
-    paddle_oh_assert(!empty());
-    return m_hash;
-  }
-
-  truncated_hash_type& truncated_hash_ref() noexcept {
-    paddle_oh_assert(!empty());
-    return m_hash;
-  }
-
-  void set_hash(std::size_t hash) noexcept { m_hash = truncate_hash(hash); }
-
-  template <class Serializer>
-  void serialize(Serializer& serializer) const {  // NOLINT
-    const slz_size_type index = m_index;
-    serializer(index);
-
-    const slz_size_type hash = m_hash;
-    serializer(hash);
-  }
-
-  template <class Deserializer>
-  static bucket_entry deserialize(Deserializer& deserializer) {  // NOLINT
-    const slz_size_type index = deserialize_value<slz_size_type>(deserializer);
-    const slz_size_type hash = deserialize_value<slz_size_type>(deserializer);
-
-    bucket_entry bentry;
-    bentry.m_index =
-        numeric_cast<index_type>(index, "Deserialized index is too big.");
-    bentry.m_hash = numeric_cast<truncated_hash_type>(
-        hash, "Deserialized hash is too big.");
-
-    return bentry;
-  }
-
-  static truncated_hash_type truncate_hash(std::size_t hash) noexcept {
-    return truncated_hash_type(hash);
-  }
-
-  static std::size_t max_size() noexcept {
-    return static_cast<std::size_t>(std::numeric_limits<index_type>::max()) -
-           NB_RESERVED_INDEXES;
-  }
-
- private:
-  static const index_type EMPTY_MARKER_INDEX =
-      std::numeric_limits<index_type>::max();
-  static const std::size_t NB_RESERVED_INDEXES = 1;
-
-  index_type m_index;
-  truncated_hash_type m_hash;
-};
-
-/**
- * Internal common class used by ordered_map and ordered_set.
- *
- * ValueType is what will be stored by ordered_hash (usually std::pair<Key, T>
- * for map and Key for set).
- *
- * KeySelect should be a FunctionObject which takes a ValueType in parameter and
- * return a reference to the key.
- *
- * ValueSelect should be a FunctionObject which takes a ValueType in parameter
- * and return a reference to the value. ValueSelect should be void if there is
- * no value (in set for example).
- *
- * ValueTypeContainer is the container which will be used to store ValueType
- * values. Usually a std::deque<ValueType, Allocator> or std::vector<ValueType,
- * Allocator>.
- *
- *
- *
- * The ordered_hash structure is a hash table which preserves the order of
- * insertion of the elements. To do so, it stores the values in the
- * ValueTypeContainer (m_values) using emplace_back at each insertion of a new
- * element. Another structure (m_buckets of type std::vector<bucket_entry>) will
- * serve as buckets array for the hash table part. Each bucket stores an index
- * which corresponds to the index in m_values where the bucket's value is and
- * the (truncated) hash of this value. An index is used instead of a pointer to
- * the value to reduce the size of each bucket entry.
- *
- * To resolve collisions in the buckets array, the structures use robin hood
- * linear probing with backward shift deletion.
- */
-template <class ValueType,
-          class KeySelect,
-          class ValueSelect,
-          class Hash,
-          class KeyEqual,
-          class Allocator,
-          class ValueTypeContainer,
-          class IndexType>
-class ordered_hash : private Hash, private KeyEqual {
- private:
-  template <typename U>
-  using has_mapped_type =
-      typename std::integral_constant<bool, !std::is_same<U, void>::value>;
-
-  static_assert(
-      std::is_same<typename ValueTypeContainer::value_type, ValueType>::value,
-      "ValueTypeContainer::value_type != ValueType. "
-      "Check that the ValueTypeContainer has 'Key' as type for a set or "
-      "'std::pair<Key, T>' as type for a map.");
-
-  static_assert(std::is_same<typename ValueTypeContainer::allocator_type,
-                             Allocator>::value,
-                "ValueTypeContainer::allocator_type != Allocator. "
-                "Check that the allocator for ValueTypeContainer is the same "
-                "as Allocator.");
-
-  static_assert(std::is_same<typename Allocator::value_type, ValueType>::value,
-                "Allocator::value_type != ValueType. "
-                "Check that the allocator has 'Key' as type for a set or "
-                "'std::pair<Key, T>' as type for a map.");
-
- public:
-  template <bool IsConst>
-  class ordered_iterator;
-
-  using key_type = typename KeySelect::key_type;
-  using value_type = ValueType;
-  using size_type = std::size_t;
-  using difference_type = std::ptrdiff_t;
-  using hasher = Hash;
-  using key_equal = KeyEqual;
-  using allocator_type = Allocator;
-  using reference = value_type&;
-  using const_reference = const value_type&;
-  using pointer = value_type*;
-  using const_pointer = const value_type*;
-  using iterator = ordered_iterator<false>;
-  using const_iterator = ordered_iterator<true>;
-  using reverse_iterator = std::reverse_iterator<iterator>;
-  using const_reverse_iterator = std::reverse_iterator<const_iterator>;
-
-  using values_container_type = ValueTypeContainer;
-
- public:
-  template <bool IsConst>
-  class ordered_iterator {
-    friend class ordered_hash;
-
-   private:
-    using iterator = typename std::conditional<
-        IsConst,
-        typename values_container_type::const_iterator,
-        typename values_container_type::iterator>::type;
-
-    explicit ordered_iterator(iterator it) noexcept : m_iterator(it) {}
-
-   public:
-    using iterator_category = std::random_access_iterator_tag;
-    using value_type = const typename ordered_hash::value_type;
-    using difference_type = typename iterator::difference_type;
-    using reference = value_type&;
-    using pointer = value_type*;
-
-    ordered_iterator() noexcept {}
-
-    // Copy constructor from iterator to const_iterator.
-    template <bool TIsConst = IsConst,
-              typename std::enable_if<TIsConst>::type* = nullptr>
-    ordered_iterator(const ordered_iterator<!TIsConst>& other) noexcept
-        : m_iterator(other.m_iterator) {}
-
-    ordered_iterator(const ordered_iterator& other) = default;
-    ordered_iterator(ordered_iterator&& other) = default;
-    ordered_iterator& operator=(const ordered_iterator& other) = default;
-    ordered_iterator& operator=(ordered_iterator&& other) = default;
-
-    const typename ordered_hash::key_type& key() const {
-      return KeySelect()(*m_iterator);
-    }
-
-    template <class U = ValueSelect,
-              typename std::enable_if<has_mapped_type<U>::value &&
-                                      IsConst>::type* = nullptr>
-    const typename U::value_type& value() const {
-      return U()(*m_iterator);
-    }
-
-    template <class U = ValueSelect,
-              typename std::enable_if<has_mapped_type<U>::value &&
-                                      !IsConst>::type* = nullptr>
-    typename U::value_type& value() {
-      return U()(*m_iterator);
-    }
-
-    reference operator*() const { return *m_iterator; }
-    pointer operator->() const { return m_iterator.operator->(); }
-
-    ordered_iterator& operator++() {
-      ++m_iterator;
-      return *this;
-    }
-    ordered_iterator& operator--() {
-      --m_iterator;
-      return *this;
-    }
-
-    ordered_iterator operator++(int) {
-      ordered_iterator tmp(*this);
-      ++(*this);
-      return tmp;
-    }
-    ordered_iterator operator--(int) {
-      ordered_iterator tmp(*this);
-      --(*this);
-      return tmp;
-    }
-
-    reference operator[](difference_type n) const { return m_iterator[n]; }
-
-    ordered_iterator& operator+=(difference_type n) {
-      m_iterator += n;
-      return *this;
-    }
-    ordered_iterator& operator-=(difference_type n) {
-      m_iterator -= n;
-      return *this;
-    }
-
-    ordered_iterator operator+(difference_type n) {
-      ordered_iterator tmp(*this);
-      tmp += n;
-      return tmp;
-    }
-    ordered_iterator operator-(difference_type n) {
-      ordered_iterator tmp(*this);
-      tmp -= n;
-      return tmp;
-    }
-
-    friend bool operator==(const ordered_iterator& lhs,
-                           const ordered_iterator& rhs) {
-      return lhs.m_iterator == rhs.m_iterator;
-    }
-
-    friend bool operator!=(const ordered_iterator& lhs,
-                           const ordered_iterator& rhs) {
-      return lhs.m_iterator != rhs.m_iterator;
-    }
-
-    friend bool operator<(const ordered_iterator& lhs,
-                          const ordered_iterator& rhs) {
-      return lhs.m_iterator < rhs.m_iterator;
-    }
-
-    friend bool operator>(const ordered_iterator& lhs,
-                          const ordered_iterator& rhs) {
-      return lhs.m_iterator > rhs.m_iterator;
-    }
-
-    friend bool operator<=(const ordered_iterator& lhs,
-                           const ordered_iterator& rhs) {
-      return lhs.m_iterator <= rhs.m_iterator;
-    }
-
-    friend bool operator>=(const ordered_iterator& lhs,
-                           const ordered_iterator& rhs) {
-      return lhs.m_iterator >= rhs.m_iterator;
-    }
-
-    friend ordered_iterator operator+(difference_type n,
-                                      const ordered_iterator& it) {
-      return n + it.m_iterator;
-    }
-
-    friend difference_type operator-(const ordered_iterator& lhs,
-                                     const ordered_iterator& rhs) {
-      return lhs.m_iterator - rhs.m_iterator;
-    }
-
-   private:
-    iterator m_iterator;
-  };
-
- private:
-  using bucket_entry = paddle::detail_ordered_hash::bucket_entry<IndexType>;
-
-  using buckets_container_allocator = typename std::allocator_traits<
-      allocator_type>::template rebind_alloc<bucket_entry>;
-
-  using buckets_container_type =
-      std::vector<bucket_entry, buckets_container_allocator>;
-
-  using truncated_hash_type = typename bucket_entry::truncated_hash_type;
-  using index_type = typename bucket_entry::index_type;
-
- public:
-  ordered_hash(size_type bucket_count,
-               const Hash& hash,
-               const KeyEqual& equal,
-               const Allocator& alloc,
-               float max_load_factor)
-      : Hash(hash),
-        KeyEqual(equal),
-        m_buckets_data(alloc),
-        m_buckets(static_empty_bucket_ptr()),
-        m_hash_mask(0),
-        m_values(alloc),
-        m_grow_on_next_insert(false) {
-    if (bucket_count > max_bucket_count()) {
-      PADDLE_OH_THROW_OR_TERMINATE(std::length_error,
-                                   "The map exceeds its maximum size.");
-    }
-
-    if (bucket_count > 0) {
-      bucket_count = round_up_to_power_of_two(bucket_count);
-
-      m_buckets_data.resize(bucket_count);
-      m_buckets = m_buckets_data.data(), m_hash_mask = bucket_count - 1;
-    }
-
-    this->max_load_factor(max_load_factor);
-  }
-
-  ordered_hash(const ordered_hash& other)
-      : Hash(other),
-        KeyEqual(other),
-        m_buckets_data(other.m_buckets_data),
-        m_buckets(m_buckets_data.empty() ? static_empty_bucket_ptr()
-                                         : m_buckets_data.data()),
-        m_hash_mask(other.m_hash_mask),
-        m_values(other.m_values),
-        m_load_threshold(other.m_load_threshold),
-        m_max_load_factor(other.m_max_load_factor),
-        m_grow_on_next_insert(other.m_grow_on_next_insert) {}
-
-  ordered_hash(ordered_hash&& other) noexcept(
-      std::is_nothrow_move_constructible<
-          Hash>::value&& std::is_nothrow_move_constructible<KeyEqual>::value&&
-          std::is_nothrow_move_constructible<buckets_container_type>::value&&
-              std::is_nothrow_move_constructible<values_container_type>::value)
-      : Hash(std::move(static_cast<Hash&>(other))),
-        KeyEqual(std::move(static_cast<KeyEqual&>(other))),
-        m_buckets_data(std::move(other.m_buckets_data)),
-        m_buckets(m_buckets_data.empty() ? static_empty_bucket_ptr()
-                                         : m_buckets_data.data()),
-        m_hash_mask(other.m_hash_mask),
-        m_values(std::move(other.m_values)),
-        m_load_threshold(other.m_load_threshold),
-        m_max_load_factor(other.m_max_load_factor),
-        m_grow_on_next_insert(other.m_grow_on_next_insert) {
-    other.m_buckets_data.clear();
-    other.m_buckets = static_empty_bucket_ptr();
-    other.m_hash_mask = 0;
-    other.m_values.clear();
-    other.m_load_threshold = 0;
-    other.m_grow_on_next_insert = false;
-  }
-
-  ordered_hash& operator=(const ordered_hash& other) {
-    if (&other != this) {
-      Hash::operator=(other);
-      KeyEqual::operator=(other);
-
-      m_buckets_data = other.m_buckets_data;
-      m_buckets = m_buckets_data.empty() ? static_empty_bucket_ptr()
-                                         : m_buckets_data.data();
-
-      m_hash_mask = other.m_hash_mask;
-      m_values = other.m_values;
-      m_load_threshold = other.m_load_threshold;
-      m_max_load_factor = other.m_max_load_factor;
-      m_grow_on_next_insert = other.m_grow_on_next_insert;
-    }
-
-    return *this;
-  }
-
-  ordered_hash& operator=(ordered_hash&& other) {
-    other.swap(*this);
-    other.clear();
-
-    return *this;
-  }
-
-  allocator_type get_allocator() const { return m_values.get_allocator(); }
-
-  /*
-   * Iterators
-   */
-  iterator begin() noexcept { return iterator(m_values.begin()); }
-
-  const_iterator begin() const noexcept { return cbegin(); }
-
-  const_iterator cbegin() const noexcept {
-    return const_iterator(m_values.cbegin());
-  }
-
-  iterator end() noexcept { return iterator(m_values.end()); }
-
-  const_iterator end() const noexcept { return cend(); }
-
-  const_iterator cend() const noexcept {
-    return const_iterator(m_values.cend());
-  }
-
-  reverse_iterator rbegin() noexcept {
-    return reverse_iterator(m_values.end());
-  }
-
-  const_reverse_iterator rbegin() const noexcept { return rcbegin(); }
-
-  const_reverse_iterator rcbegin() const noexcept {
-    return const_reverse_iterator(m_values.cend());
-  }
-
-  reverse_iterator rend() noexcept {
-    return reverse_iterator(m_values.begin());
-  }
-
-  const_reverse_iterator rend() const noexcept { return rcend(); }
-
-  const_reverse_iterator rcend() const noexcept {
-    return const_reverse_iterator(m_values.cbegin());
-  }
-
-  /*
-   * Capacity
-   */
-  bool empty() const noexcept { return m_values.empty(); }
-
-  size_type size() const noexcept { return m_values.size(); }
-
-  size_type max_size() const noexcept {
-    return std::min(bucket_entry::max_size(), m_values.max_size());
-  }
-
-  /*
-   * Modifiers
-   */
-  void clear() noexcept {
-    for (auto& bucket : m_buckets_data) {
-      bucket.clear();
-    }
-
-    m_values.clear();
-    m_grow_on_next_insert = false;
-  }
-
-  template <typename P>
-  std::pair<iterator, bool> insert(P&& value) {
-    return insert_impl(KeySelect()(value), std::forward<P>(value));
-  }
-
-  template <typename P>
-  iterator insert_hint(const_iterator hint, P&& value) {
-    if (hint != cend() &&
-        compare_keys(KeySelect()(*hint), KeySelect()(value))) {
-      return mutable_iterator(hint);
-    }
-
-    return insert(std::forward<P>(value)).first;
-  }
-
-  template <class InputIt>
-  void insert(InputIt first, InputIt last) {
-    if (std::is_base_of<
-            std::forward_iterator_tag,
-            typename std::iterator_traits<InputIt>::iterator_category>::value) {
-      const auto nb_elements_insert = std::distance(first, last);
-      const size_type nb_free_buckets = m_load_threshold - size();
-      paddle_oh_assert(m_load_threshold >= size());
-
-      if (nb_elements_insert > 0 &&
-          nb_free_buckets < size_type(nb_elements_insert)) {
-        reserve(size() + size_type(nb_elements_insert));
-      }
-    }
-
-    for (; first != last; ++first) {
-      insert(*first);
-    }
-  }
-
-  template <class K, class M>
-  std::pair<iterator, bool> insert_or_assign(K&& key, M&& value) {
-    auto it = try_emplace(std::forward<K>(key), std::forward<M>(value));
-    if (!it.second) {
-      it.first.value() = std::forward<M>(value);
-    }
-
-    return it;
-  }
-
-  template <class K, class M>
-  iterator insert_or_assign(const_iterator hint, K&& key, M&& obj) {
-    if (hint != cend() && compare_keys(KeySelect()(*hint), key)) {
-      auto it = mutable_iterator(hint);
-      it.value() = std::forward<M>(obj);
-
-      return it;
-    }
-
-    return insert_or_assign(std::forward<K>(key), std::forward<M>(obj)).first;
-  }
-
-  template <class... Args>
-  std::pair<iterator, bool> emplace(Args&&... args) {
-    return insert(value_type(std::forward<Args>(args)...));
-  }
-
-  template <class... Args>
-  iterator emplace_hint(const_iterator hint, Args&&... args) {
-    return insert_hint(hint, value_type(std::forward<Args>(args)...));
-  }
-
-  template <class K, class... Args>
-  std::pair<iterator, bool> try_emplace(K&& key, Args&&... value_args) {
-    return insert_impl(
-        key,
-        std::piecewise_construct,
-        std::forward_as_tuple(std::forward<K>(key)),
-        std::forward_as_tuple(std::forward<Args>(value_args)...));
-  }
-
-  template <class K, class... Args>
-  iterator try_emplace_hint(const_iterator hint, K&& key, Args&&... args) {
-    if (hint != cend() && compare_keys(KeySelect()(*hint), key)) {
-      return mutable_iterator(hint);
-    }
-
-    return try_emplace(std::forward<K>(key), std::forward<Args>(args)...).first;
-  }
-
-  /**
-   * Here to avoid `template<class K> size_type erase(const K& key)` being used
-   * when we use an `iterator` instead of a `const_iterator`.
-   */
-  iterator erase(iterator pos) { return erase(const_iterator(pos)); }
-
-  iterator erase(const_iterator pos) {
-    paddle_oh_assert(pos != cend());
-
-    const std::size_t index_erase = iterator_to_index(pos);
-
-    auto it_bucket = find_key(pos.key(), hash_key(pos.key()));
-    paddle_oh_assert(it_bucket != m_buckets_data.end());
-
-    erase_value_from_bucket(it_bucket);
-
-    /*
-     * One element was removed from m_values, due to the left shift the next
-     * element is now at the position of the previous element (or end if none).
-     */
-    return begin() + index_erase;
-  }
-
-  iterator erase(const_iterator first, const_iterator last) {
-    if (first == last) {
-      return mutable_iterator(first);
-    }
-
-    paddle_oh_assert(std::distance(first, last) > 0);
-    const std::size_t start_index = iterator_to_index(first);
-    const std::size_t nb_values = std::size_t(std::distance(first, last));
-    const std::size_t end_index = start_index + nb_values;
-
-// Delete all values
-#ifdef PADDLE_OH_NO_CONTAINER_ERASE_CONST_ITERATOR
-    auto next_it = m_values.erase(mutable_iterator(first).m_iterator,
-                                  mutable_iterator(last).m_iterator);
-#else
-    auto next_it = m_values.erase(first.m_iterator, last.m_iterator);
-#endif
-
-    /*
-     * Mark the buckets corresponding to the values as empty and do a backward
-     * shift.
-     *
-     * Also, the erase operation on m_values has shifted all the values on the
-     * right of last.m_iterator. Adapt the indexes for these values.
-     */
-    std::size_t ibucket = 0;
-    while (ibucket < m_buckets_data.size()) {
-      if (m_buckets[ibucket].empty()) {
-        ibucket++;
-      } else if (m_buckets[ibucket].index() >= start_index &&
-                 m_buckets[ibucket].index() < end_index) {
-        m_buckets[ibucket].clear();
-        backward_shift(ibucket);
-        // Don't increment ibucket, backward_shift may have replaced current
-        // bucket.
-      } else if (m_buckets[ibucket].index() >= end_index) {
-        m_buckets[ibucket].set_index(
-            index_type(m_buckets[ibucket].index() - nb_values));
-        ibucket++;
-      } else {
-        ibucket++;
-      }
-    }
-
-    return iterator(next_it);
-  }
-
-  template <class K>
-  size_type erase(const K& key) {
-    return erase(key, hash_key(key));
-  }
-
-  template <class K>
-  size_type erase(const K& key, std::size_t hash) {
-    return erase_impl(key, hash);
-  }
-
-  void swap(ordered_hash& other) {
-    using std::swap;
-
-    swap(static_cast<Hash&>(*this), static_cast<Hash&>(other));
-    swap(static_cast<KeyEqual&>(*this), static_cast<KeyEqual&>(other));
-    swap(m_buckets_data, other.m_buckets_data);
-    swap(m_buckets, other.m_buckets);
-    swap(m_hash_mask, other.m_hash_mask);
-    swap(m_values, other.m_values);
-    swap(m_load_threshold, other.m_load_threshold);
-    swap(m_max_load_factor, other.m_max_load_factor);
-    swap(m_grow_on_next_insert, other.m_grow_on_next_insert);
-  }
-
-  /*
-   * Lookup
-   */
-  template <class K,
-            class U = ValueSelect,
-            typename std::enable_if<has_mapped_type<U>::value>::type* = nullptr>
-  typename U::value_type& at(const K& key) {
-    return at(key, hash_key(key));
-  }
-
-  template <class K,
-            class U = ValueSelect,
-            typename std::enable_if<has_mapped_type<U>::value>::type* = nullptr>
-  typename U::value_type& at(const K& key, std::size_t hash) {
-    return const_cast<typename U::value_type&>(
-        static_cast<const ordered_hash*>(this)->at(key, hash));
-  }
-
-  template <class K,
-            class U = ValueSelect,
-            typename std::enable_if<has_mapped_type<U>::value>::type* = nullptr>
-  const typename U::value_type& at(const K& key) const {
-    return at(key, hash_key(key));
-  }
-
-  template <class K,
-            class U = ValueSelect,
-            typename std::enable_if<has_mapped_type<U>::value>::type* = nullptr>
-  const typename U::value_type& at(const K& key, std::size_t hash) const {
-    auto it = find(key, hash);
-    if (it != end()) {
-      return it.value();
-    } else {
-      PADDLE_OH_THROW_OR_TERMINATE(std::out_of_range, "Couldn't find the key.");
-    }
-  }
-
-  template <class K,
-            class U = ValueSelect,
-            typename std::enable_if<has_mapped_type<U>::value>::type* = nullptr>
-  typename U::value_type& operator[](K&& key) {
-    return try_emplace(std::forward<K>(key)).first.value();
-  }
-
-  template <class K>
-  size_type count(const K& key) const {
-    return count(key, hash_key(key));
-  }
-
-  template <class K>
-  size_type count(const K& key, std::size_t hash) const {
-    if (find(key, hash) == cend()) {
-      return 0;
-    } else {
-      return 1;
-    }
-  }
-
-  template <class K>
-  iterator find(const K& key) {
-    return find(key, hash_key(key));
-  }
-
-  template <class K>
-  iterator find(const K& key, std::size_t hash) {
-    auto it_bucket = find_key(key, hash);
-    return (it_bucket != m_buckets_data.end())
-               ? iterator(m_values.begin() + it_bucket->index())
-               : end();
-  }
-
-  template <class K>
-  const_iterator find(const K& key) const {
-    return find(key, hash_key(key));
-  }
-
-  template <class K>
-  const_iterator find(const K& key, std::size_t hash) const {
-    auto it_bucket = find_key(key, hash);
-    return (it_bucket != m_buckets_data.cend())
-               ? const_iterator(m_values.begin() + it_bucket->index())
-               : end();
-  }
-
-  template <class K>
-  bool contains(const K& key) const {
-    return contains(key, hash_key(key));
-  }
-
-  template <class K>
-  bool contains(const K& key, std::size_t hash) const {
-    return find(key, hash) != cend();
-  }
-
-  template <class K>
-  std::pair<iterator, iterator> equal_range(const K& key) {
-    return equal_range(key, hash_key(key));
-  }
-
-  template <class K>
-  std::pair<iterator, iterator> equal_range(const K& key, std::size_t hash) {
-    iterator it = find(key, hash);
-    return std::make_pair(it, (it == end()) ? it : std::next(it));
-  }
-
-  template <class K>
-  std::pair<const_iterator, const_iterator> equal_range(const K& key) const {
-    return equal_range(key, hash_key(key));
-  }
-
-  template <class K>
-  std::pair<const_iterator, const_iterator> equal_range(
-      const K& key, std::size_t hash) const {
-    const_iterator it = find(key, hash);
-    return std::make_pair(it, (it == cend()) ? it : std::next(it));
-  }
-
-  /*
-   * Bucket interface
-   */
-  size_type bucket_count() const { return m_buckets_data.size(); }
-
-  size_type max_bucket_count() const { return m_buckets_data.max_size(); }
-
-  /*
-   *  Hash policy
-   */
-  float load_factor() const {
-    if (bucket_count() == 0) {
-      return 0;
-    }
-
-    return static_cast<float>(size()) / static_cast<float>(bucket_count());
-  }
-
-  float max_load_factor() const { return m_max_load_factor; }
-
-  void max_load_factor(float ml) {
-    m_max_load_factor = clamp(ml,
-                              static_cast<float>(MAX_LOAD_FACTOR__MINIMUM),
-                              static_cast<float>(MAX_LOAD_FACTOR__MAXIMUM));
-
-    m_max_load_factor = ml;
-    m_load_threshold =
-        size_type(static_cast<float>(bucket_count()) * m_max_load_factor);
-  }
-
-  void rehash(size_type count) {
-    count = std::max(
-        count,
-        size_type(std::ceil(static_cast<float>(size()) / max_load_factor())));
-    rehash_impl(count);
-  }
-
-  void reserve(size_type count) {
-    reserve_space_for_values(count);
-
-    count = size_type(std::ceil(static_cast<float>(count) / max_load_factor()));
-    rehash(count);
-  }
-
-  /*
-   * Observers
-   */
-  hasher hash_function() const { return static_cast<const Hash&>(*this); }
-
-  key_equal key_eq() const { return static_cast<const KeyEqual&>(*this); }
-
-  /*
-   * Other
-   */
-  iterator mutable_iterator(const_iterator pos) {
-    return iterator(m_values.begin() + iterator_to_index(pos));
-  }
-
-  iterator nth(size_type index) {
-    paddle_oh_assert(index <= size());
-    return iterator(m_values.begin() + index);
-  }
-
-  const_iterator nth(size_type index) const {
-    paddle_oh_assert(index <= size());
-    return const_iterator(m_values.cbegin() + index);
-  }
-
-  const_reference front() const {
-    paddle_oh_assert(!empty());
-    return m_values.front();
-  }
-
-  const_reference back() const {
-    paddle_oh_assert(!empty());
-    return m_values.back();
-  }
-
-  const values_container_type& values_container() const noexcept {
-    return m_values;
-  }
-
-  template <class U = values_container_type,
-            typename std::enable_if<is_vector<U>::value>::type* = nullptr>
-  const typename values_container_type::value_type* data() const noexcept {
-    return m_values.data();
-  }
-
-  template <class U = values_container_type,
-            typename std::enable_if<is_vector<U>::value>::type* = nullptr>
-  size_type capacity() const noexcept {
-    return m_values.capacity();
-  }
-
-  void shrink_to_fit() { m_values.shrink_to_fit(); }
-
-  template <typename P>
-  std::pair<iterator, bool> insert_at_position(const_iterator pos, P&& value) {
-    return insert_at_position_impl(
-        pos.m_iterator, KeySelect()(value), std::forward<P>(value));
-  }
-
-  template <class... Args>
-  std::pair<iterator, bool> emplace_at_position(const_iterator pos,
-                                                Args&&... args) {
-    return insert_at_position(pos, value_type(std::forward<Args>(args)...));
-  }
-
-  template <class K, class... Args>
-  std::pair<iterator, bool> try_emplace_at_position(const_iterator pos,
-                                                    K&& key,
-                                                    Args&&... value_args) {
-    return insert_at_position_impl(
-        pos.m_iterator,
-        key,
-        std::piecewise_construct,
-        std::forward_as_tuple(std::forward<K>(key)),
-        std::forward_as_tuple(std::forward<Args>(value_args)...));
-  }
-
-  void pop_back() {
-    paddle_oh_assert(!empty());
-    erase(std::prev(end()));
-  }
-
-  /**
-   * Here to avoid `template<class K> size_type unordered_erase(const K& key)`
-   * being used when we use a iterator instead of a const_iterator.
-   */
-  iterator unordered_erase(iterator pos) {
-    return unordered_erase(const_iterator(pos));
-  }
-
-  iterator unordered_erase(const_iterator pos) {
-    const std::size_t index_erase = iterator_to_index(pos);
-    unordered_erase(pos.key());
-
-    /*
-     * One element was deleted, index_erase now points to the next element as
-     * the elements after the deleted value were shifted to the left in m_values
-     * (will be end() if we deleted the last element).
-     */
-    return begin() + index_erase;
-  }
-
-  template <class K>
-  size_type unordered_erase(const K& key) {
-    return unordered_erase(key, hash_key(key));
-  }
-
-  template <class K>
-  size_type unordered_erase(const K& key, std::size_t hash) {
-    auto it_bucket_key = find_key(key, hash);
-    if (it_bucket_key == m_buckets_data.end()) {
-      return 0;
-    }
-
-    /**
-     * If we are not erasing the last element in m_values, we swap
-     * the element we are erasing with the last element. We then would
-     * just have to do a pop_back() in m_values.
-     */
-    if (!compare_keys(key, KeySelect()(back()))) {
-      auto it_bucket_last_elem =
-          find_key(KeySelect()(back()), hash_key(KeySelect()(back())));
-      paddle_oh_assert(it_bucket_last_elem != m_buckets_data.end());
-      paddle_oh_assert(it_bucket_last_elem->index() == m_values.size() - 1);
-
-      using std::swap;
-      swap(m_values[it_bucket_key->index()],
-           m_values[it_bucket_last_elem->index()]);
-      swap(it_bucket_key->index_ref(), it_bucket_last_elem->index_ref());
-    }
-
-    erase_value_from_bucket(it_bucket_key);
-
-    return 1;
-  }
-
-  template <class Serializer>
-  void serialize(Serializer& serializer) const {  // NOLINT
-    serialize_impl(serializer);
-  }
-
-  template <class Deserializer>
-  void deserialize(Deserializer& deserializer,  // NOLINT
-                   bool hash_compatible) {
-    deserialize_impl(deserializer, hash_compatible);
-  }
-
-  friend bool operator==(const ordered_hash& lhs, const ordered_hash& rhs) {
-    return lhs.m_values == rhs.m_values;
-  }
-
-  friend bool operator!=(const ordered_hash& lhs, const ordered_hash& rhs) {
-    return lhs.m_values != rhs.m_values;
-  }
-
-  friend bool operator<(const ordered_hash& lhs, const ordered_hash& rhs) {
-    return lhs.m_values < rhs.m_values;
-  }
-
-  friend bool operator<=(const ordered_hash& lhs, const ordered_hash& rhs) {
-    return lhs.m_values <= rhs.m_values;
-  }
-
-  friend bool operator>(const ordered_hash& lhs, const ordered_hash& rhs) {
-    return lhs.m_values > rhs.m_values;
-  }
-
-  friend bool operator>=(const ordered_hash& lhs, const ordered_hash& rhs) {
-    return lhs.m_values >= rhs.m_values;
-  }
-
- private:
-  template <class K>
-  std::size_t hash_key(const K& key) const {
-    return Hash::operator()(key);
-  }
-
-  template <class K1, class K2>
-  bool compare_keys(const K1& key1, const K2& key2) const {
-    return KeyEqual::operator()(key1, key2);
-  }
-
-  template <class K>
-  typename buckets_container_type::iterator find_key(const K& key,
-                                                     std::size_t hash) {
-    auto it = static_cast<const ordered_hash*>(this)->find_key(key, hash);
-    return m_buckets_data.begin() + std::distance(m_buckets_data.cbegin(), it);
-  }
-
-  /**
-   * Return bucket which has the key 'key' or m_buckets_data.end() if none.
-   *
-   * From the bucket_for_hash, search for the value until we either find an
-   * empty bucket or a bucket which has a value with a distance from its ideal
-   * bucket longer than the probe length for the value we are looking for.
-   */
-  template <class K>
-  typename buckets_container_type::const_iterator find_key(
-      const K& key, std::size_t hash) const {
-    for (std::size_t ibucket = bucket_for_hash(hash),
-                     dist_from_ideal_bucket = 0;
-         ;  // NOLINT
-         ibucket = next_bucket(ibucket), dist_from_ideal_bucket++) {
-      if (m_buckets[ibucket].empty()) {
-        return m_buckets_data.end();
-      } else if (m_buckets[ibucket].truncated_hash() ==
-                     bucket_entry::truncate_hash(hash) &&
-                 compare_keys(
-                     key, KeySelect()(m_values[m_buckets[ibucket].index()]))) {
-        return m_buckets_data.begin() + ibucket;
-      } else if (dist_from_ideal_bucket > distance_from_ideal_bucket(ibucket)) {
-        return m_buckets_data.end();
-      }
-    }
-  }
-
-  void rehash_impl(size_type bucket_count) {
-    paddle_oh_assert(
-        bucket_count >=
-        size_type(std::ceil(static_cast<float>(size()) / max_load_factor())));
-
-    if (bucket_count > max_bucket_count()) {
-      PADDLE_OH_THROW_OR_TERMINATE(std::length_error,
-                                   "The map exceeds its maximum size.");
-    }
-
-    if (bucket_count > 0) {
-      bucket_count = round_up_to_power_of_two(bucket_count);
-    }
-
-    if (bucket_count == this->bucket_count()) {
-      return;
-    }
-
-    buckets_container_type old_buckets(bucket_count);
-    m_buckets_data.swap(old_buckets);
-    m_buckets = m_buckets_data.empty() ? static_empty_bucket_ptr()
-                                       : m_buckets_data.data();
-    // Everything should be noexcept from here.
-
-    m_hash_mask = (bucket_count > 0) ? (bucket_count - 1) : 0;
-    this->max_load_factor(m_max_load_factor);
-    m_grow_on_next_insert = false;
-
-    for (const bucket_entry& old_bucket : old_buckets) {
-      if (old_bucket.empty()) {
-        continue;
-      }
-
-      truncated_hash_type insert_hash = old_bucket.truncated_hash();
-      index_type insert_index = old_bucket.index();
-
-      for (std::size_t ibucket = bucket_for_hash(insert_hash),
-                       dist_from_ideal_bucket = 0;
-           ;  // NOLINT
-           ibucket = next_bucket(ibucket), dist_from_ideal_bucket++) {
-        if (m_buckets[ibucket].empty()) {
-          m_buckets[ibucket].set_index(insert_index);
-          m_buckets[ibucket].set_hash(insert_hash);
-          break;
-        }
-
-        const std::size_t distance = distance_from_ideal_bucket(ibucket);
-        if (dist_from_ideal_bucket > distance) {
-          std::swap(insert_index, m_buckets[ibucket].index_ref());
-          std::swap(insert_hash, m_buckets[ibucket].truncated_hash_ref());
-          dist_from_ideal_bucket = distance;
-        }
-      }
-    }
-  }
-
-  template <class T = values_container_type,
-            typename std::enable_if<is_vector<T>::value>::type* = nullptr>
-  void reserve_space_for_values(size_type count) {
-    m_values.reserve(count);
-  }
-
-  template <class T = values_container_type,
-            typename std::enable_if<!is_vector<T>::value>::type* = nullptr>
-  void reserve_space_for_values(size_type /*count*/) {}
-
-  /**
-   * Swap the empty bucket with the values on its right until we cross another
-   * empty bucket or if the other bucket has a distance_from_ideal_bucket == 0.
-   */
-  void backward_shift(std::size_t empty_ibucket) noexcept {
-    paddle_oh_assert(m_buckets[empty_ibucket].empty());
-
-    std::size_t previous_ibucket = empty_ibucket;
-    for (std::size_t current_ibucket = next_bucket(previous_ibucket);
-         !m_buckets[current_ibucket].empty() &&
-         distance_from_ideal_bucket(current_ibucket) > 0;
-         previous_ibucket = current_ibucket,
-                     current_ibucket = next_bucket(current_ibucket)) {
-      std::swap(m_buckets[current_ibucket], m_buckets[previous_ibucket]);
-    }
-  }
-
-  void erase_value_from_bucket(
-      typename buckets_container_type::iterator it_bucket) {
-    paddle_oh_assert(it_bucket != m_buckets_data.end() && !it_bucket->empty());
-
-    m_values.erase(m_values.begin() + it_bucket->index());
-
-    /*
-     * m_values.erase shifted all the values on the right of the erased value,
-     * shift the indexes by -1 in the buckets array for these values.
-     */
-    if (it_bucket->index() != m_values.size()) {
-      shift_indexes_in_buckets(it_bucket->index(), -1);
-    }
-
-    // Mark the bucket as empty and do a backward shift of the values on the
-    // right
-    it_bucket->clear();
-    backward_shift(
-        std::size_t(std::distance(m_buckets_data.begin(), it_bucket)));
-  }
-
-  /**
-   * Go through each value from [from_ivalue, m_values.size()) in m_values and
-   * for each bucket corresponding to the value, shift the index by delta.
-   *
-   * delta must be equal to 1 or -1.
-   */
-  void shift_indexes_in_buckets(index_type from_ivalue, int delta) noexcept {
-    paddle_oh_assert(delta == 1 || delta == -1);
-
-    for (std::size_t ivalue = from_ivalue; ivalue < m_values.size(); ivalue++) {
-      // All the values in m_values have been shifted by delta. Find the bucket
-      // corresponding to the value m_values[ivalue]
-      const index_type old_index = static_cast<index_type>(ivalue - delta);
-
-      std::size_t ibucket =
-          bucket_for_hash(hash_key(KeySelect()(m_values[ivalue])));
-      while (m_buckets[ibucket].index() != old_index) {
-        ibucket = next_bucket(ibucket);
-      }
-
-      m_buckets[ibucket].set_index(index_type(ivalue));
-    }
-  }
-
-  template <class K>
-  size_type erase_impl(const K& key, std::size_t hash) {
-    auto it_bucket = find_key(key, hash);
-    if (it_bucket != m_buckets_data.end()) {
-      erase_value_from_bucket(it_bucket);
-
-      return 1;
-    } else {
-      return 0;
-    }
-  }
-
-  /**
-   * Insert the element at the end.
-   */
-  template <class K, class... Args>
-  std::pair<iterator, bool> insert_impl(const K& key,
-                                        Args&&... value_type_args) {
-    const std::size_t hash = hash_key(key);
-
-    std::size_t ibucket = bucket_for_hash(hash);
-    std::size_t dist_from_ideal_bucket = 0;
-
-    while (!m_buckets[ibucket].empty() &&
-           dist_from_ideal_bucket <= distance_from_ideal_bucket(ibucket)) {
-      if (m_buckets[ibucket].truncated_hash() ==
-              bucket_entry::truncate_hash(hash) &&
-          compare_keys(key,
-                       KeySelect()(m_values[m_buckets[ibucket].index()]))) {
-        return std::make_pair(begin() + m_buckets[ibucket].index(), false);
-      }
-
-      ibucket = next_bucket(ibucket);
-      dist_from_ideal_bucket++;
-    }
-
-    if (size() >= max_size()) {
-      PADDLE_OH_THROW_OR_TERMINATE(
-          std::length_error, "We reached the maximum size for the hash table.");
-    }
-
-    if (grow_on_high_load()) {
-      ibucket = bucket_for_hash(hash);
-      dist_from_ideal_bucket = 0;
-    }
-
-    m_values.emplace_back(std::forward<Args>(value_type_args)...);
-    insert_index(ibucket,
-                 dist_from_ideal_bucket,
-                 index_type(m_values.size() - 1),
-                 bucket_entry::truncate_hash(hash));
-
-    return std::make_pair(std::prev(end()), true);
-  }
-
-  /**
-   * Insert the element before insert_position.
-   */
-  template <class K, class... Args>
-  std::pair<iterator, bool> insert_at_position_impl(
-      typename values_container_type::const_iterator insert_position,
-      const K& key,
-      Args&&... value_type_args) {
-    const std::size_t hash = hash_key(key);
-
-    std::size_t ibucket = bucket_for_hash(hash);
-    std::size_t dist_from_ideal_bucket = 0;
-
-    while (!m_buckets[ibucket].empty() &&
-           dist_from_ideal_bucket <= distance_from_ideal_bucket(ibucket)) {
-      if (m_buckets[ibucket].truncated_hash() ==
-              bucket_entry::truncate_hash(hash) &&
-          compare_keys(key,
-                       KeySelect()(m_values[m_buckets[ibucket].index()]))) {
-        return std::make_pair(begin() + m_buckets[ibucket].index(), false);
-      }
-
-      ibucket = next_bucket(ibucket);
-      dist_from_ideal_bucket++;
-    }
-
-    if (size() >= max_size()) {
-      PADDLE_OH_THROW_OR_TERMINATE(
-          std::length_error, "We reached the maximum size for the hash table.");
-    }
-
-    if (grow_on_high_load()) {
-      ibucket = bucket_for_hash(hash);
-      dist_from_ideal_bucket = 0;
-    }
-
-    const index_type index_insert_position =
-        index_type(std::distance(m_values.cbegin(), insert_position));
-
-#ifdef PADDLE_OH_NO_CONTAINER_EMPLACE_CONST_ITERATOR
-    m_values.emplace(
-        m_values.begin() + std::distance(m_values.cbegin(), insert_position),
-        std::forward<Args>(value_type_args)...);
-#else
-    m_values.emplace(insert_position, std::forward<Args>(value_type_args)...);
-#endif
-
-    insert_index(ibucket,
-                 dist_from_ideal_bucket,
-                 index_insert_position,
-                 bucket_entry::truncate_hash(hash));
-
-    /*
-     * The insertion didn't happend at the end of the m_values container,
-     * we need to shift the indexes in m_buckets_data.
-     */
-    if (index_insert_position != m_values.size() - 1) {
-      shift_indexes_in_buckets(index_insert_position + 1, 1);
-    }
-
-    return std::make_pair(iterator(m_values.begin() + index_insert_position),
-                          true);
-  }
-
-  void insert_index(std::size_t ibucket,
-                    std::size_t dist_from_ideal_bucket,
-                    index_type index_insert,
-                    truncated_hash_type hash_insert) noexcept {
-    while (!m_buckets[ibucket].empty()) {
-      const std::size_t distance = distance_from_ideal_bucket(ibucket);
-      if (dist_from_ideal_bucket > distance) {
-        std::swap(index_insert, m_buckets[ibucket].index_ref());
-        std::swap(hash_insert, m_buckets[ibucket].truncated_hash_ref());
-
-        dist_from_ideal_bucket = distance;
-      }
-
-      ibucket = next_bucket(ibucket);
-      dist_from_ideal_bucket++;
-
-      if (dist_from_ideal_bucket > REHASH_ON_HIGH_NB_PROBES__NPROBES &&
-          !m_grow_on_next_insert &&
-          load_factor() >= REHASH_ON_HIGH_NB_PROBES__MIN_LOAD_FACTOR) {
-        // We don't want to grow the map now as we need this method to be
-        // noexcept. Do it on next insert.
-        m_grow_on_next_insert = true;
-      }
-    }
-
-    m_buckets[ibucket].set_index(index_insert);
-    m_buckets[ibucket].set_hash(hash_insert);
-  }
-
-  std::size_t distance_from_ideal_bucket(std::size_t ibucket) const noexcept {
-    const std::size_t ideal_bucket =
-        bucket_for_hash(m_buckets[ibucket].truncated_hash());
-
-    if (ibucket >= ideal_bucket) {
-      return ibucket - ideal_bucket;
-    } else {
-      // If the bucket is smaller than the ideal bucket for the value, there was
-      // a
-      // wrapping at the end of the bucket array due to the modulo.
-      return (bucket_count() + ibucket) - ideal_bucket;
-    }
-  }
-
-  std::size_t next_bucket(std::size_t index) const noexcept {
-    paddle_oh_assert(index < m_buckets_data.size());
-
-    index++;
-    return (index < m_buckets_data.size()) ? index : 0;
-  }
-
-  std::size_t bucket_for_hash(std::size_t hash) const noexcept {
-    return hash & m_hash_mask;
-  }
-
-  std::size_t iterator_to_index(const_iterator it) const noexcept {
-    const auto dist = std::distance(cbegin(), it);
-    paddle_oh_assert(dist >= 0);
-
-    return std::size_t(dist);
-  }
-
-  /**
-   * Return true if the map has been rehashed.
-   */
-  bool grow_on_high_load() {
-    if (m_grow_on_next_insert || size() >= m_load_threshold) {
-      rehash_impl(std::max(size_type(1), bucket_count() * 2));
-      m_grow_on_next_insert = false;
-
-      return true;
-    } else {
-      return false;
-    }
-  }
-
-  template <class Serializer>
-  void serialize_impl(Serializer& serializer) const {  // NOLINT
-    const slz_size_type version = SERIALIZATION_PROTOCOL_VERSION;
-    serializer(version);
-
-    const slz_size_type nb_elements = m_values.size();
-    serializer(nb_elements);
-
-    const slz_size_type bucket_count = m_buckets_data.size();
-    serializer(bucket_count);
-
-    const float max_load_factor = m_max_load_factor;
-    serializer(max_load_factor);
-
-    for (const value_type& value : m_values) {
-      serializer(value);
-    }
-
-    for (const bucket_entry& bucket : m_buckets_data) {
-      bucket.serialize(serializer);
-    }
-  }
-
-  template <class Deserializer>
-  void deserialize_impl(Deserializer& deserializer,  // NOLINT
-                        bool hash_compatible) {
-    paddle_oh_assert(
-        m_buckets_data.empty());  // Current hash table must be empty
-
-    const slz_size_type version =
-        deserialize_value<slz_size_type>(deserializer);
-    // For now we only have one version of the serialization protocol.
-    // If it doesn't match there is a problem with the file.
-    if (version != SERIALIZATION_PROTOCOL_VERSION) {
-      PADDLE_OH_THROW_OR_TERMINATE(std::runtime_error,
-                                   "Can't deserialize the ordered_map/set. "
-                                   "The protocol version header is invalid.");
-    }
-
-    const slz_size_type nb_elements =
-        deserialize_value<slz_size_type>(deserializer);
-    const slz_size_type bucket_count_ds =
-        deserialize_value<slz_size_type>(deserializer);
-    const float max_load_factor = deserialize_value<float>(deserializer);
-
-    if (max_load_factor < MAX_LOAD_FACTOR__MINIMUM ||
-        max_load_factor > MAX_LOAD_FACTOR__MAXIMUM) {
-      PADDLE_OH_THROW_OR_TERMINATE(
-          std::runtime_error,
-          "Invalid max_load_factor. Check that the serializer "
-          "and deserializer support floats correctly as they "
-          "can be converted implicitly to ints.");
-    }
-
-    this->max_load_factor(max_load_factor);
-
-    if (bucket_count_ds == 0) {
-      paddle_oh_assert(nb_elements == 0);
-      return;
-    }
-
-    if (!hash_compatible) {
-      reserve(numeric_cast<size_type>(nb_elements,
-                                      "Deserialized nb_elements is too big."));
-      for (slz_size_type el = 0; el < nb_elements; el++) {
-        insert(deserialize_value<value_type>(deserializer));
-      }
-    } else {
-      m_buckets_data.reserve(numeric_cast<size_type>(
-          bucket_count_ds, "Deserialized bucket_count is too big."));
-      m_buckets = m_buckets_data.data(),
-      m_hash_mask = m_buckets_data.capacity() - 1;
-
-      reserve_space_for_values(numeric_cast<size_type>(
-          nb_elements, "Deserialized nb_elements is too big."));
-      for (slz_size_type el = 0; el < nb_elements; el++) {
-        m_values.push_back(deserialize_value<value_type>(deserializer));
-      }
-
-      for (slz_size_type b = 0; b < bucket_count_ds; b++) {
-        m_buckets_data.push_back(bucket_entry::deserialize(deserializer));
-      }
-    }
-  }
-
-  static std::size_t round_up_to_power_of_two(std::size_t value) {
-    if (is_power_of_two(value)) {
-      return value;
-    }
-
-    if (value == 0) {
-      return 1;
-    }
-
-    --value;
-    for (std::size_t i = 1; i < sizeof(std::size_t) * CHAR_BIT; i *= 2) {
-      value |= value >> i;
-    }
-
-    return value + 1;
-  }
-
-  static constexpr bool is_power_of_two(std::size_t value) {
-    return value != 0 && (value & (value - 1)) == 0;
-  }
-
- public:
-  static const size_type DEFAULT_INIT_BUCKETS_SIZE = 0;
-  static constexpr float DEFAULT_MAX_LOAD_FACTOR = 0.75f;
-
- private:
-  static constexpr float MAX_LOAD_FACTOR__MINIMUM = 0.1f;
-  static constexpr float MAX_LOAD_FACTOR__MAXIMUM = 0.95f;
-
-  static const size_type REHASH_ON_HIGH_NB_PROBES__NPROBES = 128;
-  static constexpr float REHASH_ON_HIGH_NB_PROBES__MIN_LOAD_FACTOR = 0.15f;
-
-  /**
-   * Protocol version currenlty used for serialization.
-   */
-  static const slz_size_type SERIALIZATION_PROTOCOL_VERSION = 1;
-
-  /**
-   * Return an always valid pointer to an static empty bucket_entry with
-   * last_bucket() == true.
-   */
-  bucket_entry* static_empty_bucket_ptr() {
-    static bucket_entry empty_bucket;
-    return &empty_bucket;
-  }
-
- private:
-  buckets_container_type m_buckets_data;
-
-  /**
-   * Points to m_buckets_data.data() if !m_buckets_data.empty() otherwise points
-   * to static_empty_bucket_ptr. This variable is useful to avoid the cost of
-   * checking if m_buckets_data is empty when trying to find an element.
-   *
-   * TODO Remove m_buckets_data and only use a pointer+size instead of a
-   * pointer+vector to save some space in the ordered_hash object.
-   */
-  bucket_entry* m_buckets;
-
-  size_type m_hash_mask;
-
-  values_container_type m_values;
-
-  size_type m_load_threshold;
-  float m_max_load_factor;
-
-  bool m_grow_on_next_insert;
-};
-
-}  // end namespace detail_ordered_hash
-
-}  // end namespace paddle
diff --git a/paddle/utils/ordered_map.h b/paddle/utils/ordered_map.h
deleted file mode 100644
index 10bf5628ed3e8..0000000000000
--- a/paddle/utils/ordered_map.h
+++ /dev/null
@@ -1,1022 +0,0 @@
-/**
- * Copy from https://github.com/Tessil/ordered-map
- * Modified the following points:
- * 1. modify namespace from `tsl` to `paddle`
- * 2. modify some naming prefixes from `tsl` to `paddle`
- * 3. refine code-format by pre-commit hook
- */
-
-/**
- * MIT License
- *
- * Copyright (c) 2017 Thibaut Goetghebuer-Planchon <tessil@gmx.com>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include <cstddef>
-#include <cstdint>
-#include <deque>
-#include <functional>
-#include <initializer_list>
-#include <memory>
-#include <type_traits>
-#include <utility>
-#include <vector>
-
-#include "paddle/utils/ordered_hash.h"
-
-namespace paddle {
-
-/**
- * Implementation of an hash map using open addressing with robin hood with
- * backshift delete to resolve collisions.
- *
- * The particularity of this hash map is that it remembers the order in which
- * the elements were added and provide a way to access the structure which
- * stores these values through the 'values_container()' method. The used
- * container is defined by ValueTypeContainer, by default a std::deque is used
- * (grows faster) but a std::vector may be used. In this case the map provides a
- * 'data()' method which give a direct access to the memory used to store the
- * values (which can be useful to communicate with C API's).
- *
- * The Key and T must be copy constructible and/or move constructible. To use
- * `unordered_erase` they both must be swappable.
- *
- * The behaviour of the hash map is undefined if the destructor of Key or T
- * throws an exception.
- *
- * By default the maximum size of a map is limited to 2^32 - 1 values, if needed
- * this can be changed through the IndexType template parameter. Using an
- * `uint64_t` will raise this limit to 2^64 - 1 values but each bucket will use
- * 16 bytes instead of 8 bytes in addition to the space needed to store the
- * values.
- *
- * Iterators invalidation:
- *  - clear, operator=, reserve, rehash: always invalidate the iterators (also
- * invalidate end()).
- *  - insert, emplace, emplace_hint, operator[]: when a std::vector is used as
- * ValueTypeContainer and if size() < capacity(), only end(). Otherwise all the
- * iterators are invalidated if an insert occurs.
- *  - erase, unordered_erase: when a std::vector is used as ValueTypeContainer
- * invalidate the iterator of the erased element and all the ones after the
- * erased element (including end()). Otherwise all the iterators are invalidated
- * if an erase occurs.
- */
-template <class Key,
-          class T,
-          class Hash = std::hash<Key>,
-          class KeyEqual = std::equal_to<Key>,
-          class Allocator = std::allocator<std::pair<Key, T>>,
-          class ValueTypeContainer = std::deque<std::pair<Key, T>, Allocator>,
-          class IndexType = std::uint_least32_t>
-class ordered_map {
- private:
-  template <typename U>
-  using has_is_transparent = paddle::detail_ordered_hash::has_is_transparent<U>;
-
-  class KeySelect {
-   public:
-    using key_type = Key;
-
-    const key_type& operator()(const std::pair<Key, T>& key_value) const
-        noexcept {
-      return key_value.first;
-    }
-
-    key_type& operator()(std::pair<Key, T>& key_value) noexcept {  // NOLINT
-      return key_value.first;
-    }
-  };
-
-  class ValueSelect {
-   public:
-    using value_type = T;
-
-    const value_type& operator()(const std::pair<Key, T>& key_value) const
-        noexcept {
-      return key_value.second;
-    }
-
-    value_type& operator()(std::pair<Key, T>& key_value) noexcept {  // NOLINT
-      return key_value.second;
-    }
-  };
-
-  using ht = detail_ordered_hash::ordered_hash<std::pair<Key, T>,
-                                               KeySelect,
-                                               ValueSelect,
-                                               Hash,
-                                               KeyEqual,
-                                               Allocator,
-                                               ValueTypeContainer,
-                                               IndexType>;
-
- public:
-  using key_type = typename ht::key_type;
-  using mapped_type = T;
-  using value_type = typename ht::value_type;
-  using size_type = typename ht::size_type;
-  using difference_type = typename ht::difference_type;
-  using hasher = typename ht::hasher;
-  using key_equal = typename ht::key_equal;
-  using allocator_type = typename ht::allocator_type;
-  using reference = typename ht::reference;
-  using const_reference = typename ht::const_reference;
-  using pointer = typename ht::pointer;
-  using const_pointer = typename ht::const_pointer;
-  using iterator = typename ht::iterator;
-  using const_iterator = typename ht::const_iterator;
-  using reverse_iterator = typename ht::reverse_iterator;
-  using const_reverse_iterator = typename ht::const_reverse_iterator;
-
-  using values_container_type = typename ht::values_container_type;
-
-  /*
-   * Constructors
-   */
-  ordered_map() : ordered_map(ht::DEFAULT_INIT_BUCKETS_SIZE) {}
-
-  explicit ordered_map(size_type bucket_count,
-                       const Hash& hash = Hash(),
-                       const KeyEqual& equal = KeyEqual(),
-                       const Allocator& alloc = Allocator())
-      : m_ht(bucket_count, hash, equal, alloc, ht::DEFAULT_MAX_LOAD_FACTOR) {}
-
-  ordered_map(size_type bucket_count, const Allocator& alloc)
-      : ordered_map(bucket_count, Hash(), KeyEqual(), alloc) {}
-
-  ordered_map(size_type bucket_count, const Hash& hash, const Allocator& alloc)
-      : ordered_map(bucket_count, hash, KeyEqual(), alloc) {}
-
-  explicit ordered_map(const Allocator& alloc)
-      : ordered_map(ht::DEFAULT_INIT_BUCKETS_SIZE, alloc) {}
-
-  template <class InputIt>
-  ordered_map(InputIt first,
-              InputIt last,
-              size_type bucket_count = ht::DEFAULT_INIT_BUCKETS_SIZE,
-              const Hash& hash = Hash(),
-              const KeyEqual& equal = KeyEqual(),
-              const Allocator& alloc = Allocator())
-      : ordered_map(bucket_count, hash, equal, alloc) {
-    insert(first, last);
-  }
-
-  template <class InputIt>
-  ordered_map(InputIt first,
-              InputIt last,
-              size_type bucket_count,
-              const Allocator& alloc)
-      : ordered_map(first, last, bucket_count, Hash(), KeyEqual(), alloc) {}
-
-  template <class InputIt>
-  ordered_map(InputIt first,
-              InputIt last,
-              size_type bucket_count,
-              const Hash& hash,
-              const Allocator& alloc)
-      : ordered_map(first, last, bucket_count, hash, KeyEqual(), alloc) {}
-
-  ordered_map(std::initializer_list<value_type> init,
-              size_type bucket_count = ht::DEFAULT_INIT_BUCKETS_SIZE,
-              const Hash& hash = Hash(),
-              const KeyEqual& equal = KeyEqual(),
-              const Allocator& alloc = Allocator())
-      : ordered_map(
-            init.begin(), init.end(), bucket_count, hash, equal, alloc) {}
-
-  ordered_map(std::initializer_list<value_type> init,
-              size_type bucket_count,
-              const Allocator& alloc)
-      : ordered_map(
-            init.begin(), init.end(), bucket_count, Hash(), KeyEqual(), alloc) {
-  }
-
-  ordered_map(std::initializer_list<value_type> init,
-              size_type bucket_count,
-              const Hash& hash,
-              const Allocator& alloc)
-      : ordered_map(
-            init.begin(), init.end(), bucket_count, hash, KeyEqual(), alloc) {}
-
-  ordered_map& operator=(std::initializer_list<value_type> ilist) {
-    m_ht.clear();
-
-    m_ht.reserve(ilist.size());
-    m_ht.insert(ilist.begin(), ilist.end());
-
-    return *this;
-  }
-
-  allocator_type get_allocator() const { return m_ht.get_allocator(); }
-
-  /*
-   * Iterators
-   */
-  iterator begin() noexcept { return m_ht.begin(); }
-  const_iterator begin() const noexcept { return m_ht.begin(); }
-  const_iterator cbegin() const noexcept { return m_ht.cbegin(); }
-
-  iterator end() noexcept { return m_ht.end(); }
-  const_iterator end() const noexcept { return m_ht.end(); }
-  const_iterator cend() const noexcept { return m_ht.cend(); }
-
-  reverse_iterator rbegin() noexcept { return m_ht.rbegin(); }
-  const_reverse_iterator rbegin() const noexcept { return m_ht.rbegin(); }
-  const_reverse_iterator rcbegin() const noexcept { return m_ht.rcbegin(); }
-
-  reverse_iterator rend() noexcept { return m_ht.rend(); }
-  const_reverse_iterator rend() const noexcept { return m_ht.rend(); }
-  const_reverse_iterator rcend() const noexcept { return m_ht.rcend(); }
-
-  /*
-   * Capacity
-   */
-  bool empty() const noexcept { return m_ht.empty(); }
-  size_type size() const noexcept { return m_ht.size(); }
-  size_type max_size() const noexcept { return m_ht.max_size(); }
-
-  /*
-   * Modifiers
-   */
-  void clear() noexcept { m_ht.clear(); }
-
-  std::pair<iterator, bool> insert(const value_type& value) {
-    return m_ht.insert(value);
-  }
-
-  template <class P,
-            typename std::enable_if<
-                std::is_constructible<value_type, P&&>::value>::type* = nullptr>
-  std::pair<iterator, bool> insert(P&& value) {
-    return m_ht.emplace(std::forward<P>(value));
-  }
-
-  std::pair<iterator, bool> insert(value_type&& value) {
-    return m_ht.insert(std::move(value));
-  }
-
-  iterator insert(const_iterator hint, const value_type& value) {
-    return m_ht.insert_hint(hint, value);
-  }
-
-  template <class P,
-            typename std::enable_if<
-                std::is_constructible<value_type, P&&>::value>::type* = nullptr>
-  iterator insert(const_iterator hint, P&& value) {
-    return m_ht.emplace_hint(hint, std::forward<P>(value));
-  }
-
-  iterator insert(const_iterator hint, value_type&& value) {
-    return m_ht.insert_hint(hint, std::move(value));
-  }
-
-  template <class InputIt>
-  void insert(InputIt first, InputIt last) {
-    m_ht.insert(first, last);
-  }
-  void insert(std::initializer_list<value_type> ilist) {
-    m_ht.insert(ilist.begin(), ilist.end());
-  }
-
-  template <class M>
-  std::pair<iterator, bool> insert_or_assign(const key_type& k, M&& obj) {
-    return m_ht.insert_or_assign(k, std::forward<M>(obj));
-  }
-
-  template <class M>
-  std::pair<iterator, bool> insert_or_assign(key_type&& k, M&& obj) {
-    return m_ht.insert_or_assign(std::move(k), std::forward<M>(obj));
-  }
-
-  template <class M>
-  iterator insert_or_assign(const_iterator hint, const key_type& k, M&& obj) {
-    return m_ht.insert_or_assign(hint, k, std::forward<M>(obj));
-  }
-
-  template <class M>
-  iterator insert_or_assign(const_iterator hint, key_type&& k, M&& obj) {
-    return m_ht.insert_or_assign(hint, std::move(k), std::forward<M>(obj));
-  }
-
-  /**
-   * Due to the way elements are stored, emplace will need to move or copy the
-   * key-value once. The method is equivalent to
-   * insert(value_type(std::forward<Args>(args)...));
-   *
-   * Mainly here for compatibility with the std::unordered_map interface.
-   */
-  template <class... Args>
-  std::pair<iterator, bool> emplace(Args&&... args) {
-    return m_ht.emplace(std::forward<Args>(args)...);
-  }
-
-  /**
-   * Due to the way elements are stored, emplace_hint will need to move or copy
-   * the key-value once. The method is equivalent to insert(hint,
-   * value_type(std::forward<Args>(args)...));
-   *
-   * Mainly here for compatibility with the std::unordered_map interface.
-   */
-  template <class... Args>
-  iterator emplace_hint(const_iterator hint, Args&&... args) {
-    return m_ht.emplace_hint(hint, std::forward<Args>(args)...);
-  }
-
-  template <class... Args>
-  std::pair<iterator, bool> try_emplace(const key_type& k, Args&&... args) {
-    return m_ht.try_emplace(k, std::forward<Args>(args)...);
-  }
-
-  template <class... Args>
-  std::pair<iterator, bool> try_emplace(key_type&& k, Args&&... args) {
-    return m_ht.try_emplace(std::move(k), std::forward<Args>(args)...);
-  }
-
-  template <class... Args>
-  iterator try_emplace(const_iterator hint, const key_type& k, Args&&... args) {
-    return m_ht.try_emplace_hint(hint, k, std::forward<Args>(args)...);
-  }
-
-  template <class... Args>
-  iterator try_emplace(const_iterator hint, key_type&& k, Args&&... args) {
-    return m_ht.try_emplace_hint(
-        hint, std::move(k), std::forward<Args>(args)...);
-  }
-
-  /**
-   * When erasing an element, the insert order will be preserved and no holes
-   * will be present in the container returned by 'values_container()'.
-   *
-   * The method is in O(n), if the order is not important 'unordered_erase(...)'
-   * method is faster with an O(1) average complexity.
-   */
-  iterator erase(iterator pos) { return m_ht.erase(pos); }
-
-  /**
-   * @copydoc erase(iterator pos)
-   */
-  iterator erase(const_iterator pos) { return m_ht.erase(pos); }
-
-  /**
-   * @copydoc erase(iterator pos)
-   */
-  iterator erase(const_iterator first, const_iterator last) {
-    return m_ht.erase(first, last);
-  }
-
-  /**
-   * @copydoc erase(iterator pos)
-   */
-  size_type erase(const key_type& key) { return m_ht.erase(key); }
-
-  /**
-   * @copydoc erase(iterator pos)
-   *
-   * Use the hash value 'precalculated_hash' instead of hashing the key. The
-   * hash value should be the same as hash_function()(key). Useful to speed-up
-   * the lookup to the value if you already have the hash.
-   */
-  size_type erase(const key_type& key, std::size_t precalculated_hash) {
-    return m_ht.erase(key, precalculated_hash);
-  }
-
-  /**
-   * @copydoc erase(iterator pos)
-   *
-   * This overload only participates in the overload resolution if the typedef
-   * KeyEqual::is_transparent exists. If so, K must be hashable and comparable
-   * to Key.
-   */
-  template <
-      class K,
-      class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
-  size_type erase(const K& key) {
-    return m_ht.erase(key);
-  }
-
-  /**
-   * @copydoc erase(const key_type& key, std::size_t precalculated_hash)
-   *
-   * This overload only participates in the overload resolution if the typedef
-   * KeyEqual::is_transparent exists. If so, K must be hashable and comparable
-   * to Key.
-   */
-  template <
-      class K,
-      class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
-  size_type erase(const K& key, std::size_t precalculated_hash) {
-    return m_ht.erase(key, precalculated_hash);
-  }
-
-  void swap(ordered_map& other) { other.m_ht.swap(m_ht); }
-
-  /*
-   * Lookup
-   */
-  T& at(const Key& key) { return m_ht.at(key); }
-
-  /**
-   * Use the hash value 'precalculated_hash' instead of hashing the key. The
-   * hash value should be the same as hash_function()(key). Useful to speed-up
-   * the lookup if you already have the hash.
-   */
-  T& at(const Key& key, std::size_t precalculated_hash) {
-    return m_ht.at(key, precalculated_hash);
-  }
-
-  const T& at(const Key& key) const { return m_ht.at(key); }
-
-  /**
-   * @copydoc at(const Key& key, std::size_t precalculated_hash)
-   */
-  const T& at(const Key& key, std::size_t precalculated_hash) const {
-    return m_ht.at(key, precalculated_hash);
-  }
-
-  /**
-   * This overload only participates in the overload resolution if the typedef
-   * KeyEqual::is_transparent exists. If so, K must be hashable and comparable
-   * to Key.
-   */
-  template <
-      class K,
-      class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
-  T& at(const K& key) {
-    return m_ht.at(key);
-  }
-
-  /**
-   * @copydoc at(const K& key)
-   *
-   * Use the hash value 'precalculated_hash' instead of hashing the key. The
-   * hash value should be the same as hash_function()(key). Useful to speed-up
-   * the lookup if you already have the hash.
-   */
-  template <
-      class K,
-      class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
-  T& at(const K& key, std::size_t precalculated_hash) {
-    return m_ht.at(key, precalculated_hash);
-  }
-
-  /**
-   * @copydoc at(const K& key)
-   */
-  template <
-      class K,
-      class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
-  const T& at(const K& key) const {
-    return m_ht.at(key);
-  }
-
-  /**
-   * @copydoc at(const K& key, std::size_t precalculated_hash)
-   */
-  template <
-      class K,
-      class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
-  const T& at(const K& key, std::size_t precalculated_hash) const {
-    return m_ht.at(key, precalculated_hash);
-  }
-
-  T& operator[](const Key& key) { return m_ht[key]; }
-  T& operator[](Key&& key) { return m_ht[std::move(key)]; }
-
-  size_type count(const Key& key) const { return m_ht.count(key); }
-
-  /**
-   * Use the hash value 'precalculated_hash' instead of hashing the key. The
-   * hash value should be the same as hash_function()(key). Useful to speed-up
-   * the lookup if you already have the hash.
-   */
-  size_type count(const Key& key, std::size_t precalculated_hash) const {
-    return m_ht.count(key, precalculated_hash);
-  }
-
-  /**
-   * This overload only participates in the overload resolution if the typedef
-   * KeyEqual::is_transparent exists. If so, K must be hashable and comparable
-   * to Key.
-   */
-  template <
-      class K,
-      class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
-  size_type count(const K& key) const {
-    return m_ht.count(key);
-  }
-
-  /**
-   * @copydoc count(const K& key) const
-   *
-   * Use the hash value 'precalculated_hash' instead of hashing the key. The
-   * hash value should be the same as hash_function()(key). Useful to speed-up
-   * the lookup if you already have the hash.
-   */
-  template <
-      class K,
-      class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
-  size_type count(const K& key, std::size_t precalculated_hash) const {
-    return m_ht.count(key, precalculated_hash);
-  }
-
-  iterator find(const Key& key) { return m_ht.find(key); }
-
-  /**
-   * Use the hash value 'precalculated_hash' instead of hashing the key. The
-   * hash value should be the same as hash_function()(key). Useful to speed-up
-   * the lookup if you already have the hash.
-   */
-  iterator find(const Key& key, std::size_t precalculated_hash) {
-    return m_ht.find(key, precalculated_hash);
-  }
-
-  const_iterator find(const Key& key) const { return m_ht.find(key); }
-
-  /**
-   * @copydoc find(const Key& key, std::size_t precalculated_hash)
-   */
-  const_iterator find(const Key& key, std::size_t precalculated_hash) const {
-    return m_ht.find(key, precalculated_hash);
-  }
-
-  /**
-   * This overload only participates in the overload resolution if the typedef
-   * KeyEqual::is_transparent exists. If so, K must be hashable and comparable
-   * to Key.
-   */
-  template <
-      class K,
-      class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
-  iterator find(const K& key) {
-    return m_ht.find(key);
-  }
-
-  /**
-   * @copydoc find(const K& key)
-   *
-   * Use the hash value 'precalculated_hash' instead of hashing the key. The
-   * hash value should be the same as hash_function()(key). Useful to speed-up
-   * the lookup if you already have the hash.
-   */
-  template <
-      class K,
-      class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
-  iterator find(const K& key, std::size_t precalculated_hash) {
-    return m_ht.find(key, precalculated_hash);
-  }
-
-  /**
-   * @copydoc find(const K& key)
-   */
-  template <
-      class K,
-      class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
-  const_iterator find(const K& key) const {
-    return m_ht.find(key);
-  }
-
-  /**
-   * @copydoc find(const K& key)
-   *
-   * Use the hash value 'precalculated_hash' instead of hashing the key. The
-   * hash value should be the same as hash_function()(key). Useful to speed-up
-   * the lookup if you already have the hash.
-   */
-  template <
-      class K,
-      class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
-  const_iterator find(const K& key, std::size_t precalculated_hash) const {
-    return m_ht.find(key, precalculated_hash);
-  }
-
-  bool contains(const Key& key) const { return m_ht.contains(key); }
-
-  /**
-   * Use the hash value 'precalculated_hash' instead of hashing the key. The
-   * hash value should be the same as hash_function()(key). Useful to speed-up
-   * the lookup if you already have the hash.
-   */
-  bool contains(const Key& key, std::size_t precalculated_hash) const {
-    return m_ht.contains(key, precalculated_hash);
-  }
-
-  /**
-   * This overload only participates in the overload resolution if the typedef
-   * KeyEqual::is_transparent exists. If so, K must be hashable and comparable
-   * to Key.
-   */
-  template <
-      class K,
-      class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
-  bool contains(const K& key) const {
-    return m_ht.contains(key);
-  }
-
-  /**
-   * @copydoc contains(const K& key) const
-   *
-   * Use the hash value 'precalculated_hash' instead of hashing the key. The
-   * hash value should be the same as hash_function()(key). Useful to speed-up
-   * the lookup if you already have the hash.
-   */
-  template <
-      class K,
-      class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
-  bool contains(const K& key, std::size_t precalculated_hash) const {
-    return m_ht.contains(key, precalculated_hash);
-  }
-
-  std::pair<iterator, iterator> equal_range(const Key& key) {
-    return m_ht.equal_range(key);
-  }
-
-  /**
-   * Use the hash value 'precalculated_hash' instead of hashing the key. The
-   * hash value should be the same as hash_function()(key). Useful to speed-up
-   * the lookup if you already have the hash.
-   */
-  std::pair<iterator, iterator> equal_range(const Key& key,
-                                            std::size_t precalculated_hash) {
-    return m_ht.equal_range(key, precalculated_hash);
-  }
-
-  std::pair<const_iterator, const_iterator> equal_range(const Key& key) const {
-    return m_ht.equal_range(key);
-  }
-
-  /**
-   * @copydoc equal_range(const Key& key, std::size_t precalculated_hash)
-   */
-  std::pair<const_iterator, const_iterator> equal_range(
-      const Key& key, std::size_t precalculated_hash) const {
-    return m_ht.equal_range(key, precalculated_hash);
-  }
-
-  /**
-   * This overload only participates in the overload resolution if the typedef
-   * KeyEqual::is_transparent exists. If so, K must be hashable and comparable
-   * to Key.
-   */
-  template <
-      class K,
-      class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
-  std::pair<iterator, iterator> equal_range(const K& key) {
-    return m_ht.equal_range(key);
-  }
-
-  /**
-   * @copydoc equal_range(const K& key)
-   *
-   * Use the hash value 'precalculated_hash' instead of hashing the key. The
-   * hash value should be the same as hash_function()(key). Useful to speed-up
-   * the lookup if you already have the hash.
-   */
-  template <
-      class K,
-      class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
-  std::pair<iterator, iterator> equal_range(const K& key,
-                                            std::size_t precalculated_hash) {
-    return m_ht.equal_range(key, precalculated_hash);
-  }
-
-  /**
-   * @copydoc equal_range(const K& key)
-   */
-  template <
-      class K,
-      class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
-  std::pair<const_iterator, const_iterator> equal_range(const K& key) const {
-    return m_ht.equal_range(key);
-  }
-
-  /**
-   * @copydoc equal_range(const K& key, std::size_t precalculated_hash)
-   */
-  template <
-      class K,
-      class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
-  std::pair<const_iterator, const_iterator> equal_range(
-      const K& key, std::size_t precalculated_hash) const {
-    return m_ht.equal_range(key, precalculated_hash);
-  }
-
-  /*
-   * Bucket interface
-   */
-  size_type bucket_count() const { return m_ht.bucket_count(); }
-  size_type max_bucket_count() const { return m_ht.max_bucket_count(); }
-
-  /*
-   * Hash policy
-   */
-  float load_factor() const { return m_ht.load_factor(); }
-  float max_load_factor() const { return m_ht.max_load_factor(); }
-  void max_load_factor(float ml) { m_ht.max_load_factor(ml); }
-
-  void rehash(size_type count) { m_ht.rehash(count); }
-  void reserve(size_type count) { m_ht.reserve(count); }
-
-  /*
-   * Observers
-   */
-  hasher hash_function() const { return m_ht.hash_function(); }
-  key_equal key_eq() const { return m_ht.key_eq(); }
-
-  /*
-   * Other
-   */
-
-  /**
-   * Convert a const_iterator to an iterator.
-   */
-  iterator mutable_iterator(const_iterator pos) {
-    return m_ht.mutable_iterator(pos);
-  }
-
-  /**
-   * Requires index <= size().
-   *
-   * Return an iterator to the element at index. Return end() if index ==
-   * size().
-   */
-  iterator nth(size_type index) { return m_ht.nth(index); }
-
-  /**
-   * @copydoc nth(size_type index)
-   */
-  const_iterator nth(size_type index) const { return m_ht.nth(index); }
-
-  /**
-   * Return const_reference to the first element. Requires the container to not
-   * be empty.
-   */
-  const_reference front() const { return m_ht.front(); }
-
-  /**
-   * Return const_reference to the last element. Requires the container to not
-   * be empty.
-   */
-  const_reference back() const { return m_ht.back(); }
-
-  /**
-   * Only available if ValueTypeContainer is a std::vector. Same as calling
-   * 'values_container().data()'.
-   */
-  template <class U = values_container_type,
-            typename std::enable_if<paddle::detail_ordered_hash::is_vector<
-                U>::value>::type* = nullptr>
-  const typename values_container_type::value_type* data() const noexcept {
-    return m_ht.data();
-  }
-
-  /**
-   * Return the container in which the values are stored. The values are in the
-   * same order as the insertion order and are contiguous in the structure, no
-   * holes (size() == values_container().size()).
-   */
-  const values_container_type& values_container() const noexcept {
-    return m_ht.values_container();
-  }
-
-  template <class U = values_container_type,
-            typename std::enable_if<paddle::detail_ordered_hash::is_vector<
-                U>::value>::type* = nullptr>
-  size_type capacity() const noexcept {
-    return m_ht.capacity();
-  }
-
-  void shrink_to_fit() { m_ht.shrink_to_fit(); }
-
-  /**
-   * Insert the value before pos shifting all the elements on the right of pos
-   * (including pos) one position to the right.
-   *
-   * Amortized linear time-complexity in the distance between pos and end().
-   */
-  std::pair<iterator, bool> insert_at_position(const_iterator pos,
-                                               const value_type& value) {
-    return m_ht.insert_at_position(pos, value);
-  }
-
-  /**
-   * @copydoc insert_at_position(const_iterator pos, const value_type& value)
-   */
-  std::pair<iterator, bool> insert_at_position(const_iterator pos,
-                                               value_type&& value) {
-    return m_ht.insert_at_position(pos, std::move(value));
-  }
-
-  /**
-   * @copydoc insert_at_position(const_iterator pos, const value_type& value)
-   *
-   * Same as insert_at_position(pos, value_type(std::forward<Args>(args)...),
-   * mainly here for coherence.
-   */
-  template <class... Args>
-  std::pair<iterator, bool> emplace_at_position(const_iterator pos,
-                                                Args&&... args) {
-    return m_ht.emplace_at_position(pos, std::forward<Args>(args)...);
-  }
-
-  /**
-   * @copydoc insert_at_position(const_iterator pos, const value_type& value)
-   */
-  template <class... Args>
-  std::pair<iterator, bool> try_emplace_at_position(const_iterator pos,
-                                                    const key_type& k,
-                                                    Args&&... args) {
-    return m_ht.try_emplace_at_position(pos, k, std::forward<Args>(args)...);
-  }
-
-  /**
-   * @copydoc insert_at_position(const_iterator pos, const value_type& value)
-   */
-  template <class... Args>
-  std::pair<iterator, bool> try_emplace_at_position(const_iterator pos,
-                                                    key_type&& k,
-                                                    Args&&... args) {
-    return m_ht.try_emplace_at_position(
-        pos, std::move(k), std::forward<Args>(args)...);
-  }
-
-  void pop_back() { m_ht.pop_back(); }
-
-  /**
-   * Faster erase operation with an O(1) average complexity but it doesn't
-   * preserve the insertion order.
-   *
-   * If an erasure occurs, the last element of the map will take the place of
-   * the erased element.
-   */
-  iterator unordered_erase(iterator pos) { return m_ht.unordered_erase(pos); }
-
-  /**
-   * @copydoc unordered_erase(iterator pos)
-   */
-  iterator unordered_erase(const_iterator pos) {
-    return m_ht.unordered_erase(pos);
-  }
-
-  /**
-   * @copydoc unordered_erase(iterator pos)
-   */
-  size_type unordered_erase(const key_type& key) {
-    return m_ht.unordered_erase(key);
-  }
-
-  /**
-   * @copydoc unordered_erase(iterator pos)
-   *
-   * Use the hash value 'precalculated_hash' instead of hashing the key. The
-   * hash value should be the same as hash_function()(key). Useful to speed-up
-   * the lookup if you already have the hash.
-   */
-  size_type unordered_erase(const key_type& key,
-                            std::size_t precalculated_hash) {
-    return m_ht.unordered_erase(key, precalculated_hash);
-  }
-
-  /**
-   * @copydoc unordered_erase(iterator pos)
-   *
-   * This overload only participates in the overload resolution if the typedef
-   * KeyEqual::is_transparent exists. If so, K must be hashable and comparable
-   * to Key.
-   */
-  template <
-      class K,
-      class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
-  size_type unordered_erase(const K& key) {
-    return m_ht.unordered_erase(key);
-  }
-
-  /**
-   * @copydoc unordered_erase(const K& key)
-   *
-   * Use the hash value 'precalculated_hash' instead of hashing the key. The
-   * hash value should be the same as hash_function()(key). Useful to speed-up
-   * the lookup if you already have the hash.
-   */
-  template <
-      class K,
-      class KE = KeyEqual,
-      typename std::enable_if<has_is_transparent<KE>::value>::type* = nullptr>
-  size_type unordered_erase(const K& key, std::size_t precalculated_hash) {
-    return m_ht.unordered_erase(key, precalculated_hash);
-  }
-
-  /**
-   * Serialize the map through the `serializer` parameter.
-   *
-   * The `serializer` parameter must be a function object that supports the
-   * following call:
-   *  - `template<typename U> void operator()(const U& value);` where the types
-   * `std::uint64_t`, `float` and `std::pair<Key, T>` must be supported for U.
-   *
-   * The implementation leaves binary compatibility (endianness, IEEE 754 for
-   * floats, ...) of the types it serializes in the hands of the `Serializer`
-   * function object if compatibility is required.
-   */
-  template <class Serializer>
-  void serialize(Serializer& serializer) const {  // NOLINT
-    m_ht.serialize(serializer);
-  }
-
-  /**
-   * Deserialize a previously serialized map through the `deserializer`
-   * parameter.
-   *
-   * The `deserializer` parameter must be a function object that supports the
-   * following calls:
-   *  - `template<typename U> U operator()();` where the types `std::uint64_t`,
-   * `float` and `std::pair<Key, T>` must be supported for U.
-   *
-   * If the deserialized hash map type is hash compatible with the serialized
-   * map, the deserialization process can be sped up by setting
-   * `hash_compatible` to true. To be hash compatible, the Hash and KeyEqual
-   * must behave the same way than the ones used on the serialized map. The
-   * `std::size_t` must also be of the same size as the one on the platform used
-   * to serialize the map, the same apply for `IndexType`. If these criteria are
-   * not met, the behaviour is undefined with `hash_compatible` sets to true.
-   *
-   * The behaviour is undefined if the type `Key` and `T` of the `ordered_map`
-   * are not the same as the types used during serialization.
-   *
-   * The implementation leaves binary compatibility (endianness, IEEE 754 for
-   * floats, size of int, ...) of the types it deserializes in the hands of the
-   * `Deserializer` function object if compatibility is required.
-   */
-  template <class Deserializer>
-  static ordered_map deserialize(Deserializer& deserializer,  // NOLINT
-                                 bool hash_compatible = false) {
-    ordered_map map(0);
-    map.m_ht.deserialize(deserializer, hash_compatible);
-
-    return map;
-  }
-
-  friend bool operator==(const ordered_map& lhs, const ordered_map& rhs) {
-    return lhs.m_ht == rhs.m_ht;
-  }
-  friend bool operator!=(const ordered_map& lhs, const ordered_map& rhs) {
-    return lhs.m_ht != rhs.m_ht;
-  }
-  friend bool operator<(const ordered_map& lhs, const ordered_map& rhs) {
-    return lhs.m_ht < rhs.m_ht;
-  }
-  friend bool operator<=(const ordered_map& lhs, const ordered_map& rhs) {
-    return lhs.m_ht <= rhs.m_ht;
-  }
-  friend bool operator>(const ordered_map& lhs, const ordered_map& rhs) {
-    return lhs.m_ht > rhs.m_ht;
-  }
-  friend bool operator>=(const ordered_map& lhs, const ordered_map& rhs) {
-    return lhs.m_ht >= rhs.m_ht;
-  }
-
-  friend void swap(ordered_map& lhs, ordered_map& rhs) { lhs.swap(rhs); }
-
- private:
-  ht m_ht;
-};
-
-}  // end namespace paddle
diff --git a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
index eba1c9bb03555..0a5566323ac55 100644
--- a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
+++ b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
@@ -124,13 +124,11 @@ def check_program(self, program):
                         self.assertTrue(
                             arg_name.endswith('.quantized.dequantized'))
                         if arg_name not in quantized_ops:
-                            # TODO(chenweihang): Quantization depends on the order of input,
-                            # the ordered_map change the OpDecs.input_arg_names order
-                            # self.assertEqual(block.ops[idx - 2 * i - 1].type,
-                            #                  self.dequant_op_type, "op: %s, arg_name: %s, idx: %d, i: %d" % (op.type, arg_name, idx, i))
-                            # self.assertEqual(block.ops[idx - 2 * i - 2].type,
-                            #                  quant_op_type, "op: %s, arg_name: %s, idx: %d, i: %d" % (op.type, arg_name, idx, i))
-                            quantized_ops[arg_name] = block.ops[idx - 2]
+                            self.assertEqual(block.ops[idx - 2 * i - 1].type,
+                                             self.dequant_op_type)
+                            self.assertEqual(block.ops[idx - 2 * i - 2].type,
+                                             quant_op_type)
+                            quantized_ops[arg_name] = block.ops[idx - 2 * i - 2]
                         else:
                             op_idx = block.ops.index(quantized_ops[arg_name])
                             self.assertLess(op_idx, idx)
diff --git a/python/paddle/fluid/tests/unittests/test_infer_no_need_buffer_slots.py b/python/paddle/fluid/tests/unittests/test_infer_no_need_buffer_slots.py
index f773d94141faf..3656cdfd5a034 100644
--- a/python/paddle/fluid/tests/unittests/test_infer_no_need_buffer_slots.py
+++ b/python/paddle/fluid/tests/unittests/test_infer_no_need_buffer_slots.py
@@ -41,18 +41,31 @@ def test_infer_no_need_buffer_slots(self):
 
         block = program.global_block()
         for idx, op in enumerate(block.ops):
+            op_desc = op.desc
+            inputs = {}
+            for input_name in op_desc.input_names():
+                inputs[input_name] = op_desc.input(input_name)
+            outputs = {}
+            for output_name in op_desc.output_names():
+                outputs[output_name] = op_desc.output(output_name)
+            attrs = {}
+            for attr_name in op_desc.attr_names():
+                attrs[attr_name] = op_desc.attr(attr_name)
             if idx == 0:
                 # elementwise_add op
                 self.assertEqual(
-                    core.infer_no_need_buffer_slots(op.desc), set([]))
+                    core.infer_no_need_buffer_slots(op.type, inputs, outputs,
+                                                    attrs), set([]))
             elif idx == 1:
                 # fill constant op
                 self.assertEqual(
-                    core.infer_no_need_buffer_slots(op.desc), set([]))
+                    core.infer_no_need_buffer_slots(op.type, inputs, outputs,
+                                                    attrs), set([]))
             else:
                 # elementwise_add_grad op
                 self.assertEqual(
-                    core.infer_no_need_buffer_slots(op.desc), set(['Y', 'X']))
+                    core.infer_no_need_buffer_slots(op.type, inputs, outputs,
+                                                    attrs), set(['Y', 'X']))
 
 
 if __name__ == '__main__':

From db6ff098ebd2d934f62312ad53636dd52cd6ae02 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Mon, 30 Aug 2021 02:56:18 +0000
Subject: [PATCH 038/125] fix part of rcom compile faild

---
 paddle/top/api/CMakeLists.txt    |  3 ++
 paddle/top/core/dense_tensor.cc  |  2 +-
 paddle/top/core/kernel_utils.h   |  4 +-
 paddle/top/mkldnn/CMakeLists.txt |  1 +
 paddle/top/mkldnn/math.cc        | 73 ++++++++++++++++++++++++++++++++
 paddle/top/mkldnn/math.h         | 31 ++------------
 6 files changed, 84 insertions(+), 30 deletions(-)
 create mode 100644 paddle/top/mkldnn/math.cc

diff --git a/paddle/top/api/CMakeLists.txt b/paddle/top/api/CMakeLists.txt
index 75fa5b8348337..4c057b25330b5 100644
--- a/paddle/top/api/CMakeLists.txt
+++ b/paddle/top/api/CMakeLists.txt
@@ -2,6 +2,9 @@ add_subdirectory(src)
 
 set(TOP_DEPS convert_utils dense_tensor selected_rows_tensor kernel_factory kernel_context)
 set(TOP_DEPS ${TOP_DEPS} math_cpu)
+if(WITH_MKLDNN)
+  set(TOP_DEPS ${TOP_DEPS} math_mkldnn)
+endif()
 if(WITH_GPU OR WITH_ROCM)
   set(TOP_DEPS ${TOP_DEPS} math_cuda)
 endif()
diff --git a/paddle/top/core/dense_tensor.cc b/paddle/top/core/dense_tensor.cc
index 81ded2156b972..1a3bd04d75c0d 100644
--- a/paddle/top/core/dense_tensor.cc
+++ b/paddle/top/core/dense_tensor.cc
@@ -55,7 +55,7 @@ Place DenseTensor::GetPlaceByBackend() const {
   switch (meta_.backend) {
     case Backend::kCPU:
       return CPUPlace();
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     case Backend::kCUDA:
       return CUDAPlace();
     case Backend::kCUDAPinned:
diff --git a/paddle/top/core/kernel_utils.h b/paddle/top/core/kernel_utils.h
index b7676c5a21fa2..61272e218aa00 100644
--- a/paddle/top/core/kernel_utils.h
+++ b/paddle/top/core/kernel_utils.h
@@ -25,7 +25,7 @@ namespace pt {
 
 // TODO(chenweihang): replaced by new DeviceContext later
 using CPUContext = paddle::platform::CPUDeviceContext;
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 using CUDAContext = paddle::platform::CUDADeviceContext;
 #endif
 #ifdef PADDLE_WITH_MKLDNN
@@ -84,7 +84,7 @@ struct OpKernelImpl<Return (*)(Args...), kernel_fn> {
   /* DeviceContext Helpers */
 
   PT_SPECIALIZE_OpKernelCallHelper_FOR_DEVICE_CONTEXT(CPUContext);
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   PT_SPECIALIZE_OpKernelCallHelper_FOR_DEVICE_CONTEXT(CUDAContext);
 #endif
 #ifdef PADDLE_WITH_XPU
diff --git a/paddle/top/mkldnn/CMakeLists.txt b/paddle/top/mkldnn/CMakeLists.txt
index e69de29bb2d1d..d058375874075 100644
--- a/paddle/top/mkldnn/CMakeLists.txt
+++ b/paddle/top/mkldnn/CMakeLists.txt
@@ -0,0 +1 @@
+cc_library(math_mkldnn SRCS math.cc DEPS dense_tensor kernel_context kernel_factory mkldnn)
diff --git a/paddle/top/mkldnn/math.cc b/paddle/top/mkldnn/math.cc
new file mode 100644
index 0000000000000..e0a94dea81d55
--- /dev/null
+++ b/paddle/top/mkldnn/math.cc
@@ -0,0 +1,73 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/top/mkldnn/math.h"
+
+#include "paddle/top/mkldnn/base.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/float16.h"
+
+namespace pt {
+
+using MKLDNNDContext = paddle::platform::MKLDNNDeviceContext;
+
+template <typename T>
+void Scale(const MKLDNNDContext& dev_ctx,
+           const MKLDNNDenseTensor& x,
+           float scale,
+           float bias,
+           bool bias_after_scale,
+           MKLDNNDenseTensor* out) {
+  const auto mkldnn_engine = dev_ctx.GetEngine();
+
+  ScaleMKLDNNHandler<T> handler(mkldnn_engine,
+                                x,
+                                /*alpha=*/scale,
+                                /*beta=*/bias,
+                                bias_after_scale);
+
+  bool is_inplaced = x.allocation() && x.allocation() == out->allocation();
+
+  auto src_memory_p = handler.AcquireSrcMemory(&x);
+  auto dst_memory_p =
+      is_inplaced ? src_memory_p : handler.AcquireDstMemory(out);
+  auto activation_p = handler.AcquireForwardPrimitive();
+
+  auto& astream = MKLDNNDContext::tls().get_stream();
+  activation_p->execute(
+      astream,
+      {{MKLDNN_ARG_FROM, *src_memory_p}, {MKLDNN_ARG_TO, *dst_memory_p}});
+  astream.wait();
+
+  out->mutable_meta()->layout = DataLayout::kMKLDNN;
+  // TODO(chenweihang): format is also meta info, how to deal with here?
+  out->set_format(paddle::platform::GetMKLDNNFormat(*dst_memory_p));
+}
+
+template void Scale<float>(const MKLDNNDContext& dev_ctx,
+                           const MKLDNNDenseTensor& x,
+                           float scale,
+                           float bias,
+                           bool bias_after_scale,
+                           MKLDNNDenseTensor* out);
+
+template void Scale<paddle::platform::float16>(const MKLDNNDContext& dev_ctx,
+                                               const MKLDNNDenseTensor& x,
+                                               float scale,
+                                               float bias,
+                                               bool bias_after_scale,
+                                               MKLDNNDenseTensor* out);
+
+}  // namespace pt
diff --git a/paddle/top/mkldnn/math.h b/paddle/top/mkldnn/math.h
index 2c7914715c7e5..31428ac7dc47b 100644
--- a/paddle/top/mkldnn/math.h
+++ b/paddle/top/mkldnn/math.h
@@ -17,7 +17,9 @@ limitations under the License. */
 #ifdef PADDLE_WITH_MKLDNN
 
 #include "paddle/top/core/mkldnn_dense_tensor.h"
-#include "paddle/top/mkldnn/base.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/device_context.h"
 
 namespace pt {
 
@@ -29,32 +31,7 @@ void Scale(const MKLDNNDContext& dev_ctx,
            float scale,
            float bias,
            bool bias_after_scale,
-           MKLDNNDenseTensor* out) {
-  const auto mkldnn_engine = dev_ctx.GetEngine();
-
-  ScaleMKLDNNHandler<T> handler(mkldnn_engine,
-                                x,
-                                /*alpha=*/scale,
-                                /*beta=*/bias,
-                                bias_after_scale);
-
-  bool is_inplaced = x.allocation() && x.allocation() == out->allocation();
-
-  auto src_memory_p = handler.AcquireSrcMemory(&x);
-  auto dst_memory_p =
-      is_inplaced ? src_memory_p : handler.AcquireDstMemory(out);
-  auto activation_p = handler.AcquireForwardPrimitive();
-
-  auto& astream = MKLDNNDContext::tls().get_stream();
-  activation_p->execute(
-      astream,
-      {{MKLDNN_ARG_FROM, *src_memory_p}, {MKLDNN_ARG_TO, *dst_memory_p}});
-  astream.wait();
-
-  out->mutable_meta()->layout = DataLayout::kMKLDNN;
-  // TODO(chenweihang): format is also meta info, how to deal with here?
-  out->set_format(paddle::platform::GetMKLDNNFormat(*dst_memory_p));
-}
+           MKLDNNDenseTensor* out);
 
 }  // namespace pt
 

From 9031ab396e46e7668c27df6424f239cdcb0c9cdd Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Tue, 31 Aug 2021 08:01:12 +0000
Subject: [PATCH 039/125] add more register kernels

---
 paddle/top/core/CMakeLists.txt    |  2 +-
 paddle/top/core/dtype.h           |  4 +++
 paddle/top/core/kernel_context.h  | 13 ++++++++
 paddle/top/core/kernel_registry.h | 10 +++---
 paddle/top/core/kernel_utils.h    | 21 ++++++++++++
 paddle/top/cpu/math.cc            | 55 +++++++++++++++++++++++++++++--
 paddle/top/cpu/math.h             | 45 ++-----------------------
 paddle/top/cuda/math.cu           | 40 ++++++++++++++++------
 paddle/top/cuda/math.h            | 17 ++--------
 paddle/top/mkldnn/math.cc         | 55 +------------------------------
 paddle/top/mkldnn/math.h          | 28 +++++++++++++++-
 11 files changed, 161 insertions(+), 129 deletions(-)

diff --git a/paddle/top/core/CMakeLists.txt b/paddle/top/core/CMakeLists.txt
index de21c1c79534b..e982f837abadf 100644
--- a/paddle/top/core/CMakeLists.txt
+++ b/paddle/top/core/CMakeLists.txt
@@ -13,4 +13,4 @@ cc_library(dense_tensor SRCS dense_tensor.cc DEPS enforce data_type ddim allocat
 cc_library(selected_rows_tensor SRCS selected_rows.cc DEPS dense_tensor)
 
 cc_library(kernel_factory SRCS kernel_factory.cc DEPS enforce backend dtype layout)
-cc_library(kernel_context SRCS kernel_context.cc DEPS device_context)
+cc_library(kernel_context SRCS kernel_context.cc DEPS enforce device_context)
diff --git a/paddle/top/core/dtype.h b/paddle/top/core/dtype.h
index 130482dc48fde..0683fd5fe467c 100644
--- a/paddle/top/core/dtype.h
+++ b/paddle/top/core/dtype.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <ostream>
 
 // See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/complex.h"
 #include "paddle/fluid/platform/float16.h"
 
@@ -25,6 +26,7 @@ namespace pt {
 using complex64 = paddle::platform::complex<float>;
 using complex128 = paddle::platform::complex<double>;
 using float16 = paddle::platform::float16;
+using bfloat16 = paddle::platform::bfloat16;
 
 /**
  * [ Why need new data type? ]
@@ -47,6 +49,7 @@ enum class DataType {
   kINT16,
   kINT32,
   kINT64,
+  kBFLOAT16,
   kFLOAT16,
   kFLOAT32,
   kFLOAT64,
@@ -64,6 +67,7 @@ std::ostream& operator<<(std::ostream& os, DataType dtype);
   _(int16_t, DataType::kINT16)       \
   _(int, DataType::kINT32)           \
   _(int64_t, DataType::kINT64)       \
+  _(bfloat16, DataType::kBFLOAT16)   \
   _(float16, DataType::kFLOAT16)     \
   _(float, DataType::kFLOAT32)       \
   _(double, DataType::kFLOAT64)      \
diff --git a/paddle/top/core/kernel_context.h b/paddle/top/core/kernel_context.h
index 86c70e31f4ccf..50ed67183d366 100644
--- a/paddle/top/core/kernel_context.h
+++ b/paddle/top/core/kernel_context.h
@@ -21,6 +21,7 @@
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
 
 namespace pt {
 
@@ -55,6 +56,8 @@ class OpKernelContext {
     outputs_.emplace_back(output);
   }
 
+  void EmplaceBackAttr(paddle::any attr) { attrs_.emplace_back(attr); }
+
   template <typename TensorType>
   const TensorType& InputAt(size_t idx) const {
     return static_cast<const TensorType&>(*(inputs_.at(idx)));
@@ -65,6 +68,16 @@ class OpKernelContext {
     return static_cast<TensorType*>(outputs_.at(idx).get());
   }
 
+  template <typename AttrType>
+  AttrType AttrAt(size_t idx) const {
+    try {
+      return paddle::any_cast<AttrType>(attrs_.at(idx));
+    } catch (paddle::bad_any_cast&) {
+      PADDLE_THROW(paddle::platform::errors::InvalidArgument(
+          "Attribute cast error in Op Kernel Context."));
+    }
+  }
+
  private:
   // DeviceContext base class
   const DeviceContext& dev_ctx_;
diff --git a/paddle/top/core/kernel_registry.h b/paddle/top/core/kernel_registry.h
index 85feb025ba32d..d6107d7dc36a5 100644
--- a/paddle/top/core/kernel_registry.h
+++ b/paddle/top/core/kernel_registry.h
@@ -67,21 +67,23 @@ class OpKernelRegistrar {
                              __test_global_namespace_##uniq_name##__>::value, \
                 msg)
 
-#define PT_REGISTER_STANDARD_KERNEL(                                      \
+#define PT_REGISTER_KERNEL_STANDARD(                                      \
     op_name, backend, layout, dtype, kernel_fn)                           \
+  template decltype(kernel_fn) kernel_fn;                                 \
   PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                      \
       __reg_pt_op_kernel_##op_name##_##backend##_##layout##_##dtype##__,  \
-      "PT_REGISTER_STANDARD_KERNEL must be called in global namespace."); \
+      "PT_REGISTER_KERNEL_STANDARD must be called in global namespace."); \
   static ::pt::OpKernelRegistrar                                          \
       __pt_op_kernel_##op_name##_##backend##_##layout##_##dtype##__ =     \
           ::pt::OpKernelRegistrar(#op_name,                               \
                                   BACKEND(backend),                       \
                                   DATALAYOUT(layout),                     \
                                   DATATYPE(dtype),                        \
-                                  kernel_fn)
+                                  PT_KERNEL(kernel_fn))
 
 #define PT_REGISTER_KERNEL_AUTO_SPECIALIZE(                               \
     op_name, backend, layout, meta_kernel_fn, dtype)                      \
+  template decltype(meta_kernel_fn<dtype>) meta_kernel_fn<dtype>;         \
   PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                      \
       __reg_pt_op_kernel_##op_name##_##backend##_##layout##_##dtype##__,  \
       "PT_REGISTER_KERNEL_AUTO_SPECIALIZE must be called in global "      \
@@ -107,7 +109,7 @@ class OpKernelRegistrar {
  * In most cases, the backend, dtype and layout of Op's input and output
  * are the same as OpKernel itself. In order to simplify the registration
  * writing, we provide the following simple kernel registration macro.
- * If it is an special case, please use PT_REGISTER_STANDARD_KERNEL
+ * If it is an special case, please use PT_REGISTER_KERNEL_STANDARD
  */
 // TODO(chenweihang): only work for single input and output now.
 // can we use function traits here to parse the input and output type?
diff --git a/paddle/top/core/kernel_utils.h b/paddle/top/core/kernel_utils.h
index 483c96c9eee19..52678ac302823 100644
--- a/paddle/top/core/kernel_utils.h
+++ b/paddle/top/core/kernel_utils.h
@@ -64,6 +64,24 @@ using XPUContext = paddle::platform::XPUDeviceContext;
     }                                                                        \
   }
 
+#define PT_SPECIALIZE_OpKernelCallHelper_FOR_ATTRIBUTE(attr_type)         \
+  template <typename... Tail>                                             \
+  struct OpKernelCallHelper<attr_type, Tail...> {                         \
+    template <int dev_ctx_idx,                                            \
+              int in_idx,                                                 \
+              int attr_idx,                                               \
+              int out_idx,                                                \
+              typename... PreviousArgs>                                   \
+    static void Compute(OpKernelContext* ctx, PreviousArgs&... pargs) {   \
+      static_assert(out_idx == 0,                                         \
+                    "Kernel's Attributes should appear before Outputs."); \
+      attr_type arg = ctx->AttrAt<attr_type>(attr_idx);                   \
+      OpKernelCallHelper<Tail...>::                                       \
+          template Compute<dev_ctx_idx, in_idx, attr_idx + 1, out_idx>(   \
+              ctx, pargs..., arg);                                        \
+    }                                                                     \
+  }
+
 template <typename T>
 struct TypeTag {};
 
@@ -117,6 +135,9 @@ struct OpKernelImpl<Return (*)(Args...), kernel_fn> {
 
   /* Attribute Helpers */
 
+  PT_SPECIALIZE_OpKernelCallHelper_FOR_ATTRIBUTE(bool);
+  PT_SPECIALIZE_OpKernelCallHelper_FOR_ATTRIBUTE(float);
+
   /* Output Helpers */
 
   template <typename... Tail>
diff --git a/paddle/top/cpu/math.cc b/paddle/top/cpu/math.cc
index 9ac430ad25185..32f785f6b20a9 100644
--- a/paddle/top/cpu/math.cc
+++ b/paddle/top/cpu/math.cc
@@ -14,11 +14,47 @@
 
 #include "paddle/top/cpu/math.h"
 
-namespace pt {}  // namespace pt
+namespace pt {
+
+template <typename T,
+          int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenScalar = paddle::framework::EigenScalar<T, MajorType, IndexType>;
+template <typename T,
+          int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = paddle::framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename T>
+void Sign(const CPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
+  module::Sign<CPUContext, T>(dev_ctx, x, out);
+}
+
+template <typename T>
+void Mean(const CPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
+  out->mutable_data<T>();
+  auto x_data = EigenVector<T>::Flatten(x);
+  auto y_data = EigenScalar<T>::From(*out);
+  auto& place = *dev_ctx.eigen_device();
+  y_data.device(place) = x_data.mean();
+}
+
+template <typename T>
+void Scale(const CPUContext& dev_ctx,
+           const DenseTensor& x,
+           float scale,
+           float bias,
+           bool bias_after_scale,
+           DenseTensor* out) {
+  module::Scale<CPUContext, T>(dev_ctx, x, scale, bias, bias_after_scale, out);
+}
+
+}  // namespace pt
+
+using bfloat16 = ::paddle::platform::bfloat16;
 
 // Register method 1:
-// PT_REGISTER_STANDARD_KERNEL(sign, CPU, NCHW, FLOAT32,
-// PT_KERNEL(pt::Sign<float>))
+// PT_REGISTER_KERNEL_STANDARD(sign, CPU, NCHW, FLOAT32, pt::Sign<float>)
 //   .Input(BACKEND(CPU), DATALAYOUT(NCHW), DATATYPE(FLOAT32))
 //   .Output(BACKEND(CPU), DATALAYOUT(NCHW), DATATYPE(FLOAT32));
 // PT_TOUCH_KERNEL_REGISTRAR(sign, CPU, NCHW, FLOAT32);
@@ -31,3 +67,16 @@ namespace pt {}  // namespace pt
 
 // Register method 3:
 PT_REGISTER_KERNEL_2T(sign, CPU, NCHW, pt::Sign, float, double);
+PT_REGISTER_KERNEL_2T(mean, CPU, NCHW, pt::Mean, float, double);
+PT_REGISTER_KERNEL_8T(scale,
+                      CPU,
+                      NCHW,
+                      pt::Scale,
+                      float,
+                      double,
+                      bfloat16,
+                      uint8_t,
+                      int8_t,
+                      int16_t,
+                      int,
+                      int64_t);
diff --git a/paddle/top/cpu/math.h b/paddle/top/cpu/math.h
index 2c3a88550157a..6bc2b4a49cc9e 100644
--- a/paddle/top/cpu/math.h
+++ b/paddle/top/cpu/math.h
@@ -25,50 +25,13 @@ limitations under the License. */
 
 namespace pt {
 
-template <typename T,
-          int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenScalar = paddle::framework::EigenScalar<T, MajorType, IndexType>;
-template <typename T,
-          int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = paddle::framework::EigenVector<T, MajorType, IndexType>;
-
 using CPUContext = paddle::platform::CPUDeviceContext;
 
-/**
- * [ How do we organize the kernel directory ]
- * Now according to the classification of operators in the Python API,
- * the same type of operation kernel is placed in a header file.
- * This is only a temporary approach.
- *
- * Considerations:
- *
- * 1. In the future, it may be tailored the lib on kernel level.
- *    This organization will cause difficulty in tailoring;
- * 2. If there is still one *.h and *.cc file for one kernel,
- *    and now the kernel is organized by device, the number of files
- *    will be greatly expanded, but this may be more reasonable;
- * 3. In the future, the kernel implementation of the function should
- *    be in the *.cc file. If you want to call the kernel in the tensor
- *    operation library, you should find the call through the global
- *    KernelMap instead of including the header file of the corresponding
- *    calculation. This may reduce the number of header files.
- */
-
 template <typename T>
-void Sign(const CPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
-  module::Sign<CPUContext, T>(dev_ctx, x, out);
-}
+void Sign(const CPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out);
 
 template <typename T>
-void Mean(const CPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
-  out->mutable_data<T>();
-  auto x_data = EigenVector<T>::Flatten(x);
-  auto y_data = EigenScalar<T>::From(*out);
-  auto& place = *dev_ctx.eigen_device();
-  y_data.device(place) = x_data.mean();
-}
+void Mean(const CPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out);
 
 template <typename T>
 void Scale(const CPUContext& dev_ctx,
@@ -76,9 +39,7 @@ void Scale(const CPUContext& dev_ctx,
            float scale,
            float bias,
            bool bias_after_scale,
-           DenseTensor* out) {
-  module::Scale<CPUContext, T>(dev_ctx, x, scale, bias, bias_after_scale, out);
-}
+           DenseTensor* out);
 
 }  // namespace pt
 
diff --git a/paddle/top/cuda/math.cu b/paddle/top/cuda/math.cu
index 501e12a7d22f1..bc0db97506bc7 100644
--- a/paddle/top/cuda/math.cu
+++ b/paddle/top/cuda/math.cu
@@ -14,6 +14,9 @@ limitations under the License. */
 
 #include "paddle/top/cuda/math.h"
 
+#include "paddle/top/module/scale.h"
+#include "paddle/top/module/sign.h"
+
 #ifdef __NVCC__
 #include "cub/cub.cuh"
 #endif
@@ -47,6 +50,11 @@ struct DivideFunctor {
  * Kernels
  */
 
+template <typename T>
+void Sign(const CUDAContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
+  module::Sign<CUDAContext, T>(dev_ctx, x, out);
+}
+
 template <typename T>
 void Mean(const CUDAContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
   auto size_prob = x.numel();
@@ -76,18 +84,30 @@ void Mean(const CUDAContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
   PADDLE_ENFORCE_CUDA_SUCCESS(err);
 }
 
-template void Mean<float>(const CUDAContext& dev_ctx,
-                          const DenseTensor& x,
-                          DenseTensor* out);
-template void Mean<double>(const CUDAContext& dev_ctx,
-                           const DenseTensor& x,
-                           DenseTensor* out);
-template void Mean<paddle::platform::float16>(const CUDAContext& dev_ctx,
-                                              const DenseTensor& x,
-                                              DenseTensor* out);
+template <typename T>
+void Scale(const CUDAContext& dev_ctx,
+           const DenseTensor& x,
+           float scale,
+           float bias,
+           bool bias_after_scale,
+           DenseTensor* out) {
+  module::Scale<CUDAContext, T>(dev_ctx, x, scale, bias, bias_after_scale, out);
+}
 
 }  // namespace pt
 
 using float16 = paddle::platform::float16;
 PT_REGISTER_KERNEL_3T(sign, CUDA, NCHW, pt::Sign, float, double, float16);
-// PT_REGISTER_KERNEL_2T(sign, CUDA, NCHW, pt::Sign, float, double);
+PT_REGISTER_KERNEL_3T(mean, CUDA, NCHW, pt::Mean, float, double, float16);
+PT_REGISTER_KERNEL_8T(scale,
+                      CUDA,
+                      NCHW,
+                      pt::Scale,
+                      float,
+                      double,
+                      float16,
+                      uint8_t,
+                      int8_t,
+                      int16_t,
+                      int,
+                      int64_t);
diff --git a/paddle/top/cuda/math.h b/paddle/top/cuda/math.h
index 2469a5720e13b..e3c89f3d4966e 100644
--- a/paddle/top/cuda/math.h
+++ b/paddle/top/cuda/math.h
@@ -18,8 +18,6 @@ limitations under the License. */
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 
 #include "paddle/top/core/dense_tensor.h"
-#include "paddle/top/module/scale.h"
-#include "paddle/top/module/sign.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
@@ -29,15 +27,8 @@ namespace pt {
 using CUDAContext = paddle::platform::CUDADeviceContext;
 
 template <typename T>
-void Sign(const CUDAContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
-  module::Sign<CUDAContext, T>(dev_ctx, x, out);
-}
-
-// TODO(chenweihang): Perhaps the Kernel call should not be implemented by
-// calling functions, but by finding the Kernel call method from the global
-// KernelMap. For a kernel like cuda, if you have to call functions through
-// include header files, there will be many more function declarations and
-// redundant function call
+void Sign(const CUDAContext& dev_ctx, const DenseTensor& x, DenseTensor* out);
+
 template <typename T>
 void Mean(const CUDAContext& dev_ctx, const DenseTensor& x, DenseTensor* out);
 
@@ -47,9 +38,7 @@ void Scale(const CUDAContext& dev_ctx,
            float scale,
            float bias,
            bool bias_after_scale,
-           DenseTensor* out) {
-  module::Scale<CUDAContext, T>(dev_ctx, x, scale, bias, bias_after_scale, out);
-}
+           DenseTensor* out);
 
 }  // namespace pt
 
diff --git a/paddle/top/mkldnn/math.cc b/paddle/top/mkldnn/math.cc
index e0a94dea81d55..2544dab9fc98e 100644
--- a/paddle/top/mkldnn/math.cc
+++ b/paddle/top/mkldnn/math.cc
@@ -14,60 +14,7 @@ limitations under the License. */
 
 #include "paddle/top/mkldnn/math.h"
 
-#include "paddle/top/mkldnn/base.h"
-
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/float16.h"
 
-namespace pt {
-
-using MKLDNNDContext = paddle::platform::MKLDNNDeviceContext;
-
-template <typename T>
-void Scale(const MKLDNNDContext& dev_ctx,
-           const MKLDNNDenseTensor& x,
-           float scale,
-           float bias,
-           bool bias_after_scale,
-           MKLDNNDenseTensor* out) {
-  const auto mkldnn_engine = dev_ctx.GetEngine();
-
-  ScaleMKLDNNHandler<T> handler(mkldnn_engine,
-                                x,
-                                /*alpha=*/scale,
-                                /*beta=*/bias,
-                                bias_after_scale);
-
-  bool is_inplaced = x.allocation() && x.allocation() == out->allocation();
-
-  auto src_memory_p = handler.AcquireSrcMemory(&x);
-  auto dst_memory_p =
-      is_inplaced ? src_memory_p : handler.AcquireDstMemory(out);
-  auto activation_p = handler.AcquireForwardPrimitive();
-
-  auto& astream = MKLDNNDContext::tls().get_stream();
-  activation_p->execute(
-      astream,
-      {{MKLDNN_ARG_FROM, *src_memory_p}, {MKLDNN_ARG_TO, *dst_memory_p}});
-  astream.wait();
-
-  out->mutable_meta()->layout = DataLayout::kMKLDNN;
-  // TODO(chenweihang): format is also meta info, how to deal with here?
-  out->set_format(paddle::platform::GetMKLDNNFormat(*dst_memory_p));
-}
-
-template void Scale<float>(const MKLDNNDContext& dev_ctx,
-                           const MKLDNNDenseTensor& x,
-                           float scale,
-                           float bias,
-                           bool bias_after_scale,
-                           MKLDNNDenseTensor* out);
-
-template void Scale<paddle::platform::float16>(const MKLDNNDContext& dev_ctx,
-                                               const MKLDNNDenseTensor& x,
-                                               float scale,
-                                               float bias,
-                                               bool bias_after_scale,
-                                               MKLDNNDenseTensor* out);
-
-}  // namespace pt
+namespace pt {}  // namespace pt
diff --git a/paddle/top/mkldnn/math.h b/paddle/top/mkldnn/math.h
index 31428ac7dc47b..bee3aec6277e7 100644
--- a/paddle/top/mkldnn/math.h
+++ b/paddle/top/mkldnn/math.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #ifdef PADDLE_WITH_MKLDNN
 
 #include "paddle/top/core/mkldnn_dense_tensor.h"
+#include "paddle/top/mkldnn/base.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
@@ -31,7 +32,32 @@ void Scale(const MKLDNNDContext& dev_ctx,
            float scale,
            float bias,
            bool bias_after_scale,
-           MKLDNNDenseTensor* out);
+           MKLDNNDenseTensor* out) {
+  const auto mkldnn_engine = dev_ctx.GetEngine();
+
+  ScaleMKLDNNHandler<T> handler(mkldnn_engine,
+                                x,
+                                /*alpha=*/scale,
+                                /*beta=*/bias,
+                                bias_after_scale);
+
+  bool is_inplaced = x.allocation() && x.allocation() == out->allocation();
+
+  auto src_memory_p = handler.AcquireSrcMemory(&x);
+  auto dst_memory_p =
+      is_inplaced ? src_memory_p : handler.AcquireDstMemory(out);
+  auto activation_p = handler.AcquireForwardPrimitive();
+
+  auto& astream = MKLDNNDContext::tls().get_stream();
+  activation_p->execute(
+      astream,
+      {{MKLDNN_ARG_FROM, *src_memory_p}, {MKLDNN_ARG_TO, *dst_memory_p}});
+  astream.wait();
+
+  out->mutable_meta()->layout = DataLayout::kMKLDNN;
+  // TODO(chenweihang): format is also meta info, how to deal with here?
+  out->set_format(paddle::platform::GetMKLDNNFormat(*dst_memory_p));
+}
 
 }  // namespace pt
 

From f7bbacaa414a0e53f470e68fb43c1135a8ed932d Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Fri, 3 Sep 2021 03:52:56 +0000
Subject: [PATCH 040/125] revert scale kernel temporarily

---
 paddle/fluid/framework/CMakeLists.txt         |   2 +-
 paddle/fluid/framework/operator.cc            | 210 +++++++++++-------
 paddle/fluid/framework/operator.h             |   3 +
 paddle/fluid/framework/top_utils.cc           |  63 +++++-
 paddle/fluid/framework/top_utils.h            |  34 +--
 paddle/fluid/imperative/prepared_operator.cc  |  10 +-
 paddle/top/CMakeLists.txt                     |   2 -
 paddle/top/api/include/dev/core.h             |   1 +
 paddle/top/api/include/dev/math.h             |   1 -
 paddle/top/core/CMakeLists.txt                |   2 +-
 paddle/top/core/kernel_registry.h             |  14 +-
 ...lected_rows.cc => selected_rows_tensor.cc} |   2 +-
 ...selected_rows.h => selected_rows_tensor.h} |  30 ++-
 paddle/top/cpu/math.cc                        |  73 ++++--
 paddle/top/cpu/math.h                         |  15 +-
 paddle/top/cuda/math.cu                       |  71 ++++--
 paddle/top/cuda/math.h                        |  16 +-
 paddle/top/selected_rows/CMakeLists.txt       |   0
 paddle/top/selected_rows/math.h               |  45 ----
 19 files changed, 382 insertions(+), 212 deletions(-)
 rename paddle/top/core/{selected_rows.cc => selected_rows_tensor.cc} (92%)
 rename paddle/top/core/{selected_rows.h => selected_rows_tensor.h} (74%)
 delete mode 100644 paddle/top/selected_rows/CMakeLists.txt
 delete mode 100644 paddle/top/selected_rows/math.h

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 2de7d199659d4..10db28afca5f2 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -387,7 +387,7 @@ cc_library(save_load_util SRCS save_load_util.cc DEPS tensor scope layer)
 cc_test(save_load_util_test SRCS save_load_util_test.cc DEPS save_load_util tensor scope layer)
 cc_library(generator SRCS generator.cc DEPS enforce place)
 
-cc_library(top_utils SRCS top_utils.cc DEPS tensor place top)
+cc_library(top_utils SRCS top_utils.cc DEPS lod_tensor selected_rows place top)
 
 # Get the current working branch
 execute_process(
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index c66c6c320eaba..be47ea4604069 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1105,86 +1105,6 @@ static std::string RuntimeContextDebugString(const RuntimeContext& ctx) {
   return ss.str();
 }
 
-static pt::OpKernelContext BuildOpKernelContext(
-    const std::string& op_type, const pt::OpKernel& pt_kernel,
-    const RuntimeContext& ctx, const platform::DeviceContext& dev_ctx) {
-  VLOG(1) << RuntimeContextDebugString(ctx);
-
-  // TODO(chenweihang): now only work for very simple case (sign op),
-  // many cases need to be deal with later:
-  // 1. the input and output are not tensor
-  // 2. the dispensbale, duplicable input and output
-  // 3. needless attributes remove
-  // 4. use pt Tensor directly
-  // 5. kernel input is not DenseTensor
-  pt::OpKernelContext op_kernel_ctx(dev_ctx);
-  auto input_defs = pt_kernel.param_def().input_defs();
-  auto output_defs = pt_kernel.param_def().output_defs();
-
-  // TODO(chenweihang): use ordered_map for VariableNameMap and VariableValueMap
-  // If we the VariableValueMap are ordered, we can get tensor by iter the map,
-  // and its order is same as OpProto, like follow
-  //
-  // size_t i = 0;
-  // for (auto& var_pair : ctx.inputs) {
-  //   // TODO(chenweihang): deal with diff param in vector
-  //   auto in_def = input_defs.at(i);
-  //   for (auto* var : var_pair.second) {
-  //     const auto& tensor = var->Get<LoDTensor>();
-  //     auto pt_in = MakeTensorImpl<pt::DenseTensor>(tensor, in_def.backend,
-  //                                                  in_def.dtype,
-  //                                                  in_def.layout);
-  //     op_kernel_ctx.EmplaceBackInput(pt_in);
-  //   }
-  //   ++i;
-  // }
-  // // ordered_map access mutable value need iter
-  // i = 0;
-  // for (auto it = ctx.outputs.begin(); it != ctx.outputs.end(); ++it) {
-  //   auto out_def = output_defs.at(i);
-  //   for (auto* var : it.value()) {
-  //     auto* tensor = var->GetMutable<LoDTensor>();
-  //     // mutable_data before run kernel, to avoid share output form
-  //     // OpKernelContext to original tensor
-  //     tensor->mutable_data(pt::TransToFluidPlace(out_def.backend),
-  //                          pt::TransToProtoVarType(out_def.dtype));
-  //     auto pt_out = MakeTensorImpl<pt::DenseTensor>(
-  //         *tensor, out_def.backend, out_def.dtype, out_def.layout);
-  //     op_kernel_ctx.EmplaceBackOutput(pt_out);
-  //   }
-  //   ++i;
-  // }
-
-  auto& op_proto = OpInfoMap::Instance().Get(op_type).proto_;
-  for (int i = 0; i < op_proto->inputs().size(); ++i) {
-    // TODO(chenweihang): deal with diff param in vector
-    auto in_name = op_proto->inputs()[i].name();
-    auto in_def = input_defs.at(i);
-    for (auto* var : ctx.inputs.at(in_name)) {
-      const auto& tensor = var->Get<LoDTensor>();
-      auto pt_in = MakeTensorImpl<pt::DenseTensor>(tensor, in_def.backend,
-                                                   in_def.dtype, in_def.layout);
-      op_kernel_ctx.EmplaceBackInput(pt_in);
-    }
-  }
-  for (int i = 0; i < op_proto->outputs().size(); ++i) {
-    auto out_name = op_proto->outputs()[i].name();
-    auto out_def = output_defs.at(i);
-    for (auto* var : ctx.outputs.at(out_name)) {
-      auto* tensor = var->GetMutable<LoDTensor>();
-      // mutable_data before run kernel, to avoid share output form
-      // OpKernelContext to original tensor
-      tensor->mutable_data(pt::TransToFluidPlace(out_def.backend),
-                           pt::TransToProtoVarType(out_def.dtype));
-      auto pt_out = MakeTensorImpl<pt::DenseTensor>(
-          *tensor, out_def.backend, out_def.dtype, out_def.layout);
-      op_kernel_ctx.EmplaceBackOutput(pt_out);
-    }
-  }
-  // TODO(chenweihang): append attrs
-  return op_kernel_ctx;
-}
-
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                  const platform::Place& place) const {
   // To reduce the elapsed time of HasAttr, we use bool variable to record the
@@ -1219,6 +1139,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
 
   // TODO(chenweihang): Now we are still reusing a lot of the original fluid
   // implementation, this is a gradual replacement process
+  // TODO(chenweihang): only for debug, remove it after
+  // print all registered kernels
+  VLOG(1) << pt::OpKernelFactory::Instance();
+
   run_pt_kernel_ =
       pt::OpKernelFactory::Instance().ContainsOperation(type_.c_str());
   if (run_pt_kernel_) {
@@ -1272,8 +1196,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
                                        platform::EventRole::kInnerOp);
     if (run_pt_kernel_) {
       // TODO(chenweihang): here will intrduce copy
-      auto op_kernel_ctx =
-          BuildOpKernelContext(Type(), *pt_kernel_, *runtime_ctx, *dev_ctx);
+      auto op_kernel_ctx = ConstructPtOpKernelContext(*runtime_ctx, *dev_ctx);
       (*pt_kernel_)(&op_kernel_ctx);
       // need share output into fluid tensor
 
@@ -1328,11 +1251,26 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   }
 }
 
+bool ContainsSelectedRows(const VariableValueMap& inputs) {
+  for (auto& var_pair : inputs) {
+    for (auto* var : var_pair.second) {
+      if (var->IsType<SelectedRows>()) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
 void OperatorWithKernel::ChoosePtKernel(
     const RuntimeContext& ctx, const platform::DeviceContext& dev_ctx) const {
   // 1. construct operation name
   // TODO(chenweihang): add rules for construct op name
   pt::OperationName op_name(Type().c_str());
+  // TODO(chenweihang): polish judge rules
+  if (ContainsSelectedRows(ctx.inputs)) {
+    op_name.overload_type = "selected_rows";
+  }
 
   // 2. construct op kernel key
   pt_kernel_key_.reset(new pt::OpKernelKey(
@@ -1883,5 +1821,113 @@ pt::OpKernelKey OperatorWithKernel::ConstructPtOpKernelKey(
   return pt::OpKernelKey(backend, layout, dtype);
 }
 
+pt::OpKernelContext OperatorWithKernel::ConstructPtOpKernelContext(
+    const RuntimeContext& ctx, const platform::DeviceContext& dev_ctx) const {
+  VLOG(1) << RuntimeContextDebugString(ctx);
+
+  // TODO(chenweihang): now only work for very simple case (sign op),
+  // many cases need to be deal with later:
+  // 1. the input and output are not tensor
+  // 2. the dispensbale, duplicable input and output
+  // 3. needless attributes remove
+  // 4. use pt Tensor directly
+  // 5. kernel input is not DenseTensor
+  pt::OpKernelContext op_kernel_ctx(dev_ctx);
+  auto input_defs = pt_kernel_->param_def().input_defs();
+  auto output_defs = pt_kernel_->param_def().output_defs();
+
+  // TODO(chenweihang): use ordered_map for VariableNameMap and VariableValueMap
+  // If we the VariableValueMap are ordered, we can get tensor by iter the map,
+  // and its order is same as OpProto
+
+  auto& op_proto = Info().proto_;
+  for (int i = 0; i < op_proto->inputs_size(); ++i) {
+    auto in = op_proto->inputs()[i];
+    // TODO(chenweihang): skip special cases temporarily
+    // TODO(chenweihang): deal with diff param in vector
+    if (in.has_dispensable() && in.dispensable()) {
+      VLOG(1) << "BuildOpKernelContext: skip dispensable input - " << in.name();
+      continue;
+    }
+    auto in_name = in.name();
+    auto in_def = input_defs.at(i);
+    for (auto* var : ctx.inputs.at(in_name)) {
+      if (var->IsType<LoDTensor>()) {
+        const auto& tensor = var->Get<LoDTensor>();
+        auto pt_in = MakeTensorImpl<pt::DenseTensor, LoDTensor>(
+            tensor, in_def.backend, in_def.dtype, in_def.layout);
+        op_kernel_ctx.EmplaceBackInput(pt_in);
+      } else if (var->IsType<SelectedRows>()) {
+        const auto& tensor = var->Get<SelectedRows>();
+        auto pt_in = MakeTensorImpl<pt::SelectedRowsTensor, SelectedRows>(
+            tensor, in_def.backend, in_def.dtype, in_def.layout);
+        op_kernel_ctx.EmplaceBackInput(pt_in);
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Unsupported shared input `%s` type now when call pt kernel.",
+            ToTypeName(var->Type())));
+      }
+    }
+  }
+  for (int i = 0; i < op_proto->outputs_size(); ++i) {
+    auto out_name = op_proto->outputs()[i].name();
+    auto out_def = output_defs.at(i);
+    for (auto* var : ctx.outputs.at(out_name)) {
+      // mutable_data before run kernel, to avoid share output form
+      // OpKernelContext to original tensor
+      if (var->IsType<LoDTensor>()) {
+        auto* tensor = var->GetMutable<LoDTensor>();
+        tensor->mutable_data(pt::TransToFluidPlace(out_def.backend),
+                             pt::TransToProtoVarType(out_def.dtype));
+        auto pt_out = MakeTensorImpl<pt::DenseTensor, LoDTensor>(
+            *tensor, out_def.backend, out_def.dtype, out_def.layout);
+        op_kernel_ctx.EmplaceBackOutput(pt_out);
+      } else if (var->IsType<SelectedRows>()) {
+        auto* tensor = var->GetMutable<SelectedRows>();
+        tensor->mutable_value()->mutable_data(
+            pt::TransToFluidPlace(out_def.backend),
+            pt::TransToProtoVarType(out_def.dtype));
+        auto pt_out = MakeTensorImpl<pt::SelectedRowsTensor, SelectedRows>(
+            *tensor, out_def.backend, out_def.dtype, out_def.layout);
+        op_kernel_ctx.EmplaceBackOutput(pt_out);
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Unsupported shared output `%s` type now when call pt kernel.",
+            ToTypeName(var->Type())));
+      }
+    }
+  }
+  for (int i = 0; i < op_proto->attrs_size(); ++i) {
+    auto attr = op_proto->attrs()[i];
+    // TODO(chenweihang): skip extra attrs by extra value
+    // if (attr.has_extra() && attr.extra()) {
+    //   continue;
+    // }
+    if (attr.name() == "use_mkldnn" || attr.name() == "op_role" ||
+        attr.name() == "op_role_var" || attr.name() == "op_namescope" ||
+        attr.name() == "op_callstack" || attr.name() == "op_device") {
+      continue;
+    }
+    switch (attr.type()) {
+      case proto::AttrType::INT:
+        op_kernel_ctx.EmplaceBackAttr(Attr<int>(attr.name()));
+        break;
+      case proto::AttrType::FLOAT:
+        op_kernel_ctx.EmplaceBackAttr(Attr<float>(attr.name()));
+        break;
+      case proto::AttrType::BOOLEAN:
+        op_kernel_ctx.EmplaceBackAttr(Attr<bool>(attr.name()));
+        break;
+      default:
+        // TODO(chenweihang): support other attrs type
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "unsupported cast op `%s`'s attribute `%s` when construct "
+            "OpKernelContext.",
+            Type(), attr.name()));
+    }
+  }
+  return op_kernel_ctx;
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 5d62b187973c0..f8bd284691790 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -538,6 +538,9 @@ class OperatorWithKernel : public OperatorBase {
   virtual pt::OpKernelKey ConstructPtOpKernelKey(
       const VariableValueMap& inputs, const platform::Place& ctx_place) const;
 
+  virtual pt::OpKernelContext ConstructPtOpKernelContext(
+      const RuntimeContext& ctx, const platform::DeviceContext& dev_ctx) const;
+
  private:
   void RunImpl(const Scope& scope, const platform::Place& place) const final;
   void RunImpl(const Scope& scope, const platform::Place& place,
diff --git a/paddle/fluid/framework/top_utils.cc b/paddle/fluid/framework/top_utils.cc
index 47cd13154193f..a0624b8c2bd8a 100644
--- a/paddle/fluid/framework/top_utils.cc
+++ b/paddle/fluid/framework/top_utils.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/top_utils.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/selected_rows.h"
 
 namespace paddle {
 namespace framework {
@@ -20,7 +22,24 @@ namespace framework {
 /* For DenseTensor */
 
 template <>
-std::shared_ptr<pt::DenseTensor> MakeTensorImpl<pt::DenseTensor>(
+std::shared_ptr<pt::DenseTensor> MakeTensorImpl<pt::DenseTensor, LoDTensor>(
+    const LoDTensor& tensor, pt::Backend backend, pt::DataType dtype,
+    pt::DataLayout layout) {
+  auto holder = tensor.Holder();
+  auto tensor_impl = std::make_shared<pt::DenseTensor>(
+      pt::TensorMeta(tensor.dims(), backend, dtype, layout, tensor.offset()),
+      pt::TensorStatus());
+
+  if (holder != nullptr) {
+    tensor_impl->ShareAllocation(tensor.Holder());
+  } else {
+    VLOG(1) << "Old LoDTensor holder is nullptr.";
+  }
+  return tensor_impl;
+}
+
+template <>
+std::shared_ptr<pt::DenseTensor> MakeTensorImpl<pt::DenseTensor, Tensor>(
     const Tensor& tensor, pt::Backend backend, pt::DataType dtype,
     pt::DataLayout layout) {
   auto holder = tensor.Holder();
@@ -36,13 +55,49 @@ std::shared_ptr<pt::DenseTensor> MakeTensorImpl<pt::DenseTensor>(
   return tensor_impl;
 }
 
+template <>
+std::shared_ptr<pt::SelectedRowsTensor>
+MakeTensorImpl<pt::SelectedRowsTensor, SelectedRows>(const SelectedRows& tensor,
+                                                     pt::Backend backend,
+                                                     pt::DataType dtype,
+                                                     pt::DataLayout layout) {
+  auto value = tensor.value();
+  auto holder = value.Holder();
+  auto tensor_impl = std::make_shared<pt::SelectedRowsTensor>(
+      pt::TensorMeta(value.dims(), backend, dtype, layout, value.offset()),
+      pt::TensorStatus(), tensor.rows(), tensor.height());
+
+  if (holder != nullptr) {
+    tensor_impl->mutable_value()->ShareAllocation(tensor.value().Holder());
+  } else {
+    VLOG(1) << "Old SelectedRows holder is nullptr.";
+  }
+  return tensor_impl;
+}
+
+template <>
+std::shared_ptr<pt::DenseTensor> MakeTensorImpl<pt::DenseTensor>(
+    const LoDTensor& tensor, const platform::Place& place,
+    proto::VarType::Type type) {
+  return MakeTensorImpl<pt::DenseTensor, LoDTensor>(
+      tensor, pt::TransToPtBackend(place), pt::TransToPtDataType(type),
+      pt::TransToPtLayout(tensor.layout()));
+}
+
 template <>
 std::shared_ptr<pt::DenseTensor> MakeTensorImpl<pt::DenseTensor>(
     const Tensor& tensor, const platform::Place& place,
     proto::VarType::Type type) {
-  return MakeTensorImpl<pt::DenseTensor>(tensor, pt::TransToPtBackend(place),
-                                         pt::TransToPtDataType(type),
-                                         pt::TransToPtLayout(tensor.layout()));
+  return MakeTensorImpl<pt::DenseTensor, Tensor>(
+      tensor, pt::TransToPtBackend(place), pt::TransToPtDataType(type),
+      pt::TransToPtLayout(tensor.layout()));
+}
+
+template <>
+void ShareTensorImpl<pt::DenseTensor>(pt::DenseTensor* tensor_impl,
+                                      LoDTensor* out) {
+  out->ResetHolderWithType(tensor_impl->allocation(),
+                           pt::TransToProtoVarType(tensor_impl->type()));
 }
 
 template <>
diff --git a/paddle/fluid/framework/top_utils.h b/paddle/fluid/framework/top_utils.h
index 0411992608119..32487569a1722 100644
--- a/paddle/fluid/framework/top_utils.h
+++ b/paddle/fluid/framework/top_utils.h
@@ -22,19 +22,27 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-template <typename TensorImplT>
-std::shared_ptr<TensorImplT> MakeTensorImpl(const Tensor& tensor,
-                                            pt::Backend backend,
-                                            pt::DataType dtype,
-                                            pt::DataLayout layout);
-
-template <typename TensorImplT>
-std::shared_ptr<TensorImplT> MakeTensorImpl(const Tensor& tensor,
-                                            const platform::Place& place,
-                                            proto::VarType::Type type);
-
-template <typename TensorImplT>
-void ShareTensorImpl(TensorImplT* tensor_impl, Tensor* out);
+template <typename PtTensorImplT, typename VariableT>
+std::shared_ptr<PtTensorImplT> MakeTensorImpl(const VariableT& tensor,
+                                              pt::Backend backend,
+                                              pt::DataType dtype,
+                                              pt::DataLayout layout);
+
+template <typename PtTensorImplT>
+std::shared_ptr<PtTensorImplT> MakeTensorImpl(const LoDTensor& tensor,
+                                              const platform::Place& place,
+                                              proto::VarType::Type type);
+
+template <typename PtTensorImplT>
+std::shared_ptr<PtTensorImplT> MakeTensorImpl(const Tensor& tensor,
+                                              const platform::Place& place,
+                                              proto::VarType::Type type);
+
+template <typename PtTensorImplT>
+void ShareTensorImpl(PtTensorImplT* tensor_impl, LoDTensor* out);
+
+template <typename PtTensorImplT>
+void ShareTensorImpl(PtTensorImplT* tensor_impl, Tensor* out);
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 2a9193216d46b..4799a67695b59 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -252,8 +252,9 @@ static pt::OpKernelContext BuildDygraphOpKernelContext(
     for (auto var : var_pair.second) {
       const auto& variable = var->Var();
       const auto& tensor = variable.template Get<framework::LoDTensor>();
-      auto pt_in = framework::MakeTensorImpl<pt::DenseTensor>(
-          tensor, in_def.backend, in_def.dtype, in_def.layout);
+      auto pt_in =
+          framework::MakeTensorImpl<pt::DenseTensor, framework::LoDTensor>(
+              tensor, in_def.backend, in_def.dtype, in_def.layout);
       op_kernel_ctx.EmplaceBackInput(pt_in);
     }
     ++i;
@@ -269,8 +270,9 @@ static pt::OpKernelContext BuildDygraphOpKernelContext(
       // OpKernelContext to original tensor
       tensor->mutable_data(pt::TransToFluidPlace(out_def.backend),
                            pt::TransToProtoVarType(out_def.dtype));
-      auto pt_out = framework::MakeTensorImpl<pt::DenseTensor>(
-          *tensor, out_def.backend, out_def.dtype, out_def.layout);
+      auto pt_out =
+          framework::MakeTensorImpl<pt::DenseTensor, framework::LoDTensor>(
+              *tensor, out_def.backend, out_def.dtype, out_def.layout);
       op_kernel_ctx.EmplaceBackOutput(pt_out);
     }
     ++i;
diff --git a/paddle/top/CMakeLists.txt b/paddle/top/CMakeLists.txt
index 42e8087ac36be..7b8de81d6c667 100644
--- a/paddle/top/CMakeLists.txt
+++ b/paddle/top/CMakeLists.txt
@@ -17,8 +17,6 @@ endif()
 if(WITH_XPU)
   add_subdirectory(xpu)
 endif()
-# top kernels for other tensor
-add_subdirectory(selected_rows)
 # top infershape
 add_subdirectory(infershape)
 # top public functors
diff --git a/paddle/top/api/include/dev/core.h b/paddle/top/api/include/dev/core.h
index c6ff5915e5ed8..547c6b3568c1e 100644
--- a/paddle/top/api/include/dev/core.h
+++ b/paddle/top/api/include/dev/core.h
@@ -20,3 +20,4 @@ limitations under the License. */
 #include "paddle/top/core/kernel_context.h"
 #include "paddle/top/core/kernel_factory.h"
 #include "paddle/top/core/mkldnn_dense_tensor.h"
+#include "paddle/top/core/selected_rows_tensor.h"
diff --git a/paddle/top/api/include/dev/math.h b/paddle/top/api/include/dev/math.h
index be6c5df762697..e40ed490317d2 100644
--- a/paddle/top/api/include/dev/math.h
+++ b/paddle/top/api/include/dev/math.h
@@ -19,5 +19,4 @@ limitations under the License. */
 #include "paddle/top/cuda/math.h"
 #include "paddle/top/mkldnn/math.h"
 #include "paddle/top/npu/math.h"
-#include "paddle/top/selected_rows/math.h"
 #include "paddle/top/xpu/math.h"
diff --git a/paddle/top/core/CMakeLists.txt b/paddle/top/core/CMakeLists.txt
index e982f837abadf..90a2e170d46fd 100644
--- a/paddle/top/core/CMakeLists.txt
+++ b/paddle/top/core/CMakeLists.txt
@@ -10,7 +10,7 @@ cc_library(layout SRCS layout.cc)
 
 cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place backend dtype layout)
 cc_library(dense_tensor SRCS dense_tensor.cc DEPS enforce data_type ddim allocator place convert_utils ${MKLDNN_CTX_DEPS})
-cc_library(selected_rows_tensor SRCS selected_rows.cc DEPS dense_tensor)
+cc_library(selected_rows_tensor SRCS selected_rows_tensor.cc DEPS dense_tensor)
 
 cc_library(kernel_factory SRCS kernel_factory.cc DEPS enforce backend dtype layout)
 cc_library(kernel_context SRCS kernel_context.cc DEPS enforce device_context)
diff --git a/paddle/top/core/kernel_registry.h b/paddle/top/core/kernel_registry.h
index d6107d7dc36a5..eec3565ca846b 100644
--- a/paddle/top/core/kernel_registry.h
+++ b/paddle/top/core/kernel_registry.h
@@ -61,6 +61,13 @@ class OpKernelRegistrar {
   OpKernelKey op_kernel_key_;
 };
 
+#if defined(_WIN32)
+#define UNUSED
+#define __builtin_expect(EXP, C) (EXP)
+#else
+#define UNUSED __attribute__((unused))
+#endif
+
 #define PT_STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg)                     \
   struct __test_global_namespace_##uniq_name##__ {};                          \
   static_assert(std::is_same<::__test_global_namespace_##uniq_name##__,       \
@@ -226,13 +233,6 @@ class OpKernelRegistrar {
  * Op Kernel declare macros
  */
 
-#if defined(_WIN32)
-#define UNUSED
-#define __builtin_expect(EXP, C) (EXP)
-#else
-#define UNUSED __attribute__((unused))
-#endif
-
 #define PT_DECLARE_KERNEL_1T(op_name, backend, layout, dtype)                 \
   PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                          \
       __dec_pt_op_kernel_##op_name##_##backend##_##layout##_##dtype##__,      \
diff --git a/paddle/top/core/selected_rows.cc b/paddle/top/core/selected_rows_tensor.cc
similarity index 92%
rename from paddle/top/core/selected_rows.cc
rename to paddle/top/core/selected_rows_tensor.cc
index 9655f594c8ea4..8dad949a75422 100644
--- a/paddle/top/core/selected_rows.cc
+++ b/paddle/top/core/selected_rows_tensor.cc
@@ -12,6 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/top/core/selected_rows.h"
+#include "paddle/top/core/selected_rows_tensor.h"
 
 namespace pt {}  // namespace pt
diff --git a/paddle/top/core/selected_rows.h b/paddle/top/core/selected_rows_tensor.h
similarity index 74%
rename from paddle/top/core/selected_rows.h
rename to paddle/top/core/selected_rows_tensor.h
index dc5c6a42d0681..0aa4fa9a6c3c6 100644
--- a/paddle/top/core/selected_rows.h
+++ b/paddle/top/core/selected_rows_tensor.h
@@ -21,6 +21,7 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 
+#include "paddle/top/core/dense_tensor.h"
 #include "paddle/top/core/tensor_interface.h"
 
 // See Note [ Why still include the fluid headers? ]
@@ -48,19 +49,40 @@ class SelectedRowsTensor : public TensorInterface {
  public:
   SelectedRowsTensor() = delete;
 
-  SelectedRowsTensor(const SelectedRowsTensor&) = delete;
-  SelectedRowsTensor& operator=(const SelectedRowsTensor&) = delete;
+  // SelectedRowsTensor(const SelectedRowsTensor&) = delete;
+  // SelectedRowsTensor& operator=(const SelectedRowsTensor&) = delete;
   SelectedRowsTensor(SelectedRowsTensor&&) = delete;
   SelectedRowsTensor& operator=(SelectedRowsTensor&&) = delete;
 
   SelectedRowsTensor(const TensorMeta& meta,
                      const TensorStatus& status,
                      const std::vector<int64_t>& rows,
-                     int64_t height)
-      : rows_(rows), height_(height) {
+                     int64_t height) {
     value_.reset(new DenseTensor(meta, status));
+    rows_ = rows;
+    height_ = height;
   }
 
+  ~SelectedRowsTensor() override {}
+
+  int64_t numel() const override { return value_->numel(); }
+
+  DDim dims() const override {
+    std::vector<int64_t> dims = vectorize(value_->dims());
+    dims[0] = height_;
+    return paddle::framework::make_ddim(dims);
+  }
+
+  DataType type() const override { return value_->type(); }
+
+  DataLayout layout() const override { return value_->layout(); }
+
+  Place place() const override { return value_->place(); }
+
+  Backend backend() const override { return value_->backend(); }
+
+  bool initialized() const override { return value_->initialized(); }
+
   const DenseTensor& value() const { return *value_; }
 
   DenseTensor* mutable_value() { return value_.get(); }
diff --git a/paddle/top/cpu/math.cc b/paddle/top/cpu/math.cc
index 32f785f6b20a9..dd48987549415 100644
--- a/paddle/top/cpu/math.cc
+++ b/paddle/top/cpu/math.cc
@@ -14,6 +14,12 @@
 
 #include "paddle/top/cpu/math.h"
 
+// #include "paddle/top/module/scale.h"
+// #include "paddle/top/module/sign.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/framework/eigen.h"
+
 namespace pt {
 
 template <typename T,
@@ -39,15 +45,28 @@ void Mean(const CPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
   y_data.device(place) = x_data.mean();
 }
 
-template <typename T>
-void Scale(const CPUContext& dev_ctx,
-           const DenseTensor& x,
-           float scale,
-           float bias,
-           bool bias_after_scale,
-           DenseTensor* out) {
-  module::Scale<CPUContext, T>(dev_ctx, x, scale, bias, bias_after_scale, out);
-}
+// template <typename T>
+// void Scale(const CPUContext& dev_ctx,
+//            const DenseTensor& x,
+//            float scale,
+//            float bias,
+//            bool bias_after_scale,
+//            DenseTensor* out) {
+//   module::Scale<CPUContext, T>(dev_ctx, x, scale, bias, bias_after_scale,
+//   out);
+// }
+
+// template <typename T>
+// void ScaleSelectedRows(const CPUContext& dev_ctx,
+//           const SelectedRowsTensor& x,
+//           float scale,
+//           float bias,
+//           bool bias_after_scale,
+//           SelectedRowsTensor* out) {
+//   out->set_rows(x.rows());
+//   out->set_height(x.height());
+//   Scale<T>(dev_ctx, x.value(), scale, bias, bias_after_scale, out->value());
+// }
 
 }  // namespace pt
 
@@ -68,15 +87,27 @@ using bfloat16 = ::paddle::platform::bfloat16;
 // Register method 3:
 PT_REGISTER_KERNEL_2T(sign, CPU, NCHW, pt::Sign, float, double);
 PT_REGISTER_KERNEL_2T(mean, CPU, NCHW, pt::Mean, float, double);
-PT_REGISTER_KERNEL_8T(scale,
-                      CPU,
-                      NCHW,
-                      pt::Scale,
-                      float,
-                      double,
-                      bfloat16,
-                      uint8_t,
-                      int8_t,
-                      int16_t,
-                      int,
-                      int64_t);
+// PT_REGISTER_KERNEL_8T(scale,
+//                       CPU,
+//                       NCHW,
+//                       pt::Scale,
+//                       float,
+//                       double,
+//                       bfloat16,
+//                       uint8_t,
+//                       int8_t,
+//                       int16_t,
+//                       int,
+//                       int64_t);
+// PT_REGISTER_KERNEL_8T(scale.selected_rows,
+//                       CPU,
+//                       NCHW,
+//                       pt::ScaleSelectedRows,
+//                       float,
+//                       double,
+//                       bfloat16,
+//                       uint8_t,
+//                       int8_t,
+//                       int16_t,
+//                       int,
+//                       int64_t);
diff --git a/paddle/top/cpu/math.h b/paddle/top/cpu/math.h
index 6bc2b4a49cc9e..2d2fd12140363 100644
--- a/paddle/top/cpu/math.h
+++ b/paddle/top/cpu/math.h
@@ -16,11 +16,12 @@ limitations under the License. */
 
 #include "paddle/top/core/dense_tensor.h"
 #include "paddle/top/core/kernel_registry.h"
+#include "paddle/top/core/selected_rows_tensor.h"
+
 #include "paddle/top/module/scale.h"
 #include "paddle/top/module/sign.h"
 
 // See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/platform/device_context.h"
 
 namespace pt {
@@ -39,7 +40,17 @@ void Scale(const CPUContext& dev_ctx,
            float scale,
            float bias,
            bool bias_after_scale,
-           DenseTensor* out);
+           DenseTensor* out) {
+  module::Scale<CPUContext, T>(dev_ctx, x, scale, bias, bias_after_scale, out);
+}
+
+// template <typename T>
+// void ScaleSelectedRows(const CPUContext& dev_ctx,
+//         const SelectedRowsTensor& x,
+//         float scale,
+//         float bias,
+//         bool bias_after_scale,
+//         SelectedRowsTensor* out);
 
 }  // namespace pt
 
diff --git a/paddle/top/cuda/math.cu b/paddle/top/cuda/math.cu
index bc0db97506bc7..d16581d953544 100644
--- a/paddle/top/cuda/math.cu
+++ b/paddle/top/cuda/math.cu
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #include "paddle/top/cuda/math.h"
 
-#include "paddle/top/module/scale.h"
-#include "paddle/top/module/sign.h"
+// #include "paddle/top/module/scale.h"
+// #include "paddle/top/module/sign.h"
 
 #ifdef __NVCC__
 #include "cub/cub.cuh"
@@ -84,30 +84,55 @@ void Mean(const CUDAContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
   PADDLE_ENFORCE_CUDA_SUCCESS(err);
 }
 
-template <typename T>
-void Scale(const CUDAContext& dev_ctx,
-           const DenseTensor& x,
-           float scale,
-           float bias,
-           bool bias_after_scale,
-           DenseTensor* out) {
-  module::Scale<CUDAContext, T>(dev_ctx, x, scale, bias, bias_after_scale, out);
-}
+// template <typename T>
+// void Scale(const CUDAContext& dev_ctx,
+//            const DenseTensor& x,
+//            float scale,
+//            float bias,
+//            bool bias_after_scale,
+//            DenseTensor* out) {
+//   module::Scale<CUDAContext, T>(dev_ctx, x, scale, bias, bias_after_scale,
+//   out);
+// }
+
+// template <typename T>
+// void ScaleSelectedRows(const CUDAContext& dev_ctx,
+//           const SelectedRowsTensor& x,
+//           float scale,
+//           float bias,
+//           bool bias_after_scale,
+//           SelectedRowsTensor* out) {
+//   out->set_rows(x.rows());
+//   out->set_height(x.height());
+//   Scale<T>(dev_ctx, x.value(), scale, bias, bias_after_scale, out->value());
+// }
 
 }  // namespace pt
 
 using float16 = paddle::platform::float16;
 PT_REGISTER_KERNEL_3T(sign, CUDA, NCHW, pt::Sign, float, double, float16);
 PT_REGISTER_KERNEL_3T(mean, CUDA, NCHW, pt::Mean, float, double, float16);
-PT_REGISTER_KERNEL_8T(scale,
-                      CUDA,
-                      NCHW,
-                      pt::Scale,
-                      float,
-                      double,
-                      float16,
-                      uint8_t,
-                      int8_t,
-                      int16_t,
-                      int,
-                      int64_t);
+// PT_REGISTER_KERNEL_8T(scale,
+//                       CUDA,
+//                       NCHW,
+//                       pt::Scale,
+//                       float,
+//                       double,
+//                       float16,
+//                       uint8_t,
+//                       int8_t,
+//                       int16_t,
+//                       int,
+//                       int64_t);
+// PT_REGISTER_KERNEL_8T(scale.selected_rows,
+//                       CUDA,
+//                       NCHW,
+//                       pt::ScaleSelectedRows,
+//                       float,
+//                       double,
+//                       float16,
+//                       uint8_t,
+//                       int8_t,
+//                       int16_t,
+//                       int,
+//                       int64_t);
diff --git a/paddle/top/cuda/math.h b/paddle/top/cuda/math.h
index e3c89f3d4966e..66bacea1dab48 100644
--- a/paddle/top/cuda/math.h
+++ b/paddle/top/cuda/math.h
@@ -18,6 +18,10 @@ limitations under the License. */
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 
 #include "paddle/top/core/dense_tensor.h"
+#include "paddle/top/core/selected_rows_tensor.h"
+
+#include "paddle/top/module/scale.h"
+#include "paddle/top/module/sign.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
@@ -38,7 +42,17 @@ void Scale(const CUDAContext& dev_ctx,
            float scale,
            float bias,
            bool bias_after_scale,
-           DenseTensor* out);
+           DenseTensor* out) {
+  module::Scale<CUDAContext, T>(dev_ctx, x, scale, bias, bias_after_scale, out);
+}
+
+// template <typename T>
+// void ScaleSelectedRows(const CUDAContext& dev_ctx,
+//         const SelectedRowsTensor& x,
+//         float scale,
+//         float bias,
+//         bool bias_after_scale,
+//         SelectedRowsTensor* out);
 
 }  // namespace pt
 
diff --git a/paddle/top/selected_rows/CMakeLists.txt b/paddle/top/selected_rows/CMakeLists.txt
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/paddle/top/selected_rows/math.h b/paddle/top/selected_rows/math.h
deleted file mode 100644
index 84e8f15860ed8..0000000000000
--- a/paddle/top/selected_rows/math.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/top/core/selected_rows.h"
-
-// In fact, it is ugly to use such a complicated include
-// relationship when coding.
-// After the kernel registration module is completed, the calculation
-// function should be reused by calling the kernel in global KernelMap.
-#include "paddle/top/cpu/math.h"
-#include "paddle/top/cuda/math.h"
-#include "paddle/top/npu/math.h"
-#include "paddle/top/xpu/math.h"
-
-// See Note [ Why still include the fluid headers? ]
-
-namespace pt {
-
-// TODO(chenweihang): also support CUDA, XPU, NPU, ...
-template <typename T>
-void Scale(const CPUContext& dev_ctx,
-           const SelectedRowsTensor& x,
-           float scale,
-           float bias,
-           bool bias_after_scale,
-           SelectedRowsTensor* out) {
-  out->set_rows(x.rows());
-  out->set_height(x.height());
-  Scale<T>(dev_ctx, x.value(), scale, bias, bias_after_scale, out->value());
-}
-
-}  // namespace pt

From 568bebd0de3a6420b042c674469e6d58ce252d56 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Mon, 6 Sep 2021 05:17:53 +0000
Subject: [PATCH 041/125] fix code format error

---
 paddle/fluid/imperative/prepared_operator.cc | 20 ++++++++++----------
 paddle/top/cpu/math.cc                       |  2 +-
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 048653fa63ee6..6a0f58f663f1c 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -174,16 +174,16 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
     auto& kernels = kernels_iter->second;
     auto kernel_iter = kernels.find(expected_kernel_key);
 #ifdef PADDLE_WITH_XPU
-  if (is_xpu_place(expected_kernel_key.place_) &&
-      (kernel_iter == kernels.end() ||
-       !paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key) ||
-       paddle::platform::is_in_xpu_black_list(op.Type()))) {
-    VLOG(3) << "missing XPU kernel: " << op.Type()
-            << ", expected_kernel_key:" << expected_kernel_key
-            << ", fallbacking to CPU one!";
-    expected_kernel_key.place_ = platform::CPUPlace();
-    kernel_iter = kernels.find(expected_kernel_key);
-  }
+    if (is_xpu_place(expected_kernel_key.place_) &&
+        (kernel_iter == kernels.end() ||
+         !paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key) ||
+         paddle::platform::is_in_xpu_black_list(op.Type()))) {
+      VLOG(3) << "missing XPU kernel: " << op.Type()
+              << ", expected_kernel_key:" << expected_kernel_key
+              << ", fallbacking to CPU one!";
+      expected_kernel_key.place_ = platform::CPUPlace();
+      kernel_iter = kernels.find(expected_kernel_key);
+    }
 #endif
 #ifdef PADDLE_WITH_ASCEND_CL
     if (kernel_iter == kernels.end() &&
diff --git a/paddle/top/cpu/math.cc b/paddle/top/cpu/math.cc
index dd48987549415..c9b8afe63bdd7 100644
--- a/paddle/top/cpu/math.cc
+++ b/paddle/top/cpu/math.cc
@@ -70,7 +70,7 @@ void Mean(const CPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
 
 }  // namespace pt
 
-using bfloat16 = ::paddle::platform::bfloat16;
+// using bfloat16 = ::paddle::platform::bfloat16;
 
 // Register method 1:
 // PT_REGISTER_KERNEL_STANDARD(sign, CPU, NCHW, FLOAT32, pt::Sign<float>)

From 0eedc924bf74be97f20333c32335fc11c68d40d4 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Tue, 7 Sep 2021 06:40:13 +0000
Subject: [PATCH 042/125] add new kernel registrar marco

---
 paddle/top/core/kernel_def.h      |   3 +
 paddle/top/core/kernel_factory.h  |  11 +
 paddle/top/core/kernel_registry.h | 536 ++++++++++++++++++------------
 paddle/top/cpu/math.cc            |   8 +-
 paddle/top/cpu/math.h             |   2 -
 paddle/top/cuda/math.cu           |  10 +-
 paddle/top/xpu/math.cc            |   2 +-
 7 files changed, 352 insertions(+), 220 deletions(-)

diff --git a/paddle/top/core/kernel_def.h b/paddle/top/core/kernel_def.h
index 206afa8a9ed95..282e9ded2e4d1 100644
--- a/paddle/top/core/kernel_def.h
+++ b/paddle/top/core/kernel_def.h
@@ -16,7 +16,10 @@
 
 namespace pt {
 
+class OpKernel;
 class OpKernelContext;
+
 using OpKernelFn = void (*)(OpKernelContext* ctx);
+using OpKernelParamDefFn = void (*)(OpKernel* kernel);
 
 }  // namespace pt
diff --git a/paddle/top/core/kernel_factory.h b/paddle/top/core/kernel_factory.h
index 53c43d26fb047..12d99ab7dde28 100644
--- a/paddle/top/core/kernel_factory.h
+++ b/paddle/top/core/kernel_factory.h
@@ -29,6 +29,17 @@
 
 namespace pt {
 
+/**
+ * [ Naming considerations ]
+ *
+ * The tensor operation library contains many operations, and the operation
+ * in each specific scenario is represented by an operation kernel.
+ *
+ * We directly named it `Kernel` instead of `OpKernel`, the tensor operation
+ * library here and fluid are independent, avoiding developers from
+ * misunderstanding the relationship between the two concepts.
+ */
+
 class OpKernelContext;
 
 using OpKernelFn = void (*)(OpKernelContext* ctx);
diff --git a/paddle/top/core/kernel_registry.h b/paddle/top/core/kernel_registry.h
index eec3565ca846b..f473af47ea54f 100644
--- a/paddle/top/core/kernel_registry.h
+++ b/paddle/top/core/kernel_registry.h
@@ -30,35 +30,20 @@ class OpKernelRegistrar {
                     Backend backend,
                     DataLayout layout,
                     DataType dtype,
-                    OpKernelFn fn)
-      : op_name_(op_name), op_kernel_key_(backend, layout, dtype) {
-    OpKernel kernel(fn);
-    OpKernelFactory::Instance().kernels()[op_name_][op_kernel_key_] = kernel;
+                    OpKernelParamDefFn param_def_fn,
+                    OpKernelFn kernel_fn) {
+    OperationName final_op_name(op_name);
+    OpKernelKey op_kernel_key(backend, layout, dtype);
+    OpKernel kernel(kernel_fn);
+    param_def_fn(&kernel);
+
+    // TODO(chenweihang): use default input and output for verify
+    kernel.mutable_param_def()->AppendInput(backend, layout, dtype);
+    kernel.mutable_param_def()->AppendOutput(backend, layout, dtype);
+
+    OpKernelFactory::Instance().kernels()[final_op_name][op_kernel_key] =
+        kernel;
   }
-
-  OpKernelRegistrar& Input(Backend backend, DataLayout layout, DataType dtype) {
-    OpKernelFactory::Instance()
-        .kernels()[op_name_][op_kernel_key_]
-        .mutable_param_def()
-        ->AppendInput(backend, layout, dtype);
-    return *this;
-  }
-
-  OpKernelRegistrar& Output(Backend backend,
-                            DataLayout layout,
-                            DataType dtype) {
-    OpKernelFactory::Instance()
-        .kernels()[op_name_][op_kernel_key_]
-        .mutable_param_def()
-        ->AppendOutput(backend, layout, dtype);
-    return *this;
-  }
-
-  void Touch() {}
-
- private:
-  OperationName op_name_;
-  OpKernelKey op_kernel_key_;
 };
 
 #if defined(_WIN32)
@@ -68,12 +53,322 @@ class OpKernelRegistrar {
 #define UNUSED __attribute__((unused))
 #endif
 
-#define PT_STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg)                     \
+#define PT_STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg) \
+  _PT_STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg)
+
+#define _PT_STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg)                    \
   struct __test_global_namespace_##uniq_name##__ {};                          \
   static_assert(std::is_same<::__test_global_namespace_##uniq_name##__,       \
                              __test_global_namespace_##uniq_name##__>::value, \
                 msg)
 
+#ifdef __COUNTER__
+#define PT_ID __COUNTER__
+#else
+#define PT_ID __LINE__
+#endif
+
+#define PT_CONCATENATE(arg1, arg2) PT_CONCATENATE1(arg1, arg2)
+#define PT_CONCATENATE1(arg1, arg2) PT_CONCATENATE2(arg1, arg2)
+#define PT_CONCATENATE2(arg1, arg2) arg1##arg2
+
+// reference:
+// https://stackoverflow.com/questions/1872220/is-it-possible-to-iterate-over-arguments-in-variadic-macros
+#define PT_NARGS(...) _PT_NARGS(__VA_ARGS__, _PT_RESQ_N())
+#define _PT_NARGS(...) _PT_ARG_N(__VA_ARGS__)
+#define _PT_ARG_N(_1, _2, _3, _4, _5, _6, _7, _8, N, ...) N
+#define _PT_RESQ_N() 8, 7, 6, 5, 4, 3, 2, 1, 0
+
+#define PT_REGISTER_KERNEL(                                   \
+    op_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \
+  _PT_REGISTER_KERNEL(                                        \
+      op_name, PT_ID, backend, layout, meta_kernel_fn, cpp_dtype, __VA_ARGS__)
+
+#define _PT_REGISTER_KERNEL(                                           \
+    op_name, func_id, backend, layout, meta_kernel_fn, cpp_dtype, ...) \
+  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                   \
+      PT_CONCATENATE(pt_op_kernel_ns_check_, func_id),                 \
+      "PT_REGISTER_KERNEL must be called in global namespace.");       \
+  PT_KERNEL_SPECIALIZE(meta_kernel_fn, cpp_dtype, __VA_ARGS__);        \
+  static void PT_CONCATENATE(__PT_KERNEL_PARAM_DEF_FN_,                \
+                             func_id)(::pt::OpKernel*);                \
+  PT_KERNEL_REGISTRAR_INIT(                                            \
+      op_name,                                                         \
+      func_id,                                                         \
+      backend,                                                         \
+      layout,                                                          \
+      &PT_CONCATENATE(__PT_KERNEL_PARAM_DEF_FN_, func_id),             \
+      meta_kernel_fn,                                                  \
+      cpp_dtype,                                                       \
+      __VA_ARGS__);                                                    \
+  void PT_CONCATENATE(__PT_KERNEL_PARAM_DEF_FN_,                       \
+                      func_id)(::pt::OpKernel * kernel)
+
+#define PT_KERNEL_SPECIALIZE(meta_kernel_fn, cpp_dtype, ...) \
+  _PT_KERNEL_SPECIALIZE(PT_NARGS(cpp_dtype, __VA_ARGS__),    \
+                        meta_kernel_fn,                      \
+                        cpp_dtype,                           \
+                        __VA_ARGS__)
+
+#define _PT_KERNEL_SPECIALIZE(N, meta_kernel_fn, cpp_dtype, ...) \
+  PT_CONCATENATE(_PT_KERNEL_SPECIALIZE_, N)                      \
+  (meta_kernel_fn, cpp_dtype, __VA_ARGS__)
+
+#define _PT_KERNEL_SPECIALIZE_1(meta_kernel_fn, cpp_dtype, ...) \
+  template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>
+#define _PT_KERNEL_SPECIALIZE_2(meta_kernel_fn, cpp_dtype, ...)           \
+  template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>; \
+  _PT_KERNEL_SPECIALIZE_1(meta_kernel_fn, __VA_ARGS__)
+#define _PT_KERNEL_SPECIALIZE_3(meta_kernel_fn, cpp_dtype, ...)           \
+  template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>; \
+  _PT_KERNEL_SPECIALIZE_2(meta_kernel_fn, __VA_ARGS__)
+#define _PT_KERNEL_SPECIALIZE_4(meta_kernel_fn, cpp_dtype, ...)           \
+  template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>; \
+  _PT_KERNEL_SPECIALIZE_3(meta_kernel_fn, __VA_ARGS__)
+#define _PT_KERNEL_SPECIALIZE_5(meta_kernel_fn, cpp_dtype, ...)           \
+  template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>; \
+  _PT_KERNEL_SPECIALIZE_4(meta_kernel_fn, __VA_ARGS__)
+#define _PT_KERNEL_SPECIALIZE_6(meta_kernel_fn, cpp_dtype, ...)           \
+  template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>; \
+  _PT_KERNEL_SPECIALIZE_5(meta_kernel_fn, __VA_ARGS__)
+#define _PT_KERNEL_SPECIALIZE_7(meta_kernel_fn, cpp_dtype, ...)           \
+  template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>; \
+  _PT_KERNEL_SPECIALIZE_6(meta_kernel_fn, __VA_ARGS__)
+#define _PT_KERNEL_SPECIALIZE_8(meta_kernel_fn, cpp_dtype, ...)           \
+  template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>; \
+  _PT_KERNEL_SPECIALIZE_7(meta_kernel_fn, __VA_ARGS__)
+
+#define PT_KERNEL_REGISTRAR_INIT(op_name,                     \
+                                 func_id,                     \
+                                 backend,                     \
+                                 layout,                      \
+                                 param_def_fn,                \
+                                 meta_kernel_fn,              \
+                                 cpp_dtype,                   \
+                                 ...)                         \
+  _PT_KERNEL_REGISTRAR_INIT(PT_NARGS(cpp_dtype, __VA_ARGS__), \
+                            op_name,                          \
+                            func_id,                          \
+                            backend,                          \
+                            layout,                           \
+                            param_def_fn,                     \
+                            meta_kernel_fn,                   \
+                            cpp_dtype,                        \
+                            __VA_ARGS__)
+
+#define _PT_KERNEL_REGISTRAR_INIT(N,              \
+                                  op_name,        \
+                                  func_id,        \
+                                  backend,        \
+                                  layout,         \
+                                  param_def_fn,   \
+                                  meta_kernel_fn, \
+                                  cpp_dtype,      \
+                                  ...)            \
+  PT_CONCATENATE(_PT_KERNEL_REGISTRAR_INIT_, N)   \
+  (op_name,                                       \
+   func_id,                                       \
+   PT_ID,                                         \
+   backend,                                       \
+   layout,                                        \
+   param_def_fn,                                  \
+   meta_kernel_fn,                                \
+   cpp_dtype,                                     \
+   __VA_ARGS__)
+
+#define _PT_KERNEL_REGISTRAR_INIT_1(op_name,           \
+                                    func_id,           \
+                                    registrar_id,      \
+                                    backend,           \
+                                    layout,            \
+                                    param_def_fn,      \
+                                    meta_kernel_fn,    \
+                                    cpp_dtype,         \
+                                    ...)               \
+  static const ::pt::OpKernelRegistrar PT_CONCATENATE( \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(  \
+      op_name,                                         \
+      BACKEND(backend),                                \
+      DATALAYOUT(layout),                              \
+      ::pt::CppTypeToDataType<cpp_dtype>::Type(),      \
+      param_def_fn,                                    \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));
+#define _PT_KERNEL_REGISTRAR_INIT_2(op_name,           \
+                                    func_id,           \
+                                    registrar_id,      \
+                                    backend,           \
+                                    layout,            \
+                                    param_def_fn,      \
+                                    meta_kernel_fn,    \
+                                    cpp_dtype,         \
+                                    ...)               \
+  static const ::pt::OpKernelRegistrar PT_CONCATENATE( \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(  \
+      op_name,                                         \
+      BACKEND(backend),                                \
+      DATALAYOUT(layout),                              \
+      ::pt::CppTypeToDataType<cpp_dtype>::Type(),      \
+      param_def_fn,                                    \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));           \
+  _PT_KERNEL_REGISTRAR_INIT_1(op_name,                 \
+                              func_id,                 \
+                              PT_ID,                   \
+                              backend,                 \
+                              layout,                  \
+                              param_def_fn,            \
+                              meta_kernel_fn,          \
+                              __VA_ARGS__)
+#define _PT_KERNEL_REGISTRAR_INIT_3(op_name,           \
+                                    func_id,           \
+                                    registrar_id,      \
+                                    backend,           \
+                                    layout,            \
+                                    param_def_fn,      \
+                                    meta_kernel_fn,    \
+                                    cpp_dtype,         \
+                                    ...)               \
+  static const ::pt::OpKernelRegistrar PT_CONCATENATE( \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(  \
+      op_name,                                         \
+      BACKEND(backend),                                \
+      DATALAYOUT(layout),                              \
+      ::pt::CppTypeToDataType<cpp_dtype>::Type(),      \
+      param_def_fn,                                    \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));           \
+  _PT_KERNEL_REGISTRAR_INIT_2(op_name,                 \
+                              func_id,                 \
+                              PT_ID,                   \
+                              backend,                 \
+                              layout,                  \
+                              param_def_fn,            \
+                              meta_kernel_fn,          \
+                              __VA_ARGS__)
+#define _PT_KERNEL_REGISTRAR_INIT_4(op_name,           \
+                                    func_id,           \
+                                    registrar_id,      \
+                                    backend,           \
+                                    layout,            \
+                                    param_def_fn,      \
+                                    meta_kernel_fn,    \
+                                    cpp_dtype,         \
+                                    ...)               \
+  static const ::pt::OpKernelRegistrar PT_CONCATENATE( \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(  \
+      op_name,                                         \
+      BACKEND(backend),                                \
+      DATALAYOUT(layout),                              \
+      ::pt::CppTypeToDataType<cpp_dtype>::Type(),      \
+      param_def_fn,                                    \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));           \
+  _PT_KERNEL_REGISTRAR_INIT_3(op_name,                 \
+                              func_id,                 \
+                              PT_ID,                   \
+                              backend,                 \
+                              layout,                  \
+                              param_def_fn,            \
+                              meta_kernel_fn,          \
+                              __VA_ARGS__)
+#define _PT_KERNEL_REGISTRAR_INIT_5(op_name,           \
+                                    func_id,           \
+                                    registrar_id,      \
+                                    backend,           \
+                                    layout,            \
+                                    param_def_fn,      \
+                                    meta_kernel_fn,    \
+                                    cpp_dtype,         \
+                                    ...)               \
+  static const ::pt::OpKernelRegistrar PT_CONCATENATE( \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(  \
+      op_name,                                         \
+      BACKEND(backend),                                \
+      DATALAYOUT(layout),                              \
+      ::pt::CppTypeToDataType<cpp_dtype>::Type(),      \
+      param_def_fn,                                    \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));           \
+  _PT_KERNEL_REGISTRAR_INIT_4(op_name,                 \
+                              func_id,                 \
+                              PT_ID,                   \
+                              backend,                 \
+                              layout,                  \
+                              param_def_fn,            \
+                              meta_kernel_fn,          \
+                              __VA_ARGS__)
+#define _PT_KERNEL_REGISTRAR_INIT_6(op_name,           \
+                                    func_id,           \
+                                    registrar_id,      \
+                                    backend,           \
+                                    layout,            \
+                                    param_def_fn,      \
+                                    meta_kernel_fn,    \
+                                    cpp_dtype,         \
+                                    ...)               \
+  static const ::pt::OpKernelRegistrar PT_CONCATENATE( \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(  \
+      op_name,                                         \
+      BACKEND(backend),                                \
+      DATALAYOUT(layout),                              \
+      ::pt::CppTypeToDataType<cpp_dtype>::Type(),      \
+      param_def_fn,                                    \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));           \
+  _PT_KERNEL_REGISTRAR_INIT_5(op_name,                 \
+                              func_id,                 \
+                              PT_ID,                   \
+                              backend,                 \
+                              layout,                  \
+                              param_def_fn,            \
+                              meta_kernel_fn,          \
+                              __VA_ARGS__)
+#define _PT_KERNEL_REGISTRAR_INIT_7(op_name,           \
+                                    func_id,           \
+                                    registrar_id,      \
+                                    backend,           \
+                                    layout,            \
+                                    param_def_fn,      \
+                                    meta_kernel_fn,    \
+                                    cpp_dtype,         \
+                                    ...)               \
+  static const ::pt::OpKernelRegistrar PT_CONCATENATE( \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(  \
+      op_name,                                         \
+      BACKEND(backend),                                \
+      DATALAYOUT(layout),                              \
+      ::pt::CppTypeToDataType<cpp_dtype>::Type(),      \
+      param_def_fn,                                    \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));           \
+  _PT_KERNEL_REGISTRAR_INIT_6(op_name,                 \
+                              func_id,                 \
+                              PT_ID,                   \
+                              backend,                 \
+                              layout,                  \
+                              param_def_fn,            \
+                              meta_kernel_fn,          \
+                              __VA_ARGS__)
+#define _PT_KERNEL_REGISTRAR_INIT_8(op_name,           \
+                                    func_id,           \
+                                    registrar_id,      \
+                                    backend,           \
+                                    layout,            \
+                                    param_def_fn,      \
+                                    meta_kernel_fn,    \
+                                    cpp_dtype,         \
+                                    ...)               \
+  static const ::pt::OpKernelRegistrar PT_CONCATENATE( \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(  \
+      op_name,                                         \
+      BACKEND(backend),                                \
+      DATALAYOUT(layout),                              \
+      ::pt::CppTypeToDataType<cpp_dtype>::Type(),      \
+      param_def_fn,                                    \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));           \
+  _PT_KERNEL_REGISTRAR_INIT_7(op_name,                 \
+                              func_id,                 \
+                              PT_ID,                   \
+                              backend,                 \
+                              layout,                  \
+                              param_def_fn,            \
+                              meta_kernel_fn,          \
+                              __VA_ARGS__)
+
 #define PT_REGISTER_KERNEL_STANDARD(                                      \
     op_name, backend, layout, dtype, kernel_fn)                           \
   template decltype(kernel_fn) kernel_fn;                                 \
@@ -112,187 +407,4 @@ class OpKernelRegistrar {
     return 0;                                                               \
   }
 
-/**
- * In most cases, the backend, dtype and layout of Op's input and output
- * are the same as OpKernel itself. In order to simplify the registration
- * writing, we provide the following simple kernel registration macro.
- * If it is an special case, please use PT_REGISTER_KERNEL_STANDARD
- */
-// TODO(chenweihang): only work for single input and output now.
-// can we use function traits here to parse the input and output type?
-#define PT_REGISTER_KERNEL_1T(op_name, backend, layout, meta_kernel_fn, dtype) \
-  PT_REGISTER_KERNEL_AUTO_SPECIALIZE(                                          \
-      op_name, backend, layout, meta_kernel_fn, dtype)                         \
-      .Input(BACKEND(backend),                                                 \
-             DATALAYOUT(layout),                                               \
-             ::pt::CppTypeToDataType<dtype>::Type())                           \
-      .Output(BACKEND(backend),                                                \
-              DATALAYOUT(layout),                                              \
-              ::pt::CppTypeToDataType<dtype>::Type());                         \
-  PT_TOUCH_KERNEL_REGISTRAR(op_name, backend, layout, dtype)
-
-#define PT_REGISTER_KERNEL_2T(                                             \
-    op_name, backend, layout, meta_kernel_fn, dtype1, dtype2)              \
-  PT_REGISTER_KERNEL_1T(op_name, backend, layout, meta_kernel_fn, dtype1); \
-  PT_REGISTER_KERNEL_1T(op_name, backend, layout, meta_kernel_fn, dtype2)
-
-#define PT_REGISTER_KERNEL_3T(                                        \
-    op_name, backend, layout, meta_kernel_fn, dtype1, dtype2, dtype3) \
-  PT_REGISTER_KERNEL_2T(                                              \
-      op_name, backend, layout, meta_kernel_fn, dtype1, dtype2);      \
-  PT_REGISTER_KERNEL_1T(op_name, backend, layout, meta_kernel_fn, dtype3)
-
-#define PT_REGISTER_KERNEL_4T(                                                \
-    op_name, backend, layout, meta_kernel_fn, dtype1, dtype2, dtype3, dtype4) \
-  PT_REGISTER_KERNEL_2T(                                                      \
-      op_name, backend, layout, meta_kernel_fn, dtype1, dtype2);              \
-  PT_REGISTER_KERNEL_2T(                                                      \
-      op_name, backend, layout, meta_kernel_fn, dtype3, dtype4)
-
-#define PT_REGISTER_KERNEL_5T(op_name,                                   \
-                              backend,                                   \
-                              layout,                                    \
-                              meta_kernel_fn,                            \
-                              dtype1,                                    \
-                              dtype2,                                    \
-                              dtype3,                                    \
-                              dtype4,                                    \
-                              dtype5)                                    \
-  PT_REGISTER_KERNEL_3T(                                                 \
-      op_name, backend, layout, meta_kernel_fn, dtype1, dtype2, dtype3); \
-  PT_REGISTER_KERNEL_2T(                                                 \
-      op_name, backend, layout, meta_kernel_fn, dtype4, dtype5)
-
-#define PT_REGISTER_KERNEL_6T(op_name,                                   \
-                              backend,                                   \
-                              layout,                                    \
-                              meta_kernel_fn,                            \
-                              dtype1,                                    \
-                              dtype2,                                    \
-                              dtype3,                                    \
-                              dtype4,                                    \
-                              dtype5,                                    \
-                              dtype6)                                    \
-  PT_REGISTER_KERNEL_3T(                                                 \
-      op_name, backend, layout, meta_kernel_fn, dtype1, dtype2, dtype3); \
-  PT_REGISTER_KERNEL_3T(                                                 \
-      op_name, backend, layout, meta_kernel_fn, dtype4, dtype5, dtype6)
-
-#define PT_REGISTER_KERNEL_7T(op_name,        \
-                              backend,        \
-                              layout,         \
-                              meta_kernel_fn, \
-                              dtype1,         \
-                              dtype2,         \
-                              dtype3,         \
-                              dtype4,         \
-                              dtype5,         \
-                              dtype6,         \
-                              ftype7)         \
-  PT_REGISTER_KERNEL_4T(op_name,              \
-                        backend,              \
-                        layout,               \
-                        meta_kernel_fn,       \
-                        dtype1,               \
-                        dtype2,               \
-                        dtype3,               \
-                        dtype4);              \
-  PT_REGISTER_KERNEL_3T(                      \
-      op_name, backend, layout, meta_kernel_fn, dtype5, dtype6, dtype7)
-
-#define PT_REGISTER_KERNEL_8T(op_name,        \
-                              backend,        \
-                              layout,         \
-                              meta_kernel_fn, \
-                              dtype1,         \
-                              dtype2,         \
-                              dtype3,         \
-                              dtype4,         \
-                              dtype5,         \
-                              dtype6,         \
-                              dtype7,         \
-                              dtype8)         \
-  PT_REGISTER_KERNEL_4T(op_name,              \
-                        backend,              \
-                        layout,               \
-                        meta_kernel_fn,       \
-                        dtype1,               \
-                        dtype2,               \
-                        dtype3,               \
-                        dtype4);              \
-  PT_REGISTER_KERNEL_4T(op_name,              \
-                        backend,              \
-                        layout,               \
-                        meta_kernel_fn,       \
-                        dtype5,               \
-                        dtype6,               \
-                        dtype7,               \
-                        dtype8)
-
-/**
- * Op Kernel declare macros
- */
-
-#define PT_DECLARE_KERNEL_1T(op_name, backend, layout, dtype)                 \
-  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                          \
-      __dec_pt_op_kernel_##op_name##_##backend##_##layout##_##dtype##__,      \
-      "PT_DECLARE_KERNEL_*T must be called in global namespace.");            \
-  extern int                                                                  \
-      TouchOpKernelRegistrar_##op_name##_##backend##_##dtype##_##layout();    \
-  UNUSED static int                                                           \
-      __declare_pt_op_kernel_##op_name##_##backend##_##layout##_##dtype##__ = \
-          TouchOpKernelRegistrar_##op_name##_##backend##_##dtype##_##layout()
-
-#define PT_DECLARE_KERNEL_2T(op_name, backend, layout, dtype1, dtype2) \
-  PT_DECLARE_KERNEL_1T(op_name, backend, layout, dtype1);              \
-  PT_DECLARE_KERNEL_1T(op_name, backend, layout, dtype2)
-
-#define PT_DECLARE_KERNEL_3T(op_name, backend, layout, dtype1, dtype2, dtype3) \
-  PT_REGISTER_KERNEL_2T(op_name, backend, layout, dtype1, dtype2);             \
-  PT_DECLARE_KERNEL_1T(op_name, backend, layout, dtype3)
-
-#define PT_DECLARE_KERNEL_4T(                                     \
-    op_name, backend, layout, dtype1, dtype2, dtype3, dtype4)     \
-  PT_DECLARE_KERNEL_2T(op_name, backend, layout, dtype1, dtype2); \
-  PT_DECLARE_KERNEL_2T(op_name, backend, layout, dtype3, dtype4)
-
-#define PT_DECLARE_KERNEL_5T(                                             \
-    op_name, backend, layout, dtype1, dtype2, dtype3, dtype4, dtype5)     \
-  PT_DECLARE_KERNEL_3T(op_name, backend, layout, dtype1, dtype2, dtype3); \
-  PT_DECLARE_KERNEL_2T(op_name, backend, layout, dtype4, dtype5)
-
-#define PT_DECLARE_KERNEL_6T(                                                 \
-    op_name, backend, layout, dtype1, dtype2, dtype3, dtype4, dtype5, dtype6) \
-  PT_DECLARE_KERNEL_3T(op_name, backend, layout, dtype1, dtype2, dtype3);     \
-  PT_DECLARE_KERNEL_3T(op_name, backend, layout, dtype4, dtype5, dtype6)
-
-#define PT_DECLARE_KERNEL_7T(op_name,                            \
-                             backend,                            \
-                             layout,                             \
-                             dtype1,                             \
-                             dtype2,                             \
-                             dtype3,                             \
-                             dtype4,                             \
-                             dtype5,                             \
-                             dtype6,                             \
-                             ftype7)                             \
-  PT_DECLARE_KERNEL_4T(                                          \
-      op_name, backend, layout, dtype1, dtype2, dtype3, dtype4); \
-  PT_DECLARE_KERNEL_3T(op_name, backend, layout, dtype5, dtype6, dtype7)
-
-#define PT_DECLARE_KERNEL_8T(op_name,                            \
-                             backend,                            \
-                             layout,                             \
-                             dtype1,                             \
-                             dtype2,                             \
-                             dtype3,                             \
-                             dtype4,                             \
-                             dtype5,                             \
-                             dtype6,                             \
-                             dtype7,                             \
-                             dtype8)                             \
-  PT_DECLARE_KERNEL_4T(                                          \
-      op_name, backend, layout, dtype1, dtype2, dtype3, dtype4); \
-  PT_DECLARE_KERNEL_4T(op_name, backend, layout, dtype5, dtype6, dtype7, dtype8)
-
 }  // namespace pt
diff --git a/paddle/top/cpu/math.cc b/paddle/top/cpu/math.cc
index c9b8afe63bdd7..2640c9039a9e1 100644
--- a/paddle/top/cpu/math.cc
+++ b/paddle/top/cpu/math.cc
@@ -85,8 +85,8 @@ void Mean(const CPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
 // PT_TOUCH_KERNEL_REGISTRAR(sign, CPU, NCHW, FLOAT32);
 
 // Register method 3:
-PT_REGISTER_KERNEL_2T(sign, CPU, NCHW, pt::Sign, float, double);
-PT_REGISTER_KERNEL_2T(mean, CPU, NCHW, pt::Mean, float, double);
+// PT_REGISTER_KERNEL_2T(sign, CPU, NCHW, pt::Sign, float, double);
+// PT_REGISTER_KERNEL_2T(mean, CPU, NCHW, pt::Mean, float, double);
 // PT_REGISTER_KERNEL_8T(scale,
 //                       CPU,
 //                       NCHW,
@@ -111,3 +111,7 @@ PT_REGISTER_KERNEL_2T(mean, CPU, NCHW, pt::Mean, float, double);
 //                       int16_t,
 //                       int,
 //                       int64_t);
+
+// Register method 4:
+PT_REGISTER_KERNEL("sign", CPU, NCHW, pt::Sign, float, double) {}
+PT_REGISTER_KERNEL("mean", CPU, NCHW, pt::Mean, float, double) {}
diff --git a/paddle/top/cpu/math.h b/paddle/top/cpu/math.h
index 2d2fd12140363..5bb56f18ac33b 100644
--- a/paddle/top/cpu/math.h
+++ b/paddle/top/cpu/math.h
@@ -53,5 +53,3 @@ void Scale(const CPUContext& dev_ctx,
 //         SelectedRowsTensor* out);
 
 }  // namespace pt
-
-PT_DECLARE_KERNEL_2T(sign, CPU, NCHW, float, double);
diff --git a/paddle/top/cuda/math.cu b/paddle/top/cuda/math.cu
index d16581d953544..d5286a1925981 100644
--- a/paddle/top/cuda/math.cu
+++ b/paddle/top/cuda/math.cu
@@ -109,9 +109,9 @@ void Mean(const CUDAContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
 
 }  // namespace pt
 
-using float16 = paddle::platform::float16;
-PT_REGISTER_KERNEL_3T(sign, CUDA, NCHW, pt::Sign, float, double, float16);
-PT_REGISTER_KERNEL_3T(mean, CUDA, NCHW, pt::Mean, float, double, float16);
+// using float16 = paddle::platform::float16;
+// PT_REGISTER_KERNEL_3T(sign, CUDA, NCHW, pt::Sign, float, double, float16);
+// PT_REGISTER_KERNEL_3T(mean, CUDA, NCHW, pt::Mean, float, double, float16);
 // PT_REGISTER_KERNEL_8T(scale,
 //                       CUDA,
 //                       NCHW,
@@ -136,3 +136,7 @@ PT_REGISTER_KERNEL_3T(mean, CUDA, NCHW, pt::Mean, float, double, float16);
 //                       int16_t,
 //                       int,
 //                       int64_t);
+
+using float16 = paddle::platform::float16;
+PT_REGISTER_KERNEL("sign", CUDA, NCHW, pt::Sign, float, double, float16) {}
+PT_REGISTER_KERNEL("mean", CUDA, NCHW, pt::Mean, float, double, float16) {}
diff --git a/paddle/top/xpu/math.cc b/paddle/top/xpu/math.cc
index 44d1a260956eb..fdae384a64da3 100644
--- a/paddle/top/xpu/math.cc
+++ b/paddle/top/xpu/math.cc
@@ -16,4 +16,4 @@
 
 #include "paddle/top/core/kernel_registry.h"
 
-PT_REGISTER_KERNEL_1T(sign, XPU, NCHW, pt::Sign, float);
+// PT_REGISTER_KERNEL_1T(sign, XPU, NCHW, pt::Sign, float);

From 509d13e52fb7e17b26e51f76aafa1c4d390ac68d Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Tue, 7 Sep 2021 08:44:44 +0000
Subject: [PATCH 043/125] rename top to tcmpt

---
 paddle/CMakeLists.txt                             |  2 +-
 paddle/fluid/framework/CMakeLists.txt             |  6 +++---
 paddle/fluid/framework/eigen.h                    |  2 +-
 paddle/fluid/framework/operator.cc                |  2 +-
 paddle/fluid/framework/operator.h                 |  8 ++++----
 .../framework/{top_utils.cc => tcmpt_utils.cc}    |  2 +-
 .../framework/{top_utils.h => tcmpt_utils.h}      |  2 +-
 paddle/fluid/imperative/prepared_operator.cc      |  2 +-
 paddle/fluid/imperative/prepared_operator.h       |  4 ++--
 paddle/fluid/operators/CMakeLists.txt             |  4 ++--
 paddle/fluid/operators/mean_op.h                  |  8 ++++----
 paddle/fluid/operators/npu_op_runner.h            |  2 +-
 paddle/fluid/operators/scale_op.h                 |  8 ++++----
 paddle/fluid/platform/mkldnn_reuse.h              |  2 +-
 paddle/fluid/pybind/op_function_generator.cc      |  2 +-
 paddle/{top => tcmpt}/CMakeLists.txt              | 12 ++++++------
 paddle/tcmpt/api/CMakeLists.txt                   | 15 +++++++++++++++
 paddle/{top => tcmpt}/api/all.cc                  |  2 +-
 paddle/{top => tcmpt}/api/all.h                   |  4 ++--
 paddle/{top => tcmpt}/api/include/dev/core.h      | 12 ++++++------
 paddle/{top => tcmpt}/api/include/dev/math.h      | 10 +++++-----
 paddle/{top => tcmpt}/api/include/tensor.h        |  6 +++---
 paddle/{top => tcmpt}/api/src/CMakeLists.txt      |  0
 paddle/{top => tcmpt}/core/CMakeLists.txt         |  0
 paddle/{top => tcmpt}/core/backend.cc             |  2 +-
 paddle/{top => tcmpt}/core/backend.h              |  0
 paddle/{top => tcmpt}/core/convert_utils.cc       |  2 +-
 paddle/{top => tcmpt}/core/convert_utils.h        |  6 +++---
 paddle/{top => tcmpt}/core/dense_tensor.cc        |  4 ++--
 paddle/{top => tcmpt}/core/dense_tensor.h         |  6 +++---
 paddle/{top => tcmpt}/core/dtype.cc               |  2 +-
 paddle/{top => tcmpt}/core/dtype.h                |  0
 paddle/{top => tcmpt}/core/kernel_context.cc      |  2 +-
 paddle/{top => tcmpt}/core/kernel_context.h       |  2 +-
 paddle/{top => tcmpt}/core/kernel_def.h           |  0
 paddle/{top => tcmpt}/core/kernel_factory.cc      |  2 +-
 paddle/{top => tcmpt}/core/kernel_factory.h       |  8 ++++----
 paddle/{top => tcmpt}/core/kernel_registry.h      |  6 +++---
 paddle/{top => tcmpt}/core/kernel_utils.h         |  4 ++--
 paddle/{top => tcmpt}/core/layout.cc              |  2 +-
 paddle/{top => tcmpt}/core/layout.h               |  0
 paddle/{top => tcmpt}/core/mkldnn_dense_tensor.h  |  2 +-
 paddle/{top => tcmpt}/core/scalar_tensor.h        |  2 +-
 .../{top => tcmpt}/core/selected_rows_tensor.cc   |  2 +-
 paddle/{top => tcmpt}/core/selected_rows_tensor.h |  4 ++--
 paddle/{top => tcmpt}/core/spatial_tensor.h       |  2 +-
 paddle/{top => tcmpt}/core/tensor_interface.h     |  6 +++---
 paddle/{top => tcmpt}/core/tensor_meta.h          |  6 +++---
 paddle/{top => tcmpt}/core/tensor_status.h        |  6 +++---
 paddle/{top => tcmpt}/cpu/CMakeLists.txt          |  0
 paddle/{top => tcmpt}/cpu/math.cc                 |  6 +++---
 paddle/{top => tcmpt}/cpu/math.h                  | 10 +++++-----
 paddle/{top => tcmpt}/cuda/CMakeLists.txt         |  0
 paddle/{top => tcmpt}/cuda/math.cu                | 10 +++++-----
 paddle/{top => tcmpt}/cuda/math.h                 |  8 ++++----
 paddle/{top => tcmpt}/infershape/CMakeLists.txt   |  0
 paddle/{top => tcmpt}/mkldnn/CMakeLists.txt       |  0
 paddle/{top => tcmpt}/mkldnn/base.h               |  2 +-
 paddle/{top => tcmpt}/mkldnn/math.cc              |  2 +-
 paddle/{top => tcmpt}/mkldnn/math.h               |  4 ++--
 paddle/{top => tcmpt}/module/CMakeLists.txt       |  0
 paddle/{top => tcmpt}/module/scale.h              |  2 +-
 paddle/{top => tcmpt}/module/sign.h               |  2 +-
 paddle/{top => tcmpt}/npu/CMakeLists.txt          |  0
 paddle/{top => tcmpt}/npu/math.h                  |  2 +-
 paddle/{top => tcmpt}/tests/CMakeLists.txt        |  0
 paddle/{top => tcmpt}/tests/backend_test.cc       |  2 +-
 paddle/{top => tcmpt}/tests/dense_tensor_test.cc  |  2 +-
 paddle/{top => tcmpt}/tests/dtype_test.cc         |  0
 .../{top => tcmpt}/tests/kernel_factory_test.cc   |  2 +-
 paddle/{top => tcmpt}/tests/layout_test.cc        |  0
 paddle/{top => tcmpt}/xpu/CMakeLists.txt          |  0
 paddle/{top => tcmpt}/xpu/math.cc                 |  4 ++--
 paddle/{top => tcmpt}/xpu/math.h                  |  2 +-
 paddle/top/api/CMakeLists.txt                     | 15 ---------------
 75 files changed, 135 insertions(+), 135 deletions(-)
 rename paddle/fluid/framework/{top_utils.cc => tcmpt_utils.cc} (99%)
 rename paddle/fluid/framework/{top_utils.h => tcmpt_utils.h} (97%)
 rename paddle/{top => tcmpt}/CMakeLists.txt (77%)
 create mode 100644 paddle/tcmpt/api/CMakeLists.txt
 rename paddle/{top => tcmpt}/api/all.cc (94%)
 rename paddle/{top => tcmpt}/api/all.h (87%)
 rename paddle/{top => tcmpt}/api/include/dev/core.h (70%)
 rename paddle/{top => tcmpt}/api/include/dev/math.h (78%)
 rename paddle/{top => tcmpt}/api/include/tensor.h (97%)
 rename paddle/{top => tcmpt}/api/src/CMakeLists.txt (100%)
 rename paddle/{top => tcmpt}/core/CMakeLists.txt (100%)
 rename paddle/{top => tcmpt}/core/backend.cc (97%)
 rename paddle/{top => tcmpt}/core/backend.h (100%)
 rename paddle/{top => tcmpt}/core/convert_utils.cc (99%)
 rename paddle/{top => tcmpt}/core/convert_utils.h (92%)
 rename paddle/{top => tcmpt}/core/dense_tensor.cc (98%)
 rename paddle/{top => tcmpt}/core/dense_tensor.h (97%)
 rename paddle/{top => tcmpt}/core/dtype.cc (97%)
 rename paddle/{top => tcmpt}/core/dtype.h (100%)
 rename paddle/{top => tcmpt}/core/kernel_context.cc (93%)
 rename paddle/{top => tcmpt}/core/kernel_context.h (98%)
 rename paddle/{top => tcmpt}/core/kernel_def.h (100%)
 rename paddle/{top => tcmpt}/core/kernel_factory.cc (98%)
 rename paddle/{top => tcmpt}/core/kernel_factory.h (98%)
 rename paddle/{top => tcmpt}/core/kernel_registry.h (99%)
 rename paddle/{top => tcmpt}/core/kernel_utils.h (98%)
 rename paddle/{top => tcmpt}/core/layout.cc (96%)
 rename paddle/{top => tcmpt}/core/layout.h (100%)
 rename paddle/{top => tcmpt}/core/mkldnn_dense_tensor.h (97%)
 rename paddle/{top => tcmpt}/core/scalar_tensor.h (93%)
 rename paddle/{top => tcmpt}/core/selected_rows_tensor.cc (92%)
 rename paddle/{top => tcmpt}/core/selected_rows_tensor.h (97%)
 rename paddle/{top => tcmpt}/core/spatial_tensor.h (97%)
 rename paddle/{top => tcmpt}/core/tensor_interface.h (95%)
 rename paddle/{top => tcmpt}/core/tensor_meta.h (97%)
 rename paddle/{top => tcmpt}/core/tensor_status.h (94%)
 rename paddle/{top => tcmpt}/cpu/CMakeLists.txt (100%)
 rename paddle/{top => tcmpt}/cpu/math.cc (97%)
 rename paddle/{top => tcmpt}/cpu/math.h (87%)
 rename paddle/{top => tcmpt}/cuda/CMakeLists.txt (100%)
 rename paddle/{top => tcmpt}/cuda/math.cu (95%)
 rename paddle/{top => tcmpt}/cuda/math.h (90%)
 rename paddle/{top => tcmpt}/infershape/CMakeLists.txt (100%)
 rename paddle/{top => tcmpt}/mkldnn/CMakeLists.txt (100%)
 rename paddle/{top => tcmpt}/mkldnn/base.h (98%)
 rename paddle/{top => tcmpt}/mkldnn/math.cc (95%)
 rename paddle/{top => tcmpt}/mkldnn/math.h (95%)
 rename paddle/{top => tcmpt}/module/CMakeLists.txt (100%)
 rename paddle/{top => tcmpt}/module/scale.h (97%)
 rename paddle/{top => tcmpt}/module/sign.h (97%)
 rename paddle/{top => tcmpt}/npu/CMakeLists.txt (100%)
 rename paddle/{top => tcmpt}/npu/math.h (98%)
 rename paddle/{top => tcmpt}/tests/CMakeLists.txt (100%)
 rename paddle/{top => tcmpt}/tests/backend_test.cc (94%)
 rename paddle/{top => tcmpt}/tests/dense_tensor_test.cc (96%)
 rename paddle/{top => tcmpt}/tests/dtype_test.cc (100%)
 rename paddle/{top => tcmpt}/tests/kernel_factory_test.cc (94%)
 rename paddle/{top => tcmpt}/tests/layout_test.cc (100%)
 rename paddle/{top => tcmpt}/xpu/CMakeLists.txt (100%)
 rename paddle/{top => tcmpt}/xpu/math.cc (89%)
 rename paddle/{top => tcmpt}/xpu/math.h (98%)
 delete mode 100644 paddle/top/api/CMakeLists.txt

diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 1a6ec05b830a6..ce3f6973e7a68 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -1,5 +1,5 @@
 add_subdirectory(scripts)
 add_subdirectory(testing)
 set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory")
-add_subdirectory(top)
+add_subdirectory(tcmpt)
 add_subdirectory(fluid)
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 10db28afca5f2..d14e2d1c0bd96 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -191,10 +191,10 @@ cc_library(unused_var_check SRCS unused_var_check.cc DEPS glog no_need_buffer_va
 
 IF(WITH_XPU)
 cc_library(operator SRCS operator.cc DEPS xpu_op_list op_info device_context tensor scope glog trainer_desc_proto data_feed_proto
-    shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils top top_utils)
+    shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils tcmpt tcmpt_utils)
 ELSE()
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog trainer_desc_proto data_feed_proto
-    shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils top top_utils)
+    shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils tcmpt tcmpt_utils)
 ENDIF()
 
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
@@ -387,7 +387,7 @@ cc_library(save_load_util SRCS save_load_util.cc DEPS tensor scope layer)
 cc_test(save_load_util_test SRCS save_load_util_test.cc DEPS save_load_util tensor scope layer)
 cc_library(generator SRCS generator.cc DEPS enforce place)
 
-cc_library(top_utils SRCS top_utils.cc DEPS lod_tensor selected_rows place top)
+cc_library(tcmpt_utils SRCS tcmpt_utils.cc DEPS lod_tensor selected_rows place tcmpt)
 
 # Get the current working branch
 execute_process(
diff --git a/paddle/fluid/framework/eigen.h b/paddle/fluid/framework/eigen.h
index acb6a88f059c6..56843b9aa6853 100644
--- a/paddle/fluid/framework/eigen.h
+++ b/paddle/fluid/framework/eigen.h
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "unsupported/Eigen/CXX11/Tensor"
 
-#include "paddle/top/core/dense_tensor.h"
+#include "paddle/tcmpt/core/dense_tensor.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 11ce9891aa94d..183ad7163bfa9 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -24,7 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/details/nan_inf_utils.h"
 #include "paddle/fluid/framework/op_call_stack.h"
 #include "paddle/fluid/framework/shape_inference.h"
-#include "paddle/fluid/framework/top_utils.h"
+#include "paddle/fluid/framework/tcmpt_utils.h"
 #include "paddle/fluid/framework/transfer_scope_cache.h"
 #include "paddle/fluid/framework/unused_var_check.h"
 #include "paddle/fluid/framework/var_type.h"
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index f8bd284691790..e0bdb829b3359 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -38,7 +38,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/variant.h"
 
-#include "paddle/top/api/include/dev/core.h"
+#include "paddle/tcmpt/api/include/dev/core.h"
 
 namespace paddle {
 namespace framework {
@@ -533,7 +533,7 @@ class OperatorWithKernel : public OperatorBase {
     return kernel_type_->place_;
   }
 
-  /* member functions for adapting to top lib */
+  /* member functions for adapting to tcmpt lib */
   // TODO(chenweihang): Temporarily as a class method
   virtual pt::OpKernelKey ConstructPtOpKernelKey(
       const VariableValueMap& inputs, const platform::Place& ctx_place) const;
@@ -580,7 +580,7 @@ class OperatorWithKernel : public OperatorBase {
   Tensor* GetTensorFormInputSafely(const ExecutionContext& ctx,
                                    const std::string& name) const;
 
-  /* member functions for adapting to top lib */
+  /* member functions for adapting to tcmpt lib */
   void ChoosePtKernel(const RuntimeContext& ctx,
                       const platform::DeviceContext& dev_ctx) const;
 
@@ -594,7 +594,7 @@ class OperatorWithKernel : public OperatorBase {
   mutable bool all_kernels_must_compute_runtime_shape_ = false;
   mutable std::mutex cache_update_mutex_;
   mutable bool enable_cache_transfer_scope_ = false;
-  // TODO(chenweihang): Similar duplicate members are used for new top lib,
+  // TODO(chenweihang): Similar duplicate members are used for new tcmpt lib,
   // maybe we have better impl methods
   mutable bool run_pt_kernel_ = false;
   mutable std::unique_ptr<pt::OpKernelKey> pt_kernel_key_;
diff --git a/paddle/fluid/framework/top_utils.cc b/paddle/fluid/framework/tcmpt_utils.cc
similarity index 99%
rename from paddle/fluid/framework/top_utils.cc
rename to paddle/fluid/framework/tcmpt_utils.cc
index a0624b8c2bd8a..c46b43bd75952 100644
--- a/paddle/fluid/framework/top_utils.cc
+++ b/paddle/fluid/framework/tcmpt_utils.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/top_utils.h"
+#include "paddle/fluid/framework/tcmpt_utils.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/selected_rows.h"
 
diff --git a/paddle/fluid/framework/top_utils.h b/paddle/fluid/framework/tcmpt_utils.h
similarity index 97%
rename from paddle/fluid/framework/top_utils.h
rename to paddle/fluid/framework/tcmpt_utils.h
index 32487569a1722..fecc98d90a66e 100644
--- a/paddle/fluid/framework/top_utils.h
+++ b/paddle/fluid/framework/tcmpt_utils.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/place.h"
 
-#include "paddle/top/api/include/dev/core.h"
+#include "paddle/tcmpt/api/include/dev/core.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 6a0f58f663f1c..efb7a9f985fa2 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -16,7 +16,7 @@
 
 #include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/details/nan_inf_utils.h"
-#include "paddle/fluid/framework/top_utils.h"
+#include "paddle/fluid/framework/tcmpt_utils.h"
 #include "paddle/fluid/imperative/infer_shape_context.h"
 #ifdef PADDLE_WITH_XPU
 #include "paddle/fluid/platform/xpu/xpu_op_list.h"
diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h
index c831399a42aa1..a43229a4bbe04 100644
--- a/paddle/fluid/imperative/prepared_operator.h
+++ b/paddle/fluid/imperative/prepared_operator.h
@@ -25,7 +25,7 @@
 #include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/imperative/type_defs.h"
 
-#include "paddle/top/api/include/dev/core.h"
+#include "paddle/tcmpt/api/include/dev/core.h"
 
 DECLARE_bool(use_mkldnn);
 
@@ -185,7 +185,7 @@ class PreparedOp {
   framework::OpKernelType kernel_type_;
   framework::OperatorWithKernel::OpKernelFunc func_;
   platform::DeviceContext* dev_ctx_;
-  // TODo(chenweihang): Similar duplicate members are used for new top lib,
+  // TODo(chenweihang): Similar duplicate members are used for new tcmpt lib,
   // maybe we have better impl methods
   bool run_pt_kernel_{false};
   pt::OpKernelKey pt_kernel_key_;
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 5db492d761a63..3b0d50a832a26 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -74,8 +74,8 @@ if(WITH_UNITY_BUILD)
     include(unity_build_rule.cmake)
 endif()
 
-set(OP_HEADER_DEPS ${OP_HEADER_DEPS} top)
-set(OP_HEADER_DEPS ${OP_HEADER_DEPS} top_utils)
+set(OP_HEADER_DEPS ${OP_HEADER_DEPS} tcmpt)
+set(OP_HEADER_DEPS ${OP_HEADER_DEPS} tcmpt_utils)
 register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op lstm_op run_program_op eye_op recurrent_op
         sync_batch_norm_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
 
diff --git a/paddle/fluid/operators/mean_op.h b/paddle/fluid/operators/mean_op.h
index ef5d66adbf8b9..4f9c1505a6ee3 100644
--- a/paddle/fluid/operators/mean_op.h
+++ b/paddle/fluid/operators/mean_op.h
@@ -15,11 +15,11 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/top_utils.h"
+#include "paddle/fluid/framework/tcmpt_utils.h"
 
-// only can include the headers in paddle/top/api dirs
-#include "paddle/top/api/include/dev/core.h"
-#include "paddle/top/api/include/dev/math.h"
+// only can include the headers in paddle/tcmpt/api dirs
+#include "paddle/tcmpt/api/include/dev/core.h"
+#include "paddle/tcmpt/api/include/dev/math.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/npu_op_runner.h b/paddle/fluid/operators/npu_op_runner.h
index 412c842ac4bc8..601a542b1a069 100644
--- a/paddle/fluid/operators/npu_op_runner.h
+++ b/paddle/fluid/operators/npu_op_runner.h
@@ -24,7 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
 
-#include "paddle/top/api/include/dev/core.h"
+#include "paddle/tcmpt/api/include/dev/core.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h
index 0f9b1bbeb6a8c..723f9bb7c256e 100644
--- a/paddle/fluid/operators/scale_op.h
+++ b/paddle/fluid/operators/scale_op.h
@@ -15,11 +15,11 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/top_utils.h"
+#include "paddle/fluid/framework/tcmpt_utils.h"
 
-// only can include the headers in paddle/top/api dirs
-#include "paddle/top/api/include/dev/core.h"
-#include "paddle/top/api/include/dev/math.h"
+// only can include the headers in paddle/tcmpt/api dirs
+#include "paddle/tcmpt/api/include/dev/core.h"
+#include "paddle/tcmpt/api/include/dev/math.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index 03d3780fc6b6a..f092dfee04c27 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -26,7 +26,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #include "paddle/fluid/platform/place.h"
 
-#include "paddle/top/api/include/dev/core.h"
+#include "paddle/tcmpt/api/include/dev/core.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc
index e726425de41c5..573f1fb81501f 100644
--- a/paddle/fluid/pybind/op_function_generator.cc
+++ b/paddle/fluid/pybind/op_function_generator.cc
@@ -536,7 +536,7 @@ GenerateOpFunctions() {
     auto& op_type = op_proto->type();
     // Skip ooerator which is not inherit form OperatorWithKernel, like while,
     // since only OperatorWithKernel can run in dygraph mode.
-    // if the top lib contains op kernel, we still generate ops method
+    // if the tcmpt lib contains op kernel, we still generate ops method
     if (!all_kernels.count(op_type) &&
         !pt::OpKernelFactory::Instance().ContainsOperation(op_type.c_str())) {
       continue;
diff --git a/paddle/top/CMakeLists.txt b/paddle/tcmpt/CMakeLists.txt
similarity index 77%
rename from paddle/top/CMakeLists.txt
rename to paddle/tcmpt/CMakeLists.txt
index 7b8de81d6c667..63f5c1b312e32 100644
--- a/paddle/top/CMakeLists.txt
+++ b/paddle/tcmpt/CMakeLists.txt
@@ -1,8 +1,8 @@
-# top api
+# tcmpt api
 add_subdirectory(api)
-# top core components
+# tcmpt core components
 add_subdirectory(core)
-# top kernels for diff device
+# tcmpt kernels for diff device
 add_subdirectory(cpu)
 if(WITH_GPU OR WITH_ROCM)
   add_subdirectory(cuda)
@@ -17,9 +17,9 @@ endif()
 if(WITH_XPU)
   add_subdirectory(xpu)
 endif()
-# top infershape
+# tcmpt infershape
 add_subdirectory(infershape)
-# top public functors
+# tcmpt public functors
 add_subdirectory(module)
-# top tests
+# tcmpt tests
 add_subdirectory(tests)
diff --git a/paddle/tcmpt/api/CMakeLists.txt b/paddle/tcmpt/api/CMakeLists.txt
new file mode 100644
index 0000000000000..ba29c5d9e1b2f
--- /dev/null
+++ b/paddle/tcmpt/api/CMakeLists.txt
@@ -0,0 +1,15 @@
+add_subdirectory(src)
+
+set(TCMPT_DEPS convert_utils dense_tensor selected_rows_tensor kernel_factory kernel_context)
+set(TCMPT_DEPS ${TCMPT_DEPS} math_cpu)
+if(WITH_MKLDNN)
+  set(TCMPT_DEPS ${TCMPT_DEPS} math_mkldnn)
+endif()
+if(WITH_GPU OR WITH_ROCM)
+  set(TCMPT_DEPS ${TCMPT_DEPS} math_cuda)
+endif()
+if(WITH_XPU)
+  set(TCMPT_DEPS ${TCMPT_DEPS} math_xpu)
+endif()
+
+cc_library(tcmpt SRCS all.cc DEPS ${TCMPT_DEPS})
diff --git a/paddle/top/api/all.cc b/paddle/tcmpt/api/all.cc
similarity index 94%
rename from paddle/top/api/all.cc
rename to paddle/tcmpt/api/all.cc
index 5fe5586af3ab0..05922e02c4998 100644
--- a/paddle/top/api/all.cc
+++ b/paddle/tcmpt/api/all.cc
@@ -12,6 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/top/api/all.h"
+#include "paddle/tcmpt/api/all.h"
 
 namespace pt {}  // namespace pt
diff --git a/paddle/top/api/all.h b/paddle/tcmpt/api/all.h
similarity index 87%
rename from paddle/top/api/all.h
rename to paddle/tcmpt/api/all.h
index 2586884613040..db944cb13b6a7 100644
--- a/paddle/top/api/all.h
+++ b/paddle/tcmpt/api/all.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 
 // develop apis
-#include "paddle/top/api/include/dev/core.h"
-#include "paddle/top/api/include/dev/math.h"
+#include "paddle/tcmpt/api/include/dev/core.h"
+#include "paddle/tcmpt/api/include/dev/math.h"
 
 // user apis
diff --git a/paddle/top/api/include/dev/core.h b/paddle/tcmpt/api/include/dev/core.h
similarity index 70%
rename from paddle/top/api/include/dev/core.h
rename to paddle/tcmpt/api/include/dev/core.h
index 547c6b3568c1e..687dc72bb351f 100644
--- a/paddle/top/api/include/dev/core.h
+++ b/paddle/tcmpt/api/include/dev/core.h
@@ -15,9 +15,9 @@ limitations under the License. */
 #pragma once
 
 // See Note: [ How do we organize the kernel directory ]
-#include "paddle/top/core/convert_utils.h"
-#include "paddle/top/core/dense_tensor.h"
-#include "paddle/top/core/kernel_context.h"
-#include "paddle/top/core/kernel_factory.h"
-#include "paddle/top/core/mkldnn_dense_tensor.h"
-#include "paddle/top/core/selected_rows_tensor.h"
+#include "paddle/tcmpt/core/convert_utils.h"
+#include "paddle/tcmpt/core/dense_tensor.h"
+#include "paddle/tcmpt/core/kernel_context.h"
+#include "paddle/tcmpt/core/kernel_factory.h"
+#include "paddle/tcmpt/core/mkldnn_dense_tensor.h"
+#include "paddle/tcmpt/core/selected_rows_tensor.h"
diff --git a/paddle/top/api/include/dev/math.h b/paddle/tcmpt/api/include/dev/math.h
similarity index 78%
rename from paddle/top/api/include/dev/math.h
rename to paddle/tcmpt/api/include/dev/math.h
index e40ed490317d2..bc498f8382853 100644
--- a/paddle/top/api/include/dev/math.h
+++ b/paddle/tcmpt/api/include/dev/math.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #pragma once
 
 // See Note: [ How do we organize the kernel directory ]
-#include "paddle/top/cpu/math.h"
-#include "paddle/top/cuda/math.h"
-#include "paddle/top/mkldnn/math.h"
-#include "paddle/top/npu/math.h"
-#include "paddle/top/xpu/math.h"
+#include "paddle/tcmpt/cpu/math.h"
+#include "paddle/tcmpt/cuda/math.h"
+#include "paddle/tcmpt/mkldnn/math.h"
+#include "paddle/tcmpt/npu/math.h"
+#include "paddle/tcmpt/xpu/math.h"
diff --git a/paddle/top/api/include/tensor.h b/paddle/tcmpt/api/include/tensor.h
similarity index 97%
rename from paddle/top/api/include/tensor.h
rename to paddle/tcmpt/api/include/tensor.h
index 9fd36f97d05dd..6029f87b5c4a4 100644
--- a/paddle/top/api/include/tensor.h
+++ b/paddle/tcmpt/api/include/tensor.h
@@ -18,14 +18,14 @@ limitations under the License. */
 #include <memory>
 #include <utility>
 
-#include "paddle/top/core/tensor_interface.h"
+#include "paddle/tcmpt/core/tensor_interface.h"
 
 /**
  * [ Why still include the fluid headers? ]
  *
  * We hope to organize the basic implementation of Tensor and the logic related
  * to Tensor operation into an independent library, which we call
- * [Tensor Operation Library, top], so we extract or rewrite the original
+ * [Tensor Operation Library, tcmpt], so we extract or rewrite the original
  * OpKernels.
  *
  * In the future, the training library, inference library and custom operators
@@ -54,7 +54,7 @@ class AutogradMetaInterface {
 
 /**
  * Tensor is the API description of the basic data structure in the
- * [ Paddle "Tensor OPeration (top)" Library ].
+ * [ Paddle "Tensor OPeration (tcmpt)" Library ].
  *
  * It is not limited to a simple n-dimensional array.
  * It contains a smart pointer to `TensorImpl`. The data description contained
diff --git a/paddle/top/api/src/CMakeLists.txt b/paddle/tcmpt/api/src/CMakeLists.txt
similarity index 100%
rename from paddle/top/api/src/CMakeLists.txt
rename to paddle/tcmpt/api/src/CMakeLists.txt
diff --git a/paddle/top/core/CMakeLists.txt b/paddle/tcmpt/core/CMakeLists.txt
similarity index 100%
rename from paddle/top/core/CMakeLists.txt
rename to paddle/tcmpt/core/CMakeLists.txt
diff --git a/paddle/top/core/backend.cc b/paddle/tcmpt/core/backend.cc
similarity index 97%
rename from paddle/top/core/backend.cc
rename to paddle/tcmpt/core/backend.cc
index 701aa6edf9478..68c7adfcc2810 100644
--- a/paddle/top/core/backend.cc
+++ b/paddle/tcmpt/core/backend.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/top/core/backend.h"
+#include "paddle/tcmpt/core/backend.h"
 
 namespace pt {
 
diff --git a/paddle/top/core/backend.h b/paddle/tcmpt/core/backend.h
similarity index 100%
rename from paddle/top/core/backend.h
rename to paddle/tcmpt/core/backend.h
diff --git a/paddle/top/core/convert_utils.cc b/paddle/tcmpt/core/convert_utils.cc
similarity index 99%
rename from paddle/top/core/convert_utils.cc
rename to paddle/tcmpt/core/convert_utils.cc
index f49b26113ce8b..9ad98d3d910b2 100644
--- a/paddle/top/core/convert_utils.cc
+++ b/paddle/tcmpt/core/convert_utils.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/top/core/convert_utils.h"
+#include "paddle/tcmpt/core/convert_utils.h"
 
 namespace pt {
 
diff --git a/paddle/top/core/convert_utils.h b/paddle/tcmpt/core/convert_utils.h
similarity index 92%
rename from paddle/top/core/convert_utils.h
rename to paddle/tcmpt/core/convert_utils.h
index d95654fd75220..9e8d85c7cfa92 100644
--- a/paddle/top/core/convert_utils.h
+++ b/paddle/tcmpt/core/convert_utils.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/top/core/backend.h"
-#include "paddle/top/core/dtype.h"
-#include "paddle/top/core/layout.h"
+#include "paddle/tcmpt/core/backend.h"
+#include "paddle/tcmpt/core/dtype.h"
+#include "paddle/tcmpt/core/layout.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/data_layout.h"
diff --git a/paddle/top/core/dense_tensor.cc b/paddle/tcmpt/core/dense_tensor.cc
similarity index 98%
rename from paddle/top/core/dense_tensor.cc
rename to paddle/tcmpt/core/dense_tensor.cc
index 1a3bd04d75c0d..d5306f08f0b54 100644
--- a/paddle/top/core/dense_tensor.cc
+++ b/paddle/tcmpt/core/dense_tensor.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/top/core/dense_tensor.h"
-#include "paddle/top/core/convert_utils.h"
+#include "paddle/tcmpt/core/dense_tensor.h"
+#include "paddle/tcmpt/core/convert_utils.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/data_type.h"
diff --git a/paddle/top/core/dense_tensor.h b/paddle/tcmpt/core/dense_tensor.h
similarity index 97%
rename from paddle/top/core/dense_tensor.h
rename to paddle/tcmpt/core/dense_tensor.h
index 9a8779160727b..d7853e7cba201 100644
--- a/paddle/top/core/dense_tensor.h
+++ b/paddle/tcmpt/core/dense_tensor.h
@@ -16,9 +16,9 @@ limitations under the License. */
 
 #include <memory>
 
-#include "paddle/top/core/tensor_interface.h"
-#include "paddle/top/core/tensor_meta.h"
-#include "paddle/top/core/tensor_status.h"
+#include "paddle/tcmpt/core/tensor_interface.h"
+#include "paddle/tcmpt/core/tensor_meta.h"
+#include "paddle/tcmpt/core/tensor_status.h"
 
 namespace paddle {
 namespace memory {
diff --git a/paddle/top/core/dtype.cc b/paddle/tcmpt/core/dtype.cc
similarity index 97%
rename from paddle/top/core/dtype.cc
rename to paddle/tcmpt/core/dtype.cc
index 1790f1f2c3bbf..1ddf1b25b3357 100644
--- a/paddle/top/core/dtype.cc
+++ b/paddle/tcmpt/core/dtype.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/top/core/dtype.h"
+#include "paddle/tcmpt/core/dtype.h"
 
 namespace pt {
 
diff --git a/paddle/top/core/dtype.h b/paddle/tcmpt/core/dtype.h
similarity index 100%
rename from paddle/top/core/dtype.h
rename to paddle/tcmpt/core/dtype.h
diff --git a/paddle/top/core/kernel_context.cc b/paddle/tcmpt/core/kernel_context.cc
similarity index 93%
rename from paddle/top/core/kernel_context.cc
rename to paddle/tcmpt/core/kernel_context.cc
index fafacb72f27ab..5bfcaf137fedf 100644
--- a/paddle/top/core/kernel_context.cc
+++ b/paddle/tcmpt/core/kernel_context.cc
@@ -12,6 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/top/core/kernel_context.h"
+#include "paddle/tcmpt/core/kernel_context.h"
 
 namespace pt {}  // namespace pt
diff --git a/paddle/top/core/kernel_context.h b/paddle/tcmpt/core/kernel_context.h
similarity index 98%
rename from paddle/top/core/kernel_context.h
rename to paddle/tcmpt/core/kernel_context.h
index 50ed67183d366..e7815f3ab5ae8 100644
--- a/paddle/top/core/kernel_context.h
+++ b/paddle/tcmpt/core/kernel_context.h
@@ -16,7 +16,7 @@
 
 #include <utility>
 
-#include "paddle/top/core/tensor_interface.h"
+#include "paddle/tcmpt/core/tensor_interface.h"
 #include "paddle/utils/any.h"
 
 // See Note [ Why still include the fluid headers? ]
diff --git a/paddle/top/core/kernel_def.h b/paddle/tcmpt/core/kernel_def.h
similarity index 100%
rename from paddle/top/core/kernel_def.h
rename to paddle/tcmpt/core/kernel_def.h
diff --git a/paddle/top/core/kernel_factory.cc b/paddle/tcmpt/core/kernel_factory.cc
similarity index 98%
rename from paddle/top/core/kernel_factory.cc
rename to paddle/tcmpt/core/kernel_factory.cc
index 38e3163d517c5..6b2ea66f710d3 100644
--- a/paddle/top/core/kernel_factory.cc
+++ b/paddle/tcmpt/core/kernel_factory.cc
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/top/core/kernel_factory.h"
+#include "paddle/tcmpt/core/kernel_factory.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/top/core/kernel_factory.h b/paddle/tcmpt/core/kernel_factory.h
similarity index 98%
rename from paddle/top/core/kernel_factory.h
rename to paddle/tcmpt/core/kernel_factory.h
index 12d99ab7dde28..d806f6c2b5e6c 100644
--- a/paddle/top/core/kernel_factory.h
+++ b/paddle/tcmpt/core/kernel_factory.h
@@ -19,10 +19,10 @@
 #include <unordered_map>
 #include <utility>
 
-#include "paddle/top/core/backend.h"
-#include "paddle/top/core/dtype.h"
-#include "paddle/top/core/kernel_def.h"
-#include "paddle/top/core/layout.h"
+#include "paddle/tcmpt/core/backend.h"
+#include "paddle/tcmpt/core/dtype.h"
+#include "paddle/tcmpt/core/kernel_def.h"
+#include "paddle/tcmpt/core/layout.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/enforce.h"
diff --git a/paddle/top/core/kernel_registry.h b/paddle/tcmpt/core/kernel_registry.h
similarity index 99%
rename from paddle/top/core/kernel_registry.h
rename to paddle/tcmpt/core/kernel_registry.h
index f473af47ea54f..1a403bf99f38e 100644
--- a/paddle/top/core/kernel_registry.h
+++ b/paddle/tcmpt/core/kernel_registry.h
@@ -14,9 +14,9 @@
 
 #pragma once
 
-#include "paddle/top/core/kernel_def.h"
-#include "paddle/top/core/kernel_factory.h"
-#include "paddle/top/core/kernel_utils.h"
+#include "paddle/tcmpt/core/kernel_def.h"
+#include "paddle/tcmpt/core/kernel_factory.h"
+#include "paddle/tcmpt/core/kernel_utils.h"
 
 namespace pt {
 
diff --git a/paddle/top/core/kernel_utils.h b/paddle/tcmpt/core/kernel_utils.h
similarity index 98%
rename from paddle/top/core/kernel_utils.h
rename to paddle/tcmpt/core/kernel_utils.h
index 52678ac302823..6ef4877735b52 100644
--- a/paddle/top/core/kernel_utils.h
+++ b/paddle/tcmpt/core/kernel_utils.h
@@ -14,8 +14,8 @@
 
 #pragma once
 
-#include "paddle/top/core/kernel_context.h"
-#include "paddle/top/core/kernel_def.h"
+#include "paddle/tcmpt/core/kernel_context.h"
+#include "paddle/tcmpt/core/kernel_def.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
diff --git a/paddle/top/core/layout.cc b/paddle/tcmpt/core/layout.cc
similarity index 96%
rename from paddle/top/core/layout.cc
rename to paddle/tcmpt/core/layout.cc
index a25f1818cb5a7..5c09e67a79856 100644
--- a/paddle/top/core/layout.cc
+++ b/paddle/tcmpt/core/layout.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/top/core/layout.h"
+#include "paddle/tcmpt/core/layout.h"
 
 namespace pt {
 
diff --git a/paddle/top/core/layout.h b/paddle/tcmpt/core/layout.h
similarity index 100%
rename from paddle/top/core/layout.h
rename to paddle/tcmpt/core/layout.h
diff --git a/paddle/top/core/mkldnn_dense_tensor.h b/paddle/tcmpt/core/mkldnn_dense_tensor.h
similarity index 97%
rename from paddle/top/core/mkldnn_dense_tensor.h
rename to paddle/tcmpt/core/mkldnn_dense_tensor.h
index 9f5f63d771c55..0aea392fce93d 100644
--- a/paddle/top/core/mkldnn_dense_tensor.h
+++ b/paddle/tcmpt/core/mkldnn_dense_tensor.h
@@ -18,7 +18,7 @@ limitations under the License. */
 
 #include "mkldnn.hpp"
 
-#include "paddle/top/core/dense_tensor.h"
+#include "paddle/tcmpt/core/dense_tensor.h"
 
 namespace pt {
 
diff --git a/paddle/top/core/scalar_tensor.h b/paddle/tcmpt/core/scalar_tensor.h
similarity index 93%
rename from paddle/top/core/scalar_tensor.h
rename to paddle/tcmpt/core/scalar_tensor.h
index dd2062a95c7e8..0ae0b768cfa11 100644
--- a/paddle/top/core/scalar_tensor.h
+++ b/paddle/tcmpt/core/scalar_tensor.h
@@ -14,6 +14,6 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/top/core/dense_tensor.h"
+#include "paddle/tcmpt/core/dense_tensor.h"
 
 class LoDTensor : public DenseTensor {};
diff --git a/paddle/top/core/selected_rows_tensor.cc b/paddle/tcmpt/core/selected_rows_tensor.cc
similarity index 92%
rename from paddle/top/core/selected_rows_tensor.cc
rename to paddle/tcmpt/core/selected_rows_tensor.cc
index 8dad949a75422..65a544009d20f 100644
--- a/paddle/top/core/selected_rows_tensor.cc
+++ b/paddle/tcmpt/core/selected_rows_tensor.cc
@@ -12,6 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/top/core/selected_rows_tensor.h"
+#include "paddle/tcmpt/core/selected_rows_tensor.h"
 
 namespace pt {}  // namespace pt
diff --git a/paddle/top/core/selected_rows_tensor.h b/paddle/tcmpt/core/selected_rows_tensor.h
similarity index 97%
rename from paddle/top/core/selected_rows_tensor.h
rename to paddle/tcmpt/core/selected_rows_tensor.h
index 0aa4fa9a6c3c6..3d03c891395f6 100644
--- a/paddle/top/core/selected_rows_tensor.h
+++ b/paddle/tcmpt/core/selected_rows_tensor.h
@@ -21,8 +21,8 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 
-#include "paddle/top/core/dense_tensor.h"
-#include "paddle/top/core/tensor_interface.h"
+#include "paddle/tcmpt/core/dense_tensor.h"
+#include "paddle/tcmpt/core/tensor_interface.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/mixed_vector.h"
diff --git a/paddle/top/core/spatial_tensor.h b/paddle/tcmpt/core/spatial_tensor.h
similarity index 97%
rename from paddle/top/core/spatial_tensor.h
rename to paddle/tcmpt/core/spatial_tensor.h
index 46dc21f83ccbb..5e51322bb8339 100644
--- a/paddle/top/core/spatial_tensor.h
+++ b/paddle/tcmpt/core/spatial_tensor.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/top/core/tensor_interface.h"
+#include "paddle/tcmpt/core/tensor_interface.h"
 
 namespace pt {
 
diff --git a/paddle/top/core/tensor_interface.h b/paddle/tcmpt/core/tensor_interface.h
similarity index 95%
rename from paddle/top/core/tensor_interface.h
rename to paddle/tcmpt/core/tensor_interface.h
index 4649ad19d2e6a..101c39e36cd41 100644
--- a/paddle/top/core/tensor_interface.h
+++ b/paddle/tcmpt/core/tensor_interface.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/top/core/backend.h"
-#include "paddle/top/core/dtype.h"
-#include "paddle/top/core/layout.h"
+#include "paddle/tcmpt/core/backend.h"
+#include "paddle/tcmpt/core/dtype.h"
+#include "paddle/tcmpt/core/layout.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/top/core/tensor_meta.h b/paddle/tcmpt/core/tensor_meta.h
similarity index 97%
rename from paddle/top/core/tensor_meta.h
rename to paddle/tcmpt/core/tensor_meta.h
index fbfd55b3ccdb7..5789e9a459e0b 100644
--- a/paddle/top/core/tensor_meta.h
+++ b/paddle/tcmpt/core/tensor_meta.h
@@ -16,9 +16,9 @@ limitations under the License. */
 
 #include <vector>
 
-#include "paddle/top/core/backend.h"
-#include "paddle/top/core/dtype.h"
-#include "paddle/top/core/layout.h"
+#include "paddle/tcmpt/core/backend.h"
+#include "paddle/tcmpt/core/dtype.h"
+#include "paddle/tcmpt/core/layout.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/ddim.h"
diff --git a/paddle/top/core/tensor_status.h b/paddle/tcmpt/core/tensor_status.h
similarity index 94%
rename from paddle/top/core/tensor_status.h
rename to paddle/tcmpt/core/tensor_status.h
index 075b52c573805..1328c88dd014a 100644
--- a/paddle/top/core/tensor_status.h
+++ b/paddle/tcmpt/core/tensor_status.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/top/core/backend.h"
-#include "paddle/top/core/dtype.h"
-#include "paddle/top/core/layout.h"
+#include "paddle/tcmpt/core/backend.h"
+#include "paddle/tcmpt/core/dtype.h"
+#include "paddle/tcmpt/core/layout.h"
 
 namespace pt {
 
diff --git a/paddle/top/cpu/CMakeLists.txt b/paddle/tcmpt/cpu/CMakeLists.txt
similarity index 100%
rename from paddle/top/cpu/CMakeLists.txt
rename to paddle/tcmpt/cpu/CMakeLists.txt
diff --git a/paddle/top/cpu/math.cc b/paddle/tcmpt/cpu/math.cc
similarity index 97%
rename from paddle/top/cpu/math.cc
rename to paddle/tcmpt/cpu/math.cc
index 2640c9039a9e1..7656f88beffc9 100644
--- a/paddle/top/cpu/math.cc
+++ b/paddle/tcmpt/cpu/math.cc
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/top/cpu/math.h"
+#include "paddle/tcmpt/cpu/math.h"
 
-// #include "paddle/top/module/scale.h"
-// #include "paddle/top/module/sign.h"
+// #include "paddle/tcmpt/module/scale.h"
+// #include "paddle/tcmpt/module/sign.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/eigen.h"
diff --git a/paddle/top/cpu/math.h b/paddle/tcmpt/cpu/math.h
similarity index 87%
rename from paddle/top/cpu/math.h
rename to paddle/tcmpt/cpu/math.h
index 5bb56f18ac33b..de9521b54dede 100644
--- a/paddle/top/cpu/math.h
+++ b/paddle/tcmpt/cpu/math.h
@@ -14,12 +14,12 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/top/core/dense_tensor.h"
-#include "paddle/top/core/kernel_registry.h"
-#include "paddle/top/core/selected_rows_tensor.h"
+#include "paddle/tcmpt/core/dense_tensor.h"
+#include "paddle/tcmpt/core/kernel_registry.h"
+#include "paddle/tcmpt/core/selected_rows_tensor.h"
 
-#include "paddle/top/module/scale.h"
-#include "paddle/top/module/sign.h"
+#include "paddle/tcmpt/module/scale.h"
+#include "paddle/tcmpt/module/sign.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
diff --git a/paddle/top/cuda/CMakeLists.txt b/paddle/tcmpt/cuda/CMakeLists.txt
similarity index 100%
rename from paddle/top/cuda/CMakeLists.txt
rename to paddle/tcmpt/cuda/CMakeLists.txt
diff --git a/paddle/top/cuda/math.cu b/paddle/tcmpt/cuda/math.cu
similarity index 95%
rename from paddle/top/cuda/math.cu
rename to paddle/tcmpt/cuda/math.cu
index d5286a1925981..65d0bdfaa36b9 100644
--- a/paddle/top/cuda/math.cu
+++ b/paddle/tcmpt/cuda/math.cu
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/top/cuda/math.h"
+#include "paddle/tcmpt/cuda/math.h"
 
-// #include "paddle/top/module/scale.h"
-// #include "paddle/top/module/sign.h"
+// #include "paddle/tcmpt/module/scale.h"
+// #include "paddle/tcmpt/module/sign.h"
 
 #ifdef __NVCC__
 #include "cub/cub.cuh"
@@ -26,8 +26,8 @@ namespace cub = hipcub;
 #endif
 
 #include "paddle/fluid/platform/float16.h"
-#include "paddle/top/core/convert_utils.h"
-#include "paddle/top/core/kernel_registry.h"
+#include "paddle/tcmpt/core/convert_utils.h"
+#include "paddle/tcmpt/core/kernel_registry.h"
 
 namespace pt {
 
diff --git a/paddle/top/cuda/math.h b/paddle/tcmpt/cuda/math.h
similarity index 90%
rename from paddle/top/cuda/math.h
rename to paddle/tcmpt/cuda/math.h
index 66bacea1dab48..9bcb6c9dbf0c8 100644
--- a/paddle/top/cuda/math.h
+++ b/paddle/tcmpt/cuda/math.h
@@ -17,11 +17,11 @@ limitations under the License. */
 // CUDA and HIP use same api
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 
-#include "paddle/top/core/dense_tensor.h"
-#include "paddle/top/core/selected_rows_tensor.h"
+#include "paddle/tcmpt/core/dense_tensor.h"
+#include "paddle/tcmpt/core/selected_rows_tensor.h"
 
-#include "paddle/top/module/scale.h"
-#include "paddle/top/module/sign.h"
+#include "paddle/tcmpt/module/scale.h"
+#include "paddle/tcmpt/module/sign.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
diff --git a/paddle/top/infershape/CMakeLists.txt b/paddle/tcmpt/infershape/CMakeLists.txt
similarity index 100%
rename from paddle/top/infershape/CMakeLists.txt
rename to paddle/tcmpt/infershape/CMakeLists.txt
diff --git a/paddle/top/mkldnn/CMakeLists.txt b/paddle/tcmpt/mkldnn/CMakeLists.txt
similarity index 100%
rename from paddle/top/mkldnn/CMakeLists.txt
rename to paddle/tcmpt/mkldnn/CMakeLists.txt
diff --git a/paddle/top/mkldnn/base.h b/paddle/tcmpt/mkldnn/base.h
similarity index 98%
rename from paddle/top/mkldnn/base.h
rename to paddle/tcmpt/mkldnn/base.h
index 3186ea9ae23a4..35acf1f9f6815 100644
--- a/paddle/top/mkldnn/base.h
+++ b/paddle/tcmpt/mkldnn/base.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_MKLDNN
 
-#include "paddle/top/core/mkldnn_dense_tensor.h"
+#include "paddle/tcmpt/core/mkldnn_dense_tensor.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/mkldnn_reuse.h"
diff --git a/paddle/top/mkldnn/math.cc b/paddle/tcmpt/mkldnn/math.cc
similarity index 95%
rename from paddle/top/mkldnn/math.cc
rename to paddle/tcmpt/mkldnn/math.cc
index 2544dab9fc98e..6f4cc9f7f6628 100644
--- a/paddle/top/mkldnn/math.cc
+++ b/paddle/tcmpt/mkldnn/math.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/top/mkldnn/math.h"
+#include "paddle/tcmpt/mkldnn/math.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/float16.h"
diff --git a/paddle/top/mkldnn/math.h b/paddle/tcmpt/mkldnn/math.h
similarity index 95%
rename from paddle/top/mkldnn/math.h
rename to paddle/tcmpt/mkldnn/math.h
index bee3aec6277e7..07ac563c2177c 100644
--- a/paddle/top/mkldnn/math.h
+++ b/paddle/tcmpt/mkldnn/math.h
@@ -16,8 +16,8 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_MKLDNN
 
-#include "paddle/top/core/mkldnn_dense_tensor.h"
-#include "paddle/top/mkldnn/base.h"
+#include "paddle/tcmpt/core/mkldnn_dense_tensor.h"
+#include "paddle/tcmpt/mkldnn/base.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
diff --git a/paddle/top/module/CMakeLists.txt b/paddle/tcmpt/module/CMakeLists.txt
similarity index 100%
rename from paddle/top/module/CMakeLists.txt
rename to paddle/tcmpt/module/CMakeLists.txt
diff --git a/paddle/top/module/scale.h b/paddle/tcmpt/module/scale.h
similarity index 97%
rename from paddle/top/module/scale.h
rename to paddle/tcmpt/module/scale.h
index a55cfc1fb5d3f..d822256673201 100644
--- a/paddle/top/module/scale.h
+++ b/paddle/tcmpt/module/scale.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/top/core/dense_tensor.h"
+#include "paddle/tcmpt/core/dense_tensor.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/eigen.h"
diff --git a/paddle/top/module/sign.h b/paddle/tcmpt/module/sign.h
similarity index 97%
rename from paddle/top/module/sign.h
rename to paddle/tcmpt/module/sign.h
index 2ce805c4a6213..10a11dff038ca 100644
--- a/paddle/top/module/sign.h
+++ b/paddle/tcmpt/module/sign.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/top/core/dense_tensor.h"
+#include "paddle/tcmpt/core/dense_tensor.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/eigen.h"
diff --git a/paddle/top/npu/CMakeLists.txt b/paddle/tcmpt/npu/CMakeLists.txt
similarity index 100%
rename from paddle/top/npu/CMakeLists.txt
rename to paddle/tcmpt/npu/CMakeLists.txt
diff --git a/paddle/top/npu/math.h b/paddle/tcmpt/npu/math.h
similarity index 98%
rename from paddle/top/npu/math.h
rename to paddle/tcmpt/npu/math.h
index 03c1a2a5020a2..d480bb22e9287 100644
--- a/paddle/top/npu/math.h
+++ b/paddle/tcmpt/npu/math.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_ASCEND_CL
 
-#include "paddle/top/core/dense_tensor.h"
+#include "paddle/tcmpt/core/dense_tensor.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/operators/npu_op_runner.h"
diff --git a/paddle/top/tests/CMakeLists.txt b/paddle/tcmpt/tests/CMakeLists.txt
similarity index 100%
rename from paddle/top/tests/CMakeLists.txt
rename to paddle/tcmpt/tests/CMakeLists.txt
diff --git a/paddle/top/tests/backend_test.cc b/paddle/tcmpt/tests/backend_test.cc
similarity index 94%
rename from paddle/top/tests/backend_test.cc
rename to paddle/tcmpt/tests/backend_test.cc
index add873f8571f7..026e94ec4d0e7 100644
--- a/paddle/top/tests/backend_test.cc
+++ b/paddle/tcmpt/tests/backend_test.cc
@@ -12,6 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/top/core/backend.h"
+#include "paddle/tcmpt/core/backend.h"
 
 #include <gtest/gtest.h>
diff --git a/paddle/top/tests/dense_tensor_test.cc b/paddle/tcmpt/tests/dense_tensor_test.cc
similarity index 96%
rename from paddle/top/tests/dense_tensor_test.cc
rename to paddle/tcmpt/tests/dense_tensor_test.cc
index f2b19b409f4a2..633e787159444 100644
--- a/paddle/top/tests/dense_tensor_test.cc
+++ b/paddle/tcmpt/tests/dense_tensor_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/top/core/dense_tensor.h"
+#include "paddle/tcmpt/core/dense_tensor.h"
 
 #include <gtest/gtest.h>
 
diff --git a/paddle/top/tests/dtype_test.cc b/paddle/tcmpt/tests/dtype_test.cc
similarity index 100%
rename from paddle/top/tests/dtype_test.cc
rename to paddle/tcmpt/tests/dtype_test.cc
diff --git a/paddle/top/tests/kernel_factory_test.cc b/paddle/tcmpt/tests/kernel_factory_test.cc
similarity index 94%
rename from paddle/top/tests/kernel_factory_test.cc
rename to paddle/tcmpt/tests/kernel_factory_test.cc
index 383d9f232d177..f3493ea63d56e 100644
--- a/paddle/top/tests/kernel_factory_test.cc
+++ b/paddle/tcmpt/tests/kernel_factory_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/top/core/kernel_factory.h"
+#include "paddle/tcmpt/core/kernel_factory.h"
 
 #include "gtest/gtest.h"
 
diff --git a/paddle/top/tests/layout_test.cc b/paddle/tcmpt/tests/layout_test.cc
similarity index 100%
rename from paddle/top/tests/layout_test.cc
rename to paddle/tcmpt/tests/layout_test.cc
diff --git a/paddle/top/xpu/CMakeLists.txt b/paddle/tcmpt/xpu/CMakeLists.txt
similarity index 100%
rename from paddle/top/xpu/CMakeLists.txt
rename to paddle/tcmpt/xpu/CMakeLists.txt
diff --git a/paddle/top/xpu/math.cc b/paddle/tcmpt/xpu/math.cc
similarity index 89%
rename from paddle/top/xpu/math.cc
rename to paddle/tcmpt/xpu/math.cc
index fdae384a64da3..57b92da34edee 100644
--- a/paddle/top/xpu/math.cc
+++ b/paddle/tcmpt/xpu/math.cc
@@ -12,8 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/top/xpu/math.h"
+#include "paddle/tcmpt/xpu/math.h"
 
-#include "paddle/top/core/kernel_registry.h"
+#include "paddle/tcmpt/core/kernel_registry.h"
 
 // PT_REGISTER_KERNEL_1T(sign, XPU, NCHW, pt::Sign, float);
diff --git a/paddle/top/xpu/math.h b/paddle/tcmpt/xpu/math.h
similarity index 98%
rename from paddle/top/xpu/math.h
rename to paddle/tcmpt/xpu/math.h
index 1d6b38a3dd8eb..ed223c8a71bea 100644
--- a/paddle/top/xpu/math.h
+++ b/paddle/tcmpt/xpu/math.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
 
-#include "paddle/top/core/dense_tensor.h"
+#include "paddle/tcmpt/core/dense_tensor.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
diff --git a/paddle/top/api/CMakeLists.txt b/paddle/top/api/CMakeLists.txt
deleted file mode 100644
index 4c057b25330b5..0000000000000
--- a/paddle/top/api/CMakeLists.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-add_subdirectory(src)
-
-set(TOP_DEPS convert_utils dense_tensor selected_rows_tensor kernel_factory kernel_context)
-set(TOP_DEPS ${TOP_DEPS} math_cpu)
-if(WITH_MKLDNN)
-  set(TOP_DEPS ${TOP_DEPS} math_mkldnn)
-endif()
-if(WITH_GPU OR WITH_ROCM)
-  set(TOP_DEPS ${TOP_DEPS} math_cuda)
-endif()
-if(WITH_XPU)
-  set(TOP_DEPS ${TOP_DEPS} math_xpu)
-endif()
-
-cc_library(top SRCS all.cc DEPS ${TOP_DEPS})

From 7146f92fc6271975c830b2d6e80286be877f44b3 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Tue, 7 Sep 2021 11:47:45 +0000
Subject: [PATCH 044/125] revert xpu, npu, mkldnn impl & remove op def

---
 paddle/fluid/framework/operator.cc            |  45 +-
 paddle/fluid/framework/operator.h             |  11 +-
 paddle/fluid/imperative/prepared_operator.cc  |  28 +-
 paddle/fluid/imperative/prepared_operator.h   |   8 +-
 paddle/fluid/operators/mean_op_npu.cc         |  26 +
 paddle/fluid/operators/mean_op_xpu.cc         |  20 +
 .../fluid/operators/mkldnn/scale_mkldnn_op.cc |  63 +++
 paddle/fluid/operators/npu_op_runner.cc       | 122 -----
 paddle/fluid/operators/npu_op_runner.h        |  19 -
 paddle/fluid/operators/pool_cudnn_op.cu.cc    |   4 +-
 paddle/fluid/operators/sign_op_xpu.cc         |  44 ++
 paddle/fluid/platform/mkldnn_reuse.h          |  35 +-
 paddle/fluid/pybind/op_function_generator.cc  |   2 +-
 paddle/tcmpt/CMakeLists.txt                   |  10 +-
 paddle/tcmpt/api/CMakeLists.txt               |   6 -
 paddle/tcmpt/api/include/dev/math.h           |   3 -
 paddle/tcmpt/api/include/tensor.h             |  12 +-
 paddle/tcmpt/core/dtype.h                     |   4 +-
 paddle/tcmpt/core/kernel_context.h            |  16 +-
 paddle/tcmpt/core/kernel_def.h                |   8 +-
 paddle/tcmpt/core/kernel_factory.cc           |  32 +-
 paddle/tcmpt/core/kernel_factory.h            | 147 +++--
 paddle/tcmpt/core/kernel_registry.h           | 516 +++++++++---------
 paddle/tcmpt/core/kernel_utils.h              |  57 +-
 paddle/tcmpt/core/layout.h                    |   4 +-
 paddle/tcmpt/cpu/math.cc                      |   4 +-
 paddle/tcmpt/cpu/math.h                       |   4 +-
 paddle/tcmpt/cuda/math.cu                     |   4 +-
 paddle/tcmpt/cuda/math.h                      |   4 +-
 paddle/tcmpt/eigen/CMakeLists.txt             |   0
 paddle/tcmpt/{module => eigen}/scale.h        |   0
 paddle/tcmpt/{module => eigen}/sign.h         |   0
 paddle/tcmpt/mkldnn/CMakeLists.txt            |   1 -
 paddle/tcmpt/mkldnn/base.h                    |  72 ---
 paddle/tcmpt/mkldnn/math.cc                   |  20 -
 paddle/tcmpt/mkldnn/math.h                    |  64 ---
 paddle/tcmpt/npu/math.h                       |  81 ---
 paddle/tcmpt/tests/kernel_factory_test.cc     |   4 +-
 paddle/tcmpt/xpu/CMakeLists.txt               |   1 -
 paddle/tcmpt/xpu/math.cc                      |  19 -
 paddle/tcmpt/xpu/math.h                       |  84 ---
 41 files changed, 621 insertions(+), 983 deletions(-)
 create mode 100644 paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc
 create mode 100644 paddle/fluid/operators/sign_op_xpu.cc
 create mode 100644 paddle/tcmpt/eigen/CMakeLists.txt
 rename paddle/tcmpt/{module => eigen}/scale.h (100%)
 rename paddle/tcmpt/{module => eigen}/sign.h (100%)
 delete mode 100644 paddle/tcmpt/mkldnn/base.h
 delete mode 100644 paddle/tcmpt/mkldnn/math.cc
 delete mode 100644 paddle/tcmpt/mkldnn/math.h
 delete mode 100644 paddle/tcmpt/npu/math.h
 delete mode 100644 paddle/tcmpt/xpu/math.cc
 delete mode 100644 paddle/tcmpt/xpu/math.h

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 183ad7163bfa9..5c80a3a9b800e 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1074,8 +1074,7 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
   this->InferShape(&infer_shape_ctx);
 }
 
-OpKernelType TransPtOpKernelKeyToOpKernelType(
-    const pt::OpKernelKey& kernel_key) {
+OpKernelType TransPtKernelKeyToOpKernelType(const pt::KernelKey& kernel_key) {
   proto::VarType::Type data_type = pt::TransToProtoVarType(kernel_key.dtype());
   platform::Place place = pt::TransToFluidPlace(kernel_key.backend());
   DataLayout data_layout = pt::TransToFluidDataLayout(kernel_key.layout());
@@ -1141,10 +1140,12 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   // implementation, this is a gradual replacement process
   // TODO(chenweihang): only for debug, remove it after
   // print all registered kernels
-  VLOG(1) << pt::OpKernelFactory::Instance();
+  VLOG(1) << pt::KernelFactory::Instance();
 
-  run_pt_kernel_ =
-      pt::OpKernelFactory::Instance().ContainsOperation(type_.c_str());
+  // TODO(chenweihang): in the first phase of project, we only support CPU, CUDA
+  // and RCOM backend, the XPU, NPU and MKLDNN will be supported in the second
+  // phase
+  run_pt_kernel_ = pt::KernelFactory::Instance().ContainsKernel(type_.c_str());
   if (run_pt_kernel_) {
     if (pt_kernel_key_.get() == nullptr || pt_kernel_.get() == nullptr) {
       ChoosePtKernel(*runtime_ctx, *dev_ctx);
@@ -1163,8 +1164,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
                                        platform::EventRole::kInnerOp);
     if (need_prepare_data_) {
       if (run_pt_kernel_) {
-        kernel_type_.reset(new OpKernelType(
-            TransPtOpKernelKeyToOpKernelType(*pt_kernel_key_)));
+        kernel_type_.reset(
+            new OpKernelType(TransPtKernelKeyToOpKernelType(*pt_kernel_key_)));
       }
       transfer_scope = PrepareData(scope, *kernel_type_,
                                    &transfered_inplace_vars, runtime_ctx);
@@ -1196,7 +1197,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
                                        platform::EventRole::kInnerOp);
     if (run_pt_kernel_) {
       // TODO(chenweihang): here will intrduce copy
-      auto op_kernel_ctx = ConstructPtOpKernelContext(*runtime_ctx, *dev_ctx);
+      auto op_kernel_ctx = ConstructPtKernelContext(*runtime_ctx, *dev_ctx);
       (*pt_kernel_)(&op_kernel_ctx);
       // need share output into fluid tensor
 
@@ -1266,19 +1267,19 @@ void OperatorWithKernel::ChoosePtKernel(
     const RuntimeContext& ctx, const platform::DeviceContext& dev_ctx) const {
   // 1. construct operation name
   // TODO(chenweihang): add rules for construct op name
-  pt::OperationName op_name(Type().c_str());
+  pt::KernelName kernel_name(Type().c_str());
   // TODO(chenweihang): polish judge rules
   if (ContainsSelectedRows(ctx.inputs)) {
-    op_name.overload_type = "selected_rows";
+    kernel_name.overload_name = "selected_rows";
   }
 
   // 2. construct op kernel key
-  pt_kernel_key_.reset(new pt::OpKernelKey(
-      ConstructPtOpKernelKey(ctx.inputs, dev_ctx.GetPlace())));
+  pt_kernel_key_.reset(
+      new pt::KernelKey(ConstructPtKernelKey(ctx.inputs, dev_ctx.GetPlace())));
 
   // 3. selecte op kernel
-  pt_kernel_.reset(new pt::OpKernel(
-      pt::OpKernelFactory::Instance().SelectKernel(op_name, *pt_kernel_key_)));
+  pt_kernel_.reset(new pt::Kernel(pt::KernelFactory::Instance().SelectKernel(
+      kernel_name, *pt_kernel_key_)));
 }
 
 void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
@@ -1783,7 +1784,7 @@ OpKernelType OperatorWithKernel::GetKernelTypeForVar(
                       tensor.layout());
 }
 
-pt::OpKernelKey OperatorWithKernel::ConstructPtOpKernelKey(
+pt::KernelKey OperatorWithKernel::ConstructPtKernelKey(
     const VariableValueMap& inputs, const platform::Place& ctx_place) const {
   // 1. get backend based place and attrs
   pt::Backend backend = pt::TransToPtBackend(ctx_place);
@@ -1817,11 +1818,11 @@ pt::OpKernelKey OperatorWithKernel::ConstructPtOpKernelKey(
           "DataType should be indicated by input Variable at %s.", Type()));
   pt::DataType dtype = pt::TransToPtDataType(data_type);
 
-  // 4. build pt OpKernelKey
-  return pt::OpKernelKey(backend, layout, dtype);
+  // 4. build pt KernelKey
+  return pt::KernelKey(backend, layout, dtype);
 }
 
-pt::OpKernelContext OperatorWithKernel::ConstructPtOpKernelContext(
+pt::KernelContext OperatorWithKernel::ConstructPtKernelContext(
     const RuntimeContext& ctx, const platform::DeviceContext& dev_ctx) const {
   VLOG(1) << RuntimeContextDebugString(ctx);
 
@@ -1832,7 +1833,7 @@ pt::OpKernelContext OperatorWithKernel::ConstructPtOpKernelContext(
   // 3. needless attributes remove
   // 4. use pt Tensor directly
   // 5. kernel input is not DenseTensor
-  pt::OpKernelContext op_kernel_ctx(dev_ctx);
+  pt::KernelContext op_kernel_ctx(dev_ctx);
   auto input_defs = pt_kernel_->param_def().input_defs();
   auto output_defs = pt_kernel_->param_def().output_defs();
 
@@ -1846,7 +1847,7 @@ pt::OpKernelContext OperatorWithKernel::ConstructPtOpKernelContext(
     // TODO(chenweihang): skip special cases temporarily
     // TODO(chenweihang): deal with diff param in vector
     if (in.has_dispensable() && in.dispensable()) {
-      VLOG(1) << "BuildOpKernelContext: skip dispensable input - " << in.name();
+      VLOG(1) << "BuildKernelContext: skip dispensable input - " << in.name();
       continue;
     }
     auto in_name = in.name();
@@ -1874,7 +1875,7 @@ pt::OpKernelContext OperatorWithKernel::ConstructPtOpKernelContext(
     auto out_def = output_defs.at(i);
     for (auto* var : ctx.outputs.at(out_name)) {
       // mutable_data before run kernel, to avoid share output form
-      // OpKernelContext to original tensor
+      // KernelContext to original tensor
       if (var->IsType<LoDTensor>()) {
         auto* tensor = var->GetMutable<LoDTensor>();
         tensor->mutable_data(pt::TransToFluidPlace(out_def.backend),
@@ -1922,7 +1923,7 @@ pt::OpKernelContext OperatorWithKernel::ConstructPtOpKernelContext(
         // TODO(chenweihang): support other attrs type
         PADDLE_THROW(platform::errors::Unimplemented(
             "unsupported cast op `%s`'s attribute `%s` when construct "
-            "OpKernelContext.",
+            "KernelContext.",
             Type(), attr.name()));
     }
   }
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index e0bdb829b3359..2c817d9fe7b43 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -115,8 +115,7 @@ inline std::string GradOriginalVarName(const std::string& grad_var_name) {
 const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var);
 Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var);
 
-OpKernelType TransPtOpKernelKeyToOpKernelType(
-    const pt::OpKernelKey& kernel_key);
+OpKernelType TransPtKernelKeyToOpKernelType(const pt::KernelKey& kernel_key);
 
 class ExecutionContext;
 class OperatorBase;
@@ -535,10 +534,10 @@ class OperatorWithKernel : public OperatorBase {
 
   /* member functions for adapting to tcmpt lib */
   // TODO(chenweihang): Temporarily as a class method
-  virtual pt::OpKernelKey ConstructPtOpKernelKey(
+  virtual pt::KernelKey ConstructPtKernelKey(
       const VariableValueMap& inputs, const platform::Place& ctx_place) const;
 
-  virtual pt::OpKernelContext ConstructPtOpKernelContext(
+  virtual pt::KernelContext ConstructPtKernelContext(
       const RuntimeContext& ctx, const platform::DeviceContext& dev_ctx) const;
 
  private:
@@ -597,8 +596,8 @@ class OperatorWithKernel : public OperatorBase {
   // TODO(chenweihang): Similar duplicate members are used for new tcmpt lib,
   // maybe we have better impl methods
   mutable bool run_pt_kernel_ = false;
-  mutable std::unique_ptr<pt::OpKernelKey> pt_kernel_key_;
-  mutable std::unique_ptr<pt::OpKernel> pt_kernel_;
+  mutable std::unique_ptr<pt::KernelKey> pt_kernel_key_;
+  mutable std::unique_ptr<pt::Kernel> pt_kernel_;
 };
 
 extern bool OpSupportGPU(const std::string& op_type);
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index efb7a9f985fa2..955c722965a6e 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -91,8 +91,8 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op,
 
 PreparedOp::PreparedOp(const framework::OperatorBase& op,
                        const framework::RuntimeContext& ctx,
-                       const pt::OpKernelKey& pt_kernel_key,
-                       const pt::OpKernel& pt_kernel,
+                       const pt::KernelKey& pt_kernel_key,
+                       const pt::Kernel& pt_kernel,
                        platform::DeviceContext* dev_ctx)
     : op_(op),
       ctx_(ctx),
@@ -105,7 +105,7 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op,
       pt_kernel_(pt_kernel) {
   // TODO(chenweihang): PrepareData still use old impl, so here need save
   // old kernel type, trans it later
-  kernel_type_ = framework::TransPtOpKernelKeyToOpKernelType(pt_kernel_key_);
+  kernel_type_ = framework::TransPtKernelKeyToOpKernelType(pt_kernel_key_);
 }
 
 template <typename VarType>
@@ -147,13 +147,13 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
 
   // 1. get expected kernel key
   bool run_pt_kernel =
-      pt::OpKernelFactory::Instance().ContainsOperation(op.Type().c_str());
+      pt::KernelFactory::Instance().ContainsKernel(op.Type().c_str());
   if (run_pt_kernel) {
-    pt::OperationName op_name(op.Type().c_str());
+    pt::KernelName op_name(op.Type().c_str());
     auto inputs = BuildInputMap<VarType>(ins);
-    auto pt_kernel_key = op.ConstructPtOpKernelKey(inputs, place);
+    auto pt_kernel_key = op.ConstructPtKernelKey(inputs, place);
     auto pt_kernel =
-        pt::OpKernelFactory::Instance().SelectKernel(op_name, pt_kernel_key);
+        pt::KernelFactory::Instance().SelectKernel(op_name, pt_kernel_key);
     // TODO(chenweihang): using CPUKernel when miss device kernel case
     return PreparedOp(op, ctx, pt_kernel_key, pt_kernel, dev_ctx);
   } else {
@@ -231,8 +231,8 @@ PreparedOp PreparedOp::Prepare(const NameVarMap<VariableWrapper>& ins,
 }
 
 template <typename VarType>
-static pt::OpKernelContext BuildDygraphOpKernelContext(
-    const pt::OpKernel& pt_kernel, const NameVarMap<VarType>& ins,
+static pt::KernelContext BuildDygraphKernelContext(
+    const pt::Kernel& pt_kernel, const NameVarMap<VarType>& ins,
     const NameVarMap<VarType>& outs, const platform::DeviceContext& dev_ctx) {
   // TODO(chenweihang): now only work for very simple case (sign op),
   // many cases need to be deal with later:
@@ -241,7 +241,7 @@ static pt::OpKernelContext BuildDygraphOpKernelContext(
   // 3. needless attributes remove
   // 4. use pt Tensor directly
   // 5. kernel input is not DenseTensor
-  pt::OpKernelContext op_kernel_ctx(dev_ctx);
+  pt::KernelContext op_kernel_ctx(dev_ctx);
   auto input_defs = pt_kernel.param_def().input_defs();
   auto output_defs = pt_kernel.param_def().output_defs();
 
@@ -266,7 +266,7 @@ static pt::OpKernelContext BuildDygraphOpKernelContext(
       auto* variable = var->MutableVar();
       auto* tensor = variable->template GetMutable<framework::LoDTensor>();
       // mutable_data before run kernel, to avoid share output form
-      // OpKernelContext to original tensor
+      // KernelContext to original tensor
       tensor->mutable_data(pt::TransToFluidPlace(out_def.backend),
                            pt::TransToProtoVarType(out_def.dtype));
       auto pt_out =
@@ -323,8 +323,8 @@ static void PreparedOpRunImpl(
 
 template <typename VarType>
 static void PreparedOpRunPtImpl(const framework::OperatorBase& op,
-                                const pt::OpKernelKey& pt_kernel_key,
-                                const pt::OpKernel& pt_kernel,
+                                const pt::KernelKey& pt_kernel_key,
+                                const pt::Kernel& pt_kernel,
                                 platform::DeviceContext* dev_ctx,
                                 const NameVarMap<VarType>& ins,
                                 const NameVarMap<VarType>& outs,
@@ -336,7 +336,7 @@ static void PreparedOpRunPtImpl(const framework::OperatorBase& op,
       &infer_shape_ctx);
 
   auto op_kernel_ctx =
-      BuildDygraphOpKernelContext<VarType>(pt_kernel, ins, outs, *dev_ctx);
+      BuildDygraphKernelContext<VarType>(pt_kernel, ins, outs, *dev_ctx);
   pt_kernel(&op_kernel_ctx);
 
   // TODO(chenweihang): add flags
diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h
index a43229a4bbe04..8cfe209ec7ad0 100644
--- a/paddle/fluid/imperative/prepared_operator.h
+++ b/paddle/fluid/imperative/prepared_operator.h
@@ -151,8 +151,8 @@ class PreparedOp {
 
   PreparedOp(const framework::OperatorBase& op,
              const framework::RuntimeContext& ctx,
-             const pt::OpKernelKey& pt_kernel_key,
-             const pt::OpKernel& pt_kernel, platform::DeviceContext* dev_ctx);
+             const pt::KernelKey& pt_kernel_key, const pt::Kernel& pt_kernel,
+             platform::DeviceContext* dev_ctx);
 
   static PreparedOp Prepare(const NameVarMap<VarBase>& ins,
                             const NameVarMap<VarBase>& outs,
@@ -188,8 +188,8 @@ class PreparedOp {
   // TODo(chenweihang): Similar duplicate members are used for new tcmpt lib,
   // maybe we have better impl methods
   bool run_pt_kernel_{false};
-  pt::OpKernelKey pt_kernel_key_;
-  pt::OpKernel pt_kernel_;
+  pt::KernelKey pt_kernel_key_;
+  pt::Kernel pt_kernel_;
 };
 
 }  // namespace imperative
diff --git a/paddle/fluid/operators/mean_op_npu.cc b/paddle/fluid/operators/mean_op_npu.cc
index 98be037d7b8b0..6fc371ee37c52 100644
--- a/paddle/fluid/operators/mean_op_npu.cc
+++ b/paddle/fluid/operators/mean_op_npu.cc
@@ -16,6 +16,29 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+template <typename DeviceContext, typename T>
+class MeanNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<framework::LoDTensor>("X");
+    auto* out = ctx.Output<framework::LoDTensor>("Out");
+
+    std::vector<int> axes;
+
+    framework::NPUAttributeMap attr_input = {{"keep_dims", false},
+                                             {"axes", axes}};
+
+    out->mutable_data<T>(ctx.GetPlace());
+
+    const auto& runner = NpuOpRunner("ReduceMeanD", {*x}, {*out}, attr_input);
+
+    auto stream =
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream();
+    runner.Run(stream);
+  }
+};
+
 template <typename DeviceContext, typename T>
 class MeanGradNPUKernel : public framework::OpKernel<T> {
  public:
@@ -67,6 +90,9 @@ class MeanGradNPUKernel : public framework::OpKernel<T> {
 
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
+REGISTER_OP_NPU_KERNEL(
+    mean, ops::MeanNPUKernel<paddle::platform::NPUDeviceContext, float>,
+    ops::MeanNPUKernel<paddle::platform::NPUDeviceContext, plat::float16>)
 
 REGISTER_OP_NPU_KERNEL(
     mean_grad,
diff --git a/paddle/fluid/operators/mean_op_xpu.cc b/paddle/fluid/operators/mean_op_xpu.cc
index 58220bf79a8ed..71bcc4be15ce5 100644
--- a/paddle/fluid/operators/mean_op_xpu.cc
+++ b/paddle/fluid/operators/mean_op_xpu.cc
@@ -21,6 +21,24 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+template <typename DeviceContext, typename T>
+class MeanXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* input = context.Input<Tensor>("X");
+    auto* output = context.Output<Tensor>("Out");
+    output->mutable_data<T>(context.GetPlace());
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    const float* x_data = input->data<float>();
+    float* y_data = output->data<float>();
+    int r = xpu::mean(dev_ctx.x_context(), x_data, y_data, input->numel());
+    PADDLE_ENFORCE_EQ(
+        r, xpu::Error_t::SUCCESS,
+        platform::errors::External(
+            "XPU kernel error, Mean op execution not succeed, error code=%d",
+            r));
+  }
+};
 template <typename DeviceContext, typename T>
 class MeanGradXPUKernel : public framework::OpKernel<T> {
  public:
@@ -46,6 +64,8 @@ class MeanGradXPUKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+REGISTER_OP_XPU_KERNEL(
+    mean, ops::MeanXPUKernel<paddle::platform::XPUDeviceContext, float>);
 REGISTER_OP_XPU_KERNEL(
     mean_grad,
     ops::MeanGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc
new file mode 100644
index 0000000000000..84ac14d04b85b
--- /dev/null
+++ b/paddle/fluid/operators/mkldnn/scale_mkldnn_op.cc
@@ -0,0 +1,63 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/platform/mkldnn_reuse.h"
+
+namespace paddle {
+namespace operators {
+
+using paddle::framework::Tensor;
+
+template <typename T>
+class ScaleMKLDNNKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    this->RunKernel(ctx);
+  }
+
+  void RunKernel(const framework::ExecutionContext& ctx) const {
+    const auto& dev_ctx =
+        ctx.template device_context<platform::MKLDNNDeviceContext>();
+    const auto& mkldnn_engine = dev_ctx.GetEngine();
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* out = ctx.Output<Tensor>("Out");
+
+    bool is_inplaced = x->IsSharedBufferWith(*out);
+
+    platform::ActivationMKLDNNHandler<T> handler(
+        mkldnn::algorithm::eltwise_linear, ctx, mkldnn_engine, ctx.GetPlace(),
+        x);
+
+    auto src_memory_p = handler.AcquireSrcMemory(x);
+    auto dst_memory_p =
+        is_inplaced ? src_memory_p : handler.AcquireDstMemory(out);
+    auto activation_p = handler.AcquireForwardPrimitive();
+
+    auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
+    activation_p->execute(astream, {{MKLDNN_ARG_FROM, *src_memory_p},
+                                    {MKLDNN_ARG_TO, *dst_memory_p}});
+    astream.wait();
+
+    out->set_layout(framework::DataLayout::kMKLDNN);
+    out->set_format(platform::GetMKLDNNFormat(*dst_memory_p));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_KERNEL(scale, MKLDNN, paddle::platform::CPUPlace,
+                   ops::ScaleMKLDNNKernel<float>,
+                   ops::ScaleMKLDNNKernel<paddle::platform::bfloat16>);
diff --git a/paddle/fluid/operators/npu_op_runner.cc b/paddle/fluid/operators/npu_op_runner.cc
index 71a0f52b41ef7..bb6549c111988 100644
--- a/paddle/fluid/operators/npu_op_runner.cc
+++ b/paddle/fluid/operators/npu_op_runner.cc
@@ -42,26 +42,12 @@ static std::map<framework::proto::VarType::Type, aclDataType>
         {framework::proto::VarType::FP64, ACL_DOUBLE},
 };
 
-static std::map<pt::DataType, aclDataType> PT_DTYPE_2_ACL_DTYPE = {
-    {pt::DataType::kBOOL, ACL_BOOL},       {pt::DataType::kINT8, ACL_INT8},
-    {pt::DataType::kUINT8, ACL_UINT8},     {pt::DataType::kINT16, ACL_INT16},
-    {pt::DataType::kINT32, ACL_INT32},     {pt::DataType::kINT64, ACL_INT64},
-    {pt::DataType::kFLOAT16, ACL_FLOAT16}, {pt::DataType::kFLOAT32, ACL_FLOAT},
-    {pt::DataType::kFLOAT64, ACL_DOUBLE},
-};
-
 static std::map<DataLayout, aclFormat> DATA_LAYOUT_2_ACL_FORMAT = {
     {DataLayout::kNCHW, ACL_FORMAT_NCHW},
     {DataLayout::kNHWC, ACL_FORMAT_NHWC},
     {DataLayout::kAnyLayout, ACL_FORMAT_ND},
 };
 
-static std::map<pt::DataLayout, aclFormat> PT_DATA_LAYOUT_2_ACL_FORMAT = {
-    {pt::DataLayout::kNCHW, ACL_FORMAT_NCHW},
-    {pt::DataLayout::kNHWC, ACL_FORMAT_NHWC},
-    {pt::DataLayout::kAny, ACL_FORMAT_ND},
-};
-
 aclDataType ConvertToNpuDtype(framework::proto::VarType::Type dtype) {
   auto iter = DTYPE_2_ACL_DTYPE.find(dtype);
   PADDLE_ENFORCE_NE(iter, DTYPE_2_ACL_DTYPE.end(),
@@ -71,15 +57,6 @@ aclDataType ConvertToNpuDtype(framework::proto::VarType::Type dtype) {
   return iter->second;
 }
 
-aclDataType ConvertToNpuDtype(pt::DataType dtype) {
-  auto iter = PT_DTYPE_2_ACL_DTYPE.find(dtype);
-  PADDLE_ENFORCE_NE(
-      iter, PT_DTYPE_2_ACL_DTYPE.end(),
-      platform::errors::NotFound(
-          "The data type (%s) can not convert to ACL data type.", dtype));
-  return iter->second;
-}
-
 aclFormat ConvertToNpuFormat(DataLayout layout) {
   auto iter = DATA_LAYOUT_2_ACL_FORMAT.find(layout);
   PADDLE_ENFORCE_NE(
@@ -89,15 +66,6 @@ aclFormat ConvertToNpuFormat(DataLayout layout) {
   return iter->second;
 }
 
-aclFormat ConvertToNpuFormat(pt::DataLayout layout) {
-  auto iter = PT_DATA_LAYOUT_2_ACL_FORMAT.find(layout);
-  PADDLE_ENFORCE_NE(
-      iter, PT_DATA_LAYOUT_2_ACL_FORMAT.end(),
-      platform::errors::NotFound(
-          "The data type (%s) can not convert to ACL data type.", layout));
-  return iter->second;
-}
-
 aclrtStream GetCurrentNPUStream(int device_id) {
   if (device_id == -1) {
     device_id = platform::GetCurrentNPUDeviceId();
@@ -122,16 +90,6 @@ NpuOpRunner::NpuOpRunner(const std::string &op_type,
   AddAttrs(attrs);
 }
 
-NpuOpRunner::NpuOpRunner(const std::string &op_type,
-                         const std::vector<pt::DenseTensor> &inputs,
-                         const std::vector<pt::DenseTensor> &outputs,
-                         const NPUAttributeMap &attrs)
-    : op_type_(op_type) {
-  AddInputs(inputs);
-  AddOutputs(outputs);
-  AddAttrs(attrs);
-}
-
 NpuOpRunner::~NpuOpRunner() {
   VLOG(5) << "Free NpuOpRunner(" << this << ") of " << op_type_;
   // Is it safe to free the descs/buffers after run called in host ?
@@ -243,14 +201,6 @@ NpuOpRunner &NpuOpRunner::AddInput(const Tensor &tensor) {
   return *this;
 }
 
-NpuOpRunner &NpuOpRunner::AddInput(const pt::DenseTensor &tensor) {
-  // create aclTensorDesc
-  input_descs_.emplace_back(CreateTensorDesc(tensor));
-  // create aclDataBuffer
-  input_buffers_.emplace_back(CreateDataBuffer(tensor));
-  return *this;
-}
-
 NpuOpRunner &NpuOpRunner::AddInput(const Tensor &tensor, aclMemType mem_type) {
   // create aclTensorDesc
   input_descs_.emplace_back(CreateTensorDesc(tensor, mem_type));
@@ -331,14 +281,6 @@ NpuOpRunner &NpuOpRunner::AddOutput(const Tensor &tensor) {
   return *this;
 }
 
-NpuOpRunner &NpuOpRunner::AddOutput(const pt::DenseTensor &tensor) {
-  // create aclTensorDesc
-  output_descs_.emplace_back(CreateTensorDesc(tensor));
-  // create aclDataBuffer
-  output_buffers_.emplace_back(CreateDataBuffer(tensor));
-  return *this;
-}
-
 NpuOpRunner &NpuOpRunner::AddInputs(const std::vector<Tensor> &tensors) {
   input_descs_.reserve(tensors.size());
   input_buffers_.reserve(tensors.size());
@@ -351,19 +293,6 @@ NpuOpRunner &NpuOpRunner::AddInputs(const std::vector<Tensor> &tensors) {
   return *this;
 }
 
-NpuOpRunner &NpuOpRunner::AddInputs(
-    const std::vector<pt::DenseTensor> &tensors) {
-  input_descs_.reserve(tensors.size());
-  input_buffers_.reserve(tensors.size());
-  for (auto &tensor : tensors) {
-    // create aclTensorDesc
-    input_descs_.emplace_back(CreateTensorDesc(tensor));
-    // create aclDataBuffer
-    input_buffers_.emplace_back(CreateDataBuffer(tensor));
-  }
-  return *this;
-}
-
 // NOTE(zhiqiu): For operators whose input is a list (such as concat, stack),
 // It is needed to set the name of each input tensor.
 NpuOpRunner &NpuOpRunner::AddInputNames(const std::vector<std::string> &names) {
@@ -391,19 +320,6 @@ NpuOpRunner &NpuOpRunner::AddOutputs(const std::vector<Tensor> &tensors) {
   return *this;
 }
 
-NpuOpRunner &NpuOpRunner::AddOutputs(
-    const std::vector<pt::DenseTensor> &tensors) {
-  output_descs_.reserve(tensors.size());
-  output_buffers_.reserve(tensors.size());
-  for (auto &tensor : tensors) {
-    // create aclTensorDesc
-    output_descs_.emplace_back(CreateTensorDesc(tensor));
-    // create aclDataBuffer
-    output_buffers_.emplace_back(CreateDataBuffer(tensor));
-  }
-  return *this;
-}
-
 aclTensorDesc *NpuOpRunner::GetInputDesc(size_t index) {
   PADDLE_ENFORCE_LT(index, input_descs_.size(),
                     platform::errors::OutOfRange(
@@ -467,35 +383,6 @@ aclTensorDesc *NpuOpRunner::CreateTensorDesc(Tensor tensor,
   return desc;
 }
 
-aclTensorDesc *NpuOpRunner::CreateTensorDesc(const pt::DenseTensor &tensor,
-                                             aclMemType mem_type) {
-  auto dtype = ConvertToNpuDtype(tensor.type());
-  auto format = ConvertToNpuFormat(tensor.layout());
-  auto dims = framework::vectorize(tensor.dims());
-  int size = dims.size();
-  // TODO(pangyoki): `keep_prob` used in `DropOutGenMask` NPU
-  // OP must be a scalar with shape[0]. At present, the shape
-  // of the `prob` Tensor of this OP is forced to be set to 0
-  // in `npu_op_runner.cc`, which needs to be optimized later.
-  if (op_type_ == "DropOutGenMask" && size == 1 && *(dims.data()) == 1) {
-    size = 0;
-  }
-
-  VLOG(4) << "NPU dtype:" << dtype << " "
-          << "rank:" << dims.size() << " dims:" << tensor.dims()
-          << " format:" << format;
-
-  auto *desc = aclCreateTensorDesc(dtype, size, dims.data(), format);
-  PADDLE_ENFORCE_NOT_NULL(
-      desc, platform::errors::External("Call aclCreateTensorDesc failed."));
-  PADDLE_ENFORCE_NPU_SUCCESS(aclSetTensorStorageFormat(desc, format));
-  PADDLE_ENFORCE_NPU_SUCCESS(aclSetTensorStorageShape(desc, size, dims.data()));
-  if (mem_type == ACL_MEMTYPE_HOST) {
-    PADDLE_ENFORCE_NPU_SUCCESS(aclSetTensorPlaceMent(desc, mem_type));
-  }
-  return desc;
-}
-
 aclDataBuffer *NpuOpRunner::CreateDataBuffer(Tensor tensor) {
   void *ptr = tensor.data<void>();
   VLOG(4) << "NPU ptr: " << ptr << ", size: " << tensor.memory_size();
@@ -505,15 +392,6 @@ aclDataBuffer *NpuOpRunner::CreateDataBuffer(Tensor tensor) {
   return buffer;
 }
 
-aclDataBuffer *NpuOpRunner::CreateDataBuffer(const pt::DenseTensor &tensor) {
-  void *ptr = const_cast<void *>(tensor.data<void>());
-  VLOG(4) << "NPU ptr: " << ptr << ", size: " << tensor.MemorySize();
-  auto *buffer = aclCreateDataBuffer(ptr, tensor.MemorySize());
-  PADDLE_ENFORCE_NOT_NULL(
-      buffer, platform::errors::External("Call aclCreateDataBuffer failed."));
-  return buffer;
-}
-
 void NpuOpRunner::Run(aclrtStream stream) const {
   if (!stream) {
     VLOG(4) << "Run with default current npu stream: " << stream;
diff --git a/paddle/fluid/operators/npu_op_runner.h b/paddle/fluid/operators/npu_op_runner.h
index 601a542b1a069..45e973970a956 100644
--- a/paddle/fluid/operators/npu_op_runner.h
+++ b/paddle/fluid/operators/npu_op_runner.h
@@ -24,8 +24,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
 
-#include "paddle/tcmpt/api/include/dev/core.h"
-
 namespace paddle {
 namespace operators {
 
@@ -44,11 +42,6 @@ class NpuOpRunner {
               const std::vector<Tensor> &outputs = {},
               const NPUAttributeMap &attrs = {});
 
-  NpuOpRunner(const std::string &op_type,
-              const std::vector<pt::DenseTensor> &inputs = {},
-              const std::vector<pt::DenseTensor> &outputs = {},
-              const NPUAttributeMap &attrs = {});
-
   // NOTE(zhiqiu): why forbid copy and operator= ?
   // Since we will free the tensor_descs and data_buffers in the ~NpuOpRunner,
   // if shallow copy is performed on tensor_descs and data_buffers, it may
@@ -69,8 +62,6 @@ class NpuOpRunner {
 
   NpuOpRunner &AddInput(const Tensor &tensor);
 
-  NpuOpRunner &AddInput(const pt::DenseTensor &tensor);
-
   // NOTE(zhiqiu): CANN-5.0.2 support input tensors on host.
   // Specifically, the tensor of shape, tensor of dims, etc, which are are small
   // vector/list.
@@ -86,18 +77,12 @@ class NpuOpRunner {
 
   NpuOpRunner &AddOutput(const Tensor &tensor);
 
-  NpuOpRunner &AddOutput(const pt::DenseTensor &tensor);
-
   NpuOpRunner &AddInputs(const std::vector<Tensor> &tensors);
 
-  NpuOpRunner &AddInputs(const std::vector<pt::DenseTensor> &tensors);
-
   NpuOpRunner &AddInputNames(const std::vector<std::string> &names);
 
   NpuOpRunner &AddOutputs(const std::vector<Tensor> &tensors);
 
-  NpuOpRunner &AddOutputs(const std::vector<pt::DenseTensor> &tensors);
-
   aclTensorDesc *GetInputDesc(size_t index);
 
   aclTensorDesc *GetOutputDesc(size_t index);
@@ -117,10 +102,6 @@ class NpuOpRunner {
                                   aclMemType mem_type = ACL_MEMTYPE_DEVICE);
   aclDataBuffer *CreateDataBuffer(Tensor tensor);
 
-  aclTensorDesc *CreateTensorDesc(const pt::DenseTensor &tensor,
-                                  aclMemType mem_type = ACL_MEMTYPE_DEVICE);
-  aclDataBuffer *CreateDataBuffer(const pt::DenseTensor &tensor);
-
  private:
   std::string op_type_;
   std::vector<aclDataBuffer *> input_buffers_;
diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc
index 1bdb3728f538e..dc7083f45eda2 100644
--- a/paddle/fluid/operators/pool_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc
@@ -268,7 +268,7 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
 
 #ifdef PADDLE_WITH_HIP
     if (pooling_type == "max") {
-      using OpKernelMap = paddle::framework::OperatorWithKernel::OpKernelMap;
+      using KernelMap = paddle::framework::OperatorWithKernel::KernelMap;
       using OpKernelFunc = paddle::framework::OperatorWithKernel::OpKernelFunc;
       auto &all_op_kernels =
           paddle::framework::OperatorWithKernel::AllOpKernels();
@@ -279,7 +279,7 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
           platform::errors::Unavailable(
               "There are no kernels which are registered in the %s operator.",
               op_type));
-      OpKernelMap &kernels = kernels_iter->second;
+      KernelMap &kernels = kernels_iter->second;
       paddle::framework::OpKernelType expected_kernel_key(
           paddle::framework::ToDataType(typeid(T)), ctx.GetPlace());
       auto kernel_iter = kernels.find(expected_kernel_key);
diff --git a/paddle/fluid/operators/sign_op_xpu.cc b/paddle/fluid/operators/sign_op_xpu.cc
new file mode 100644
index 0000000000000..a164a9b056677
--- /dev/null
+++ b/paddle/fluid/operators/sign_op_xpu.cc
@@ -0,0 +1,44 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+
+#include "paddle/fluid/operators/sign_op.h"
+#include "paddle/fluid/platform/xpu/xpu_header.h"
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class SignXPUKernel : public framework::OpKernel<T> {
+ public:
+  virtual void Compute(const framework::ExecutionContext& context) const {
+    auto* out = context.Output<framework::Tensor>("Out");
+    auto* in = context.Input<framework::Tensor>("X");
+    out->mutable_data<T>(in->place());
+    auto xpu_context = context.device_context<DeviceContext>().x_context();
+    int r = xpu::activation_forward(xpu_context, xpu::Activation_t::SIGN,
+                                    in->numel(), in->data<T>(), out->data<T>());
+    PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
+                      platform::errors::Fatal("XPU sign kernel error!"));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_XPU_KERNEL(
+    sign, ops::SignXPUKernel<paddle::platform::XPUDeviceContext, float>);
+
+#endif
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index f092dfee04c27..370d9b3925226 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -26,8 +26,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #include "paddle/fluid/platform/place.h"
 
-#include "paddle/tcmpt/api/include/dev/core.h"
-
 namespace paddle {
 namespace platform {
 
@@ -68,13 +66,6 @@ class MKLDNNHandlerNoCachingT {
                                             to_void_cast<T>(input_data));
   }
 
-  std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
-      const pt::DenseTensor* input) {
-    const T* input_data = input->data<T>();
-    return this->AcquireMemoryFromPrimitive(fwd_pd_->src_desc(),
-                                            to_void_cast<T>(input_data));
-  }
-
   template <typename T_out = T>
   std::shared_ptr<mkldnn::memory> AcquireDstMemory(framework::Tensor* output) {
     T_out* ptr =
@@ -82,12 +73,6 @@ class MKLDNNHandlerNoCachingT {
     return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc(), ptr);
   }
 
-  template <typename T_out = T>
-  std::shared_ptr<mkldnn::memory> AcquireDstMemory(pt::DenseTensor* output) {
-    T_out* ptr = output->mutable_data<T_out>();
-    return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc(), ptr);
-  }
-
   template <typename T_out = T>
   std::shared_ptr<mkldnn::memory> AcquireDstMemory(void) {
     return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc());
@@ -315,13 +300,6 @@ class MKLDNNHandlerT {
         fwd_pd_->src_desc(), to_void_cast<T>(input_data), "@src_mem_p");
   }
 
-  std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
-      const pt::DenseTensor* input) {
-    const T* input_data = const_cast<T*>(input->data<T>());
-    return this->AcquireMemoryFromPrimitive(
-        fwd_pd_->src_desc(), to_void_cast<T>(input_data), "@src_mem_p");
-  }
-
   template <typename T_out = T>
   std::shared_ptr<mkldnn::memory> AcquireDstMemory(framework::Tensor* output) {
     T_out* ptr =
@@ -330,13 +308,6 @@ class MKLDNNHandlerT {
                                             "@dst_mem_p");
   }
 
-  template <typename T_out = T>
-  std::shared_ptr<mkldnn::memory> AcquireDstMemory(pt::DenseTensor* output) {
-    T_out* ptr = output->mutable_data<T_out>();
-    return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc(), ptr,
-                                            "@dst_mem_p");
-  }
-
   template <typename T_out = T>
   std::shared_ptr<mkldnn::memory> AcquireDstMemory(void) {
     return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc(), "@dstt_mem_p");
@@ -958,6 +929,7 @@ class BroadcastDataMKLDNNHandler
   std::shared_ptr<mkldnn::memory> AcquireDstMemory(framework::Tensor* output) {
     T_out* ptr = output->mutable_data<T_out>(
         this->place_, this->fwd_pd_->dst_desc().get_size());
+    ;
     memset(ptr, 0, this->fwd_pd_->dst_desc().get_size());
     return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc(), ptr);
   }
@@ -1009,9 +981,8 @@ class ActivationMKLDNNHandler
     if (algorithm == mkldnn::algorithm::eltwise_linear) {
       bool bias_after_scale = ctx.Attr<bool>("bias_after_scale");
       auto* scale_tensor = ctx.Input<Tensor>("ScaleTensor");
-      alpha = (scale_tensor == nullptr)
-                  ? ctx.Attr<float>("scale")
-                  : (float)*(scale_tensor->data<T>());  // NOLINT
+      alpha = (scale_tensor == nullptr) ? ctx.Attr<float>("scale")
+                                        : (float)*(scale_tensor->data<T>());
       beta = ctx.Attr<float>("bias");
       // if bias_after_scale == true
       //   out = scale*X + bias
diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc
index 573f1fb81501f..3422e75335f4c 100644
--- a/paddle/fluid/pybind/op_function_generator.cc
+++ b/paddle/fluid/pybind/op_function_generator.cc
@@ -538,7 +538,7 @@ GenerateOpFunctions() {
     // since only OperatorWithKernel can run in dygraph mode.
     // if the tcmpt lib contains op kernel, we still generate ops method
     if (!all_kernels.count(op_type) &&
-        !pt::OpKernelFactory::Instance().ContainsOperation(op_type.c_str())) {
+        !pt::KernelFactory::Instance().ContainsKernel(op_type.c_str())) {
       continue;
     }
 
diff --git a/paddle/tcmpt/CMakeLists.txt b/paddle/tcmpt/CMakeLists.txt
index 63f5c1b312e32..33fd0be0f374d 100644
--- a/paddle/tcmpt/CMakeLists.txt
+++ b/paddle/tcmpt/CMakeLists.txt
@@ -2,24 +2,30 @@
 add_subdirectory(api)
 # tcmpt core components
 add_subdirectory(core)
+# tcmpt eigne functors, now paddle must compiled with eigen, but eigen just is
+# one backend dtype, we should support cropping it for lite
+add_subdirectory(eigen)
 # tcmpt kernels for diff device
 add_subdirectory(cpu)
 if(WITH_GPU OR WITH_ROCM)
+  # TODO(chenweihang): if hip can split from cuda impl, we should add hip dir
   add_subdirectory(cuda)
 endif()
-# TODO(chenweihang): if hip can split from cuda impl, we should add hip dir
+# TODO(chenweihang): migrate MKLDNN Kernel in the second phase of the project
 if(WITH_MKLDNN)
   add_subdirectory(mkldnn)
 endif()
+# TODO(chenweihang): migrate NPU Kernel in the second phase of the project
 if(WITH_ASCEND_CL)
   add_subdirectory(npu)
 endif()
+# TODO(chenweihang): migrate XPU Kernel in the second phase of the project
 if(WITH_XPU)
   add_subdirectory(xpu)
 endif()
 # tcmpt infershape
 add_subdirectory(infershape)
-# tcmpt public functors
+# TODO(xingfeng): tcmpt inner module API designed by a high-performance team
 add_subdirectory(module)
 # tcmpt tests
 add_subdirectory(tests)
diff --git a/paddle/tcmpt/api/CMakeLists.txt b/paddle/tcmpt/api/CMakeLists.txt
index ba29c5d9e1b2f..26aed55eee21c 100644
--- a/paddle/tcmpt/api/CMakeLists.txt
+++ b/paddle/tcmpt/api/CMakeLists.txt
@@ -2,14 +2,8 @@ add_subdirectory(src)
 
 set(TCMPT_DEPS convert_utils dense_tensor selected_rows_tensor kernel_factory kernel_context)
 set(TCMPT_DEPS ${TCMPT_DEPS} math_cpu)
-if(WITH_MKLDNN)
-  set(TCMPT_DEPS ${TCMPT_DEPS} math_mkldnn)
-endif()
 if(WITH_GPU OR WITH_ROCM)
   set(TCMPT_DEPS ${TCMPT_DEPS} math_cuda)
 endif()
-if(WITH_XPU)
-  set(TCMPT_DEPS ${TCMPT_DEPS} math_xpu)
-endif()
 
 cc_library(tcmpt SRCS all.cc DEPS ${TCMPT_DEPS})
diff --git a/paddle/tcmpt/api/include/dev/math.h b/paddle/tcmpt/api/include/dev/math.h
index bc498f8382853..2f1a04d16f8ac 100644
--- a/paddle/tcmpt/api/include/dev/math.h
+++ b/paddle/tcmpt/api/include/dev/math.h
@@ -17,6 +17,3 @@ limitations under the License. */
 // See Note: [ How do we organize the kernel directory ]
 #include "paddle/tcmpt/cpu/math.h"
 #include "paddle/tcmpt/cuda/math.h"
-#include "paddle/tcmpt/mkldnn/math.h"
-#include "paddle/tcmpt/npu/math.h"
-#include "paddle/tcmpt/xpu/math.h"
diff --git a/paddle/tcmpt/api/include/tensor.h b/paddle/tcmpt/api/include/tensor.h
index 6029f87b5c4a4..79d2183ee58b3 100644
--- a/paddle/tcmpt/api/include/tensor.h
+++ b/paddle/tcmpt/api/include/tensor.h
@@ -24,12 +24,12 @@ limitations under the License. */
  * [ Why still include the fluid headers? ]
  *
  * We hope to organize the basic implementation of Tensor and the logic related
- * to Tensor operation into an independent library, which we call
- * [Tensor Operation Library, tcmpt], so we extract or rewrite the original
- * OpKernels.
+ * to Tensor computation into an independent library, which we call
+ * [Tensor Compute Library, tcmpt], so we extract or rewrite the original
+ * Kernels.
  *
  * In the future, the training library, inference library and custom operators
- * will link to this Tensor operation library.
+ * will link to this Tensor Compute library.
  *
  * However, if we directly split the link relation, we need to make too many
  * changes, which will affect the stability of the framework, so here we still
@@ -54,12 +54,12 @@ class AutogradMetaInterface {
 
 /**
  * Tensor is the API description of the basic data structure in the
- * [ Paddle "Tensor OPeration (tcmpt)" Library ].
+ * [ Paddle "Tensor CoMPuTe (tcmpt)" Library ].
  *
  * It is not limited to a simple n-dimensional array.
  * It contains a smart pointer to `TensorImpl`. The data description contained
  * in Tensor is defined by TensorImpl. Tensor only defines the interface for
- * operation.
+ * computation.
  *
  * This is a new Tensor design, which is independent of the original
  * framework::Tensor in fluid. The original Tensor will be gradually discarded
diff --git a/paddle/tcmpt/core/dtype.h b/paddle/tcmpt/core/dtype.h
index 0683fd5fe467c..d7a0b3c007db4 100644
--- a/paddle/tcmpt/core/dtype.h
+++ b/paddle/tcmpt/core/dtype.h
@@ -36,8 +36,8 @@ using bfloat16 = paddle::platform::bfloat16;
  *
  * We need to ensure that the operator library is relatively independent
  * and does not depend on the framework. Therefore, before calling the kernel
- * in the Tensor operation library inside the framework, the internal
- * data type needs to be converted to the data type in the Tensor operation
+ * in the Tensor Compute library inside the framework, the internal
+ * data type needs to be converted to the data type in the Tensor Compute
  * library.
  *
  */
diff --git a/paddle/tcmpt/core/kernel_context.h b/paddle/tcmpt/core/kernel_context.h
index e7815f3ab5ae8..4f2f4e121f014 100644
--- a/paddle/tcmpt/core/kernel_context.h
+++ b/paddle/tcmpt/core/kernel_context.h
@@ -28,19 +28,19 @@ namespace pt {
 using DeviceContext = paddle::platform::DeviceContext;
 
 /**
- * Note: OpKernelContext doesn't manage the life if DeviceContext and Tensor
+ * Note: KernelContext doesn't manage the life if DeviceContext and Tensor
  *
- * Note: OpKernelContext does not couple the concept of framework,
+ * Note: KernelContext does not couple the concept of framework,
  *       its constructor can only take the members it needs as parameters,
  *       not Scope, RuntimeContext, etc. as parameters
  */
-class OpKernelContext {
+class KernelContext {
  public:
-  explicit OpKernelContext(const DeviceContext& dev_ctx) : dev_ctx_(dev_ctx) {}
-  OpKernelContext(const DeviceContext& dev_ctx,
-                  const std::vector<std::shared_ptr<TensorInterface>>& inputs,
-                  const std::vector<std::shared_ptr<TensorInterface>>& outputs,
-                  const std::vector<paddle::any>& attrs)
+  explicit KernelContext(const DeviceContext& dev_ctx) : dev_ctx_(dev_ctx) {}
+  KernelContext(const DeviceContext& dev_ctx,
+                const std::vector<std::shared_ptr<TensorInterface>>& inputs,
+                const std::vector<std::shared_ptr<TensorInterface>>& outputs,
+                const std::vector<paddle::any>& attrs)
       : dev_ctx_(dev_ctx), inputs_(inputs), outputs_(outputs), attrs_(attrs) {}
 
   template <typename CtxType>
diff --git a/paddle/tcmpt/core/kernel_def.h b/paddle/tcmpt/core/kernel_def.h
index 282e9ded2e4d1..e9069742844af 100644
--- a/paddle/tcmpt/core/kernel_def.h
+++ b/paddle/tcmpt/core/kernel_def.h
@@ -16,10 +16,10 @@
 
 namespace pt {
 
-class OpKernel;
-class OpKernelContext;
+class Kernel;
+class KernelContext;
 
-using OpKernelFn = void (*)(OpKernelContext* ctx);
-using OpKernelParamDefFn = void (*)(OpKernel* kernel);
+using KernelFn = void (*)(KernelContext* ctx);
+using KernelParamDefFn = void (*)(Kernel* kernel);
 
 }  // namespace pt
diff --git a/paddle/tcmpt/core/kernel_factory.cc b/paddle/tcmpt/core/kernel_factory.cc
index 6b2ea66f710d3..25696c8d8ff11 100644
--- a/paddle/tcmpt/core/kernel_factory.cc
+++ b/paddle/tcmpt/core/kernel_factory.cc
@@ -19,44 +19,44 @@
 
 namespace pt {
 
-OpKernelFactory& OpKernelFactory::Instance() {
-  static OpKernelFactory g_op_kernel_factory;
+KernelFactory& KernelFactory::Instance() {
+  static KernelFactory g_op_kernel_factory;
   return g_op_kernel_factory;
 }
 
-bool OpKernelFactory::ContainsOperation(const char* op_type) const {
-  auto iter = kernels_.find(OperationName(op_type));
+bool KernelFactory::ContainsKernel(const char* kernel_name) const {
+  auto iter = kernels_.find(KernelName(kernel_name));
   return (iter != kernels_.end());
 }
 
-const OpKernel& OpKernelFactory::SelectKernel(
-    const OperationName& op_name, const OpKernelKey& kernel_key) const {
-  auto iter = kernels_.find(op_name);
+const Kernel& KernelFactory::SelectKernel(const KernelName& kernel_name,
+                                          const KernelKey& kernel_key) const {
+  auto iter = kernels_.find(kernel_name);
   PADDLE_ENFORCE_NE(iter,
                     kernels_.end(),
                     paddle::platform::errors::NotFound(
-                        "The operation `%s` is not registered.", op_name));
+                        "The kernel `%s` is not registered.", kernel_name));
 
   auto kernel_iter = iter->second.find(kernel_key);
   PADDLE_ENFORCE_NE(
       kernel_iter,
       iter->second.end(),
       paddle::platform::errors::NotFound(
-          "The kernel with key %s of operation `%s` is not registered.",
+          "The kernel with key %s of kernel `%s` is not registered.",
           kernel_key,
-          op_name));
+          kernel_name));
 
   return kernel_iter->second;
 }
 
-const OpKernel& OpKernelFactory::SelectKernel(const OperationName& op_name,
-                                              Backend backend,
-                                              DataLayout layout,
-                                              DataType dtype) const {
-  return SelectKernel(op_name, OpKernelKey(backend, layout, dtype));
+const Kernel& KernelFactory::SelectKernel(const KernelName& kernel_name,
+                                          Backend backend,
+                                          DataLayout layout,
+                                          DataType dtype) const {
+  return SelectKernel(kernel_name, KernelKey(backend, layout, dtype));
 }
 
-std::ostream& operator<<(std::ostream& os, OpKernelFactory& kernel_factory) {
+std::ostream& operator<<(std::ostream& os, KernelFactory& kernel_factory) {
   for (const auto& op_kernel_pair : kernel_factory.kernels()) {
     os << "- op: " << op_kernel_pair.first << "\n";
     for (const auto& kernel_pair : op_kernel_pair.second) {
diff --git a/paddle/tcmpt/core/kernel_factory.h b/paddle/tcmpt/core/kernel_factory.h
index d806f6c2b5e6c..fd3ef051b02db 100644
--- a/paddle/tcmpt/core/kernel_factory.h
+++ b/paddle/tcmpt/core/kernel_factory.h
@@ -32,73 +32,73 @@ namespace pt {
 /**
  * [ Naming considerations ]
  *
- * The tensor operation library contains many operations, and the operation
- * in each specific scenario is represented by an operation kernel.
+ * The tensor Compute library contains many kernels, and the computation
+ * in each specific scenario is represented by an kernel.
  *
- * We directly named it `Kernel` instead of `OpKernel`, the tensor operation
+ * We directly named it `Kernel` instead of `Kernel`, the tensor Compute
  * library here and fluid are independent, avoiding developers from
  * misunderstanding the relationship between the two concepts.
  */
 
-class OpKernelContext;
+class KernelContext;
 
-using OpKernelFn = void (*)(OpKernelContext* ctx);
+using KernelFn = void (*)(KernelContext* ctx);
 
-struct OperationName final {
+struct KernelName final {
   // TODO(chenweihang): use string_view later?
-  std::string op_type;
-  std::string overload_type;
+  std::string name;
+  std::string overload_name;
   // Avoid calculating Hash value at runtime
   size_t hash_value;
 
-  OperationName(std::string op_type, std::string overload_type)
-      : op_type(std::move(op_type)), overload_type(std::move(overload_type)) {
-    hash_value = std::hash<std::string>()(op_type) ^
-                 (std::hash<std::string>()(overload_type) << 1);
+  KernelName(std::string name, std::string overload_name)
+      : name(std::move(name)), overload_name(std::move(overload_name)) {
+    hash_value = std::hash<std::string>()(name) ^
+                 (std::hash<std::string>()(overload_name) << 1);
   }
 
-  OperationName(const char* op_name) {
-    std::string op_name_str(op_name);
-    size_t pos = op_name_str.find_first_of('.');
+  KernelName(const char* kernel_name) {
+    std::string kernel_name_str(kernel_name);
+    size_t pos = kernel_name_str.find_first_of('.');
     if (pos == std::string::npos) {
-      op_type = op_name_str;
-      overload_type = "";
+      name = kernel_name_str;
+      overload_name = "";
     } else {
-      op_type = op_name_str.substr(0, pos);
-      PADDLE_ENFORCE_EQ(op_name_str.find('.', pos + 1),
+      name = kernel_name_str.substr(0, pos);
+      PADDLE_ENFORCE_EQ(kernel_name_str.find('.', pos + 1),
                         std::string::npos,
                         paddle::platform::errors::InvalidArgument(
-                            "OperationName only can contains one '.'."));
-      overload_type = op_name_str.substr(pos + 1, op_name_str.size());
+                            "KernelName only can contains one '.'."));
+      overload_name = kernel_name_str.substr(pos + 1, kernel_name_str.size());
     }
-    hash_value = std::hash<std::string>()(op_type) ^
-                 (std::hash<std::string>()(overload_type) << 1);
+    hash_value = std::hash<std::string>()(name) ^
+                 (std::hash<std::string>()(overload_name) << 1);
   }
 
   struct Hash {
-    size_t operator()(const OperationName& op_name) const {
-      return op_name.hash_value;
+    size_t operator()(const KernelName& kernel_name) const {
+      return kernel_name.hash_value;
     }
   };
 
-  bool operator<(const OperationName& op_name) const {
-    return hash_value < op_name.hash_value;
+  bool operator<(const KernelName& kernel_name) const {
+    return hash_value < kernel_name.hash_value;
   }
 
-  bool operator==(const OperationName& op_name) const {
-    return hash_value == op_name.hash_value;
+  bool operator==(const KernelName& kernel_name) const {
+    return hash_value == kernel_name.hash_value;
   }
 
-  bool operator!=(const OperationName& op_name) const {
-    return hash_value != op_name.hash_value;
+  bool operator!=(const KernelName& kernel_name) const {
+    return hash_value != kernel_name.hash_value;
   }
 };
 
-class OpKernelKey {
+class KernelKey {
  public:
-  OpKernelKey() = default;
+  KernelKey() = default;
 
-  OpKernelKey(Backend backend, DataLayout layout, DataType dtype)
+  KernelKey(Backend backend, DataLayout layout, DataType dtype)
       : backend_(backend), layout_(layout), dtype_(dtype) {
     // |----31-20------|---19-12---|---11-8----|---7-0---|
     // | For extension | DataType | DataLayout | Backend |
@@ -116,22 +116,20 @@ class OpKernelKey {
 
   uint32_t hash_value() const { return hash_value_; }
 
-  bool operator<(const OpKernelKey& key) const {
+  bool operator<(const KernelKey& key) const {
     return hash_value_ < key.hash_value();
   }
 
-  bool operator==(const OpKernelKey& key) const {
+  bool operator==(const KernelKey& key) const {
     return hash_value_ == key.hash_value();
   }
 
-  bool operator!=(const OpKernelKey& key) const {
+  bool operator!=(const KernelKey& key) const {
     return hash_value_ != key.hash_value();
   }
 
   struct Hash {
-    uint32_t operator()(const OpKernelKey& key) const {
-      return key.hash_value();
-    }
+    uint32_t operator()(const KernelKey& key) const { return key.hash_value(); }
   };
 
  private:
@@ -161,9 +159,9 @@ struct ParamDef {
       : backend(backend), layout(layout), dtype(dtype) {}
 };
 
-class OpKernelParamDef {
+class KernelParamDef {
  public:
-  OpKernelParamDef() = default;
+  KernelParamDef() = default;
 
   void AppendInput(Backend backend, DataLayout layout, DataType dtype) {
     input_defs_.emplace_back(ParamDef(backend, layout, dtype));
@@ -183,77 +181,76 @@ class OpKernelParamDef {
   std::vector<ParamDef> output_defs_{{}};
 };
 
-class OpKernel {
+class Kernel {
  public:
   // for map element contruct
-  OpKernel() = default;
+  Kernel() = default;
 
-  explicit OpKernel(OpKernelFn fn) : fn_(fn) {}
+  explicit Kernel(KernelFn fn) : fn_(fn) {}
 
-  void operator()(OpKernelContext* ctx) const { fn_(ctx); }
+  void operator()(KernelContext* ctx) const { fn_(ctx); }
 
-  OpKernelParamDef* mutable_param_def() { return &param_def_; }
+  KernelParamDef* mutable_param_def() { return &param_def_; }
 
-  const OpKernelParamDef& param_def() const { return param_def_; }
+  const KernelParamDef& param_def() const { return param_def_; }
 
  private:
-  OpKernelFn fn_{nullptr};
-  OpKernelParamDef param_def_;
+  KernelFn fn_{nullptr};
+  KernelParamDef param_def_;
 };
 
 /**
- * Note: Each Operation need a basic kernel map that named by op_type.
- *       Such as for scale op, OpKernelMap contains a `scale` kernel map,
+ * Note: Each Computation need a basic kernel map that named by kernel_name.
+ *       Such as for scale op, KernelMap contains a `scale` kernel map,
  *       if it still need other overload kernel, the op name can be
  *       `scale.***`.
  */
-class OpKernelFactory {
+class KernelFactory {
  public:
   // replaced by paddle::flat_hash_map later
-  using OpKernelMap = std::unordered_map<
-      OperationName,
-      std::unordered_map<OpKernelKey, OpKernel, OpKernelKey::Hash>,
-      OperationName::Hash>;
+  using KernelMap =
+      std::unordered_map<KernelName,
+                         std::unordered_map<KernelKey, Kernel, KernelKey::Hash>,
+                         KernelName::Hash>;
 
-  static OpKernelFactory& Instance();
+  static KernelFactory& Instance();
 
-  OpKernelMap& kernels() { return kernels_; }
+  KernelMap& kernels() { return kernels_; }
 
-  bool ContainsOperation(const char* op_type) const;
+  bool ContainsKernel(const char* name) const;
 
-  const OpKernel& SelectKernel(const OperationName& op_name,
-                               const OpKernelKey& kernel_key) const;
+  const Kernel& SelectKernel(const KernelName& kernel_name,
+                             const KernelKey& kernel_key) const;
 
-  const OpKernel& SelectKernel(const OperationName& op_name,
-                               Backend backend,
-                               DataLayout layout,
-                               DataType dtype) const;
+  const Kernel& SelectKernel(const KernelName& kernel_name,
+                             Backend backend,
+                             DataLayout layout,
+                             DataType dtype) const;
 
  private:
-  OpKernelFactory() = default;
+  KernelFactory() = default;
 
-  OpKernelMap kernels_;
+  KernelMap kernels_;
 };
 
 /** operator << overload **/
 
 inline std::ostream& operator<<(std::ostream& os,
-                                const OperationName& op_name) {
-  if (op_name.overload_type.empty()) {
-    os << op_name.op_type;
+                                const KernelName& kernel_name) {
+  if (kernel_name.overload_name.empty()) {
+    os << kernel_name.name;
   } else {
-    os << op_name.op_type << "." << op_name.overload_type;
+    os << kernel_name.name << "." << kernel_name.overload_name;
   }
   return os;
 }
 
-inline std::ostream& operator<<(std::ostream& os,
-                                const OpKernelKey& kernel_key) {
+inline std::ostream& operator<<(std::ostream& os, const KernelKey& kernel_key) {
   os << "(" << kernel_key.backend() << ", " << kernel_key.layout() << ", "
      << kernel_key.dtype() << ")";
   return os;
 }
 
-std::ostream& operator<<(std::ostream& os, OpKernelFactory& kernel_factory);
+std::ostream& operator<<(std::ostream& os, KernelFactory& kernel_factory);
 
 }  // namespace pt
diff --git a/paddle/tcmpt/core/kernel_registry.h b/paddle/tcmpt/core/kernel_registry.h
index 1a403bf99f38e..448f5b8dbc5d0 100644
--- a/paddle/tcmpt/core/kernel_registry.h
+++ b/paddle/tcmpt/core/kernel_registry.h
@@ -24,24 +24,24 @@ namespace pt {
 #define DATALAYOUT(arg__) pt::DataLayout::k##arg__
 #define DATATYPE(arg__) pt::DataType::k##arg__
 
-class OpKernelRegistrar {
+class KernelRegistrar {
  public:
-  OpKernelRegistrar(const char* op_name,
-                    Backend backend,
-                    DataLayout layout,
-                    DataType dtype,
-                    OpKernelParamDefFn param_def_fn,
-                    OpKernelFn kernel_fn) {
-    OperationName final_op_name(op_name);
-    OpKernelKey op_kernel_key(backend, layout, dtype);
-    OpKernel kernel(kernel_fn);
+  KernelRegistrar(const char* kernel_name,
+                  Backend backend,
+                  DataLayout layout,
+                  DataType dtype,
+                  KernelParamDefFn param_def_fn,
+                  KernelFn kernel_fn) {
+    KernelName final_kernel_name(kernel_name);
+    KernelKey op_kernel_key(backend, layout, dtype);
+    Kernel kernel(kernel_fn);
     param_def_fn(&kernel);
 
     // TODO(chenweihang): use default input and output for verify
     kernel.mutable_param_def()->AppendInput(backend, layout, dtype);
     kernel.mutable_param_def()->AppendOutput(backend, layout, dtype);
 
-    OpKernelFactory::Instance().kernels()[final_op_name][op_kernel_key] =
+    KernelFactory::Instance().kernels()[final_kernel_name][op_kernel_key] =
         kernel;
   }
 };
@@ -79,30 +79,34 @@ class OpKernelRegistrar {
 #define _PT_ARG_N(_1, _2, _3, _4, _5, _6, _7, _8, N, ...) N
 #define _PT_RESQ_N() 8, 7, 6, 5, 4, 3, 2, 1, 0
 
-#define PT_REGISTER_KERNEL(                                   \
-    op_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \
-  _PT_REGISTER_KERNEL(                                        \
-      op_name, PT_ID, backend, layout, meta_kernel_fn, cpp_dtype, __VA_ARGS__)
+#define PT_REGISTER_KERNEL(                                       \
+    kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \
+  _PT_REGISTER_KERNEL(kernel_name,                                \
+                      PT_ID,                                      \
+                      backend,                                    \
+                      layout,                                     \
+                      meta_kernel_fn,                             \
+                      cpp_dtype,                                  \
+                      __VA_ARGS__)
 
-#define _PT_REGISTER_KERNEL(                                           \
-    op_name, func_id, backend, layout, meta_kernel_fn, cpp_dtype, ...) \
-  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                   \
-      PT_CONCATENATE(pt_op_kernel_ns_check_, func_id),                 \
-      "PT_REGISTER_KERNEL must be called in global namespace.");       \
-  PT_KERNEL_SPECIALIZE(meta_kernel_fn, cpp_dtype, __VA_ARGS__);        \
-  static void PT_CONCATENATE(__PT_KERNEL_PARAM_DEF_FN_,                \
-                             func_id)(::pt::OpKernel*);                \
-  PT_KERNEL_REGISTRAR_INIT(                                            \
-      op_name,                                                         \
-      func_id,                                                         \
-      backend,                                                         \
-      layout,                                                          \
-      &PT_CONCATENATE(__PT_KERNEL_PARAM_DEF_FN_, func_id),             \
-      meta_kernel_fn,                                                  \
-      cpp_dtype,                                                       \
-      __VA_ARGS__);                                                    \
-  void PT_CONCATENATE(__PT_KERNEL_PARAM_DEF_FN_,                       \
-                      func_id)(::pt::OpKernel * kernel)
+#define _PT_REGISTER_KERNEL(                                               \
+    kernel_name, func_id, backend, layout, meta_kernel_fn, cpp_dtype, ...) \
+  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                       \
+      PT_CONCATENATE(pt_op_kernel_ns_check_, func_id),                     \
+      "PT_REGISTER_KERNEL must be called in global namespace.");           \
+  PT_KERNEL_SPECIALIZE(meta_kernel_fn, cpp_dtype, __VA_ARGS__);            \
+  static void PT_CONCATENATE(__PT_KERNEL_PARAM_DEF_FN_,                    \
+                             func_id)(::pt::Kernel*);                      \
+  PT_KERNEL_REGISTRAR_INIT(                                                \
+      kernel_name,                                                         \
+      func_id,                                                             \
+      backend,                                                             \
+      layout,                                                              \
+      &PT_CONCATENATE(__PT_KERNEL_PARAM_DEF_FN_, func_id),                 \
+      meta_kernel_fn,                                                      \
+      cpp_dtype,                                                           \
+      __VA_ARGS__);                                                        \
+  void PT_CONCATENATE(__PT_KERNEL_PARAM_DEF_FN_, func_id)(::pt::Kernel * kernel)
 
 #define PT_KERNEL_SPECIALIZE(meta_kernel_fn, cpp_dtype, ...) \
   _PT_KERNEL_SPECIALIZE(PT_NARGS(cpp_dtype, __VA_ARGS__),    \
@@ -138,7 +142,7 @@ class OpKernelRegistrar {
   template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>; \
   _PT_KERNEL_SPECIALIZE_7(meta_kernel_fn, __VA_ARGS__)
 
-#define PT_KERNEL_REGISTRAR_INIT(op_name,                     \
+#define PT_KERNEL_REGISTRAR_INIT(kernel_name,                 \
                                  func_id,                     \
                                  backend,                     \
                                  layout,                      \
@@ -147,7 +151,7 @@ class OpKernelRegistrar {
                                  cpp_dtype,                   \
                                  ...)                         \
   _PT_KERNEL_REGISTRAR_INIT(PT_NARGS(cpp_dtype, __VA_ARGS__), \
-                            op_name,                          \
+                            kernel_name,                      \
                             func_id,                          \
                             backend,                          \
                             layout,                           \
@@ -157,7 +161,7 @@ class OpKernelRegistrar {
                             __VA_ARGS__)
 
 #define _PT_KERNEL_REGISTRAR_INIT(N,              \
-                                  op_name,        \
+                                  kernel_name,    \
                                   func_id,        \
                                   backend,        \
                                   layout,         \
@@ -166,7 +170,7 @@ class OpKernelRegistrar {
                                   cpp_dtype,      \
                                   ...)            \
   PT_CONCATENATE(_PT_KERNEL_REGISTRAR_INIT_, N)   \
-  (op_name,                                       \
+  (kernel_name,                                   \
    func_id,                                       \
    PT_ID,                                         \
    backend,                                       \
@@ -176,235 +180,235 @@ class OpKernelRegistrar {
    cpp_dtype,                                     \
    __VA_ARGS__)
 
-#define _PT_KERNEL_REGISTRAR_INIT_1(op_name,           \
-                                    func_id,           \
-                                    registrar_id,      \
-                                    backend,           \
-                                    layout,            \
-                                    param_def_fn,      \
-                                    meta_kernel_fn,    \
-                                    cpp_dtype,         \
-                                    ...)               \
-  static const ::pt::OpKernelRegistrar PT_CONCATENATE( \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)(  \
-      op_name,                                         \
-      BACKEND(backend),                                \
-      DATALAYOUT(layout),                              \
-      ::pt::CppTypeToDataType<cpp_dtype>::Type(),      \
-      param_def_fn,                                    \
+#define _PT_KERNEL_REGISTRAR_INIT_1(kernel_name,      \
+                                    func_id,          \
+                                    registrar_id,     \
+                                    backend,          \
+                                    layout,           \
+                                    param_def_fn,     \
+                                    meta_kernel_fn,   \
+                                    cpp_dtype,        \
+                                    ...)              \
+  static const ::pt::KernelRegistrar PT_CONCATENATE(  \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)( \
+      kernel_name,                                    \
+      BACKEND(backend),                               \
+      DATALAYOUT(layout),                             \
+      ::pt::CppTypeToDataType<cpp_dtype>::Type(),     \
+      param_def_fn,                                   \
       PT_KERNEL(meta_kernel_fn<cpp_dtype>));
-#define _PT_KERNEL_REGISTRAR_INIT_2(op_name,           \
-                                    func_id,           \
-                                    registrar_id,      \
-                                    backend,           \
-                                    layout,            \
-                                    param_def_fn,      \
-                                    meta_kernel_fn,    \
-                                    cpp_dtype,         \
-                                    ...)               \
-  static const ::pt::OpKernelRegistrar PT_CONCATENATE( \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)(  \
-      op_name,                                         \
-      BACKEND(backend),                                \
-      DATALAYOUT(layout),                              \
-      ::pt::CppTypeToDataType<cpp_dtype>::Type(),      \
-      param_def_fn,                                    \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype>));           \
-  _PT_KERNEL_REGISTRAR_INIT_1(op_name,                 \
-                              func_id,                 \
-                              PT_ID,                   \
-                              backend,                 \
-                              layout,                  \
-                              param_def_fn,            \
-                              meta_kernel_fn,          \
+#define _PT_KERNEL_REGISTRAR_INIT_2(kernel_name,      \
+                                    func_id,          \
+                                    registrar_id,     \
+                                    backend,          \
+                                    layout,           \
+                                    param_def_fn,     \
+                                    meta_kernel_fn,   \
+                                    cpp_dtype,        \
+                                    ...)              \
+  static const ::pt::KernelRegistrar PT_CONCATENATE(  \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)( \
+      kernel_name,                                    \
+      BACKEND(backend),                               \
+      DATALAYOUT(layout),                             \
+      ::pt::CppTypeToDataType<cpp_dtype>::Type(),     \
+      param_def_fn,                                   \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));          \
+  _PT_KERNEL_REGISTRAR_INIT_1(kernel_name,            \
+                              func_id,                \
+                              PT_ID,                  \
+                              backend,                \
+                              layout,                 \
+                              param_def_fn,           \
+                              meta_kernel_fn,         \
                               __VA_ARGS__)
-#define _PT_KERNEL_REGISTRAR_INIT_3(op_name,           \
-                                    func_id,           \
-                                    registrar_id,      \
-                                    backend,           \
-                                    layout,            \
-                                    param_def_fn,      \
-                                    meta_kernel_fn,    \
-                                    cpp_dtype,         \
-                                    ...)               \
-  static const ::pt::OpKernelRegistrar PT_CONCATENATE( \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)(  \
-      op_name,                                         \
-      BACKEND(backend),                                \
-      DATALAYOUT(layout),                              \
-      ::pt::CppTypeToDataType<cpp_dtype>::Type(),      \
-      param_def_fn,                                    \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype>));           \
-  _PT_KERNEL_REGISTRAR_INIT_2(op_name,                 \
-                              func_id,                 \
-                              PT_ID,                   \
-                              backend,                 \
-                              layout,                  \
-                              param_def_fn,            \
-                              meta_kernel_fn,          \
+#define _PT_KERNEL_REGISTRAR_INIT_3(kernel_name,      \
+                                    func_id,          \
+                                    registrar_id,     \
+                                    backend,          \
+                                    layout,           \
+                                    param_def_fn,     \
+                                    meta_kernel_fn,   \
+                                    cpp_dtype,        \
+                                    ...)              \
+  static const ::pt::KernelRegistrar PT_CONCATENATE(  \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)( \
+      kernel_name,                                    \
+      BACKEND(backend),                               \
+      DATALAYOUT(layout),                             \
+      ::pt::CppTypeToDataType<cpp_dtype>::Type(),     \
+      param_def_fn,                                   \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));          \
+  _PT_KERNEL_REGISTRAR_INIT_2(kernel_name,            \
+                              func_id,                \
+                              PT_ID,                  \
+                              backend,                \
+                              layout,                 \
+                              param_def_fn,           \
+                              meta_kernel_fn,         \
                               __VA_ARGS__)
-#define _PT_KERNEL_REGISTRAR_INIT_4(op_name,           \
-                                    func_id,           \
-                                    registrar_id,      \
-                                    backend,           \
-                                    layout,            \
-                                    param_def_fn,      \
-                                    meta_kernel_fn,    \
-                                    cpp_dtype,         \
-                                    ...)               \
-  static const ::pt::OpKernelRegistrar PT_CONCATENATE( \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)(  \
-      op_name,                                         \
-      BACKEND(backend),                                \
-      DATALAYOUT(layout),                              \
-      ::pt::CppTypeToDataType<cpp_dtype>::Type(),      \
-      param_def_fn,                                    \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype>));           \
-  _PT_KERNEL_REGISTRAR_INIT_3(op_name,                 \
-                              func_id,                 \
-                              PT_ID,                   \
-                              backend,                 \
-                              layout,                  \
-                              param_def_fn,            \
-                              meta_kernel_fn,          \
+#define _PT_KERNEL_REGISTRAR_INIT_4(kernel_name,      \
+                                    func_id,          \
+                                    registrar_id,     \
+                                    backend,          \
+                                    layout,           \
+                                    param_def_fn,     \
+                                    meta_kernel_fn,   \
+                                    cpp_dtype,        \
+                                    ...)              \
+  static const ::pt::KernelRegistrar PT_CONCATENATE(  \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)( \
+      kernel_name,                                    \
+      BACKEND(backend),                               \
+      DATALAYOUT(layout),                             \
+      ::pt::CppTypeToDataType<cpp_dtype>::Type(),     \
+      param_def_fn,                                   \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));          \
+  _PT_KERNEL_REGISTRAR_INIT_3(kernel_name,            \
+                              func_id,                \
+                              PT_ID,                  \
+                              backend,                \
+                              layout,                 \
+                              param_def_fn,           \
+                              meta_kernel_fn,         \
                               __VA_ARGS__)
-#define _PT_KERNEL_REGISTRAR_INIT_5(op_name,           \
-                                    func_id,           \
-                                    registrar_id,      \
-                                    backend,           \
-                                    layout,            \
-                                    param_def_fn,      \
-                                    meta_kernel_fn,    \
-                                    cpp_dtype,         \
-                                    ...)               \
-  static const ::pt::OpKernelRegistrar PT_CONCATENATE( \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)(  \
-      op_name,                                         \
-      BACKEND(backend),                                \
-      DATALAYOUT(layout),                              \
-      ::pt::CppTypeToDataType<cpp_dtype>::Type(),      \
-      param_def_fn,                                    \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype>));           \
-  _PT_KERNEL_REGISTRAR_INIT_4(op_name,                 \
-                              func_id,                 \
-                              PT_ID,                   \
-                              backend,                 \
-                              layout,                  \
-                              param_def_fn,            \
-                              meta_kernel_fn,          \
+#define _PT_KERNEL_REGISTRAR_INIT_5(kernel_name,      \
+                                    func_id,          \
+                                    registrar_id,     \
+                                    backend,          \
+                                    layout,           \
+                                    param_def_fn,     \
+                                    meta_kernel_fn,   \
+                                    cpp_dtype,        \
+                                    ...)              \
+  static const ::pt::KernelRegistrar PT_CONCATENATE(  \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)( \
+      kernel_name,                                    \
+      BACKEND(backend),                               \
+      DATALAYOUT(layout),                             \
+      ::pt::CppTypeToDataType<cpp_dtype>::Type(),     \
+      param_def_fn,                                   \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));          \
+  _PT_KERNEL_REGISTRAR_INIT_4(kernel_name,            \
+                              func_id,                \
+                              PT_ID,                  \
+                              backend,                \
+                              layout,                 \
+                              param_def_fn,           \
+                              meta_kernel_fn,         \
                               __VA_ARGS__)
-#define _PT_KERNEL_REGISTRAR_INIT_6(op_name,           \
-                                    func_id,           \
-                                    registrar_id,      \
-                                    backend,           \
-                                    layout,            \
-                                    param_def_fn,      \
-                                    meta_kernel_fn,    \
-                                    cpp_dtype,         \
-                                    ...)               \
-  static const ::pt::OpKernelRegistrar PT_CONCATENATE( \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)(  \
-      op_name,                                         \
-      BACKEND(backend),                                \
-      DATALAYOUT(layout),                              \
-      ::pt::CppTypeToDataType<cpp_dtype>::Type(),      \
-      param_def_fn,                                    \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype>));           \
-  _PT_KERNEL_REGISTRAR_INIT_5(op_name,                 \
-                              func_id,                 \
-                              PT_ID,                   \
-                              backend,                 \
-                              layout,                  \
-                              param_def_fn,            \
-                              meta_kernel_fn,          \
+#define _PT_KERNEL_REGISTRAR_INIT_6(kernel_name,      \
+                                    func_id,          \
+                                    registrar_id,     \
+                                    backend,          \
+                                    layout,           \
+                                    param_def_fn,     \
+                                    meta_kernel_fn,   \
+                                    cpp_dtype,        \
+                                    ...)              \
+  static const ::pt::KernelRegistrar PT_CONCATENATE(  \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)( \
+      kernel_name,                                    \
+      BACKEND(backend),                               \
+      DATALAYOUT(layout),                             \
+      ::pt::CppTypeToDataType<cpp_dtype>::Type(),     \
+      param_def_fn,                                   \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));          \
+  _PT_KERNEL_REGISTRAR_INIT_5(kernel_name,            \
+                              func_id,                \
+                              PT_ID,                  \
+                              backend,                \
+                              layout,                 \
+                              param_def_fn,           \
+                              meta_kernel_fn,         \
                               __VA_ARGS__)
-#define _PT_KERNEL_REGISTRAR_INIT_7(op_name,           \
-                                    func_id,           \
-                                    registrar_id,      \
-                                    backend,           \
-                                    layout,            \
-                                    param_def_fn,      \
-                                    meta_kernel_fn,    \
-                                    cpp_dtype,         \
-                                    ...)               \
-  static const ::pt::OpKernelRegistrar PT_CONCATENATE( \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)(  \
-      op_name,                                         \
-      BACKEND(backend),                                \
-      DATALAYOUT(layout),                              \
-      ::pt::CppTypeToDataType<cpp_dtype>::Type(),      \
-      param_def_fn,                                    \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype>));           \
-  _PT_KERNEL_REGISTRAR_INIT_6(op_name,                 \
-                              func_id,                 \
-                              PT_ID,                   \
-                              backend,                 \
-                              layout,                  \
-                              param_def_fn,            \
-                              meta_kernel_fn,          \
+#define _PT_KERNEL_REGISTRAR_INIT_7(kernel_name,      \
+                                    func_id,          \
+                                    registrar_id,     \
+                                    backend,          \
+                                    layout,           \
+                                    param_def_fn,     \
+                                    meta_kernel_fn,   \
+                                    cpp_dtype,        \
+                                    ...)              \
+  static const ::pt::KernelRegistrar PT_CONCATENATE(  \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)( \
+      kernel_name,                                    \
+      BACKEND(backend),                               \
+      DATALAYOUT(layout),                             \
+      ::pt::CppTypeToDataType<cpp_dtype>::Type(),     \
+      param_def_fn,                                   \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));          \
+  _PT_KERNEL_REGISTRAR_INIT_6(kernel_name,            \
+                              func_id,                \
+                              PT_ID,                  \
+                              backend,                \
+                              layout,                 \
+                              param_def_fn,           \
+                              meta_kernel_fn,         \
                               __VA_ARGS__)
-#define _PT_KERNEL_REGISTRAR_INIT_8(op_name,           \
-                                    func_id,           \
-                                    registrar_id,      \
-                                    backend,           \
-                                    layout,            \
-                                    param_def_fn,      \
-                                    meta_kernel_fn,    \
-                                    cpp_dtype,         \
-                                    ...)               \
-  static const ::pt::OpKernelRegistrar PT_CONCATENATE( \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)(  \
-      op_name,                                         \
-      BACKEND(backend),                                \
-      DATALAYOUT(layout),                              \
-      ::pt::CppTypeToDataType<cpp_dtype>::Type(),      \
-      param_def_fn,                                    \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype>));           \
-  _PT_KERNEL_REGISTRAR_INIT_7(op_name,                 \
-                              func_id,                 \
-                              PT_ID,                   \
-                              backend,                 \
-                              layout,                  \
-                              param_def_fn,            \
-                              meta_kernel_fn,          \
+#define _PT_KERNEL_REGISTRAR_INIT_8(kernel_name,      \
+                                    func_id,          \
+                                    registrar_id,     \
+                                    backend,          \
+                                    layout,           \
+                                    param_def_fn,     \
+                                    meta_kernel_fn,   \
+                                    cpp_dtype,        \
+                                    ...)              \
+  static const ::pt::KernelRegistrar PT_CONCATENATE(  \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)( \
+      kernel_name,                                    \
+      BACKEND(backend),                               \
+      DATALAYOUT(layout),                             \
+      ::pt::CppTypeToDataType<cpp_dtype>::Type(),     \
+      param_def_fn,                                   \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));          \
+  _PT_KERNEL_REGISTRAR_INIT_7(kernel_name,            \
+                              func_id,                \
+                              PT_ID,                  \
+                              backend,                \
+                              layout,                 \
+                              param_def_fn,           \
+                              meta_kernel_fn,         \
                               __VA_ARGS__)
 
-#define PT_REGISTER_KERNEL_STANDARD(                                      \
-    op_name, backend, layout, dtype, kernel_fn)                           \
-  template decltype(kernel_fn) kernel_fn;                                 \
-  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                      \
-      __reg_pt_op_kernel_##op_name##_##backend##_##layout##_##dtype##__,  \
-      "PT_REGISTER_KERNEL_STANDARD must be called in global namespace."); \
-  static ::pt::OpKernelRegistrar                                          \
-      __pt_op_kernel_##op_name##_##backend##_##layout##_##dtype##__ =     \
-          ::pt::OpKernelRegistrar(#op_name,                               \
-                                  BACKEND(backend),                       \
-                                  DATALAYOUT(layout),                     \
-                                  DATATYPE(dtype),                        \
-                                  PT_KERNEL(kernel_fn))
+#define PT_REGISTER_KERNEL_STANDARD(                                         \
+    kernel_name, backend, layout, dtype, kernel_fn)                          \
+  template decltype(kernel_fn) kernel_fn;                                    \
+  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                         \
+      __reg_pt_op_kernel_##kernel_name##_##backend##_##layout##_##dtype##__, \
+      "PT_REGISTER_KERNEL_STANDARD must be called in global namespace.");    \
+  static ::pt::KernelRegistrar                                               \
+      __pt_op_kernel_##kernel_name##_##backend##_##layout##_##dtype##__ =    \
+          ::pt::KernelRegistrar(#kernel_name,                                \
+                                BACKEND(backend),                            \
+                                DATALAYOUT(layout),                          \
+                                DATATYPE(dtype),                             \
+                                PT_KERNEL(kernel_fn))
 
-#define PT_REGISTER_KERNEL_AUTO_SPECIALIZE(                               \
-    op_name, backend, layout, meta_kernel_fn, dtype)                      \
-  template decltype(meta_kernel_fn<dtype>) meta_kernel_fn<dtype>;         \
-  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                      \
-      __reg_pt_op_kernel_##op_name##_##backend##_##layout##_##dtype##__,  \
-      "PT_REGISTER_KERNEL_AUTO_SPECIALIZE must be called in global "      \
-      "namespace.");                                                      \
-  static ::pt::OpKernelRegistrar                                          \
-      __pt_op_kernel_##op_name##_##backend##_##layout##_##dtype##__ =     \
-          ::pt::OpKernelRegistrar(#op_name,                               \
-                                  BACKEND(backend),                       \
-                                  DATALAYOUT(layout),                     \
-                                  ::pt::CppTypeToDataType<dtype>::Type(), \
-                                  PT_KERNEL(meta_kernel_fn<dtype>))
+#define PT_REGISTER_KERNEL_AUTO_SPECIALIZE(                                  \
+    kernel_name, backend, layout, meta_kernel_fn, dtype)                     \
+  template decltype(meta_kernel_fn<dtype>) meta_kernel_fn<dtype>;            \
+  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                         \
+      __reg_pt_op_kernel_##kernel_name##_##backend##_##layout##_##dtype##__, \
+      "PT_REGISTER_KERNEL_AUTO_SPECIALIZE must be called in global "         \
+      "namespace.");                                                         \
+  static ::pt::KernelRegistrar                                               \
+      __pt_op_kernel_##kernel_name##_##backend##_##layout##_##dtype##__ =    \
+          ::pt::KernelRegistrar(#kernel_name,                                \
+                                BACKEND(backend),                            \
+                                DATALAYOUT(layout),                          \
+                                ::pt::CppTypeToDataType<dtype>::Type(),      \
+                                PT_KERNEL(meta_kernel_fn<dtype>))
 
-#define PT_TOUCH_KERNEL_REGISTRAR(op_name, backend, layout, dtype)          \
-  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                        \
-      __touch_pt_op_kernel_##op_name##_##backend##_##layout##_##dtype##__,  \
-      "PT_TOUCH_KERNEL_REGISTRAR must be called in global namespace.");     \
-  int TouchOpKernelRegistrar_##op_name##_##backend##_##dtype##_##layout() { \
-    __pt_op_kernel_##op_name##_##backend##_##layout##_##dtype##__.Touch();  \
-    return 0;                                                               \
+#define PT_TOUCH_KERNEL_REGISTRAR(kernel_name, backend, layout, dtype)         \
+  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                           \
+      __touch_pt_op_kernel_##kernel_name##_##backend##_##layout##_##dtype##__, \
+      "PT_TOUCH_KERNEL_REGISTRAR must be called in global namespace.");        \
+  int TouchKernelRegistrar_##kernel_name##_##backend##_##dtype##_##layout() {  \
+    __pt_op_kernel_##kernel_name##_##backend##_##layout##_##dtype##__.Touch(); \
+    return 0;                                                                  \
   }
 
 }  // namespace pt
diff --git a/paddle/tcmpt/core/kernel_utils.h b/paddle/tcmpt/core/kernel_utils.h
index 6ef4877735b52..33702c78f3448 100644
--- a/paddle/tcmpt/core/kernel_utils.h
+++ b/paddle/tcmpt/core/kernel_utils.h
@@ -39,17 +39,17 @@ using XPUContext = paddle::platform::XPUDeviceContext;
 #endif
 
 #define PT_KERNEL(...) \
-  ::pt::OpKernelImpl<decltype(&__VA_ARGS__), &__VA_ARGS__>::Compute
+  ::pt::KernelImpl<decltype(&__VA_ARGS__), &__VA_ARGS__>::Compute
 
-#define PT_SPECIALIZE_OpKernelCallHelper_FOR_DEVICE_CONTEXT(dev_ctx)         \
+#define PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(dev_ctx)           \
   template <typename... Tail>                                                \
-  struct OpKernelCallHelper<const dev_ctx&, Tail...> {                       \
+  struct KernelCallHelper<const dev_ctx&, Tail...> {                         \
     template <int dev_ctx_idx,                                               \
               int in_idx,                                                    \
               int attr_idx,                                                  \
               int out_idx,                                                   \
               typename... PreviousArgs>                                      \
-    static void Compute(OpKernelContext* ctx, PreviousArgs&... pargs) {      \
+    static void Compute(KernelContext* ctx, PreviousArgs&... pargs) {        \
       static_assert(in_idx == 0,                                             \
                     "Kernel's DeviceContext should appear before Inputs.");  \
       static_assert(                                                         \
@@ -58,25 +58,25 @@ using XPUContext = paddle::platform::XPUDeviceContext;
       static_assert(out_idx == 0,                                            \
                     "Kernel's DeviceContext should appear before Outputs."); \
       const dev_ctx& arg = ctx->GetDeviceContext<dev_ctx>();                 \
-      OpKernelCallHelper<Tail...>::                                          \
+      KernelCallHelper<Tail...>::                                            \
           template Compute<dev_ctx_idx + 1, in_idx, attr_idx, out_idx>(      \
               ctx, pargs..., arg);                                           \
     }                                                                        \
   }
 
-#define PT_SPECIALIZE_OpKernelCallHelper_FOR_ATTRIBUTE(attr_type)         \
+#define PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(attr_type)           \
   template <typename... Tail>                                             \
-  struct OpKernelCallHelper<attr_type, Tail...> {                         \
+  struct KernelCallHelper<attr_type, Tail...> {                           \
     template <int dev_ctx_idx,                                            \
               int in_idx,                                                 \
               int attr_idx,                                               \
               int out_idx,                                                \
               typename... PreviousArgs>                                   \
-    static void Compute(OpKernelContext* ctx, PreviousArgs&... pargs) {   \
+    static void Compute(KernelContext* ctx, PreviousArgs&... pargs) {     \
       static_assert(out_idx == 0,                                         \
                     "Kernel's Attributes should appear before Outputs."); \
       attr_type arg = ctx->AttrAt<attr_type>(attr_idx);                   \
-      OpKernelCallHelper<Tail...>::                                       \
+      KernelCallHelper<Tail...>::                                         \
           template Compute<dev_ctx_idx, in_idx, attr_idx + 1, out_idx>(   \
               ctx, pargs..., arg);                                        \
     }                                                                     \
@@ -86,48 +86,47 @@ template <typename T>
 struct TypeTag {};
 
 template <typename Fn, Fn fn>
-struct OpKernelImpl;
+struct KernelImpl;
 
 template <typename Return, typename... Args, Return (*kernel_fn)(Args...)>
-struct OpKernelImpl<Return (*)(Args...), kernel_fn> {
-  static void Compute(OpKernelContext* ctx) {
-    OpKernelCallHelper<Args..., TypeTag<int>>::template Compute<0, 0, 0, 0>(
-        ctx);
+struct KernelImpl<Return (*)(Args...), kernel_fn> {
+  static void Compute(KernelContext* ctx) {
+    KernelCallHelper<Args..., TypeTag<int>>::template Compute<0, 0, 0, 0>(ctx);
   }
 
  private:
   template <typename... RemainingArgs>
-  struct OpKernelCallHelper;
+  struct KernelCallHelper;
 
   /* DeviceContext Helpers */
 
-  PT_SPECIALIZE_OpKernelCallHelper_FOR_DEVICE_CONTEXT(CPUContext);
+  PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(CPUContext);
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  PT_SPECIALIZE_OpKernelCallHelper_FOR_DEVICE_CONTEXT(CUDAContext);
+  PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(CUDAContext);
 #endif
 #ifdef PADDLE_WITH_ASCEND_CL
-  PT_SPECIALIZE_OpKernelCallHelper_FOR_DEVICE_CONTEXT(NPUContext);
+  PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(NPUContext);
 #endif
 #ifdef PADDLE_WITH_XPU
-  PT_SPECIALIZE_OpKernelCallHelper_FOR_DEVICE_CONTEXT(XPUContext);
+  PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(XPUContext);
 #endif
 
   /* Input Helpers */
 
   template <typename... Tail>
-  struct OpKernelCallHelper<const DenseTensor&, Tail...> {
+  struct KernelCallHelper<const DenseTensor&, Tail...> {
     template <int dev_ctx_idx,
               int in_idx,
               int attr_idx,
               int out_idx,
               typename... PreviousArgs>
-    static void Compute(OpKernelContext* ctx, PreviousArgs&... pargs) {
+    static void Compute(KernelContext* ctx, PreviousArgs&... pargs) {
       static_assert(attr_idx == 0,
                     "Kernel's Input should appear before Attributes.");
       static_assert(out_idx == 0,
                     "Kernel's Input should appear before Outputs.");
       const DenseTensor& arg = ctx->InputAt<DenseTensor>(in_idx);
-      OpKernelCallHelper<Tail...>::
+      KernelCallHelper<Tail...>::
           template Compute<dev_ctx_idx, in_idx + 1, attr_idx, out_idx>(
               ctx, pargs..., arg);
     }
@@ -135,21 +134,21 @@ struct OpKernelImpl<Return (*)(Args...), kernel_fn> {
 
   /* Attribute Helpers */
 
-  PT_SPECIALIZE_OpKernelCallHelper_FOR_ATTRIBUTE(bool);
-  PT_SPECIALIZE_OpKernelCallHelper_FOR_ATTRIBUTE(float);
+  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(bool);
+  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(float);
 
   /* Output Helpers */
 
   template <typename... Tail>
-  struct OpKernelCallHelper<DenseTensor*, Tail...> {
+  struct KernelCallHelper<DenseTensor*, Tail...> {
     template <int dev_ctx_idx,
               int in_idx,
               int attr_idx,
               int out_idx,
               typename... PreviousArgs>
-    static void Compute(OpKernelContext* ctx, PreviousArgs&... pargs) {
+    static void Compute(KernelContext* ctx, PreviousArgs&... pargs) {
       DenseTensor* arg = ctx->MutableOutputAt<DenseTensor>(out_idx);
-      OpKernelCallHelper<Tail...>::
+      KernelCallHelper<Tail...>::
           template Compute<dev_ctx_idx, in_idx, attr_idx, out_idx + 1>(
               ctx, pargs..., arg);
     }
@@ -157,9 +156,9 @@ struct OpKernelImpl<Return (*)(Args...), kernel_fn> {
 
   /* End case */
   template <typename T>
-  struct OpKernelCallHelper<TypeTag<T>> {
+  struct KernelCallHelper<TypeTag<T>> {
     template <int dev_ctx_idx, int in_idx, int attr_idx, int out_idx>
-    static void Compute(OpKernelContext* ctx, Args&... args) {
+    static void Compute(KernelContext* ctx, Args&... args) {
       static_assert(dev_ctx_idx > 0,
                     "Kernel should pass DeviceContext as argument.");
       static_assert(out_idx > 0, "Kernel should have output argument.");
diff --git a/paddle/tcmpt/core/layout.h b/paddle/tcmpt/core/layout.h
index 10a7aa1f677c0..6a5cdb1c5e8cd 100644
--- a/paddle/tcmpt/core/layout.h
+++ b/paddle/tcmpt/core/layout.h
@@ -21,8 +21,8 @@ namespace pt {
 /**
  * We need to ensure that the operator library is relatively independent
  * and does not depend on the framework. Therefore, before calling the kernel
- * in the Tensor operation library inside the framework, the internal
- * layout needs to be converted to the data type in the Tensor operation
+ * in the Tensor Compute library inside the framework, the internal
+ * layout needs to be converted to the data type in the Tensor Compute
  * library.
  *
  * Here we also can use the DataLayout in framework, they are all enum classes.
diff --git a/paddle/tcmpt/cpu/math.cc b/paddle/tcmpt/cpu/math.cc
index 7656f88beffc9..8e760f6e11556 100644
--- a/paddle/tcmpt/cpu/math.cc
+++ b/paddle/tcmpt/cpu/math.cc
@@ -14,8 +14,8 @@
 
 #include "paddle/tcmpt/cpu/math.h"
 
-// #include "paddle/tcmpt/module/scale.h"
-// #include "paddle/tcmpt/module/sign.h"
+// #include "paddle/tcmpt/eigen/scale.h"
+// #include "paddle/tcmpt/eigen/sign.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/eigen.h"
diff --git a/paddle/tcmpt/cpu/math.h b/paddle/tcmpt/cpu/math.h
index de9521b54dede..f49848e645d5d 100644
--- a/paddle/tcmpt/cpu/math.h
+++ b/paddle/tcmpt/cpu/math.h
@@ -18,8 +18,8 @@ limitations under the License. */
 #include "paddle/tcmpt/core/kernel_registry.h"
 #include "paddle/tcmpt/core/selected_rows_tensor.h"
 
-#include "paddle/tcmpt/module/scale.h"
-#include "paddle/tcmpt/module/sign.h"
+#include "paddle/tcmpt/eigen/scale.h"
+#include "paddle/tcmpt/eigen/sign.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
diff --git a/paddle/tcmpt/cuda/math.cu b/paddle/tcmpt/cuda/math.cu
index 65d0bdfaa36b9..c62dc41bd6234 100644
--- a/paddle/tcmpt/cuda/math.cu
+++ b/paddle/tcmpt/cuda/math.cu
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #include "paddle/tcmpt/cuda/math.h"
 
-// #include "paddle/tcmpt/module/scale.h"
-// #include "paddle/tcmpt/module/sign.h"
+// #include "paddle/tcmpt/eigen/scale.h"
+// #include "paddle/tcmpt/eigen/sign.h"
 
 #ifdef __NVCC__
 #include "cub/cub.cuh"
diff --git a/paddle/tcmpt/cuda/math.h b/paddle/tcmpt/cuda/math.h
index 9bcb6c9dbf0c8..3e87163f89540 100644
--- a/paddle/tcmpt/cuda/math.h
+++ b/paddle/tcmpt/cuda/math.h
@@ -20,8 +20,8 @@ limitations under the License. */
 #include "paddle/tcmpt/core/dense_tensor.h"
 #include "paddle/tcmpt/core/selected_rows_tensor.h"
 
-#include "paddle/tcmpt/module/scale.h"
-#include "paddle/tcmpt/module/sign.h"
+#include "paddle/tcmpt/eigen/scale.h"
+#include "paddle/tcmpt/eigen/sign.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
diff --git a/paddle/tcmpt/eigen/CMakeLists.txt b/paddle/tcmpt/eigen/CMakeLists.txt
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/paddle/tcmpt/module/scale.h b/paddle/tcmpt/eigen/scale.h
similarity index 100%
rename from paddle/tcmpt/module/scale.h
rename to paddle/tcmpt/eigen/scale.h
diff --git a/paddle/tcmpt/module/sign.h b/paddle/tcmpt/eigen/sign.h
similarity index 100%
rename from paddle/tcmpt/module/sign.h
rename to paddle/tcmpt/eigen/sign.h
diff --git a/paddle/tcmpt/mkldnn/CMakeLists.txt b/paddle/tcmpt/mkldnn/CMakeLists.txt
index d058375874075..e69de29bb2d1d 100644
--- a/paddle/tcmpt/mkldnn/CMakeLists.txt
+++ b/paddle/tcmpt/mkldnn/CMakeLists.txt
@@ -1 +0,0 @@
-cc_library(math_mkldnn SRCS math.cc DEPS dense_tensor kernel_context kernel_factory mkldnn)
diff --git a/paddle/tcmpt/mkldnn/base.h b/paddle/tcmpt/mkldnn/base.h
deleted file mode 100644
index 35acf1f9f6815..0000000000000
--- a/paddle/tcmpt/mkldnn/base.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifdef PADDLE_WITH_MKLDNN
-
-#include "paddle/tcmpt/core/mkldnn_dense_tensor.h"
-
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/platform/mkldnn_reuse.h"
-
-namespace pt {
-
-using MKLDNNDContext = paddle::platform::MKLDNNDeviceContext;
-
-// TODO(chenweihang): the handlers in `mkldnn_reuse.h` are coupled to
-// `ExecutionContext`, refactoring that may be a big project!
-
-template <typename T>
-class ScaleMKLDNNHandler : public paddle::platform::MKLDNNHandlerNoCachingT<
-                               T,
-                               mkldnn::eltwise_forward,
-                               mkldnn::eltwise_backward> {
- public:
-  ScaleMKLDNNHandler(const mkldnn::engine& engine,
-                     const pt::MKLDNNDenseTensor& in_x,
-                     float alpha,
-                     float beta,
-                     bool bias_after_scale)
-      : paddle::platform::MKLDNNHandlerNoCachingT<T,
-                                                  mkldnn::eltwise_forward,
-                                                  mkldnn::eltwise_backward>(
-            engine, in_x.place()) {
-    if (!bias_after_scale) {
-      beta *= alpha;
-    }
-
-    PADDLE_ENFORCE(in_x.dims().size() >= 1 || in_x.dims().size() <= 6,
-                   paddle::platform::errors::Unimplemented(
-                       "Input dimension size can be 1, 2, 3, 4, "
-                       "5, or 6, but now the dimension size is",
-                       in_x.dims().size()));
-
-    auto src_tz = paddle::framework::vectorize<int64_t>(in_x.dims());
-    auto src_fmt =
-        src_tz.size() == 2 ? paddle::MKLDNNMemoryFormat::nc : in_x.format();
-    auto md = mkldnn::memory::desc(
-        src_tz, paddle::platform::MKLDNNGetDataType<T>(), src_fmt);
-
-    this->AcquireForwardPrimitiveDescriptor(mkldnn::prop_kind::forward_training,
-                                            mkldnn::algorithm::eltwise_linear,
-                                            md,
-                                            alpha,
-                                            beta);
-  }
-};
-
-}  // namespace pt
-
-#endif
diff --git a/paddle/tcmpt/mkldnn/math.cc b/paddle/tcmpt/mkldnn/math.cc
deleted file mode 100644
index 6f4cc9f7f6628..0000000000000
--- a/paddle/tcmpt/mkldnn/math.cc
+++ /dev/null
@@ -1,20 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/tcmpt/mkldnn/math.h"
-
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/platform/float16.h"
-
-namespace pt {}  // namespace pt
diff --git a/paddle/tcmpt/mkldnn/math.h b/paddle/tcmpt/mkldnn/math.h
deleted file mode 100644
index 07ac563c2177c..0000000000000
--- a/paddle/tcmpt/mkldnn/math.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifdef PADDLE_WITH_MKLDNN
-
-#include "paddle/tcmpt/core/mkldnn_dense_tensor.h"
-#include "paddle/tcmpt/mkldnn/base.h"
-
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/platform/device_context.h"
-
-namespace pt {
-
-using MKLDNNDContext = paddle::platform::MKLDNNDeviceContext;
-
-template <typename T>
-void Scale(const MKLDNNDContext& dev_ctx,
-           const MKLDNNDenseTensor& x,
-           float scale,
-           float bias,
-           bool bias_after_scale,
-           MKLDNNDenseTensor* out) {
-  const auto mkldnn_engine = dev_ctx.GetEngine();
-
-  ScaleMKLDNNHandler<T> handler(mkldnn_engine,
-                                x,
-                                /*alpha=*/scale,
-                                /*beta=*/bias,
-                                bias_after_scale);
-
-  bool is_inplaced = x.allocation() && x.allocation() == out->allocation();
-
-  auto src_memory_p = handler.AcquireSrcMemory(&x);
-  auto dst_memory_p =
-      is_inplaced ? src_memory_p : handler.AcquireDstMemory(out);
-  auto activation_p = handler.AcquireForwardPrimitive();
-
-  auto& astream = MKLDNNDContext::tls().get_stream();
-  activation_p->execute(
-      astream,
-      {{MKLDNN_ARG_FROM, *src_memory_p}, {MKLDNN_ARG_TO, *dst_memory_p}});
-  astream.wait();
-
-  out->mutable_meta()->layout = DataLayout::kMKLDNN;
-  // TODO(chenweihang): format is also meta info, how to deal with here?
-  out->set_format(paddle::platform::GetMKLDNNFormat(*dst_memory_p));
-}
-
-}  // namespace pt
-
-#endif
diff --git a/paddle/tcmpt/npu/math.h b/paddle/tcmpt/npu/math.h
deleted file mode 100644
index d480bb22e9287..0000000000000
--- a/paddle/tcmpt/npu/math.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifdef PADDLE_WITH_ASCEND_CL
-
-#include "paddle/tcmpt/core/dense_tensor.h"
-
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/operators/npu_op_runner.h"
-#include "paddle/fluid/platform/device_context.h"
-
-namespace pt {
-
-using NPUContext = paddle::platform::NPUDeviceContext;
-
-template <typename T>
-void Mean(const NPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
-  std::vector<int> axes;
-  paddle::framework::NPUAttributeMap attr_input = {{"keep_dims", false},
-                                                   {"axes", axes}};
-  out->mutable_data<T>();
-  const auto& runner =
-      paddle::operators::NpuOpRunner("ReduceMeanD", {x}, {*out}, attr_input);
-  auto stream = dev_ctx.stream();
-  runner.Run(stream);
-}
-
-template <typename T>
-void Scale(const NPUContext& dev_ctx,
-           const DenseTensor& x,
-           float scale,
-           float bias,
-           bool bias_after_scale,
-           DenseTensor* out) {
-  out->mutable_data<T>();
-  auto stream = dev_ctx.stream();
-  float power = 1.0;
-  if (bias_after_scale) {
-    auto runner = paddle::operators::NpuOpRunner(
-        "Power",
-        {x},
-        {*out},
-        {{"power", power}, {"scale", scale}, {"shift", bias}});
-
-    runner.Run(stream);
-  } else {
-    DenseTensor tmp_x(TensorMeta(x.dims(), x.backend(), x.type(), x.layout()),
-                      TensorStatus());
-    tmp_x.mutable_data<T>();
-
-    auto runner_tmp =
-        paddle::operators::NpuOpRunner("Adds", {x}, {tmp_x}, {{"value", bias}});
-    runner_tmp.Run(stream);
-
-    out->mutable_data<T>();
-    float bias = 0.0;
-    auto runner = paddle::operators::NpuOpRunner(
-        "Power",
-        {tmp_x},
-        {*out},
-        {{"power", power}, {"scale", scale}, {"shift", bias}});
-    runner.Run(stream);
-  }
-}
-
-}  // namespace pt
-
-#endif
diff --git a/paddle/tcmpt/tests/kernel_factory_test.cc b/paddle/tcmpt/tests/kernel_factory_test.cc
index f3493ea63d56e..66ce7cd9892ef 100644
--- a/paddle/tcmpt/tests/kernel_factory_test.cc
+++ b/paddle/tcmpt/tests/kernel_factory_test.cc
@@ -16,8 +16,8 @@ limitations under the License. */
 
 #include "gtest/gtest.h"
 
-TEST(OpKernelFactory, OpKernelKey) {
-  pt::OpKernelKey key(
+TEST(KernelFactory, KernelKey) {
+  pt::KernelKey key(
       pt::Backend::kCPU, pt::DataLayout::kNCHW, pt::DataType::kFLOAT32);
   std::cout << key;
 }
diff --git a/paddle/tcmpt/xpu/CMakeLists.txt b/paddle/tcmpt/xpu/CMakeLists.txt
index 26a3758808c74..e69de29bb2d1d 100644
--- a/paddle/tcmpt/xpu/CMakeLists.txt
+++ b/paddle/tcmpt/xpu/CMakeLists.txt
@@ -1 +0,0 @@
-cc_library(math_xpu SRCS math.cc DEPS dense_tensor kernel_context kernel_factory)
diff --git a/paddle/tcmpt/xpu/math.cc b/paddle/tcmpt/xpu/math.cc
deleted file mode 100644
index 57b92da34edee..0000000000000
--- a/paddle/tcmpt/xpu/math.cc
+++ /dev/null
@@ -1,19 +0,0 @@
-//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/tcmpt/xpu/math.h"
-
-#include "paddle/tcmpt/core/kernel_registry.h"
-
-// PT_REGISTER_KERNEL_1T(sign, XPU, NCHW, pt::Sign, float);
diff --git a/paddle/tcmpt/xpu/math.h b/paddle/tcmpt/xpu/math.h
deleted file mode 100644
index ed223c8a71bea..0000000000000
--- a/paddle/tcmpt/xpu/math.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifdef PADDLE_WITH_XPU
-
-#include "paddle/tcmpt/core/dense_tensor.h"
-
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/xpu/xpu_header.h"
-
-namespace pt {
-
-using XPUContext = paddle::platform::XPUDeviceContext;
-
-template <typename T>
-void Sign(const XPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
-  T* out_data = out->mutable_data<T>();
-  auto xpu_ctx = dev_ctx.x_context();
-  int r = xpu::activation_forward(
-      xpu_ctx, xpu::Activation_t::SIGN, x.numel(), x.data<T>(), out_data);
-  PADDLE_ENFORCE_EQ(r,
-                    xpu::Error_t::SUCCESS,
-                    paddle::platform::errors::Fatal("XPU sign kernel error!"));
-}
-
-template <typename T>
-void Mean(const XPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
-  T* out_data = out->mutable_data<T>();
-  auto xpu_ctx = dev_ctx.x_context();
-  const T* x_data = x.data<T>();
-  int r = xpu::mean(xpu_ctx, x_data, out_data, x.numel());
-  PADDLE_ENFORCE_EQ(
-      r,
-      xpu::Error_t::SUCCESS,
-      paddle::platform::errors::External(
-          "XPU kernel error, Mean op execution not succeed, error code=%d", r));
-}
-
-template <typename T>
-void Scale(const XPUContext& dev_ctx,
-           const DenseTensor& x,
-           float scale,
-           float bias,
-           bool bias_after_scale,
-           DenseTensor* out) {
-  T* out_data = out->mutable_data<T>();
-  PADDLE_ENFORCE_EQ(x.dims(),
-                    out->dims(),
-                    paddle::platform::errors::InvalidArgument(
-                        "In and out should have the same dim,"
-                        " expected %s, but got %s.",
-                        x.dims().to_str().c_str(),
-                        out->dims().to_str().c_str()));
-  int r = xpu::scale(dev_ctx.x_context(),
-                     x.data<T>(),
-                     out_data,
-                     x.numel(),
-                     bias_after_scale,
-                     scale,
-                     bias);
-  PADDLE_ENFORCE_EQ(
-      r,
-      XPU_SUCCESS,
-      paddle::platform::errors::External(
-          "XPU scale kernel return wrong value[%d %s]", r, XPUAPIErrorMsg[r]));
-}
-
-}  // namespace pt
-
-#endif

From 321b141d56b0e109e7f87b854bc80f7684688ec4 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Wed, 8 Sep 2021 07:01:13 +0000
Subject: [PATCH 045/125] add kernel args parse functor to auto parse args

---
 paddle/fluid/framework/operator.cc           |   5 +-
 paddle/fluid/imperative/prepared_operator.cc |   4 +-
 paddle/tcmpt/core/kernel_def.h               |   6 +-
 paddle/tcmpt/core/kernel_factory.h           |  26 +-
 paddle/tcmpt/core/kernel_registry.h          | 265 +++++++++++--------
 5 files changed, 171 insertions(+), 135 deletions(-)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 5c80a3a9b800e..ecf10de7c82e3 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1145,6 +1145,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   // TODO(chenweihang): in the first phase of project, we only support CPU, CUDA
   // and RCOM backend, the XPU, NPU and MKLDNN will be supported in the second
   // phase
+  // TODO(chenweihang): ContainsKernel need more acurrate
   run_pt_kernel_ = pt::KernelFactory::Instance().ContainsKernel(type_.c_str());
   if (run_pt_kernel_) {
     if (pt_kernel_key_.get() == nullptr || pt_kernel_.get() == nullptr) {
@@ -1834,8 +1835,8 @@ pt::KernelContext OperatorWithKernel::ConstructPtKernelContext(
   // 4. use pt Tensor directly
   // 5. kernel input is not DenseTensor
   pt::KernelContext op_kernel_ctx(dev_ctx);
-  auto input_defs = pt_kernel_->param_def().input_defs();
-  auto output_defs = pt_kernel_->param_def().output_defs();
+  auto input_defs = pt_kernel_->args_def().input_defs();
+  auto output_defs = pt_kernel_->args_def().output_defs();
 
   // TODO(chenweihang): use ordered_map for VariableNameMap and VariableValueMap
   // If we the VariableValueMap are ordered, we can get tensor by iter the map,
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 955c722965a6e..3ddd26df65554 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -242,8 +242,8 @@ static pt::KernelContext BuildDygraphKernelContext(
   // 4. use pt Tensor directly
   // 5. kernel input is not DenseTensor
   pt::KernelContext op_kernel_ctx(dev_ctx);
-  auto input_defs = pt_kernel.param_def().input_defs();
-  auto output_defs = pt_kernel.param_def().output_defs();
+  auto input_defs = pt_kernel.args_def().input_defs();
+  auto output_defs = pt_kernel.args_def().output_defs();
 
   size_t i = 0;
   for (auto& var_pair : ins) {
diff --git a/paddle/tcmpt/core/kernel_def.h b/paddle/tcmpt/core/kernel_def.h
index e9069742844af..e0334f770bfd1 100644
--- a/paddle/tcmpt/core/kernel_def.h
+++ b/paddle/tcmpt/core/kernel_def.h
@@ -17,9 +17,13 @@
 namespace pt {
 
 class Kernel;
+class KernelKey;
+class KernelArgsDef;
 class KernelContext;
 
 using KernelFn = void (*)(KernelContext* ctx);
-using KernelParamDefFn = void (*)(Kernel* kernel);
+using KernelArgsDefFn = void (*)(Kernel* kernel);
+using KernelArgsParseFn = void (*)(const KernelKey& default_key,
+                                   KernelArgsDef* args_def);
 
 }  // namespace pt
diff --git a/paddle/tcmpt/core/kernel_factory.h b/paddle/tcmpt/core/kernel_factory.h
index fd3ef051b02db..19c08f5dfeb65 100644
--- a/paddle/tcmpt/core/kernel_factory.h
+++ b/paddle/tcmpt/core/kernel_factory.h
@@ -150,35 +150,35 @@ class KernelKey {
 };
 
 // TODO(chenweihang): how deal with vector<Param>?
-struct ParamDef {
+struct ArgDef {
   Backend backend;
   DataLayout layout;
   DataType dtype;
 
-  ParamDef(Backend backend, DataLayout layout, DataType dtype)
+  ArgDef(Backend backend, DataLayout layout, DataType dtype)
       : backend(backend), layout(layout), dtype(dtype) {}
 };
 
-class KernelParamDef {
+class KernelArgsDef {
  public:
-  KernelParamDef() = default;
+  KernelArgsDef() = default;
 
   void AppendInput(Backend backend, DataLayout layout, DataType dtype) {
-    input_defs_.emplace_back(ParamDef(backend, layout, dtype));
+    input_defs_.emplace_back(ArgDef(backend, layout, dtype));
   }
 
   void AppendOutput(Backend backend, DataLayout layout, DataType dtype) {
-    output_defs_.emplace_back(ParamDef(backend, layout, dtype));
+    output_defs_.emplace_back(ArgDef(backend, layout, dtype));
   }
 
-  const std::vector<ParamDef>& input_defs() const { return input_defs_; }
+  const std::vector<ArgDef>& input_defs() const { return input_defs_; }
 
-  const std::vector<ParamDef>& output_defs() const { return output_defs_; }
+  const std::vector<ArgDef>& output_defs() const { return output_defs_; }
 
  private:
   // TODO(chenweihang): replaced by paddle::small_vector
-  std::vector<ParamDef> input_defs_{{}};
-  std::vector<ParamDef> output_defs_{{}};
+  std::vector<ArgDef> input_defs_{{}};
+  std::vector<ArgDef> output_defs_{{}};
 };
 
 class Kernel {
@@ -190,13 +190,13 @@ class Kernel {
 
   void operator()(KernelContext* ctx) const { fn_(ctx); }
 
-  KernelParamDef* mutable_param_def() { return &param_def_; }
+  KernelArgsDef* mutable_args_def() { return &args_def_; }
 
-  const KernelParamDef& param_def() const { return param_def_; }
+  const KernelArgsDef& args_def() const { return args_def_; }
 
  private:
   KernelFn fn_{nullptr};
-  KernelParamDef param_def_;
+  KernelArgsDef args_def_;
 };
 
 /**
diff --git a/paddle/tcmpt/core/kernel_registry.h b/paddle/tcmpt/core/kernel_registry.h
index 448f5b8dbc5d0..e56629a835503 100644
--- a/paddle/tcmpt/core/kernel_registry.h
+++ b/paddle/tcmpt/core/kernel_registry.h
@@ -14,6 +14,11 @@
 
 #pragma once
 
+#include <type_traits>
+#include <typeindex>
+#include <typeinfo>
+#include <vector>
+
 #include "paddle/tcmpt/core/kernel_def.h"
 #include "paddle/tcmpt/core/kernel_factory.h"
 #include "paddle/tcmpt/core/kernel_utils.h"
@@ -24,35 +29,60 @@ namespace pt {
 #define DATALAYOUT(arg__) pt::DataLayout::k##arg__
 #define DATATYPE(arg__) pt::DataType::k##arg__
 
-class KernelRegistrar {
+template <typename Func>
+struct KernelArgsParseFunctor;
+
+template <typename Return_, typename... Args_>
+struct KernelArgsParseFunctor<Return_ (*)(Args_...)> {
+  using Args = std::tuple<Args_...>;
+  enum : std::size_t { Arity = sizeof...(Args_) };
+  using Indices = std::make_index_sequence<Arity>;
+  template <std::size_t Index>
+  using Arg = typename std::tuple_element<Index, Args>::type;
+
+  static void Parse(const KernelKey& default_key, KernelArgsDef* args_def) {
+    auto args_type = ParseArgType(Indices{});
+    for (auto arg_type : args_type) {
+      if (arg_type == std::type_index(typeid(const DenseTensor&))) {
+        args_def->AppendInput(
+            default_key.backend(), default_key.layout(), default_key.dtype());
+      } else if (arg_type == std::type_index(typeid(DenseTensor*))) {
+        args_def->AppendOutput(
+            default_key.backend(), default_key.layout(), default_key.dtype());
+      } else {
+        // TODO(chenweihang): throw argument error
+        VLOG(1) << "invalid arg";
+      }
+    }
+  }
+
+ private:
+  template <std::size_t... INDEX>
+  static std::vector<std::type_index> ParseArgType(
+      std::index_sequence<INDEX...>) {
+    return {std::type_index(typeid(Arg<INDEX>))...};
+  }
+};
+
+struct KernelRegistrar {
  public:
-  KernelRegistrar(const char* kernel_name,
+  KernelRegistrar(const char* kernel_name_cstr,
                   Backend backend,
                   DataLayout layout,
                   DataType dtype,
-                  KernelParamDefFn param_def_fn,
+                  KernelArgsParseFn args_parse_fn,
+                  KernelArgsDefFn args_def_fn,
                   KernelFn kernel_fn) {
-    KernelName final_kernel_name(kernel_name);
-    KernelKey op_kernel_key(backend, layout, dtype);
+    KernelName kernel_name(kernel_name_cstr);
+    KernelKey kernel_key(backend, layout, dtype);
     Kernel kernel(kernel_fn);
-    param_def_fn(&kernel);
-
-    // TODO(chenweihang): use default input and output for verify
-    kernel.mutable_param_def()->AppendInput(backend, layout, dtype);
-    kernel.mutable_param_def()->AppendOutput(backend, layout, dtype);
+    args_parse_fn(kernel_key, kernel.mutable_args_def());
+    args_def_fn(&kernel);
 
-    KernelFactory::Instance().kernels()[final_kernel_name][op_kernel_key] =
-        kernel;
+    KernelFactory::Instance().kernels()[kernel_name][kernel_key] = kernel;
   }
 };
 
-#if defined(_WIN32)
-#define UNUSED
-#define __builtin_expect(EXP, C) (EXP)
-#else
-#define UNUSED __attribute__((unused))
-#endif
-
 #define PT_STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg) \
   _PT_STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg)
 
@@ -89,24 +119,23 @@ class KernelRegistrar {
                       cpp_dtype,                                  \
                       __VA_ARGS__)
 
-#define _PT_REGISTER_KERNEL(                                               \
-    kernel_name, func_id, backend, layout, meta_kernel_fn, cpp_dtype, ...) \
-  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                       \
-      PT_CONCATENATE(pt_op_kernel_ns_check_, func_id),                     \
-      "PT_REGISTER_KERNEL must be called in global namespace.");           \
-  PT_KERNEL_SPECIALIZE(meta_kernel_fn, cpp_dtype, __VA_ARGS__);            \
-  static void PT_CONCATENATE(__PT_KERNEL_PARAM_DEF_FN_,                    \
-                             func_id)(::pt::Kernel*);                      \
-  PT_KERNEL_REGISTRAR_INIT(                                                \
-      kernel_name,                                                         \
-      func_id,                                                             \
-      backend,                                                             \
-      layout,                                                              \
-      &PT_CONCATENATE(__PT_KERNEL_PARAM_DEF_FN_, func_id),                 \
-      meta_kernel_fn,                                                      \
-      cpp_dtype,                                                           \
-      __VA_ARGS__);                                                        \
-  void PT_CONCATENATE(__PT_KERNEL_PARAM_DEF_FN_, func_id)(::pt::Kernel * kernel)
+#define _PT_REGISTER_KERNEL(                                                   \
+    kernel_name, func_id, backend, layout, meta_kernel_fn, cpp_dtype, ...)     \
+  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                           \
+      PT_CONCATENATE(pt_op_kernel_ns_check_, func_id),                         \
+      "PT_REGISTER_KERNEL must be called in global namespace.");               \
+  PT_KERNEL_SPECIALIZE(meta_kernel_fn, cpp_dtype, __VA_ARGS__);                \
+  static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_,                         \
+                             func_id)(::pt::Kernel*);                          \
+  PT_KERNEL_REGISTRAR_INIT(kernel_name,                                        \
+                           func_id,                                            \
+                           backend,                                            \
+                           layout,                                             \
+                           &PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id), \
+                           meta_kernel_fn,                                     \
+                           cpp_dtype,                                          \
+                           __VA_ARGS__);                                       \
+  void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id)(::pt::Kernel * kernel)
 
 #define PT_KERNEL_SPECIALIZE(meta_kernel_fn, cpp_dtype, ...) \
   _PT_KERNEL_SPECIALIZE(PT_NARGS(cpp_dtype, __VA_ARGS__),    \
@@ -146,7 +175,7 @@ class KernelRegistrar {
                                  func_id,                     \
                                  backend,                     \
                                  layout,                      \
-                                 param_def_fn,                \
+                                 args_def_fn,                 \
                                  meta_kernel_fn,              \
                                  cpp_dtype,                   \
                                  ...)                         \
@@ -155,37 +184,37 @@ class KernelRegistrar {
                             func_id,                          \
                             backend,                          \
                             layout,                           \
-                            param_def_fn,                     \
+                            args_def_fn,                      \
                             meta_kernel_fn,                   \
                             cpp_dtype,                        \
                             __VA_ARGS__)
 
-#define _PT_KERNEL_REGISTRAR_INIT(N,              \
-                                  kernel_name,    \
-                                  func_id,        \
-                                  backend,        \
-                                  layout,         \
-                                  param_def_fn,   \
-                                  meta_kernel_fn, \
-                                  cpp_dtype,      \
-                                  ...)            \
-  PT_CONCATENATE(_PT_KERNEL_REGISTRAR_INIT_, N)   \
-  (kernel_name,                                   \
-   func_id,                                       \
-   PT_ID,                                         \
-   backend,                                       \
-   layout,                                        \
-   param_def_fn,                                  \
-   meta_kernel_fn,                                \
-   cpp_dtype,                                     \
-   __VA_ARGS__)
+#define _PT_KERNEL_REGISTRAR_INIT(N,                 \
+                                  kernel_name,       \
+                                  func_id,           \
+                                  backend,           \
+                                  layout,            \
+                                  args_def_fn,       \
+                                  meta_kernel_fn,    \
+                                  cpp_dtype,         \
+                                  ...)               \
+  PT_CONCATENATE(_PT_KERNEL_REGISTRAR_INIT_, N)      \
+    (kernel_name,                                    \
+      func_id,                                       \
+      PT_ID,                                         \
+      backend,                                       \
+      layout,                                        \
+      args_def_fn,                                   \
+      meta_kernel_fn,                                \
+      cpp_dtype,                                     \
+      __VA_ARGS__)
 
 #define _PT_KERNEL_REGISTRAR_INIT_1(kernel_name,      \
                                     func_id,          \
                                     registrar_id,     \
                                     backend,          \
                                     layout,           \
-                                    param_def_fn,     \
+                                    args_def_fn,      \
                                     meta_kernel_fn,   \
                                     cpp_dtype,        \
                                     ...)              \
@@ -195,14 +224,16 @@ class KernelRegistrar {
       BACKEND(backend),                               \
       DATALAYOUT(layout),                             \
       ::pt::CppTypeToDataType<cpp_dtype>::Type(),     \
-      param_def_fn,                                   \
+      ::pt::KernelArgsParseFunctor<decltype(          \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,        \
+      args_def_fn,                                    \
       PT_KERNEL(meta_kernel_fn<cpp_dtype>));
 #define _PT_KERNEL_REGISTRAR_INIT_2(kernel_name,      \
                                     func_id,          \
                                     registrar_id,     \
                                     backend,          \
                                     layout,           \
-                                    param_def_fn,     \
+                                    args_def_fn,      \
                                     meta_kernel_fn,   \
                                     cpp_dtype,        \
                                     ...)              \
@@ -212,14 +243,16 @@ class KernelRegistrar {
       BACKEND(backend),                               \
       DATALAYOUT(layout),                             \
       ::pt::CppTypeToDataType<cpp_dtype>::Type(),     \
-      param_def_fn,                                   \
+      ::pt::KernelArgsParseFunctor<decltype(          \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,        \
+      args_def_fn,                                    \
       PT_KERNEL(meta_kernel_fn<cpp_dtype>));          \
   _PT_KERNEL_REGISTRAR_INIT_1(kernel_name,            \
                               func_id,                \
                               PT_ID,                  \
                               backend,                \
                               layout,                 \
-                              param_def_fn,           \
+                              args_def_fn,            \
                               meta_kernel_fn,         \
                               __VA_ARGS__)
 #define _PT_KERNEL_REGISTRAR_INIT_3(kernel_name,      \
@@ -227,7 +260,7 @@ class KernelRegistrar {
                                     registrar_id,     \
                                     backend,          \
                                     layout,           \
-                                    param_def_fn,     \
+                                    args_def_fn,      \
                                     meta_kernel_fn,   \
                                     cpp_dtype,        \
                                     ...)              \
@@ -237,14 +270,16 @@ class KernelRegistrar {
       BACKEND(backend),                               \
       DATALAYOUT(layout),                             \
       ::pt::CppTypeToDataType<cpp_dtype>::Type(),     \
-      param_def_fn,                                   \
+      ::pt::KernelArgsParseFunctor<decltype(          \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,        \
+      args_def_fn,                                    \
       PT_KERNEL(meta_kernel_fn<cpp_dtype>));          \
   _PT_KERNEL_REGISTRAR_INIT_2(kernel_name,            \
                               func_id,                \
                               PT_ID,                  \
                               backend,                \
                               layout,                 \
-                              param_def_fn,           \
+                              args_def_fn,            \
                               meta_kernel_fn,         \
                               __VA_ARGS__)
 #define _PT_KERNEL_REGISTRAR_INIT_4(kernel_name,      \
@@ -252,7 +287,7 @@ class KernelRegistrar {
                                     registrar_id,     \
                                     backend,          \
                                     layout,           \
-                                    param_def_fn,     \
+                                    args_def_fn,      \
                                     meta_kernel_fn,   \
                                     cpp_dtype,        \
                                     ...)              \
@@ -262,14 +297,16 @@ class KernelRegistrar {
       BACKEND(backend),                               \
       DATALAYOUT(layout),                             \
       ::pt::CppTypeToDataType<cpp_dtype>::Type(),     \
-      param_def_fn,                                   \
+      ::pt::KernelArgsParseFunctor<decltype(          \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,        \
+      args_def_fn,                                    \
       PT_KERNEL(meta_kernel_fn<cpp_dtype>));          \
   _PT_KERNEL_REGISTRAR_INIT_3(kernel_name,            \
                               func_id,                \
                               PT_ID,                  \
                               backend,                \
                               layout,                 \
-                              param_def_fn,           \
+                              args_def_fn,            \
                               meta_kernel_fn,         \
                               __VA_ARGS__)
 #define _PT_KERNEL_REGISTRAR_INIT_5(kernel_name,      \
@@ -277,7 +314,7 @@ class KernelRegistrar {
                                     registrar_id,     \
                                     backend,          \
                                     layout,           \
-                                    param_def_fn,     \
+                                    args_def_fn,      \
                                     meta_kernel_fn,   \
                                     cpp_dtype,        \
                                     ...)              \
@@ -287,14 +324,16 @@ class KernelRegistrar {
       BACKEND(backend),                               \
       DATALAYOUT(layout),                             \
       ::pt::CppTypeToDataType<cpp_dtype>::Type(),     \
-      param_def_fn,                                   \
+      ::pt::KernelArgsParseFunctor<decltype(          \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,        \
+      args_def_fn,                                    \
       PT_KERNEL(meta_kernel_fn<cpp_dtype>));          \
   _PT_KERNEL_REGISTRAR_INIT_4(kernel_name,            \
                               func_id,                \
                               PT_ID,                  \
                               backend,                \
                               layout,                 \
-                              param_def_fn,           \
+                              args_def_fn,            \
                               meta_kernel_fn,         \
                               __VA_ARGS__)
 #define _PT_KERNEL_REGISTRAR_INIT_6(kernel_name,      \
@@ -302,7 +341,7 @@ class KernelRegistrar {
                                     registrar_id,     \
                                     backend,          \
                                     layout,           \
-                                    param_def_fn,     \
+                                    args_def_fn,      \
                                     meta_kernel_fn,   \
                                     cpp_dtype,        \
                                     ...)              \
@@ -312,14 +351,16 @@ class KernelRegistrar {
       BACKEND(backend),                               \
       DATALAYOUT(layout),                             \
       ::pt::CppTypeToDataType<cpp_dtype>::Type(),     \
-      param_def_fn,                                   \
+      ::pt::KernelArgsParseFunctor<decltype(          \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,        \
+      args_def_fn,                                    \
       PT_KERNEL(meta_kernel_fn<cpp_dtype>));          \
   _PT_KERNEL_REGISTRAR_INIT_5(kernel_name,            \
                               func_id,                \
                               PT_ID,                  \
                               backend,                \
                               layout,                 \
-                              param_def_fn,           \
+                              args_def_fn,            \
                               meta_kernel_fn,         \
                               __VA_ARGS__)
 #define _PT_KERNEL_REGISTRAR_INIT_7(kernel_name,      \
@@ -327,7 +368,7 @@ class KernelRegistrar {
                                     registrar_id,     \
                                     backend,          \
                                     layout,           \
-                                    param_def_fn,     \
+                                    args_def_fn,      \
                                     meta_kernel_fn,   \
                                     cpp_dtype,        \
                                     ...)              \
@@ -337,14 +378,16 @@ class KernelRegistrar {
       BACKEND(backend),                               \
       DATALAYOUT(layout),                             \
       ::pt::CppTypeToDataType<cpp_dtype>::Type(),     \
-      param_def_fn,                                   \
+      ::pt::KernelArgsParseFunctor<decltype(          \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,        \
+      args_def_fn,                                    \
       PT_KERNEL(meta_kernel_fn<cpp_dtype>));          \
   _PT_KERNEL_REGISTRAR_INIT_6(kernel_name,            \
                               func_id,                \
                               PT_ID,                  \
                               backend,                \
                               layout,                 \
-                              param_def_fn,           \
+                              args_def_fn,            \
                               meta_kernel_fn,         \
                               __VA_ARGS__)
 #define _PT_KERNEL_REGISTRAR_INIT_8(kernel_name,      \
@@ -352,7 +395,7 @@ class KernelRegistrar {
                                     registrar_id,     \
                                     backend,          \
                                     layout,           \
-                                    param_def_fn,     \
+                                    args_def_fn,      \
                                     meta_kernel_fn,   \
                                     cpp_dtype,        \
                                     ...)              \
@@ -362,53 +405,41 @@ class KernelRegistrar {
       BACKEND(backend),                               \
       DATALAYOUT(layout),                             \
       ::pt::CppTypeToDataType<cpp_dtype>::Type(),     \
-      param_def_fn,                                   \
+      ::pt::KernelArgsParseFunctor<decltype(          \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,        \
+      args_def_fn,                                    \
       PT_KERNEL(meta_kernel_fn<cpp_dtype>));          \
   _PT_KERNEL_REGISTRAR_INIT_7(kernel_name,            \
                               func_id,                \
                               PT_ID,                  \
                               backend,                \
                               layout,                 \
-                              param_def_fn,           \
+                              args_def_fn,            \
                               meta_kernel_fn,         \
                               __VA_ARGS__)
 
-#define PT_REGISTER_KERNEL_STANDARD(                                         \
-    kernel_name, backend, layout, dtype, kernel_fn)                          \
-  template decltype(kernel_fn) kernel_fn;                                    \
-  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                         \
-      __reg_pt_op_kernel_##kernel_name##_##backend##_##layout##_##dtype##__, \
-      "PT_REGISTER_KERNEL_STANDARD must be called in global namespace.");    \
-  static ::pt::KernelRegistrar                                               \
-      __pt_op_kernel_##kernel_name##_##backend##_##layout##_##dtype##__ =    \
-          ::pt::KernelRegistrar(#kernel_name,                                \
-                                BACKEND(backend),                            \
-                                DATALAYOUT(layout),                          \
-                                DATATYPE(dtype),                             \
-                                PT_KERNEL(kernel_fn))
-
-#define PT_REGISTER_KERNEL_AUTO_SPECIALIZE(                                  \
-    kernel_name, backend, layout, meta_kernel_fn, dtype)                     \
-  template decltype(meta_kernel_fn<dtype>) meta_kernel_fn<dtype>;            \
-  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                         \
-      __reg_pt_op_kernel_##kernel_name##_##backend##_##layout##_##dtype##__, \
-      "PT_REGISTER_KERNEL_AUTO_SPECIALIZE must be called in global "         \
-      "namespace.");                                                         \
-  static ::pt::KernelRegistrar                                               \
-      __pt_op_kernel_##kernel_name##_##backend##_##layout##_##dtype##__ =    \
-          ::pt::KernelRegistrar(#kernel_name,                                \
-                                BACKEND(backend),                            \
-                                DATALAYOUT(layout),                          \
-                                ::pt::CppTypeToDataType<dtype>::Type(),      \
-                                PT_KERNEL(meta_kernel_fn<dtype>))
+#define PT_REGISTER_KERNEL_STANDARD(                \
+    kernel_name, backend, layout, dtype, kernel_fn) \
+  _PT_REGISTER_KERNEL_STANDARD(                     \
+      kernel_name, PT_ID, backend, layout, dtype, kernel_fn)
 
-#define PT_TOUCH_KERNEL_REGISTRAR(kernel_name, backend, layout, dtype)         \
-  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                           \
-      __touch_pt_op_kernel_##kernel_name##_##backend##_##layout##_##dtype##__, \
-      "PT_TOUCH_KERNEL_REGISTRAR must be called in global namespace.");        \
-  int TouchKernelRegistrar_##kernel_name##_##backend##_##dtype##_##layout() {  \
-    __pt_op_kernel_##kernel_name##_##backend##_##layout##_##dtype##__.Touch(); \
-    return 0;                                                                  \
-  }
+#define _PT_REGISTER_KERNEL_STANDARD(                                      \
+    kernel_name, func_id, backend, layout, dtype, kernel_fn)               \
+  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                       \
+      PT_CONCATENATE(pt_op_kernel_ns_check_, func_id),                     \
+      "_PT_REGISTER_KERNEL_STANDARD must be called in global namespace."); \
+  template decltype(kernel_fn) kernel_fn;                                  \
+  static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_,                     \
+                             func_id)(::pt::Kernel*);                      \
+  static const ::pt::KernelRegistrar PT_CONCATENATE(__reg_pt_op_kernel_,   \
+                                                    func_id)(              \
+      kernel_name,                                                         \
+      BACKEND(backend),                                                    \
+      DATALAYOUT(layout),                                                  \
+      DATATYPE(dtype),                                                     \
+      ::pt::KernelArgsParseFunctor<decltype(&kernel_fn)>::Parse,           \
+      args_def_fn,                                                         \
+      PT_KERNEL(kernel_fn));                                               \
+  void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id)(::pt::Kernel*)
 
 }  // namespace pt

From c3ebfeafd3606a1b50796e3bbcd113bd2e51e375 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Thu, 9 Sep 2021 07:41:36 +0000
Subject: [PATCH 046/125] revert some change & add scale kernels

---
 cmake/generic.cmake                           |  18 +-
 paddle/fluid/framework/operator.cc            |   2 -
 paddle/fluid/inference/CMakeLists.txt         |   8 +-
 paddle/fluid/operators/mean_op.cc             |  13 --
 paddle/fluid/operators/pool_cudnn_op.cu.cc    |   4 +-
 paddle/fluid/operators/scale_op.cc            |  22 ---
 paddle/tcmpt/core/dtype.cc                    |   3 +
 paddle/tcmpt/core/kernel_factory.h            |  43 +++--
 paddle/tcmpt/core/kernel_registry.h           |  47 ++---
 paddle/tcmpt/core/kernel_utils.h              |  74 ++++----
 paddle/tcmpt/cpu/math.cc                      | 172 +++++++++++-------
 paddle/tcmpt/cpu/math.h                       |  39 ++--
 paddle/tcmpt/cuda/math.cu                     | 156 +++++++++++-----
 paddle/tcmpt/cuda/math.h                      |  36 ++--
 .../contrib/tests/test_quantize_transpiler.py |   1 -
 15 files changed, 380 insertions(+), 258 deletions(-)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 24cac6ad8546e..410a7c52a24d5 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -116,19 +116,19 @@ function(find_fluid_modules TARGET_NAME)
   endif()
 endfunction(find_fluid_modules)
 
-set_property(GLOBAL PROPERTY TOP_MODULES "")
+set_property(GLOBAL PROPERTY TCMPT_MODULES "")
 # find all top modules is used for paddle static library
 # for building inference libs
-function(find_top_modules TARGET_NAME)
+function(find_tcmpt_modules TARGET_NAME)
   get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
   string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
   string(REGEX MATCH "\/top\/" result "${__target_path}")
   if(NOT result STREQUAL "")
-    get_property(top_modules GLOBAL PROPERTY TOP_MODULES)
-    set(top_modules ${top_modules} ${TARGET_NAME})
-    set_property(GLOBAL PROPERTY TOP_MODULES "${top_modules}")
+    get_property(tcmpt_modules GLOBAL PROPERTY TCMPT_MODULES)
+    set(tcmpt_modules ${tcmpt_modules} ${TARGET_NAME})
+    set_property(GLOBAL PROPERTY TCMPT_MODULES "${tcmpt_modules}")
   endif()
-endfunction(find_top_modules)
+endfunction(find_tcmpt_modules)
 
 function(common_link TARGET_NAME)
   if (WITH_PROFILER)
@@ -324,7 +324,7 @@ function(cc_library TARGET_NAME)
       else()
         add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
         find_fluid_modules(${TARGET_NAME})
-        find_top_modules(${TARGET_NAME})
+        find_tcmpt_modules(${TARGET_NAME})
       endif()
     if(cc_library_DEPS)
       # Don't need link libwarpctc.so
@@ -497,7 +497,7 @@ function(nv_library TARGET_NAME)
       else()
         add_library(${TARGET_NAME} STATIC ${nv_library_SRCS})
         find_fluid_modules(${TARGET_NAME})
-        find_top_modules(${TARGET_NAME})
+        find_tcmpt_modules(${TARGET_NAME})
       endif()
       if (nv_library_DEPS)
         add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
@@ -588,7 +588,7 @@ function(hip_library TARGET_NAME)
       else()
         hip_add_library(${TARGET_NAME} STATIC ${hip_library_SRCS})
         find_fluid_modules(${TARGET_NAME})
-        find_top_modules(${TARGET_NAME})
+        find_tcmpt_modules(${TARGET_NAME})
       endif()
       if (hip_library_DEPS)
         add_dependencies(${TARGET_NAME} ${hip_library_DEPS})
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index c8ceabe2bd288..865b604c1a240 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1210,8 +1210,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
       // TODO(chenweihang): here will intrduce copy
       auto op_kernel_ctx = ConstructPtKernelContext(*runtime_ctx, *dev_ctx);
       (*pt_kernel_)(&op_kernel_ctx);
-      // need share output into fluid tensor
-
     } else {
       (*kernel_func_)(
           ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx));
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 19b559a0559bf..adfd7946c2416 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -35,7 +35,7 @@ endif()
 
 # fluid_modules exclude API-interface of inference/api and inference/capi_exp
 get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
-get_property(top_modules GLOBAL PROPERTY TOP_MODULES)
+get_property(tcmpt_modules GLOBAL PROPERTY TCMPT_MODULES)
 
 # Adapt to custom op mechanism: Include the header files related to the data type
 # to avoid exposing the path of the underlying file
@@ -51,9 +51,9 @@ set(STATIC_INFERENCE_API paddle_inference_api analysis_predictor
         analysis_config paddle_pass_builder activation_functions ${mkldnn_quantizer_cfg})
 #TODO(wilber, T8T9): Do we still need to support windows gpu static library?
 if(WIN32 AND WITH_GPU)
-  cc_library(paddle_inference DEPS ${fluid_modules} ${top_modules} ${STATIC_INFERENCE_API})
+  cc_library(paddle_inference DEPS ${fluid_modules} ${tcmpt_modules} ${STATIC_INFERENCE_API})
 else()
-  create_static_lib(paddle_inference ${fluid_modules} ${top_modules} ${STATIC_INFERENCE_API})
+  create_static_lib(paddle_inference ${fluid_modules} ${tcmpt_modules} ${STATIC_INFERENCE_API})
 endif()
 
 if(NOT APPLE)
@@ -82,7 +82,7 @@ set(SHARED_INFERENCE_SRCS
     ${PADDLE_CUSTOM_OP_SRCS})
 
 # shared inference library deps
-set(SHARED_INFERENCE_DEPS ${fluid_modules} ${top_modules} analysis_predictor)
+set(SHARED_INFERENCE_DEPS ${fluid_modules} ${tcmpt_modules} analysis_predictor)
 
 if (WITH_CRYPTO) 
     set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} paddle_crypto)
diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc
index 6aa4e0189825d..764529a15b6a2 100644
--- a/paddle/fluid/operators/mean_op.cc
+++ b/paddle/fluid/operators/mean_op.cc
@@ -100,16 +100,3 @@ REGISTER_OP_CPU_KERNEL(
 REGISTER_OP_CPU_KERNEL(
     mean_grad, ops::MeanGradKernel<paddle::platform::CPUDeviceContext, float>,
     ops::MeanGradKernel<paddle::platform::CPUDeviceContext, double>);
-
-#ifdef PADDLE_WITH_XPU
-REGISTER_OP_XPU_KERNEL(
-    mean, ops::MeanKernel<paddle::platform::XPUDeviceContext, float>);
-#endif
-
-#ifdef PADDLE_WITH_ASCEND_CL
-REGISTER_OP_NPU_KERNEL(
-    mean, ops::MeanKernel<paddle::platform::NPUDeviceContext, int>,
-    ops::MeanKernel<paddle::platform::NPUDeviceContext, float>,
-    ops::MeanKernel<paddle::platform::NPUDeviceContext, double>,
-    ops::MeanKernel<paddle::platform::NPUDeviceContext, plat::float16>)
-#endif
diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc
index 74ab7532c6a11..8fcd40a9a2df4 100644
--- a/paddle/fluid/operators/pool_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc
@@ -268,7 +268,7 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
 
 #ifdef PADDLE_WITH_HIP
     if (pooling_type == "max") {
-      using KernelMap = paddle::framework::OperatorWithKernel::KernelMap;
+      using OpKernelMap = paddle::framework::OperatorWithKernel::OpKernelMap;
       using OpKernelFunc = paddle::framework::OperatorWithKernel::OpKernelFunc;
       auto &all_op_kernels =
           paddle::framework::OperatorWithKernel::AllOpKernels();
@@ -279,7 +279,7 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
           platform::errors::Unavailable(
               "There are no kernels which are registered in the %s operator.",
               op_type));
-      KernelMap &kernels = kernels_iter->second;
+      OpKernelMap &kernels = kernels_iter->second;
       paddle::framework::OpKernelType expected_kernel_key(
           paddle::framework::ToDataType(typeid(T)), ctx.GetPlace());
       auto kernel_iter = kernels.find(expected_kernel_key);
diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc
index b9c3ddf201c7a..a195452791048 100644
--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -171,25 +171,3 @@ REGISTER_OP_CUDA_KERNEL(
                                    int64_t>,
     paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
                                    paddle::platform::float16>);
-
-#ifdef PADDLE_WITH_XPU
-REGISTER_OP_XPU_KERNEL(
-    scale,
-    paddle::operators::ScaleKernel<paddle::platform::XPUDeviceContext, float>);
-#endif
-
-#ifdef PADDLE_WITH_ASCEND_CL
-REGISTER_OP_NPU_KERNEL(
-    scale,
-    paddle::operators::ScaleKernel<paddle::platform::NPUDeviceContext, float>,
-    paddle::operators::ScaleKernel<paddle::platform::NPUDeviceContext,
-                                   paddle::platform::float16>);
-#endif
-
-#ifdef PADDLE_WITH_MKLDNN
-REGISTER_OP_KERNEL(
-    scale, MKLDNN, paddle::platform::CPUPlace,
-    ops::ScaleKernel<paddle::platform::MKLDNNDeviceContext, float>,
-    ops::ScaleKernel<paddle::platform::MKLDNNDeviceContext,
-                     paddle::platform::bfloat16>);
-#endif
diff --git a/paddle/tcmpt/core/dtype.cc b/paddle/tcmpt/core/dtype.cc
index 1ddf1b25b3357..f1de29f184fc4 100644
--- a/paddle/tcmpt/core/dtype.cc
+++ b/paddle/tcmpt/core/dtype.cc
@@ -39,6 +39,9 @@ std::ostream& operator<<(std::ostream& os, DataType dtype) {
     case DataType::kINT64:
       os << "int64";
       break;
+    case DataType::kBFLOAT16:
+      os << "bfloat16";
+      break;
     case DataType::kFLOAT16:
       os << "float16";
       break;
diff --git a/paddle/tcmpt/core/kernel_factory.h b/paddle/tcmpt/core/kernel_factory.h
index 19c08f5dfeb65..ca63cfdc229f9 100644
--- a/paddle/tcmpt/core/kernel_factory.h
+++ b/paddle/tcmpt/core/kernel_factory.h
@@ -65,10 +65,6 @@ struct KernelName final {
       overload_name = "";
     } else {
       name = kernel_name_str.substr(0, pos);
-      PADDLE_ENFORCE_EQ(kernel_name_str.find('.', pos + 1),
-                        std::string::npos,
-                        paddle::platform::errors::InvalidArgument(
-                            "KernelName only can contains one '.'."));
       overload_name = kernel_name_str.substr(pos + 1, kernel_name_str.size());
     }
     hash_value = std::hash<std::string>()(name) ^
@@ -150,13 +146,28 @@ class KernelKey {
 };
 
 // TODO(chenweihang): how deal with vector<Param>?
-struct ArgDef {
+struct TensorArgDef {
   Backend backend;
   DataLayout layout;
   DataType dtype;
 
-  ArgDef(Backend backend, DataLayout layout, DataType dtype)
+  TensorArgDef(Backend backend, DataLayout layout, DataType dtype)
       : backend(backend), layout(layout), dtype(dtype) {}
+
+  TensorArgDef& SetBackend(Backend backend) {
+    backend = backend;
+    return *this;
+  }
+
+  TensorArgDef& SetDataLayout(DataLayout layout) {
+    layout = layout;
+    return *this;
+  }
+
+  TensorArgDef& SetDataType(DataType dtype) {
+    dtype = dtype;
+    return *this;
+  }
 };
 
 class KernelArgsDef {
@@ -164,21 +175,25 @@ class KernelArgsDef {
   KernelArgsDef() = default;
 
   void AppendInput(Backend backend, DataLayout layout, DataType dtype) {
-    input_defs_.emplace_back(ArgDef(backend, layout, dtype));
+    input_defs_.emplace_back(TensorArgDef(backend, layout, dtype));
   }
 
   void AppendOutput(Backend backend, DataLayout layout, DataType dtype) {
-    output_defs_.emplace_back(ArgDef(backend, layout, dtype));
+    output_defs_.emplace_back(TensorArgDef(backend, layout, dtype));
   }
 
-  const std::vector<ArgDef>& input_defs() const { return input_defs_; }
+  const std::vector<TensorArgDef>& input_defs() const { return input_defs_; }
+
+  const std::vector<TensorArgDef>& output_defs() const { return output_defs_; }
+
+  std::vector<TensorArgDef>& input_defs() { return input_defs_; }
 
-  const std::vector<ArgDef>& output_defs() const { return output_defs_; }
+  std::vector<TensorArgDef>& output_defs() { return output_defs_; }
 
  private:
   // TODO(chenweihang): replaced by paddle::small_vector
-  std::vector<ArgDef> input_defs_{{}};
-  std::vector<ArgDef> output_defs_{{}};
+  std::vector<TensorArgDef> input_defs_{{}};
+  std::vector<TensorArgDef> output_defs_{{}};
 };
 
 class Kernel {
@@ -194,6 +209,10 @@ class Kernel {
 
   const KernelArgsDef& args_def() const { return args_def_; }
 
+  TensorArgDef& InputAt(size_t idx) { return args_def_.input_defs().at(idx); }
+
+  TensorArgDef& OutputAt(size_t idx) { return args_def_.output_defs().at(idx); }
+
  private:
   KernelFn fn_{nullptr};
   KernelArgsDef args_def_;
diff --git a/paddle/tcmpt/core/kernel_registry.h b/paddle/tcmpt/core/kernel_registry.h
index e56629a835503..1aaaead43f935 100644
--- a/paddle/tcmpt/core/kernel_registry.h
+++ b/paddle/tcmpt/core/kernel_registry.h
@@ -43,10 +43,12 @@ struct KernelArgsParseFunctor<Return_ (*)(Args_...)> {
   static void Parse(const KernelKey& default_key, KernelArgsDef* args_def) {
     auto args_type = ParseArgType(Indices{});
     for (auto arg_type : args_type) {
-      if (arg_type == std::type_index(typeid(const DenseTensor&))) {
+      if (arg_type == std::type_index(typeid(const DenseTensor&)) ||
+          arg_type == std::type_index(typeid(const SelectedRowsTensor&))) {
         args_def->AppendInput(
             default_key.backend(), default_key.layout(), default_key.dtype());
-      } else if (arg_type == std::type_index(typeid(DenseTensor*))) {
+      } else if (arg_type == std::type_index(typeid(DenseTensor*)) ||
+                 arg_type == std::type_index(typeid(SelectedRowsTensor*))) {
         args_def->AppendOutput(
             default_key.backend(), default_key.layout(), default_key.dtype());
       } else {
@@ -189,25 +191,28 @@ struct KernelRegistrar {
                             cpp_dtype,                        \
                             __VA_ARGS__)
 
-#define _PT_KERNEL_REGISTRAR_INIT(N,                 \
-                                  kernel_name,       \
-                                  func_id,           \
-                                  backend,           \
-                                  layout,            \
-                                  args_def_fn,       \
-                                  meta_kernel_fn,    \
-                                  cpp_dtype,         \
-                                  ...)               \
-  PT_CONCATENATE(_PT_KERNEL_REGISTRAR_INIT_, N)      \
-    (kernel_name,                                    \
-      func_id,                                       \
-      PT_ID,                                         \
-      backend,                                       \
-      layout,                                        \
-      args_def_fn,                                   \
-      meta_kernel_fn,                                \
-      cpp_dtype,                                     \
-      __VA_ARGS__)
+// The =pre-commit always treats this macro into the wrong format,
+// and multi-line macros cannot be skipped with NOLINT.
+// If there are only errors here, you can use -n to skip check
+#define _PT_KERNEL_REGISTRAR_INIT(N,              \
+                                  kernel_name,    \
+                                  func_id,        \
+                                  backend,        \
+                                  layout,         \
+                                  args_def_fn,    \
+                                  meta_kernel_fn, \
+                                  cpp_dtype,      \
+                                  ...)            \
+  PT_CONCATENATE(_PT_KERNEL_REGISTRAR_INIT_, N)   \
+  (kernel_name,                                   \
+   func_id,                                       \
+   PT_ID,                                         \
+   backend,                                       \
+   layout,                                        \
+   args_def_fn,                                   \
+   meta_kernel_fn,                                \
+   cpp_dtype,                                     \
+   __VA_ARGS__)
 
 #define _PT_KERNEL_REGISTRAR_INIT_1(kernel_name,      \
                                     func_id,          \
diff --git a/paddle/tcmpt/core/kernel_utils.h b/paddle/tcmpt/core/kernel_utils.h
index 33702c78f3448..98dd0b0472331 100644
--- a/paddle/tcmpt/core/kernel_utils.h
+++ b/paddle/tcmpt/core/kernel_utils.h
@@ -14,8 +14,10 @@
 
 #pragma once
 
+#include "paddle/tcmpt/core/dense_tensor.h"
 #include "paddle/tcmpt/core/kernel_context.h"
 #include "paddle/tcmpt/core/kernel_def.h"
+#include "paddle/tcmpt/core/selected_rows_tensor.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
@@ -64,6 +66,26 @@ using XPUContext = paddle::platform::XPUDeviceContext;
     }                                                                        \
   }
 
+#define PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(tensor_type)           \
+  template <typename... Tail>                                           \
+  struct KernelCallHelper<const tensor_type&, Tail...> {                \
+    template <int dev_ctx_idx,                                          \
+              int in_idx,                                               \
+              int attr_idx,                                             \
+              int out_idx,                                              \
+              typename... PreviousArgs>                                 \
+    static void Compute(KernelContext* ctx, PreviousArgs&... pargs) {   \
+      static_assert(attr_idx == 0,                                      \
+                    "Kernel's Input should appear before Attributes."); \
+      static_assert(out_idx == 0,                                       \
+                    "Kernel's Input should appear before Outputs.");    \
+      const tensor_type& arg = ctx->InputAt<tensor_type>(in_idx);       \
+      KernelCallHelper<Tail...>::                                       \
+          template Compute<dev_ctx_idx, in_idx + 1, attr_idx, out_idx>( \
+              ctx, pargs..., arg);                                      \
+    }                                                                   \
+  }
+
 #define PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(attr_type)           \
   template <typename... Tail>                                             \
   struct KernelCallHelper<attr_type, Tail...> {                           \
@@ -82,6 +104,22 @@ using XPUContext = paddle::platform::XPUDeviceContext;
     }                                                                     \
   }
 
+#define PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(tensor_type)          \
+  template <typename... Tail>                                           \
+  struct KernelCallHelper<tensor_type*, Tail...> {                      \
+    template <int dev_ctx_idx,                                          \
+              int in_idx,                                               \
+              int attr_idx,                                             \
+              int out_idx,                                              \
+              typename... PreviousArgs>                                 \
+    static void Compute(KernelContext* ctx, PreviousArgs&... pargs) {   \
+      tensor_type* arg = ctx->MutableOutputAt<tensor_type>(out_idx);    \
+      KernelCallHelper<Tail...>::                                       \
+          template Compute<dev_ctx_idx, in_idx, attr_idx, out_idx + 1>( \
+              ctx, pargs..., arg);                                      \
+    }                                                                   \
+  }
+
 template <typename T>
 struct TypeTag {};
 
@@ -113,24 +151,8 @@ struct KernelImpl<Return (*)(Args...), kernel_fn> {
 
   /* Input Helpers */
 
-  template <typename... Tail>
-  struct KernelCallHelper<const DenseTensor&, Tail...> {
-    template <int dev_ctx_idx,
-              int in_idx,
-              int attr_idx,
-              int out_idx,
-              typename... PreviousArgs>
-    static void Compute(KernelContext* ctx, PreviousArgs&... pargs) {
-      static_assert(attr_idx == 0,
-                    "Kernel's Input should appear before Attributes.");
-      static_assert(out_idx == 0,
-                    "Kernel's Input should appear before Outputs.");
-      const DenseTensor& arg = ctx->InputAt<DenseTensor>(in_idx);
-      KernelCallHelper<Tail...>::
-          template Compute<dev_ctx_idx, in_idx + 1, attr_idx, out_idx>(
-              ctx, pargs..., arg);
-    }
-  };
+  PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(DenseTensor);
+  PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SelectedRowsTensor);
 
   /* Attribute Helpers */
 
@@ -139,20 +161,8 @@ struct KernelImpl<Return (*)(Args...), kernel_fn> {
 
   /* Output Helpers */
 
-  template <typename... Tail>
-  struct KernelCallHelper<DenseTensor*, Tail...> {
-    template <int dev_ctx_idx,
-              int in_idx,
-              int attr_idx,
-              int out_idx,
-              typename... PreviousArgs>
-    static void Compute(KernelContext* ctx, PreviousArgs&... pargs) {
-      DenseTensor* arg = ctx->MutableOutputAt<DenseTensor>(out_idx);
-      KernelCallHelper<Tail...>::
-          template Compute<dev_ctx_idx, in_idx, attr_idx, out_idx + 1>(
-              ctx, pargs..., arg);
-    }
-  };
+  PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(DenseTensor);
+  PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SelectedRowsTensor);
 
   /* End case */
   template <typename T>
diff --git a/paddle/tcmpt/cpu/math.cc b/paddle/tcmpt/cpu/math.cc
index 8e760f6e11556..5b125f92f8529 100644
--- a/paddle/tcmpt/cpu/math.cc
+++ b/paddle/tcmpt/cpu/math.cc
@@ -14,8 +14,8 @@
 
 #include "paddle/tcmpt/cpu/math.h"
 
-// #include "paddle/tcmpt/eigen/scale.h"
-// #include "paddle/tcmpt/eigen/sign.h"
+#include "paddle/tcmpt/eigen/scale.h"
+#include "paddle/tcmpt/eigen/sign.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/eigen.h"
@@ -45,73 +45,115 @@ void Mean(const CPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
   y_data.device(place) = x_data.mean();
 }
 
-// template <typename T>
-// void Scale(const CPUContext& dev_ctx,
-//            const DenseTensor& x,
-//            float scale,
-//            float bias,
-//            bool bias_after_scale,
-//            DenseTensor* out) {
-//   module::Scale<CPUContext, T>(dev_ctx, x, scale, bias, bias_after_scale,
-//   out);
-// }
-
-// template <typename T>
-// void ScaleSelectedRows(const CPUContext& dev_ctx,
-//           const SelectedRowsTensor& x,
-//           float scale,
-//           float bias,
-//           bool bias_after_scale,
-//           SelectedRowsTensor* out) {
-//   out->set_rows(x.rows());
-//   out->set_height(x.height());
-//   Scale<T>(dev_ctx, x.value(), scale, bias, bias_after_scale, out->value());
-// }
-
-}  // namespace pt
+template <typename T>
+void Scale(const CPUContext& dev_ctx,
+           const DenseTensor& x,
+           float scale,
+           float bias,
+           bool bias_after_scale,
+           DenseTensor* out) {
+  module::Scale<CPUContext, T>(dev_ctx, x, scale, bias, bias_after_scale, out);
+}
 
-// using bfloat16 = ::paddle::platform::bfloat16;
+template <typename T>
+void ScaleSelectedRows(const CPUContext& dev_ctx,
+                       const SelectedRowsTensor& x,
+                       float scale,
+                       float bias,
+                       bool bias_after_scale,
+                       SelectedRowsTensor* out) {
+  out->set_rows(x.rows());
+  out->set_height(x.height());
+  Scale<T>(
+      dev_ctx, x.value(), scale, bias, bias_after_scale, out->mutable_value());
+}
 
-// Register method 1:
-// PT_REGISTER_KERNEL_STANDARD(sign, CPU, NCHW, FLOAT32, pt::Sign<float>)
-//   .Input(BACKEND(CPU), DATALAYOUT(NCHW), DATATYPE(FLOAT32))
-//   .Output(BACKEND(CPU), DATALAYOUT(NCHW), DATATYPE(FLOAT32));
-// PT_TOUCH_KERNEL_REGISTRAR(sign, CPU, NCHW, FLOAT32);
+template <typename T>
+void ScaleDynamicAttr(const CPUContext& dev_ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& scale,
+                      float bias,
+                      bool bias_after_scale,
+                      DenseTensor* out) {
+  module::Scale<CPUContext, T>(
+      dev_ctx, x, *scale.data<float>(), bias, bias_after_scale, out);
+}
 
-// Register method 2:
-// PT_REGISTER_KERNEL_AUTO_SPECIALIZE(sign, CPU, NCHW, pt::Sign, float)
-//   .Input(BACKEND(CPU), DATALAYOUT(NCHW), DATATYPE(FLOAT32))
-//   .Output(BACKEND(CPU), DATALAYOUT(NCHW), DATATYPE(FLOAT32));
-// PT_TOUCH_KERNEL_REGISTRAR(sign, CPU, NCHW, FLOAT32);
+template <typename T>
+void ScaleSelectedRowsDynamicAttr(const CPUContext& dev_ctx,
+                                  const SelectedRowsTensor& x,
+                                  const DenseTensor& scale,
+                                  float bias,
+                                  bool bias_after_scale,
+                                  SelectedRowsTensor* out) {
+  out->set_rows(x.rows());
+  out->set_height(x.height());
+  Scale<T>(dev_ctx,
+           x.value(),
+           *scale.data<float>(),
+           bias,
+           bias_after_scale,
+           out->mutable_value());
+}
 
-// Register method 3:
-// PT_REGISTER_KERNEL_2T(sign, CPU, NCHW, pt::Sign, float, double);
-// PT_REGISTER_KERNEL_2T(mean, CPU, NCHW, pt::Mean, float, double);
-// PT_REGISTER_KERNEL_8T(scale,
-//                       CPU,
-//                       NCHW,
-//                       pt::Scale,
-//                       float,
-//                       double,
-//                       bfloat16,
-//                       uint8_t,
-//                       int8_t,
-//                       int16_t,
-//                       int,
-//                       int64_t);
-// PT_REGISTER_KERNEL_8T(scale.selected_rows,
-//                       CPU,
-//                       NCHW,
-//                       pt::ScaleSelectedRows,
-//                       float,
-//                       double,
-//                       bfloat16,
-//                       uint8_t,
-//                       int8_t,
-//                       int16_t,
-//                       int,
-//                       int64_t);
+}  // namespace pt
 
-// Register method 4:
+using bfloat16 = ::paddle::platform::bfloat16;
 PT_REGISTER_KERNEL("sign", CPU, NCHW, pt::Sign, float, double) {}
 PT_REGISTER_KERNEL("mean", CPU, NCHW, pt::Mean, float, double) {}
+PT_REGISTER_KERNEL("scale",
+                   CPU,
+                   NCHW,
+                   pt::Scale,
+                   float,
+                   double,
+                   bfloat16,
+                   uint8_t,
+                   int8_t,
+                   int16_t,
+                   int,
+                   int64_t) {}
+PT_REGISTER_KERNEL("scale.selectedrows",
+                   CPU,
+                   NCHW,
+                   pt::ScaleSelectedRows,
+                   float,
+                   double,
+                   bfloat16,
+                   uint8_t,
+                   int8_t,
+                   int16_t,
+                   int,
+                   int64_t) {}
+PT_REGISTER_KERNEL("scale.dynamic_attr",
+                   CPU,
+                   NCHW,
+                   pt::ScaleDynamicAttr,
+                   float,
+                   double,
+                   bfloat16,
+                   uint8_t,
+                   int8_t,
+                   int16_t,
+                   int,
+                   int64_t) {
+  kernel->InputAt(1)
+      .SetBackend(pt::Backend::kCPU)
+      .SetDataType(pt::DataType::kFLOAT32);
+}
+PT_REGISTER_KERNEL("scale.selectedrows.dynamic_attr",
+                   CPU,
+                   NCHW,
+                   pt::ScaleSelectedRowsDynamicAttr,
+                   float,
+                   double,
+                   bfloat16,
+                   uint8_t,
+                   int8_t,
+                   int16_t,
+                   int,
+                   int64_t) {
+  kernel->InputAt(1)
+      .SetBackend(pt::Backend::kCPU)
+      .SetDataType(pt::DataType::kFLOAT32);
+}
diff --git a/paddle/tcmpt/cpu/math.h b/paddle/tcmpt/cpu/math.h
index f49848e645d5d..f6e3375a98397 100644
--- a/paddle/tcmpt/cpu/math.h
+++ b/paddle/tcmpt/cpu/math.h
@@ -18,9 +18,6 @@ limitations under the License. */
 #include "paddle/tcmpt/core/kernel_registry.h"
 #include "paddle/tcmpt/core/selected_rows_tensor.h"
 
-#include "paddle/tcmpt/eigen/scale.h"
-#include "paddle/tcmpt/eigen/sign.h"
-
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
 
@@ -40,16 +37,30 @@ void Scale(const CPUContext& dev_ctx,
            float scale,
            float bias,
            bool bias_after_scale,
-           DenseTensor* out) {
-  module::Scale<CPUContext, T>(dev_ctx, x, scale, bias, bias_after_scale, out);
-}
-
-// template <typename T>
-// void ScaleSelectedRows(const CPUContext& dev_ctx,
-//         const SelectedRowsTensor& x,
-//         float scale,
-//         float bias,
-//         bool bias_after_scale,
-//         SelectedRowsTensor* out);
+           DenseTensor* out);
+
+template <typename T>
+void ScaleSelectedRows(const CPUContext& dev_ctx,
+                       const SelectedRowsTensor& x,
+                       float scale,
+                       float bias,
+                       bool bias_after_scale,
+                       SelectedRowsTensor* out);
+
+template <typename T>
+void ScaleDynamicAttr(const CPUContext& dev_ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& scale,
+                      float bias,
+                      bool bias_after_scale,
+                      DenseTensor* out);
+
+template <typename T>
+void ScaleSelectedRowsDynamicAttr(const CPUContext& dev_ctx,
+                                  const SelectedRowsTensor& x,
+                                  const DenseTensor& scale,
+                                  float bias,
+                                  bool bias_after_scale,
+                                  SelectedRowsTensor* out);
 
 }  // namespace pt
diff --git a/paddle/tcmpt/cuda/math.cu b/paddle/tcmpt/cuda/math.cu
index c62dc41bd6234..e7325f83e6732 100644
--- a/paddle/tcmpt/cuda/math.cu
+++ b/paddle/tcmpt/cuda/math.cu
@@ -84,59 +84,115 @@ void Mean(const CUDAContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
   PADDLE_ENFORCE_CUDA_SUCCESS(err);
 }
 
-// template <typename T>
-// void Scale(const CUDAContext& dev_ctx,
-//            const DenseTensor& x,
-//            float scale,
-//            float bias,
-//            bool bias_after_scale,
-//            DenseTensor* out) {
-//   module::Scale<CUDAContext, T>(dev_ctx, x, scale, bias, bias_after_scale,
-//   out);
-// }
-
-// template <typename T>
-// void ScaleSelectedRows(const CUDAContext& dev_ctx,
-//           const SelectedRowsTensor& x,
-//           float scale,
-//           float bias,
-//           bool bias_after_scale,
-//           SelectedRowsTensor* out) {
-//   out->set_rows(x.rows());
-//   out->set_height(x.height());
-//   Scale<T>(dev_ctx, x.value(), scale, bias, bias_after_scale, out->value());
-// }
+template <typename T>
+void Scale(const CUDAContext& dev_ctx,
+           const DenseTensor& x,
+           float scale,
+           float bias,
+           bool bias_after_scale,
+           DenseTensor* out) {
+  module::Scale<CUDAContext, T>(dev_ctx, x, scale, bias, bias_after_scale, out);
+}
 
-}  // namespace pt
+template <typename T>
+void ScaleSelectedRows(const CUDAContext& dev_ctx,
+                       const SelectedRowsTensor& x,
+                       float scale,
+                       float bias,
+                       bool bias_after_scale,
+                       SelectedRowsTensor* out) {
+  out->set_rows(x.rows());
+  out->set_height(x.height());
+  Scale<T>(
+      dev_ctx, x.value(), scale, bias, bias_after_scale, out->mutable_value());
+}
+
+template <typename T>
+void ScaleDynamicAttr(const CUDAContext& dev_ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& scale,
+                      float bias,
+                      bool bias_after_scale,
+                      DenseTensor* out) {
+  module::Scale<CUDAContext, T>(
+      dev_ctx, x, *scale.data<float>(), bias, bias_after_scale, out);
+}
 
-// using float16 = paddle::platform::float16;
-// PT_REGISTER_KERNEL_3T(sign, CUDA, NCHW, pt::Sign, float, double, float16);
-// PT_REGISTER_KERNEL_3T(mean, CUDA, NCHW, pt::Mean, float, double, float16);
-// PT_REGISTER_KERNEL_8T(scale,
-//                       CUDA,
-//                       NCHW,
-//                       pt::Scale,
-//                       float,
-//                       double,
-//                       float16,
-//                       uint8_t,
-//                       int8_t,
-//                       int16_t,
-//                       int,
-//                       int64_t);
-// PT_REGISTER_KERNEL_8T(scale.selected_rows,
-//                       CUDA,
-//                       NCHW,
-//                       pt::ScaleSelectedRows,
-//                       float,
-//                       double,
-//                       float16,
-//                       uint8_t,
-//                       int8_t,
-//                       int16_t,
-//                       int,
-//                       int64_t);
+template <typename T>
+void ScaleSelectedRowsDynamicAttr(const CUDAContext& dev_ctx,
+                                  const SelectedRowsTensor& x,
+                                  const DenseTensor& scale,
+                                  float bias,
+                                  bool bias_after_scale,
+                                  SelectedRowsTensor* out) {
+  out->set_rows(x.rows());
+  out->set_height(x.height());
+  Scale<T>(dev_ctx,
+           x.value(),
+           *scale.data<float>(),
+           bias,
+           bias_after_scale,
+           out->mutable_value());
+}
+
+}  // namespace pt
 
 using float16 = paddle::platform::float16;
 PT_REGISTER_KERNEL("sign", CUDA, NCHW, pt::Sign, float, double, float16) {}
 PT_REGISTER_KERNEL("mean", CUDA, NCHW, pt::Mean, float, double, float16) {}
+PT_REGISTER_KERNEL("scale",
+                   CUDA,
+                   NCHW,
+                   pt::Scale,
+                   float,
+                   double,
+                   float16,
+                   uint8_t,
+                   int8_t,
+                   int16_t,
+                   int,
+                   int64_t) {}
+PT_REGISTER_KERNEL("scale.selectedrows",
+                   CUDA,
+                   NCHW,
+                   pt::ScaleSelectedRows,
+                   float,
+                   double,
+                   float16,
+                   uint8_t,
+                   int8_t,
+                   int16_t,
+                   int,
+                   int64_t) {}
+PT_REGISTER_KERNEL("scale.dynamic_attr",
+                   CUDA,
+                   NCHW,
+                   pt::ScaleDynamicAttr,
+                   float,
+                   double,
+                   float16,
+                   uint8_t,
+                   int8_t,
+                   int16_t,
+                   int,
+                   int64_t) {
+  kernel->InputAt(1)
+      .SetBackend(pt::Backend::kCPU)
+      .SetDataType(pt::DataType::kFLOAT32);
+}
+PT_REGISTER_KERNEL("scale.selectedrows.dynamic_attr",
+                   CUDA,
+                   NCHW,
+                   pt::ScaleSelectedRowsDynamicAttr,
+                   float,
+                   double,
+                   float16,
+                   uint8_t,
+                   int8_t,
+                   int16_t,
+                   int,
+                   int64_t) {
+  kernel->InputAt(1)
+      .SetBackend(pt::Backend::kCPU)
+      .SetDataType(pt::DataType::kFLOAT32);
+}
diff --git a/paddle/tcmpt/cuda/math.h b/paddle/tcmpt/cuda/math.h
index 3e87163f89540..a3e4985920f24 100644
--- a/paddle/tcmpt/cuda/math.h
+++ b/paddle/tcmpt/cuda/math.h
@@ -42,17 +42,31 @@ void Scale(const CUDAContext& dev_ctx,
            float scale,
            float bias,
            bool bias_after_scale,
-           DenseTensor* out) {
-  module::Scale<CUDAContext, T>(dev_ctx, x, scale, bias, bias_after_scale, out);
-}
-
-// template <typename T>
-// void ScaleSelectedRows(const CUDAContext& dev_ctx,
-//         const SelectedRowsTensor& x,
-//         float scale,
-//         float bias,
-//         bool bias_after_scale,
-//         SelectedRowsTensor* out);
+           DenseTensor* out);
+
+template <typename T>
+void ScaleSelectedRows(const CUDAContext& dev_ctx,
+                       const SelectedRowsTensor& x,
+                       float scale,
+                       float bias,
+                       bool bias_after_scale,
+                       SelectedRowsTensor* out);
+
+template <typename T>
+void ScaleDynamicAttr(const CUDAContext& dev_ctx,
+                      const DenseTensor& x,
+                      const DenseTensor& scale,
+                      float bias,
+                      bool bias_after_scale,
+                      DenseTensor* out);
+
+template <typename T>
+void ScaleSelectedRowsDynamicAttr(const CUDAContext& dev_ctx,
+                                  const SelectedRowsTensor& x,
+                                  const DenseTensor& scale,
+                                  float bias,
+                                  bool bias_after_scale,
+                                  SelectedRowsTensor* out);
 
 }  // namespace pt
 
diff --git a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
index 0a5566323ac55..342be7db3ed30 100644
--- a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
+++ b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
@@ -169,7 +169,6 @@ def residual_block_quant(self, quant_type):
             opt.minimize(loss)
             t = QuantizeTranspiler(activation_quantize_type=quant_type)
             t.training_transpile(main)
-            print(main)
             self.check_program(main)
 
     def test_residual_block_abs_max(self):

From b67de9cda1b4487061972b215f615f095deaf7f8 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Thu, 9 Sep 2021 11:12:31 +0000
Subject: [PATCH 047/125] add op proto in dygraph kernelcontext building

---
 paddle/fluid/framework/operator.cc           |  18 ++-
 paddle/fluid/imperative/prepared_operator.cc | 134 +++++++++++++++----
 paddle/tcmpt/core/kernel_factory.cc          |  12 +-
 paddle/tcmpt/core/kernel_factory.h           |  20 +++
 paddle/tcmpt/core/kernel_registry.h          |   6 +-
 5 files changed, 154 insertions(+), 36 deletions(-)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 865b604c1a240..cb3d89d861ac6 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1850,16 +1850,18 @@ pt::KernelContext OperatorWithKernel::ConstructPtKernelContext(
   // If we the VariableValueMap are ordered, we can get tensor by iter the map,
   // and its order is same as OpProto
 
-  auto& op_proto = Info().proto_;
+  auto* op_proto = Info().proto_;
   for (int i = 0; i < op_proto->inputs_size(); ++i) {
     auto in = op_proto->inputs()[i];
     // TODO(chenweihang): skip special cases temporarily
     // TODO(chenweihang): deal with diff param in vector
-    if (in.has_dispensable() && in.dispensable()) {
+    if ((in.has_dispensable() && in.dispensable()) ||
+        (in.has_extra() && in.extra()) || (in.has_quant() && in.quant())) {
       VLOG(1) << "BuildKernelContext: skip dispensable input - " << in.name();
       continue;
     }
     auto in_name = in.name();
+    VLOG(1) << "Static graph PtKernel input: " << in_name;
     auto in_def = input_defs.at(i);
     for (auto* var : ctx.inputs.at(in_name)) {
       if (var->IsType<LoDTensor>()) {
@@ -1881,6 +1883,8 @@ pt::KernelContext OperatorWithKernel::ConstructPtKernelContext(
   }
   for (int i = 0; i < op_proto->outputs_size(); ++i) {
     auto out_name = op_proto->outputs()[i].name();
+    VLOG(1) << "Static graph PtKernel output: " << out_name;
+    // TODO(chenweihang): outputs also need skip some cases
     auto out_def = output_defs.at(i);
     for (auto* var : ctx.outputs.at(out_name)) {
       // mutable_data before run kernel, to avoid share output form
@@ -1909,15 +1913,17 @@ pt::KernelContext OperatorWithKernel::ConstructPtKernelContext(
   }
   for (int i = 0; i < op_proto->attrs_size(); ++i) {
     auto attr = op_proto->attrs()[i];
-    // TODO(chenweihang): skip extra attrs by extra value
-    // if (attr.has_extra() && attr.extra()) {
-    //   continue;
-    // }
+    VLOG(1) << "Static graph PtKernel attribute: " << attr.name();
+    if ((attr.has_extra() && attr.extra()) ||
+        (attr.has_quant() && attr.quant())) {
+      continue;
+    }
     if (attr.name() == "use_mkldnn" || attr.name() == "op_role" ||
         attr.name() == "op_role_var" || attr.name() == "op_namescope" ||
         attr.name() == "op_callstack" || attr.name() == "op_device") {
       continue;
     }
+    // TODO(chenweihang): support other attrs
     switch (attr.type()) {
       case proto::AttrType::INT:
         op_kernel_ctx.EmplaceBackAttr(Attr<int>(attr.name()));
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 3ddd26df65554..cbf394611227e 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -46,6 +46,15 @@ const framework::Tensor* GetTensorFromVar(const framework::Variable& var) {
   }
 }
 
+template <typename T>
+static const T& GetAttr(const framework::AttributeMap& attrs,
+                        const std::string& name) {
+  PADDLE_ENFORCE_NE(
+      attrs.find(name), attrs.end(),
+      platform::errors::NotFound("(%s) is not found in AttributeMap.", name));
+  return BOOST_GET_CONST(T, attrs.at(name));
+}
+
 template <typename VarType>
 static void HandleComplexGradToRealGrad(const NameVarMap<VarType>& outs) {
   for (auto& pair : outs) {
@@ -232,8 +241,10 @@ PreparedOp PreparedOp::Prepare(const NameVarMap<VariableWrapper>& ins,
 
 template <typename VarType>
 static pt::KernelContext BuildDygraphKernelContext(
-    const pt::Kernel& pt_kernel, const NameVarMap<VarType>& ins,
-    const NameVarMap<VarType>& outs, const platform::DeviceContext& dev_ctx) {
+    const pt::Kernel& pt_kernel, const framework::proto::OpProto& op_proto,
+    const NameVarMap<VarType>& ins, const NameVarMap<VarType>& outs,
+    const framework::AttributeMap& attrs,
+    const platform::DeviceContext& dev_ctx) {
   // TODO(chenweihang): now only work for very simple case (sign op),
   // many cases need to be deal with later:
   // 1. the input and output are not tensor
@@ -245,38 +256,109 @@ static pt::KernelContext BuildDygraphKernelContext(
   auto input_defs = pt_kernel.args_def().input_defs();
   auto output_defs = pt_kernel.args_def().output_defs();
 
-  size_t i = 0;
-  for (auto& var_pair : ins) {
+  for (int i = 0; i < op_proto.inputs_size(); ++i) {
+    auto in = op_proto.inputs()[i];
+    // TODO(chenweihang): deal with diff param in vector
+    if ((in.has_dispensable() && in.dispensable()) ||
+        (in.has_extra() && in.extra()) || (in.has_quant() && in.quant())) {
+      VLOG(1) << "BuildDygraphKernelContext: skip dispensable input - "
+              << in.name();
+      continue;
+    }
+    auto in_name = in.name();
+    VLOG(1) << "Dygraph PtKernel input: " << in_name;
     auto in_def = input_defs.at(i);
-    for (auto var : var_pair.second) {
+    for (auto var : ins.at(in_name)) {
       const auto& variable = var->Var();
-      const auto& tensor = variable.template Get<framework::LoDTensor>();
-      auto pt_in =
-          framework::MakeTensorImpl<pt::DenseTensor, framework::LoDTensor>(
-              tensor, in_def.backend, in_def.dtype, in_def.layout);
-      op_kernel_ctx.EmplaceBackInput(pt_in);
+      if (variable.template IsType<framework::LoDTensor>()) {
+        const auto& tensor = variable.template Get<framework::LoDTensor>();
+        auto pt_in =
+            framework::MakeTensorImpl<pt::DenseTensor, framework::LoDTensor>(
+                tensor, in_def.backend, in_def.dtype, in_def.layout);
+        op_kernel_ctx.EmplaceBackInput(pt_in);
+      } else if (variable.template IsType<framework::SelectedRows>()) {
+        const auto& tensor = variable.template Get<framework::SelectedRows>();
+        auto pt_in = framework::MakeTensorImpl<pt::SelectedRowsTensor,
+                                               framework::SelectedRows>(
+            tensor, in_def.backend, in_def.dtype, in_def.layout);
+        op_kernel_ctx.EmplaceBackInput(pt_in);
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Unsupported shared input `%s` type now when call pt kernel.",
+            framework::ToTypeName(variable.Type())));
+      }
     }
-    ++i;
   }
 
-  i = 0;
-  for (auto it = outs.begin(); it != outs.end(); ++it) {
+  for (int i = 0; i < op_proto.outputs_size(); ++i) {
+    auto out_name = op_proto.outputs()[i].name();
+    VLOG(1) << "Dygraph PtKernel output: " << out_name;
+    // TODO(chenweihang): outputs also need skip some cases
     auto out_def = output_defs.at(i);
-    for (auto var : it->second) {
-      auto* variable = var->MutableVar();
-      auto* tensor = variable->template GetMutable<framework::LoDTensor>();
+    for (auto var : outs.at(out_name)) {
       // mutable_data before run kernel, to avoid share output form
       // KernelContext to original tensor
-      tensor->mutable_data(pt::TransToFluidPlace(out_def.backend),
-                           pt::TransToProtoVarType(out_def.dtype));
-      auto pt_out =
-          framework::MakeTensorImpl<pt::DenseTensor, framework::LoDTensor>(
-              *tensor, out_def.backend, out_def.dtype, out_def.layout);
-      op_kernel_ctx.EmplaceBackOutput(pt_out);
+      auto* variable = var->MutableVar();
+      if (variable->template IsType<framework::LoDTensor>()) {
+        auto* tensor = variable->template GetMutable<framework::LoDTensor>();
+        tensor->mutable_data(pt::TransToFluidPlace(out_def.backend),
+                             pt::TransToProtoVarType(out_def.dtype));
+        auto pt_out =
+            framework::MakeTensorImpl<pt::DenseTensor, framework::LoDTensor>(
+                *tensor, out_def.backend, out_def.dtype, out_def.layout);
+        op_kernel_ctx.EmplaceBackOutput(pt_out);
+      } else if (variable->template IsType<framework::SelectedRows>()) {
+        auto* tensor = variable->template GetMutable<framework::SelectedRows>();
+        tensor->mutable_value()->mutable_data(
+            pt::TransToFluidPlace(out_def.backend),
+            pt::TransToProtoVarType(out_def.dtype));
+        auto pt_out = framework::MakeTensorImpl<pt::SelectedRowsTensor,
+                                                framework::SelectedRows>(
+            *tensor, out_def.backend, out_def.dtype, out_def.layout);
+        op_kernel_ctx.EmplaceBackOutput(pt_out);
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "Unsupported shared output `%s` type now when call pt kernel.",
+            framework::ToTypeName(variable->Type())));
+      }
     }
-    ++i;
   }
-  // TODO(chenweihang): append attrs
+
+  for (int i = 0; i < op_proto.attrs_size(); ++i) {
+    auto attr = op_proto.attrs()[i];
+    VLOG(1) << "Dygraph PtKernel attribute: " << attr.name();
+    if ((attr.has_extra() && attr.extra()) ||
+        (attr.has_quant() && attr.quant())) {
+      continue;
+    }
+    if (attr.name() == "use_mkldnn" || attr.name() == "op_role" ||
+        attr.name() == "op_role_var" || attr.name() == "op_namescope" ||
+        attr.name() == "op_callstack" || attr.name() == "op_device") {
+      continue;
+    }
+    // TODO(chenweihang): support other attrs
+    // In principle, the attr required by the dynamic mode should be
+    // passed in from the Python side, and there is no need to look up
+    // from the default_map
+    switch (attr.type()) {
+      case framework::proto::AttrType::INT:
+        op_kernel_ctx.EmplaceBackAttr(GetAttr<int>(attrs, attr.name()));
+        break;
+      case framework::proto::AttrType::FLOAT:
+        op_kernel_ctx.EmplaceBackAttr(GetAttr<float>(attrs, attr.name()));
+        break;
+      case framework::proto::AttrType::BOOLEAN:
+        op_kernel_ctx.EmplaceBackAttr(GetAttr<bool>(attrs, attr.name()));
+        break;
+      default:
+        // TODO(chenweihang): support other attrs type
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "unsupported cast op attribute `%s` when construct "
+            "KernelContext.",
+            attr.name()));
+    }
+  }
+
   return op_kernel_ctx;
 }
 
@@ -335,8 +417,8 @@ static void PreparedOpRunPtImpl(const framework::OperatorBase& op,
   static_cast<const framework::OperatorWithKernel&>(op).InferShape(
       &infer_shape_ctx);
 
-  auto op_kernel_ctx =
-      BuildDygraphKernelContext<VarType>(pt_kernel, ins, outs, *dev_ctx);
+  auto op_kernel_ctx = BuildDygraphKernelContext<VarType>(
+      pt_kernel, *(op.Info().proto_), ins, outs, attrs, *dev_ctx);
   pt_kernel(&op_kernel_ctx);
 
   // TODO(chenweihang): add flags
diff --git a/paddle/tcmpt/core/kernel_factory.cc b/paddle/tcmpt/core/kernel_factory.cc
index 25696c8d8ff11..6617754f6ddc8 100644
--- a/paddle/tcmpt/core/kernel_factory.cc
+++ b/paddle/tcmpt/core/kernel_factory.cc
@@ -56,11 +56,19 @@ const Kernel& KernelFactory::SelectKernel(const KernelName& kernel_name,
   return SelectKernel(kernel_name, KernelKey(backend, layout, dtype));
 }
 
+std::ostream& operator<<(std::ostream& os, const Kernel& kernel) {
+  os << "InputNum(" << kernel.args_def().input_defs().size()
+     << "), AttributeNum(" << kernel.args_def().attribute_defs().size()
+     << "), OutputNum(" << kernel.args_def().output_defs().size() << ")";
+  return os;
+}
+
 std::ostream& operator<<(std::ostream& os, KernelFactory& kernel_factory) {
   for (const auto& op_kernel_pair : kernel_factory.kernels()) {
-    os << "- op: " << op_kernel_pair.first << "\n";
+    os << "- kernel name: " << op_kernel_pair.first << "\n";
     for (const auto& kernel_pair : op_kernel_pair.second) {
-      os << "\t- kernel: " << kernel_pair.first << "\n";
+      os << "\t- kernel key: " << kernel_pair.first << " | "
+         << "kernel: " << kernel_pair.second << "\n";
     }
   }
   return os;
diff --git a/paddle/tcmpt/core/kernel_factory.h b/paddle/tcmpt/core/kernel_factory.h
index ca63cfdc229f9..b381c8eb409b2 100644
--- a/paddle/tcmpt/core/kernel_factory.h
+++ b/paddle/tcmpt/core/kernel_factory.h
@@ -170,6 +170,13 @@ struct TensorArgDef {
   }
 };
 
+struct AttributeArgDef {
+  std::type_index type_index;
+
+  explicit AttributeArgDef(std::type_index type_index)
+      : type_index(type_index) {}
+};
+
 class KernelArgsDef {
  public:
   KernelArgsDef() = default;
@@ -182,18 +189,29 @@ class KernelArgsDef {
     output_defs_.emplace_back(TensorArgDef(backend, layout, dtype));
   }
 
+  void AppendAttribute(std::type_index type_index) {
+    attribute_defs_.emplace_back(AttributeArgDef(type_index));
+  }
+
   const std::vector<TensorArgDef>& input_defs() const { return input_defs_; }
 
   const std::vector<TensorArgDef>& output_defs() const { return output_defs_; }
 
+  const std::vector<AttributeArgDef>& attribute_defs() const {
+    return attribute_defs_;
+  }
+
   std::vector<TensorArgDef>& input_defs() { return input_defs_; }
 
   std::vector<TensorArgDef>& output_defs() { return output_defs_; }
 
+  std::vector<AttributeArgDef>& attribute_defs() { return attribute_defs_; }
+
  private:
   // TODO(chenweihang): replaced by paddle::small_vector
   std::vector<TensorArgDef> input_defs_{{}};
   std::vector<TensorArgDef> output_defs_{{}};
+  std::vector<AttributeArgDef> attribute_defs_{{}};
 };
 
 class Kernel {
@@ -270,6 +288,8 @@ inline std::ostream& operator<<(std::ostream& os, const KernelKey& kernel_key) {
   return os;
 }
 
+std::ostream& operator<<(std::ostream& os, const Kernel& kernel);
+
 std::ostream& operator<<(std::ostream& os, KernelFactory& kernel_factory);
 
 }  // namespace pt
diff --git a/paddle/tcmpt/core/kernel_registry.h b/paddle/tcmpt/core/kernel_registry.h
index 1aaaead43f935..f739d73d42464 100644
--- a/paddle/tcmpt/core/kernel_registry.h
+++ b/paddle/tcmpt/core/kernel_registry.h
@@ -52,8 +52,10 @@ struct KernelArgsParseFunctor<Return_ (*)(Args_...)> {
         args_def->AppendOutput(
             default_key.backend(), default_key.layout(), default_key.dtype());
       } else {
-        // TODO(chenweihang): throw argument error
-        VLOG(1) << "invalid arg";
+        // Attribute deal with
+        // TODO(chenweihang): now here allow any types of attribute, maybe
+        // should add limits here
+        args_def->AppendAttribute(arg_type);
       }
     }
   }

From 13c02aa04e969a030f17ee9402223296238e70ca Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Fri, 10 Sep 2021 02:20:01 +0000
Subject: [PATCH 048/125] polish kernel dispatch logic & nameing rule

---
 paddle/fluid/framework/operator.cc           |  45 ++++--
 paddle/fluid/imperative/prepared_operator.cc | 152 ++++++++++++-------
 paddle/tcmpt/core/kernel_def.h               |  11 ++
 paddle/tcmpt/core/kernel_factory.cc          |  29 +++-
 paddle/tcmpt/core/kernel_factory.h           |  17 ++-
 paddle/tcmpt/core/kernel_registry.h          |   1 +
 paddle/tcmpt/cpu/math.cc                     |  34 ++---
 paddle/tcmpt/cpu/math.h                      |  24 +--
 paddle/tcmpt/cuda/math.cu                    |  34 ++---
 paddle/tcmpt/cuda/math.h                     |  24 +--
 10 files changed, 235 insertions(+), 136 deletions(-)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index cb3d89d861ac6..da69a2ad60dc6 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1155,13 +1155,13 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   // TODO(chenweihang): in the first phase of project, we only support CPU, CUDA
   // and RCOM backend, the XPU, NPU and MKLDNN will be supported in the second
   // phase
-  // TODO(chenweihang): ContainsKernel need more acurrate
-  run_pt_kernel_ = pt::KernelFactory::Instance().ContainsKernel(type_.c_str());
-  if (run_pt_kernel_) {
+  if (pt::KernelFactory::Instance().ContainsKernel(type_.c_str())) {
     if (pt_kernel_key_.get() == nullptr || pt_kernel_.get() == nullptr) {
       ChoosePtKernel(*runtime_ctx, *dev_ctx);
     }
-  } else {
+    run_pt_kernel_ = pt_kernel_->IsValid();
+  }
+  if (!run_pt_kernel_) {
     if (kernel_type_.get() == nullptr || kernel_func_.get() == nullptr) {
       ChooseKernel(*runtime_ctx, scope, place);
     }
@@ -1261,7 +1261,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   }
 }
 
-bool ContainsSelectedRows(const VariableValueMap& inputs) {
+bool ContainSelectedRows(const VariableValueMap& inputs) {
   for (auto& var_pair : inputs) {
     for (auto* var : var_pair.second) {
       if (var->IsType<SelectedRows>()) {
@@ -1272,15 +1272,40 @@ bool ContainsSelectedRows(const VariableValueMap& inputs) {
   return false;
 }
 
+// TODO(chenweihang): enhance rules, not all dispensable inputs
+// are host tensor, now only for scale kernel verify
+bool ContainHostTensor(const proto::OpProto& op_proto,
+                       const VariableValueMap& inputs) {
+  for (int i = 0; i < op_proto.inputs_size(); ++i) {
+    auto in = op_proto.inputs()[i];
+    auto it = inputs.find(in.name());
+    if (it == inputs.end()) {
+      return false;
+    }
+    return it->second.empty() ? false : true;
+  }
+  return false;
+}
+
+static pt::KernelName ConstructPtKernelName(const std::string& op_type,
+                                            const proto::OpProto& op_proto,
+                                            const VariableValueMap& inputs) {
+  pt::KernelName kernel_name(op_type.c_str());
+  if (ContainSelectedRows(inputs)) {
+    kernel_name.overload_name += pt::kContainSelectedRowsSuffix;
+  }
+  if (ContainHostTensor(op_proto, inputs)) {
+    kernel_name.overload_name += pt::kContainHostTensorSuffix;
+  }
+  return kernel_name;
+}
+
 void OperatorWithKernel::ChoosePtKernel(
     const RuntimeContext& ctx, const platform::DeviceContext& dev_ctx) const {
   // 1. construct operation name
   // TODO(chenweihang): add rules for construct op name
-  pt::KernelName kernel_name(Type().c_str());
-  // TODO(chenweihang): polish judge rules
-  if (ContainsSelectedRows(ctx.inputs)) {
-    kernel_name.overload_name = "selected_rows";
-  }
+  auto kernel_name =
+      ConstructPtKernelName(Type(), *(Info().proto_), ctx.inputs);
 
   // 2. construct op kernel key
   pt_kernel_key_.reset(
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index cbf394611227e..de1a3a1ffcc0c 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -129,6 +129,48 @@ static framework::VariableValueMap BuildInputMap(
   return inputs;
 }
 
+template <typename VarType>
+bool ContainSelectedRows(const NameVarMap<VarType>& inputs) {
+  for (auto& var_pair : inputs) {
+    for (auto& var : var_pair.second) {
+      if (var->Var().template IsType<framework::SelectedRows>()) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+// TODO(chenweihang): enhance rules, not all dispensable inputs
+// are host tensor, now only for scale kernel verify
+template <typename VarType>
+bool ContainHostTensor(const framework::proto::OpProto& op_proto,
+                       const NameVarMap<VarType>& inputs) {
+  for (int i = 0; i < op_proto.inputs_size(); ++i) {
+    auto in = op_proto.inputs()[i];
+    auto it = inputs.find(in.name());
+    if (it == inputs.end()) {
+      return false;
+    }
+    return it->second.empty() ? false : true;
+  }
+  return false;
+}
+
+template <typename VarType>
+static pt::KernelName ConstructPtKernelName(
+    const std::string& op_type, const framework::proto::OpProto& op_proto,
+    const NameVarMap<VarType>& inputs) {
+  pt::KernelName kernel_name(op_type.c_str());
+  if (ContainSelectedRows<VarType>(inputs)) {
+    kernel_name.overload_name += pt::kContainSelectedRowsSuffix;
+  }
+  if (ContainHostTensor<VarType>(op_proto, inputs)) {
+    kernel_name.overload_name += pt::kContainHostTensorSuffix;
+  }
+  return kernel_name;
+}
+
 template <typename VarType>
 PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
                        const NameVarMap<VarType>& outs,
@@ -155,69 +197,69 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
 #endif
 
   // 1. get expected kernel key
-  bool run_pt_kernel =
-      pt::KernelFactory::Instance().ContainsKernel(op.Type().c_str());
-  if (run_pt_kernel) {
-    pt::KernelName op_name(op.Type().c_str());
+  if (pt::KernelFactory::Instance().ContainsKernel(op.Type().c_str())) {
+    auto kernel_name =
+        ConstructPtKernelName<VarType>(op.Type(), (*op.Info().proto_), ins);
     auto inputs = BuildInputMap<VarType>(ins);
     auto pt_kernel_key = op.ConstructPtKernelKey(inputs, place);
     auto pt_kernel =
-        pt::KernelFactory::Instance().SelectKernel(op_name, pt_kernel_key);
-    // TODO(chenweihang): using CPUKernel when miss device kernel case
-    return PreparedOp(op, ctx, pt_kernel_key, pt_kernel, dev_ctx);
-  } else {
-    auto expected_kernel_key = op.GetExpectedKernelType(
-        DygraphExecutionContext<VarType>(op, framework::Scope(), *dev_ctx, ctx,
-                                         ins, outs, attrs, default_attrs));
-    VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
-
-    // 2. check if op[type] has kernel registered.
-    auto& all_op_kernels = op.AllOpKernels();
-    auto kernels_iter = all_op_kernels.find(op.Type());
-    PADDLE_ENFORCE_NE(
-        kernels_iter, all_op_kernels.end(),
-        platform::errors::NotFound(
-            "There are no kernels which are registered in the %s operator.",
-            op.Type()));
-
-    auto& kernels = kernels_iter->second;
-    auto kernel_iter = kernels.find(expected_kernel_key);
-#ifdef PADDLE_WITH_XPU
-    if (is_xpu_place(expected_kernel_key.place_) &&
-        (kernel_iter == kernels.end() ||
-         !paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key) ||
-         paddle::platform::is_in_xpu_black_list(op.Type()))) {
-      VLOG(3) << "missing XPU kernel: " << op.Type()
-              << ", expected_kernel_key:" << expected_kernel_key
-              << ", fallbacking to CPU one!";
-      expected_kernel_key.place_ = platform::CPUPlace();
-      kernel_iter = kernels.find(expected_kernel_key);
+        pt::KernelFactory::Instance().SelectKernel(kernel_name, pt_kernel_key);
+    if (pt_kernel.IsValid()) {
+      // TODO(chenweihang): using CPUKernel when miss device kernel case
+      return PreparedOp(op, ctx, pt_kernel_key, pt_kernel, dev_ctx);
     }
+  }
+
+  auto expected_kernel_key = op.GetExpectedKernelType(
+      DygraphExecutionContext<VarType>(op, framework::Scope(), *dev_ctx, ctx,
+                                       ins, outs, attrs, default_attrs));
+  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
+
+  // 2. check if op[type] has kernel registered.
+  auto& all_op_kernels = op.AllOpKernels();
+  auto kernels_iter = all_op_kernels.find(op.Type());
+  PADDLE_ENFORCE_NE(
+      kernels_iter, all_op_kernels.end(),
+      platform::errors::NotFound(
+          "There are no kernels which are registered in the %s operator.",
+          op.Type()));
+
+  auto& kernels = kernels_iter->second;
+  auto kernel_iter = kernels.find(expected_kernel_key);
+#ifdef PADDLE_WITH_XPU
+  if (is_xpu_place(expected_kernel_key.place_) &&
+      (kernel_iter == kernels.end() ||
+       !paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key) ||
+       paddle::platform::is_in_xpu_black_list(op.Type()))) {
+    VLOG(3) << "missing XPU kernel: " << op.Type()
+            << ", expected_kernel_key:" << expected_kernel_key
+            << ", fallbacking to CPU one!";
+    expected_kernel_key.place_ = platform::CPUPlace();
+    kernel_iter = kernels.find(expected_kernel_key);
+  }
 #endif
 #ifdef PADDLE_WITH_ASCEND_CL
-    if (kernel_iter == kernels.end() &&
-        is_npu_place(expected_kernel_key.place_)) {
-      VLOG(3) << "missing NPU kernel: " << op.Type()
-              << ", expected_kernel_key:" << expected_kernel_key
-              << ", fallbacking to CPU one!";
-      expected_kernel_key.place_ = platform::CPUPlace();
-      kernel_iter = kernels.find(expected_kernel_key);
-    }
+  if (kernel_iter == kernels.end() &&
+      is_npu_place(expected_kernel_key.place_)) {
+    VLOG(3) << "missing NPU kernel: " << op.Type()
+            << ", expected_kernel_key:" << expected_kernel_key
+            << ", fallbacking to CPU one!";
+    expected_kernel_key.place_ = platform::CPUPlace();
+    kernel_iter = kernels.find(expected_kernel_key);
+  }
 #endif
-    // TODO(jiabin): Add operator.cc's line 1000 part back when we need that
-    // case
-    PADDLE_ENFORCE_NE(kernel_iter, kernels.end(),
-                      platform::errors::NotFound(
-                          "Operator %s does not have kernel for %s.", op.Type(),
-                          KernelTypeToString(expected_kernel_key)));
-
-    if (!(expected_kernel_key.place_ == place)) {
-      dev_ctx = pool.Get(expected_kernel_key.place_);
-    }
-
-    return PreparedOp(op, ctx, expected_kernel_key, kernel_iter->second,
-                      dev_ctx);
+  // TODO(jiabin): Add operator.cc's line 1000 part back when we need that
+  // case
+  PADDLE_ENFORCE_NE(kernel_iter, kernels.end(),
+                    platform::errors::NotFound(
+                        "Operator %s does not have kernel for %s.", op.Type(),
+                        KernelTypeToString(expected_kernel_key)));
+
+  if (!(expected_kernel_key.place_ == place)) {
+    dev_ctx = pool.Get(expected_kernel_key.place_);
   }
+
+  return PreparedOp(op, ctx, expected_kernel_key, kernel_iter->second, dev_ctx);
 }
 
 PreparedOp PreparedOp::Prepare(const NameVarMap<VarBase>& ins,
diff --git a/paddle/tcmpt/core/kernel_def.h b/paddle/tcmpt/core/kernel_def.h
index e0334f770bfd1..97d2721d28fd0 100644
--- a/paddle/tcmpt/core/kernel_def.h
+++ b/paddle/tcmpt/core/kernel_def.h
@@ -26,4 +26,15 @@ using KernelArgsDefFn = void (*)(Kernel* kernel);
 using KernelArgsParseFn = void (*)(const KernelKey& default_key,
                                    KernelArgsDef* args_def);
 
+// Multiple kernels of the same operation are distinguished by the difference
+// of the overload name. For the convenience of reuse, we define some overload
+// naming strings for the naming of the kernel
+
+// For kernels that contains dynamic tensor attribute and it need to be always
+// on host device, such as `ScaleTensor`
+constexpr char kContainHostTensorSuffix[] = ".host";
+
+// For kernels with SelectedRowsTensor input and output
+constexpr char kContainSelectedRowsSuffix[] = ".sr";
+
 }  // namespace pt
diff --git a/paddle/tcmpt/core/kernel_factory.cc b/paddle/tcmpt/core/kernel_factory.cc
index 6617754f6ddc8..94411ffb6ddab 100644
--- a/paddle/tcmpt/core/kernel_factory.cc
+++ b/paddle/tcmpt/core/kernel_factory.cc
@@ -29,8 +29,21 @@ bool KernelFactory::ContainsKernel(const char* kernel_name) const {
   return (iter != kernels_.end());
 }
 
-const Kernel& KernelFactory::SelectKernel(const KernelName& kernel_name,
-                                          const KernelKey& kernel_key) const {
+Kernel KernelFactory::SelectKernel(const KernelName& kernel_name,
+                                   const KernelKey& kernel_key) const {
+  auto iter = kernels_.find(kernel_name);
+  if (iter == kernels_.end()) {
+    return Kernel();
+  }
+  auto kernel_iter = iter->second.find(kernel_key);
+  if (kernel_iter == iter->second.end()) {
+    return Kernel();
+  }
+  return kernel_iter->second;
+}
+
+const Kernel& KernelFactory::SelectKernelOrThrowError(
+    const KernelName& kernel_name, const KernelKey& kernel_key) const {
   auto iter = kernels_.find(kernel_name);
   PADDLE_ENFORCE_NE(iter,
                     kernels_.end(),
@@ -49,11 +62,13 @@ const Kernel& KernelFactory::SelectKernel(const KernelName& kernel_name,
   return kernel_iter->second;
 }
 
-const Kernel& KernelFactory::SelectKernel(const KernelName& kernel_name,
-                                          Backend backend,
-                                          DataLayout layout,
-                                          DataType dtype) const {
-  return SelectKernel(kernel_name, KernelKey(backend, layout, dtype));
+const Kernel& KernelFactory::SelectKernelOrThrowError(
+    const KernelName& kernel_name,
+    Backend backend,
+    DataLayout layout,
+    DataType dtype) const {
+  return SelectKernelOrThrowError(kernel_name,
+                                  KernelKey(backend, layout, dtype));
 }
 
 std::ostream& operator<<(std::ostream& os, const Kernel& kernel) {
diff --git a/paddle/tcmpt/core/kernel_factory.h b/paddle/tcmpt/core/kernel_factory.h
index b381c8eb409b2..764ef5bda3007 100644
--- a/paddle/tcmpt/core/kernel_factory.h
+++ b/paddle/tcmpt/core/kernel_factory.h
@@ -231,6 +231,8 @@ class Kernel {
 
   TensorArgDef& OutputAt(size_t idx) { return args_def_.output_defs().at(idx); }
 
+  bool IsValid() { return fn_ != nullptr; }
+
  private:
   KernelFn fn_{nullptr};
   KernelArgsDef args_def_;
@@ -256,13 +258,16 @@ class KernelFactory {
 
   bool ContainsKernel(const char* name) const;
 
-  const Kernel& SelectKernel(const KernelName& kernel_name,
-                             const KernelKey& kernel_key) const;
+  const Kernel& SelectKernelOrThrowError(const KernelName& kernel_name,
+                                         const KernelKey& kernel_key) const;
+
+  const Kernel& SelectKernelOrThrowError(const KernelName& kernel_name,
+                                         Backend backend,
+                                         DataLayout layout,
+                                         DataType dtype) const;
 
-  const Kernel& SelectKernel(const KernelName& kernel_name,
-                             Backend backend,
-                             DataLayout layout,
-                             DataType dtype) const;
+  Kernel SelectKernel(const KernelName& kernel_name,
+                      const KernelKey& kernel_key) const;
 
  private:
   KernelFactory() = default;
diff --git a/paddle/tcmpt/core/kernel_registry.h b/paddle/tcmpt/core/kernel_registry.h
index f739d73d42464..d0f03ed5c5fe3 100644
--- a/paddle/tcmpt/core/kernel_registry.h
+++ b/paddle/tcmpt/core/kernel_registry.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <cstring>
 #include <type_traits>
 #include <typeindex>
 #include <typeinfo>
diff --git a/paddle/tcmpt/cpu/math.cc b/paddle/tcmpt/cpu/math.cc
index 5b125f92f8529..bf48ac420c80b 100644
--- a/paddle/tcmpt/cpu/math.cc
+++ b/paddle/tcmpt/cpu/math.cc
@@ -69,23 +69,23 @@ void ScaleSelectedRows(const CPUContext& dev_ctx,
 }
 
 template <typename T>
-void ScaleDynamicAttr(const CPUContext& dev_ctx,
-                      const DenseTensor& x,
-                      const DenseTensor& scale,
-                      float bias,
-                      bool bias_after_scale,
-                      DenseTensor* out) {
+void ScaleHost(const CPUContext& dev_ctx,
+               const DenseTensor& x,
+               const DenseTensor& scale,
+               float bias,
+               bool bias_after_scale,
+               DenseTensor* out) {
   module::Scale<CPUContext, T>(
       dev_ctx, x, *scale.data<float>(), bias, bias_after_scale, out);
 }
 
 template <typename T>
-void ScaleSelectedRowsDynamicAttr(const CPUContext& dev_ctx,
-                                  const SelectedRowsTensor& x,
-                                  const DenseTensor& scale,
-                                  float bias,
-                                  bool bias_after_scale,
-                                  SelectedRowsTensor* out) {
+void ScaleSelectedRowsHost(const CPUContext& dev_ctx,
+                           const SelectedRowsTensor& x,
+                           const DenseTensor& scale,
+                           float bias,
+                           bool bias_after_scale,
+                           SelectedRowsTensor* out) {
   out->set_rows(x.rows());
   out->set_height(x.height());
   Scale<T>(dev_ctx,
@@ -113,7 +113,7 @@ PT_REGISTER_KERNEL("scale",
                    int16_t,
                    int,
                    int64_t) {}
-PT_REGISTER_KERNEL("scale.selectedrows",
+PT_REGISTER_KERNEL("scale.sr",
                    CPU,
                    NCHW,
                    pt::ScaleSelectedRows,
@@ -125,10 +125,10 @@ PT_REGISTER_KERNEL("scale.selectedrows",
                    int16_t,
                    int,
                    int64_t) {}
-PT_REGISTER_KERNEL("scale.dynamic_attr",
+PT_REGISTER_KERNEL("scale.host",
                    CPU,
                    NCHW,
-                   pt::ScaleDynamicAttr,
+                   pt::ScaleHost,
                    float,
                    double,
                    bfloat16,
@@ -141,10 +141,10 @@ PT_REGISTER_KERNEL("scale.dynamic_attr",
       .SetBackend(pt::Backend::kCPU)
       .SetDataType(pt::DataType::kFLOAT32);
 }
-PT_REGISTER_KERNEL("scale.selectedrows.dynamic_attr",
+PT_REGISTER_KERNEL("scale.sr.host",
                    CPU,
                    NCHW,
-                   pt::ScaleSelectedRowsDynamicAttr,
+                   pt::ScaleSelectedRowsHost,
                    float,
                    double,
                    bfloat16,
diff --git a/paddle/tcmpt/cpu/math.h b/paddle/tcmpt/cpu/math.h
index f6e3375a98397..e0694beafe4d5 100644
--- a/paddle/tcmpt/cpu/math.h
+++ b/paddle/tcmpt/cpu/math.h
@@ -48,19 +48,19 @@ void ScaleSelectedRows(const CPUContext& dev_ctx,
                        SelectedRowsTensor* out);
 
 template <typename T>
-void ScaleDynamicAttr(const CPUContext& dev_ctx,
-                      const DenseTensor& x,
-                      const DenseTensor& scale,
-                      float bias,
-                      bool bias_after_scale,
-                      DenseTensor* out);
+void ScaleHost(const CPUContext& dev_ctx,
+               const DenseTensor& x,
+               const DenseTensor& scale,
+               float bias,
+               bool bias_after_scale,
+               DenseTensor* out);
 
 template <typename T>
-void ScaleSelectedRowsDynamicAttr(const CPUContext& dev_ctx,
-                                  const SelectedRowsTensor& x,
-                                  const DenseTensor& scale,
-                                  float bias,
-                                  bool bias_after_scale,
-                                  SelectedRowsTensor* out);
+void ScaleSelectedRowsHost(const CPUContext& dev_ctx,
+                           const SelectedRowsTensor& x,
+                           const DenseTensor& scale,
+                           float bias,
+                           bool bias_after_scale,
+                           SelectedRowsTensor* out);
 
 }  // namespace pt
diff --git a/paddle/tcmpt/cuda/math.cu b/paddle/tcmpt/cuda/math.cu
index e7325f83e6732..b8f5777ce9a7e 100644
--- a/paddle/tcmpt/cuda/math.cu
+++ b/paddle/tcmpt/cuda/math.cu
@@ -108,23 +108,23 @@ void ScaleSelectedRows(const CUDAContext& dev_ctx,
 }
 
 template <typename T>
-void ScaleDynamicAttr(const CUDAContext& dev_ctx,
-                      const DenseTensor& x,
-                      const DenseTensor& scale,
-                      float bias,
-                      bool bias_after_scale,
-                      DenseTensor* out) {
+void ScaleHost(const CUDAContext& dev_ctx,
+               const DenseTensor& x,
+               const DenseTensor& scale,
+               float bias,
+               bool bias_after_scale,
+               DenseTensor* out) {
   module::Scale<CUDAContext, T>(
       dev_ctx, x, *scale.data<float>(), bias, bias_after_scale, out);
 }
 
 template <typename T>
-void ScaleSelectedRowsDynamicAttr(const CUDAContext& dev_ctx,
-                                  const SelectedRowsTensor& x,
-                                  const DenseTensor& scale,
-                                  float bias,
-                                  bool bias_after_scale,
-                                  SelectedRowsTensor* out) {
+void ScaleSelectedRowsHost(const CUDAContext& dev_ctx,
+                           const SelectedRowsTensor& x,
+                           const DenseTensor& scale,
+                           float bias,
+                           bool bias_after_scale,
+                           SelectedRowsTensor* out) {
   out->set_rows(x.rows());
   out->set_height(x.height());
   Scale<T>(dev_ctx,
@@ -152,7 +152,7 @@ PT_REGISTER_KERNEL("scale",
                    int16_t,
                    int,
                    int64_t) {}
-PT_REGISTER_KERNEL("scale.selectedrows",
+PT_REGISTER_KERNEL("scale.sr",
                    CUDA,
                    NCHW,
                    pt::ScaleSelectedRows,
@@ -164,10 +164,10 @@ PT_REGISTER_KERNEL("scale.selectedrows",
                    int16_t,
                    int,
                    int64_t) {}
-PT_REGISTER_KERNEL("scale.dynamic_attr",
+PT_REGISTER_KERNEL("scale.host",
                    CUDA,
                    NCHW,
-                   pt::ScaleDynamicAttr,
+                   pt::ScaleHost,
                    float,
                    double,
                    float16,
@@ -180,10 +180,10 @@ PT_REGISTER_KERNEL("scale.dynamic_attr",
       .SetBackend(pt::Backend::kCPU)
       .SetDataType(pt::DataType::kFLOAT32);
 }
-PT_REGISTER_KERNEL("scale.selectedrows.dynamic_attr",
+PT_REGISTER_KERNEL("scale.sr.host",
                    CUDA,
                    NCHW,
-                   pt::ScaleSelectedRowsDynamicAttr,
+                   pt::ScaleSelectedRowsHost,
                    float,
                    double,
                    float16,
diff --git a/paddle/tcmpt/cuda/math.h b/paddle/tcmpt/cuda/math.h
index a3e4985920f24..1b221ecbaa9e2 100644
--- a/paddle/tcmpt/cuda/math.h
+++ b/paddle/tcmpt/cuda/math.h
@@ -53,20 +53,20 @@ void ScaleSelectedRows(const CUDAContext& dev_ctx,
                        SelectedRowsTensor* out);
 
 template <typename T>
-void ScaleDynamicAttr(const CUDAContext& dev_ctx,
-                      const DenseTensor& x,
-                      const DenseTensor& scale,
-                      float bias,
-                      bool bias_after_scale,
-                      DenseTensor* out);
+void ScaleHost(const CUDAContext& dev_ctx,
+               const DenseTensor& x,
+               const DenseTensor& scale,
+               float bias,
+               bool bias_after_scale,
+               DenseTensor* out);
 
 template <typename T>
-void ScaleSelectedRowsDynamicAttr(const CUDAContext& dev_ctx,
-                                  const SelectedRowsTensor& x,
-                                  const DenseTensor& scale,
-                                  float bias,
-                                  bool bias_after_scale,
-                                  SelectedRowsTensor* out);
+void ScaleSelectedRowsHost(const CUDAContext& dev_ctx,
+                           const SelectedRowsTensor& x,
+                           const DenseTensor& scale,
+                           float bias,
+                           bool bias_after_scale,
+                           SelectedRowsTensor* out);
 
 }  // namespace pt
 

From 1987ce9dd3373f798a0c0dfb22108817738309bf Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Fri, 10 Sep 2021 09:00:46 +0000
Subject: [PATCH 049/125] fix scale kernel match error

---
 paddle/fluid/framework/operator.cc           | 64 +++++++++++---
 paddle/fluid/imperative/prepared_operator.cc | 87 +++++++++++++++-----
 paddle/tcmpt/api/include/math.h              | 21 +++++
 paddle/tcmpt/core/convert_utils.cc           |  4 +
 paddle/tcmpt/core/kernel_def.h               |  4 +-
 5 files changed, 146 insertions(+), 34 deletions(-)
 create mode 100644 paddle/tcmpt/api/include/math.h

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index da69a2ad60dc6..04e95c3e945e3 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1148,9 +1148,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
 
   // TODO(chenweihang): Now we are still reusing a lot of the original fluid
   // implementation, this is a gradual replacement process
-  // TODO(chenweihang): only for debug, remove it after
-  // print all registered kernels
-  VLOG(1) << pt::KernelFactory::Instance();
 
   // TODO(chenweihang): in the first phase of project, we only support CPU, CUDA
   // and RCOM backend, the XPU, NPU and MKLDNN will be supported in the second
@@ -1290,14 +1287,17 @@ bool ContainHostTensor(const proto::OpProto& op_proto,
 static pt::KernelName ConstructPtKernelName(const std::string& op_type,
                                             const proto::OpProto& op_proto,
                                             const VariableValueMap& inputs) {
-  pt::KernelName kernel_name(op_type.c_str());
+  std::string overload_name;
   if (ContainSelectedRows(inputs)) {
-    kernel_name.overload_name += pt::kContainSelectedRowsSuffix;
+    overload_name = pt::kContainSelectedRowsSuffix;
   }
   if (ContainHostTensor(op_proto, inputs)) {
-    kernel_name.overload_name += pt::kContainHostTensorSuffix;
+    if (overload_name != "") {
+      overload_name += ".";
+    }
+    overload_name += pt::kContainHostTensorSuffix;
   }
-  return kernel_name;
+  return pt::KernelName(op_type, overload_name);
 }
 
 void OperatorWithKernel::ChoosePtKernel(
@@ -1314,6 +1314,11 @@ void OperatorWithKernel::ChoosePtKernel(
   // 3. selecte op kernel
   pt_kernel_.reset(new pt::Kernel(pt::KernelFactory::Instance().SelectKernel(
       kernel_name, *pt_kernel_key_)));
+
+  // for debug
+  VLOG(1) << "ChoosePtKernel - kernel name: " << kernel_name
+          << " | kernel key: " << *pt_kernel_key_
+          << " | kernel: " << *pt_kernel_;
 }
 
 void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
@@ -1875,17 +1880,38 @@ pt::KernelContext OperatorWithKernel::ConstructPtKernelContext(
   // If we the VariableValueMap are ordered, we can get tensor by iter the map,
   // and its order is same as OpProto
 
+  // TODO(chenweihang): For scale op, when the input has a `ScaleTensor`,
+  // the following scale attribute should be skipped, and there are many
+  // such ops, which require certain rules to process, now only for verify
+  // scale op
+  std::unordered_map<std::string, bool> contain_host_tensor_flags{
+      {"ScaleTensor", false}};
+  std::unordered_map<std::string, std::string> attr_to_host_tensor{
+      {"scale", "ScaleTensor"}};
+
   auto* op_proto = Info().proto_;
   for (int i = 0; i < op_proto->inputs_size(); ++i) {
     auto in = op_proto->inputs()[i];
     // TODO(chenweihang): skip special cases temporarily
     // TODO(chenweihang): deal with diff param in vector
-    if ((in.has_dispensable() && in.dispensable()) ||
-        (in.has_extra() && in.extra()) || (in.has_quant() && in.quant())) {
-      VLOG(1) << "BuildKernelContext: skip dispensable input - " << in.name();
+    if ((in.has_extra() && in.extra()) || (in.has_quant() && in.quant())) {
+      VLOG(1) << "Static graph PtKernel input: skip extra & quant input - "
+              << in.name();
       continue;
     }
     auto in_name = in.name();
+    if (in.has_dispensable() && in.dispensable()) {
+      if (contain_host_tensor_flags.count(in_name) > 0 &&
+          ctx.inputs.count(in_name) > 0 && ctx.inputs.at(in_name).size() > 0) {
+        VLOG(1) << "Static graph PtKernel input: contain host input - "
+                << in_name;
+        contain_host_tensor_flags[in_name] = true;
+      } else {
+        VLOG(1) << "Static graph PtKernel input: skip dispensable input - "
+                << in_name;
+        continue;
+      }
+    }
     VLOG(1) << "Static graph PtKernel input: " << in_name;
     auto in_def = input_defs.at(i);
     for (auto* var : ctx.inputs.at(in_name)) {
@@ -1938,14 +1964,26 @@ pt::KernelContext OperatorWithKernel::ConstructPtKernelContext(
   }
   for (int i = 0; i < op_proto->attrs_size(); ++i) {
     auto attr = op_proto->attrs()[i];
+    if (attr.name() == "use_mkldnn" || attr.name() == "op_role" ||
+        attr.name() == "op_role_var" || attr.name() == "op_namescope" ||
+        attr.name() == "op_callstack" || attr.name() == "op_device") {
+      VLOG(1) << "Static graph PtKernel attribute: skip needless attr - "
+              << attr.name();
+      continue;
+    }
     VLOG(1) << "Static graph PtKernel attribute: " << attr.name();
     if ((attr.has_extra() && attr.extra()) ||
         (attr.has_quant() && attr.quant())) {
+      VLOG(1) << "Static graph PtKernel attribute: skip extra or quant attr - "
+              << attr.name();
       continue;
     }
-    if (attr.name() == "use_mkldnn" || attr.name() == "op_role" ||
-        attr.name() == "op_role_var" || attr.name() == "op_namescope" ||
-        attr.name() == "op_callstack" || attr.name() == "op_device") {
+    if (attr_to_host_tensor.count(attr.name()) > 0 &&
+        contain_host_tensor_flags.at(attr_to_host_tensor.at(attr.name())) ==
+            true) {
+      VLOG(1) << "Static graph PtKernel attribute: skip dynaimc attr - "
+              << attr.name() << ", because "
+              << attr_to_host_tensor.at(attr.name()) << " exists.";
       continue;
     }
     // TODO(chenweihang): support other attrs
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index de1a3a1ffcc0c..b87ec99b9c73e 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -48,11 +48,18 @@ const framework::Tensor* GetTensorFromVar(const framework::Variable& var) {
 
 template <typename T>
 static const T& GetAttr(const framework::AttributeMap& attrs,
+                        const framework::AttributeMap& default_attrs,
                         const std::string& name) {
-  PADDLE_ENFORCE_NE(
-      attrs.find(name), attrs.end(),
+  auto it = attrs.find(name);
+  bool found = it != attrs.end();
+  if (!found) {
+    it = default_attrs.find(name);
+    found = it != default_attrs.end();
+  }
+  PADDLE_ENFORCE_EQ(
+      found, true,
       platform::errors::NotFound("(%s) is not found in AttributeMap.", name));
-  return BOOST_GET_CONST(T, attrs.at(name));
+  return BOOST_GET_CONST(T, it->second);
 }
 
 template <typename VarType>
@@ -161,14 +168,17 @@ template <typename VarType>
 static pt::KernelName ConstructPtKernelName(
     const std::string& op_type, const framework::proto::OpProto& op_proto,
     const NameVarMap<VarType>& inputs) {
-  pt::KernelName kernel_name(op_type.c_str());
+  std::string overload_name;
   if (ContainSelectedRows<VarType>(inputs)) {
-    kernel_name.overload_name += pt::kContainSelectedRowsSuffix;
+    overload_name = pt::kContainSelectedRowsSuffix;
   }
   if (ContainHostTensor<VarType>(op_proto, inputs)) {
-    kernel_name.overload_name += pt::kContainHostTensorSuffix;
+    if (overload_name != "") {
+      overload_name += ".";
+    }
+    overload_name += pt::kContainHostTensorSuffix;
   }
-  return kernel_name;
+  return pt::KernelName(op_type, overload_name);
 }
 
 template <typename VarType>
@@ -204,6 +214,9 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
     auto pt_kernel_key = op.ConstructPtKernelKey(inputs, place);
     auto pt_kernel =
         pt::KernelFactory::Instance().SelectKernel(kernel_name, pt_kernel_key);
+    // for debug
+    VLOG(1) << "PrepareImpl - kernel name: " << kernel_name
+            << " | kernel key: " << pt_kernel_key << " | kernel: " << pt_kernel;
     if (pt_kernel.IsValid()) {
       // TODO(chenweihang): using CPUKernel when miss device kernel case
       return PreparedOp(op, ctx, pt_kernel_key, pt_kernel, dev_ctx);
@@ -286,6 +299,7 @@ static pt::KernelContext BuildDygraphKernelContext(
     const pt::Kernel& pt_kernel, const framework::proto::OpProto& op_proto,
     const NameVarMap<VarType>& ins, const NameVarMap<VarType>& outs,
     const framework::AttributeMap& attrs,
+    const framework::AttributeMap& default_attrs,
     const platform::DeviceContext& dev_ctx) {
   // TODO(chenweihang): now only work for very simple case (sign op),
   // many cases need to be deal with later:
@@ -298,16 +312,35 @@ static pt::KernelContext BuildDygraphKernelContext(
   auto input_defs = pt_kernel.args_def().input_defs();
   auto output_defs = pt_kernel.args_def().output_defs();
 
+  // TODO(chenweihang): For scale op, when the input has a `ScaleTensor`,
+  // the following scale attribute should be skipped, and there are many
+  // such ops, which require certain rules to process, now only for verify
+  // scale op
+  std::unordered_map<std::string, bool> contain_host_tensor_flags{
+      {"ScaleTensor", false}};
+  std::unordered_map<std::string, std::string> attr_to_host_tensor{
+      {"scale", "ScaleTensor"}};
+
   for (int i = 0; i < op_proto.inputs_size(); ++i) {
     auto in = op_proto.inputs()[i];
     // TODO(chenweihang): deal with diff param in vector
-    if ((in.has_dispensable() && in.dispensable()) ||
-        (in.has_extra() && in.extra()) || (in.has_quant() && in.quant())) {
-      VLOG(1) << "BuildDygraphKernelContext: skip dispensable input - "
+    if ((in.has_extra() && in.extra()) || (in.has_quant() && in.quant())) {
+      VLOG(1) << "Dygraph PtKernel input: skip extra & quant input - "
               << in.name();
       continue;
     }
     auto in_name = in.name();
+    if (in.has_dispensable() && in.dispensable()) {
+      if (contain_host_tensor_flags.count(in_name) > 0 &&
+          ins.count(in_name) > 0 && ins.at(in_name).size() > 0) {
+        VLOG(1) << "Dygraph PtKernel input: contain host input - " << in_name;
+        contain_host_tensor_flags[in_name] = true;
+      } else {
+        VLOG(1) << "Dygraph PtKernel input: skip dispensable input - "
+                << in_name;
+        continue;
+      }
+    }
     VLOG(1) << "Dygraph PtKernel input: " << in_name;
     auto in_def = input_defs.at(i);
     for (auto var : ins.at(in_name)) {
@@ -369,28 +402,43 @@ static pt::KernelContext BuildDygraphKernelContext(
   for (int i = 0; i < op_proto.attrs_size(); ++i) {
     auto attr = op_proto.attrs()[i];
     VLOG(1) << "Dygraph PtKernel attribute: " << attr.name();
+    if (attr.name() == "use_mkldnn" || attr.name() == "op_role" ||
+        attr.name() == "op_role_var" || attr.name() == "op_namescope" ||
+        attr.name() == "op_callstack" || attr.name() == "op_device") {
+      VLOG(1) << "Dygraph PtKernel attribute: skip needless attr - "
+              << attr.name();
+      continue;
+    }
     if ((attr.has_extra() && attr.extra()) ||
         (attr.has_quant() && attr.quant())) {
+      VLOG(1) << "Dygraph PtKernel attribute: skip extra & quant attr - "
+              << attr.name();
       continue;
     }
-    if (attr.name() == "use_mkldnn" || attr.name() == "op_role" ||
-        attr.name() == "op_role_var" || attr.name() == "op_namescope" ||
-        attr.name() == "op_callstack" || attr.name() == "op_device") {
+    if (attr_to_host_tensor.count(attr.name()) > 0 &&
+        contain_host_tensor_flags.at(attr_to_host_tensor.at(attr.name())) ==
+            true) {
+      VLOG(1) << "Dygraph PtKernel attribute: skip dynaimc attr - "
+              << attr.name() << ", because "
+              << attr_to_host_tensor.at(attr.name()) << " exists.";
       continue;
     }
     // TODO(chenweihang): support other attrs
     // In principle, the attr required by the dynamic mode should be
     // passed in from the Python side, and there is no need to look up
-    // from the default_map
+    // from the default_map, but now this nor work
     switch (attr.type()) {
       case framework::proto::AttrType::INT:
-        op_kernel_ctx.EmplaceBackAttr(GetAttr<int>(attrs, attr.name()));
+        op_kernel_ctx.EmplaceBackAttr(
+            GetAttr<int>(attrs, default_attrs, attr.name()));
         break;
       case framework::proto::AttrType::FLOAT:
-        op_kernel_ctx.EmplaceBackAttr(GetAttr<float>(attrs, attr.name()));
+        op_kernel_ctx.EmplaceBackAttr(
+            GetAttr<float>(attrs, default_attrs, attr.name()));
         break;
       case framework::proto::AttrType::BOOLEAN:
-        op_kernel_ctx.EmplaceBackAttr(GetAttr<bool>(attrs, attr.name()));
+        op_kernel_ctx.EmplaceBackAttr(
+            GetAttr<bool>(attrs, default_attrs, attr.name()));
         break;
       default:
         // TODO(chenweihang): support other attrs type
@@ -459,8 +507,9 @@ static void PreparedOpRunPtImpl(const framework::OperatorBase& op,
   static_cast<const framework::OperatorWithKernel&>(op).InferShape(
       &infer_shape_ctx);
 
-  auto op_kernel_ctx = BuildDygraphKernelContext<VarType>(
-      pt_kernel, *(op.Info().proto_), ins, outs, attrs, *dev_ctx);
+  auto op_kernel_ctx =
+      BuildDygraphKernelContext<VarType>(pt_kernel, *(op.Info().proto_), ins,
+                                         outs, attrs, default_attrs, *dev_ctx);
   pt_kernel(&op_kernel_ctx);
 
   // TODO(chenweihang): add flags
diff --git a/paddle/tcmpt/api/include/math.h b/paddle/tcmpt/api/include/math.h
new file mode 100644
index 0000000000000..aab65f5e8345d
--- /dev/null
+++ b/paddle/tcmpt/api/include/math.h
@@ -0,0 +1,21 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+namespace pt {
+
+Tensor sign(const Tensor& x);
+
+}  // namespace pt
diff --git a/paddle/tcmpt/core/convert_utils.cc b/paddle/tcmpt/core/convert_utils.cc
index 9ad98d3d910b2..e994b8835fa2b 100644
--- a/paddle/tcmpt/core/convert_utils.cc
+++ b/paddle/tcmpt/core/convert_utils.cc
@@ -60,6 +60,8 @@ pt::DataType TransToPtDataType(
       return DataType::kCOMPLEX128;
     case paddle::framework::proto::VarType::FP16:
       return DataType::kFLOAT16;
+    case paddle::framework::proto::VarType::BF16:
+      return DataType::kBFLOAT16;
     case paddle::framework::proto::VarType::BOOL:
       return DataType::kBOOL;
     default:
@@ -129,6 +131,8 @@ paddle::framework::proto::VarType::Type TransToProtoVarType(
       return paddle::framework::proto::VarType::COMPLEX128;
     case DataType::kFLOAT16:
       return paddle::framework::proto::VarType::FP16;
+    case DataType::kBFLOAT16:
+      return paddle::framework::proto::VarType::BF16;
     case DataType::kBOOL:
       return paddle::framework::proto::VarType::BOOL;
     default:
diff --git a/paddle/tcmpt/core/kernel_def.h b/paddle/tcmpt/core/kernel_def.h
index 97d2721d28fd0..073d57269c321 100644
--- a/paddle/tcmpt/core/kernel_def.h
+++ b/paddle/tcmpt/core/kernel_def.h
@@ -32,9 +32,9 @@ using KernelArgsParseFn = void (*)(const KernelKey& default_key,
 
 // For kernels that contains dynamic tensor attribute and it need to be always
 // on host device, such as `ScaleTensor`
-constexpr char kContainHostTensorSuffix[] = ".host";
+constexpr char kContainHostTensorSuffix[] = "host";
 
 // For kernels with SelectedRowsTensor input and output
-constexpr char kContainSelectedRowsSuffix[] = ".sr";
+constexpr char kContainSelectedRowsSuffix[] = "sr";
 
 }  // namespace pt

From 33a4c41ba644f4a109f6d91825b8e0ee03299b36 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Fri, 10 Sep 2021 15:10:34 +0000
Subject: [PATCH 050/125] fix scale test failed

---
 paddle/fluid/framework/operator.cc            | 67 ++++++++++++++----
 paddle/fluid/imperative/prepared_operator.cc  | 69 +++++++++++++-----
 paddle/tcmpt/api/include/math.h               |  2 +
 paddle/tcmpt/api/include/tensor.h             | 21 ++----
 paddle/tcmpt/api/src/math.cc                  | 17 +++++
 paddle/tcmpt/core/kernel_factory.cc           | 10 ++-
 paddle/tcmpt/core/kernel_factory.h            | 70 +++++++++++--------
 paddle/tcmpt/core/kernel_registry.h           |  8 ++-
 paddle/tcmpt/cpu/math.cc                      | 20 +++---
 paddle/tcmpt/cuda/math.cu                     | 21 +++---
 .../fluid/tests/unittests/test_scale_op.py    |  4 +-
 11 files changed, 205 insertions(+), 104 deletions(-)
 create mode 100644 paddle/tcmpt/api/src/math.cc

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 04e95c3e945e3..7a91581d9fe3b 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1258,7 +1258,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   }
 }
 
-bool ContainSelectedRows(const VariableValueMap& inputs) {
+static bool ContainSelectedRows(const VariableValueMap& inputs) {
   for (auto& var_pair : inputs) {
     for (auto* var : var_pair.second) {
       if (var->IsType<SelectedRows>()) {
@@ -1269,17 +1269,26 @@ bool ContainSelectedRows(const VariableValueMap& inputs) {
   return false;
 }
 
+// TODO(chenweihang): now only check single var input
+static bool IsValidVar(const std::string& name,
+                       const VariableValueMap& inputs) {
+  auto it = inputs.find(name);
+  if (it == inputs.end()) {
+    return false;
+  }
+  auto* var = it->second.empty() ? nullptr : it->second[0];
+  return var != nullptr;
+}
+
 // TODO(chenweihang): enhance rules, not all dispensable inputs
 // are host tensor, now only for scale kernel verify
-bool ContainHostTensor(const proto::OpProto& op_proto,
-                       const VariableValueMap& inputs) {
+static bool ContainHostTensor(const proto::OpProto& op_proto,
+                              const VariableValueMap& inputs) {
   for (int i = 0; i < op_proto.inputs_size(); ++i) {
     auto in = op_proto.inputs()[i];
-    auto it = inputs.find(in.name());
-    if (it == inputs.end()) {
-      return false;
+    if (in.has_dispensable() && in.dispensable()) {
+      return IsValidVar(in.name(), inputs);
     }
-    return it->second.empty() ? false : true;
   }
   return false;
 }
@@ -1316,6 +1325,7 @@ void OperatorWithKernel::ChoosePtKernel(
       kernel_name, *pt_kernel_key_)));
 
   // for debug
+  // VLOG(1) << pt::KernelFactory::Instance();
   VLOG(1) << "ChoosePtKernel - kernel name: " << kernel_name
           << " | kernel key: " << *pt_kernel_key_
           << " | kernel: " << *pt_kernel_;
@@ -1861,6 +1871,7 @@ pt::KernelKey OperatorWithKernel::ConstructPtKernelKey(
   return pt::KernelKey(backend, layout, dtype);
 }
 
+// TODO(chenweihang): This function is too complicated and needs to be split
 pt::KernelContext OperatorWithKernel::ConstructPtKernelContext(
     const RuntimeContext& ctx, const platform::DeviceContext& dev_ctx) const {
   VLOG(1) << RuntimeContextDebugString(ctx);
@@ -1902,7 +1913,7 @@ pt::KernelContext OperatorWithKernel::ConstructPtKernelContext(
     auto in_name = in.name();
     if (in.has_dispensable() && in.dispensable()) {
       if (contain_host_tensor_flags.count(in_name) > 0 &&
-          ctx.inputs.count(in_name) > 0 && ctx.inputs.at(in_name).size() > 0) {
+          IsValidVar(in_name, ctx.inputs)) {
         VLOG(1) << "Static graph PtKernel input: contain host input - "
                 << in_name;
         contain_host_tensor_flags[in_name] = true;
@@ -1914,17 +1925,43 @@ pt::KernelContext OperatorWithKernel::ConstructPtKernelContext(
     }
     VLOG(1) << "Static graph PtKernel input: " << in_name;
     auto in_def = input_defs.at(i);
+    VLOG(1) << "in_def: " << in_def.backend << ", " << in_def.dtype << ", "
+            << in_def.layout;
+    // TODO(chenweihang): input need to be transformed by in all define
+    auto expected_place = pt::TransToFluidPlace(in_def.backend);
+    VLOG(1) << "expected_place: " << expected_place;
     for (auto* var : ctx.inputs.at(in_name)) {
       if (var->IsType<LoDTensor>()) {
+        VLOG(1) << "var is LoDTensor";
         const auto& tensor = var->Get<LoDTensor>();
-        auto pt_in = MakeTensorImpl<pt::DenseTensor, LoDTensor>(
-            tensor, in_def.backend, in_def.dtype, in_def.layout);
-        op_kernel_ctx.EmplaceBackInput(pt_in);
+        if (!platform::is_same_place(tensor.place(), expected_place)) {
+          VLOG(1) << "var place is mismatch.";
+          LoDTensor tmp_tensor;
+          TensorCopySync(tensor, expected_place, &tmp_tensor);
+          auto pt_in = MakeTensorImpl<pt::DenseTensor, LoDTensor>(
+              tmp_tensor, in_def.backend, in_def.dtype, in_def.layout);
+          op_kernel_ctx.EmplaceBackInput(pt_in);
+        } else {
+          auto pt_in = MakeTensorImpl<pt::DenseTensor, LoDTensor>(
+              tensor, in_def.backend, in_def.dtype, in_def.layout);
+          op_kernel_ctx.EmplaceBackInput(pt_in);
+        }
       } else if (var->IsType<SelectedRows>()) {
         const auto& tensor = var->Get<SelectedRows>();
-        auto pt_in = MakeTensorImpl<pt::SelectedRowsTensor, SelectedRows>(
-            tensor, in_def.backend, in_def.dtype, in_def.layout);
-        op_kernel_ctx.EmplaceBackInput(pt_in);
+        if (!platform::is_same_place(tensor.value().place(), expected_place)) {
+          SelectedRows tmp_tensor;
+          tmp_tensor.set_rows(tensor.rows());
+          tmp_tensor.set_height(tensor.height());
+          TensorCopySync(tensor.value(), expected_place,
+                         tmp_tensor.mutable_value());
+          auto pt_in = MakeTensorImpl<pt::SelectedRowsTensor, SelectedRows>(
+              tmp_tensor, in_def.backend, in_def.dtype, in_def.layout);
+          op_kernel_ctx.EmplaceBackInput(pt_in);
+        } else {
+          auto pt_in = MakeTensorImpl<pt::SelectedRowsTensor, SelectedRows>(
+              tensor, in_def.backend, in_def.dtype, in_def.layout);
+          op_kernel_ctx.EmplaceBackInput(pt_in);
+        }
       } else {
         PADDLE_THROW(platform::errors::Unimplemented(
             "Unsupported shared input `%s` type now when call pt kernel.",
@@ -1971,7 +2008,6 @@ pt::KernelContext OperatorWithKernel::ConstructPtKernelContext(
               << attr.name();
       continue;
     }
-    VLOG(1) << "Static graph PtKernel attribute: " << attr.name();
     if ((attr.has_extra() && attr.extra()) ||
         (attr.has_quant() && attr.quant())) {
       VLOG(1) << "Static graph PtKernel attribute: skip extra or quant attr - "
@@ -1986,6 +2022,7 @@ pt::KernelContext OperatorWithKernel::ConstructPtKernelContext(
               << attr_to_host_tensor.at(attr.name()) << " exists.";
       continue;
     }
+    VLOG(1) << "Static graph PtKernel attribute: " << attr.name();
     // TODO(chenweihang): support other attrs
     switch (attr.type()) {
       case proto::AttrType::INT:
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index b87ec99b9c73e..29a1476662ce8 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -137,7 +137,7 @@ static framework::VariableValueMap BuildInputMap(
 }
 
 template <typename VarType>
-bool ContainSelectedRows(const NameVarMap<VarType>& inputs) {
+static bool ContainSelectedRows(const NameVarMap<VarType>& inputs) {
   for (auto& var_pair : inputs) {
     for (auto& var : var_pair.second) {
       if (var->Var().template IsType<framework::SelectedRows>()) {
@@ -148,18 +148,30 @@ bool ContainSelectedRows(const NameVarMap<VarType>& inputs) {
   return false;
 }
 
+// TODO(chenweihang): now only check single var input
+template <typename VarType>
+static bool IsValidVar(const std::string& name,
+                       const NameVarMap<VarType>& inputs) {
+  auto it = inputs.find(name);
+  if (it == inputs.end()) {
+    return false;
+  }
+  if (it->second.empty()) {
+    return false;
+  }
+  return it->second[0] != nullptr;
+}
+
 // TODO(chenweihang): enhance rules, not all dispensable inputs
 // are host tensor, now only for scale kernel verify
 template <typename VarType>
-bool ContainHostTensor(const framework::proto::OpProto& op_proto,
-                       const NameVarMap<VarType>& inputs) {
+static bool ContainHostTensor(const framework::proto::OpProto& op_proto,
+                              const NameVarMap<VarType>& inputs) {
   for (int i = 0; i < op_proto.inputs_size(); ++i) {
     auto in = op_proto.inputs()[i];
-    auto it = inputs.find(in.name());
-    if (it == inputs.end()) {
-      return false;
+    if (in.has_dispensable() && in.dispensable()) {
+      return IsValidVar<VarType>(in.name(), inputs);
     }
-    return it->second.empty() ? false : true;
   }
   return false;
 }
@@ -294,6 +306,7 @@ PreparedOp PreparedOp::Prepare(const NameVarMap<VariableWrapper>& ins,
                                       default_attrs);
 }
 
+// TODO(chenweihang): This function is too complicated and needs to be split
 template <typename VarType>
 static pt::KernelContext BuildDygraphKernelContext(
     const pt::Kernel& pt_kernel, const framework::proto::OpProto& op_proto,
@@ -332,7 +345,7 @@ static pt::KernelContext BuildDygraphKernelContext(
     auto in_name = in.name();
     if (in.has_dispensable() && in.dispensable()) {
       if (contain_host_tensor_flags.count(in_name) > 0 &&
-          ins.count(in_name) > 0 && ins.at(in_name).size() > 0) {
+          IsValidVar<VarType>(in_name, ins)) {
         VLOG(1) << "Dygraph PtKernel input: contain host input - " << in_name;
         contain_host_tensor_flags[in_name] = true;
       } else {
@@ -343,20 +356,42 @@ static pt::KernelContext BuildDygraphKernelContext(
     }
     VLOG(1) << "Dygraph PtKernel input: " << in_name;
     auto in_def = input_defs.at(i);
+    auto expected_place = pt::TransToFluidPlace(in_def.backend);
     for (auto var : ins.at(in_name)) {
       const auto& variable = var->Var();
       if (variable.template IsType<framework::LoDTensor>()) {
         const auto& tensor = variable.template Get<framework::LoDTensor>();
-        auto pt_in =
-            framework::MakeTensorImpl<pt::DenseTensor, framework::LoDTensor>(
-                tensor, in_def.backend, in_def.dtype, in_def.layout);
-        op_kernel_ctx.EmplaceBackInput(pt_in);
+        if (!platform::is_same_place(tensor.place(), expected_place)) {
+          framework::LoDTensor tmp_tensor;
+          framework::TensorCopySync(tensor, expected_place, &tmp_tensor);
+          auto pt_in =
+              framework::MakeTensorImpl<pt::DenseTensor, framework::LoDTensor>(
+                  tmp_tensor, in_def.backend, in_def.dtype, in_def.layout);
+          op_kernel_ctx.EmplaceBackInput(pt_in);
+        } else {
+          auto pt_in =
+              framework::MakeTensorImpl<pt::DenseTensor, framework::LoDTensor>(
+                  tensor, in_def.backend, in_def.dtype, in_def.layout);
+          op_kernel_ctx.EmplaceBackInput(pt_in);
+        }
       } else if (variable.template IsType<framework::SelectedRows>()) {
         const auto& tensor = variable.template Get<framework::SelectedRows>();
-        auto pt_in = framework::MakeTensorImpl<pt::SelectedRowsTensor,
-                                               framework::SelectedRows>(
-            tensor, in_def.backend, in_def.dtype, in_def.layout);
-        op_kernel_ctx.EmplaceBackInput(pt_in);
+        if (!platform::is_same_place(tensor.value().place(), expected_place)) {
+          framework::SelectedRows tmp_tensor;
+          tmp_tensor.set_rows(tensor.rows());
+          tmp_tensor.set_height(tensor.height());
+          TensorCopySync(tensor.value(), expected_place,
+                         tmp_tensor.mutable_value());
+          auto pt_in = framework::MakeTensorImpl<pt::SelectedRowsTensor,
+                                                 framework::SelectedRows>(
+              tmp_tensor, in_def.backend, in_def.dtype, in_def.layout);
+          op_kernel_ctx.EmplaceBackInput(pt_in);
+        } else {
+          auto pt_in = framework::MakeTensorImpl<pt::SelectedRowsTensor,
+                                                 framework::SelectedRows>(
+              tensor, in_def.backend, in_def.dtype, in_def.layout);
+          op_kernel_ctx.EmplaceBackInput(pt_in);
+        }
       } else {
         PADDLE_THROW(platform::errors::Unimplemented(
             "Unsupported shared input `%s` type now when call pt kernel.",
@@ -401,7 +436,6 @@ static pt::KernelContext BuildDygraphKernelContext(
 
   for (int i = 0; i < op_proto.attrs_size(); ++i) {
     auto attr = op_proto.attrs()[i];
-    VLOG(1) << "Dygraph PtKernel attribute: " << attr.name();
     if (attr.name() == "use_mkldnn" || attr.name() == "op_role" ||
         attr.name() == "op_role_var" || attr.name() == "op_namescope" ||
         attr.name() == "op_callstack" || attr.name() == "op_device") {
@@ -423,6 +457,7 @@ static pt::KernelContext BuildDygraphKernelContext(
               << attr_to_host_tensor.at(attr.name()) << " exists.";
       continue;
     }
+    VLOG(1) << "Dygraph PtKernel attribute: " << attr.name();
     // TODO(chenweihang): support other attrs
     // In principle, the attr required by the dynamic mode should be
     // passed in from the Python side, and there is no need to look up
diff --git a/paddle/tcmpt/api/include/math.h b/paddle/tcmpt/api/include/math.h
index aab65f5e8345d..cf7a769f67493 100644
--- a/paddle/tcmpt/api/include/math.h
+++ b/paddle/tcmpt/api/include/math.h
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/tcmpt/api/include/tensor.h"
+
 namespace pt {
 
 Tensor sign(const Tensor& x);
diff --git a/paddle/tcmpt/api/include/tensor.h b/paddle/tcmpt/api/include/tensor.h
index 79d2183ee58b3..e867d1ae507ae 100644
--- a/paddle/tcmpt/api/include/tensor.h
+++ b/paddle/tcmpt/api/include/tensor.h
@@ -45,11 +45,10 @@ namespace pt {
 
 class Tensor;
 
-class AutogradMetaInterface {
+class AbstractAutogradMeta {
  public:
-  virtual const Tensor& grad() const = 0;
-  virtual ~AutogradMetaInterface() = 0;
-  // TODO(yangjiabin): design other methods
+  // No AbstractAutogradMeta should be created
+  virtual ~AbstractAutogradMeta() {}
 };
 
 /**
@@ -135,23 +134,11 @@ class Tensor final {
    */
   Place place() const { return impl_->place(); }
 
-  /**
-   * @description: Convert the current Tensor to a Tensor of
-   *               a specific data type for a specific device
-   * @param {const} Backend
-   * @param {const} DataType
-   * @return {*}
-   */
-  // Tensor to(const Backend& backend, const DataType& dtype) {
-  //   // TODO(chenweihang): use kernels to impl later
-  // }
-
   /**
    * Backend judgment APIs, shield the concept of Backend.
    */
-  // TODO(chenweihang): impl later
   bool is_cpu() const { return impl_->backend() == Backend::kCPU; }
-  bool is_cuda() const;
+  bool is_cuda() const { return impl_->backend() == Backend::kCUDA; }
   bool is_hip() const;
   bool is_xpu() const;
   bool is_npu() const;
diff --git a/paddle/tcmpt/api/src/math.cc b/paddle/tcmpt/api/src/math.cc
new file mode 100644
index 0000000000000..78bf8394ae96e
--- /dev/null
+++ b/paddle/tcmpt/api/src/math.cc
@@ -0,0 +1,17 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/tcmpt/api/include/math.h"
+
+namespace pt {}  // namespace pt
diff --git a/paddle/tcmpt/core/kernel_factory.cc b/paddle/tcmpt/core/kernel_factory.cc
index 94411ffb6ddab..3c6daaa776742 100644
--- a/paddle/tcmpt/core/kernel_factory.cc
+++ b/paddle/tcmpt/core/kernel_factory.cc
@@ -25,7 +25,7 @@ KernelFactory& KernelFactory::Instance() {
 }
 
 bool KernelFactory::ContainsKernel(const char* kernel_name) const {
-  auto iter = kernels_.find(KernelName(kernel_name));
+  auto iter = kernels_.find(KernelName(kernel_name, ""));
   return (iter != kernels_.end());
 }
 
@@ -72,8 +72,12 @@ const Kernel& KernelFactory::SelectKernelOrThrowError(
 }
 
 std::ostream& operator<<(std::ostream& os, const Kernel& kernel) {
-  os << "InputNum(" << kernel.args_def().input_defs().size()
-     << "), AttributeNum(" << kernel.args_def().attribute_defs().size()
+  os << "InputNum(" << kernel.args_def().input_defs().size() << "): [";
+  for (auto& in_def : kernel.args_def().input_defs()) {
+    os << "<" << in_def.backend << ", " << in_def.layout << ", " << in_def.dtype
+       << ">";
+  }
+  os << "]), AttributeNum(" << kernel.args_def().attribute_defs().size()
      << "), OutputNum(" << kernel.args_def().output_defs().size() << ")";
   return os;
 }
diff --git a/paddle/tcmpt/core/kernel_factory.h b/paddle/tcmpt/core/kernel_factory.h
index 764ef5bda3007..af1afdf0610d7 100644
--- a/paddle/tcmpt/core/kernel_factory.h
+++ b/paddle/tcmpt/core/kernel_factory.h
@@ -44,50 +44,58 @@ class KernelContext;
 
 using KernelFn = void (*)(KernelContext* ctx);
 
-struct KernelName final {
-  // TODO(chenweihang): use string_view later?
-  std::string name;
-  std::string overload_name;
-  // Avoid calculating Hash value at runtime
-  size_t hash_value;
-
+class KernelName final {
+ public:
   KernelName(std::string name, std::string overload_name)
-      : name(std::move(name)), overload_name(std::move(overload_name)) {
-    hash_value = std::hash<std::string>()(name) ^
-                 (std::hash<std::string>()(overload_name) << 1);
+      : name_(std::move(name)), overload_name_(std::move(overload_name)) {
+    hash_value_ = std::hash<std::string>()(name_) ^
+                  (std::hash<std::string>()(overload_name_) << 1);
   }
 
   KernelName(const char* kernel_name) {
     std::string kernel_name_str(kernel_name);
     size_t pos = kernel_name_str.find_first_of('.');
     if (pos == std::string::npos) {
-      name = kernel_name_str;
-      overload_name = "";
+      name_ = kernel_name_str;
+      overload_name_ = "";
     } else {
-      name = kernel_name_str.substr(0, pos);
-      overload_name = kernel_name_str.substr(pos + 1, kernel_name_str.size());
+      name_ = kernel_name_str.substr(0, pos);
+      overload_name_ = kernel_name_str.substr(pos + 1, kernel_name_str.size());
     }
-    hash_value = std::hash<std::string>()(name) ^
-                 (std::hash<std::string>()(overload_name) << 1);
+    hash_value_ = std::hash<std::string>()(name_) ^
+                  (std::hash<std::string>()(overload_name_) << 1);
   }
 
+  const std::string& name() const { return name_; }
+  const std::string& overload_name() const { return overload_name_; }
+  size_t hash_value() const { return hash_value_; }
+
   struct Hash {
     size_t operator()(const KernelName& kernel_name) const {
-      return kernel_name.hash_value;
+      return kernel_name.hash_value();
     }
   };
 
   bool operator<(const KernelName& kernel_name) const {
-    return hash_value < kernel_name.hash_value;
+    return hash_value_ < kernel_name.hash_value();
   }
 
   bool operator==(const KernelName& kernel_name) const {
-    return hash_value == kernel_name.hash_value;
+    return hash_value_ == kernel_name.hash_value();
   }
 
   bool operator!=(const KernelName& kernel_name) const {
-    return hash_value != kernel_name.hash_value;
+    return hash_value_ != kernel_name.hash_value();
   }
+
+ private:
+  // The members cannot be modified except by constructing,
+  // because the hash value need to be re calculated
+  // TODO(chenweihang): use string_view later?
+  std::string name_;
+  std::string overload_name_;
+  // Avoid calculating Hash value at runtime
+  size_t hash_value_;
 };
 
 class KernelKey {
@@ -151,21 +159,21 @@ struct TensorArgDef {
   DataLayout layout;
   DataType dtype;
 
-  TensorArgDef(Backend backend, DataLayout layout, DataType dtype)
-      : backend(backend), layout(layout), dtype(dtype) {}
+  TensorArgDef(Backend in_backend, DataLayout in_layout, DataType in_dtype)
+      : backend(in_backend), layout(in_layout), dtype(in_dtype) {}
 
-  TensorArgDef& SetBackend(Backend backend) {
-    backend = backend;
+  TensorArgDef& SetBackend(Backend in_backend) {
+    backend = in_backend;
     return *this;
   }
 
-  TensorArgDef& SetDataLayout(DataLayout layout) {
-    layout = layout;
+  TensorArgDef& SetDataLayout(DataLayout in_layout) {
+    layout = in_layout;
     return *this;
   }
 
-  TensorArgDef& SetDataType(DataType dtype) {
-    dtype = dtype;
+  TensorArgDef& SetDataType(DataType in_dtype) {
+    dtype = in_dtype;
     return *this;
   }
 };
@@ -279,10 +287,10 @@ class KernelFactory {
 
 inline std::ostream& operator<<(std::ostream& os,
                                 const KernelName& kernel_name) {
-  if (kernel_name.overload_name.empty()) {
-    os << kernel_name.name;
+  if (kernel_name.overload_name().empty()) {
+    os << kernel_name.name();
   } else {
-    os << kernel_name.name << "." << kernel_name.overload_name;
+    os << kernel_name.name() << "." << kernel_name.overload_name();
   }
   return os;
 }
diff --git a/paddle/tcmpt/core/kernel_registry.h b/paddle/tcmpt/core/kernel_registry.h
index d0f03ed5c5fe3..33475bb4728a3 100644
--- a/paddle/tcmpt/core/kernel_registry.h
+++ b/paddle/tcmpt/core/kernel_registry.h
@@ -44,8 +44,12 @@ struct KernelArgsParseFunctor<Return_ (*)(Args_...)> {
   static void Parse(const KernelKey& default_key, KernelArgsDef* args_def) {
     auto args_type = ParseArgType(Indices{});
     for (auto arg_type : args_type) {
-      if (arg_type == std::type_index(typeid(const DenseTensor&)) ||
-          arg_type == std::type_index(typeid(const SelectedRowsTensor&))) {
+      if (arg_type == std::type_index(typeid(const CPUContext&)) ||
+          arg_type == std::type_index(typeid(const CUDAContext&))) {
+        // do nothing, skip context arg now
+      } else if (arg_type == std::type_index(typeid(const DenseTensor&)) ||
+                 arg_type ==
+                     std::type_index(typeid(const SelectedRowsTensor&))) {
         args_def->AppendInput(
             default_key.backend(), default_key.layout(), default_key.dtype());
       } else if (arg_type == std::type_index(typeid(DenseTensor*)) ||
diff --git a/paddle/tcmpt/cpu/math.cc b/paddle/tcmpt/cpu/math.cc
index bf48ac420c80b..e393576ad692d 100644
--- a/paddle/tcmpt/cpu/math.cc
+++ b/paddle/tcmpt/cpu/math.cc
@@ -68,6 +68,8 @@ void ScaleSelectedRows(const CPUContext& dev_ctx,
       dev_ctx, x.value(), scale, bias, bias_after_scale, out->mutable_value());
 }
 
+// TODO(chenweihang): now the ScaleTensor's dtype are same as x, so we cannot
+// register its dtype def
 template <typename T>
 void ScaleHost(const CPUContext& dev_ctx,
                const DenseTensor& x,
@@ -75,8 +77,12 @@ void ScaleHost(const CPUContext& dev_ctx,
                float bias,
                bool bias_after_scale,
                DenseTensor* out) {
-  module::Scale<CPUContext, T>(
-      dev_ctx, x, *scale.data<float>(), bias, bias_after_scale, out);
+  module::Scale<CPUContext, T>(dev_ctx,
+                               x,
+                               static_cast<float>(*scale.data<T>()),
+                               bias,
+                               bias_after_scale,
+                               out);
 }
 
 template <typename T>
@@ -90,7 +96,7 @@ void ScaleSelectedRowsHost(const CPUContext& dev_ctx,
   out->set_height(x.height());
   Scale<T>(dev_ctx,
            x.value(),
-           *scale.data<float>(),
+           static_cast<float>(*scale.data<T>()),
            bias,
            bias_after_scale,
            out->mutable_value());
@@ -137,9 +143,7 @@ PT_REGISTER_KERNEL("scale.host",
                    int16_t,
                    int,
                    int64_t) {
-  kernel->InputAt(1)
-      .SetBackend(pt::Backend::kCPU)
-      .SetDataType(pt::DataType::kFLOAT32);
+  kernel->InputAt(1).SetBackend(pt::Backend::kCPU);
 }
 PT_REGISTER_KERNEL("scale.sr.host",
                    CPU,
@@ -153,7 +157,5 @@ PT_REGISTER_KERNEL("scale.sr.host",
                    int16_t,
                    int,
                    int64_t) {
-  kernel->InputAt(1)
-      .SetBackend(pt::Backend::kCPU)
-      .SetDataType(pt::DataType::kFLOAT32);
+  kernel->InputAt(1).SetBackend(pt::Backend::kCPU);
 }
diff --git a/paddle/tcmpt/cuda/math.cu b/paddle/tcmpt/cuda/math.cu
index b8f5777ce9a7e..c4d6663a063cc 100644
--- a/paddle/tcmpt/cuda/math.cu
+++ b/paddle/tcmpt/cuda/math.cu
@@ -114,8 +114,15 @@ void ScaleHost(const CUDAContext& dev_ctx,
                float bias,
                bool bias_after_scale,
                DenseTensor* out) {
-  module::Scale<CUDAContext, T>(
-      dev_ctx, x, *scale.data<float>(), bias, bias_after_scale, out);
+  if (paddle::platform::is_gpu_place(scale.place())) {
+    throw std::runtime_error("scale host place error.");
+  }
+  module::Scale<CUDAContext, T>(dev_ctx,
+                                x,
+                                static_cast<float>(*scale.data<T>()),
+                                bias,
+                                bias_after_scale,
+                                out);
 }
 
 template <typename T>
@@ -129,7 +136,7 @@ void ScaleSelectedRowsHost(const CUDAContext& dev_ctx,
   out->set_height(x.height());
   Scale<T>(dev_ctx,
            x.value(),
-           *scale.data<float>(),
+           static_cast<float>(*scale.data<T>()),
            bias,
            bias_after_scale,
            out->mutable_value());
@@ -176,9 +183,7 @@ PT_REGISTER_KERNEL("scale.host",
                    int16_t,
                    int,
                    int64_t) {
-  kernel->InputAt(1)
-      .SetBackend(pt::Backend::kCPU)
-      .SetDataType(pt::DataType::kFLOAT32);
+  kernel->InputAt(1).SetBackend(pt::Backend::kCPU);
 }
 PT_REGISTER_KERNEL("scale.sr.host",
                    CUDA,
@@ -192,7 +197,5 @@ PT_REGISTER_KERNEL("scale.sr.host",
                    int16_t,
                    int,
                    int64_t) {
-  kernel->InputAt(1)
-      .SetBackend(pt::Backend::kCPU)
-      .SetDataType(pt::DataType::kFLOAT32);
+  kernel->InputAt(1).SetBackend(pt::Backend::kCPU);
 }
diff --git a/python/paddle/fluid/tests/unittests/test_scale_op.py b/python/paddle/fluid/tests/unittests/test_scale_op.py
index c1ce032f50612..baedc2b095914 100644
--- a/python/paddle/fluid/tests/unittests/test_scale_op.py
+++ b/python/paddle/fluid/tests/unittests/test_scale_op.py
@@ -109,7 +109,9 @@ def check_with_place(self, place, in_name, out_name):
 
         assert (in_array * scale == result_array).all()
         assert in_height == out_height
-        assert in_rows == out_rows
+        # TODO(chenweihang): output rows and height cannot be shared into
+        # fluid output tensor
+        # assert in_rows == out_rows
 
     def test_scale_selected_rows(self):
         places = [core.CPUPlace()]

From c32fde99fd0358c4ca9c03496eede9ef746240ce Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Mon, 13 Sep 2021 13:49:59 +0000
Subject: [PATCH 051/125] add mean API and unittest

---
 paddle/tcmpt/api/CMakeLists.txt      |   2 +
 paddle/tcmpt/api/all.h               |   2 +
 paddle/tcmpt/api/include/math.h      |   2 +-
 paddle/tcmpt/api/include/tensor.h    |  34 ++++++--
 paddle/tcmpt/api/src/CMakeLists.txt  |   6 ++
 paddle/tcmpt/api/src/math.cc         |  54 +++++++++++-
 paddle/tcmpt/core/kernel_factory.h   |  28 ++++---
 paddle/tcmpt/core/kernel_generate.h  | 120 +++++++++++++++++++++++++++
 paddle/tcmpt/core/kernel_utils.h     |   2 +-
 paddle/tcmpt/core/tensor_interface.h |   6 +-
 paddle/tcmpt/infershape/unary.h      |  33 ++++++++
 paddle/tcmpt/tests/CMakeLists.txt    |   1 +
 paddle/tcmpt/tests/test_mean_api.cc  |  58 +++++++++++++
 13 files changed, 325 insertions(+), 23 deletions(-)
 create mode 100644 paddle/tcmpt/core/kernel_generate.h
 create mode 100644 paddle/tcmpt/infershape/unary.h
 create mode 100644 paddle/tcmpt/tests/test_mean_api.cc

diff --git a/paddle/tcmpt/api/CMakeLists.txt b/paddle/tcmpt/api/CMakeLists.txt
index 26aed55eee21c..5826810fd32ff 100644
--- a/paddle/tcmpt/api/CMakeLists.txt
+++ b/paddle/tcmpt/api/CMakeLists.txt
@@ -6,4 +6,6 @@ if(WITH_GPU OR WITH_ROCM)
   set(TCMPT_DEPS ${TCMPT_DEPS} math_cuda)
 endif()
 
+set(TCMPT_DEPS ${TCMPT_DEPS} math_api)
+
 cc_library(tcmpt SRCS all.cc DEPS ${TCMPT_DEPS})
diff --git a/paddle/tcmpt/api/all.h b/paddle/tcmpt/api/all.h
index db944cb13b6a7..60bd3c342b75d 100644
--- a/paddle/tcmpt/api/all.h
+++ b/paddle/tcmpt/api/all.h
@@ -19,3 +19,5 @@ limitations under the License. */
 #include "paddle/tcmpt/api/include/dev/math.h"
 
 // user apis
+#include "paddle/tcmpt/api/include/math.h"
+#include "paddle/tcmpt/api/include/tensor.h"
diff --git a/paddle/tcmpt/api/include/math.h b/paddle/tcmpt/api/include/math.h
index cf7a769f67493..27e3f1a1d3cff 100644
--- a/paddle/tcmpt/api/include/math.h
+++ b/paddle/tcmpt/api/include/math.h
@@ -18,6 +18,6 @@ limitations under the License. */
 
 namespace pt {
 
-Tensor sign(const Tensor& x);
+Tensor mean(const Tensor& x);
 
 }  // namespace pt
diff --git a/paddle/tcmpt/api/include/tensor.h b/paddle/tcmpt/api/include/tensor.h
index e867d1ae507ae..6be7f6309bd2e 100644
--- a/paddle/tcmpt/api/include/tensor.h
+++ b/paddle/tcmpt/api/include/tensor.h
@@ -45,10 +45,10 @@ namespace pt {
 
 class Tensor;
 
-class AbstractAutogradMeta {
+class AutogradMetaInterface {
  public:
-  // No AbstractAutogradMeta should be created
-  virtual ~AbstractAutogradMeta() {}
+  // No AutogradMetaInterface should be created
+  virtual ~AutogradMetaInterface() {}
 };
 
 /**
@@ -166,6 +166,13 @@ class Tensor final {
    */
   std::shared_ptr<TensorInterface> impl() const { return impl_; }
 
+  /**
+   * @description: Set the implemention of current Tensor.
+   * @param {std::shared_ptr<TensorInterface>}
+   * @return None
+   */
+  void set_impl(const std::shared_ptr<TensorInterface>& impl) { impl_ = impl; }
+
   // Whether API Tensor need `data` and `mutable_data`?
 
   // TODO(chenweihang): slice and split methods use kernels?
@@ -195,18 +202,33 @@ class Tensor final {
   /* Part 6: Operator overloading */
   Tensor& operator=(const Tensor& x) & {
     impl_ = x.impl_;
+    autograd_meta_ = x.autograd_meta_;
     return *this;
   }
   Tensor& operator=(Tensor&& x) & {
     impl_ = std::move(x.impl_);
+    autograd_meta_ = std::move(x.autograd_meta_);
     return *this;
   }
   // TODO(chenweihang): impl later
-  Tensor& operator=(const Tensor&) &&;
-  Tensor& operator=(Tensor&&) &&;
+  // Tensor& operator=(const Tensor&) &&;
+  // Tensor& operator=(Tensor&&) &&;
 
   /* Part 7: Autograd methods */
   // TODO(yangjiabin): Design autograd methods
+  void SetAutoGradMeta(
+      const std::shared_ptr<AutogradMetaInterface>& auto_grad_meta) {
+    // Copy this shared_ptr
+    autograd_meta_ = auto_grad_meta;
+  }
+
+  AutogradMetaInterface* get_autograd_meta() const {
+    return autograd_meta_.get();
+  }
+
+  void set_autograd_meta(std::shared_ptr<AutogradMetaInterface> autograd_meta) {
+    autograd_meta_ = std::move(autograd_meta);
+  }
 
   /* Part 8: Auto generated Tensor methods */
   // ...
@@ -243,7 +265,7 @@ class Tensor final {
    *    information, not Tensor data description-related information.
    * 2. Kernel calculation does not require AutogradMeta.
    */
-  std::unique_ptr<AutogradMetaInterface> autograd_meta_ = nullptr;
+  std::shared_ptr<AutogradMetaInterface> autograd_meta_ = nullptr;
 };
 
 }  // namespace pt
diff --git a/paddle/tcmpt/api/src/CMakeLists.txt b/paddle/tcmpt/api/src/CMakeLists.txt
index e69de29bb2d1d..9cada664d7044 100644
--- a/paddle/tcmpt/api/src/CMakeLists.txt
+++ b/paddle/tcmpt/api/src/CMakeLists.txt
@@ -0,0 +1,6 @@
+set(API_DEPS convert_utils dense_tensor selected_rows_tensor kernel_factory kernel_context)
+set(API_DEPS ${API_DEPS} math_cpu)
+if(WITH_GPU OR WITH_ROCM)
+  set(API_DEPS ${API_DEPS} math_cuda)
+endif()
+cc_library(math_api SRCS math.cc DEPS ${API_DEPS})
diff --git a/paddle/tcmpt/api/src/math.cc b/paddle/tcmpt/api/src/math.cc
index 78bf8394ae96e..6e16a84a54f20 100644
--- a/paddle/tcmpt/api/src/math.cc
+++ b/paddle/tcmpt/api/src/math.cc
@@ -14,4 +14,56 @@ limitations under the License. */
 
 #include "paddle/tcmpt/api/include/math.h"
 
-namespace pt {}  // namespace pt
+#include <memory>
+
+#include "glog/logging.h"
+
+#include "paddle/tcmpt/core/convert_utils.h"
+#include "paddle/tcmpt/core/dense_tensor.h"
+#include "paddle/tcmpt/core/kernel_context.h"
+#include "paddle/tcmpt/core/kernel_generate.h"
+#include "paddle/tcmpt/infershape/unary.h"
+
+namespace pt {
+
+Tensor mean(const Tensor& x) {
+  // 1. Get kernel signature and kernel
+  auto kernel_signature = ParseKernelNameAndKeyByArgs("mean", x);
+  VLOG(1) << kernel_signature.first;
+  VLOG(1) << kernel_signature.second;
+  VLOG(1) << KernelFactory::Instance();
+
+  auto kernel = KernelFactory::Instance().SelectKernelOrThrowError(
+      kernel_signature.first, kernel_signature.second);
+  VLOG(1) << kernel;
+
+  // 2. Get Device Context
+  auto* dev_ctx = GetDeviceContextByBackend(kernel_signature.second.backend());
+  auto kernel_context = KernelContext(*dev_ctx);
+
+  // 3. Auto data transform
+  auto dense_x = std::dynamic_pointer_cast<DenseTensor>(x.impl());
+  kernel_context.EmplaceBackInput(dense_x);
+  // TODO(chenweihang): add transform impl
+
+  // 4. InferShape
+  // TODO(chenweihang): how to auto selected infershape?
+  auto out_dims = UnchangedInferShape(dense_x->dims());
+
+  // 5. Prepare outputs
+  pt::Tensor out;
+  // TODO(chenweihang): deal with multiple outputs
+  auto out_def = kernel.args_def().output_defs()[0];
+  auto dense_out = std::make_shared<DenseTensor>(
+      TensorMeta(out_dims, out_def.backend, out_def.dtype, out_def.layout),
+      TensorStatus());
+  kernel_context.EmplaceBackOutput(dense_out);
+  out.set_impl(dense_out);
+
+  // 6. Call kernel
+  kernel(&kernel_context);
+
+  return out;
+}
+
+}  // namespace pt
diff --git a/paddle/tcmpt/core/kernel_factory.h b/paddle/tcmpt/core/kernel_factory.h
index af1afdf0610d7..180f0ce2c6b87 100644
--- a/paddle/tcmpt/core/kernel_factory.h
+++ b/paddle/tcmpt/core/kernel_factory.h
@@ -52,18 +52,13 @@ class KernelName final {
                   (std::hash<std::string>()(overload_name_) << 1);
   }
 
+  KernelName(const std::string& kernel_name) {
+    ParseNameAndOverloadNameFromString(kernel_name);
+  }
+
   KernelName(const char* kernel_name) {
     std::string kernel_name_str(kernel_name);
-    size_t pos = kernel_name_str.find_first_of('.');
-    if (pos == std::string::npos) {
-      name_ = kernel_name_str;
-      overload_name_ = "";
-    } else {
-      name_ = kernel_name_str.substr(0, pos);
-      overload_name_ = kernel_name_str.substr(pos + 1, kernel_name_str.size());
-    }
-    hash_value_ = std::hash<std::string>()(name_) ^
-                  (std::hash<std::string>()(overload_name_) << 1);
+    ParseNameAndOverloadNameFromString(kernel_name_str);
   }
 
   const std::string& name() const { return name_; }
@@ -89,6 +84,19 @@ class KernelName final {
   }
 
  private:
+  void ParseNameAndOverloadNameFromString(const std::string& kernel_name) {
+    size_t pos = kernel_name.find_first_of('.');
+    if (pos == std::string::npos) {
+      name_ = kernel_name;
+      overload_name_ = "";
+    } else {
+      name_ = kernel_name.substr(0, pos);
+      overload_name_ = kernel_name.substr(pos + 1, kernel_name.size());
+    }
+    hash_value_ = std::hash<std::string>()(name_) ^
+                  (std::hash<std::string>()(overload_name_) << 1);
+  }
+
   // The members cannot be modified except by constructing,
   // because the hash value need to be re calculated
   // TODO(chenweihang): use string_view later?
diff --git a/paddle/tcmpt/core/kernel_generate.h b/paddle/tcmpt/core/kernel_generate.h
new file mode 100644
index 0000000000000..a507851934406
--- /dev/null
+++ b/paddle/tcmpt/core/kernel_generate.h
@@ -0,0 +1,120 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <utility>
+
+// TODO(chenweihang): split KernelName, Key, Kernel, Factory into diff files
+#include "paddle/tcmpt/core/kernel_factory.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/device_context.h"
+
+namespace pt {
+
+// TODO(shixiaowei): replaced by new DeviceContext later
+using CPUContext = paddle::platform::CPUDeviceContext;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+using CUDAContext = paddle::platform::CUDADeviceContext;
+#endif
+
+namespace detail {
+
+template <typename Functor>
+struct ArgsIterator {
+  template <typename... Args>
+  inline Functor& apply() {
+    return self();
+  }
+
+  template <typename T, typename... Args>
+  inline Functor& apply(T&& arg, Args&&... args) {
+    self()(std::forward<T>(arg));
+    if (self().short_circurt()) {
+      return self();
+    } else {
+      return apply(std::forward<Args>(args)...);
+    }
+  }
+
+  constexpr bool short_circuit() const { return false; }
+
+ private:
+  inline Functor& self() { return *static_cast<Functor*>(this); }
+};
+
+struct KernelNameAndKeyParser : ArgsIterator<KernelNameAndKeyParser> {
+  std::string kernel_name;
+  Backend backend;
+  DataLayout layout;
+  DataType dtype;
+
+  explicit KernelNameAndKeyParser(const std::string& name)
+      : kernel_name(name) {}
+
+  // TODO(chenweihang): use bit set here
+  // TODO(chenweihang): deal with multiple diff input Tensors
+  void operator()(const Tensor& x) {
+    if (x.is_cpu()) {
+      backend = Backend::kCPU;
+    } else if (x.is_cuda()) {
+      backend = Backend::kCUDA;
+    } else {
+      throw std::runtime_error("Unsupported backend when parser args.");
+    }
+  }
+
+  // skip other type args
+  template <typename T>
+  void operator()(const T& x) {
+    // do nothing
+  }
+};
+
+}  // namespace detail
+
+// TODO(chenweihang): Determine the Kernel name and key according to the
+// function name and the input Tensor parameters. For example, if the input
+// x holds SelectedRows, then the Kernel name should be added with the `sr`
+// suffix on the basis of the function name, or the input contains HostTensor,
+// and the `host` suffix should be added on the basis of the function name.
+template <typename... Args>
+std::pair<KernelName, KernelKey> ParseKernelNameAndKeyByArgs(
+    const std::string& fn_name, const Args&... args) {
+  auto parser = detail::KernelNameAndKeyParser(fn_name);
+  parser(args...);
+  // TODO(chenweihang): polish design here
+  KernelName kernel_name(parser.kernel_name);
+  KernelKey kernel_key(parser.backend, parser.layout, parser.dtype);
+  return std::make_pair(kernel_name, kernel_key);
+}
+
+paddle::platform::DeviceContext* GetDeviceContextByBackend(Backend backend) {
+  auto& pool = paddle::platform::DeviceContextPool::Instance();
+  auto place = TransToFluidPlace(backend);
+  // switch (backend) {
+  //   case Backend::kCPU:
+  //     return pool.GetByPlace(paddle::platform::CPUPlace());
+  //   case Backend::kCUDA:
+  //     return pool.GetByPlace(paddle::platform::CUDAPlace());
+  //   default:
+  //     throw std::runtime_error(
+  //       "Unsupported backend when getting device context.");
+  // }
+  return pool.Get(place);
+}
+
+}  // namespace pt
diff --git a/paddle/tcmpt/core/kernel_utils.h b/paddle/tcmpt/core/kernel_utils.h
index 98dd0b0472331..ed863cbde14a6 100644
--- a/paddle/tcmpt/core/kernel_utils.h
+++ b/paddle/tcmpt/core/kernel_utils.h
@@ -25,7 +25,7 @@
 
 namespace pt {
 
-// TODO(chenweihang): replaced by new DeviceContext later
+// TODO(shixiaowei): replaced by new DeviceContext later
 using CPUContext = paddle::platform::CPUDeviceContext;
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 using CUDAContext = paddle::platform::CUDADeviceContext;
diff --git a/paddle/tcmpt/core/tensor_interface.h b/paddle/tcmpt/core/tensor_interface.h
index 101c39e36cd41..6991c0d7f7f71 100644
--- a/paddle/tcmpt/core/tensor_interface.h
+++ b/paddle/tcmpt/core/tensor_interface.h
@@ -29,12 +29,10 @@ class Place;
 
 namespace pt {
 
-// TODO(chenweihang): Use the existing DDim directly?
-// or design a abstract interface of DDim?
+// TODO(shixiaowei): replace by new DDim
 using DDim = paddle::framework::DDim;
 
-// TODO(chenweihang): Use the existing Place directly?
-// or design a abstract interface of Place?
+// TODO(shixiaowei): replace by new Place?
 using Place = paddle::platform::Place;
 
 /**
diff --git a/paddle/tcmpt/infershape/unary.h b/paddle/tcmpt/infershape/unary.h
new file mode 100644
index 0000000000000..35eb675ba11e4
--- /dev/null
+++ b/paddle/tcmpt/infershape/unary.h
@@ -0,0 +1,33 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/framework/ddim.h"
+
+namespace pt {
+
+using DDim = paddle::framework::DDim;
+
+// Common InferShape Functions, The format like:
+//
+//   1. DDim [OpName]InferShape(const DDim& x_dim, ...) {}
+//   2. std::pair<DDim, DDim> [OpName]InferShape(const DDim& x_dim, ...) {}
+//   3. std::tuple<DDim, DDim, DDim> [OpName]InferShape(const DDim& x_dim, ...)
+//   {}
+
+DDim UnchangedInferShape(const DDim& x_dim) { return x_dim; }
+
+}  // namespace pt
diff --git a/paddle/tcmpt/tests/CMakeLists.txt b/paddle/tcmpt/tests/CMakeLists.txt
index 87e05028db53f..a6b4a45cf1f9f 100644
--- a/paddle/tcmpt/tests/CMakeLists.txt
+++ b/paddle/tcmpt/tests/CMakeLists.txt
@@ -1,2 +1,3 @@
 cc_test(dense_tensor_test SRCS dense_tensor_test.cc DEPS dense_tensor)
 cc_test(kernel_factory_test SRCS kernel_factory_test.cc DEPS kernel_factory)
+cc_test(test_mean_api SRCS test_mean_api.cc DEPS math_api)
diff --git a/paddle/tcmpt/tests/test_mean_api.cc b/paddle/tcmpt/tests/test_mean_api.cc
new file mode 100644
index 0000000000000..f6c8718620206
--- /dev/null
+++ b/paddle/tcmpt/tests/test_mean_api.cc
@@ -0,0 +1,58 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <memory>
+
+#include "paddle/tcmpt/api/include/math.h"
+#include "paddle/tcmpt/core/dense_tensor.h"
+
+namespace framework = paddle::framework;
+using DDim = paddle::framework::DDim;
+
+TEST(API, mean) {
+  // 1. create tensor
+  auto dense_x = std::make_shared<pt::DenseTensor>(
+      pt::TensorMeta(framework::make_ddim({3, 4}),
+                     pt::Backend::kCPU,
+                     pt::DataType::kFLOAT32,
+                     pt::DataLayout::kNCHW),
+      pt::TensorStatus());
+  auto* dense_x_data = dense_x->mutable_data<float>();
+
+  float sum = 0.0;
+  for (size_t i = 0; i < 12; ++i) {
+    dense_x_data[i] = i * 1.0;
+    sum += i * 1.0;
+  }
+
+  pt::Tensor x(dense_x);
+
+  // 2. test API
+  auto out = pt::mean(x);
+
+  // 3. check result
+  ASSERT_EQ(out.shape().size(), 1);
+  ASSERT_EQ(out.shape()[0], 1);
+  ASSERT_EQ(out.numel(), 1);
+  ASSERT_EQ(out.is_cpu(), true);
+  ASSERT_EQ(out.type(), pt::DataType::kFLOAT32);
+  ASSERT_EQ(out.layout(), pt::DataLayout::kNCHW);
+  ASSERT_EQ(out.initialized(), true);
+
+  auto expect_result = sum / 12;
+  auto dense_out = std::dynamic_pointer_cast<pt::DenseTensor>(out.impl());
+  auto actual_result = dense_out->data<float>()[0];
+  ASSERT_NEAR(expect_result, actual_result, 1e-6f);
+}

From a4e53efa526b5c4ba9722360eb33342f3a4f1511 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Fri, 17 Sep 2021 12:29:05 +0000
Subject: [PATCH 052/125] test mean api success

---
 paddle/tcmpt/api/include/tensor.h   | 12 +----------
 paddle/tcmpt/api/src/math.cc        |  2 +-
 paddle/tcmpt/core/kernel_generate.h |  2 ++
 paddle/tcmpt/core/kernel_registry.h | 31 +++++++++++++++++++++++++++++
 paddle/tcmpt/infershape/unary.h     |  2 ++
 paddle/tcmpt/tests/test_mean_api.cc |  6 ++++++
 6 files changed, 43 insertions(+), 12 deletions(-)

diff --git a/paddle/tcmpt/api/include/tensor.h b/paddle/tcmpt/api/include/tensor.h
index 6be7f6309bd2e..1c503c842ad30 100644
--- a/paddle/tcmpt/api/include/tensor.h
+++ b/paddle/tcmpt/api/include/tensor.h
@@ -173,7 +173,7 @@ class Tensor final {
    */
   void set_impl(const std::shared_ptr<TensorInterface>& impl) { impl_ = impl; }
 
-  // Whether API Tensor need `data` and `mutable_data`?
+  // TODO(chenweihang): Whether API Tensor need `data` and `mutable_data`?
 
   // TODO(chenweihang): slice and split methods use kernels?
 
@@ -210,18 +210,8 @@ class Tensor final {
     autograd_meta_ = std::move(x.autograd_meta_);
     return *this;
   }
-  // TODO(chenweihang): impl later
-  // Tensor& operator=(const Tensor&) &&;
-  // Tensor& operator=(Tensor&&) &&;
 
   /* Part 7: Autograd methods */
-  // TODO(yangjiabin): Design autograd methods
-  void SetAutoGradMeta(
-      const std::shared_ptr<AutogradMetaInterface>& auto_grad_meta) {
-    // Copy this shared_ptr
-    autograd_meta_ = auto_grad_meta;
-  }
-
   AutogradMetaInterface* get_autograd_meta() const {
     return autograd_meta_.get();
   }
diff --git a/paddle/tcmpt/api/src/math.cc b/paddle/tcmpt/api/src/math.cc
index 6e16a84a54f20..65abdc95ed4ba 100644
--- a/paddle/tcmpt/api/src/math.cc
+++ b/paddle/tcmpt/api/src/math.cc
@@ -48,7 +48,7 @@ Tensor mean(const Tensor& x) {
 
   // 4. InferShape
   // TODO(chenweihang): how to auto selected infershape?
-  auto out_dims = UnchangedInferShape(dense_x->dims());
+  auto out_dims = MeanInferShape(dense_x->dims());
 
   // 5. Prepare outputs
   pt::Tensor out;
diff --git a/paddle/tcmpt/core/kernel_generate.h b/paddle/tcmpt/core/kernel_generate.h
index a507851934406..6cc8f411924d2 100644
--- a/paddle/tcmpt/core/kernel_generate.h
+++ b/paddle/tcmpt/core/kernel_generate.h
@@ -75,6 +75,8 @@ struct KernelNameAndKeyParser : ArgsIterator<KernelNameAndKeyParser> {
     } else {
       throw std::runtime_error("Unsupported backend when parser args.");
     }
+    layout = x.layout();
+    dtype = x.type();
   }
 
   // skip other type args
diff --git a/paddle/tcmpt/core/kernel_registry.h b/paddle/tcmpt/core/kernel_registry.h
index 33475bb4728a3..02eda90da74c4 100644
--- a/paddle/tcmpt/core/kernel_registry.h
+++ b/paddle/tcmpt/core/kernel_registry.h
@@ -454,4 +454,35 @@ struct KernelRegistrar {
       PT_KERNEL(kernel_fn));                                               \
   void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id)(::pt::Kernel*)
 
+// only used in cpp tests
+
+#define PT_REGISTER_KERNEL_FOR_TEST(                              \
+    kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \
+  _PT_REGISTER_KERNEL_FOR_TEST(kernel_name,                       \
+                               PT_ID,                             \
+                               backend,                           \
+                               layout,                            \
+                               meta_kernel_fn,                    \
+                               cpp_dtype,                         \
+                               __VA_ARGS__)
+
+#define _PT_REGISTER_KERNEL_FOR_TEST(                                      \
+    kernel_name, func_id, backend, layout, meta_kernel_fn, cpp_dtype, ...) \
+  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                       \
+      PT_CONCATENATE(pt_op_kernel_for_test_ns_check_, func_id),            \
+      "PT_REGISTER_KERNEL must be called in global namespace.");           \
+  static void PT_CONCATENATE(__PT_KERNEL_for_test_args_def_FN_,            \
+                             func_id)(::pt::Kernel*);                      \
+  PT_KERNEL_REGISTRAR_INIT(                                                \
+      kernel_name,                                                         \
+      func_id,                                                             \
+      backend,                                                             \
+      layout,                                                              \
+      &PT_CONCATENATE(__PT_KERNEL_for_test_args_def_FN_, func_id),         \
+      meta_kernel_fn,                                                      \
+      cpp_dtype,                                                           \
+      __VA_ARGS__);                                                        \
+  void PT_CONCATENATE(__PT_KERNEL_for_test_args_def_FN_,                   \
+                      func_id)(::pt::Kernel * kernel)
+
 }  // namespace pt
diff --git a/paddle/tcmpt/infershape/unary.h b/paddle/tcmpt/infershape/unary.h
index 35eb675ba11e4..c576410699d94 100644
--- a/paddle/tcmpt/infershape/unary.h
+++ b/paddle/tcmpt/infershape/unary.h
@@ -30,4 +30,6 @@ using DDim = paddle::framework::DDim;
 
 DDim UnchangedInferShape(const DDim& x_dim) { return x_dim; }
 
+DDim MeanInferShape(const DDim& x_dim) { return {1}; }
+
 }  // namespace pt
diff --git a/paddle/tcmpt/tests/test_mean_api.cc b/paddle/tcmpt/tests/test_mean_api.cc
index f6c8718620206..7483ab837334c 100644
--- a/paddle/tcmpt/tests/test_mean_api.cc
+++ b/paddle/tcmpt/tests/test_mean_api.cc
@@ -18,6 +18,8 @@ limitations under the License. */
 #include "paddle/tcmpt/api/include/math.h"
 #include "paddle/tcmpt/core/dense_tensor.h"
 
+#include "paddle/tcmpt/cpu/math.h"
+
 namespace framework = paddle::framework;
 using DDim = paddle::framework::DDim;
 
@@ -56,3 +58,7 @@ TEST(API, mean) {
   auto actual_result = dense_out->data<float>()[0];
   ASSERT_NEAR(expect_result, actual_result, 1e-6f);
 }
+
+// TODO(chenweihang): register kernel in test, all kernels in cpu/math.h are
+// registered
+PT_REGISTER_KERNEL_FOR_TEST("mean", CPU, NCHW, pt::Mean, float, double) {}

From 1d9f33f17111448771f1af0fdd2bc3d65dc7a26a Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Sat, 18 Sep 2021 03:27:22 +0000
Subject: [PATCH 053/125] add branch to solve compiled error

---
 paddle/tcmpt/core/kernel_registry.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/paddle/tcmpt/core/kernel_registry.h b/paddle/tcmpt/core/kernel_registry.h
index 02eda90da74c4..1cfe074480d23 100644
--- a/paddle/tcmpt/core/kernel_registry.h
+++ b/paddle/tcmpt/core/kernel_registry.h
@@ -44,8 +44,13 @@ struct KernelArgsParseFunctor<Return_ (*)(Args_...)> {
   static void Parse(const KernelKey& default_key, KernelArgsDef* args_def) {
     auto args_type = ParseArgType(Indices{});
     for (auto arg_type : args_type) {
-      if (arg_type == std::type_index(typeid(const CPUContext&)) ||
+      if (arg_type == std::type_index(typeid(const CPUContext&))
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+          ||
           arg_type == std::type_index(typeid(const CUDAContext&))) {
+#else
+              ) {
+#endif
         // do nothing, skip context arg now
       } else if (arg_type == std::type_index(typeid(const DenseTensor&)) ||
                  arg_type ==

From b0cf02c8bb1c3134811dae6125943f657dbf0b98 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Sat, 18 Sep 2021 04:06:38 +0000
Subject: [PATCH 054/125] skip clang format error

---
 paddle/tcmpt/core/kernel_registry.h | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/paddle/tcmpt/core/kernel_registry.h b/paddle/tcmpt/core/kernel_registry.h
index 1cfe074480d23..2066de3e6dadc 100644
--- a/paddle/tcmpt/core/kernel_registry.h
+++ b/paddle/tcmpt/core/kernel_registry.h
@@ -203,9 +203,10 @@ struct KernelRegistrar {
                             cpp_dtype,                        \
                             __VA_ARGS__)
 
-// The =pre-commit always treats this macro into the wrong format,
-// and multi-line macros cannot be skipped with NOLINT.
-// If there are only errors here, you can use -n to skip check
+// clang-format off
+
+/* The =pre-commit always treats this macro into the wrong format,
+  and multi-line macros cannot be skipped with NOLINT.*/
 #define _PT_KERNEL_REGISTRAR_INIT(N,              \
                                   kernel_name,    \
                                   func_id,        \
@@ -215,16 +216,18 @@ struct KernelRegistrar {
                                   meta_kernel_fn, \
                                   cpp_dtype,      \
                                   ...)            \
-  PT_CONCATENATE(_PT_KERNEL_REGISTRAR_INIT_, N)   \
-  (kernel_name,                                   \
-   func_id,                                       \
-   PT_ID,                                         \
-   backend,                                       \
-   layout,                                        \
-   args_def_fn,                                   \
-   meta_kernel_fn,                                \
-   cpp_dtype,                                     \
-   __VA_ARGS__)
+  PT_CONCATENATE(_PT_KERNEL_REGISTRAR_INIT_, N) ( \
+    kernel_name,                                  \
+    func_id,                                      \
+    PT_ID,                                        \
+    backend,                                      \
+    layout,                                       \
+    args_def_fn,                                  \
+    meta_kernel_fn,                               \
+    cpp_dtype,                                    \
+    __VA_ARGS__)
+
+// clang-format on
 
 #define _PT_KERNEL_REGISTRAR_INIT_1(kernel_name,      \
                                     func_id,          \

From 95a612efd426bd5ede36d6e5b386f5167d7a3f46 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Sat, 18 Sep 2021 06:24:08 +0000
Subject: [PATCH 055/125] add mean skip rule in op_library

---
 cmake/operators.cmake             | 12 ++++++++++++
 paddle/fluid/operators/mean_op.cc |  4 +---
 paddle/fluid/operators/mean_op.cu |  4 ----
 paddle/fluid/operators/mean_op.h  | 21 ---------------------
 4 files changed, 13 insertions(+), 28 deletions(-)

diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 7730550e061f1..e8f99cc2c81fd 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -340,6 +340,18 @@ function(op_library TARGET)
       endif()
     endif()
 
+    # TODO(chenweihang): Because the Tensor compute library will migrate the forward Kernel,
+    # only the grad kernel is left, if the USE_OP still be declared in the original way,
+    # the symbol will can not be found, so special treatment is needed here, and it will
+    # need to be deleted after the complete migration of the kernel in the future.
+    foreach(forward_moved_op "mean")
+        if ("${TARGET}" STREQUAL "${forward_moved_op}")
+            file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n")
+            file(APPEND ${pybind_file} "USE_OP_KERNEL(${TARGET}_grad);\n")
+            set(pybind_flag 1)
+        endif()
+    endforeach()
+
     # pybind USE_OP
     if (${pybind_flag} EQUAL 0)
       # NOTE(*): activation use macro to regist the kernels, set use_op manually.
diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc
index 764529a15b6a2..2489cd18bb00f 100644
--- a/paddle/fluid/operators/mean_op.cc
+++ b/paddle/fluid/operators/mean_op.cc
@@ -94,9 +94,7 @@ REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanOpInferVarType,
                   ops::MeanGradMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(mean_grad, ops::MeanGradOp,
                   ops::MeanGradNoNeedBufferVarsInferer);
-REGISTER_OP_CPU_KERNEL(
-    mean, ops::MeanKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::MeanKernel<paddle::platform::CPUDeviceContext, double>);
+
 REGISTER_OP_CPU_KERNEL(
     mean_grad, ops::MeanGradKernel<paddle::platform::CPUDeviceContext, float>,
     ops::MeanGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu
index ffb667ba974b8..786d73ee9c811 100644
--- a/paddle/fluid/operators/mean_op.cu
+++ b/paddle/fluid/operators/mean_op.cu
@@ -62,10 +62,6 @@ class MeanCUDAGradKernel : public framework::OpKernel<T> {
 
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(
-    mean, ops::MeanKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::MeanKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::MeanKernel<paddle::platform::CUDADeviceContext, plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
     mean_grad,
     ops::MeanCUDAGradKernel<paddle::platform::CUDADeviceContext, float>,
diff --git a/paddle/fluid/operators/mean_op.h b/paddle/fluid/operators/mean_op.h
index 4f9c1505a6ee3..b9e09f31bc8c1 100644
--- a/paddle/fluid/operators/mean_op.h
+++ b/paddle/fluid/operators/mean_op.h
@@ -32,27 +32,6 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
-template <typename DeviceContext, typename T>
-class MeanKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* x = context.Input<Tensor>("X");
-    auto* out = context.Output<Tensor>("Out");
-    auto& dev_ctx = context.device_context<DeviceContext>();
-
-    auto pt_x =
-        framework::MakeTensorImpl<pt::DenseTensor>(*x, x->place(), x->type());
-    auto pt_out =
-        framework::MakeTensorImpl<pt::DenseTensor>(*out, x->place(), x->type());
-
-    // call new kernel
-    pt::Mean<T>(dev_ctx, *pt_x.get(), pt_out.get());
-
-    // share pt_out data to out
-    framework::ShareTensorImpl(pt_out.get(), out);
-  }
-};
-
 template <typename DeviceContext, typename T>
 class MeanGradKernel : public framework::OpKernel<T> {
  public:

From 83d6f7721331bd7ba082c1c775e661c585bd844f Mon Sep 17 00:00:00 2001
From: chentianyu03 <ctychentianyu@gmail.com>
Date: Sat, 18 Sep 2021 19:43:25 +0800
Subject: [PATCH 056/125] add dot kernel, api and unittest (#6)

---
 paddle/fluid/operators/dot_op.h     | 60 +++++++------------
 paddle/tcmpt/api/CMakeLists.txt     |  6 +-
 paddle/tcmpt/api/all.h              |  2 +
 paddle/tcmpt/api/include/dev/dot.h  | 19 ++++++
 paddle/tcmpt/api/include/dot.h      | 23 ++++++++
 paddle/tcmpt/api/src/CMakeLists.txt |  5 +-
 paddle/tcmpt/api/src/dot.cc         | 71 ++++++++++++++++++++++
 paddle/tcmpt/cpu/CMakeLists.txt     |  1 +
 paddle/tcmpt/cpu/dot.cc             | 61 +++++++++++++++++++
 paddle/tcmpt/cpu/dot.h              | 32 ++++++++++
 paddle/tcmpt/cuda/CMakeLists.txt    |  2 +
 paddle/tcmpt/cuda/dot.cu            | 71 ++++++++++++++++++++++
 paddle/tcmpt/cuda/dot.h             | 40 +++++++++++++
 paddle/tcmpt/infershape/unary.h     |  6 ++
 paddle/tcmpt/tests/CMakeLists.txt   |  1 +
 paddle/tcmpt/tests/test_dot_api.cc  | 91 +++++++++++++++++++++++++++++
 16 files changed, 448 insertions(+), 43 deletions(-)
 create mode 100644 paddle/tcmpt/api/include/dev/dot.h
 create mode 100644 paddle/tcmpt/api/include/dot.h
 create mode 100644 paddle/tcmpt/api/src/dot.cc
 create mode 100644 paddle/tcmpt/cpu/dot.cc
 create mode 100644 paddle/tcmpt/cpu/dot.h
 create mode 100644 paddle/tcmpt/cuda/dot.cu
 create mode 100644 paddle/tcmpt/cuda/dot.h
 create mode 100644 paddle/tcmpt/tests/test_dot_api.cc

diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h
index 09d607891b485..65e22354d6a79 100644
--- a/paddle/fluid/operators/dot_op.h
+++ b/paddle/fluid/operators/dot_op.h
@@ -16,9 +16,14 @@
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/tcmpt_utils.h"
 #include "paddle/fluid/operators/math/complex_functors.h"
 #include "paddle/fluid/platform/for_range.h"
 
+// only can include the headers in paddle/tcmpt/api dirs
+#include "paddle/tcmpt/api/include/dev/core.h"
+#include "paddle/tcmpt/api/include/dev/dot.h"
+
 namespace paddle {
 namespace operators {
 
@@ -232,44 +237,23 @@ template <typename DeviceContext, typename T>
 class DotKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* tensor_x = ctx.Input<Tensor>("X");
-    auto* tensor_y = ctx.Input<Tensor>("Y");
-    auto* tensor_out = ctx.Output<Tensor>("Out");
-    tensor_out->mutable_data<T>(ctx.GetPlace());
-
-#if defined(__NVCC__) || defined(__HIPCC__)
-    if (1 == tensor_out->dims().size()) {
-      auto out = framework::EigenScalar<T>::From(*tensor_out);
-      auto x = framework::EigenVector<T>::Flatten(*tensor_x);
-      auto y = framework::EigenVector<T>::Flatten(*tensor_y);
-
-      auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
-      out.device(dev) = (x * y).sum();
-    } else {
-      auto out = framework::EigenMatrix<T>::From(*tensor_out);
-      auto x = framework::EigenMatrix<T>::From(*tensor_x);
-      auto y = framework::EigenMatrix<T>::From(*tensor_y);
-
-      auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
-      out.device(dev) = (x * y).sum(Eigen::DSizes<int, 1>(1));
-    }
-#else
-    auto const *x = tensor_x->data<T>(), *x_ = &x[0];
-    auto const *y = tensor_y->data<T>(), *y_ = &y[0];
-    auto* z = tensor_out->data<T>();
-
-    // Loop over the total N elements of both operands while sum-reducing every
-    // B pairs along the way where B is the dimension of the least ordered axis
-    auto&& d = tensor_x->dims();
-    auto const N = tensor_x->numel();
-    auto const B = d[d.size() - 1];
-
-    for (int j = 0; j < N / B; j++) {
-      T ss = 0;
-      for (int i = 0; i < B; i++) ss += (*x_++) * (*y_++);
-      z[j] = ss;
-    }
-#endif
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* out = ctx.Output<Tensor>("Out");
+    auto& dev_ctx = ctx.device_context<DeviceContext>();
+
+    auto pt_x =
+        framework::MakeTensorImpl<pt::DenseTensor>(*x, x->place(), x->type());
+    auto pt_y =
+        framework::MakeTensorImpl<pt::DenseTensor>(*y, y->place(), y->type());
+    auto pt_out =
+        framework::MakeTensorImpl<pt::DenseTensor>(*out, x->place(), x->type());
+
+    // call new kernel
+    pt::Dot<T>(dev_ctx, *pt_x.get(), *pt_y.get(), pt_out.get());
+
+    // share pt_out data to out
+    framework::ShareTensorImpl(pt_out.get(), out);
   }
 };
 
diff --git a/paddle/tcmpt/api/CMakeLists.txt b/paddle/tcmpt/api/CMakeLists.txt
index 5826810fd32ff..f868a4bdad728 100644
--- a/paddle/tcmpt/api/CMakeLists.txt
+++ b/paddle/tcmpt/api/CMakeLists.txt
@@ -1,11 +1,11 @@
 add_subdirectory(src)
 
 set(TCMPT_DEPS convert_utils dense_tensor selected_rows_tensor kernel_factory kernel_context)
-set(TCMPT_DEPS ${TCMPT_DEPS} math_cpu)
+set(TCMPT_DEPS ${TCMPT_DEPS} math_cpu dot_cpu)
 if(WITH_GPU OR WITH_ROCM)
-  set(TCMPT_DEPS ${TCMPT_DEPS} math_cuda)
+  set(TCMPT_DEPS ${TCMPT_DEPS} math_cuda dot_cuda)
 endif()
 
-set(TCMPT_DEPS ${TCMPT_DEPS} math_api)
+set(TCMPT_DEPS ${TCMPT_DEPS} math_api dot_api)
 
 cc_library(tcmpt SRCS all.cc DEPS ${TCMPT_DEPS})
diff --git a/paddle/tcmpt/api/all.h b/paddle/tcmpt/api/all.h
index 60bd3c342b75d..25fb4ebd57505 100644
--- a/paddle/tcmpt/api/all.h
+++ b/paddle/tcmpt/api/all.h
@@ -16,8 +16,10 @@ limitations under the License. */
 
 // develop apis
 #include "paddle/tcmpt/api/include/dev/core.h"
+#include "paddle/tcmpt/api/include/dev/dot.h"
 #include "paddle/tcmpt/api/include/dev/math.h"
 
 // user apis
+#include "paddle/tcmpt/api/include/dot.h"
 #include "paddle/tcmpt/api/include/math.h"
 #include "paddle/tcmpt/api/include/tensor.h"
diff --git a/paddle/tcmpt/api/include/dev/dot.h b/paddle/tcmpt/api/include/dev/dot.h
new file mode 100644
index 0000000000000..1afaebcdd5dfb
--- /dev/null
+++ b/paddle/tcmpt/api/include/dev/dot.h
@@ -0,0 +1,19 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+// See Note: [ How do we organize the kernel directory ]
+#include "paddle/tcmpt/cpu/dot.h"
+#include "paddle/tcmpt/cuda/dot.h"
diff --git a/paddle/tcmpt/api/include/dot.h b/paddle/tcmpt/api/include/dot.h
new file mode 100644
index 0000000000000..0322aa91763a6
--- /dev/null
+++ b/paddle/tcmpt/api/include/dot.h
@@ -0,0 +1,23 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/tcmpt/api/include/tensor.h"
+
+namespace pt {
+
+Tensor dot(const Tensor& x, const Tensor& y);
+
+}  // namespace pt
diff --git a/paddle/tcmpt/api/src/CMakeLists.txt b/paddle/tcmpt/api/src/CMakeLists.txt
index 9cada664d7044..21c871f353a76 100644
--- a/paddle/tcmpt/api/src/CMakeLists.txt
+++ b/paddle/tcmpt/api/src/CMakeLists.txt
@@ -1,6 +1,7 @@
 set(API_DEPS convert_utils dense_tensor selected_rows_tensor kernel_factory kernel_context)
-set(API_DEPS ${API_DEPS} math_cpu)
+set(API_DEPS ${API_DEPS} math_cpu dot_cpu)
 if(WITH_GPU OR WITH_ROCM)
-  set(API_DEPS ${API_DEPS} math_cuda)
+  set(API_DEPS ${API_DEPS} math_cuda dot_cuda)
 endif()
 cc_library(math_api SRCS math.cc DEPS ${API_DEPS})
+cc_library(dot_api SRCS dot.cc DEPS ${API_DEPS})
diff --git a/paddle/tcmpt/api/src/dot.cc b/paddle/tcmpt/api/src/dot.cc
new file mode 100644
index 0000000000000..9e15e4c4288ad
--- /dev/null
+++ b/paddle/tcmpt/api/src/dot.cc
@@ -0,0 +1,71 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/tcmpt/api/include/dot.h"
+
+#include <memory>
+
+#include "glog/logging.h"
+
+#include "paddle/tcmpt/core/convert_utils.h"
+#include "paddle/tcmpt/core/dense_tensor.h"
+#include "paddle/tcmpt/core/kernel_context.h"
+#include "paddle/tcmpt/core/kernel_generate.h"
+#include "paddle/tcmpt/infershape/unary.h"
+
+namespace pt {
+
+Tensor dot(const Tensor& x, const Tensor& y) {
+  // 1. Get kernel signature and kernel
+  auto kernel_signature = ParseKernelNameAndKeyByArgs("dot", x);
+  VLOG(1) << kernel_signature.first;
+  VLOG(1) << kernel_signature.second;
+  VLOG(1) << KernelFactory::Instance();
+
+  auto kernel = KernelFactory::Instance().SelectKernelOrThrowError(
+      kernel_signature.first, kernel_signature.second);
+  VLOG(1) << kernel;
+
+  // 2. Get Device Context
+  auto* dev_ctx = GetDeviceContextByBackend(kernel_signature.second.backend());
+  auto kernel_context = KernelContext(*dev_ctx);
+
+  // 3. Auto data transform
+  auto dense_x = std::dynamic_pointer_cast<DenseTensor>(x.impl());
+  kernel_context.EmplaceBackInput(dense_x);
+  auto dense_y = std::dynamic_pointer_cast<DenseTensor>(y.impl());
+  kernel_context.EmplaceBackInput(dense_y);
+  // TODO(chenweihang): add transform impl
+
+  // 4. InferShape
+  // TODO(chenweihang): how to auto selected infershape?
+  auto out_dims = DotInferShape(dense_x->dims());
+
+  // 5. Prepare outputs
+  pt::Tensor out;
+  // TODO(chenweihang): deal with multiple outputs
+  auto out_def = kernel.args_def().output_defs()[0];
+  auto dense_out = std::make_shared<DenseTensor>(
+      TensorMeta(out_dims, out_def.backend, out_def.dtype, out_def.layout),
+      TensorStatus());
+  kernel_context.EmplaceBackOutput(dense_out);
+  out.set_impl(dense_out);
+
+  // 6. Call kernel
+  kernel(&kernel_context);
+
+  return out;
+}
+
+}  // namespace pt
diff --git a/paddle/tcmpt/cpu/CMakeLists.txt b/paddle/tcmpt/cpu/CMakeLists.txt
index 874ea85b4b97f..c0c4ef72fbb8a 100644
--- a/paddle/tcmpt/cpu/CMakeLists.txt
+++ b/paddle/tcmpt/cpu/CMakeLists.txt
@@ -1 +1,2 @@
 cc_library(math_cpu SRCS math.cc DEPS dense_tensor kernel_context kernel_factory eigen_function)
+cc_library(dot_cpu SRCS dot.cc DEPS dense_tensor kernel_context kernel_factory)
diff --git a/paddle/tcmpt/cpu/dot.cc b/paddle/tcmpt/cpu/dot.cc
new file mode 100644
index 0000000000000..f7525dde39e7a
--- /dev/null
+++ b/paddle/tcmpt/cpu/dot.cc
@@ -0,0 +1,61 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/tcmpt/cpu/dot.h"
+
+#include "paddle/tcmpt/eigen/scale.h"
+#include "paddle/tcmpt/eigen/sign.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/framework/eigen.h"
+
+namespace pt {
+
+template <typename T>
+void Dot(const CPUContext& dev_ctx,
+         const DenseTensor& x,
+         const DenseTensor& y,
+         DenseTensor* out) {
+  auto const *x_ptr = x.data<T>(), *x_ptr_ = &x_ptr[0];
+  auto const *y_ptr = y.data<T>(), *y_ptr_ = &y_ptr[0];
+  auto* z = out->mutable_data<T>();
+
+  // Loop over the total N elements of both operands while sum-reducing every
+  // B pairs along the way where B is the dimension of the least ordered axis
+  auto&& d = x.dims();
+  auto const N = x.numel();
+  auto const B = d[d.size() - 1];
+
+  for (int j = 0; j < N / B; j++) {
+    T ss = 0;
+    for (int i = 0; i < B; i++) ss += (*x_ptr_++) * (*y_ptr_++);
+    z[j] = ss;
+  }
+}
+
+}  // namespace pt
+
+using complex64 = ::paddle::platform::complex<float>;
+using complex128 = ::paddle::platform::complex<double>;
+
+PT_REGISTER_KERNEL("dot",
+                   CPU,
+                   NCHW,
+                   pt::Dot,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   complex64,
+                   complex128) {}
diff --git a/paddle/tcmpt/cpu/dot.h b/paddle/tcmpt/cpu/dot.h
new file mode 100644
index 0000000000000..f8f384496a0f1
--- /dev/null
+++ b/paddle/tcmpt/cpu/dot.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/tcmpt/core/dense_tensor.h"
+#include "paddle/tcmpt/core/kernel_registry.h"
+#include "paddle/tcmpt/core/selected_rows_tensor.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/device_context.h"
+
+namespace pt {
+
+template <typename T>
+void Dot(const CPUContext& dev_ctx,
+         const DenseTensor& x,
+         const DenseTensor& y,
+         DenseTensor* out);
+
+}  // namespace pt
diff --git a/paddle/tcmpt/cuda/CMakeLists.txt b/paddle/tcmpt/cuda/CMakeLists.txt
index e5899c8eb5ad5..b2e3ce09d81e8 100644
--- a/paddle/tcmpt/cuda/CMakeLists.txt
+++ b/paddle/tcmpt/cuda/CMakeLists.txt
@@ -1,5 +1,7 @@
 if(WITH_GPU)
   nv_library(math_cuda SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory)
+  nv_library(dot_cuda SRCS dot.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
 elseif(WITH_ROCM)
   hip_library(math_cuda SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory)
+  hip_library(dot_cuda SRCS dot.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
 endif()
diff --git a/paddle/tcmpt/cuda/dot.cu b/paddle/tcmpt/cuda/dot.cu
new file mode 100644
index 0000000000000..6f6eb81073e40
--- /dev/null
+++ b/paddle/tcmpt/cuda/dot.cu
@@ -0,0 +1,71 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/tcmpt/cuda/dot.h"
+
+#ifdef __NVCC__
+#include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#endif
+
+#include "paddle/fluid/platform/float16.h"
+#include "paddle/tcmpt/core/kernel_registry.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/operators/eigen/eigen_function.h"
+
+namespace pt {
+
+template <typename T>
+void Dot(const CUDAContext& dev_ctx,
+         const DenseTensor& x,
+         const DenseTensor& y,
+         DenseTensor* out) {
+  out->mutable_data();
+  if (1 == out->dims().size()) {
+    auto eigen_out = paddle::framework::EigenScalar<T>::From(*out);
+    auto eigen_x = paddle::framework::EigenVector<T>::Flatten(x);
+    auto eigen_y = paddle::framework::EigenVector<T>::Flatten(y);
+
+    auto& dev = *dev_ctx.eigen_device();
+    eigen_out.device(dev) = (eigen_x * eigen_y).sum();
+  } else {
+    auto eigen_out = paddle::framework::EigenMatrix<T>::From(*out);
+    auto eigen_x = paddle::framework::EigenMatrix<T>::From(x);
+    auto eigen_y = paddle::framework::EigenMatrix<T>::From(y);
+
+    auto& dev = *dev_ctx.eigen_device();
+    eigen_out.device(dev) = (eigen_x * eigen_y).sum(Eigen::DSizes<int, 1>(1));
+  }
+}
+
+}  // namespace pt
+
+using complex64 = ::paddle::platform::complex<float>;
+using complex128 = ::paddle::platform::complex<double>;
+
+PT_REGISTER_KERNEL("dot",
+                   CUDA,
+                   NCHW,
+                   pt::Dot,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   complex64,
+                   complex128) {}
diff --git a/paddle/tcmpt/cuda/dot.h b/paddle/tcmpt/cuda/dot.h
new file mode 100644
index 0000000000000..b7489b6701fe1
--- /dev/null
+++ b/paddle/tcmpt/cuda/dot.h
@@ -0,0 +1,40 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+// CUDA and HIP use same api
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+
+#include "paddle/tcmpt/core/dense_tensor.h"
+#include "paddle/tcmpt/core/selected_rows_tensor.h"
+
+#include "paddle/tcmpt/eigen/scale.h"
+#include "paddle/tcmpt/eigen/sign.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/device_context.h"
+namespace pt {
+
+using CUDAContext = paddle::platform::CUDADeviceContext;
+
+template <typename T>
+void Dot(const CUDAContext& dev_ctx,
+         const DenseTensor& x,
+         const DenseTensor& y,
+         DenseTensor* out);
+
+}  // namespace pt
+
+#endif
diff --git a/paddle/tcmpt/infershape/unary.h b/paddle/tcmpt/infershape/unary.h
index c576410699d94..64a735c060edc 100644
--- a/paddle/tcmpt/infershape/unary.h
+++ b/paddle/tcmpt/infershape/unary.h
@@ -32,4 +32,10 @@ DDim UnchangedInferShape(const DDim& x_dim) { return x_dim; }
 
 DDim MeanInferShape(const DDim& x_dim) { return {1}; }
 
+DDim DotInferShape(const DDim& x_dim) {
+  auto dims = paddle::framework::vectorize(x_dim);
+  dims[dims.size() - 1] = 1;
+  return paddle::framework::make_ddim(dims);
+}
+
 }  // namespace pt
diff --git a/paddle/tcmpt/tests/CMakeLists.txt b/paddle/tcmpt/tests/CMakeLists.txt
index a6b4a45cf1f9f..272f4769bf993 100644
--- a/paddle/tcmpt/tests/CMakeLists.txt
+++ b/paddle/tcmpt/tests/CMakeLists.txt
@@ -1,3 +1,4 @@
 cc_test(dense_tensor_test SRCS dense_tensor_test.cc DEPS dense_tensor)
 cc_test(kernel_factory_test SRCS kernel_factory_test.cc DEPS kernel_factory)
 cc_test(test_mean_api SRCS test_mean_api.cc DEPS math_api)
+cc_test(test_dot_api SRCS test_dot_api.cc DEPS dot_api)
diff --git a/paddle/tcmpt/tests/test_dot_api.cc b/paddle/tcmpt/tests/test_dot_api.cc
new file mode 100644
index 0000000000000..a7d0cd3d10155
--- /dev/null
+++ b/paddle/tcmpt/tests/test_dot_api.cc
@@ -0,0 +1,91 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <memory>
+
+#include "paddle/tcmpt/api/include/dot.h"
+#include "paddle/tcmpt/core/dense_tensor.h"
+
+#include "paddle/tcmpt/cpu/dot.h"
+
+namespace framework = paddle::framework;
+using DDim = paddle::framework::DDim;
+
+TEST(API, dot) {
+  // 1. create tensor
+  auto dense_x = std::make_shared<pt::DenseTensor>(
+      pt::TensorMeta(framework::make_ddim({3, 10}),
+                     pt::Backend::kCPU,
+                     pt::DataType::kFLOAT32,
+                     pt::DataLayout::kNCHW),
+      pt::TensorStatus());
+  auto* dense_x_data = dense_x->mutable_data<float>();
+
+  auto dense_y = std::make_shared<pt::DenseTensor>(
+      pt::TensorMeta(framework::make_ddim({3, 10}),
+                     pt::Backend::kCPU,
+                     pt::DataType::kFLOAT32,
+                     pt::DataLayout::kNCHW),
+      pt::TensorStatus());
+  auto* dense_y_data = dense_y->mutable_data<float>();
+
+  float sum[3] = {0.0, 0.0, 0.0};
+  for (size_t i = 0; i < 3; ++i) {
+    for (size_t j = 0; j < 10; ++j) {
+      dense_x_data[i * 10 + j] = (i * 10 + j) * 1.0;
+      dense_y_data[i * 10 + j] = (i * 10 + j) * 1.0;
+      sum[i] += (i * 10 + j) * (i * 10 + j) * 1.0;
+    }
+  }
+
+  pt::Tensor x(dense_x);
+  pt::Tensor y(dense_y);
+
+  // 2. test API
+  auto out = pt::dot(x, y);
+
+  // 3. check result
+  ASSERT_EQ(out.shape().size(), 2);
+  ASSERT_EQ(out.shape()[0], 3);
+  ASSERT_EQ(out.numel(), 3);
+  ASSERT_EQ(out.is_cpu(), true);
+  ASSERT_EQ(out.type(), pt::DataType::kFLOAT32);
+  ASSERT_EQ(out.layout(), pt::DataLayout::kNCHW);
+  ASSERT_EQ(out.initialized(), true);
+
+  auto expect_result = sum;
+  auto dense_out = std::dynamic_pointer_cast<pt::DenseTensor>(out.impl());
+  auto actual_result0 = dense_out->data<float>()[0];
+  auto actual_result1 = dense_out->data<float>()[1];
+  auto actual_result2 = dense_out->data<float>()[2];
+  ASSERT_NEAR(expect_result[0], actual_result0, 1e-6f);
+  ASSERT_NEAR(expect_result[1], actual_result1, 1e-6f);
+  ASSERT_NEAR(expect_result[2], actual_result2, 1e-6f);
+}
+
+// TODO(chenweihang): register kernel in test, all kernels in cpu/math.h are
+// registered
+using complex64 = ::paddle::platform::complex<float>;
+using complex128 = ::paddle::platform::complex<double>;
+PT_REGISTER_KERNEL_FOR_TEST("dot",
+                            CPU,
+                            NCHW,
+                            pt::Dot,
+                            float,
+                            double,
+                            int,
+                            int64_t,
+                            complex64,
+                            complex128) {}

From dad5e6143cd5dc8317532860cad6a0a8404697b4 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Sat, 18 Sep 2021 12:49:01 +0000
Subject: [PATCH 057/125] remove old kernel and add symbol link

---
 cmake/operators.cmake                         |  6 ++
 ...est_reference_count_pass_last_lived_ops.cc |  2 +-
 paddle/fluid/framework/operator.cc            |  4 +-
 paddle/fluid/framework/tcmpt_utils.cc         |  2 +
 paddle/fluid/operators/mean_op.h              |  5 --
 .../pscore/heter_listen_and_server_test.cc    |  2 +-
 .../operators/pscore/heter_server_test.cc     |  2 +-
 paddle/fluid/operators/scale_op.cc            | 27 +-----
 paddle/fluid/operators/scale_op.h             | 89 -------------------
 paddle/fluid/operators/scale_op_npu.cc        |  2 +-
 paddle/fluid/operators/scale_op_xpu.cc        |  2 +-
 paddle/fluid/platform/CMakeLists.txt          |  2 +-
 paddle/tcmpt/api/CMakeLists.txt               | 15 ++++
 paddle/tcmpt/api/all.h                        |  1 +
 paddle/tcmpt/api/include/dev/infershape.h     | 18 ++++
 paddle/tcmpt/api/include/dev/symbols.h        | 21 +++++
 paddle/tcmpt/api/src/math.cc                  |  8 +-
 paddle/tcmpt/core/kernel_registry.h           | 15 ++++
 paddle/tcmpt/cpu/math.cc                      |  3 +
 paddle/tcmpt/cuda/math.cu                     |  3 +
 20 files changed, 97 insertions(+), 132 deletions(-)
 delete mode 100644 paddle/fluid/operators/scale_op.h
 create mode 100644 paddle/tcmpt/api/include/dev/infershape.h
 create mode 100644 paddle/tcmpt/api/include/dev/symbols.h

diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index e8f99cc2c81fd..285db13361916 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -351,6 +351,12 @@ function(op_library TARGET)
             set(pybind_flag 1)
         endif()
     endforeach()
+    foreach(moved_op "scale")
+        if ("${TARGET}" STREQUAL "${forward_moved_op}")
+            file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n")
+            set(pybind_flag 1)
+        endif()
+    endforeach()
 
     # pybind USE_OP
     if (${pybind_flag} EQUAL 0)
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc
index f410171f99896..8cf541637557b 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc
@@ -21,7 +21,7 @@
 #include "paddle/fluid/framework/parallel_executor.h"
 #include "paddle/fluid/framework/program_desc.h"
 
-USE_OP(scale);
+USE_NO_KERNEL_OP(scale);
 USE_OP(elementwise_mul);
 USE_OP(elementwise_add);
 USE_OP(elementwise_add_grad);
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 7a91581d9fe3b..a7843256662b7 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1148,10 +1148,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
 
   // TODO(chenweihang): Now we are still reusing a lot of the original fluid
   // implementation, this is a gradual replacement process
-
   // TODO(chenweihang): in the first phase of project, we only support CPU, CUDA
   // and RCOM backend, the XPU, NPU and MKLDNN will be supported in the second
   // phase
+
+  VLOG(1) << "Pt KernelFactory: " << pt::KernelFactory::Instance();
   if (pt::KernelFactory::Instance().ContainsKernel(type_.c_str())) {
     if (pt_kernel_key_.get() == nullptr || pt_kernel_.get() == nullptr) {
       ChoosePtKernel(*runtime_ctx, *dev_ctx);
@@ -1325,7 +1326,6 @@ void OperatorWithKernel::ChoosePtKernel(
       kernel_name, *pt_kernel_key_)));
 
   // for debug
-  // VLOG(1) << pt::KernelFactory::Instance();
   VLOG(1) << "ChoosePtKernel - kernel name: " << kernel_name
           << " | kernel key: " << *pt_kernel_key_
           << " | kernel: " << *pt_kernel_;
diff --git a/paddle/fluid/framework/tcmpt_utils.cc b/paddle/fluid/framework/tcmpt_utils.cc
index c46b43bd75952..a28cf9a57a0e4 100644
--- a/paddle/fluid/framework/tcmpt_utils.cc
+++ b/paddle/fluid/framework/tcmpt_utils.cc
@@ -16,6 +16,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/selected_rows.h"
 
+#include "paddle/tcmpt/api/include/dev/symbols.h"
+
 namespace paddle {
 namespace framework {
 
diff --git a/paddle/fluid/operators/mean_op.h b/paddle/fluid/operators/mean_op.h
index b9e09f31bc8c1..9e752c7173d23 100644
--- a/paddle/fluid/operators/mean_op.h
+++ b/paddle/fluid/operators/mean_op.h
@@ -15,11 +15,6 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/tcmpt_utils.h"
-
-// only can include the headers in paddle/tcmpt/api dirs
-#include "paddle/tcmpt/api/include/dev/core.h"
-#include "paddle/tcmpt/api/include/dev/math.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc b/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc
index 3b005e10d9b98..bbc7f01597900 100644
--- a/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc
+++ b/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc
@@ -32,7 +32,7 @@ using MultiVarMsg = ::paddle::distributed::MultiVariableMessage;
 using VarMsg = ::paddle::distributed::VariableMessage;
 DECLARE_double(eager_delete_tensor_gb);
 
-USE_OP(scale);
+USE_NO_KERNEL_OP(scale);
 USE_NO_KERNEL_OP(heter_listen_and_serv);
 
 framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) {
diff --git a/paddle/fluid/operators/pscore/heter_server_test.cc b/paddle/fluid/operators/pscore/heter_server_test.cc
index df2eb70b144e4..3e6897073e129 100644
--- a/paddle/fluid/operators/pscore/heter_server_test.cc
+++ b/paddle/fluid/operators/pscore/heter_server_test.cc
@@ -29,7 +29,7 @@ namespace distributed = paddle::distributed;
 using MultiVarMsg = ::paddle::distributed::MultiVariableMessage;
 using VarMsg = ::paddle::distributed::VariableMessage;
 
-USE_OP(scale);
+USE_NO_KERNEL_OP(scale);
 
 std::shared_ptr<distributed::HeterServer> b_rpc_service;
 
diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc
index a195452791048..ae917eb934f24 100644
--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/scale_op.h"
 #include <string>
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
@@ -146,28 +146,3 @@ REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker,
                   ops::ScaleGradMaker<paddle::framework::OpDesc>,
                   ops::ScaleGradMaker<paddle::imperative::OpBase>,
                   ops::ScaleOpVarTypeInference, ops::ScaleOpInplaceInferer);
-REGISTER_OP_CPU_KERNEL(
-    scale, ops::ScaleKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::ScaleKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::ScaleKernel<paddle::platform::CPUDeviceContext,
-                     paddle::platform::bfloat16>,
-    ops::ScaleKernel<paddle::platform::CPUDeviceContext, uint8_t>,
-    ops::ScaleKernel<paddle::platform::CPUDeviceContext, int8_t>,
-    ops::ScaleKernel<paddle::platform::CPUDeviceContext, int16_t>,
-    ops::ScaleKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::ScaleKernel<paddle::platform::CPUDeviceContext, int64_t>);
-
-REGISTER_OP_CUDA_KERNEL(
-    scale,
-    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, float>,
-    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, double>,
-    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
-                                   uint8_t>,
-    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, int8_t>,
-    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
-                                   int16_t>,
-    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, int>,
-    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
-                                   int64_t>,
-    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
-                                   paddle::platform::float16>);
diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h
deleted file mode 100644
index 723f9bb7c256e..0000000000000
--- a/paddle/fluid/operators/scale_op.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/tcmpt_utils.h"
-
-// only can include the headers in paddle/tcmpt/api dirs
-#include "paddle/tcmpt/api/include/dev/core.h"
-#include "paddle/tcmpt/api/include/dev/math.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-static inline T GetAttrFromTensor(const framework::Tensor* tensor) {
-  const auto* tensor_data = tensor->data<T>();
-  framework::Tensor cpu_tensor;
-  if (platform::is_gpu_place(tensor->place()) ||
-      platform::is_npu_place(tensor->place())) {
-    TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
-    tensor_data = cpu_tensor.data<T>();
-  }
-  return tensor_data[0];
-}
-
-template <typename DeviceContext, typename T>
-class ScaleKernel : public framework::OpKernel<T> {
- public:
-  virtual void Compute(const framework::ExecutionContext& ctx) const {
-    auto* in_var = ctx.InputVar("X");
-    auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var);
-
-    auto bias = ctx.Attr<float>("bias");
-    auto bias_after_scale = ctx.Attr<bool>("bias_after_scale");
-
-    auto scale = ctx.Attr<float>("scale");
-    if (ctx.HasInput("ScaleTensor")) {
-      auto* scale_tensor = ctx.Input<framework::Tensor>("ScaleTensor");
-      scale = static_cast<float>(GetAttrFromTensor<T>(scale_tensor));
-    }
-
-    auto* out_var = ctx.OutputVar("Out");
-    if (in_var->IsType<framework::SelectedRows>() && in_var != out_var) {
-      auto& in_slr = in_var->Get<framework::SelectedRows>();
-      auto* out_slr = out_var->GetMutable<framework::SelectedRows>();
-      out_slr->set_rows(in_slr.rows());
-      out_slr->set_height(in_slr.height());
-    }
-
-    auto* out =
-        framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var);
-    auto& dev_ctx = ctx.device_context<DeviceContext>();
-
-#ifdef PADDLE_WITH_MKLDNN
-    auto pt_x = framework::MakeTensorImpl<pt::MKLDNNDenseTensor>(
-        *in, in->place(), in->type());
-    auto pt_out = framework::MakeTensorImpl<pt::MKLDNNDenseTensor>(
-        *out, in->place(), in->type());
-#else
-    auto pt_x = framework::MakeTensorImpl<pt::DenseTensor>(*in, in->place(),
-                                                           in->type());
-    auto pt_out = framework::MakeTensorImpl<pt::DenseTensor>(*out, in->place(),
-                                                             in->type());
-#endif
-
-    // call new kernel
-    pt::Scale<T>(dev_ctx, *pt_x.get(), scale, bias, bias_after_scale,
-                 pt_out.get());
-
-    // share pt_out data to out
-    framework::ShareTensorImpl(pt_out.get(), out);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/scale_op_npu.cc b/paddle/fluid/operators/scale_op_npu.cc
index 2381719020869..159a213471d1b 100644
--- a/paddle/fluid/operators/scale_op_npu.cc
+++ b/paddle/fluid/operators/scale_op_npu.cc
@@ -15,8 +15,8 @@ limitations under the License. */
 #include <memory>
 #include <string>
 
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
-#include "paddle/fluid/operators/scale_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/scale_op_xpu.cc b/paddle/fluid/operators/scale_op_xpu.cc
index e0dfad91570ad..da1c8caa84555 100644
--- a/paddle/fluid/operators/scale_op_xpu.cc
+++ b/paddle/fluid/operators/scale_op_xpu.cc
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
 
-#include "paddle/fluid/operators/scale_op.h"
 #include <string>
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/xpu/xpu_header.h"
 
 namespace paddle {
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index d99f991911e9c..fd2578e0f093f 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -163,7 +163,7 @@ if(WITH_GPU)
   nv_test(device_event_test SRCS device_event_test.cc DEPS device_event_gpu)
 
   nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
-  nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
+  nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda tcmpt)
   nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
   nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
 endif()
diff --git a/paddle/tcmpt/api/CMakeLists.txt b/paddle/tcmpt/api/CMakeLists.txt
index 5826810fd32ff..454f364fc6d1a 100644
--- a/paddle/tcmpt/api/CMakeLists.txt
+++ b/paddle/tcmpt/api/CMakeLists.txt
@@ -1,5 +1,14 @@
 add_subdirectory(src)
 
+# set(declare_file ${PADDLE_BINARY_DIR}/paddle/tcmpt/api/symbols.h.tmp CACHE INTERNAL "symbols.h file")
+# set(declare_file_final ${PADDLE_BINARY_DIR}/paddle/tcmpt/api/symbols.h)
+# file(WRITE ${declare_file} "// Generated by the paddle/tcmpt/api/CMakeLists.txt.  DO NOT EDIT!\n\n")
+
+# function(declare_module TARGTE)
+#     file(APPEND ${declare_file} "extern int RegisterSymbolsFor${TARGET}();\n")
+#     message(STATUS "")
+# endfunction()
+
 set(TCMPT_DEPS convert_utils dense_tensor selected_rows_tensor kernel_factory kernel_context)
 set(TCMPT_DEPS ${TCMPT_DEPS} math_cpu)
 if(WITH_GPU OR WITH_ROCM)
@@ -8,4 +17,10 @@ endif()
 
 set(TCMPT_DEPS ${TCMPT_DEPS} math_api)
 
+# TODO(chenweihang): unify decclare into **_library
+# declare_module(MathCPU)
+# declare_module(MathCUDA)
+
 cc_library(tcmpt SRCS all.cc DEPS ${TCMPT_DEPS})
+
+# copy_if_different(${declare_file} ${declare_file_final})
diff --git a/paddle/tcmpt/api/all.h b/paddle/tcmpt/api/all.h
index 60bd3c342b75d..a30159ae4beab 100644
--- a/paddle/tcmpt/api/all.h
+++ b/paddle/tcmpt/api/all.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 // develop apis
 #include "paddle/tcmpt/api/include/dev/core.h"
+#include "paddle/tcmpt/api/include/dev/infershape.h"
 #include "paddle/tcmpt/api/include/dev/math.h"
 
 // user apis
diff --git a/paddle/tcmpt/api/include/dev/infershape.h b/paddle/tcmpt/api/include/dev/infershape.h
new file mode 100644
index 0000000000000..3ac4d37459e71
--- /dev/null
+++ b/paddle/tcmpt/api/include/dev/infershape.h
@@ -0,0 +1,18 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+// See Note: [ How do we organize the kernel directory ]
+#include "paddle/tcmpt/infershape/unary.h"
diff --git a/paddle/tcmpt/api/include/dev/symbols.h b/paddle/tcmpt/api/include/dev/symbols.h
new file mode 100644
index 0000000000000..7d723ea7f6fb8
--- /dev/null
+++ b/paddle/tcmpt/api/include/dev/symbols.h
@@ -0,0 +1,21 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/tcmpt/core/kernel_registry.h"
+
+// symbol declare
+PT_DECLARE_MODULE(MathCPU);
+PT_DECLARE_MODULE(MathCUDA);
diff --git a/paddle/tcmpt/api/src/math.cc b/paddle/tcmpt/api/src/math.cc
index 65abdc95ed4ba..813cfde997edc 100644
--- a/paddle/tcmpt/api/src/math.cc
+++ b/paddle/tcmpt/api/src/math.cc
@@ -18,11 +18,10 @@ limitations under the License. */
 
 #include "glog/logging.h"
 
-#include "paddle/tcmpt/core/convert_utils.h"
-#include "paddle/tcmpt/core/dense_tensor.h"
-#include "paddle/tcmpt/core/kernel_context.h"
+#include "paddle/tcmpt/api/include/dev/core.h"
+#include "paddle/tcmpt/api/include/dev/infershape.h"
+#include "paddle/tcmpt/api/include/dev/math.h"
 #include "paddle/tcmpt/core/kernel_generate.h"
-#include "paddle/tcmpt/infershape/unary.h"
 
 namespace pt {
 
@@ -61,6 +60,7 @@ Tensor mean(const Tensor& x) {
   out.set_impl(dense_out);
 
   // 6. Call kernel
+  // TODO(chenweihang): finally, we may call the function directly,
   kernel(&kernel_context);
 
   return out;
diff --git a/paddle/tcmpt/core/kernel_registry.h b/paddle/tcmpt/core/kernel_registry.h
index 2066de3e6dadc..5bdb9f8744c80 100644
--- a/paddle/tcmpt/core/kernel_registry.h
+++ b/paddle/tcmpt/core/kernel_registry.h
@@ -112,6 +112,13 @@ struct KernelRegistrar {
 #define PT_ID __LINE__
 #endif
 
+#if defined(_WIN32)
+#define UNUSED
+#define __builtin_expect(EXP, C) (EXP)
+#else
+#define UNUSED __attribute__((unused))
+#endif
+
 #define PT_CONCATENATE(arg1, arg2) PT_CONCATENATE1(arg1, arg2)
 #define PT_CONCATENATE1(arg1, arg2) PT_CONCATENATE2(arg1, arg2)
 #define PT_CONCATENATE2(arg1, arg2) arg1##arg2
@@ -462,6 +469,14 @@ struct KernelRegistrar {
       PT_KERNEL(kernel_fn));                                               \
   void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id)(::pt::Kernel*)
 
+// use to declare symbol
+#define PT_REGISTER_MODULE(name) \
+  int RegisterSymbolsFor##name() { return 0; }
+
+#define PT_DECLARE_MODULE(name)          \
+  extern int RegisterSymbolsFor##name(); \
+  UNUSED static int use_kernel_module_##name = RegisterSymbolsFor##name()
+
 // only used in cpp tests
 
 #define PT_REGISTER_KERNEL_FOR_TEST(                              \
diff --git a/paddle/tcmpt/cpu/math.cc b/paddle/tcmpt/cpu/math.cc
index e393576ad692d..b66d57c8ee78d 100644
--- a/paddle/tcmpt/cpu/math.cc
+++ b/paddle/tcmpt/cpu/math.cc
@@ -104,6 +104,9 @@ void ScaleSelectedRowsHost(const CPUContext& dev_ctx,
 
 }  // namespace pt
 
+// TODO(chenweihang): replace by better impl
+PT_REGISTER_MODULE(MathCPU);
+
 using bfloat16 = ::paddle::platform::bfloat16;
 PT_REGISTER_KERNEL("sign", CPU, NCHW, pt::Sign, float, double) {}
 PT_REGISTER_KERNEL("mean", CPU, NCHW, pt::Mean, float, double) {}
diff --git a/paddle/tcmpt/cuda/math.cu b/paddle/tcmpt/cuda/math.cu
index c4d6663a063cc..b96337ef20d04 100644
--- a/paddle/tcmpt/cuda/math.cu
+++ b/paddle/tcmpt/cuda/math.cu
@@ -144,6 +144,9 @@ void ScaleSelectedRowsHost(const CUDAContext& dev_ctx,
 
 }  // namespace pt
 
+// TODO(chenweihang): replace by better impl
+PT_REGISTER_MODULE(MathCUDA);
+
 using float16 = paddle::platform::float16;
 PT_REGISTER_KERNEL("sign", CUDA, NCHW, pt::Sign, float, double, float16) {}
 PT_REGISTER_KERNEL("mean", CUDA, NCHW, pt::Mean, float, double, float16) {}

From 8add5e47269280eb81d5a3b210ec5890d6858267 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Sat, 18 Sep 2021 13:48:01 +0000
Subject: [PATCH 058/125] fix dot compiled failed

---
 cmake/operators.cmake                      |  2 +-
 paddle/fluid/operators/dot_op.cc           |  9 -------
 paddle/fluid/operators/dot_op.cu           |  7 ------
 paddle/fluid/operators/dot_op.h            | 29 ----------------------
 paddle/tcmpt/api/all.h                     |  2 +-
 paddle/tcmpt/api/src/CMakeLists.txt        |  6 ++---
 paddle/tcmpt/api/src/{dot.cc => linalg.cc} |  2 +-
 paddle/tcmpt/cpu/CMakeLists.txt            |  2 +-
 paddle/tcmpt/cpu/linalg.h                  |  2 ++
 paddle/tcmpt/cuda/CMakeLists.txt           |  4 +--
 paddle/tcmpt/tests/CMakeLists.txt          |  2 +-
 paddle/tcmpt/tests/test_dot_api.cc         | 21 +++-------------
 paddle/tcmpt/tests/test_mean_api.cc        |  8 ++----
 13 files changed, 17 insertions(+), 79 deletions(-)
 rename paddle/tcmpt/api/src/{dot.cc => linalg.cc} (98%)

diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 285db13361916..1e3e42fc81f6f 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -344,7 +344,7 @@ function(op_library TARGET)
     # only the grad kernel is left, if the USE_OP still be declared in the original way,
     # the symbol will can not be found, so special treatment is needed here, and it will
     # need to be deleted after the complete migration of the kernel in the future.
-    foreach(forward_moved_op "mean")
+    foreach(forward_moved_op "mean" "dot")
         if ("${TARGET}" STREQUAL "${forward_moved_op}")
             file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n")
             file(APPEND ${pybind_file} "USE_OP_KERNEL(${TARGET}_grad);\n")
diff --git a/paddle/fluid/operators/dot_op.cc b/paddle/fluid/operators/dot_op.cc
index 31acd9718115c..b7f65a025fb79 100644
--- a/paddle/fluid/operators/dot_op.cc
+++ b/paddle/fluid/operators/dot_op.cc
@@ -148,15 +148,6 @@ REGISTER_OPERATOR(dot, ops::DotOp, ops::DotOpMaker,
 
 REGISTER_OPERATOR(dot_grad, ops::DotGradOp);
 
-REGISTER_OP_CPU_KERNEL(
-    dot, ops::DotKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::DotKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::DotKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::DotKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::DotKernel<paddle::platform::CPUDeviceContext,
-                   paddle::platform::complex<float>>,
-    ops::DotKernel<paddle::platform::CPUDeviceContext,
-                   paddle::platform::complex<double>>);
 REGISTER_OP_CPU_KERNEL(
     dot_grad, ops::DotGradKernel<paddle::platform::CPUDeviceContext, float>,
     ops::DotGradKernel<paddle::platform::CPUDeviceContext, double>,
diff --git a/paddle/fluid/operators/dot_op.cu b/paddle/fluid/operators/dot_op.cu
index 49f27e1ffb128..57c9ced7cfbad 100644
--- a/paddle/fluid/operators/dot_op.cu
+++ b/paddle/fluid/operators/dot_op.cu
@@ -17,13 +17,6 @@
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
-REGISTER_OP_CUDA_KERNEL(
-    dot, ops::DotKernel<plat::CUDADeviceContext, float>,
-    ops::DotKernel<plat::CUDADeviceContext, double>,
-    ops::DotKernel<plat::CUDADeviceContext, int>,
-    ops::DotKernel<plat::CUDADeviceContext, int64_t>,
-    ops::DotKernel<plat::CUDADeviceContext, paddle::platform::complex<float>>,
-    ops::DotKernel<plat::CUDADeviceContext, paddle::platform::complex<double>>);
 REGISTER_OP_CUDA_KERNEL(dot_grad,
                         ops::DotGradKernel<plat::CUDADeviceContext, float>,
                         ops::DotGradKernel<plat::CUDADeviceContext, double>,
diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h
index 65e22354d6a79..7c3b6c164d0bf 100644
--- a/paddle/fluid/operators/dot_op.h
+++ b/paddle/fluid/operators/dot_op.h
@@ -16,14 +16,9 @@
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/tcmpt_utils.h"
 #include "paddle/fluid/operators/math/complex_functors.h"
 #include "paddle/fluid/platform/for_range.h"
 
-// only can include the headers in paddle/tcmpt/api dirs
-#include "paddle/tcmpt/api/include/dev/core.h"
-#include "paddle/tcmpt/api/include/dev/dot.h"
-
 namespace paddle {
 namespace operators {
 
@@ -233,30 +228,6 @@ struct DotGradFunction<DeviceContext, T, math::DisableComplex<T>> {
   }
 };
 
-template <typename DeviceContext, typename T>
-class DotKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto* out = ctx.Output<Tensor>("Out");
-    auto& dev_ctx = ctx.device_context<DeviceContext>();
-
-    auto pt_x =
-        framework::MakeTensorImpl<pt::DenseTensor>(*x, x->place(), x->type());
-    auto pt_y =
-        framework::MakeTensorImpl<pt::DenseTensor>(*y, y->place(), y->type());
-    auto pt_out =
-        framework::MakeTensorImpl<pt::DenseTensor>(*out, x->place(), x->type());
-
-    // call new kernel
-    pt::Dot<T>(dev_ctx, *pt_x.get(), *pt_y.get(), pt_out.get());
-
-    // share pt_out data to out
-    framework::ShareTensorImpl(pt_out.get(), out);
-  }
-};
-
 template <typename DeviceContext, typename T>
 class DotGradKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/tcmpt/api/all.h b/paddle/tcmpt/api/all.h
index 5ab0c347dc294..2b5524396072a 100644
--- a/paddle/tcmpt/api/all.h
+++ b/paddle/tcmpt/api/all.h
@@ -17,7 +17,7 @@ limitations under the License. */
 // develop apis
 #include "paddle/tcmpt/api/include/dev/core.h"
 #include "paddle/tcmpt/api/include/dev/infershape.h"
-#include "paddle/tcmpt/api/include/dev/lianlg.h"
+#include "paddle/tcmpt/api/include/dev/linalg.h"
 #include "paddle/tcmpt/api/include/dev/math.h"
 
 // user apis
diff --git a/paddle/tcmpt/api/src/CMakeLists.txt b/paddle/tcmpt/api/src/CMakeLists.txt
index 21c871f353a76..3deb6a08dbc86 100644
--- a/paddle/tcmpt/api/src/CMakeLists.txt
+++ b/paddle/tcmpt/api/src/CMakeLists.txt
@@ -1,7 +1,7 @@
 set(API_DEPS convert_utils dense_tensor selected_rows_tensor kernel_factory kernel_context)
-set(API_DEPS ${API_DEPS} math_cpu dot_cpu)
+set(API_DEPS ${API_DEPS} math_cpu linalg_cpu)
 if(WITH_GPU OR WITH_ROCM)
-  set(API_DEPS ${API_DEPS} math_cuda dot_cuda)
+  set(API_DEPS ${API_DEPS} math_cuda linalg_cuda)
 endif()
 cc_library(math_api SRCS math.cc DEPS ${API_DEPS})
-cc_library(dot_api SRCS dot.cc DEPS ${API_DEPS})
+cc_library(linalg_api SRCS linalg.cc DEPS ${API_DEPS})
diff --git a/paddle/tcmpt/api/src/dot.cc b/paddle/tcmpt/api/src/linalg.cc
similarity index 98%
rename from paddle/tcmpt/api/src/dot.cc
rename to paddle/tcmpt/api/src/linalg.cc
index 9e15e4c4288ad..4be1c67bd169b 100644
--- a/paddle/tcmpt/api/src/dot.cc
+++ b/paddle/tcmpt/api/src/linalg.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/tcmpt/api/include/dot.h"
+#include "paddle/tcmpt/api/include/linalg.h"
 
 #include <memory>
 
diff --git a/paddle/tcmpt/cpu/CMakeLists.txt b/paddle/tcmpt/cpu/CMakeLists.txt
index fee9e5cf5a647..8ee42a210b7f8 100644
--- a/paddle/tcmpt/cpu/CMakeLists.txt
+++ b/paddle/tcmpt/cpu/CMakeLists.txt
@@ -1,2 +1,2 @@
 cc_library(math_cpu SRCS math.cc DEPS dense_tensor kernel_context kernel_factory eigen_function)
-cc_library(linalg_cpu SRCS dot.cc DEPS dense_tensor kernel_context kernel_factory)
+cc_library(linalg_cpu SRCS linalg.cc DEPS dense_tensor kernel_context kernel_factory)
diff --git a/paddle/tcmpt/cpu/linalg.h b/paddle/tcmpt/cpu/linalg.h
index aab40e2c4f6d9..c457943538761 100644
--- a/paddle/tcmpt/cpu/linalg.h
+++ b/paddle/tcmpt/cpu/linalg.h
@@ -21,6 +21,8 @@
 
 namespace pt {
 
+using CPUContext = paddle::platform::CPUDeviceContext;
+
 template <typename T>
 void Dot(const CPUContext& dev_ctx,
          const DenseTensor& x,
diff --git a/paddle/tcmpt/cuda/CMakeLists.txt b/paddle/tcmpt/cuda/CMakeLists.txt
index 9787aaea17e64..d695bf7b28a2b 100644
--- a/paddle/tcmpt/cuda/CMakeLists.txt
+++ b/paddle/tcmpt/cuda/CMakeLists.txt
@@ -1,7 +1,7 @@
 if(WITH_GPU)
   nv_library(math_cuda SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory)
-  nv_library(linalg_cuda SRCS dot.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
+  nv_library(linalg_cuda SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
 elseif(WITH_ROCM)
   hip_library(math_cuda SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory)
-  hip_library(linalg_cuda SRCS dot.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
+  hip_library(linalg_cuda SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
 endif()
diff --git a/paddle/tcmpt/tests/CMakeLists.txt b/paddle/tcmpt/tests/CMakeLists.txt
index 272f4769bf993..aeeec69adc8e3 100644
--- a/paddle/tcmpt/tests/CMakeLists.txt
+++ b/paddle/tcmpt/tests/CMakeLists.txt
@@ -1,4 +1,4 @@
 cc_test(dense_tensor_test SRCS dense_tensor_test.cc DEPS dense_tensor)
 cc_test(kernel_factory_test SRCS kernel_factory_test.cc DEPS kernel_factory)
 cc_test(test_mean_api SRCS test_mean_api.cc DEPS math_api)
-cc_test(test_dot_api SRCS test_dot_api.cc DEPS dot_api)
+cc_test(test_dot_api SRCS test_dot_api.cc DEPS linalg_api)
diff --git a/paddle/tcmpt/tests/test_dot_api.cc b/paddle/tcmpt/tests/test_dot_api.cc
index a7d0cd3d10155..fafd095d02166 100644
--- a/paddle/tcmpt/tests/test_dot_api.cc
+++ b/paddle/tcmpt/tests/test_dot_api.cc
@@ -15,10 +15,10 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <memory>
 
-#include "paddle/tcmpt/api/include/dot.h"
-#include "paddle/tcmpt/core/dense_tensor.h"
+#include "paddle/tcmpt/api/include/dev/symbols.h"
+#include "paddle/tcmpt/api/include/linalg.h"
 
-#include "paddle/tcmpt/cpu/dot.h"
+#include "paddle/tcmpt/core/dense_tensor.h"
 
 namespace framework = paddle::framework;
 using DDim = paddle::framework::DDim;
@@ -74,18 +74,3 @@ TEST(API, dot) {
   ASSERT_NEAR(expect_result[1], actual_result1, 1e-6f);
   ASSERT_NEAR(expect_result[2], actual_result2, 1e-6f);
 }
-
-// TODO(chenweihang): register kernel in test, all kernels in cpu/math.h are
-// registered
-using complex64 = ::paddle::platform::complex<float>;
-using complex128 = ::paddle::platform::complex<double>;
-PT_REGISTER_KERNEL_FOR_TEST("dot",
-                            CPU,
-                            NCHW,
-                            pt::Dot,
-                            float,
-                            double,
-                            int,
-                            int64_t,
-                            complex64,
-                            complex128) {}
diff --git a/paddle/tcmpt/tests/test_mean_api.cc b/paddle/tcmpt/tests/test_mean_api.cc
index 7483ab837334c..293f302cbead4 100644
--- a/paddle/tcmpt/tests/test_mean_api.cc
+++ b/paddle/tcmpt/tests/test_mean_api.cc
@@ -15,10 +15,10 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <memory>
 
+#include "paddle/tcmpt/api/include/dev/symbols.h"
 #include "paddle/tcmpt/api/include/math.h"
-#include "paddle/tcmpt/core/dense_tensor.h"
 
-#include "paddle/tcmpt/cpu/math.h"
+#include "paddle/tcmpt/core/dense_tensor.h"
 
 namespace framework = paddle::framework;
 using DDim = paddle::framework::DDim;
@@ -58,7 +58,3 @@ TEST(API, mean) {
   auto actual_result = dense_out->data<float>()[0];
   ASSERT_NEAR(expect_result, actual_result, 1e-6f);
 }
-
-// TODO(chenweihang): register kernel in test, all kernels in cpu/math.h are
-// registered
-PT_REGISTER_KERNEL_FOR_TEST("mean", CPU, NCHW, pt::Mean, float, double) {}

From 71a340375ea78fff93f5db63ad0b921bd045883a Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Wed, 22 Sep 2021 03:40:14 +0000
Subject: [PATCH 059/125] add merco for module declare

---
 paddle/tcmpt/api/include/dev/symbols.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/paddle/tcmpt/api/include/dev/symbols.h b/paddle/tcmpt/api/include/dev/symbols.h
index 84645a31bca7e..c590c95c1fc94 100644
--- a/paddle/tcmpt/api/include/dev/symbols.h
+++ b/paddle/tcmpt/api/include/dev/symbols.h
@@ -18,7 +18,9 @@ limitations under the License. */
 
 // symbol declare
 PT_DECLARE_MODULE(MathCPU);
-PT_DECLARE_MODULE(MathCUDA);
-
 PT_DECLARE_MODULE(LinalgCPU);
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PT_DECLARE_MODULE(MathCUDA);
 PT_DECLARE_MODULE(LinalgCUDA);
+#endif

From 466303373248e0b9285f4471dfe5ef7021fea76f Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Wed, 22 Sep 2021 07:30:57 +0000
Subject: [PATCH 060/125] fix npu and xpu compile error

---
 cmake/operators.cmake                  | 36 +++++++++++++-------------
 paddle/fluid/operators/scale_op_npu.cc | 12 +++++++++
 paddle/fluid/operators/scale_op_xpu.cc | 13 ++++++++++
 3 files changed, 43 insertions(+), 18 deletions(-)

diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 11ae6a0f4eb95..f4d8c2404a714 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -250,6 +250,24 @@ function(op_library TARGET)
         set(pybind_flag 1)
     endif()
 
+    # TODO(chenweihang): Because the Tensor compute library will migrate the forward Kernel,
+    # only the grad kernel is left, if the USE_OP still be declared in the original way,
+    # the symbol will can not be found, so special treatment is needed here, and it will
+    # need to be deleted after the complete migration of the kernel in the future.
+    foreach(forward_moved_op "mean" "dot")
+        if ("${TARGET}" STREQUAL "${forward_moved_op}")
+            file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n")
+            file(APPEND ${pybind_file} "USE_OP_KERNEL(${TARGET}_grad);\n")
+            set(pybind_flag 1)
+        endif()
+    endforeach()
+    foreach(moved_op "scale")
+        if ("${TARGET}" STREQUAL "${forward_moved_op}")
+            file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n")
+            set(pybind_flag 1)
+        endif()
+    endforeach()
+
     # pybind USE_CPU_ONLY_OP
     list(LENGTH cu_srcs cu_srcs_len)
     list(LENGTH hip_srcs hip_srcs_len)
@@ -342,24 +360,6 @@ function(op_library TARGET)
       endif()
     endif()
 
-    # TODO(chenweihang): Because the Tensor compute library will migrate the forward Kernel,
-    # only the grad kernel is left, if the USE_OP still be declared in the original way,
-    # the symbol will can not be found, so special treatment is needed here, and it will
-    # need to be deleted after the complete migration of the kernel in the future.
-    foreach(forward_moved_op "mean" "dot")
-        if ("${TARGET}" STREQUAL "${forward_moved_op}")
-            file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n")
-            file(APPEND ${pybind_file} "USE_OP_KERNEL(${TARGET}_grad);\n")
-            set(pybind_flag 1)
-        endif()
-    endforeach()
-    foreach(moved_op "scale")
-        if ("${TARGET}" STREQUAL "${forward_moved_op}")
-            file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n")
-            set(pybind_flag 1)
-        endif()
-    endforeach()
-
     # pybind USE_OP
     if (${pybind_flag} EQUAL 0)
       # NOTE(*): activation use macro to regist the kernels, set use_op manually.
diff --git a/paddle/fluid/operators/scale_op_npu.cc b/paddle/fluid/operators/scale_op_npu.cc
index 159a213471d1b..094ea798c34d2 100644
--- a/paddle/fluid/operators/scale_op_npu.cc
+++ b/paddle/fluid/operators/scale_op_npu.cc
@@ -21,6 +21,18 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+template <typename T>
+static inline T GetAttrFromTensor(const framework::Tensor* tensor) {
+  const auto* tensor_data = tensor->data<T>();
+  framework::Tensor cpu_tensor;
+  if (platform::is_gpu_place(tensor->place()) ||
+      platform::is_npu_place(tensor->place())) {
+    TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
+    tensor_data = cpu_tensor.data<T>();
+  }
+  return tensor_data[0];
+}
+
 template <typename T>
 class ScaleNPUKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/fluid/operators/scale_op_xpu.cc b/paddle/fluid/operators/scale_op_xpu.cc
index da1c8caa84555..cfec77a9e6b31 100644
--- a/paddle/fluid/operators/scale_op_xpu.cc
+++ b/paddle/fluid/operators/scale_op_xpu.cc
@@ -20,6 +20,19 @@ limitations under the License. */
 
 namespace paddle {
 namespace operators {
+
+template <typename T>
+static inline T GetAttrFromTensor(const framework::Tensor* tensor) {
+  const auto* tensor_data = tensor->data<T>();
+  framework::Tensor cpu_tensor;
+  if (platform::is_gpu_place(tensor->place()) ||
+      platform::is_npu_place(tensor->place())) {
+    TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
+    tensor_data = cpu_tensor.data<T>();
+  }
+  return tensor_data[0];
+}
+
 template <typename DeviceContext, typename T>
 class ScaleXPUKernel : public framework::OpKernel<T> {
  public:

From be15b0215a2063f6cd442cf63e7d50574d163f72 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Thu, 23 Sep 2021 06:27:20 +0000
Subject: [PATCH 061/125] revert sign, mean, scale, dot kernel removing

---
 cmake/operators.cmake                  | 18 ------
 paddle/fluid/operators/dot_op.cc       |  9 +++
 paddle/fluid/operators/dot_op.cu       |  7 +++
 paddle/fluid/operators/dot_op.h        | 27 +++++++++
 paddle/fluid/operators/mean_op.cc      |  4 +-
 paddle/fluid/operators/mean_op.cu      |  4 ++
 paddle/fluid/operators/mean_op.h       | 24 ++++++++
 paddle/fluid/operators/scale_op.cc     | 27 ++++++++-
 paddle/fluid/operators/scale_op.h      | 80 ++++++++++++++++++++++++++
 paddle/fluid/operators/scale_op_npu.cc | 14 +----
 paddle/fluid/operators/scale_op_xpu.cc | 14 +----
 paddle/fluid/operators/sign_op.cc      | 13 ++++-
 paddle/fluid/operators/sign_op.h       | 48 ++++++++++++++++
 13 files changed, 242 insertions(+), 47 deletions(-)
 create mode 100644 paddle/fluid/operators/scale_op.h
 create mode 100644 paddle/fluid/operators/sign_op.h

diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index f4d8c2404a714..2c010a1e6297f 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -250,24 +250,6 @@ function(op_library TARGET)
         set(pybind_flag 1)
     endif()
 
-    # TODO(chenweihang): Because the Tensor compute library will migrate the forward Kernel,
-    # only the grad kernel is left, if the USE_OP still be declared in the original way,
-    # the symbol will can not be found, so special treatment is needed here, and it will
-    # need to be deleted after the complete migration of the kernel in the future.
-    foreach(forward_moved_op "mean" "dot")
-        if ("${TARGET}" STREQUAL "${forward_moved_op}")
-            file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n")
-            file(APPEND ${pybind_file} "USE_OP_KERNEL(${TARGET}_grad);\n")
-            set(pybind_flag 1)
-        endif()
-    endforeach()
-    foreach(moved_op "scale")
-        if ("${TARGET}" STREQUAL "${forward_moved_op}")
-            file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n")
-            set(pybind_flag 1)
-        endif()
-    endforeach()
-
     # pybind USE_CPU_ONLY_OP
     list(LENGTH cu_srcs cu_srcs_len)
     list(LENGTH hip_srcs hip_srcs_len)
diff --git a/paddle/fluid/operators/dot_op.cc b/paddle/fluid/operators/dot_op.cc
index b7f65a025fb79..31acd9718115c 100644
--- a/paddle/fluid/operators/dot_op.cc
+++ b/paddle/fluid/operators/dot_op.cc
@@ -148,6 +148,15 @@ REGISTER_OPERATOR(dot, ops::DotOp, ops::DotOpMaker,
 
 REGISTER_OPERATOR(dot_grad, ops::DotGradOp);
 
+REGISTER_OP_CPU_KERNEL(
+    dot, ops::DotKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::DotKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::DotKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::DotKernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::DotKernel<paddle::platform::CPUDeviceContext,
+                   paddle::platform::complex<float>>,
+    ops::DotKernel<paddle::platform::CPUDeviceContext,
+                   paddle::platform::complex<double>>);
 REGISTER_OP_CPU_KERNEL(
     dot_grad, ops::DotGradKernel<paddle::platform::CPUDeviceContext, float>,
     ops::DotGradKernel<paddle::platform::CPUDeviceContext, double>,
diff --git a/paddle/fluid/operators/dot_op.cu b/paddle/fluid/operators/dot_op.cu
index 57c9ced7cfbad..49f27e1ffb128 100644
--- a/paddle/fluid/operators/dot_op.cu
+++ b/paddle/fluid/operators/dot_op.cu
@@ -17,6 +17,13 @@
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
+REGISTER_OP_CUDA_KERNEL(
+    dot, ops::DotKernel<plat::CUDADeviceContext, float>,
+    ops::DotKernel<plat::CUDADeviceContext, double>,
+    ops::DotKernel<plat::CUDADeviceContext, int>,
+    ops::DotKernel<plat::CUDADeviceContext, int64_t>,
+    ops::DotKernel<plat::CUDADeviceContext, paddle::platform::complex<float>>,
+    ops::DotKernel<plat::CUDADeviceContext, paddle::platform::complex<double>>);
 REGISTER_OP_CUDA_KERNEL(dot_grad,
                         ops::DotGradKernel<plat::CUDADeviceContext, float>,
                         ops::DotGradKernel<plat::CUDADeviceContext, double>,
diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h
index 7c3b6c164d0bf..7bb8c84bafdfe 100644
--- a/paddle/fluid/operators/dot_op.h
+++ b/paddle/fluid/operators/dot_op.h
@@ -16,9 +16,14 @@
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/tcmpt_utils.h"
 #include "paddle/fluid/operators/math/complex_functors.h"
 #include "paddle/fluid/platform/for_range.h"
 
+// only can include the headers in paddle/tcmpt/api dirs
+#include "paddle/tcmpt/api/include/dev/core.h"
+#include "paddle/tcmpt/api/include/dev/linalg.h"
+
 namespace paddle {
 namespace operators {
 
@@ -228,6 +233,28 @@ struct DotGradFunction<DeviceContext, T, math::DisableComplex<T>> {
   }
 };
 
+template <typename DeviceContext, typename T>
+class DotKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* out = ctx.Output<Tensor>("Out");
+    auto& dev_ctx = ctx.device_context<DeviceContext>();
+    out->mutable_data<T>(x->place(), x->type());
+
+    auto pt_x =
+        framework::MakeTensorImpl<pt::DenseTensor>(*x, x->place(), x->type());
+    auto pt_y =
+        framework::MakeTensorImpl<pt::DenseTensor>(*y, y->place(), y->type());
+    auto pt_out =
+        framework::MakeTensorImpl<pt::DenseTensor>(*out, x->place(), x->type());
+
+    // call new kernel
+    pt::Dot<T>(dev_ctx, *pt_x.get(), *pt_y.get(), pt_out.get());
+  }
+};
+
 template <typename DeviceContext, typename T>
 class DotGradKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc
index 2489cd18bb00f..764529a15b6a2 100644
--- a/paddle/fluid/operators/mean_op.cc
+++ b/paddle/fluid/operators/mean_op.cc
@@ -94,7 +94,9 @@ REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanOpInferVarType,
                   ops::MeanGradMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(mean_grad, ops::MeanGradOp,
                   ops::MeanGradNoNeedBufferVarsInferer);
-
+REGISTER_OP_CPU_KERNEL(
+    mean, ops::MeanKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MeanKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
     mean_grad, ops::MeanGradKernel<paddle::platform::CPUDeviceContext, float>,
     ops::MeanGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu
index 786d73ee9c811..ffb667ba974b8 100644
--- a/paddle/fluid/operators/mean_op.cu
+++ b/paddle/fluid/operators/mean_op.cu
@@ -62,6 +62,10 @@ class MeanCUDAGradKernel : public framework::OpKernel<T> {
 
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
+REGISTER_OP_CUDA_KERNEL(
+    mean, ops::MeanKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MeanKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::MeanKernel<paddle::platform::CUDADeviceContext, plat::float16>);
 REGISTER_OP_CUDA_KERNEL(
     mean_grad,
     ops::MeanCUDAGradKernel<paddle::platform::CUDADeviceContext, float>,
diff --git a/paddle/fluid/operators/mean_op.h b/paddle/fluid/operators/mean_op.h
index 9e752c7173d23..3cb26d09186c8 100644
--- a/paddle/fluid/operators/mean_op.h
+++ b/paddle/fluid/operators/mean_op.h
@@ -15,6 +15,11 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tcmpt_utils.h"
+
+// only can include the headers in paddle/top/api dirs
+#include "paddle/tcmpt/api/include/dev/core.h"
+#include "paddle/tcmpt/api/include/dev/math.h"
 
 namespace paddle {
 namespace operators {
@@ -27,6 +32,25 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
+template <typename DeviceContext, typename T>
+class MeanKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<Tensor>("X");
+    auto* out = context.Output<Tensor>("Out");
+    auto& dev_ctx = context.device_context<DeviceContext>();
+    out->mutable_data<T>(x->place(), x->type());
+
+    auto pt_x =
+        framework::MakeTensorImpl<pt::DenseTensor>(*x, x->place(), x->type());
+    auto pt_out =
+        framework::MakeTensorImpl<pt::DenseTensor>(*out, x->place(), x->type());
+
+    // call new kernel
+    pt::Mean<T>(dev_ctx, *pt_x.get(), pt_out.get());
+  }
+};
+
 template <typename DeviceContext, typename T>
 class MeanGradKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc
index ae917eb934f24..a195452791048 100644
--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/operators/scale_op.h"
 #include <string>
-#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
@@ -146,3 +146,28 @@ REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker,
                   ops::ScaleGradMaker<paddle::framework::OpDesc>,
                   ops::ScaleGradMaker<paddle::imperative::OpBase>,
                   ops::ScaleOpVarTypeInference, ops::ScaleOpInplaceInferer);
+REGISTER_OP_CPU_KERNEL(
+    scale, ops::ScaleKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ScaleKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ScaleKernel<paddle::platform::CPUDeviceContext,
+                     paddle::platform::bfloat16>,
+    ops::ScaleKernel<paddle::platform::CPUDeviceContext, uint8_t>,
+    ops::ScaleKernel<paddle::platform::CPUDeviceContext, int8_t>,
+    ops::ScaleKernel<paddle::platform::CPUDeviceContext, int16_t>,
+    ops::ScaleKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ScaleKernel<paddle::platform::CPUDeviceContext, int64_t>);
+
+REGISTER_OP_CUDA_KERNEL(
+    scale,
+    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, float>,
+    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, double>,
+    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
+                                   uint8_t>,
+    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, int8_t>,
+    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
+                                   int16_t>,
+    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, int>,
+    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
+                                   int64_t>,
+    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
+                                   paddle::platform::float16>);
diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h
new file mode 100644
index 0000000000000..2d66d7f89b880
--- /dev/null
+++ b/paddle/fluid/operators/scale_op.h
@@ -0,0 +1,80 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tcmpt_utils.h"
+
+// only can include the headers in paddle/top/api dirs
+#include "paddle/tcmpt/api/include/dev/core.h"
+#include "paddle/tcmpt/api/include/dev/math.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+static inline T GetAttrFromTensor(const framework::Tensor* tensor) {
+  const auto* tensor_data = tensor->data<T>();
+  framework::Tensor cpu_tensor;
+  if (platform::is_gpu_place(tensor->place()) ||
+      platform::is_npu_place(tensor->place())) {
+    TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
+    tensor_data = cpu_tensor.data<T>();
+  }
+  return tensor_data[0];
+}
+
+template <typename DeviceContext, typename T>
+class ScaleKernel : public framework::OpKernel<T> {
+ public:
+  virtual void Compute(const framework::ExecutionContext& ctx) const {
+    auto* in_var = ctx.InputVar("X");
+    auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var);
+
+    auto bias = ctx.Attr<float>("bias");
+    auto bias_after_scale = ctx.Attr<bool>("bias_after_scale");
+
+    auto scale = ctx.Attr<float>("scale");
+    if (ctx.HasInput("ScaleTensor")) {
+      auto* scale_tensor = ctx.Input<framework::Tensor>("ScaleTensor");
+      scale = static_cast<float>(GetAttrFromTensor<T>(scale_tensor));
+    }
+
+    auto* out_var = ctx.OutputVar("Out");
+    if (in_var->IsType<framework::SelectedRows>() && in_var != out_var) {
+      auto& in_slr = in_var->Get<framework::SelectedRows>();
+      auto* out_slr = out_var->GetMutable<framework::SelectedRows>();
+      out_slr->set_rows(in_slr.rows());
+      out_slr->set_height(in_slr.height());
+    }
+
+    auto* out =
+        framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var);
+    out->mutable_data<T>(in->place(), in->type());
+    auto& dev_ctx = ctx.device_context<DeviceContext>();
+
+    auto pt_x = framework::MakeTensorImpl<pt::DenseTensor>(*in, in->place(),
+                                                           in->type());
+    auto pt_out = framework::MakeTensorImpl<pt::DenseTensor>(*out, in->place(),
+                                                             in->type());
+
+    // call new kernel
+    pt::Scale<T>(dev_ctx, *pt_x.get(), scale, bias, bias_after_scale,
+                 pt_out.get());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/scale_op_npu.cc b/paddle/fluid/operators/scale_op_npu.cc
index 094ea798c34d2..2381719020869 100644
--- a/paddle/fluid/operators/scale_op_npu.cc
+++ b/paddle/fluid/operators/scale_op_npu.cc
@@ -15,24 +15,12 @@ limitations under the License. */
 #include <memory>
 #include <string>
 
-#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
+#include "paddle/fluid/operators/scale_op.h"
 
 namespace paddle {
 namespace operators {
 
-template <typename T>
-static inline T GetAttrFromTensor(const framework::Tensor* tensor) {
-  const auto* tensor_data = tensor->data<T>();
-  framework::Tensor cpu_tensor;
-  if (platform::is_gpu_place(tensor->place()) ||
-      platform::is_npu_place(tensor->place())) {
-    TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
-    tensor_data = cpu_tensor.data<T>();
-  }
-  return tensor_data[0];
-}
-
 template <typename T>
 class ScaleNPUKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/fluid/operators/scale_op_xpu.cc b/paddle/fluid/operators/scale_op_xpu.cc
index cfec77a9e6b31..c467f3f89d064 100644
--- a/paddle/fluid/operators/scale_op_xpu.cc
+++ b/paddle/fluid/operators/scale_op_xpu.cc
@@ -14,25 +14,13 @@ limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
 
+#include "paddle/fluid/operators/scale_op.h"
 #include <string>
-#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/xpu/xpu_header.h"
 
 namespace paddle {
 namespace operators {
 
-template <typename T>
-static inline T GetAttrFromTensor(const framework::Tensor* tensor) {
-  const auto* tensor_data = tensor->data<T>();
-  framework::Tensor cpu_tensor;
-  if (platform::is_gpu_place(tensor->place()) ||
-      platform::is_npu_place(tensor->place())) {
-    TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
-    tensor_data = cpu_tensor.data<T>();
-  }
-  return tensor_data[0];
-}
-
 template <typename DeviceContext, typename T>
 class ScaleXPUKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/fluid/operators/sign_op.cc b/paddle/fluid/operators/sign_op.cc
index 83c1955758f20..a491da3931964 100644
--- a/paddle/fluid/operators/sign_op.cc
+++ b/paddle/fluid/operators/sign_op.cc
@@ -14,7 +14,8 @@ limitations under the License. */
 
 #include <memory>
 
-#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/sign_op.h"
+#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace operators {
@@ -67,3 +68,13 @@ namespace ops = paddle::operators;
 REGISTER_OPERATOR(sign, ops::SignOp, ops::SignOpMaker<float>,
                   ops::SignGradMaker<paddle::framework::OpDesc>,
                   ops::SignGradMaker<paddle::imperative::OpBase>);
+REGISTER_OP_CPU_KERNEL(
+    sign, ops::SignKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SignKernel<paddle::platform::CPUDeviceContext, double>);
+
+REGISTER_OP_CUDA_KERNEL(
+    sign,
+    paddle::operators::SignKernel<paddle::platform::CUDADeviceContext, float>,
+    paddle::operators::SignKernel<paddle::platform::CUDADeviceContext, double>,
+    paddle::operators::SignKernel<paddle::platform::CUDADeviceContext,
+                                  paddle::platform::float16>);
diff --git a/paddle/fluid/operators/sign_op.h b/paddle/fluid/operators/sign_op.h
new file mode 100644
index 0000000000000..c98a2aac512fa
--- /dev/null
+++ b/paddle/fluid/operators/sign_op.h
@@ -0,0 +1,48 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tcmpt_utils.h"
+#include "paddle/fluid/operators/eigen/eigen_function.h"
+
+// only can include the headers in paddle/tcmpt/api dirs
+#include "paddle/tcmpt/api/include/dev/core.h"
+#include "paddle/tcmpt/api/include/dev/math.h"
+
+namespace paddle {
+namespace operators {
+template <typename DeviceContext, typename T>
+class SignKernel : public framework::OpKernel<T> {
+ public:
+  virtual void Compute(const framework::ExecutionContext& context) const {
+    auto* x = context.Input<framework::Tensor>("X");
+    auto* out = context.Output<framework::Tensor>("Out");
+    auto& dev_ctx = context.device_context<DeviceContext>();
+    out->mutable_data<T>(x->place(), x->type());
+
+    auto pt_x =
+        framework::MakeTensorImpl<pt::DenseTensor>(*x, x->place(), x->type());
+    auto pt_out =
+        framework::MakeTensorImpl<pt::DenseTensor>(*out, x->place(), x->type());
+
+    // call new kernel
+    pt::Sign<T>(dev_ctx, *pt_x.get(), pt_out.get());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle

From 8371096dcf1f0a552f6895aed5643671190aa720 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Thu, 23 Sep 2021 06:50:01 +0000
Subject: [PATCH 062/125] add comment for keeping old kernel impl

---
 paddle/fluid/operators/dot_op.h   |  1 +
 paddle/fluid/operators/mean_op.h  | 20 ++++++++++++++++++++
 paddle/fluid/operators/scale_op.h |  1 +
 paddle/fluid/operators/sign_op.h  |  2 ++
 4 files changed, 24 insertions(+)

diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h
index 7bb8c84bafdfe..4d69c9f707b67 100644
--- a/paddle/fluid/operators/dot_op.h
+++ b/paddle/fluid/operators/dot_op.h
@@ -233,6 +233,7 @@ struct DotGradFunction<DeviceContext, T, math::DisableComplex<T>> {
   }
 };
 
+// See Note [ Why still keep the original kernel implementation? ]
 template <typename DeviceContext, typename T>
 class DotKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/fluid/operators/mean_op.h b/paddle/fluid/operators/mean_op.h
index 3cb26d09186c8..808d00ab872ec 100644
--- a/paddle/fluid/operators/mean_op.h
+++ b/paddle/fluid/operators/mean_op.h
@@ -32,6 +32,26 @@ template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
+/** [ Why still keep the original kernel implementation? ]
+ *
+ * Removal of the original kernel implementation and kernel registration needs
+ * to ensure that the new kernel mechanism adapts to multiple sets of execution
+ * mechanisms, including:
+ *
+ * 1. Executor and ParallelExecutor
+ * 2. Dygraph OpBase (Tracer and Engine)
+ * 3. New Executor
+ * 4. Predictor
+ * 5. NPU and XPU lack kernel and need to reuse CPU Kernel
+ *
+ * Removal of the original Kernel requires a more complete solution to ensure
+ * that it will not affect the current execution system.
+ * Currently, only the first two cases are adapted.
+ *
+ * The principle here is that the implementation in the kernel must reuse the
+ * corresponding functions in the Tensor compute library and cannot maintain
+ * two copies of the code.
+ */
 template <typename DeviceContext, typename T>
 class MeanKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h
index 2d66d7f89b880..61b5e76f19a61 100644
--- a/paddle/fluid/operators/scale_op.h
+++ b/paddle/fluid/operators/scale_op.h
@@ -36,6 +36,7 @@ static inline T GetAttrFromTensor(const framework::Tensor* tensor) {
   return tensor_data[0];
 }
 
+// See Note [ Why still keep the original kernel implementation? ]
 template <typename DeviceContext, typename T>
 class ScaleKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/fluid/operators/sign_op.h b/paddle/fluid/operators/sign_op.h
index c98a2aac512fa..5ae464cae9ef5 100644
--- a/paddle/fluid/operators/sign_op.h
+++ b/paddle/fluid/operators/sign_op.h
@@ -25,6 +25,8 @@ limitations under the License. */
 
 namespace paddle {
 namespace operators {
+
+// See Note [ Why still keep the original kernel implementation? ]
 template <typename DeviceContext, typename T>
 class SignKernel : public framework::OpKernel<T> {
  public:

From f1f6c8ead231035bee2d03a25a841f676a3d5a12 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Thu, 23 Sep 2021 13:27:38 +0000
Subject: [PATCH 063/125] fix mutable_data error

---
 paddle/fluid/operators/dot_op.h   | 2 +-
 paddle/fluid/operators/mean_op.h  | 2 +-
 paddle/fluid/operators/scale_op.h | 2 +-
 paddle/fluid/operators/sign_op.h  | 2 +-
 paddle/tcmpt/cpu/math.cc          | 1 +
 5 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h
index 4d69c9f707b67..7655c4b97be81 100644
--- a/paddle/fluid/operators/dot_op.h
+++ b/paddle/fluid/operators/dot_op.h
@@ -242,7 +242,7 @@ class DotKernel : public framework::OpKernel<T> {
     auto* y = ctx.Input<Tensor>("Y");
     auto* out = ctx.Output<Tensor>("Out");
     auto& dev_ctx = ctx.device_context<DeviceContext>();
-    out->mutable_data<T>(x->place(), x->type());
+    out->mutable_data<T>(x->place());
 
     auto pt_x =
         framework::MakeTensorImpl<pt::DenseTensor>(*x, x->place(), x->type());
diff --git a/paddle/fluid/operators/mean_op.h b/paddle/fluid/operators/mean_op.h
index 808d00ab872ec..ed4aaacd81b62 100644
--- a/paddle/fluid/operators/mean_op.h
+++ b/paddle/fluid/operators/mean_op.h
@@ -59,7 +59,7 @@ class MeanKernel : public framework::OpKernel<T> {
     auto* x = context.Input<Tensor>("X");
     auto* out = context.Output<Tensor>("Out");
     auto& dev_ctx = context.device_context<DeviceContext>();
-    out->mutable_data<T>(x->place(), x->type());
+    out->mutable_data<T>(x->place());
 
     auto pt_x =
         framework::MakeTensorImpl<pt::DenseTensor>(*x, x->place(), x->type());
diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h
index 61b5e76f19a61..aca28f1212ce8 100644
--- a/paddle/fluid/operators/scale_op.h
+++ b/paddle/fluid/operators/scale_op.h
@@ -63,7 +63,7 @@ class ScaleKernel : public framework::OpKernel<T> {
 
     auto* out =
         framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var);
-    out->mutable_data<T>(in->place(), in->type());
+    out->mutable_data<T>(in->place());
     auto& dev_ctx = ctx.device_context<DeviceContext>();
 
     auto pt_x = framework::MakeTensorImpl<pt::DenseTensor>(*in, in->place(),
diff --git a/paddle/fluid/operators/sign_op.h b/paddle/fluid/operators/sign_op.h
index 5ae464cae9ef5..4b5d89b9b566c 100644
--- a/paddle/fluid/operators/sign_op.h
+++ b/paddle/fluid/operators/sign_op.h
@@ -34,7 +34,7 @@ class SignKernel : public framework::OpKernel<T> {
     auto* x = context.Input<framework::Tensor>("X");
     auto* out = context.Output<framework::Tensor>("Out");
     auto& dev_ctx = context.device_context<DeviceContext>();
-    out->mutable_data<T>(x->place(), x->type());
+    out->mutable_data<T>(x->place());
 
     auto pt_x =
         framework::MakeTensorImpl<pt::DenseTensor>(*x, x->place(), x->type());
diff --git a/paddle/tcmpt/cpu/math.cc b/paddle/tcmpt/cpu/math.cc
index b66d57c8ee78d..c2b3cf5dd50e6 100644
--- a/paddle/tcmpt/cpu/math.cc
+++ b/paddle/tcmpt/cpu/math.cc
@@ -19,6 +19,7 @@
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/platform/bfloat16.h"
 
 namespace pt {
 

From 5547b444dcc25c36fa854a222b44f1b212ad3c12 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Fri, 24 Sep 2021 03:01:59 +0000
Subject: [PATCH 064/125] fix bfloat16 conflit

---
 paddle/tcmpt/cpu/math.cc | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/paddle/tcmpt/cpu/math.cc b/paddle/tcmpt/cpu/math.cc
index c2b3cf5dd50e6..166c26543a4ae 100644
--- a/paddle/tcmpt/cpu/math.cc
+++ b/paddle/tcmpt/cpu/math.cc
@@ -108,7 +108,9 @@ void ScaleSelectedRowsHost(const CPUContext& dev_ctx,
 // TODO(chenweihang): replace by better impl
 PT_REGISTER_MODULE(MathCPU);
 
-using bfloat16 = ::paddle::platform::bfloat16;
+// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16
+// using bfloat16 = ::paddle::platform::bfloat16;
+
 PT_REGISTER_KERNEL("sign", CPU, NCHW, pt::Sign, float, double) {}
 PT_REGISTER_KERNEL("mean", CPU, NCHW, pt::Mean, float, double) {}
 PT_REGISTER_KERNEL("scale",
@@ -117,7 +119,7 @@ PT_REGISTER_KERNEL("scale",
                    pt::Scale,
                    float,
                    double,
-                   bfloat16,
+                   paddle::platform::bfloat16,
                    uint8_t,
                    int8_t,
                    int16_t,
@@ -129,7 +131,7 @@ PT_REGISTER_KERNEL("scale.sr",
                    pt::ScaleSelectedRows,
                    float,
                    double,
-                   bfloat16,
+                   paddle::platform::bfloat16,
                    uint8_t,
                    int8_t,
                    int16_t,
@@ -141,7 +143,7 @@ PT_REGISTER_KERNEL("scale.host",
                    pt::ScaleHost,
                    float,
                    double,
-                   bfloat16,
+                   paddle::platform::bfloat16,
                    uint8_t,
                    int8_t,
                    int16_t,
@@ -155,7 +157,7 @@ PT_REGISTER_KERNEL("scale.sr.host",
                    pt::ScaleSelectedRowsHost,
                    float,
                    double,
-                   bfloat16,
+                   paddle::platform::bfloat16,
                    uint8_t,
                    int8_t,
                    int16_t,

From dd3323dce67561706f5423d53b8b70fcff79a36a Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Fri, 24 Sep 2021 07:10:47 +0000
Subject: [PATCH 065/125] fix inference undef error

---
 cmake/generic.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 410a7c52a24d5..7390bd17e386e 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -122,8 +122,8 @@ set_property(GLOBAL PROPERTY TCMPT_MODULES "")
 function(find_tcmpt_modules TARGET_NAME)
   get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
   string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
-  string(REGEX MATCH "\/top\/" result "${__target_path}")
-  if(NOT result STREQUAL "")
+  string(FIND "${__target_path}" "tcmpt" pos)
+  if(pos GREATER 1)
     get_property(tcmpt_modules GLOBAL PROPERTY TCMPT_MODULES)
     set(tcmpt_modules ${tcmpt_modules} ${TARGET_NAME})
     set_property(GLOBAL PROPERTY TCMPT_MODULES "${tcmpt_modules}")

From caaed198601335fce15e82f34ebb3a935e4e0200 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Sun, 26 Sep 2021 09:18:26 +0000
Subject: [PATCH 066/125] adapt to msvc compile rules

---
 paddle/tcmpt/core/kernel_registry.h | 458 +++++++++++++++-------------
 1 file changed, 241 insertions(+), 217 deletions(-)

diff --git a/paddle/tcmpt/core/kernel_registry.h b/paddle/tcmpt/core/kernel_registry.h
index 5bdb9f8744c80..6e1865679697a 100644
--- a/paddle/tcmpt/core/kernel_registry.h
+++ b/paddle/tcmpt/core/kernel_registry.h
@@ -122,12 +122,25 @@ struct KernelRegistrar {
 #define PT_CONCATENATE(arg1, arg2) PT_CONCATENATE1(arg1, arg2)
 #define PT_CONCATENATE1(arg1, arg2) PT_CONCATENATE2(arg1, arg2)
 #define PT_CONCATENATE2(arg1, arg2) arg1##arg2
-
-// reference:
-// https://stackoverflow.com/questions/1872220/is-it-possible-to-iterate-over-arguments-in-variadic-macros
-#define PT_NARGS(...) _PT_NARGS(__VA_ARGS__, _PT_RESQ_N())
+#define PT_EXPAND(x) x
+
+/**
+ * Reference:
+ *
+ *   https://stackoverflow.com/questions/1872220/is-it-possible-to-iterate-over-arguments-in-variadic-macros
+ *   https://stackoverflow.com/questions/9183993/msvc-variadic-macro-expansion?rq=1
+ *   https://stackoverflow.com/questions/5134523/msvc-doesnt-expand-va-args-correctly
+ *
+ * Very carefully tiptoeing around an MSVC bug where it improperly expands
+ * __VA_ARGS__ as a single token in argument lists.  See these URLs for details:
+ *
+ *   http://connect.microsoft.com/VisualStudio/feedback/details/380090/variadic-macro-replacement
+ *   http://cplusplus.co.il/2010/07/17/variadic-macro-to-count-number-of-arguments/#comment-644
+ */
+#define PT_NARGS(...) _PT_NARGS((__VA_ARGS__, _PT_RESQ_N()))
 #define _PT_NARGS(...) _PT_ARG_N(__VA_ARGS__)
-#define _PT_ARG_N(_1, _2, _3, _4, _5, _6, _7, _8, N, ...) N
+#define _PT_ARG_N_EXPAND(_1, _2, _3, _4, _5, _6, _7, _8, N, ...) N
+#define _PT_ARG_N(args) _PT_ARG_N_EXPAND args
 #define _PT_RESQ_N() 8, 7, 6, 5, 4, 3, 2, 1, 0
 
 #define PT_REGISTER_KERNEL(                                       \
@@ -145,7 +158,7 @@ struct KernelRegistrar {
   PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                           \
       PT_CONCATENATE(pt_op_kernel_ns_check_, func_id),                         \
       "PT_REGISTER_KERNEL must be called in global namespace.");               \
-  PT_KERNEL_SPECIALIZE(meta_kernel_fn, cpp_dtype, __VA_ARGS__);                \
+  PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, __VA_ARGS__);             \
   static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_,                         \
                              func_id)(::pt::Kernel*);                          \
   PT_KERNEL_REGISTRAR_INIT(kernel_name,                                        \
@@ -158,39 +171,50 @@ struct KernelRegistrar {
                            __VA_ARGS__);                                       \
   void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id)(::pt::Kernel * kernel)
 
-#define PT_KERNEL_SPECIALIZE(meta_kernel_fn, cpp_dtype, ...) \
-  _PT_KERNEL_SPECIALIZE(PT_NARGS(cpp_dtype, __VA_ARGS__),    \
-                        meta_kernel_fn,                      \
-                        cpp_dtype,                           \
-                        __VA_ARGS__)
+#define PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, ...) \
+  _PT_KERNEL_INSTANTIATION(PT_NARGS(cpp_dtype, __VA_ARGS__),    \
+                           meta_kernel_fn,                      \
+                           cpp_dtype,                           \
+                           __VA_ARGS__)
 
-#define _PT_KERNEL_SPECIALIZE(N, meta_kernel_fn, cpp_dtype, ...) \
-  PT_CONCATENATE(_PT_KERNEL_SPECIALIZE_, N)                      \
+#define _PT_KERNEL_INSTANTIATION(N, meta_kernel_fn, cpp_dtype, ...) \
+  PT_CONCATENATE(_PT_KERNEL_INSTANTIATION_, N)                      \
   (meta_kernel_fn, cpp_dtype, __VA_ARGS__)
 
-#define _PT_KERNEL_SPECIALIZE_1(meta_kernel_fn, cpp_dtype, ...) \
+/**
+ * need use template<> instead of template here
+ * template can work on gcc and clang, but msvc will failed, error like:
+ *
+ *   error C2206: typedef cannot be used for function definition
+ *
+ * reference:
+ *
+ *   https://stackoverflow.com/questions/63989585/explicit-instantiation-of-function-using-decltype-work-on-g-but-not-on-visua
+ */
+
+#define _PT_KERNEL_INSTANTIATION_1(meta_kernel_fn, cpp_dtype, ...) \
   template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>
-#define _PT_KERNEL_SPECIALIZE_2(meta_kernel_fn, cpp_dtype, ...)           \
+#define _PT_KERNEL_INSTANTIATION_2(meta_kernel_fn, cpp_dtype, ...)        \
   template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>; \
-  _PT_KERNEL_SPECIALIZE_1(meta_kernel_fn, __VA_ARGS__)
-#define _PT_KERNEL_SPECIALIZE_3(meta_kernel_fn, cpp_dtype, ...)           \
+  PT_EXPAND(_PT_KERNEL_INSTANTIATION_1(meta_kernel_fn, __VA_ARGS__))
+#define _PT_KERNEL_INSTANTIATION_3(meta_kernel_fn, cpp_dtype, ...)        \
   template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>; \
-  _PT_KERNEL_SPECIALIZE_2(meta_kernel_fn, __VA_ARGS__)
-#define _PT_KERNEL_SPECIALIZE_4(meta_kernel_fn, cpp_dtype, ...)           \
+  PT_EXPAND(_PT_KERNEL_INSTANTIATION_2(meta_kernel_fn, __VA_ARGS__))
+#define _PT_KERNEL_INSTANTIATION_4(meta_kernel_fn, cpp_dtype, ...)        \
   template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>; \
-  _PT_KERNEL_SPECIALIZE_3(meta_kernel_fn, __VA_ARGS__)
-#define _PT_KERNEL_SPECIALIZE_5(meta_kernel_fn, cpp_dtype, ...)           \
+  PT_EXPAND(_PT_KERNEL_INSTANTIATION_3(meta_kernel_fn, __VA_ARGS__))
+#define _PT_KERNEL_INSTANTIATION_5(meta_kernel_fn, cpp_dtype, ...)        \
   template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>; \
-  _PT_KERNEL_SPECIALIZE_4(meta_kernel_fn, __VA_ARGS__)
-#define _PT_KERNEL_SPECIALIZE_6(meta_kernel_fn, cpp_dtype, ...)           \
+  PT_EXPAND(_PT_KERNEL_INSTANTIATION_4(meta_kernel_fn, __VA_ARGS__))
+#define _PT_KERNEL_INSTANTIATION_6(meta_kernel_fn, cpp_dtype, ...)        \
   template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>; \
-  _PT_KERNEL_SPECIALIZE_5(meta_kernel_fn, __VA_ARGS__)
-#define _PT_KERNEL_SPECIALIZE_7(meta_kernel_fn, cpp_dtype, ...)           \
+  PT_EXPAND(_PT_KERNEL_INSTANTIATION_5(meta_kernel_fn, __VA_ARGS__))
+#define _PT_KERNEL_INSTANTIATION_7(meta_kernel_fn, cpp_dtype, ...)        \
   template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>; \
-  _PT_KERNEL_SPECIALIZE_6(meta_kernel_fn, __VA_ARGS__)
-#define _PT_KERNEL_SPECIALIZE_8(meta_kernel_fn, cpp_dtype, ...)           \
+  PT_EXPAND(_PT_KERNEL_INSTANTIATION_6(meta_kernel_fn, __VA_ARGS__))
+#define _PT_KERNEL_INSTANTIATION_8(meta_kernel_fn, cpp_dtype, ...)        \
   template decltype(meta_kernel_fn<cpp_dtype>) meta_kernel_fn<cpp_dtype>; \
-  _PT_KERNEL_SPECIALIZE_7(meta_kernel_fn, __VA_ARGS__)
+  PT_EXPAND(_PT_KERNEL_INSTANTIATION_7(meta_kernel_fn, __VA_ARGS__))
 
 #define PT_KERNEL_REGISTRAR_INIT(kernel_name,                 \
                                  func_id,                     \
@@ -255,195 +279,195 @@ struct KernelRegistrar {
           &meta_kernel_fn<cpp_dtype>)>::Parse,        \
       args_def_fn,                                    \
       PT_KERNEL(meta_kernel_fn<cpp_dtype>));
-#define _PT_KERNEL_REGISTRAR_INIT_2(kernel_name,      \
-                                    func_id,          \
-                                    registrar_id,     \
-                                    backend,          \
-                                    layout,           \
-                                    args_def_fn,      \
-                                    meta_kernel_fn,   \
-                                    cpp_dtype,        \
-                                    ...)              \
-  static const ::pt::KernelRegistrar PT_CONCATENATE(  \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)( \
-      kernel_name,                                    \
-      BACKEND(backend),                               \
-      DATALAYOUT(layout),                             \
-      ::pt::CppTypeToDataType<cpp_dtype>::Type(),     \
-      ::pt::KernelArgsParseFunctor<decltype(          \
-          &meta_kernel_fn<cpp_dtype>)>::Parse,        \
-      args_def_fn,                                    \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype>));          \
-  _PT_KERNEL_REGISTRAR_INIT_1(kernel_name,            \
-                              func_id,                \
-                              PT_ID,                  \
-                              backend,                \
-                              layout,                 \
-                              args_def_fn,            \
-                              meta_kernel_fn,         \
-                              __VA_ARGS__)
-#define _PT_KERNEL_REGISTRAR_INIT_3(kernel_name,      \
-                                    func_id,          \
-                                    registrar_id,     \
-                                    backend,          \
-                                    layout,           \
-                                    args_def_fn,      \
-                                    meta_kernel_fn,   \
-                                    cpp_dtype,        \
-                                    ...)              \
-  static const ::pt::KernelRegistrar PT_CONCATENATE(  \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)( \
-      kernel_name,                                    \
-      BACKEND(backend),                               \
-      DATALAYOUT(layout),                             \
-      ::pt::CppTypeToDataType<cpp_dtype>::Type(),     \
-      ::pt::KernelArgsParseFunctor<decltype(          \
-          &meta_kernel_fn<cpp_dtype>)>::Parse,        \
-      args_def_fn,                                    \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype>));          \
-  _PT_KERNEL_REGISTRAR_INIT_2(kernel_name,            \
-                              func_id,                \
-                              PT_ID,                  \
-                              backend,                \
-                              layout,                 \
-                              args_def_fn,            \
-                              meta_kernel_fn,         \
-                              __VA_ARGS__)
-#define _PT_KERNEL_REGISTRAR_INIT_4(kernel_name,      \
-                                    func_id,          \
-                                    registrar_id,     \
-                                    backend,          \
-                                    layout,           \
-                                    args_def_fn,      \
-                                    meta_kernel_fn,   \
-                                    cpp_dtype,        \
-                                    ...)              \
-  static const ::pt::KernelRegistrar PT_CONCATENATE(  \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)( \
-      kernel_name,                                    \
-      BACKEND(backend),                               \
-      DATALAYOUT(layout),                             \
-      ::pt::CppTypeToDataType<cpp_dtype>::Type(),     \
-      ::pt::KernelArgsParseFunctor<decltype(          \
-          &meta_kernel_fn<cpp_dtype>)>::Parse,        \
-      args_def_fn,                                    \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype>));          \
-  _PT_KERNEL_REGISTRAR_INIT_3(kernel_name,            \
-                              func_id,                \
-                              PT_ID,                  \
-                              backend,                \
-                              layout,                 \
-                              args_def_fn,            \
-                              meta_kernel_fn,         \
-                              __VA_ARGS__)
-#define _PT_KERNEL_REGISTRAR_INIT_5(kernel_name,      \
-                                    func_id,          \
-                                    registrar_id,     \
-                                    backend,          \
-                                    layout,           \
-                                    args_def_fn,      \
-                                    meta_kernel_fn,   \
-                                    cpp_dtype,        \
-                                    ...)              \
-  static const ::pt::KernelRegistrar PT_CONCATENATE(  \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)( \
-      kernel_name,                                    \
-      BACKEND(backend),                               \
-      DATALAYOUT(layout),                             \
-      ::pt::CppTypeToDataType<cpp_dtype>::Type(),     \
-      ::pt::KernelArgsParseFunctor<decltype(          \
-          &meta_kernel_fn<cpp_dtype>)>::Parse,        \
-      args_def_fn,                                    \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype>));          \
-  _PT_KERNEL_REGISTRAR_INIT_4(kernel_name,            \
-                              func_id,                \
-                              PT_ID,                  \
-                              backend,                \
-                              layout,                 \
-                              args_def_fn,            \
-                              meta_kernel_fn,         \
-                              __VA_ARGS__)
-#define _PT_KERNEL_REGISTRAR_INIT_6(kernel_name,      \
-                                    func_id,          \
-                                    registrar_id,     \
-                                    backend,          \
-                                    layout,           \
-                                    args_def_fn,      \
-                                    meta_kernel_fn,   \
-                                    cpp_dtype,        \
-                                    ...)              \
-  static const ::pt::KernelRegistrar PT_CONCATENATE(  \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)( \
-      kernel_name,                                    \
-      BACKEND(backend),                               \
-      DATALAYOUT(layout),                             \
-      ::pt::CppTypeToDataType<cpp_dtype>::Type(),     \
-      ::pt::KernelArgsParseFunctor<decltype(          \
-          &meta_kernel_fn<cpp_dtype>)>::Parse,        \
-      args_def_fn,                                    \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype>));          \
-  _PT_KERNEL_REGISTRAR_INIT_5(kernel_name,            \
-                              func_id,                \
-                              PT_ID,                  \
-                              backend,                \
-                              layout,                 \
-                              args_def_fn,            \
-                              meta_kernel_fn,         \
-                              __VA_ARGS__)
-#define _PT_KERNEL_REGISTRAR_INIT_7(kernel_name,      \
-                                    func_id,          \
-                                    registrar_id,     \
-                                    backend,          \
-                                    layout,           \
-                                    args_def_fn,      \
-                                    meta_kernel_fn,   \
-                                    cpp_dtype,        \
-                                    ...)              \
-  static const ::pt::KernelRegistrar PT_CONCATENATE(  \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)( \
-      kernel_name,                                    \
-      BACKEND(backend),                               \
-      DATALAYOUT(layout),                             \
-      ::pt::CppTypeToDataType<cpp_dtype>::Type(),     \
-      ::pt::KernelArgsParseFunctor<decltype(          \
-          &meta_kernel_fn<cpp_dtype>)>::Parse,        \
-      args_def_fn,                                    \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype>));          \
-  _PT_KERNEL_REGISTRAR_INIT_6(kernel_name,            \
-                              func_id,                \
-                              PT_ID,                  \
-                              backend,                \
-                              layout,                 \
-                              args_def_fn,            \
-                              meta_kernel_fn,         \
-                              __VA_ARGS__)
-#define _PT_KERNEL_REGISTRAR_INIT_8(kernel_name,      \
-                                    func_id,          \
-                                    registrar_id,     \
-                                    backend,          \
-                                    layout,           \
-                                    args_def_fn,      \
-                                    meta_kernel_fn,   \
-                                    cpp_dtype,        \
-                                    ...)              \
-  static const ::pt::KernelRegistrar PT_CONCATENATE(  \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)( \
-      kernel_name,                                    \
-      BACKEND(backend),                               \
-      DATALAYOUT(layout),                             \
-      ::pt::CppTypeToDataType<cpp_dtype>::Type(),     \
-      ::pt::KernelArgsParseFunctor<decltype(          \
-          &meta_kernel_fn<cpp_dtype>)>::Parse,        \
-      args_def_fn,                                    \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype>));          \
-  _PT_KERNEL_REGISTRAR_INIT_7(kernel_name,            \
-                              func_id,                \
-                              PT_ID,                  \
-                              backend,                \
-                              layout,                 \
-                              args_def_fn,            \
-                              meta_kernel_fn,         \
-                              __VA_ARGS__)
+#define _PT_KERNEL_REGISTRAR_INIT_2(kernel_name,        \
+                                    func_id,            \
+                                    registrar_id,       \
+                                    backend,            \
+                                    layout,             \
+                                    args_def_fn,        \
+                                    meta_kernel_fn,     \
+                                    cpp_dtype,          \
+                                    ...)                \
+  static const ::pt::KernelRegistrar PT_CONCATENATE(    \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(   \
+      kernel_name,                                      \
+      BACKEND(backend),                                 \
+      DATALAYOUT(layout),                               \
+      ::pt::CppTypeToDataType<cpp_dtype>::Type(),       \
+      ::pt::KernelArgsParseFunctor<decltype(            \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,          \
+      args_def_fn,                                      \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));            \
+  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_1(kernel_name,    \
+                                        func_id,        \
+                                        PT_ID,          \
+                                        backend,        \
+                                        layout,         \
+                                        args_def_fn,    \
+                                        meta_kernel_fn, \
+                                        __VA_ARGS__))
+#define _PT_KERNEL_REGISTRAR_INIT_3(kernel_name,        \
+                                    func_id,            \
+                                    registrar_id,       \
+                                    backend,            \
+                                    layout,             \
+                                    args_def_fn,        \
+                                    meta_kernel_fn,     \
+                                    cpp_dtype,          \
+                                    ...)                \
+  static const ::pt::KernelRegistrar PT_CONCATENATE(    \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(   \
+      kernel_name,                                      \
+      BACKEND(backend),                                 \
+      DATALAYOUT(layout),                               \
+      ::pt::CppTypeToDataType<cpp_dtype>::Type(),       \
+      ::pt::KernelArgsParseFunctor<decltype(            \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,          \
+      args_def_fn,                                      \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));            \
+  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_2(kernel_name,    \
+                                        func_id,        \
+                                        PT_ID,          \
+                                        backend,        \
+                                        layout,         \
+                                        args_def_fn,    \
+                                        meta_kernel_fn, \
+                                        __VA_ARGS__))
+#define _PT_KERNEL_REGISTRAR_INIT_4(kernel_name,        \
+                                    func_id,            \
+                                    registrar_id,       \
+                                    backend,            \
+                                    layout,             \
+                                    args_def_fn,        \
+                                    meta_kernel_fn,     \
+                                    cpp_dtype,          \
+                                    ...)                \
+  static const ::pt::KernelRegistrar PT_CONCATENATE(    \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(   \
+      kernel_name,                                      \
+      BACKEND(backend),                                 \
+      DATALAYOUT(layout),                               \
+      ::pt::CppTypeToDataType<cpp_dtype>::Type(),       \
+      ::pt::KernelArgsParseFunctor<decltype(            \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,          \
+      args_def_fn,                                      \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));            \
+  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_3(kernel_name,    \
+                                        func_id,        \
+                                        PT_ID,          \
+                                        backend,        \
+                                        layout,         \
+                                        args_def_fn,    \
+                                        meta_kernel_fn, \
+                                        __VA_ARGS__))
+#define _PT_KERNEL_REGISTRAR_INIT_5(kernel_name,        \
+                                    func_id,            \
+                                    registrar_id,       \
+                                    backend,            \
+                                    layout,             \
+                                    args_def_fn,        \
+                                    meta_kernel_fn,     \
+                                    cpp_dtype,          \
+                                    ...)                \
+  static const ::pt::KernelRegistrar PT_CONCATENATE(    \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(   \
+      kernel_name,                                      \
+      BACKEND(backend),                                 \
+      DATALAYOUT(layout),                               \
+      ::pt::CppTypeToDataType<cpp_dtype>::Type(),       \
+      ::pt::KernelArgsParseFunctor<decltype(            \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,          \
+      args_def_fn,                                      \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));            \
+  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_4(kernel_name,    \
+                                        func_id,        \
+                                        PT_ID,          \
+                                        backend,        \
+                                        layout,         \
+                                        args_def_fn,    \
+                                        meta_kernel_fn, \
+                                        __VA_ARGS__))
+#define _PT_KERNEL_REGISTRAR_INIT_6(kernel_name,        \
+                                    func_id,            \
+                                    registrar_id,       \
+                                    backend,            \
+                                    layout,             \
+                                    args_def_fn,        \
+                                    meta_kernel_fn,     \
+                                    cpp_dtype,          \
+                                    ...)                \
+  static const ::pt::KernelRegistrar PT_CONCATENATE(    \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(   \
+      kernel_name,                                      \
+      BACKEND(backend),                                 \
+      DATALAYOUT(layout),                               \
+      ::pt::CppTypeToDataType<cpp_dtype>::Type(),       \
+      ::pt::KernelArgsParseFunctor<decltype(            \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,          \
+      args_def_fn,                                      \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));            \
+  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_5(kernel_name,    \
+                                        func_id,        \
+                                        PT_ID,          \
+                                        backend,        \
+                                        layout,         \
+                                        args_def_fn,    \
+                                        meta_kernel_fn, \
+                                        __VA_ARGS__))
+#define _PT_KERNEL_REGISTRAR_INIT_7(kernel_name,        \
+                                    func_id,            \
+                                    registrar_id,       \
+                                    backend,            \
+                                    layout,             \
+                                    args_def_fn,        \
+                                    meta_kernel_fn,     \
+                                    cpp_dtype,          \
+                                    ...)                \
+  static const ::pt::KernelRegistrar PT_CONCATENATE(    \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(   \
+      kernel_name,                                      \
+      BACKEND(backend),                                 \
+      DATALAYOUT(layout),                               \
+      ::pt::CppTypeToDataType<cpp_dtype>::Type(),       \
+      ::pt::KernelArgsParseFunctor<decltype(            \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,          \
+      args_def_fn,                                      \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));            \
+  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_6(kernel_name,    \
+                                        func_id,        \
+                                        PT_ID,          \
+                                        backend,        \
+                                        layout,         \
+                                        args_def_fn,    \
+                                        meta_kernel_fn, \
+                                        __VA_ARGS__))
+#define _PT_KERNEL_REGISTRAR_INIT_8(kernel_name,        \
+                                    func_id,            \
+                                    registrar_id,       \
+                                    backend,            \
+                                    layout,             \
+                                    args_def_fn,        \
+                                    meta_kernel_fn,     \
+                                    cpp_dtype,          \
+                                    ...)                \
+  static const ::pt::KernelRegistrar PT_CONCATENATE(    \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(   \
+      kernel_name,                                      \
+      BACKEND(backend),                                 \
+      DATALAYOUT(layout),                               \
+      ::pt::CppTypeToDataType<cpp_dtype>::Type(),       \
+      ::pt::KernelArgsParseFunctor<decltype(            \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,          \
+      args_def_fn,                                      \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));            \
+  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_7(kernel_name,    \
+                                        func_id,        \
+                                        PT_ID,          \
+                                        backend,        \
+                                        layout,         \
+                                        args_def_fn,    \
+                                        meta_kernel_fn, \
+                                        __VA_ARGS__))
 
 #define PT_REGISTER_KERNEL_STANDARD(                \
     kernel_name, backend, layout, dtype, kernel_fn) \

From 46b77627341f50f3d2a496e95a7b37ba346a1b4d Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Sun, 26 Sep 2021 13:36:32 +0000
Subject: [PATCH 067/125] polish comment for template inst

---
 paddle/tcmpt/core/kernel_registry.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/tcmpt/core/kernel_registry.h b/paddle/tcmpt/core/kernel_registry.h
index 6e1865679697a..1d8f610dc85d2 100644
--- a/paddle/tcmpt/core/kernel_registry.h
+++ b/paddle/tcmpt/core/kernel_registry.h
@@ -182,8 +182,8 @@ struct KernelRegistrar {
   (meta_kernel_fn, cpp_dtype, __VA_ARGS__)
 
 /**
- * need use template<> instead of template here
- * template can work on gcc and clang, but msvc will failed, error like:
+ * `template decltype(fn) fn` can work on gcc and clang,
+ * but msvc will failed, error like:
  *
  *   error C2206: typedef cannot be used for function definition
  *

From 4253f4905687acfcd2f83d3345f94aa8f7943621 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Mon, 27 Sep 2021 13:46:20 +0000
Subject: [PATCH 068/125] add cmake template instantiation for win

---
 cmake/tcmpt.cmake                   | 48 +++++++++++++++++++++++++++++
 paddle/tcmpt/CMakeLists.txt         |  1 +
 paddle/tcmpt/core/kernel_registry.h | 22 ++++++++++++-
 paddle/tcmpt/cpu/CMakeLists.txt     |  6 ++++
 paddle/tcmpt/cuda/CMakeLists.txt    |  6 ++++
 5 files changed, 82 insertions(+), 1 deletion(-)
 create mode 100644 cmake/tcmpt.cmake

diff --git a/cmake/tcmpt.cmake b/cmake/tcmpt.cmake
new file mode 100644
index 0000000000000..26d5eff926b55
--- /dev/null
+++ b/cmake/tcmpt.cmake
@@ -0,0 +1,48 @@
+# TODO(chenweihang): keep message comment for debuging, remove it if needless
+function(kernel_instantiate TARGET)
+    set(target_file ${CURRENT_BINARY_DIR}/${TARGET}.tmp CACHE INTERNAL "${CURRENT_BINARY_DIR}/${TARGET} file")
+    set(target_file_final ${CURRENT_BINARY_DIR}/${TARGET})
+    file(READ ${TARGET} TARGET_CONTENT)
+    file(WRITE ${target_file} ${TARGET_CONTENT})
+    string(REGEX MATCHALL "void [A-Z][A-Za-z0-9_]+\\(.[^\\)]+\\)" func_signatures ${TARGET_CONTENT})
+    # message(STATUS "FUNCS: ${func_signatures}")
+    string(REGEX MATCHALL "PT_REGISTER_KERNEL\\(.[^\\)]+\\) \\{" func_registrars ${TARGET_CONTENT})
+    # message(STATUS "REGISTRARS: ${func_registrars}")
+    set(instantiate_context "")
+    foreach(signature ${func_signatures})
+        # message(STATUS "FUNC: ${signature}")
+        list(POP_FRONT func_registrars registrar)
+        # message(STATUS "REG: ${registrar}")
+        string(REGEX MATCHALL "[a-z0-9_:]+(,|\\))" dtypes ${registrar})
+        # message(STATUS "DTYPES: ${dtypes}")
+        list(REMOVE_AT dtypes 0)
+        # message(STATUS "REMOVED DTYPES: ${dtypes}")
+        foreach(dtype ${dtypes})
+            string(REGEX REPLACE ",|\\)" "" dtype ${dtype})
+            # message(STATUS "DTYPE: ${dtype}")
+            string(REGEX MATCH "[A-Z][A-Za-z0-9]+\\(" func_name ${signature})
+            string(REPLACE "(" "" func_name ${func_name})
+            # message(STATUS "FUNC NAME: ${func_name}")
+            string(REGEX REPLACE "${func_name}" "pt::${func_name}<${dtype}>" inst_signature ${signature})
+            # append namespace
+            string(REPLACE "CPUContext" "pt::CPUContext" inst_signature ${inst_signature})
+            string(REPLACE "CUDAContext" "pt::CUDAContext" inst_signature ${inst_signature})
+            string(REPLACE "DenseTensor" "pt::DenseTensor" inst_signature ${inst_signature})
+            string(REPLACE "SelectedRowsTensor" "pt::SelectedRowsTensor" inst_signature ${inst_signature})
+            # message(STATUS "INST FUNC: ${inst_signature}")
+            string(APPEND instantiate_context "template ${inst_signature};\n")
+        endforeach()
+    endforeach()
+    # message(STATUS "INST CONTENT: ${instantiate_context}")
+    file(APPEND ${target_file} "${instantiate_context}\n")
+    # copy_if_different(${target_file} ${target_file_final})
+    string(REPLACE "." "_" cmd_name ${TARGET})
+    # this is a dummy target for custom command, should always be run firstly to update ${target_file_final}
+    # TODO(chenweihang): nameing rule need to enchance
+    add_custom_target(copy_${cmd_name}_command ALL
+        COMMAND ${CMAKE_COMMAND} -E copy_if_different ${target_file} ${target_file_final}
+        COMMENT "copy_if_different ${target_file_final}"
+        VERBATIM
+    )
+    add_dependencies(extern_glog copy_${cmd_name}_command)
+endfunction()
\ No newline at end of file
diff --git a/paddle/tcmpt/CMakeLists.txt b/paddle/tcmpt/CMakeLists.txt
index 33fd0be0f374d..329728d422c3f 100644
--- a/paddle/tcmpt/CMakeLists.txt
+++ b/paddle/tcmpt/CMakeLists.txt
@@ -1,3 +1,4 @@
+include(tcmpt)
 # tcmpt api
 add_subdirectory(api)
 # tcmpt core components
diff --git a/paddle/tcmpt/core/kernel_registry.h b/paddle/tcmpt/core/kernel_registry.h
index 1d8f610dc85d2..2874f4db203f2 100644
--- a/paddle/tcmpt/core/kernel_registry.h
+++ b/paddle/tcmpt/core/kernel_registry.h
@@ -152,7 +152,7 @@ struct KernelRegistrar {
                       meta_kernel_fn,                             \
                       cpp_dtype,                                  \
                       __VA_ARGS__)
-
+#ifndef _WIN32
 #define _PT_REGISTER_KERNEL(                                                   \
     kernel_name, func_id, backend, layout, meta_kernel_fn, cpp_dtype, ...)     \
   PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                           \
@@ -170,6 +170,24 @@ struct KernelRegistrar {
                            cpp_dtype,                                          \
                            __VA_ARGS__);                                       \
   void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id)(::pt::Kernel * kernel)
+#else
+#define _PT_REGISTER_KERNEL(                                                   \
+    kernel_name, func_id, backend, layout, meta_kernel_fn, cpp_dtype, ...)     \
+  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                           \
+      PT_CONCATENATE(pt_op_kernel_ns_check_, func_id),                         \
+      "PT_REGISTER_KERNEL must be called in global namespace.");               \
+  static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_,                         \
+                             func_id)(::pt::Kernel*);                          \
+  PT_KERNEL_REGISTRAR_INIT(kernel_name,                                        \
+                           func_id,                                            \
+                           backend,                                            \
+                           layout,                                             \
+                           &PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id), \
+                           meta_kernel_fn,                                     \
+                           cpp_dtype,                                          \
+                           __VA_ARGS__);                                       \
+  void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id)(::pt::Kernel * kernel)
+#endif
 
 #define PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, ...) \
   _PT_KERNEL_INSTANTIATION(PT_NARGS(cpp_dtype, __VA_ARGS__),    \
@@ -190,6 +208,8 @@ struct KernelRegistrar {
  * reference:
  *
  *   https://stackoverflow.com/questions/63989585/explicit-instantiation-of-function-using-decltype-work-on-g-but-not-on-visua
+ *
+ * So we solve the explict instantiation of kernel by CMake
  */
 
 #define _PT_KERNEL_INSTANTIATION_1(meta_kernel_fn, cpp_dtype, ...) \
diff --git a/paddle/tcmpt/cpu/CMakeLists.txt b/paddle/tcmpt/cpu/CMakeLists.txt
index 8ee42a210b7f8..c53dd675862ca 100644
--- a/paddle/tcmpt/cpu/CMakeLists.txt
+++ b/paddle/tcmpt/cpu/CMakeLists.txt
@@ -1,2 +1,8 @@
+if(WIN32)
+    set(CURRENT_BINARY_DIR ${PADDLE_BINARY_DIR}/paddle/tcmpt/cpu)
+    kernel_instantiate(math.cc)
+    kernel_instantiate(linalg.cc)
+endif()
+
 cc_library(math_cpu SRCS math.cc DEPS dense_tensor kernel_context kernel_factory eigen_function)
 cc_library(linalg_cpu SRCS linalg.cc DEPS dense_tensor kernel_context kernel_factory)
diff --git a/paddle/tcmpt/cuda/CMakeLists.txt b/paddle/tcmpt/cuda/CMakeLists.txt
index d695bf7b28a2b..f3d52c6ec6bf4 100644
--- a/paddle/tcmpt/cuda/CMakeLists.txt
+++ b/paddle/tcmpt/cuda/CMakeLists.txt
@@ -1,3 +1,9 @@
+if(WIN32)
+    set(CURRENT_BINARY_DIR ${PADDLE_BINARY_DIR}/paddle/tcmpt/cuda)
+    kernel_instantiate(math.cu)
+    kernel_instantiate(linalg.cu)
+endif()
+
 if(WITH_GPU)
   nv_library(math_cuda SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory)
   nv_library(linalg_cuda SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)

From 817f052a6866f4cb2b2fc6b657e91d2a7999a987 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Wed, 29 Sep 2021 06:15:58 +0000
Subject: [PATCH 069/125] fix backend to place device id bug

---
 paddle/fluid/framework/operator.cc |  2 +-
 paddle/fluid/operators/mean_op.h   |  1 +
 paddle/tcmpt/core/convert_utils.cc | 11 ++++++--
 paddle/tcmpt/cpu/math.cc           | 16 ++----------
 paddle/tcmpt/cuda/math.cu          |  7 +++--
 paddle/tcmpt/cuda/math.h           |  3 ---
 paddle/tcmpt/eigen/mean.h          | 41 ++++++++++++++++++++++++++++++
 7 files changed, 59 insertions(+), 22 deletions(-)
 create mode 100644 paddle/tcmpt/eigen/mean.h

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 91ba0a7dc2771..f9ba46581ee6f 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1153,7 +1153,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   // and RCOM backend, the XPU, NPU and MKLDNN will be supported in the second
   // phase
 
-  VLOG(1) << "Pt KernelFactory: " << pt::KernelFactory::Instance();
+  // VLOG(1) << "Pt KernelFactory: " << pt::KernelFactory::Instance();
   if (pt::KernelFactory::Instance().ContainsKernel(type_.c_str())) {
     if (pt_kernel_key_.get() == nullptr || pt_kernel_.get() == nullptr) {
       ChoosePtKernel(*runtime_ctx, *dev_ctx);
diff --git a/paddle/fluid/operators/mean_op.h b/paddle/fluid/operators/mean_op.h
index ed4aaacd81b62..dec0f4dd22f4c 100644
--- a/paddle/fluid/operators/mean_op.h
+++ b/paddle/fluid/operators/mean_op.h
@@ -67,6 +67,7 @@ class MeanKernel : public framework::OpKernel<T> {
         framework::MakeTensorImpl<pt::DenseTensor>(*out, x->place(), x->type());
 
     // call new kernel
+    VLOG(1) << "chenweihang: call original mean kernel compute.";
     pt::Mean<T>(dev_ctx, *pt_x.get(), pt_out.get());
   }
 };
diff --git a/paddle/tcmpt/core/convert_utils.cc b/paddle/tcmpt/core/convert_utils.cc
index e994b8835fa2b..5059136b73d04 100644
--- a/paddle/tcmpt/core/convert_utils.cc
+++ b/paddle/tcmpt/core/convert_utils.cc
@@ -14,6 +14,9 @@ limitations under the License. */
 
 #include "paddle/tcmpt/core/convert_utils.h"
 
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/gpu_info.h"
+
 namespace pt {
 
 // TODO(chenweihang): Add other place branchs
@@ -90,15 +93,19 @@ paddle::platform::Place TransToFluidPlace(const Backend& backend) {
     case pt::Backend::kCPU:
       return paddle::platform::CPUPlace();
     case pt::Backend::kCUDA:
-      return paddle::platform::CUDAPlace();
+      return paddle::platform::CUDAPlace(
+          paddle::platform::GetCurrentDeviceId());
     case pt::Backend::kXPU:
+      // TODO(chenweihang): add device id
       return paddle::platform::XPUPlace();
     case pt::Backend::kNPU:
+      // TODO(chenweihang): add device id
       return paddle::platform::NPUPlace();
     case pt::Backend::kMKLDNN:
       return paddle::platform::CPUPlace();
     case pt::Backend::kCUDNN:
-      return paddle::platform::CUDAPlace();
+      return paddle::platform::CUDAPlace(
+          paddle::platform::GetCurrentDeviceId());
     default:
       PADDLE_THROW(paddle::platform::errors::Unimplemented(
           "Unsupported backend `%s` when casting it to paddle place type.",
diff --git a/paddle/tcmpt/cpu/math.cc b/paddle/tcmpt/cpu/math.cc
index 166c26543a4ae..9dc85d10dc171 100644
--- a/paddle/tcmpt/cpu/math.cc
+++ b/paddle/tcmpt/cpu/math.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/tcmpt/cpu/math.h"
 
+#include "paddle/tcmpt/eigen/mean.h"
 #include "paddle/tcmpt/eigen/scale.h"
 #include "paddle/tcmpt/eigen/sign.h"
 
@@ -23,15 +24,6 @@
 
 namespace pt {
 
-template <typename T,
-          int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenScalar = paddle::framework::EigenScalar<T, MajorType, IndexType>;
-template <typename T,
-          int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = paddle::framework::EigenVector<T, MajorType, IndexType>;
-
 template <typename T>
 void Sign(const CPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
   module::Sign<CPUContext, T>(dev_ctx, x, out);
@@ -39,11 +31,7 @@ void Sign(const CPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
 
 template <typename T>
 void Mean(const CPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
-  out->mutable_data<T>();
-  auto x_data = EigenVector<T>::Flatten(x);
-  auto y_data = EigenScalar<T>::From(*out);
-  auto& place = *dev_ctx.eigen_device();
-  y_data.device(place) = x_data.mean();
+  eigen::Mean<CPUContext, T>(dev_ctx, x, out);
 }
 
 template <typename T>
diff --git a/paddle/tcmpt/cuda/math.cu b/paddle/tcmpt/cuda/math.cu
index b96337ef20d04..474a72f22e930 100644
--- a/paddle/tcmpt/cuda/math.cu
+++ b/paddle/tcmpt/cuda/math.cu
@@ -14,8 +14,9 @@ limitations under the License. */
 
 #include "paddle/tcmpt/cuda/math.h"
 
-// #include "paddle/tcmpt/eigen/scale.h"
-// #include "paddle/tcmpt/eigen/sign.h"
+#include "paddle/tcmpt/eigen/mean.h"
+#include "paddle/tcmpt/eigen/scale.h"
+#include "paddle/tcmpt/eigen/sign.h"
 
 #ifdef __NVCC__
 #include "cub/cub.cuh"
@@ -57,6 +58,8 @@ void Sign(const CUDAContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
 
 template <typename T>
 void Mean(const CUDAContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
+  VLOG(1) << "chenweihang: call new pt mean kernel.";
+  // eigen::Mean<CUDAContext, T>(dev_ctx, x, out);
   auto size_prob = x.numel();
   const T* x_data = x.data<T>();
   T* out_data = out->mutable_data<T>();
diff --git a/paddle/tcmpt/cuda/math.h b/paddle/tcmpt/cuda/math.h
index 1b221ecbaa9e2..282803a54a292 100644
--- a/paddle/tcmpt/cuda/math.h
+++ b/paddle/tcmpt/cuda/math.h
@@ -20,9 +20,6 @@ limitations under the License. */
 #include "paddle/tcmpt/core/dense_tensor.h"
 #include "paddle/tcmpt/core/selected_rows_tensor.h"
 
-#include "paddle/tcmpt/eigen/scale.h"
-#include "paddle/tcmpt/eigen/sign.h"
-
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
 
diff --git a/paddle/tcmpt/eigen/mean.h b/paddle/tcmpt/eigen/mean.h
new file mode 100644
index 0000000000000..bd2c5ad2bf219
--- /dev/null
+++ b/paddle/tcmpt/eigen/mean.h
@@ -0,0 +1,41 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/tcmpt/core/dense_tensor.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/operators/eigen/eigen_function.h"
+
+namespace pt {
+namespace eigen {
+
+template <typename DevCtx, typename T>
+void Mean(const DevCtx& dev_ctx, const DenseTensor& x, DenseTensor* out) {
+  out->mutable_data<T>();
+
+  // TODO(chenweihang): if we design new tensor, we should support
+  // the low-level calc functor use new tensor as input,
+  // which may be a big project!
+  auto eigen_x = paddle::framework::EigenVector<T>::Flatten(x);
+  auto eigen_out = paddle::framework::EigenScalar<T>::From(*out);
+
+  auto& dev = *dev_ctx.eigen_device();
+  eigen_out.device(dev) = eigen_x.mean();
+}
+
+}  // namespace eigen
+}  // namespace pt

From bf0f99b4313448c015fa1a25bedd3beae1d25061 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Wed, 29 Sep 2021 11:36:16 +0000
Subject: [PATCH 070/125] fix ifdef error

---
 paddle/tcmpt/core/CMakeLists.txt   |  8 +++++++-
 paddle/tcmpt/core/convert_utils.cc | 10 ++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/paddle/tcmpt/core/CMakeLists.txt b/paddle/tcmpt/core/CMakeLists.txt
index 90a2e170d46fd..8c9e5ef9e7c74 100644
--- a/paddle/tcmpt/core/CMakeLists.txt
+++ b/paddle/tcmpt/core/CMakeLists.txt
@@ -8,7 +8,13 @@ cc_library(backend SRCS backend.cc)
 cc_library(dtype SRCS dtype.cc)
 cc_library(layout SRCS layout.cc)
 
-cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place backend dtype layout)
+if(WITH_GPU)
+    cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place backend dtype layout gpu_info)
+elseif(WITH_ROCM)
+    cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place backend dtype layout gpu_info)
+else()
+    cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place backend dtype layout)
+endif()
 cc_library(dense_tensor SRCS dense_tensor.cc DEPS enforce data_type ddim allocator place convert_utils ${MKLDNN_CTX_DEPS})
 cc_library(selected_rows_tensor SRCS selected_rows_tensor.cc DEPS dense_tensor)
 
diff --git a/paddle/tcmpt/core/convert_utils.cc b/paddle/tcmpt/core/convert_utils.cc
index 5059136b73d04..d393dcf51c61b 100644
--- a/paddle/tcmpt/core/convert_utils.cc
+++ b/paddle/tcmpt/core/convert_utils.cc
@@ -92,20 +92,30 @@ paddle::platform::Place TransToFluidPlace(const Backend& backend) {
   switch (backend) {
     case pt::Backend::kCPU:
       return paddle::platform::CPUPlace();
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     case pt::Backend::kCUDA:
       return paddle::platform::CUDAPlace(
           paddle::platform::GetCurrentDeviceId());
+#endif
+#ifdef PADDLE_WITH_XPU
     case pt::Backend::kXPU:
       // TODO(chenweihang): add device id
       return paddle::platform::XPUPlace();
+#endif
+#ifdef PADDLE_WITH_NPU
     case pt::Backend::kNPU:
       // TODO(chenweihang): add device id
       return paddle::platform::NPUPlace();
+#endif
+#ifdef PADDLE_WITH_MKLDNN
     case pt::Backend::kMKLDNN:
       return paddle::platform::CPUPlace();
+#endif
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     case pt::Backend::kCUDNN:
       return paddle::platform::CUDAPlace(
           paddle::platform::GetCurrentDeviceId());
+#endif
     default:
       PADDLE_THROW(paddle::platform::errors::Unimplemented(
           "Unsupported backend `%s` when casting it to paddle place type.",

From 73de8917fba22cb0c64c54ac7740c2d854155fb9 Mon Sep 17 00:00:00 2001
From: chentianyu03 <ctychentianyu@gmail.com>
Date: Thu, 30 Sep 2021 17:33:12 +0800
Subject: [PATCH 071/125] Op2functor (#7)

* add kernel args maker class

* make args maker non-const

* remove debug log

* modify codes by review options

* split constructPrKernelContext function

* fix output name bug

* fix test_mean_op test_sign_op failed
---
 paddle/fluid/framework/operator.cc            | 194 ++++++-----------
 paddle/fluid/framework/tcmpt_utils.cc         |  77 +++++++
 paddle/fluid/framework/tcmpt_utils.h          |   5 +
 paddle/fluid/imperative/CMakeLists.txt        |   4 +-
 .../imperative/kernel_args_names_maker.h      | 159 ++++++++++++++
 paddle/fluid/imperative/prepared_operator.cc  | 203 +++++-------------
 paddle/fluid/imperative/prepared_operator.h   |   1 +
 paddle/fluid/imperative/type_defs.h           |  11 +
 paddle/tcmpt/core/kernel_context.h            |  30 +++
 .../fluid/tests/unittests/test_mean_op.py     |   1 +
 .../fluid/tests/unittests/test_sign_op.py     |   1 +
 11 files changed, 401 insertions(+), 285 deletions(-)
 create mode 100644 paddle/fluid/imperative/kernel_args_names_maker.h

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 49fbebd4a5865..1e6ca38ce35f2 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -28,6 +28,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/transfer_scope_cache.h"
 #include "paddle/fluid/framework/unused_var_check.h"
 #include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/imperative/kernel_args_names_maker.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -1873,7 +1874,6 @@ pt::KernelKey OperatorWithKernel::ConstructPtKernelKey(
   return pt::KernelKey(backend, layout, dtype);
 }
 
-// TODO(chenweihang): This function is too complicated and needs to be split
 pt::KernelContext OperatorWithKernel::ConstructPtKernelContext(
     const RuntimeContext& ctx, const platform::DeviceContext& dev_ctx) const {
   VLOG(1) << RuntimeContextDebugString(ctx);
@@ -1888,162 +1888,88 @@ pt::KernelContext OperatorWithKernel::ConstructPtKernelContext(
   pt::KernelContext op_kernel_ctx(dev_ctx);
   auto input_defs = pt_kernel_->args_def().input_defs();
   auto output_defs = pt_kernel_->args_def().output_defs();
+  auto attr_defs = pt_kernel_->args_def().attribute_defs();
 
   // TODO(chenweihang): use ordered_map for VariableNameMap and VariableValueMap
   // If we the VariableValueMap are ordered, we can get tensor by iter the map,
   // and its order is same as OpProto
 
-  // TODO(chenweihang): For scale op, when the input has a `ScaleTensor`,
-  // the following scale attribute should be skipped, and there are many
-  // such ops, which require certain rules to process, now only for verify
-  // scale op
-  std::unordered_map<std::string, bool> contain_host_tensor_flags{
-      {"ScaleTensor", false}};
-  std::unordered_map<std::string, std::string> attr_to_host_tensor{
-      {"scale", "ScaleTensor"}};
-
-  auto* op_proto = Info().proto_;
-  for (int i = 0; i < op_proto->inputs_size(); ++i) {
-    auto in = op_proto->inputs()[i];
-    // TODO(chenweihang): skip special cases temporarily
-    // TODO(chenweihang): deal with diff param in vector
-    if ((in.has_extra() && in.extra()) || (in.has_quant() && in.quant())) {
-      VLOG(1) << "Static graph PtKernel input: skip extra & quant input - "
-              << in.name();
-      continue;
-    }
-    auto in_name = in.name();
-    if (in.has_dispensable() && in.dispensable()) {
-      if (contain_host_tensor_flags.count(in_name) > 0 &&
-          IsValidVar(in_name, ctx.inputs)) {
-        VLOG(1) << "Static graph PtKernel input: contain host input - "
-                << in_name;
-        contain_host_tensor_flags[in_name] = true;
-      } else {
-        VLOG(1) << "Static graph PtKernel input: skip dispensable input - "
-                << in_name;
-        continue;
-      }
-    }
-    VLOG(1) << "Static graph PtKernel input: " << in_name;
+  paddle::imperative::KernelArgsNameMakerByOpProto<Variable> argMaker(
+      Info().proto_, &ctx.inputs, &ctx.outputs);
+
+  auto& input_names = argMaker.GetInputArgsNames();
+  auto& output_names = argMaker.GetOutputArgsNames();
+  auto& attr_pairs = argMaker.GetAttrsArgsNamesAndTypes();
+
+  PADDLE_ENFORCE_EQ(input_names.size(), input_defs.size(),
+                    platform::errors::InvalidArgument(
+                        "the size of inputs_args names (%d) must be equal to "
+                        "the size of kernel input_defs (%d).",
+                        input_names.size(), input_defs.size()));
+
+  PADDLE_ENFORCE_EQ(output_names.size(), output_defs.size(),
+                    platform::errors::InvalidArgument(
+                        "the size of outputs_args names (%d) must be equal to "
+                        "the size of kernel output_defs (%d).",
+                        output_names.size(), output_defs.size()));
+
+  PADDLE_ENFORCE_EQ(attr_pairs.size(), attr_defs.size(),
+                    platform::errors::InvalidArgument(
+                        "the size of attribute_args names (%d) must be equal "
+                        "to the size of kernel attribute_defs (%d).",
+                        attr_pairs.size(), attr_defs.size()));
+
+  for (size_t i = 0; i < input_names.size(); ++i) {
     auto in_def = input_defs.at(i);
     VLOG(1) << "in_def: " << in_def.backend << ", " << in_def.dtype << ", "
             << in_def.layout;
-    // TODO(chenweihang): input need to be transformed by in all define
-    auto expected_place = pt::TransToFluidPlace(in_def.backend);
-    VLOG(1) << "expected_place: " << expected_place;
-    for (auto* var : ctx.inputs.at(in_name)) {
-      if (var->IsType<LoDTensor>()) {
-        VLOG(1) << "var is LoDTensor";
-        const auto& tensor = var->Get<LoDTensor>();
-        if (!platform::is_same_place(tensor.place(), expected_place)) {
-          VLOG(1) << "var place is mismatch.";
-          LoDTensor tmp_tensor;
-          TensorCopySync(tensor, expected_place, &tmp_tensor);
-          auto pt_in = MakeTensorImpl<pt::DenseTensor, LoDTensor>(
-              tmp_tensor, in_def.backend, in_def.dtype, in_def.layout);
-          op_kernel_ctx.EmplaceBackInput(pt_in);
-        } else {
-          auto pt_in = MakeTensorImpl<pt::DenseTensor, LoDTensor>(
-              tensor, in_def.backend, in_def.dtype, in_def.layout);
-          op_kernel_ctx.EmplaceBackInput(pt_in);
-        }
-      } else if (var->IsType<SelectedRows>()) {
-        const auto& tensor = var->Get<SelectedRows>();
-        if (!platform::is_same_place(tensor.value().place(), expected_place)) {
-          SelectedRows tmp_tensor;
-          tmp_tensor.set_rows(tensor.rows());
-          tmp_tensor.set_height(tensor.height());
-          TensorCopySync(tensor.value(), expected_place,
-                         tmp_tensor.mutable_value());
-          auto pt_in = MakeTensorImpl<pt::SelectedRowsTensor, SelectedRows>(
-              tmp_tensor, in_def.backend, in_def.dtype, in_def.layout);
-          op_kernel_ctx.EmplaceBackInput(pt_in);
-        } else {
-          auto pt_in = MakeTensorImpl<pt::SelectedRowsTensor, SelectedRows>(
-              tensor, in_def.backend, in_def.dtype, in_def.layout);
-          op_kernel_ctx.EmplaceBackInput(pt_in);
-        }
-      } else {
-        PADDLE_THROW(platform::errors::Unimplemented(
-            "Unsupported shared input `%s` type now when call pt kernel.",
-            ToTypeName(var->Type())));
-      }
+
+    auto ins_vector = ctx.inputs.at(input_names[i]);
+    std::vector<std::shared_ptr<pt::TensorInterface>> tmp_inputs;
+
+    for (auto var : ins_vector) {
+      auto pt_in = framework::InputVariableToPtTensor(*var, in_def);
+      tmp_inputs.emplace_back(pt_in);
     }
+    op_kernel_ctx.EmplaceBackInputs(tmp_inputs);
   }
-  for (int i = 0; i < op_proto->outputs_size(); ++i) {
-    auto out_name = op_proto->outputs()[i].name();
-    VLOG(1) << "Static graph PtKernel output: " << out_name;
-    // TODO(chenweihang): outputs also need skip some cases
+
+  for (size_t i = 0; i < output_names.size(); ++i) {
     auto out_def = output_defs.at(i);
-    for (auto* var : ctx.outputs.at(out_name)) {
-      // mutable_data before run kernel, to avoid share output form
-      // KernelContext to original tensor
-      if (var->IsType<LoDTensor>()) {
-        auto* tensor = var->GetMutable<LoDTensor>();
-        tensor->mutable_data(pt::TransToFluidPlace(out_def.backend),
-                             pt::TransToProtoVarType(out_def.dtype));
-        auto pt_out = MakeTensorImpl<pt::DenseTensor, LoDTensor>(
-            *tensor, out_def.backend, out_def.dtype, out_def.layout);
-        op_kernel_ctx.EmplaceBackOutput(pt_out);
-      } else if (var->IsType<SelectedRows>()) {
-        auto* tensor = var->GetMutable<SelectedRows>();
-        tensor->mutable_value()->mutable_data(
-            pt::TransToFluidPlace(out_def.backend),
-            pt::TransToProtoVarType(out_def.dtype));
-        auto pt_out = MakeTensorImpl<pt::SelectedRowsTensor, SelectedRows>(
-            *tensor, out_def.backend, out_def.dtype, out_def.layout);
-        op_kernel_ctx.EmplaceBackOutput(pt_out);
-      } else {
-        PADDLE_THROW(platform::errors::Unimplemented(
-            "Unsupported shared output `%s` type now when call pt kernel.",
-            ToTypeName(var->Type())));
-      }
+    auto outs_vector = ctx.outputs.at(output_names[i]);
+
+    std::vector<std::shared_ptr<pt::TensorInterface>> tmp_outputs;
+    for (auto var : outs_vector) {
+      auto pt_out = framework::OutputVariableToPtTensor(var, out_def);
+      tmp_outputs.emplace_back(pt_out);
     }
+    op_kernel_ctx.EmplaceBackOutputs(tmp_outputs);
   }
-  for (int i = 0; i < op_proto->attrs_size(); ++i) {
-    auto attr = op_proto->attrs()[i];
-    if (attr.name() == "use_mkldnn" || attr.name() == "op_role" ||
-        attr.name() == "op_role_var" || attr.name() == "op_namescope" ||
-        attr.name() == "op_callstack" || attr.name() == "op_device") {
-      VLOG(1) << "Static graph PtKernel attribute: skip needless attr - "
-              << attr.name();
-      continue;
-    }
-    if ((attr.has_extra() && attr.extra()) ||
-        (attr.has_quant() && attr.quant())) {
-      VLOG(1) << "Static graph PtKernel attribute: skip extra or quant attr - "
-              << attr.name();
-      continue;
-    }
-    if (attr_to_host_tensor.count(attr.name()) > 0 &&
-        contain_host_tensor_flags.at(attr_to_host_tensor.at(attr.name())) ==
-            true) {
-      VLOG(1) << "Static graph PtKernel attribute: skip dynaimc attr - "
-              << attr.name() << ", because "
-              << attr_to_host_tensor.at(attr.name()) << " exists.";
-      continue;
-    }
-    VLOG(1) << "Static graph PtKernel attribute: " << attr.name();
+
+  for (size_t i = 0; i < attr_pairs.size(); ++i) {
     // TODO(chenweihang): support other attrs
-    switch (attr.type()) {
-      case proto::AttrType::INT:
-        op_kernel_ctx.EmplaceBackAttr(Attr<int>(attr.name()));
+    // In principle, the attr required by the dynamic mode should be
+    // passed in from the Python side, and there is no need to look up
+    // from the default_map, but now this nor work
+    switch (attr_pairs[i].second) {
+      case framework::proto::AttrType::INT:
+        op_kernel_ctx.EmplaceBackAttr(Attr<int>(attr_pairs[i].first));
         break;
-      case proto::AttrType::FLOAT:
-        op_kernel_ctx.EmplaceBackAttr(Attr<float>(attr.name()));
+      case framework::proto::AttrType::FLOAT:
+        op_kernel_ctx.EmplaceBackAttr(Attr<float>(attr_pairs[i].first));
         break;
-      case proto::AttrType::BOOLEAN:
-        op_kernel_ctx.EmplaceBackAttr(Attr<bool>(attr.name()));
+      case framework::proto::AttrType::BOOLEAN:
+        op_kernel_ctx.EmplaceBackAttr(Attr<bool>(attr_pairs[i].first));
         break;
       default:
         // TODO(chenweihang): support other attrs type
         PADDLE_THROW(platform::errors::Unimplemented(
-            "unsupported cast op `%s`'s attribute `%s` when construct "
+            "unsupported cast op attribute `%s` when construct "
             "KernelContext.",
-            Type(), attr.name()));
+            attr_pairs[i].first));
     }
   }
+
   return op_kernel_ctx;
 }
 
diff --git a/paddle/fluid/framework/tcmpt_utils.cc b/paddle/fluid/framework/tcmpt_utils.cc
index a28cf9a57a0e4..6854ed7e63d26 100644
--- a/paddle/fluid/framework/tcmpt_utils.cc
+++ b/paddle/fluid/framework/tcmpt_utils.cc
@@ -16,6 +16,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/selected_rows.h"
 
+#include "paddle/fluid/framework/variable.h"
+#include "paddle/tcmpt/api/include/dev/core.h"
 #include "paddle/tcmpt/api/include/dev/symbols.h"
 
 namespace paddle {
@@ -109,6 +111,81 @@ void ShareTensorImpl<pt::DenseTensor>(pt::DenseTensor* tensor_impl,
                            pt::TransToProtoVarType(tensor_impl->type()));
 }
 
+std::shared_ptr<pt::TensorInterface> InputVariableToPtTensor(
+    const framework::Variable& variable, const pt::TensorArgDef& arg_def) {
+  auto expected_place = pt::TransToFluidPlace(arg_def.backend);
+
+  if (variable.template IsType<framework::LoDTensor>()) {
+    const auto& tensor = variable.template Get<framework::LoDTensor>();
+    if (!platform::is_same_place(tensor.place(), expected_place)) {
+      framework::LoDTensor tmp_tensor;
+      framework::TensorCopySync(tensor, expected_place, &tmp_tensor);
+      auto pt_in =
+          framework::MakeTensorImpl<pt::DenseTensor, framework::LoDTensor>(
+              tmp_tensor, arg_def.backend, arg_def.dtype, arg_def.layout);
+      return pt_in;
+    } else {
+      auto pt_in =
+          framework::MakeTensorImpl<pt::DenseTensor, framework::LoDTensor>(
+              tensor, arg_def.backend, arg_def.dtype, arg_def.layout);
+      return pt_in;
+    }
+  } else if (variable.template IsType<framework::SelectedRows>()) {
+    const auto& tensor = variable.template Get<framework::SelectedRows>();
+    if (!platform::is_same_place(tensor.value().place(), expected_place)) {
+      framework::SelectedRows tmp_tensor;
+      tmp_tensor.set_rows(tensor.rows());
+      tmp_tensor.set_height(tensor.height());
+      TensorCopySync(tensor.value(), expected_place,
+                     tmp_tensor.mutable_value());
+      auto pt_in = framework::MakeTensorImpl<pt::SelectedRowsTensor,
+                                             framework::SelectedRows>(
+          tmp_tensor, arg_def.backend, arg_def.dtype, arg_def.layout);
+      return pt_in;
+    } else {
+      auto pt_in = framework::MakeTensorImpl<pt::SelectedRowsTensor,
+                                             framework::SelectedRows>(
+          tensor, arg_def.backend, arg_def.dtype, arg_def.layout);
+      return pt_in;
+    }
+  } else {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Unsupported shared input `%s` type now when call pt kernel.",
+        framework::ToTypeName(variable.Type())));
+  }
+  return nullptr;
+}
+
+std::shared_ptr<pt::TensorInterface> OutputVariableToPtTensor(
+    framework::Variable* variable, const pt::TensorArgDef& arg_def) {
+  // mutable_data before run kernel, to avoid share output form
+  // KernelContext to original tensor
+  if (variable->template IsType<framework::LoDTensor>()) {
+    auto* tensor = variable->template GetMutable<framework::LoDTensor>();
+    tensor->mutable_data(pt::TransToFluidPlace(arg_def.backend),
+                         pt::TransToProtoVarType(arg_def.dtype));
+    auto pt_out =
+        framework::MakeTensorImpl<pt::DenseTensor, framework::LoDTensor>(
+            *tensor, arg_def.backend, arg_def.dtype, arg_def.layout);
+    return pt_out;
+  } else if (variable->template IsType<framework::SelectedRows>()) {
+    auto* tensor = variable->template GetMutable<framework::SelectedRows>();
+    tensor->mutable_value()->mutable_data(
+        pt::TransToFluidPlace(arg_def.backend),
+        pt::TransToProtoVarType(arg_def.dtype));
+    auto pt_out = framework::MakeTensorImpl<pt::SelectedRowsTensor,
+                                            framework::SelectedRows>(
+        *tensor, arg_def.backend, arg_def.dtype, arg_def.layout);
+    return pt_out;
+  } else {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Unsupported shared output `%s` type now when call pt kernel.",
+        framework::ToTypeName(variable->Type())));
+  }
+
+  return nullptr;
+}
+
 /* For MKLDNNDenseTensor (move this part into a single file later) */
 #ifdef PADDLE_WITH_MKLDNN
 
diff --git a/paddle/fluid/framework/tcmpt_utils.h b/paddle/fluid/framework/tcmpt_utils.h
index fecc98d90a66e..b677c0a3e4938 100644
--- a/paddle/fluid/framework/tcmpt_utils.h
+++ b/paddle/fluid/framework/tcmpt_utils.h
@@ -44,5 +44,10 @@ void ShareTensorImpl(PtTensorImplT* tensor_impl, LoDTensor* out);
 template <typename PtTensorImplT>
 void ShareTensorImpl(PtTensorImplT* tensor_impl, Tensor* out);
 
+std::shared_ptr<pt::TensorInterface> InputVariableToPtTensor(
+    const framework::Variable& variable, const pt::TensorArgDef& arg_def);
+std::shared_ptr<pt::TensorInterface> OutputVariableToPtTensor(
+    framework::Variable* variable, const pt::TensorArgDef& arg_def);
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt
index cb744fb2b6aa2..617825870301b 100644
--- a/paddle/fluid/imperative/CMakeLists.txt
+++ b/paddle/fluid/imperative/CMakeLists.txt
@@ -1,9 +1,9 @@
 cc_library(imperative_flag SRCS flags.cc DEPS gflags flags)
 
 IF(WITH_XPU)
-cc_library(prepared_operator SRCS prepared_operator.cc DEPS xpu_op_list proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils)
+cc_library(prepared_operator SRCS prepared_operator.cc DEPS xpu_op_list proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils tcmpt_utils)
 ELSE()
-cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils)
+cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils tcmpt_utils)
 ENDIF()
 cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry)
 add_subdirectory(jit)
diff --git a/paddle/fluid/imperative/kernel_args_names_maker.h b/paddle/fluid/imperative/kernel_args_names_maker.h
new file mode 100644
index 0000000000000..b1fcf935426e6
--- /dev/null
+++ b/paddle/fluid/imperative/kernel_args_names_maker.h
@@ -0,0 +1,159 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "glog/logging.h"
+
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/imperative/type_defs.h"
+#include "paddle/utils/small_vector.h"
+
+namespace paddle {
+namespace imperative {
+// TODO(chenweihang): now only check single var input
+template <typename VarType>
+static bool IsValidVar(const std::string& name,
+                       const NameVarMap<VarType>& inputs) {
+  auto it = inputs.find(name);
+  if (it == inputs.end()) {
+    return false;
+  }
+  if (it->second.empty()) {
+    return false;
+  }
+  return it->second[0] != nullptr;
+}
+
+class KernelArgsNameMaker {
+ public:
+  virtual ~KernelArgsNameMaker() {}
+  virtual const paddle::SmallVector<std::string>& GetInputArgsNames() = 0;
+  virtual const paddle::SmallVector<std::string>& GetOutputArgsNames() = 0;
+  virtual const paddle::SmallVector<
+      std::pair<std::string, framework::proto::AttrType>>&
+  GetAttrsArgsNamesAndTypes() = 0;
+};
+
+template <typename VarType>
+class KernelArgsNameMakerByOpProto : public KernelArgsNameMaker {
+ public:
+  KernelArgsNameMakerByOpProto(framework::proto::OpProto* op_proto,
+                               const imperative::NameVarMap<VarType>* inputs,
+                               const imperative::NameVarMap<VarType>* outputs)
+      : op_proto_(op_proto), inputs_(inputs), outputs_(outputs) {}
+
+  ~KernelArgsNameMakerByOpProto() {}
+
+  const paddle::SmallVector<std::string>& GetInputArgsNames() override {
+    for (int i = 0; i < op_proto_->inputs_size(); ++i) {
+      auto in = op_proto_->inputs()[i];
+
+      // TODO(chenweihang): deal with diff param in vector
+      if ((in.has_extra() && in.extra()) || (in.has_quant() && in.quant())) {
+        VLOG(1) << "Dygraph PtKernel input: skip extra & quant input - "
+                << in.name();
+        continue;
+      }
+
+      std::string in_name = in.name();
+      if (in.has_dispensable() && in.dispensable()) {
+        if (this->contain_host_tensor_flags.count(in_name) > 0 &&
+            IsValidVar<VarType>(in_name, *inputs_)) {
+          VLOG(1) << "Dygraph PtKernel input: contain host input - " << in_name;
+          this->contain_host_tensor_flags[in_name] = true;
+        } else {
+          VLOG(1) << "Dygraph PtKernel input: skip dispensable input - "
+                  << in_name;
+          continue;
+        }
+      }
+
+      input_names.emplace_back(in.name());
+    }
+    return input_names;
+  }
+
+  const paddle::SmallVector<std::string>& GetOutputArgsNames() override {
+    for (int i = 0; i < op_proto_->outputs_size(); ++i) {
+      auto out_name = op_proto_->outputs()[i].name();
+      VLOG(1) << "Dygraph PtKernel output: " << out_name;
+      // TODO(chenweihang): outputs also need skip some cases
+
+      output_names.emplace_back(out_name);
+    }
+    return output_names;
+  }
+
+  const paddle::SmallVector<std::pair<std::string, framework::proto::AttrType>>&
+  GetAttrsArgsNamesAndTypes() override {
+    for (int i = 0; i < op_proto_->attrs_size(); ++i) {
+      auto attr = op_proto_->attrs()[i];
+      if (attr.name() == "use_mkldnn" || attr.name() == "op_role" ||
+          attr.name() == "op_role_var" || attr.name() == "op_namescope" ||
+          attr.name() == "op_callstack" || attr.name() == "op_device") {
+        VLOG(1) << "Dygraph PtKernel attribute: skip needless attr - "
+                << attr.name();
+        continue;
+      }
+      if ((attr.has_extra() && attr.extra()) ||
+          (attr.has_quant() && attr.quant())) {
+        VLOG(1) << "Dygraph PtKernel attribute: skip extra & quant attr - "
+                << attr.name();
+        continue;
+      }
+      if (attr_to_host_tensor.count(attr.name()) > 0 &&
+          contain_host_tensor_flags.at(attr_to_host_tensor.at(attr.name())) ==
+              true) {
+        VLOG(1) << "Dygraph PtKernel attribute: skip dynaimc attr - "
+                << attr.name() << ", because "
+                << attr_to_host_tensor.at(attr.name()) << " exists.";
+        continue;
+      }
+      VLOG(1) << "Dygraph PtKernel attribute: " << attr.name();
+      attr_names.emplace_back(
+          std::pair<std::string, framework::proto::AttrType>(attr.name(),
+                                                             attr.type()));
+    }
+
+    return attr_names;
+  }
+
+ private:
+  framework::proto::OpProto* op_proto_;
+
+  const imperative::NameVarMap<VarType>* inputs_;
+  const imperative::NameVarMap<VarType>* outputs_;
+
+  paddle::SmallVector<std::string> input_names;
+  paddle::SmallVector<std::string> output_names;
+  paddle::SmallVector<std::pair<std::string, framework::proto::AttrType>>
+      attr_names;
+
+  // TODO(chenweihang): For scale op, when the input has a `ScaleTensor`,
+  // the following scale attribute should be skipped, and there are many
+  // such ops, which require certain rules to process, now only for verify
+  // scale op
+  std::unordered_map<std::string, bool> contain_host_tensor_flags{
+      {"ScaleTensor", false}};
+  std::unordered_map<std::string, std::string> attr_to_host_tensor{
+      {"scale", "ScaleTensor"}};
+};
+
+}  // namespace imperative
+}  // namespace paddle
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 29a1476662ce8..f05d6b2b2e962 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -148,20 +148,6 @@ static bool ContainSelectedRows(const NameVarMap<VarType>& inputs) {
   return false;
 }
 
-// TODO(chenweihang): now only check single var input
-template <typename VarType>
-static bool IsValidVar(const std::string& name,
-                       const NameVarMap<VarType>& inputs) {
-  auto it = inputs.find(name);
-  if (it == inputs.end()) {
-    return false;
-  }
-  if (it->second.empty()) {
-    return false;
-  }
-  return it->second[0] != nullptr;
-}
-
 // TODO(chenweihang): enhance rules, not all dispensable inputs
 // are host tensor, now only for scale kernel verify
 template <typename VarType>
@@ -306,10 +292,9 @@ PreparedOp PreparedOp::Prepare(const NameVarMap<VariableWrapper>& ins,
                                       default_attrs);
 }
 
-// TODO(chenweihang): This function is too complicated and needs to be split
 template <typename VarType>
 static pt::KernelContext BuildDygraphKernelContext(
-    const pt::Kernel& pt_kernel, const framework::proto::OpProto& op_proto,
+    const pt::Kernel& pt_kernel, KernelArgsNameMaker* argsNameMaker,
     const NameVarMap<VarType>& ins, const NameVarMap<VarType>& outs,
     const framework::AttributeMap& attrs,
     const framework::AttributeMap& default_attrs,
@@ -324,163 +309,82 @@ static pt::KernelContext BuildDygraphKernelContext(
   pt::KernelContext op_kernel_ctx(dev_ctx);
   auto input_defs = pt_kernel.args_def().input_defs();
   auto output_defs = pt_kernel.args_def().output_defs();
-
-  // TODO(chenweihang): For scale op, when the input has a `ScaleTensor`,
-  // the following scale attribute should be skipped, and there are many
-  // such ops, which require certain rules to process, now only for verify
-  // scale op
-  std::unordered_map<std::string, bool> contain_host_tensor_flags{
-      {"ScaleTensor", false}};
-  std::unordered_map<std::string, std::string> attr_to_host_tensor{
-      {"scale", "ScaleTensor"}};
-
-  for (int i = 0; i < op_proto.inputs_size(); ++i) {
-    auto in = op_proto.inputs()[i];
-    // TODO(chenweihang): deal with diff param in vector
-    if ((in.has_extra() && in.extra()) || (in.has_quant() && in.quant())) {
-      VLOG(1) << "Dygraph PtKernel input: skip extra & quant input - "
-              << in.name();
-      continue;
-    }
-    auto in_name = in.name();
-    if (in.has_dispensable() && in.dispensable()) {
-      if (contain_host_tensor_flags.count(in_name) > 0 &&
-          IsValidVar<VarType>(in_name, ins)) {
-        VLOG(1) << "Dygraph PtKernel input: contain host input - " << in_name;
-        contain_host_tensor_flags[in_name] = true;
-      } else {
-        VLOG(1) << "Dygraph PtKernel input: skip dispensable input - "
-                << in_name;
-        continue;
-      }
-    }
-    VLOG(1) << "Dygraph PtKernel input: " << in_name;
+  auto attr_defs = pt_kernel.args_def().attribute_defs();
+
+  auto& input_names = argsNameMaker->GetInputArgsNames();
+  auto& output_names = argsNameMaker->GetOutputArgsNames();
+  auto& attr_pairs = argsNameMaker->GetAttrsArgsNamesAndTypes();
+
+  PADDLE_ENFORCE_EQ(input_names.size(), input_defs.size(),
+                    platform::errors::InvalidArgument(
+                        "the size of inputs_args names (%d) must be equal to "
+                        "the size of kernel input_defs (%d).",
+                        input_names.size(), input_defs.size()));
+
+  PADDLE_ENFORCE_EQ(output_names.size(), output_defs.size(),
+                    platform::errors::InvalidArgument(
+                        "the size of outputs_args names (%d) must be equal to "
+                        "the size of kernel output_defs (%d).",
+                        output_names.size(), output_defs.size()));
+
+  PADDLE_ENFORCE_EQ(attr_pairs.size(), attr_defs.size(),
+                    platform::errors::InvalidArgument(
+                        "the size of attribute_args names (%d) must be equal "
+                        "to the size of kernel attribute_defs (%d).",
+                        attr_pairs.size(), attr_defs.size()));
+
+  for (size_t i = 0; i < input_names.size(); ++i) {
     auto in_def = input_defs.at(i);
-    auto expected_place = pt::TransToFluidPlace(in_def.backend);
-    for (auto var : ins.at(in_name)) {
+
+    auto ins_vector = ins.at(input_names[i]);
+    std::vector<std::shared_ptr<pt::TensorInterface>> tmp_inputs;
+    for (auto var : ins_vector) {
       const auto& variable = var->Var();
-      if (variable.template IsType<framework::LoDTensor>()) {
-        const auto& tensor = variable.template Get<framework::LoDTensor>();
-        if (!platform::is_same_place(tensor.place(), expected_place)) {
-          framework::LoDTensor tmp_tensor;
-          framework::TensorCopySync(tensor, expected_place, &tmp_tensor);
-          auto pt_in =
-              framework::MakeTensorImpl<pt::DenseTensor, framework::LoDTensor>(
-                  tmp_tensor, in_def.backend, in_def.dtype, in_def.layout);
-          op_kernel_ctx.EmplaceBackInput(pt_in);
-        } else {
-          auto pt_in =
-              framework::MakeTensorImpl<pt::DenseTensor, framework::LoDTensor>(
-                  tensor, in_def.backend, in_def.dtype, in_def.layout);
-          op_kernel_ctx.EmplaceBackInput(pt_in);
-        }
-      } else if (variable.template IsType<framework::SelectedRows>()) {
-        const auto& tensor = variable.template Get<framework::SelectedRows>();
-        if (!platform::is_same_place(tensor.value().place(), expected_place)) {
-          framework::SelectedRows tmp_tensor;
-          tmp_tensor.set_rows(tensor.rows());
-          tmp_tensor.set_height(tensor.height());
-          TensorCopySync(tensor.value(), expected_place,
-                         tmp_tensor.mutable_value());
-          auto pt_in = framework::MakeTensorImpl<pt::SelectedRowsTensor,
-                                                 framework::SelectedRows>(
-              tmp_tensor, in_def.backend, in_def.dtype, in_def.layout);
-          op_kernel_ctx.EmplaceBackInput(pt_in);
-        } else {
-          auto pt_in = framework::MakeTensorImpl<pt::SelectedRowsTensor,
-                                                 framework::SelectedRows>(
-              tensor, in_def.backend, in_def.dtype, in_def.layout);
-          op_kernel_ctx.EmplaceBackInput(pt_in);
-        }
-      } else {
-        PADDLE_THROW(platform::errors::Unimplemented(
-            "Unsupported shared input `%s` type now when call pt kernel.",
-            framework::ToTypeName(variable.Type())));
-      }
+
+      auto pt_in = framework::InputVariableToPtTensor(variable, in_def);
+      tmp_inputs.emplace_back(pt_in);
     }
+    op_kernel_ctx.EmplaceBackInputs(tmp_inputs);
   }
 
-  for (int i = 0; i < op_proto.outputs_size(); ++i) {
-    auto out_name = op_proto.outputs()[i].name();
-    VLOG(1) << "Dygraph PtKernel output: " << out_name;
-    // TODO(chenweihang): outputs also need skip some cases
+  for (size_t i = 0; i < output_names.size(); ++i) {
     auto out_def = output_defs.at(i);
-    for (auto var : outs.at(out_name)) {
-      // mutable_data before run kernel, to avoid share output form
-      // KernelContext to original tensor
-      auto* variable = var->MutableVar();
-      if (variable->template IsType<framework::LoDTensor>()) {
-        auto* tensor = variable->template GetMutable<framework::LoDTensor>();
-        tensor->mutable_data(pt::TransToFluidPlace(out_def.backend),
-                             pt::TransToProtoVarType(out_def.dtype));
-        auto pt_out =
-            framework::MakeTensorImpl<pt::DenseTensor, framework::LoDTensor>(
-                *tensor, out_def.backend, out_def.dtype, out_def.layout);
-        op_kernel_ctx.EmplaceBackOutput(pt_out);
-      } else if (variable->template IsType<framework::SelectedRows>()) {
-        auto* tensor = variable->template GetMutable<framework::SelectedRows>();
-        tensor->mutable_value()->mutable_data(
-            pt::TransToFluidPlace(out_def.backend),
-            pt::TransToProtoVarType(out_def.dtype));
-        auto pt_out = framework::MakeTensorImpl<pt::SelectedRowsTensor,
-                                                framework::SelectedRows>(
-            *tensor, out_def.backend, out_def.dtype, out_def.layout);
-        op_kernel_ctx.EmplaceBackOutput(pt_out);
-      } else {
-        PADDLE_THROW(platform::errors::Unimplemented(
-            "Unsupported shared output `%s` type now when call pt kernel.",
-            framework::ToTypeName(variable->Type())));
-      }
+    auto outs_vector = outs.at(output_names[i]);
+
+    std::vector<std::shared_ptr<pt::TensorInterface>> tmp_outputs;
+    for (auto var : outs_vector) {
+      auto variable = var->MutableVar();
+
+      auto pt_out = framework::OutputVariableToPtTensor(variable, out_def);
+      tmp_outputs.emplace_back(pt_out);
     }
+    op_kernel_ctx.EmplaceBackOutputs(tmp_outputs);
   }
 
-  for (int i = 0; i < op_proto.attrs_size(); ++i) {
-    auto attr = op_proto.attrs()[i];
-    if (attr.name() == "use_mkldnn" || attr.name() == "op_role" ||
-        attr.name() == "op_role_var" || attr.name() == "op_namescope" ||
-        attr.name() == "op_callstack" || attr.name() == "op_device") {
-      VLOG(1) << "Dygraph PtKernel attribute: skip needless attr - "
-              << attr.name();
-      continue;
-    }
-    if ((attr.has_extra() && attr.extra()) ||
-        (attr.has_quant() && attr.quant())) {
-      VLOG(1) << "Dygraph PtKernel attribute: skip extra & quant attr - "
-              << attr.name();
-      continue;
-    }
-    if (attr_to_host_tensor.count(attr.name()) > 0 &&
-        contain_host_tensor_flags.at(attr_to_host_tensor.at(attr.name())) ==
-            true) {
-      VLOG(1) << "Dygraph PtKernel attribute: skip dynaimc attr - "
-              << attr.name() << ", because "
-              << attr_to_host_tensor.at(attr.name()) << " exists.";
-      continue;
-    }
-    VLOG(1) << "Dygraph PtKernel attribute: " << attr.name();
+  for (size_t i = 0; i < attr_pairs.size(); ++i) {
     // TODO(chenweihang): support other attrs
     // In principle, the attr required by the dynamic mode should be
     // passed in from the Python side, and there is no need to look up
     // from the default_map, but now this nor work
-    switch (attr.type()) {
+    switch (attr_pairs[i].second) {
       case framework::proto::AttrType::INT:
         op_kernel_ctx.EmplaceBackAttr(
-            GetAttr<int>(attrs, default_attrs, attr.name()));
+            GetAttr<int>(attrs, default_attrs, attr_pairs[i].first));
         break;
       case framework::proto::AttrType::FLOAT:
         op_kernel_ctx.EmplaceBackAttr(
-            GetAttr<float>(attrs, default_attrs, attr.name()));
+            GetAttr<float>(attrs, default_attrs, attr_pairs[i].first));
         break;
       case framework::proto::AttrType::BOOLEAN:
         op_kernel_ctx.EmplaceBackAttr(
-            GetAttr<bool>(attrs, default_attrs, attr.name()));
+            GetAttr<bool>(attrs, default_attrs, attr_pairs[i].first));
         break;
       default:
         // TODO(chenweihang): support other attrs type
         PADDLE_THROW(platform::errors::Unimplemented(
             "unsupported cast op attribute `%s` when construct "
             "KernelContext.",
-            attr.name()));
+            attr_pairs[i].first));
     }
   }
 
@@ -542,9 +446,10 @@ static void PreparedOpRunPtImpl(const framework::OperatorBase& op,
   static_cast<const framework::OperatorWithKernel&>(op).InferShape(
       &infer_shape_ctx);
 
-  auto op_kernel_ctx =
-      BuildDygraphKernelContext<VarType>(pt_kernel, *(op.Info().proto_), ins,
-                                         outs, attrs, default_attrs, *dev_ctx);
+  paddle::imperative::KernelArgsNameMakerByOpProto<VarType> argMaker(
+      op.Info().proto_, &ins, &outs);
+  auto op_kernel_ctx = BuildDygraphKernelContext<VarType>(
+      pt_kernel, &argMaker, ins, outs, attrs, default_attrs, *dev_ctx);
   pt_kernel(&op_kernel_ctx);
 
   // TODO(chenweihang): add flags
diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h
index 8cfe209ec7ad0..4cc0bce603249 100644
--- a/paddle/fluid/imperative/prepared_operator.h
+++ b/paddle/fluid/imperative/prepared_operator.h
@@ -25,6 +25,7 @@
 #include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/imperative/type_defs.h"
 
+#include "paddle/fluid/imperative/kernel_args_names_maker.h"
 #include "paddle/tcmpt/api/include/dev/core.h"
 
 DECLARE_bool(use_mkldnn);
diff --git a/paddle/fluid/imperative/type_defs.h b/paddle/fluid/imperative/type_defs.h
index 74fd152e72a57..fdbbc586979cd 100644
--- a/paddle/fluid/imperative/type_defs.h
+++ b/paddle/fluid/imperative/type_defs.h
@@ -20,6 +20,11 @@ limitations under the License. */
 #include <vector>
 
 namespace paddle {
+
+namespace framework {
+class Variable;
+}  // namespace framework
+
 namespace imperative {
 
 class VariableWrapper;
@@ -45,6 +50,12 @@ template <>
 struct NameVarMapTrait<VariableWrapper> {
   using Type = std::map<std::string, SavedVariableWrapperList>;
 };
+
+template <>
+struct NameVarMapTrait<paddle::framework::Variable> {
+  using Type = std::map<std::string, std::vector<paddle::framework::Variable*>>;
+};
+
 }  // namespace details
 
 template <typename T>
diff --git a/paddle/tcmpt/core/kernel_context.h b/paddle/tcmpt/core/kernel_context.h
index 4f2f4e121f014..057cbc11689f1 100644
--- a/paddle/tcmpt/core/kernel_context.h
+++ b/paddle/tcmpt/core/kernel_context.h
@@ -50,10 +50,37 @@ class KernelContext {
 
   void EmplaceBackInput(std::shared_ptr<TensorInterface> input) {
     inputs_.emplace_back(input);
+    // Record the start and end index of the input
+    int index = inputs_.size();
+    input_range_.emplace_back(std::pair<int, int>(index, index + 1));
+  }
+
+  void EmplaceBackInputs(std::vector<std::shared_ptr<TensorInterface>> inputs) {
+    for (auto in : inputs) {
+      inputs_.emplace_back(in);
+    }
+    // Record the start and end index of the input
+    int index = inputs_.size();
+    input_range_.emplace_back(
+        std::pair<int, int>(index, index + inputs.size()));
   }
 
   void EmplaceBackOutput(std::shared_ptr<TensorInterface> output) {
     outputs_.emplace_back(output);
+    // Record the start and end index of the input
+    int index = outputs_.size();
+    output_range_.emplace_back(std::pair<int, int>(index, index + 1));
+  }
+
+  void EmplaceBackOutputs(
+      std::vector<std::shared_ptr<TensorInterface>> outputs) {
+    for (auto out : outputs) {
+      outputs_.emplace_back(out);
+    }
+    // Record the start and end index of the input
+    int index = outputs_.size();
+    output_range_.emplace_back(
+        std::pair<int, int>(index, index + outputs.size()));
   }
 
   void EmplaceBackAttr(paddle::any attr) { attrs_.emplace_back(attr); }
@@ -78,6 +105,9 @@ class KernelContext {
     }
   }
 
+ private:
+  bool IsDuplicable() const { return input_range_.size() != inputs_.size(); }
+
  private:
   // DeviceContext base class
   const DeviceContext& dev_ctx_;
diff --git a/python/paddle/fluid/tests/unittests/test_mean_op.py b/python/paddle/fluid/tests/unittests/test_mean_op.py
index e2a2dcf44f056..d5cc81456b84b 100644
--- a/python/paddle/fluid/tests/unittests/test_mean_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mean_op.py
@@ -254,4 +254,5 @@ def test_errors(self):
 
 
 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sign_op.py b/python/paddle/fluid/tests/unittests/test_sign_op.py
index da5080eabddc9..bd145a968ed85 100644
--- a/python/paddle/fluid/tests/unittests/test_sign_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sign_op.py
@@ -83,4 +83,5 @@ def test_static(self):
 
 
 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()

From e9b219d37e8485d0afd3d1a06b023fb05b22daf5 Mon Sep 17 00:00:00 2001
From: zyfncg <1370305206@qq.com>
Date: Thu, 30 Sep 2021 17:33:35 +0800
Subject: [PATCH 072/125] fill_any_like kernel refactor (#10)

* fill_any_like kernel refactor

* remove useless code of full_like c++ api
---
 paddle/fluid/operators/fill_any_like_op.h | 17 ++++--
 paddle/tcmpt/api/CMakeLists.txt           |  6 +-
 paddle/tcmpt/api/all.h                    |  2 +
 paddle/tcmpt/api/include/creation.h       | 23 ++++++++
 paddle/tcmpt/api/include/dev/creation.h   | 18 ++++++
 paddle/tcmpt/api/include/dev/symbols.h    |  2 +
 paddle/tcmpt/api/src/CMakeLists.txt       |  5 +-
 paddle/tcmpt/api/src/creation.cc          | 67 +++++++++++++++++++++++
 paddle/tcmpt/core/kernel_utils.h          |  4 ++
 paddle/tcmpt/cpu/CMakeLists.txt           |  1 +
 paddle/tcmpt/cpu/fill.cc                  | 48 ++++++++++++++++
 paddle/tcmpt/cpu/fill.h                   | 31 +++++++++++
 paddle/tcmpt/cpu/math.cc                  | 16 +++---
 paddle/tcmpt/cuda/CMakeLists.txt          |  2 +
 paddle/tcmpt/cuda/fill.cu                 | 48 ++++++++++++++++
 paddle/tcmpt/cuda/fill.h                  | 36 ++++++++++++
 paddle/tcmpt/cuda/math.cu                 | 16 +++---
 paddle/tcmpt/eigen/fill.h                 | 58 ++++++++++++++++++++
 paddle/tcmpt/eigen/scale.h                |  4 +-
 paddle/tcmpt/eigen/sign.h                 |  4 +-
 paddle/tcmpt/tests/CMakeLists.txt         |  1 +
 paddle/tcmpt/tests/test_dot_api.cc        |  9 ++-
 paddle/tcmpt/tests/test_fill_api.cc       | 65 ++++++++++++++++++++++
 paddle/tcmpt/tests/test_mean_api.cc       |  9 ++-
 24 files changed, 461 insertions(+), 31 deletions(-)
 create mode 100644 paddle/tcmpt/api/include/creation.h
 create mode 100644 paddle/tcmpt/api/include/dev/creation.h
 create mode 100644 paddle/tcmpt/api/src/creation.cc
 create mode 100644 paddle/tcmpt/cpu/fill.cc
 create mode 100644 paddle/tcmpt/cpu/fill.h
 create mode 100644 paddle/tcmpt/cuda/fill.cu
 create mode 100644 paddle/tcmpt/cuda/fill.h
 create mode 100644 paddle/tcmpt/eigen/fill.h
 create mode 100644 paddle/tcmpt/tests/test_fill_api.cc

diff --git a/paddle/fluid/operators/fill_any_like_op.h b/paddle/fluid/operators/fill_any_like_op.h
index 2fb7bf985f222..e8dad87d9644a 100644
--- a/paddle/fluid/operators/fill_any_like_op.h
+++ b/paddle/fluid/operators/fill_any_like_op.h
@@ -17,7 +17,10 @@ limitations under the License. */
 #include <limits>
 #include <type_traits>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/framework/tcmpt_utils.h"
+
+#include "paddle/tcmpt/api/include/dev/core.h"
+#include "paddle/tcmpt/api/include/dev/creation.h"
 
 namespace paddle {
 namespace operators {
@@ -31,6 +34,7 @@ class FillAnyLikeKernel : public framework::OpKernel<T> {
                                 float, T>::type>::type;
 
   void Compute(const framework::ExecutionContext& context) const override {
+    auto* in = context.Input<framework::Tensor>("X");
     auto* out = context.Output<framework::Tensor>("Out");
     out->mutable_data<T>(context.GetPlace());
 
@@ -58,9 +62,14 @@ class FillAnyLikeKernel : public framework::OpKernel<T> {
         std::isnan(value), false,
         platform::errors::InvalidArgument("The filled value is NaN."));
 
-    math::SetConstant<DeviceContext, T> setter;
-    setter(context.template device_context<DeviceContext>(), out,
-           static_cast<T>(value));
+    auto pt_x = framework::MakeTensorImpl<pt::DenseTensor>(*in, in->place(),
+                                                           in->type());
+    auto pt_out = framework::MakeTensorImpl<pt::DenseTensor>(*out, out->place(),
+                                                             out->type());
+
+    const auto& dev_ctx = context.template device_context<DeviceContext>();
+    // call new kernel
+    pt::FillAnyLike<T>(dev_ctx, *pt_x, value, pt_out.get());
   }
 };
 
diff --git a/paddle/tcmpt/api/CMakeLists.txt b/paddle/tcmpt/api/CMakeLists.txt
index 740cfbc4212a1..f9a547edb18d5 100644
--- a/paddle/tcmpt/api/CMakeLists.txt
+++ b/paddle/tcmpt/api/CMakeLists.txt
@@ -10,12 +10,12 @@ add_subdirectory(src)
 # endfunction()
 
 set(TCMPT_DEPS convert_utils dense_tensor selected_rows_tensor kernel_factory kernel_context)
-set(TCMPT_DEPS ${TCMPT_DEPS} math_cpu linalg_cpu)
+set(TCMPT_DEPS ${TCMPT_DEPS} math_cpu linalg_cpu fill_cpu)
 if(WITH_GPU OR WITH_ROCM)
-  set(TCMPT_DEPS ${TCMPT_DEPS} math_cuda linalg_cuda)
+  set(TCMPT_DEPS ${TCMPT_DEPS} math_cuda linalg_cuda fill_cuda)
 endif()
 
-set(TCMPT_DEPS ${TCMPT_DEPS} math_api linalg_api)
+set(TCMPT_DEPS ${TCMPT_DEPS} math_api linalg_api fill_api)
 
 # TODO(chenweihang): unify decclare into **_library
 # declare_module(MathCPU)
diff --git a/paddle/tcmpt/api/all.h b/paddle/tcmpt/api/all.h
index 2b5524396072a..86959c8ae43dc 100644
--- a/paddle/tcmpt/api/all.h
+++ b/paddle/tcmpt/api/all.h
@@ -16,11 +16,13 @@ limitations under the License. */
 
 // develop apis
 #include "paddle/tcmpt/api/include/dev/core.h"
+#include "paddle/tcmpt/api/include/dev/creation.h"
 #include "paddle/tcmpt/api/include/dev/infershape.h"
 #include "paddle/tcmpt/api/include/dev/linalg.h"
 #include "paddle/tcmpt/api/include/dev/math.h"
 
 // user apis
+#include "paddle/tcmpt/api/include/creation.h"
 #include "paddle/tcmpt/api/include/linalg.h"
 #include "paddle/tcmpt/api/include/math.h"
 #include "paddle/tcmpt/api/include/tensor.h"
diff --git a/paddle/tcmpt/api/include/creation.h b/paddle/tcmpt/api/include/creation.h
new file mode 100644
index 0000000000000..e4f870039eba5
--- /dev/null
+++ b/paddle/tcmpt/api/include/creation.h
@@ -0,0 +1,23 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/tcmpt/api/include/tensor.h"
+
+namespace pt {
+
+Tensor full_like(const Tensor& x, float value);
+
+}  // namespace pt
diff --git a/paddle/tcmpt/api/include/dev/creation.h b/paddle/tcmpt/api/include/dev/creation.h
new file mode 100644
index 0000000000000..02b14c50e5c04
--- /dev/null
+++ b/paddle/tcmpt/api/include/dev/creation.h
@@ -0,0 +1,18 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/tcmpt/cpu/fill.h"
+#include "paddle/tcmpt/cuda/fill.h"
diff --git a/paddle/tcmpt/api/include/dev/symbols.h b/paddle/tcmpt/api/include/dev/symbols.h
index c590c95c1fc94..bfda326326b62 100644
--- a/paddle/tcmpt/api/include/dev/symbols.h
+++ b/paddle/tcmpt/api/include/dev/symbols.h
@@ -19,8 +19,10 @@ limitations under the License. */
 // symbol declare
 PT_DECLARE_MODULE(MathCPU);
 PT_DECLARE_MODULE(LinalgCPU);
+PT_DECLARE_MODULE(FillCPU);
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 PT_DECLARE_MODULE(MathCUDA);
 PT_DECLARE_MODULE(LinalgCUDA);
+PT_DECLARE_MODULE(FillCUDA);
 #endif
diff --git a/paddle/tcmpt/api/src/CMakeLists.txt b/paddle/tcmpt/api/src/CMakeLists.txt
index 3deb6a08dbc86..b8982b13800e1 100644
--- a/paddle/tcmpt/api/src/CMakeLists.txt
+++ b/paddle/tcmpt/api/src/CMakeLists.txt
@@ -1,7 +1,8 @@
 set(API_DEPS convert_utils dense_tensor selected_rows_tensor kernel_factory kernel_context)
-set(API_DEPS ${API_DEPS} math_cpu linalg_cpu)
+set(API_DEPS ${API_DEPS} math_cpu linalg_cpu fill_cpu)
 if(WITH_GPU OR WITH_ROCM)
-  set(API_DEPS ${API_DEPS} math_cuda linalg_cuda)
+  set(API_DEPS ${API_DEPS} math_cuda linalg_cuda fill_cuda)
 endif()
 cc_library(math_api SRCS math.cc DEPS ${API_DEPS})
 cc_library(linalg_api SRCS linalg.cc DEPS ${API_DEPS})
+cc_library(fill_api SRCS creation.cc DEPS ${API_DEPS})
diff --git a/paddle/tcmpt/api/src/creation.cc b/paddle/tcmpt/api/src/creation.cc
new file mode 100644
index 0000000000000..668b14776d70d
--- /dev/null
+++ b/paddle/tcmpt/api/src/creation.cc
@@ -0,0 +1,67 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/tcmpt/api/include/creation.h"
+
+#include <memory>
+
+#include "glog/logging.h"
+
+#include "paddle/tcmpt/api/include/dev/core.h"
+#include "paddle/tcmpt/api/include/dev/creation.h"
+#include "paddle/tcmpt/api/include/dev/infershape.h"
+#include "paddle/tcmpt/core/kernel_generate.h"
+
+namespace pt {
+
+Tensor full_like(const Tensor& x, float value) {
+  // 1. Get kernel signature and kernel
+  auto kernel_signature = ParseKernelNameAndKeyByArgs("fill_any_like", x);
+  VLOG(1) << kernel_signature.first;
+  VLOG(1) << kernel_signature.second;
+  VLOG(1) << KernelFactory::Instance();
+
+  auto kernel = KernelFactory::Instance().SelectKernelOrThrowError(
+      kernel_signature.first, kernel_signature.second);
+  VLOG(1) << kernel;
+
+  // 2. Get Device Context
+  auto* dev_ctx = GetDeviceContextByBackend(kernel_signature.second.backend());
+  auto kernel_context = KernelContext(*dev_ctx);
+
+  // 3. Auto data transform
+  auto dense_x = std::dynamic_pointer_cast<DenseTensor>(x.impl());
+  kernel_context.EmplaceBackInput(dense_x);
+
+  kernel_context.EmplaceBackAttr(value);
+
+  // 4. InferShape
+  auto out_dims = UnchangedInferShape(dense_x->dims());
+
+  // 5. Prepare outputs
+  pt::Tensor out;
+  auto out_def = kernel.args_def().output_defs()[0];
+  auto dense_out = std::make_shared<DenseTensor>(
+      TensorMeta(out_dims, out_def.backend, out_def.dtype, out_def.layout),
+      TensorStatus());
+  kernel_context.EmplaceBackOutput(dense_out);
+  out.set_impl(dense_out);
+
+  // 6. Call kernel
+  kernel(&kernel_context);
+
+  return out;
+}
+
+}  // namespace pt
diff --git a/paddle/tcmpt/core/kernel_utils.h b/paddle/tcmpt/core/kernel_utils.h
index ed863cbde14a6..05503dbd36116 100644
--- a/paddle/tcmpt/core/kernel_utils.h
+++ b/paddle/tcmpt/core/kernel_utils.h
@@ -158,6 +158,10 @@ struct KernelImpl<Return (*)(Args...), kernel_fn> {
 
   PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(bool);
   PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(float);
+  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(double);
+  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int);
+  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int64_t);
+  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(paddle::platform::float16);
 
   /* Output Helpers */
 
diff --git a/paddle/tcmpt/cpu/CMakeLists.txt b/paddle/tcmpt/cpu/CMakeLists.txt
index c53dd675862ca..261f8ddf940d9 100644
--- a/paddle/tcmpt/cpu/CMakeLists.txt
+++ b/paddle/tcmpt/cpu/CMakeLists.txt
@@ -6,3 +6,4 @@ endif()
 
 cc_library(math_cpu SRCS math.cc DEPS dense_tensor kernel_context kernel_factory eigen_function)
 cc_library(linalg_cpu SRCS linalg.cc DEPS dense_tensor kernel_context kernel_factory)
+cc_library(fill_cpu SRCS fill.cc DEPS dense_tensor kernel_context kernel_factory eigen_function)
diff --git a/paddle/tcmpt/cpu/fill.cc b/paddle/tcmpt/cpu/fill.cc
new file mode 100644
index 0000000000000..9b6d1dac7c961
--- /dev/null
+++ b/paddle/tcmpt/cpu/fill.cc
@@ -0,0 +1,48 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/tcmpt/cpu/fill.h"
+
+#include "paddle/tcmpt/core/kernel_registry.h"
+
+#include "paddle/tcmpt/eigen/fill.h"
+
+namespace pt {
+
+template <typename T>
+void FillAnyLike(const CPUContext& dev_ctx,
+                 const DenseTensor& x,
+                 float val,
+                 DenseTensor* out) {
+  PADDLE_ENFORCE_EQ(
+      std::isnan(val),
+      false,
+      paddle::platform::errors::InvalidArgument("The filled value is NaN."));
+  eigen::fill<CPUContext, T>(dev_ctx, out, val);
+}
+
+}  // namespace pt
+
+PT_REGISTER_MODULE(FillCPU);
+
+PT_REGISTER_KERNEL("fill_any_like",
+                   CPU,
+                   NCHW,
+                   pt::FillAnyLike,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   bool,
+                   paddle::platform::float16) {}
diff --git a/paddle/tcmpt/cpu/fill.h b/paddle/tcmpt/cpu/fill.h
new file mode 100644
index 0000000000000..090112911bbab
--- /dev/null
+++ b/paddle/tcmpt/cpu/fill.h
@@ -0,0 +1,31 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/tcmpt/core/dense_tensor.h"
+
+#include "paddle/fluid/platform/device_context.h"
+
+namespace pt {
+
+using CPUContext = paddle::platform::CPUDeviceContext;
+
+template <typename T>
+void FillAnyLike(const CPUContext& dev_ctx,
+                 const DenseTensor& x,
+                 float val,
+                 DenseTensor* out);
+
+}  // namespace pt
diff --git a/paddle/tcmpt/cpu/math.cc b/paddle/tcmpt/cpu/math.cc
index 9dc85d10dc171..1c27c9e53005c 100644
--- a/paddle/tcmpt/cpu/math.cc
+++ b/paddle/tcmpt/cpu/math.cc
@@ -26,7 +26,7 @@ namespace pt {
 
 template <typename T>
 void Sign(const CPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
-  module::Sign<CPUContext, T>(dev_ctx, x, out);
+  eigen::Sign<CPUContext, T>(dev_ctx, x, out);
 }
 
 template <typename T>
@@ -41,7 +41,7 @@ void Scale(const CPUContext& dev_ctx,
            float bias,
            bool bias_after_scale,
            DenseTensor* out) {
-  module::Scale<CPUContext, T>(dev_ctx, x, scale, bias, bias_after_scale, out);
+  eigen::Scale<CPUContext, T>(dev_ctx, x, scale, bias, bias_after_scale, out);
 }
 
 template <typename T>
@@ -66,12 +66,12 @@ void ScaleHost(const CPUContext& dev_ctx,
                float bias,
                bool bias_after_scale,
                DenseTensor* out) {
-  module::Scale<CPUContext, T>(dev_ctx,
-                               x,
-                               static_cast<float>(*scale.data<T>()),
-                               bias,
-                               bias_after_scale,
-                               out);
+  eigen::Scale<CPUContext, T>(dev_ctx,
+                              x,
+                              static_cast<float>(*scale.data<T>()),
+                              bias,
+                              bias_after_scale,
+                              out);
 }
 
 template <typename T>
diff --git a/paddle/tcmpt/cuda/CMakeLists.txt b/paddle/tcmpt/cuda/CMakeLists.txt
index f3d52c6ec6bf4..491b6d25b229b 100644
--- a/paddle/tcmpt/cuda/CMakeLists.txt
+++ b/paddle/tcmpt/cuda/CMakeLists.txt
@@ -7,7 +7,9 @@ endif()
 if(WITH_GPU)
   nv_library(math_cuda SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory)
   nv_library(linalg_cuda SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
+  nv_library(fill_cuda SRCS fill.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
 elseif(WITH_ROCM)
   hip_library(math_cuda SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory)
   hip_library(linalg_cuda SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
+  hip_library(fill_cuda SRCS fill.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
 endif()
diff --git a/paddle/tcmpt/cuda/fill.cu b/paddle/tcmpt/cuda/fill.cu
new file mode 100644
index 0000000000000..168af31c1cf81
--- /dev/null
+++ b/paddle/tcmpt/cuda/fill.cu
@@ -0,0 +1,48 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/tcmpt/cuda/fill.h"
+
+#include "paddle/tcmpt/core/kernel_registry.h"
+
+#include "paddle/tcmpt/eigen/fill.h"
+
+namespace pt {
+
+template <typename T>
+void FillAnyLike(const CUDAContext& dev_ctx,
+                 const DenseTensor& x,
+                 float val,
+                 DenseTensor* out) {
+  PADDLE_ENFORCE_EQ(
+      std::isnan(val),
+      false,
+      paddle::platform::errors::InvalidArgument("The filled value is NaN."));
+  eigen::fill<CUDAContext, T>(dev_ctx, out, val);
+}
+
+}  // namespace pt
+
+PT_REGISTER_MODULE(FillCUDA);
+
+PT_REGISTER_KERNEL("fill_any_like",
+                   CUDA,
+                   NCHW,
+                   pt::FillAnyLike,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   bool,
+                   paddle::platform::float16) {}
diff --git a/paddle/tcmpt/cuda/fill.h b/paddle/tcmpt/cuda/fill.h
new file mode 100644
index 0000000000000..ff26ca11ca2a5
--- /dev/null
+++ b/paddle/tcmpt/cuda/fill.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+// CUDA and HIP use same api
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+
+#include "paddle/tcmpt/core/dense_tensor.h"
+
+#include "paddle/fluid/platform/device_context.h"
+
+namespace pt {
+
+using CUDAContext = paddle::platform::CUDADeviceContext;
+
+template <typename T>
+void FillAnyLike(const CUDAContext& dev_ctx,
+                 const DenseTensor& x,
+                 float val,
+                 DenseTensor* out);
+
+}  // namespace pt
+
+#endif
diff --git a/paddle/tcmpt/cuda/math.cu b/paddle/tcmpt/cuda/math.cu
index 474a72f22e930..15aa8c6966977 100644
--- a/paddle/tcmpt/cuda/math.cu
+++ b/paddle/tcmpt/cuda/math.cu
@@ -53,7 +53,7 @@ struct DivideFunctor {
 
 template <typename T>
 void Sign(const CUDAContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
-  module::Sign<CUDAContext, T>(dev_ctx, x, out);
+  eigen::Sign<CUDAContext, T>(dev_ctx, x, out);
 }
 
 template <typename T>
@@ -94,7 +94,7 @@ void Scale(const CUDAContext& dev_ctx,
            float bias,
            bool bias_after_scale,
            DenseTensor* out) {
-  module::Scale<CUDAContext, T>(dev_ctx, x, scale, bias, bias_after_scale, out);
+  eigen::Scale<CUDAContext, T>(dev_ctx, x, scale, bias, bias_after_scale, out);
 }
 
 template <typename T>
@@ -120,12 +120,12 @@ void ScaleHost(const CUDAContext& dev_ctx,
   if (paddle::platform::is_gpu_place(scale.place())) {
     throw std::runtime_error("scale host place error.");
   }
-  module::Scale<CUDAContext, T>(dev_ctx,
-                                x,
-                                static_cast<float>(*scale.data<T>()),
-                                bias,
-                                bias_after_scale,
-                                out);
+  eigen::Scale<CUDAContext, T>(dev_ctx,
+                               x,
+                               static_cast<float>(*scale.data<T>()),
+                               bias,
+                               bias_after_scale,
+                               out);
 }
 
 template <typename T>
diff --git a/paddle/tcmpt/eigen/fill.h b/paddle/tcmpt/eigen/fill.h
new file mode 100644
index 0000000000000..6a21ca6932cd5
--- /dev/null
+++ b/paddle/tcmpt/eigen/fill.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/tcmpt/core/dense_tensor.h"
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/operators/eigen/eigen_function.h"
+
+namespace pt {
+namespace eigen {
+
+template <typename DeviceContext, typename T, typename VType>
+void fill(const DeviceContext& context, DenseTensor* tensor, VType val) {
+  tensor->mutable_data<T>();
+
+  using CommonType = typename std::common_type<
+      float,
+      typename std::conditional<
+          std::is_same<T, paddle::platform::float16>::value,
+          float,
+          T>::type>::type;
+
+  auto common_type_value = static_cast<CommonType>(val);
+
+  PADDLE_ENFORCE_EQ(
+      (common_type_value >=
+       static_cast<CommonType>(std::numeric_limits<T>::lowest())) &&
+          (common_type_value <=
+           static_cast<CommonType>(std::numeric_limits<T>::max())),
+      true,
+      paddle::platform::errors::InvalidArgument(
+          "The filled value is out of range for target type, "
+          "current kernel type is %s, the range should between %f "
+          "and %f, but now value is %f.",
+          typeid(T).name(),
+          static_cast<CommonType>(std::numeric_limits<T>::lowest()),
+          static_cast<CommonType>(std::numeric_limits<T>::max()),
+          static_cast<float>(val)));
+
+  auto t = paddle::framework::EigenVector<T>::Flatten(*tensor);
+  t.device(*context.eigen_device()) = t.constant(static_cast<T>(val));
+}
+
+}  // namespace eigen
+}  // namespace pt
diff --git a/paddle/tcmpt/eigen/scale.h b/paddle/tcmpt/eigen/scale.h
index d822256673201..5bea4fb300af4 100644
--- a/paddle/tcmpt/eigen/scale.h
+++ b/paddle/tcmpt/eigen/scale.h
@@ -21,7 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/eigen/eigen_function.h"
 
 namespace pt {
-namespace module {
+namespace eigen {
 
 template <typename DevCtx, typename T>
 void Scale(const DevCtx& dev_ctx,
@@ -47,5 +47,5 @@ void Scale(const DevCtx& dev_ctx,
       bias_after_scale);
 }
 
-}  // namespace module
+}  // namespace eigen
 }  // namespace pt
diff --git a/paddle/tcmpt/eigen/sign.h b/paddle/tcmpt/eigen/sign.h
index 10a11dff038ca..b138123e81ee0 100644
--- a/paddle/tcmpt/eigen/sign.h
+++ b/paddle/tcmpt/eigen/sign.h
@@ -21,7 +21,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/eigen/eigen_function.h"
 
 namespace pt {
-namespace module {
+namespace eigen {
 
 template <typename DevCtx, typename T>
 void Sign(const DevCtx& dev_ctx, const DenseTensor& x, DenseTensor* out) {
@@ -41,5 +41,5 @@ void Sign(const DevCtx& dev_ctx, const DenseTensor& x, DenseTensor* out) {
       dev, eigen_out, eigen_x);
 }
 
-}  // namespace module
+}  // namespace eigen
 }  // namespace pt
diff --git a/paddle/tcmpt/tests/CMakeLists.txt b/paddle/tcmpt/tests/CMakeLists.txt
index aeeec69adc8e3..96df8853f3b26 100644
--- a/paddle/tcmpt/tests/CMakeLists.txt
+++ b/paddle/tcmpt/tests/CMakeLists.txt
@@ -2,3 +2,4 @@ cc_test(dense_tensor_test SRCS dense_tensor_test.cc DEPS dense_tensor)
 cc_test(kernel_factory_test SRCS kernel_factory_test.cc DEPS kernel_factory)
 cc_test(test_mean_api SRCS test_mean_api.cc DEPS math_api)
 cc_test(test_dot_api SRCS test_dot_api.cc DEPS linalg_api)
+cc_test(test_fill_api SRCS test_fill_api.cc DEPS fill_api)
diff --git a/paddle/tcmpt/tests/test_dot_api.cc b/paddle/tcmpt/tests/test_dot_api.cc
index fafd095d02166..ee541a5a1feed 100644
--- a/paddle/tcmpt/tests/test_dot_api.cc
+++ b/paddle/tcmpt/tests/test_dot_api.cc
@@ -15,11 +15,18 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <memory>
 
-#include "paddle/tcmpt/api/include/dev/symbols.h"
 #include "paddle/tcmpt/api/include/linalg.h"
 
 #include "paddle/tcmpt/core/dense_tensor.h"
 
+#include "paddle/tcmpt/core/kernel_registry.h"
+
+PT_DECLARE_MODULE(LinalgCPU);
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PT_DECLARE_MODULE(LinalgCUDA);
+#endif
+
 namespace framework = paddle::framework;
 using DDim = paddle::framework::DDim;
 
diff --git a/paddle/tcmpt/tests/test_fill_api.cc b/paddle/tcmpt/tests/test_fill_api.cc
new file mode 100644
index 0000000000000..9b9add32f5b2b
--- /dev/null
+++ b/paddle/tcmpt/tests/test_fill_api.cc
@@ -0,0 +1,65 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <memory>
+
+#include "paddle/tcmpt/api/include/creation.h"
+
+#include "paddle/tcmpt/core/dense_tensor.h"
+
+#include "paddle/tcmpt/core/kernel_registry.h"
+
+PT_DECLARE_MODULE(FillCPU);
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PT_DECLARE_MODULE(FillCUDA);
+#endif
+
+namespace framework = paddle::framework;
+using DDim = paddle::framework::DDim;
+
+TEST(API, fill) {
+  // 1. create tensor
+  auto dense_x = std::make_shared<pt::DenseTensor>(
+      pt::TensorMeta(framework::make_ddim({3, 2}),
+                     pt::Backend::kCPU,
+                     pt::DataType::kFLOAT32,
+                     pt::DataLayout::kNCHW),
+      pt::TensorStatus());
+  auto* dense_x_data = dense_x->mutable_data<float>();
+  dense_x_data[0] = 0;
+
+  float val = 1.0;
+
+  pt::Tensor x(dense_x);
+
+  // 2. test API
+  auto out = pt::full_like(x, val);
+
+  // 3. check result
+  ASSERT_EQ(out.shape().size(), 2);
+  ASSERT_EQ(out.shape()[0], 3);
+  ASSERT_EQ(out.numel(), 6);
+  ASSERT_EQ(out.is_cpu(), true);
+  ASSERT_EQ(out.type(), pt::DataType::kFLOAT32);
+  ASSERT_EQ(out.layout(), pt::DataLayout::kNCHW);
+  ASSERT_EQ(out.initialized(), true);
+
+  auto dense_out = std::dynamic_pointer_cast<pt::DenseTensor>(out.impl());
+  auto* actual_result = dense_out->data<float>();
+  for (auto i = 0; i < 6; i++) {
+    ASSERT_NEAR(actual_result[i], val, 1e-6f);
+  }
+}
diff --git a/paddle/tcmpt/tests/test_mean_api.cc b/paddle/tcmpt/tests/test_mean_api.cc
index 293f302cbead4..c3c993130d030 100644
--- a/paddle/tcmpt/tests/test_mean_api.cc
+++ b/paddle/tcmpt/tests/test_mean_api.cc
@@ -15,11 +15,18 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <memory>
 
-#include "paddle/tcmpt/api/include/dev/symbols.h"
 #include "paddle/tcmpt/api/include/math.h"
 
 #include "paddle/tcmpt/core/dense_tensor.h"
 
+#include "paddle/tcmpt/core/kernel_registry.h"
+
+PT_DECLARE_MODULE(MathCPU);
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PT_DECLARE_MODULE(MathCUDA);
+#endif
+
 namespace framework = paddle::framework;
 using DDim = paddle::framework::DDim;
 

From 97898902f644af380b293162322e35df26f2a344 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Mon, 11 Oct 2021 09:57:40 +0000
Subject: [PATCH 073/125] skip dtype for fill_any_like

---
 paddle/fluid/imperative/kernel_args_names_maker.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/paddle/fluid/imperative/kernel_args_names_maker.h b/paddle/fluid/imperative/kernel_args_names_maker.h
index b1fcf935426e6..5863f3cae95c2 100644
--- a/paddle/fluid/imperative/kernel_args_names_maker.h
+++ b/paddle/fluid/imperative/kernel_args_names_maker.h
@@ -125,6 +125,12 @@ class KernelArgsNameMakerByOpProto : public KernelArgsNameMaker {
                 << attr_to_host_tensor.at(attr.name()) << " exists.";
         continue;
       }
+      // TODO(chenweihang): we need better methods to deal with special cases
+      if (attr.name() == "dtype") {
+        VLOG(1) << "Dygraph PtKernel attribute: skip " << op_proto_->type()
+                << "'s dtype attr.";
+        continue;
+      }
       VLOG(1) << "Dygraph PtKernel attribute: " << attr.name();
       attr_names.emplace_back(
           std::pair<std::string, framework::proto::AttrType>(attr.name(),

From 9b332702f40a5fa2f4aac23b80d1fb5bc7d24ee8 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Mon, 11 Oct 2021 12:58:13 +0000
Subject: [PATCH 074/125] add attrs for kernel key constrcut

---
 paddle/fluid/framework/operator.cc           | 21 +++++++++++++++-----
 paddle/fluid/framework/operator.h            |  3 ++-
 paddle/fluid/imperative/prepared_operator.cc |  4 +++-
 3 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 1e6ca38ce35f2..dd883843e0fb3 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1320,8 +1320,8 @@ void OperatorWithKernel::ChoosePtKernel(
       ConstructPtKernelName(Type(), *(Info().proto_), ctx.inputs);
 
   // 2. construct op kernel key
-  pt_kernel_key_.reset(
-      new pt::KernelKey(ConstructPtKernelKey(ctx.inputs, dev_ctx.GetPlace())));
+  pt_kernel_key_.reset(new pt::KernelKey(
+      ConstructPtKernelKey(ctx.inputs, Attrs(), dev_ctx.GetPlace())));
 
   // 3. selecte op kernel
   pt_kernel_.reset(new pt::Kernel(pt::KernelFactory::Instance().SelectKernel(
@@ -1837,12 +1837,16 @@ OpKernelType OperatorWithKernel::GetKernelTypeForVar(
 }
 
 pt::KernelKey OperatorWithKernel::ConstructPtKernelKey(
-    const VariableValueMap& inputs, const platform::Place& ctx_place) const {
+    const VariableValueMap& inputs, const AttributeMap& attrs,
+    const platform::Place& ctx_place) const {
   // 1. get backend based place and attrs
+  auto attr_reader = AttrReader(attrs);
   pt::Backend backend = pt::TransToPtBackend(ctx_place);
-  if (HasAttr("use_mkldnn") && Attr<bool>("use_mkldnn") == true) {
+  if (attrs.count("use_mkldnn") != 0 &&
+      attr_reader.Get<bool>("use_mkldnn") == true) {
     backend = pt::Backend::kMKLDNN;
-  } else if (HasAttr("use_cudnn") && Attr<bool>("use_cudnn") == true) {
+  } else if (attrs.count("use_cudnn") != 0 &&
+             attr_reader.Get<bool>("use_cudnn") == true) {
     backend = pt::Backend::kCUDNN;
   } else {
     // do nothing
@@ -1870,6 +1874,13 @@ pt::KernelKey OperatorWithKernel::ConstructPtKernelKey(
           "DataType should be indicated by input Variable at %s.", Type()));
   pt::DataType dtype = pt::TransToPtDataType(data_type);
 
+  // TODO(chenweihang): polish special dtype rules
+  if (attrs.count("dtype") != 0 &&
+      attr_reader.Get<int>("dtype") != static_cast<int>(data_type)) {
+    dtype = pt::TransToPtDataType(static_cast<framework::proto::VarType::Type>(
+        attr_reader.Get<int>("dtype")));
+  }
+
   // 4. build pt KernelKey
   return pt::KernelKey(backend, layout, dtype);
 }
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 09bfc65a17f0b..4e190d3d6c027 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -536,7 +536,8 @@ class OperatorWithKernel : public OperatorBase {
   /* member functions for adapting to tcmpt lib */
   // TODO(chenweihang): Temporarily as a class method
   virtual pt::KernelKey ConstructPtKernelKey(
-      const VariableValueMap& inputs, const platform::Place& ctx_place) const;
+      const VariableValueMap& inputs, const AttributeMap& attrs,
+      const platform::Place& ctx_place) const;
 
   virtual pt::KernelContext ConstructPtKernelContext(
       const RuntimeContext& ctx, const platform::DeviceContext& dev_ctx) const;
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index f05d6b2b2e962..34ab31846b289 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -209,7 +209,9 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
     auto kernel_name =
         ConstructPtKernelName<VarType>(op.Type(), (*op.Info().proto_), ins);
     auto inputs = BuildInputMap<VarType>(ins);
-    auto pt_kernel_key = op.ConstructPtKernelKey(inputs, place);
+    // we only need attrs here
+    // auto final_attrs = BuildAttrMap(attrs, default_attrs);
+    auto pt_kernel_key = op.ConstructPtKernelKey(inputs, attrs, place);
     auto pt_kernel =
         pt::KernelFactory::Instance().SelectKernel(kernel_name, pt_kernel_key);
     // for debug

From aa6ed57438de5a9c1a68f1c828370704f6d4ba07 Mon Sep 17 00:00:00 2001
From: chentianyu03 <ctychentianyu@gmail.com>
Date: Tue, 12 Oct 2021 13:25:25 +0800
Subject: [PATCH 075/125] add use_pt_kernel Flags to control whether to use pt
 kernel (#13)

* add use_pt_kernel Flags to control whether to use pt kernel

* change the default value to true for cheking pt kernels
---
 paddle/fluid/framework/operator.cc           |  4 +++-
 paddle/fluid/imperative/prepared_operator.cc |  4 +++-
 paddle/fluid/platform/flags.cc               | 14 ++++++++++++++
 3 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index dd883843e0fb3..eb1889ae1d8ef 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -51,6 +51,7 @@ DECLARE_bool(check_nan_inf);
 DECLARE_bool(enable_unused_var_check);
 PADDLE_DEFINE_EXPORTED_int32(inner_op_parallelism, 0,
                              "number of threads for inner op");
+DECLARE_bool(use_pt_kernel);
 
 namespace paddle {
 namespace framework {
@@ -1155,7 +1156,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   // phase
 
   // VLOG(1) << "Pt KernelFactory: " << pt::KernelFactory::Instance();
-  if (pt::KernelFactory::Instance().ContainsKernel(type_.c_str())) {
+  if (FLAGS_use_pt_kernel &&
+      pt::KernelFactory::Instance().ContainsKernel(type_.c_str())) {
     if (pt_kernel_key_.get() == nullptr || pt_kernel_.get() == nullptr) {
       ChoosePtKernel(*runtime_ctx, *dev_ctx);
     }
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 34ab31846b289..645343316a5b9 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -22,6 +22,7 @@
 #include "paddle/fluid/platform/xpu/xpu_op_list.h"
 #endif
 DECLARE_bool(check_nan_inf);
+DECLARE_bool(use_pt_kernel);
 
 namespace paddle {
 namespace imperative {
@@ -205,7 +206,8 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
 #endif
 
   // 1. get expected kernel key
-  if (pt::KernelFactory::Instance().ContainsKernel(op.Type().c_str())) {
+  if (FLAGS_use_pt_kernel &&
+      pt::KernelFactory::Instance().ContainsKernel(op.Type().c_str())) {
     auto kernel_name =
         ConstructPtKernelName<VarType>(op.Type(), (*op.Info().proto_), ins);
     auto inputs = BuildInputMap<VarType>(ins);
diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
index b97c3106439be..cfd03ca8df6aa 100644
--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -673,3 +673,17 @@ PADDLE_DEFINE_EXPORTED_int32(get_host_by_name_time, 120,
 PADDLE_DEFINE_EXPORTED_bool(
     apply_pass_to_program, false,
     "It controls whether to apply IR pass to program when using Fleet APIs");
+
+/**
+ * Pt kernel related FLAG
+ * Name: FLAGS_use_pt_kernel
+ * Since Version: 2.2.0
+ * Value Range: bool, default=false
+ * Example: FLAGS_use_pt_kernel=true would use the pt kernel to compute in the
+ * Op.
+ * Note:
+ */
+// TODO(chentianyu03): change default value to false before merge into develop
+// branch
+PADDLE_DEFINE_EXPORTED_bool(use_pt_kernel, true,
+                            "It controls whether to use pt kernel");

From 9db8e4ad29208ce39a097b97323f0e167ee519c6 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Tue, 12 Oct 2021 06:17:03 +0000
Subject: [PATCH 076/125] fix mutable_data cuda place error

---
 paddle/tcmpt/core/dense_tensor.cc | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/paddle/tcmpt/core/dense_tensor.cc b/paddle/tcmpt/core/dense_tensor.cc
index d5306f08f0b54..921f0ee8d9102 100644
--- a/paddle/tcmpt/core/dense_tensor.cc
+++ b/paddle/tcmpt/core/dense_tensor.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/platform/place.h"
 
 namespace pt {
@@ -57,16 +58,18 @@ Place DenseTensor::GetPlaceByBackend() const {
       return CPUPlace();
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     case Backend::kCUDA:
-      return CUDAPlace();
+      return CUDAPlace(paddle::platform::GetCurrentDeviceId());
     case Backend::kCUDAPinned:
       return CUDAPinnedPlace();
 #endif
 #ifdef PADDLE_WITH_XPU
     case Backend::kXPU:
+      // TODO(chenweihang): add device id
       return XPUPlace();
 #endif
 #ifdef PADDLE_WITH_NPU
     case Backend::kNPU:
+      // TODO(chenweihang): add device id
       return NPUPlace();
     case Backend::kNPUPinned:
       return NPUPinnedPlace();

From c882b5cb5fac8b806588e54c36293dc7958695ee Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Wed, 13 Oct 2021 07:02:48 +0000
Subject: [PATCH 077/125] move high level apis into hapi

---
 paddle/fluid/framework/operator.h             |  2 +-
 paddle/fluid/framework/tcmpt_utils.cc         |  4 +--
 paddle/fluid/framework/tcmpt_utils.h          |  2 +-
 paddle/fluid/imperative/prepared_operator.h   |  2 +-
 paddle/fluid/operators/dot_op.h               |  4 +--
 paddle/fluid/operators/fill_any_like_op.h     |  4 +--
 paddle/fluid/operators/mean_op.h              |  4 +--
 paddle/fluid/operators/scale_op.h             |  4 +--
 paddle/fluid/operators/sign_op.h              |  4 +--
 paddle/tcmpt/CMakeLists.txt                   |  2 ++
 paddle/tcmpt/api/CMakeLists.txt               | 18 ++++------
 paddle/tcmpt/api/all.h                        | 10 ++----
 paddle/tcmpt/api/include/{dev => }/core.h     |  0
 paddle/tcmpt/api/include/creation.h           |  9 ++---
 .../tcmpt/api/include/{dev => }/infershape.h  |  0
 paddle/tcmpt/api/include/linalg.h             | 10 ++----
 paddle/tcmpt/api/include/math.h               | 10 ++----
 paddle/tcmpt/api/include/{dev => }/symbols.h  |  4 +--
 paddle/tcmpt/api/src/CMakeLists.txt           |  8 -----
 paddle/tcmpt/cpu/CMakeLists.txt               |  2 +-
 paddle/tcmpt/cpu/{fill.cc => creation.cc}     |  5 ++-
 paddle/tcmpt/cpu/{fill.h => creation.h}       |  0
 paddle/tcmpt/cuda/CMakeLists.txt              |  4 +--
 paddle/tcmpt/cuda/{fill.cu => creation.cu}    |  5 ++-
 paddle/tcmpt/cuda/{fill.h => creation.h}      |  0
 paddle/tcmpt/hapi/CMakeLists.txt              |  3 ++
 paddle/tcmpt/hapi/all.cc                      | 19 ++++++++++
 paddle/tcmpt/hapi/all.h                       | 21 +++++++++++
 .../include/dev => hapi/include}/creation.h   | 11 ++++--
 .../include/dev => hapi/include}/linalg.h     | 12 +++++--
 .../{api/include/dev => hapi/include}/math.h  | 12 +++++--
 paddle/tcmpt/{api => hapi}/include/tensor.h   | 28 ++++++++-------
 paddle/tcmpt/hapi/lib/CMakeLists.txt          |  3 ++
 .../tcmpt/{api/src => hapi/lib}/creation.cc   | 33 ++++++++---------
 .../{core => hapi/lib}/kernel_generate.h      | 29 ++++++++-------
 paddle/tcmpt/{api/src => hapi/lib}/linalg.cc  | 36 +++++++++----------
 paddle/tcmpt/{api/src => hapi/lib}/math.cc    | 33 ++++++++---------
 paddle/tcmpt/tests/CMakeLists.txt             |  2 +-
 paddle/tcmpt/tests/test_dot_api.cc            |  9 +++--
 paddle/tcmpt/tests/test_fill_api.cc           | 11 +++---
 paddle/tcmpt/tests/test_mean_api.cc           |  7 ++--
 41 files changed, 213 insertions(+), 173 deletions(-)
 rename paddle/tcmpt/api/include/{dev => }/core.h (100%)
 rename paddle/tcmpt/api/include/{dev => }/infershape.h (100%)
 rename paddle/tcmpt/api/include/{dev => }/symbols.h (92%)
 delete mode 100644 paddle/tcmpt/api/src/CMakeLists.txt
 rename paddle/tcmpt/cpu/{fill.cc => creation.cc} (95%)
 rename paddle/tcmpt/cpu/{fill.h => creation.h} (100%)
 rename paddle/tcmpt/cuda/{fill.cu => creation.cu} (95%)
 rename paddle/tcmpt/cuda/{fill.h => creation.h} (100%)
 create mode 100644 paddle/tcmpt/hapi/CMakeLists.txt
 create mode 100644 paddle/tcmpt/hapi/all.cc
 create mode 100644 paddle/tcmpt/hapi/all.h
 rename paddle/tcmpt/{api/include/dev => hapi/include}/creation.h (76%)
 rename paddle/tcmpt/{api/include/dev => hapi/include}/linalg.h (76%)
 rename paddle/tcmpt/{api/include/dev => hapi/include}/math.h (77%)
 rename paddle/tcmpt/{api => hapi}/include/tensor.h (90%)
 create mode 100644 paddle/tcmpt/hapi/lib/CMakeLists.txt
 rename paddle/tcmpt/{api/src => hapi/lib}/creation.cc (65%)
 rename paddle/tcmpt/{core => hapi/lib}/kernel_generate.h (84%)
 rename paddle/tcmpt/{api/src => hapi/lib}/linalg.cc (65%)
 rename paddle/tcmpt/{api/src => hapi/lib}/math.cc (67%)

diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 4e190d3d6c027..b844c2cf61407 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -39,7 +39,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/variant.h"
 #include "paddle/utils/flat_hash_map.h"
 
-#include "paddle/tcmpt/api/include/dev/core.h"
+#include "paddle/tcmpt/api/include/core.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/tcmpt_utils.cc b/paddle/fluid/framework/tcmpt_utils.cc
index 6854ed7e63d26..799fecfa442c2 100644
--- a/paddle/fluid/framework/tcmpt_utils.cc
+++ b/paddle/fluid/framework/tcmpt_utils.cc
@@ -17,8 +17,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/selected_rows.h"
 
 #include "paddle/fluid/framework/variable.h"
-#include "paddle/tcmpt/api/include/dev/core.h"
-#include "paddle/tcmpt/api/include/dev/symbols.h"
+#include "paddle/tcmpt/api/include/core.h"
+#include "paddle/tcmpt/api/include/symbols.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/framework/tcmpt_utils.h b/paddle/fluid/framework/tcmpt_utils.h
index b677c0a3e4938..0af8cd30bd34d 100644
--- a/paddle/fluid/framework/tcmpt_utils.h
+++ b/paddle/fluid/framework/tcmpt_utils.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/place.h"
 
-#include "paddle/tcmpt/api/include/dev/core.h"
+#include "paddle/tcmpt/api/include/core.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h
index 4cc0bce603249..d6ea055cecff2 100644
--- a/paddle/fluid/imperative/prepared_operator.h
+++ b/paddle/fluid/imperative/prepared_operator.h
@@ -26,7 +26,7 @@
 #include "paddle/fluid/imperative/type_defs.h"
 
 #include "paddle/fluid/imperative/kernel_args_names_maker.h"
-#include "paddle/tcmpt/api/include/dev/core.h"
+#include "paddle/tcmpt/api/include/core.h"
 
 DECLARE_bool(use_mkldnn);
 
diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h
index 7655c4b97be81..a427da4f40f9f 100644
--- a/paddle/fluid/operators/dot_op.h
+++ b/paddle/fluid/operators/dot_op.h
@@ -21,8 +21,8 @@
 #include "paddle/fluid/platform/for_range.h"
 
 // only can include the headers in paddle/tcmpt/api dirs
-#include "paddle/tcmpt/api/include/dev/core.h"
-#include "paddle/tcmpt/api/include/dev/linalg.h"
+#include "paddle/tcmpt/api/include/core.h"
+#include "paddle/tcmpt/api/include/linalg.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/fill_any_like_op.h b/paddle/fluid/operators/fill_any_like_op.h
index e8dad87d9644a..c1c7152581ce5 100644
--- a/paddle/fluid/operators/fill_any_like_op.h
+++ b/paddle/fluid/operators/fill_any_like_op.h
@@ -19,8 +19,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tcmpt_utils.h"
 
-#include "paddle/tcmpt/api/include/dev/core.h"
-#include "paddle/tcmpt/api/include/dev/creation.h"
+#include "paddle/tcmpt/api/include/core.h"
+#include "paddle/tcmpt/api/include/creation.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/mean_op.h b/paddle/fluid/operators/mean_op.h
index dec0f4dd22f4c..1ae6f453a873e 100644
--- a/paddle/fluid/operators/mean_op.h
+++ b/paddle/fluid/operators/mean_op.h
@@ -18,8 +18,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/tcmpt_utils.h"
 
 // only can include the headers in paddle/top/api dirs
-#include "paddle/tcmpt/api/include/dev/core.h"
-#include "paddle/tcmpt/api/include/dev/math.h"
+#include "paddle/tcmpt/api/include/core.h"
+#include "paddle/tcmpt/api/include/math.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h
index aca28f1212ce8..ffc2a49232cd8 100644
--- a/paddle/fluid/operators/scale_op.h
+++ b/paddle/fluid/operators/scale_op.h
@@ -18,8 +18,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/tcmpt_utils.h"
 
 // only can include the headers in paddle/top/api dirs
-#include "paddle/tcmpt/api/include/dev/core.h"
-#include "paddle/tcmpt/api/include/dev/math.h"
+#include "paddle/tcmpt/api/include/core.h"
+#include "paddle/tcmpt/api/include/math.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/sign_op.h b/paddle/fluid/operators/sign_op.h
index 4b5d89b9b566c..bb439839bd330 100644
--- a/paddle/fluid/operators/sign_op.h
+++ b/paddle/fluid/operators/sign_op.h
@@ -20,8 +20,8 @@ limitations under the License. */
 #include "paddle/fluid/operators/eigen/eigen_function.h"
 
 // only can include the headers in paddle/tcmpt/api dirs
-#include "paddle/tcmpt/api/include/dev/core.h"
-#include "paddle/tcmpt/api/include/dev/math.h"
+#include "paddle/tcmpt/api/include/core.h"
+#include "paddle/tcmpt/api/include/math.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/tcmpt/CMakeLists.txt b/paddle/tcmpt/CMakeLists.txt
index 329728d422c3f..c21428ef4715b 100644
--- a/paddle/tcmpt/CMakeLists.txt
+++ b/paddle/tcmpt/CMakeLists.txt
@@ -1,6 +1,8 @@
 include(tcmpt)
 # tcmpt api
 add_subdirectory(api)
+# tcmpt high level api
+add_subdirectory(hapi)
 # tcmpt core components
 add_subdirectory(core)
 # tcmpt eigne functors, now paddle must compiled with eigen, but eigen just is
diff --git a/paddle/tcmpt/api/CMakeLists.txt b/paddle/tcmpt/api/CMakeLists.txt
index f9a547edb18d5..4eee2c538d716 100644
--- a/paddle/tcmpt/api/CMakeLists.txt
+++ b/paddle/tcmpt/api/CMakeLists.txt
@@ -1,5 +1,3 @@
-add_subdirectory(src)
-
 # set(declare_file ${PADDLE_BINARY_DIR}/paddle/tcmpt/api/symbols.h.tmp CACHE INTERNAL "symbols.h file")
 # set(declare_file_final ${PADDLE_BINARY_DIR}/paddle/tcmpt/api/symbols.h)
 # file(WRITE ${declare_file} "// Generated by the paddle/tcmpt/api/CMakeLists.txt.  DO NOT EDIT!\n\n")
@@ -9,18 +7,14 @@ add_subdirectory(src)
 #     message(STATUS "")
 # endfunction()
 
-set(TCMPT_DEPS convert_utils dense_tensor selected_rows_tensor kernel_factory kernel_context)
-set(TCMPT_DEPS ${TCMPT_DEPS} math_cpu linalg_cpu fill_cpu)
-if(WITH_GPU OR WITH_ROCM)
-  set(TCMPT_DEPS ${TCMPT_DEPS} math_cuda linalg_cuda fill_cuda)
-endif()
-
-set(TCMPT_DEPS ${TCMPT_DEPS} math_api linalg_api fill_api)
-
 # TODO(chenweihang): unify decclare into **_library
 # declare_module(MathCPU)
 # declare_module(MathCUDA)
 
-cc_library(tcmpt SRCS all.cc DEPS ${TCMPT_DEPS})
+set(TCMPT_DEPS convert_utils dense_tensor selected_rows_tensor kernel_factory kernel_context)
+set(TCMPT_DEPS ${TCMPT_DEPS} math_cpu linalg_cpu creation_cpu)
+if(WITH_GPU OR WITH_ROCM)
+  set(TCMPT_DEPS ${TCMPT_DEPS} math_cuda linalg_cuda creation_cuda)
+endif()
 
-# copy_if_different(${declare_file} ${declare_file_final})
+cc_library(tcmpt SRCS all.cc DEPS ${TCMPT_DEPS})
diff --git a/paddle/tcmpt/api/all.h b/paddle/tcmpt/api/all.h
index 86959c8ae43dc..42079764bfe83 100644
--- a/paddle/tcmpt/api/all.h
+++ b/paddle/tcmpt/api/all.h
@@ -15,14 +15,8 @@ limitations under the License. */
 #pragma once
 
 // develop apis
-#include "paddle/tcmpt/api/include/dev/core.h"
-#include "paddle/tcmpt/api/include/dev/creation.h"
-#include "paddle/tcmpt/api/include/dev/infershape.h"
-#include "paddle/tcmpt/api/include/dev/linalg.h"
-#include "paddle/tcmpt/api/include/dev/math.h"
-
-// user apis
+#include "paddle/tcmpt/api/include/core.h"
 #include "paddle/tcmpt/api/include/creation.h"
+#include "paddle/tcmpt/api/include/infershape.h"
 #include "paddle/tcmpt/api/include/linalg.h"
 #include "paddle/tcmpt/api/include/math.h"
-#include "paddle/tcmpt/api/include/tensor.h"
diff --git a/paddle/tcmpt/api/include/dev/core.h b/paddle/tcmpt/api/include/core.h
similarity index 100%
rename from paddle/tcmpt/api/include/dev/core.h
rename to paddle/tcmpt/api/include/core.h
diff --git a/paddle/tcmpt/api/include/creation.h b/paddle/tcmpt/api/include/creation.h
index e4f870039eba5..e0ef25d202c6e 100644
--- a/paddle/tcmpt/api/include/creation.h
+++ b/paddle/tcmpt/api/include/creation.h
@@ -14,10 +14,5 @@
 
 #pragma once
 
-#include "paddle/tcmpt/api/include/tensor.h"
-
-namespace pt {
-
-Tensor full_like(const Tensor& x, float value);
-
-}  // namespace pt
+#include "paddle/tcmpt/cpu/creation.h"
+#include "paddle/tcmpt/cuda/creation.h"
diff --git a/paddle/tcmpt/api/include/dev/infershape.h b/paddle/tcmpt/api/include/infershape.h
similarity index 100%
rename from paddle/tcmpt/api/include/dev/infershape.h
rename to paddle/tcmpt/api/include/infershape.h
diff --git a/paddle/tcmpt/api/include/linalg.h b/paddle/tcmpt/api/include/linalg.h
index 0322aa91763a6..46acfaea32163 100644
--- a/paddle/tcmpt/api/include/linalg.h
+++ b/paddle/tcmpt/api/include/linalg.h
@@ -14,10 +14,6 @@
 
 #pragma once
 
-#include "paddle/tcmpt/api/include/tensor.h"
-
-namespace pt {
-
-Tensor dot(const Tensor& x, const Tensor& y);
-
-}  // namespace pt
+// See Note: [ How do we organize the kernel directory ]
+#include "paddle/tcmpt/cpu/linalg.h"
+#include "paddle/tcmpt/cuda/linalg.h"
diff --git a/paddle/tcmpt/api/include/math.h b/paddle/tcmpt/api/include/math.h
index 27e3f1a1d3cff..2f1a04d16f8ac 100644
--- a/paddle/tcmpt/api/include/math.h
+++ b/paddle/tcmpt/api/include/math.h
@@ -14,10 +14,6 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/tcmpt/api/include/tensor.h"
-
-namespace pt {
-
-Tensor mean(const Tensor& x);
-
-}  // namespace pt
+// See Note: [ How do we organize the kernel directory ]
+#include "paddle/tcmpt/cpu/math.h"
+#include "paddle/tcmpt/cuda/math.h"
diff --git a/paddle/tcmpt/api/include/dev/symbols.h b/paddle/tcmpt/api/include/symbols.h
similarity index 92%
rename from paddle/tcmpt/api/include/dev/symbols.h
rename to paddle/tcmpt/api/include/symbols.h
index bfda326326b62..8dc75f859ce52 100644
--- a/paddle/tcmpt/api/include/dev/symbols.h
+++ b/paddle/tcmpt/api/include/symbols.h
@@ -19,10 +19,10 @@ limitations under the License. */
 // symbol declare
 PT_DECLARE_MODULE(MathCPU);
 PT_DECLARE_MODULE(LinalgCPU);
-PT_DECLARE_MODULE(FillCPU);
+PT_DECLARE_MODULE(CreationCPU);
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 PT_DECLARE_MODULE(MathCUDA);
 PT_DECLARE_MODULE(LinalgCUDA);
-PT_DECLARE_MODULE(FillCUDA);
+PT_DECLARE_MODULE(CreationCUDA);
 #endif
diff --git a/paddle/tcmpt/api/src/CMakeLists.txt b/paddle/tcmpt/api/src/CMakeLists.txt
deleted file mode 100644
index b8982b13800e1..0000000000000
--- a/paddle/tcmpt/api/src/CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-set(API_DEPS convert_utils dense_tensor selected_rows_tensor kernel_factory kernel_context)
-set(API_DEPS ${API_DEPS} math_cpu linalg_cpu fill_cpu)
-if(WITH_GPU OR WITH_ROCM)
-  set(API_DEPS ${API_DEPS} math_cuda linalg_cuda fill_cuda)
-endif()
-cc_library(math_api SRCS math.cc DEPS ${API_DEPS})
-cc_library(linalg_api SRCS linalg.cc DEPS ${API_DEPS})
-cc_library(fill_api SRCS creation.cc DEPS ${API_DEPS})
diff --git a/paddle/tcmpt/cpu/CMakeLists.txt b/paddle/tcmpt/cpu/CMakeLists.txt
index 261f8ddf940d9..fbb0a45266003 100644
--- a/paddle/tcmpt/cpu/CMakeLists.txt
+++ b/paddle/tcmpt/cpu/CMakeLists.txt
@@ -6,4 +6,4 @@ endif()
 
 cc_library(math_cpu SRCS math.cc DEPS dense_tensor kernel_context kernel_factory eigen_function)
 cc_library(linalg_cpu SRCS linalg.cc DEPS dense_tensor kernel_context kernel_factory)
-cc_library(fill_cpu SRCS fill.cc DEPS dense_tensor kernel_context kernel_factory eigen_function)
+cc_library(creation_cpu SRCS creation.cc DEPS dense_tensor kernel_context kernel_factory eigen_function)
diff --git a/paddle/tcmpt/cpu/fill.cc b/paddle/tcmpt/cpu/creation.cc
similarity index 95%
rename from paddle/tcmpt/cpu/fill.cc
rename to paddle/tcmpt/cpu/creation.cc
index 9b6d1dac7c961..b117209fd35b0 100644
--- a/paddle/tcmpt/cpu/fill.cc
+++ b/paddle/tcmpt/cpu/creation.cc
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/tcmpt/cpu/fill.h"
+#include "paddle/tcmpt/cpu/creation.h"
 
 #include "paddle/tcmpt/core/kernel_registry.h"
-
 #include "paddle/tcmpt/eigen/fill.h"
 
 namespace pt {
@@ -34,7 +33,7 @@ void FillAnyLike(const CPUContext& dev_ctx,
 
 }  // namespace pt
 
-PT_REGISTER_MODULE(FillCPU);
+PT_REGISTER_MODULE(CreationCPU);
 
 PT_REGISTER_KERNEL("fill_any_like",
                    CPU,
diff --git a/paddle/tcmpt/cpu/fill.h b/paddle/tcmpt/cpu/creation.h
similarity index 100%
rename from paddle/tcmpt/cpu/fill.h
rename to paddle/tcmpt/cpu/creation.h
diff --git a/paddle/tcmpt/cuda/CMakeLists.txt b/paddle/tcmpt/cuda/CMakeLists.txt
index 491b6d25b229b..94de051e2e3a4 100644
--- a/paddle/tcmpt/cuda/CMakeLists.txt
+++ b/paddle/tcmpt/cuda/CMakeLists.txt
@@ -7,9 +7,9 @@ endif()
 if(WITH_GPU)
   nv_library(math_cuda SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory)
   nv_library(linalg_cuda SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
-  nv_library(fill_cuda SRCS fill.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
+  nv_library(creation_cuda SRCS creation.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
 elseif(WITH_ROCM)
   hip_library(math_cuda SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory)
   hip_library(linalg_cuda SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
-  hip_library(fill_cuda SRCS fill.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
+  hip_library(creation_cuda SRCS creation.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
 endif()
diff --git a/paddle/tcmpt/cuda/fill.cu b/paddle/tcmpt/cuda/creation.cu
similarity index 95%
rename from paddle/tcmpt/cuda/fill.cu
rename to paddle/tcmpt/cuda/creation.cu
index 168af31c1cf81..07fc5ee5f9b2b 100644
--- a/paddle/tcmpt/cuda/fill.cu
+++ b/paddle/tcmpt/cuda/creation.cu
@@ -12,10 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/tcmpt/cuda/fill.h"
+#include "paddle/tcmpt/cuda/creation.h"
 
 #include "paddle/tcmpt/core/kernel_registry.h"
-
 #include "paddle/tcmpt/eigen/fill.h"
 
 namespace pt {
@@ -34,7 +33,7 @@ void FillAnyLike(const CUDAContext& dev_ctx,
 
 }  // namespace pt
 
-PT_REGISTER_MODULE(FillCUDA);
+PT_REGISTER_MODULE(CreationCUDA);
 
 PT_REGISTER_KERNEL("fill_any_like",
                    CUDA,
diff --git a/paddle/tcmpt/cuda/fill.h b/paddle/tcmpt/cuda/creation.h
similarity index 100%
rename from paddle/tcmpt/cuda/fill.h
rename to paddle/tcmpt/cuda/creation.h
diff --git a/paddle/tcmpt/hapi/CMakeLists.txt b/paddle/tcmpt/hapi/CMakeLists.txt
new file mode 100644
index 0000000000000..ebc247ef8a2e2
--- /dev/null
+++ b/paddle/tcmpt/hapi/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_subdirectory(lib)
+
+cc_library(tcmpt_hapi SRCS all.cc DEPS math_api linalg_api creation_api)
diff --git a/paddle/tcmpt/hapi/all.cc b/paddle/tcmpt/hapi/all.cc
new file mode 100644
index 0000000000000..f43cdb9f78b53
--- /dev/null
+++ b/paddle/tcmpt/hapi/all.cc
@@ -0,0 +1,19 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/tcmpt/hapi/all.h"
+
+namespace paddle {
+namespace experimental {}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/tcmpt/hapi/all.h b/paddle/tcmpt/hapi/all.h
new file mode 100644
index 0000000000000..bd1c51fc49ed3
--- /dev/null
+++ b/paddle/tcmpt/hapi/all.h
@@ -0,0 +1,21 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+// user apis
+#include "paddle/tcmpt/hapi/include/creation.h"
+#include "paddle/tcmpt/hapi/include/linalg.h"
+#include "paddle/tcmpt/hapi/include/math.h"
+#include "paddle/tcmpt/hapi/include/tensor.h"
diff --git a/paddle/tcmpt/api/include/dev/creation.h b/paddle/tcmpt/hapi/include/creation.h
similarity index 76%
rename from paddle/tcmpt/api/include/dev/creation.h
rename to paddle/tcmpt/hapi/include/creation.h
index 02b14c50e5c04..98044636b12bb 100644
--- a/paddle/tcmpt/api/include/dev/creation.h
+++ b/paddle/tcmpt/hapi/include/creation.h
@@ -14,5 +14,12 @@
 
 #pragma once
 
-#include "paddle/tcmpt/cpu/fill.h"
-#include "paddle/tcmpt/cuda/fill.h"
+#include "paddle/tcmpt/hapi/include/tensor.h"
+
+namespace paddle {
+namespace experimental {
+
+Tensor full_like(const Tensor& x, float value);
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/tcmpt/api/include/dev/linalg.h b/paddle/tcmpt/hapi/include/linalg.h
similarity index 76%
rename from paddle/tcmpt/api/include/dev/linalg.h
rename to paddle/tcmpt/hapi/include/linalg.h
index 46acfaea32163..5e27fecd58a4e 100644
--- a/paddle/tcmpt/api/include/dev/linalg.h
+++ b/paddle/tcmpt/hapi/include/linalg.h
@@ -14,6 +14,12 @@
 
 #pragma once
 
-// See Note: [ How do we organize the kernel directory ]
-#include "paddle/tcmpt/cpu/linalg.h"
-#include "paddle/tcmpt/cuda/linalg.h"
+#include "paddle/tcmpt/hapi/include/tensor.h"
+
+namespace paddle {
+namespace experimental {
+
+Tensor dot(const Tensor& x, const Tensor& y);
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/tcmpt/api/include/dev/math.h b/paddle/tcmpt/hapi/include/math.h
similarity index 77%
rename from paddle/tcmpt/api/include/dev/math.h
rename to paddle/tcmpt/hapi/include/math.h
index 2f1a04d16f8ac..9245d1033c791 100644
--- a/paddle/tcmpt/api/include/dev/math.h
+++ b/paddle/tcmpt/hapi/include/math.h
@@ -14,6 +14,12 @@ limitations under the License. */
 
 #pragma once
 
-// See Note: [ How do we organize the kernel directory ]
-#include "paddle/tcmpt/cpu/math.h"
-#include "paddle/tcmpt/cuda/math.h"
+#include "paddle/tcmpt/hapi/include/tensor.h"
+
+namespace paddle {
+namespace experimental {
+
+Tensor mean(const Tensor& x);
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/tcmpt/api/include/tensor.h b/paddle/tcmpt/hapi/include/tensor.h
similarity index 90%
rename from paddle/tcmpt/api/include/tensor.h
rename to paddle/tcmpt/hapi/include/tensor.h
index 1c503c842ad30..eb64d66435c90 100644
--- a/paddle/tcmpt/api/include/tensor.h
+++ b/paddle/tcmpt/hapi/include/tensor.h
@@ -41,7 +41,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/platform/place.h"
 
-namespace pt {
+namespace paddle {
+namespace experimental {
 
 class Tensor;
 
@@ -90,7 +91,7 @@ class Tensor final {
    * @param {shared_ptr<TensorInterface>} tensor_impl
    * @return {Tensor}
    */
-  explicit Tensor(std::shared_ptr<TensorInterface> tensor_impl)
+  explicit Tensor(std::shared_ptr<pt::TensorInterface> tensor_impl)
       : impl_(std::move(tensor_impl)) {
     if (impl_.get() == nullptr) {
       throw std::runtime_error("TensorImpl with nullptr is not supported");
@@ -110,21 +111,21 @@ class Tensor final {
    * @param None
    * @return {DDim}
    */
-  DDim shape() const { return impl_->dims(); }
+  pt::DDim shape() const { return impl_->dims(); }
 
   /**
    * @description: Return the data type of current Tensor.
    * @param None
    * @return {DataType}
    */
-  DataType type() const { return impl_->type(); }
+  pt::DataType type() const { return impl_->type(); }
 
   /**
    * @description: Return the layout of current Tensor.
    * @param None
    * @return {DataLayout}
    */
-  DataLayout layout() const { return impl_->layout(); }
+  pt::DataLayout layout() const { return impl_->layout(); }
 
   /* Part 3: Device and Backend methods */
   /**
@@ -132,13 +133,13 @@ class Tensor final {
    * @param None
    * @return {Place}
    */
-  Place place() const { return impl_->place(); }
+  pt::Place place() const { return impl_->place(); }
 
   /**
    * Backend judgment APIs, shield the concept of Backend.
    */
-  bool is_cpu() const { return impl_->backend() == Backend::kCPU; }
-  bool is_cuda() const { return impl_->backend() == Backend::kCUDA; }
+  bool is_cpu() const { return impl_->backend() == pt::Backend::kCPU; }
+  bool is_cuda() const { return impl_->backend() == pt::Backend::kCUDA; }
   bool is_hip() const;
   bool is_xpu() const;
   bool is_npu() const;
@@ -164,14 +165,16 @@ class Tensor final {
    * @param None
    * @return {std::shared_ptr<TensorInterface>}
    */
-  std::shared_ptr<TensorInterface> impl() const { return impl_; }
+  std::shared_ptr<pt::TensorInterface> impl() const { return impl_; }
 
   /**
    * @description: Set the implemention of current Tensor.
    * @param {std::shared_ptr<TensorInterface>}
    * @return None
    */
-  void set_impl(const std::shared_ptr<TensorInterface>& impl) { impl_ = impl; }
+  void set_impl(const std::shared_ptr<pt::TensorInterface>& impl) {
+    impl_ = impl;
+  }
 
   // TODO(chenweihang): Whether API Tensor need `data` and `mutable_data`?
 
@@ -242,7 +245,7 @@ class Tensor final {
    * heterogeneous Tensor implementation, so that the API level can be unified
    * to one `Tensor`.
    */
-  std::shared_ptr<TensorInterface> impl_;
+  std::shared_ptr<pt::TensorInterface> impl_;
 
   /**
    * [ Why need abstract AutogradMetaInterface here? ]
@@ -258,4 +261,5 @@ class Tensor final {
   std::shared_ptr<AutogradMetaInterface> autograd_meta_ = nullptr;
 };
 
-}  // namespace pt
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/tcmpt/hapi/lib/CMakeLists.txt b/paddle/tcmpt/hapi/lib/CMakeLists.txt
new file mode 100644
index 0000000000000..c9f0fe2691a92
--- /dev/null
+++ b/paddle/tcmpt/hapi/lib/CMakeLists.txt
@@ -0,0 +1,3 @@
+cc_library(math_api SRCS math.cc DEPS tcmpt)
+cc_library(linalg_api SRCS linalg.cc DEPS tcmpt)
+cc_library(creation_api SRCS creation.cc DEPS tcmpt)
diff --git a/paddle/tcmpt/api/src/creation.cc b/paddle/tcmpt/hapi/lib/creation.cc
similarity index 65%
rename from paddle/tcmpt/api/src/creation.cc
rename to paddle/tcmpt/hapi/lib/creation.cc
index 668b14776d70d..e182a496df262 100644
--- a/paddle/tcmpt/api/src/creation.cc
+++ b/paddle/tcmpt/hapi/lib/creation.cc
@@ -12,49 +12,49 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/tcmpt/api/include/creation.h"
+#include "paddle/tcmpt/hapi/include/creation.h"
 
 #include <memory>
 
 #include "glog/logging.h"
 
-#include "paddle/tcmpt/api/include/dev/core.h"
-#include "paddle/tcmpt/api/include/dev/creation.h"
-#include "paddle/tcmpt/api/include/dev/infershape.h"
-#include "paddle/tcmpt/core/kernel_generate.h"
+#include "paddle/tcmpt/api/include/core.h"
+#include "paddle/tcmpt/api/include/infershape.h"
+#include "paddle/tcmpt/hapi/lib/kernel_generate.h"
 
-namespace pt {
+namespace paddle {
+namespace experimental {
 
 Tensor full_like(const Tensor& x, float value) {
   // 1. Get kernel signature and kernel
   auto kernel_signature = ParseKernelNameAndKeyByArgs("fill_any_like", x);
   VLOG(1) << kernel_signature.first;
   VLOG(1) << kernel_signature.second;
-  VLOG(1) << KernelFactory::Instance();
+  VLOG(1) << pt::KernelFactory::Instance();
 
-  auto kernel = KernelFactory::Instance().SelectKernelOrThrowError(
+  auto kernel = pt::KernelFactory::Instance().SelectKernelOrThrowError(
       kernel_signature.first, kernel_signature.second);
   VLOG(1) << kernel;
 
   // 2. Get Device Context
   auto* dev_ctx = GetDeviceContextByBackend(kernel_signature.second.backend());
-  auto kernel_context = KernelContext(*dev_ctx);
+  auto kernel_context = pt::KernelContext(*dev_ctx);
 
   // 3. Auto data transform
-  auto dense_x = std::dynamic_pointer_cast<DenseTensor>(x.impl());
+  auto dense_x = std::dynamic_pointer_cast<pt::DenseTensor>(x.impl());
   kernel_context.EmplaceBackInput(dense_x);
 
   kernel_context.EmplaceBackAttr(value);
 
   // 4. InferShape
-  auto out_dims = UnchangedInferShape(dense_x->dims());
+  auto out_dims = pt::UnchangedInferShape(dense_x->dims());
 
   // 5. Prepare outputs
-  pt::Tensor out;
+  Tensor out;
   auto out_def = kernel.args_def().output_defs()[0];
-  auto dense_out = std::make_shared<DenseTensor>(
-      TensorMeta(out_dims, out_def.backend, out_def.dtype, out_def.layout),
-      TensorStatus());
+  auto dense_out = std::make_shared<pt::DenseTensor>(
+      pt::TensorMeta(out_dims, out_def.backend, out_def.dtype, out_def.layout),
+      pt::TensorStatus());
   kernel_context.EmplaceBackOutput(dense_out);
   out.set_impl(dense_out);
 
@@ -64,4 +64,5 @@ Tensor full_like(const Tensor& x, float value) {
   return out;
 }
 
-}  // namespace pt
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/tcmpt/core/kernel_generate.h b/paddle/tcmpt/hapi/lib/kernel_generate.h
similarity index 84%
rename from paddle/tcmpt/core/kernel_generate.h
rename to paddle/tcmpt/hapi/lib/kernel_generate.h
index 6cc8f411924d2..1b5f9d7ae02ac 100644
--- a/paddle/tcmpt/core/kernel_generate.h
+++ b/paddle/tcmpt/hapi/lib/kernel_generate.h
@@ -17,13 +17,16 @@ limitations under the License. */
 #include <string>
 #include <utility>
 
+#include "paddle/tcmpt/hapi/include/tensor.h"
+
 // TODO(chenweihang): split KernelName, Key, Kernel, Factory into diff files
 #include "paddle/tcmpt/core/kernel_factory.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
 
-namespace pt {
+namespace paddle {
+namespace experimental {
 
 // TODO(shixiaowei): replaced by new DeviceContext later
 using CPUContext = paddle::platform::CPUDeviceContext;
@@ -58,9 +61,9 @@ struct ArgsIterator {
 
 struct KernelNameAndKeyParser : ArgsIterator<KernelNameAndKeyParser> {
   std::string kernel_name;
-  Backend backend;
-  DataLayout layout;
-  DataType dtype;
+  pt::Backend backend;
+  pt::DataLayout layout;
+  pt::DataType dtype;
 
   explicit KernelNameAndKeyParser(const std::string& name)
       : kernel_name(name) {}
@@ -69,9 +72,9 @@ struct KernelNameAndKeyParser : ArgsIterator<KernelNameAndKeyParser> {
   // TODO(chenweihang): deal with multiple diff input Tensors
   void operator()(const Tensor& x) {
     if (x.is_cpu()) {
-      backend = Backend::kCPU;
+      backend = pt::Backend::kCPU;
     } else if (x.is_cuda()) {
-      backend = Backend::kCUDA;
+      backend = pt::Backend::kCUDA;
     } else {
       throw std::runtime_error("Unsupported backend when parser args.");
     }
@@ -94,19 +97,20 @@ struct KernelNameAndKeyParser : ArgsIterator<KernelNameAndKeyParser> {
 // suffix on the basis of the function name, or the input contains HostTensor,
 // and the `host` suffix should be added on the basis of the function name.
 template <typename... Args>
-std::pair<KernelName, KernelKey> ParseKernelNameAndKeyByArgs(
+std::pair<pt::KernelName, pt::KernelKey> ParseKernelNameAndKeyByArgs(
     const std::string& fn_name, const Args&... args) {
   auto parser = detail::KernelNameAndKeyParser(fn_name);
   parser(args...);
   // TODO(chenweihang): polish design here
-  KernelName kernel_name(parser.kernel_name);
-  KernelKey kernel_key(parser.backend, parser.layout, parser.dtype);
+  pt::KernelName kernel_name(parser.kernel_name);
+  pt::KernelKey kernel_key(parser.backend, parser.layout, parser.dtype);
   return std::make_pair(kernel_name, kernel_key);
 }
 
-paddle::platform::DeviceContext* GetDeviceContextByBackend(Backend backend) {
+paddle::platform::DeviceContext* GetDeviceContextByBackend(
+    pt::Backend backend) {
   auto& pool = paddle::platform::DeviceContextPool::Instance();
-  auto place = TransToFluidPlace(backend);
+  auto place = pt::TransToFluidPlace(backend);
   // switch (backend) {
   //   case Backend::kCPU:
   //     return pool.GetByPlace(paddle::platform::CPUPlace());
@@ -119,4 +123,5 @@ paddle::platform::DeviceContext* GetDeviceContextByBackend(Backend backend) {
   return pool.Get(place);
 }
 
-}  // namespace pt
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/tcmpt/api/src/linalg.cc b/paddle/tcmpt/hapi/lib/linalg.cc
similarity index 65%
rename from paddle/tcmpt/api/src/linalg.cc
rename to paddle/tcmpt/hapi/lib/linalg.cc
index 4be1c67bd169b..c21f37ead223a 100644
--- a/paddle/tcmpt/api/src/linalg.cc
+++ b/paddle/tcmpt/hapi/lib/linalg.cc
@@ -12,53 +12,52 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/tcmpt/api/include/linalg.h"
+#include "paddle/tcmpt/hapi/include/linalg.h"
 
 #include <memory>
 
 #include "glog/logging.h"
 
-#include "paddle/tcmpt/core/convert_utils.h"
-#include "paddle/tcmpt/core/dense_tensor.h"
-#include "paddle/tcmpt/core/kernel_context.h"
-#include "paddle/tcmpt/core/kernel_generate.h"
-#include "paddle/tcmpt/infershape/unary.h"
+#include "paddle/tcmpt/api/include/core.h"
+#include "paddle/tcmpt/api/include/infershape.h"
+#include "paddle/tcmpt/hapi/lib/kernel_generate.h"
 
-namespace pt {
+namespace paddle {
+namespace experimental {
 
 Tensor dot(const Tensor& x, const Tensor& y) {
   // 1. Get kernel signature and kernel
   auto kernel_signature = ParseKernelNameAndKeyByArgs("dot", x);
   VLOG(1) << kernel_signature.first;
   VLOG(1) << kernel_signature.second;
-  VLOG(1) << KernelFactory::Instance();
+  VLOG(1) << pt::KernelFactory::Instance();
 
-  auto kernel = KernelFactory::Instance().SelectKernelOrThrowError(
+  auto kernel = pt::KernelFactory::Instance().SelectKernelOrThrowError(
       kernel_signature.first, kernel_signature.second);
   VLOG(1) << kernel;
 
   // 2. Get Device Context
   auto* dev_ctx = GetDeviceContextByBackend(kernel_signature.second.backend());
-  auto kernel_context = KernelContext(*dev_ctx);
+  auto kernel_context = pt::KernelContext(*dev_ctx);
 
   // 3. Auto data transform
-  auto dense_x = std::dynamic_pointer_cast<DenseTensor>(x.impl());
+  auto dense_x = std::dynamic_pointer_cast<pt::DenseTensor>(x.impl());
   kernel_context.EmplaceBackInput(dense_x);
-  auto dense_y = std::dynamic_pointer_cast<DenseTensor>(y.impl());
+  auto dense_y = std::dynamic_pointer_cast<pt::DenseTensor>(y.impl());
   kernel_context.EmplaceBackInput(dense_y);
   // TODO(chenweihang): add transform impl
 
   // 4. InferShape
   // TODO(chenweihang): how to auto selected infershape?
-  auto out_dims = DotInferShape(dense_x->dims());
+  auto out_dims = pt::DotInferShape(dense_x->dims());
 
   // 5. Prepare outputs
-  pt::Tensor out;
+  Tensor out;
   // TODO(chenweihang): deal with multiple outputs
   auto out_def = kernel.args_def().output_defs()[0];
-  auto dense_out = std::make_shared<DenseTensor>(
-      TensorMeta(out_dims, out_def.backend, out_def.dtype, out_def.layout),
-      TensorStatus());
+  auto dense_out = std::make_shared<pt::DenseTensor>(
+      pt::TensorMeta(out_dims, out_def.backend, out_def.dtype, out_def.layout),
+      pt::TensorStatus());
   kernel_context.EmplaceBackOutput(dense_out);
   out.set_impl(dense_out);
 
@@ -68,4 +67,5 @@ Tensor dot(const Tensor& x, const Tensor& y) {
   return out;
 }
 
-}  // namespace pt
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/tcmpt/api/src/math.cc b/paddle/tcmpt/hapi/lib/math.cc
similarity index 67%
rename from paddle/tcmpt/api/src/math.cc
rename to paddle/tcmpt/hapi/lib/math.cc
index 813cfde997edc..6088b24f2eda9 100644
--- a/paddle/tcmpt/api/src/math.cc
+++ b/paddle/tcmpt/hapi/lib/math.cc
@@ -12,50 +12,50 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/tcmpt/api/include/math.h"
+#include "paddle/tcmpt/hapi/include/math.h"
 
 #include <memory>
 
 #include "glog/logging.h"
 
-#include "paddle/tcmpt/api/include/dev/core.h"
-#include "paddle/tcmpt/api/include/dev/infershape.h"
-#include "paddle/tcmpt/api/include/dev/math.h"
-#include "paddle/tcmpt/core/kernel_generate.h"
+#include "paddle/tcmpt/api/include/core.h"
+#include "paddle/tcmpt/api/include/infershape.h"
+#include "paddle/tcmpt/hapi/lib/kernel_generate.h"
 
-namespace pt {
+namespace paddle {
+namespace experimental {
 
 Tensor mean(const Tensor& x) {
   // 1. Get kernel signature and kernel
   auto kernel_signature = ParseKernelNameAndKeyByArgs("mean", x);
   VLOG(1) << kernel_signature.first;
   VLOG(1) << kernel_signature.second;
-  VLOG(1) << KernelFactory::Instance();
+  VLOG(1) << pt::KernelFactory::Instance();
 
-  auto kernel = KernelFactory::Instance().SelectKernelOrThrowError(
+  auto kernel = pt::KernelFactory::Instance().SelectKernelOrThrowError(
       kernel_signature.first, kernel_signature.second);
   VLOG(1) << kernel;
 
   // 2. Get Device Context
   auto* dev_ctx = GetDeviceContextByBackend(kernel_signature.second.backend());
-  auto kernel_context = KernelContext(*dev_ctx);
+  auto kernel_context = pt::KernelContext(*dev_ctx);
 
   // 3. Auto data transform
-  auto dense_x = std::dynamic_pointer_cast<DenseTensor>(x.impl());
+  auto dense_x = std::dynamic_pointer_cast<pt::DenseTensor>(x.impl());
   kernel_context.EmplaceBackInput(dense_x);
   // TODO(chenweihang): add transform impl
 
   // 4. InferShape
   // TODO(chenweihang): how to auto selected infershape?
-  auto out_dims = MeanInferShape(dense_x->dims());
+  auto out_dims = pt::MeanInferShape(dense_x->dims());
 
   // 5. Prepare outputs
-  pt::Tensor out;
+  Tensor out;
   // TODO(chenweihang): deal with multiple outputs
   auto out_def = kernel.args_def().output_defs()[0];
-  auto dense_out = std::make_shared<DenseTensor>(
-      TensorMeta(out_dims, out_def.backend, out_def.dtype, out_def.layout),
-      TensorStatus());
+  auto dense_out = std::make_shared<pt::DenseTensor>(
+      pt::TensorMeta(out_dims, out_def.backend, out_def.dtype, out_def.layout),
+      pt::TensorStatus());
   kernel_context.EmplaceBackOutput(dense_out);
   out.set_impl(dense_out);
 
@@ -66,4 +66,5 @@ Tensor mean(const Tensor& x) {
   return out;
 }
 
-}  // namespace pt
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/tcmpt/tests/CMakeLists.txt b/paddle/tcmpt/tests/CMakeLists.txt
index 96df8853f3b26..acf1624bc7e12 100644
--- a/paddle/tcmpt/tests/CMakeLists.txt
+++ b/paddle/tcmpt/tests/CMakeLists.txt
@@ -2,4 +2,4 @@ cc_test(dense_tensor_test SRCS dense_tensor_test.cc DEPS dense_tensor)
 cc_test(kernel_factory_test SRCS kernel_factory_test.cc DEPS kernel_factory)
 cc_test(test_mean_api SRCS test_mean_api.cc DEPS math_api)
 cc_test(test_dot_api SRCS test_dot_api.cc DEPS linalg_api)
-cc_test(test_fill_api SRCS test_fill_api.cc DEPS fill_api)
+cc_test(test_fill_api SRCS test_fill_api.cc DEPS creation_api)
diff --git a/paddle/tcmpt/tests/test_dot_api.cc b/paddle/tcmpt/tests/test_dot_api.cc
index ee541a5a1feed..8fdae5050e239 100644
--- a/paddle/tcmpt/tests/test_dot_api.cc
+++ b/paddle/tcmpt/tests/test_dot_api.cc
@@ -15,10 +15,9 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <memory>
 
-#include "paddle/tcmpt/api/include/linalg.h"
+#include "paddle/tcmpt/hapi/include/linalg.h"
 
 #include "paddle/tcmpt/core/dense_tensor.h"
-
 #include "paddle/tcmpt/core/kernel_registry.h"
 
 PT_DECLARE_MODULE(LinalgCPU);
@@ -57,11 +56,11 @@ TEST(API, dot) {
     }
   }
 
-  pt::Tensor x(dense_x);
-  pt::Tensor y(dense_y);
+  paddle::experimental::Tensor x(dense_x);
+  paddle::experimental::Tensor y(dense_y);
 
   // 2. test API
-  auto out = pt::dot(x, y);
+  auto out = paddle::experimental::dot(x, y);
 
   // 3. check result
   ASSERT_EQ(out.shape().size(), 2);
diff --git a/paddle/tcmpt/tests/test_fill_api.cc b/paddle/tcmpt/tests/test_fill_api.cc
index 9b9add32f5b2b..39a23a44bfa59 100644
--- a/paddle/tcmpt/tests/test_fill_api.cc
+++ b/paddle/tcmpt/tests/test_fill_api.cc
@@ -15,16 +15,15 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <memory>
 
-#include "paddle/tcmpt/api/include/creation.h"
+#include "paddle/tcmpt/hapi/include/creation.h"
 
 #include "paddle/tcmpt/core/dense_tensor.h"
-
 #include "paddle/tcmpt/core/kernel_registry.h"
 
-PT_DECLARE_MODULE(FillCPU);
+PT_DECLARE_MODULE(CreationCPU);
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-PT_DECLARE_MODULE(FillCUDA);
+PT_DECLARE_MODULE(CreationCUDA);
 #endif
 
 namespace framework = paddle::framework;
@@ -43,10 +42,10 @@ TEST(API, fill) {
 
   float val = 1.0;
 
-  pt::Tensor x(dense_x);
+  paddle::experimental::Tensor x(dense_x);
 
   // 2. test API
-  auto out = pt::full_like(x, val);
+  auto out = paddle::experimental::full_like(x, val);
 
   // 3. check result
   ASSERT_EQ(out.shape().size(), 2);
diff --git a/paddle/tcmpt/tests/test_mean_api.cc b/paddle/tcmpt/tests/test_mean_api.cc
index c3c993130d030..518a98738961c 100644
--- a/paddle/tcmpt/tests/test_mean_api.cc
+++ b/paddle/tcmpt/tests/test_mean_api.cc
@@ -15,10 +15,9 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <memory>
 
-#include "paddle/tcmpt/api/include/math.h"
+#include "paddle/tcmpt/hapi/include/math.h"
 
 #include "paddle/tcmpt/core/dense_tensor.h"
-
 #include "paddle/tcmpt/core/kernel_registry.h"
 
 PT_DECLARE_MODULE(MathCPU);
@@ -46,10 +45,10 @@ TEST(API, mean) {
     sum += i * 1.0;
   }
 
-  pt::Tensor x(dense_x);
+  paddle::experimental::Tensor x(dense_x);
 
   // 2. test API
-  auto out = pt::mean(x);
+  auto out = paddle::experimental::mean(x);
 
   // 3. check result
   ASSERT_EQ(out.shape().size(), 1);

From 46ba70c1dda1e89852ab4fd7b268d0a7466bdd95 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Thu, 14 Oct 2021 08:30:44 +0000
Subject: [PATCH 078/125] remove selectedrows adapting temporarily

---
 cmake/tcmpt.cmake                            |   3 +-
 paddle/fluid/framework/operator.cc           |   7 +-
 paddle/fluid/framework/tcmpt_utils.cc        |  51 +++------
 paddle/fluid/imperative/prepared_operator.cc |   7 +-
 paddle/tcmpt/api/CMakeLists.txt              |   2 +-
 paddle/tcmpt/api/include/core.h              |   1 -
 paddle/tcmpt/core/CMakeLists.txt             |   1 -
 paddle/tcmpt/core/kernel_registry.h          |   7 +-
 paddle/tcmpt/core/kernel_utils.h             |   7 +-
 paddle/tcmpt/core/scalar_tensor.h            |  19 ----
 paddle/tcmpt/core/selected_rows_tensor.cc    |  17 ---
 paddle/tcmpt/core/selected_rows_tensor.h     | 110 -------------------
 paddle/tcmpt/cpu/math.cc                     |  56 ----------
 paddle/tcmpt/cpu/math.h                      |  17 ---
 paddle/tcmpt/cuda/math.cu                    |  56 ----------
 paddle/tcmpt/cuda/math.h                     |  17 ---
 16 files changed, 33 insertions(+), 345 deletions(-)
 delete mode 100644 paddle/tcmpt/core/scalar_tensor.h
 delete mode 100644 paddle/tcmpt/core/selected_rows_tensor.cc
 delete mode 100644 paddle/tcmpt/core/selected_rows_tensor.h

diff --git a/cmake/tcmpt.cmake b/cmake/tcmpt.cmake
index 26d5eff926b55..3ffc168c6bed0 100644
--- a/cmake/tcmpt.cmake
+++ b/cmake/tcmpt.cmake
@@ -28,7 +28,8 @@ function(kernel_instantiate TARGET)
             string(REPLACE "CPUContext" "pt::CPUContext" inst_signature ${inst_signature})
             string(REPLACE "CUDAContext" "pt::CUDAContext" inst_signature ${inst_signature})
             string(REPLACE "DenseTensor" "pt::DenseTensor" inst_signature ${inst_signature})
-            string(REPLACE "SelectedRowsTensor" "pt::SelectedRowsTensor" inst_signature ${inst_signature})
+            # TODO(chenweihang): adapt SelectedRows after adding it
+            # string(REPLACE "SelectedRowsTensor" "pt::SelectedRowsTensor" inst_signature ${inst_signature})
             # message(STATUS "INST FUNC: ${inst_signature}")
             string(APPEND instantiate_context "template ${inst_signature};\n")
         endforeach()
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index eb1889ae1d8ef..b34cc9037fbff 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1302,9 +1302,10 @@ static pt::KernelName ConstructPtKernelName(const std::string& op_type,
                                             const proto::OpProto& op_proto,
                                             const VariableValueMap& inputs) {
   std::string overload_name;
-  if (ContainSelectedRows(inputs)) {
-    overload_name = pt::kContainSelectedRowsSuffix;
-  }
+  // TODO(chenweihang): adapt SelectedRows by xiaowei's design
+  // if (ContainSelectedRows(inputs)) {
+  //   overload_name = pt::kContainSelectedRowsSuffix;
+  // }
   if (ContainHostTensor(op_proto, inputs)) {
     if (overload_name != "") {
       overload_name += ".";
diff --git a/paddle/fluid/framework/tcmpt_utils.cc b/paddle/fluid/framework/tcmpt_utils.cc
index 799fecfa442c2..f83f6b593a60d 100644
--- a/paddle/fluid/framework/tcmpt_utils.cc
+++ b/paddle/fluid/framework/tcmpt_utils.cc
@@ -23,7 +23,7 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-/* For DenseTensor */
+// TODO(chenweihang, shixiaowei): adapt SelectedRows
 
 template <>
 std::shared_ptr<pt::DenseTensor> MakeTensorImpl<pt::DenseTensor, LoDTensor>(
@@ -59,26 +59,6 @@ std::shared_ptr<pt::DenseTensor> MakeTensorImpl<pt::DenseTensor, Tensor>(
   return tensor_impl;
 }
 
-template <>
-std::shared_ptr<pt::SelectedRowsTensor>
-MakeTensorImpl<pt::SelectedRowsTensor, SelectedRows>(const SelectedRows& tensor,
-                                                     pt::Backend backend,
-                                                     pt::DataType dtype,
-                                                     pt::DataLayout layout) {
-  auto value = tensor.value();
-  auto holder = value.Holder();
-  auto tensor_impl = std::make_shared<pt::SelectedRowsTensor>(
-      pt::TensorMeta(value.dims(), backend, dtype, layout, value.offset()),
-      pt::TensorStatus(), tensor.rows(), tensor.height());
-
-  if (holder != nullptr) {
-    tensor_impl->mutable_value()->ShareAllocation(tensor.value().Holder());
-  } else {
-    VLOG(1) << "Old SelectedRows holder is nullptr.";
-  }
-  return tensor_impl;
-}
-
 template <>
 std::shared_ptr<pt::DenseTensor> MakeTensorImpl<pt::DenseTensor>(
     const LoDTensor& tensor, const platform::Place& place,
@@ -131,21 +111,21 @@ std::shared_ptr<pt::TensorInterface> InputVariableToPtTensor(
       return pt_in;
     }
   } else if (variable.template IsType<framework::SelectedRows>()) {
+    // TODO(chenweihang): now we don't deal with row and height
+    // by xiaowei's advice
     const auto& tensor = variable.template Get<framework::SelectedRows>();
     if (!platform::is_same_place(tensor.value().place(), expected_place)) {
-      framework::SelectedRows tmp_tensor;
-      tmp_tensor.set_rows(tensor.rows());
-      tmp_tensor.set_height(tensor.height());
-      TensorCopySync(tensor.value(), expected_place,
-                     tmp_tensor.mutable_value());
-      auto pt_in = framework::MakeTensorImpl<pt::SelectedRowsTensor,
-                                             framework::SelectedRows>(
-          tmp_tensor, arg_def.backend, arg_def.dtype, arg_def.layout);
+      framework::Tensor tmp_tensor;
+      TensorCopySync(tensor.value(), expected_place, &tmp_tensor);
+      // TODO(chenweihang): adapt SelectedRows by xiaowei's design
+      auto pt_in =
+          framework::MakeTensorImpl<pt::DenseTensor, framework::Tensor>(
+              tmp_tensor, arg_def.backend, arg_def.dtype, arg_def.layout);
       return pt_in;
     } else {
-      auto pt_in = framework::MakeTensorImpl<pt::SelectedRowsTensor,
-                                             framework::SelectedRows>(
-          tensor, arg_def.backend, arg_def.dtype, arg_def.layout);
+      auto pt_in =
+          framework::MakeTensorImpl<pt::DenseTensor, framework::Tensor>(
+              tensor.value(), arg_def.backend, arg_def.dtype, arg_def.layout);
       return pt_in;
     }
   } else {
@@ -173,9 +153,10 @@ std::shared_ptr<pt::TensorInterface> OutputVariableToPtTensor(
     tensor->mutable_value()->mutable_data(
         pt::TransToFluidPlace(arg_def.backend),
         pt::TransToProtoVarType(arg_def.dtype));
-    auto pt_out = framework::MakeTensorImpl<pt::SelectedRowsTensor,
-                                            framework::SelectedRows>(
-        *tensor, arg_def.backend, arg_def.dtype, arg_def.layout);
+    // TODO(chenweihang): adapt SelectedRows by xiaowei's design,
+    // here the row and height will lost in output!
+    auto pt_out = framework::MakeTensorImpl<pt::DenseTensor, framework::Tensor>(
+        tensor->value(), arg_def.backend, arg_def.dtype, arg_def.layout);
     return pt_out;
   } else {
     PADDLE_THROW(platform::errors::Unimplemented(
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 645343316a5b9..c800e6de5a89d 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -168,9 +168,10 @@ static pt::KernelName ConstructPtKernelName(
     const std::string& op_type, const framework::proto::OpProto& op_proto,
     const NameVarMap<VarType>& inputs) {
   std::string overload_name;
-  if (ContainSelectedRows<VarType>(inputs)) {
-    overload_name = pt::kContainSelectedRowsSuffix;
-  }
+  // TODO(chenweihang): adapt SelectedRows by xiaowei's design
+  // if (ContainSelectedRows<VarType>(inputs)) {
+  //   overload_name = pt::kContainSelectedRowsSuffix;
+  // }
   if (ContainHostTensor<VarType>(op_proto, inputs)) {
     if (overload_name != "") {
       overload_name += ".";
diff --git a/paddle/tcmpt/api/CMakeLists.txt b/paddle/tcmpt/api/CMakeLists.txt
index 4eee2c538d716..54a48ca6a57a0 100644
--- a/paddle/tcmpt/api/CMakeLists.txt
+++ b/paddle/tcmpt/api/CMakeLists.txt
@@ -11,7 +11,7 @@
 # declare_module(MathCPU)
 # declare_module(MathCUDA)
 
-set(TCMPT_DEPS convert_utils dense_tensor selected_rows_tensor kernel_factory kernel_context)
+set(TCMPT_DEPS convert_utils dense_tensor kernel_factory kernel_context)
 set(TCMPT_DEPS ${TCMPT_DEPS} math_cpu linalg_cpu creation_cpu)
 if(WITH_GPU OR WITH_ROCM)
   set(TCMPT_DEPS ${TCMPT_DEPS} math_cuda linalg_cuda creation_cuda)
diff --git a/paddle/tcmpt/api/include/core.h b/paddle/tcmpt/api/include/core.h
index 687dc72bb351f..3f95e8ceb38da 100644
--- a/paddle/tcmpt/api/include/core.h
+++ b/paddle/tcmpt/api/include/core.h
@@ -20,4 +20,3 @@ limitations under the License. */
 #include "paddle/tcmpt/core/kernel_context.h"
 #include "paddle/tcmpt/core/kernel_factory.h"
 #include "paddle/tcmpt/core/mkldnn_dense_tensor.h"
-#include "paddle/tcmpt/core/selected_rows_tensor.h"
diff --git a/paddle/tcmpt/core/CMakeLists.txt b/paddle/tcmpt/core/CMakeLists.txt
index 8c9e5ef9e7c74..5eadf3db39a64 100644
--- a/paddle/tcmpt/core/CMakeLists.txt
+++ b/paddle/tcmpt/core/CMakeLists.txt
@@ -16,7 +16,6 @@ else()
     cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place backend dtype layout)
 endif()
 cc_library(dense_tensor SRCS dense_tensor.cc DEPS enforce data_type ddim allocator place convert_utils ${MKLDNN_CTX_DEPS})
-cc_library(selected_rows_tensor SRCS selected_rows_tensor.cc DEPS dense_tensor)
 
 cc_library(kernel_factory SRCS kernel_factory.cc DEPS enforce backend dtype layout)
 cc_library(kernel_context SRCS kernel_context.cc DEPS enforce device_context)
diff --git a/paddle/tcmpt/core/kernel_registry.h b/paddle/tcmpt/core/kernel_registry.h
index 2874f4db203f2..d31cb9b692184 100644
--- a/paddle/tcmpt/core/kernel_registry.h
+++ b/paddle/tcmpt/core/kernel_registry.h
@@ -52,13 +52,10 @@ struct KernelArgsParseFunctor<Return_ (*)(Args_...)> {
               ) {
 #endif
         // do nothing, skip context arg now
-      } else if (arg_type == std::type_index(typeid(const DenseTensor&)) ||
-                 arg_type ==
-                     std::type_index(typeid(const SelectedRowsTensor&))) {
+      } else if (arg_type == std::type_index(typeid(const DenseTensor&))) {
         args_def->AppendInput(
             default_key.backend(), default_key.layout(), default_key.dtype());
-      } else if (arg_type == std::type_index(typeid(DenseTensor*)) ||
-                 arg_type == std::type_index(typeid(SelectedRowsTensor*))) {
+      } else if (arg_type == std::type_index(typeid(DenseTensor*))) {
         args_def->AppendOutput(
             default_key.backend(), default_key.layout(), default_key.dtype());
       } else {
diff --git a/paddle/tcmpt/core/kernel_utils.h b/paddle/tcmpt/core/kernel_utils.h
index 05503dbd36116..7059d85ea39fb 100644
--- a/paddle/tcmpt/core/kernel_utils.h
+++ b/paddle/tcmpt/core/kernel_utils.h
@@ -17,7 +17,6 @@
 #include "paddle/tcmpt/core/dense_tensor.h"
 #include "paddle/tcmpt/core/kernel_context.h"
 #include "paddle/tcmpt/core/kernel_def.h"
-#include "paddle/tcmpt/core/selected_rows_tensor.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
@@ -152,7 +151,8 @@ struct KernelImpl<Return (*)(Args...), kernel_fn> {
   /* Input Helpers */
 
   PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(DenseTensor);
-  PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SelectedRowsTensor);
+  // TODO(chenweihang): adapt SelectedRows
+  // PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SelectedRowsTensor);
 
   /* Attribute Helpers */
 
@@ -166,7 +166,8 @@ struct KernelImpl<Return (*)(Args...), kernel_fn> {
   /* Output Helpers */
 
   PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(DenseTensor);
-  PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SelectedRowsTensor);
+  // TODO(chenweihang): adapt SelectedRows
+  // PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SelectedRowsTensor);
 
   /* End case */
   template <typename T>
diff --git a/paddle/tcmpt/core/scalar_tensor.h b/paddle/tcmpt/core/scalar_tensor.h
deleted file mode 100644
index 0ae0b768cfa11..0000000000000
--- a/paddle/tcmpt/core/scalar_tensor.h
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/tcmpt/core/dense_tensor.h"
-
-class LoDTensor : public DenseTensor {};
diff --git a/paddle/tcmpt/core/selected_rows_tensor.cc b/paddle/tcmpt/core/selected_rows_tensor.cc
deleted file mode 100644
index 65a544009d20f..0000000000000
--- a/paddle/tcmpt/core/selected_rows_tensor.cc
+++ /dev/null
@@ -1,17 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/tcmpt/core/selected_rows_tensor.h"
-
-namespace pt {}  // namespace pt
diff --git a/paddle/tcmpt/core/selected_rows_tensor.h b/paddle/tcmpt/core/selected_rows_tensor.h
deleted file mode 100644
index 3d03c891395f6..0000000000000
--- a/paddle/tcmpt/core/selected_rows_tensor.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <algorithm>
-#include <memory>
-#include <mutex>  // NOLINT
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-#include "paddle/tcmpt/core/dense_tensor.h"
-#include "paddle/tcmpt/core/tensor_interface.h"
-
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/framework/mixed_vector.h"
-#include "paddle/fluid/framework/rw_lock.h"
-
-namespace pt {
-
-template <typename T>
-using Vector = paddle::framework::Vector<T>;
-using RWLock = paddle::framework::RWLock;
-
-/**
- * SelectedRowsTensor: compatible with SelectedRows in fluid and related
- * operators.
- *
- * SelectedRowsTensor is not a typical design of sparse Tensor, and may
- * no longer be recommended for use in the future, and there may be new
- * SparseTensor later.
- */
-
-// TODO(chenweihang): add other methods later
-
-class SelectedRowsTensor : public TensorInterface {
- public:
-  SelectedRowsTensor() = delete;
-
-  // SelectedRowsTensor(const SelectedRowsTensor&) = delete;
-  // SelectedRowsTensor& operator=(const SelectedRowsTensor&) = delete;
-  SelectedRowsTensor(SelectedRowsTensor&&) = delete;
-  SelectedRowsTensor& operator=(SelectedRowsTensor&&) = delete;
-
-  SelectedRowsTensor(const TensorMeta& meta,
-                     const TensorStatus& status,
-                     const std::vector<int64_t>& rows,
-                     int64_t height) {
-    value_.reset(new DenseTensor(meta, status));
-    rows_ = rows;
-    height_ = height;
-  }
-
-  ~SelectedRowsTensor() override {}
-
-  int64_t numel() const override { return value_->numel(); }
-
-  DDim dims() const override {
-    std::vector<int64_t> dims = vectorize(value_->dims());
-    dims[0] = height_;
-    return paddle::framework::make_ddim(dims);
-  }
-
-  DataType type() const override { return value_->type(); }
-
-  DataLayout layout() const override { return value_->layout(); }
-
-  Place place() const override { return value_->place(); }
-
-  Backend backend() const override { return value_->backend(); }
-
-  bool initialized() const override { return value_->initialized(); }
-
-  const DenseTensor& value() const { return *value_; }
-
-  DenseTensor* mutable_value() { return value_.get(); }
-
-  const Vector<int64_t>& rows() const { return rows_; }
-
-  Vector<int64_t>* mutable_rows() { return &rows_; }
-
-  void set_rows(const Vector<int64_t>& rows) { rows_ = rows; }
-
-  int64_t height() const { return height_; }
-
-  void set_height(int64_t height) { height_ = height; }
-
- private:
-  std::unique_ptr<DenseTensor> value_{nullptr};
-
-  Vector<int64_t> rows_;
-  int64_t height_;
-
-  std::unordered_map<int64_t, int64_t> id_to_index_;
-  std::unique_ptr<RWLock> rwlock_{nullptr};
-};
-
-}  // namespace pt
diff --git a/paddle/tcmpt/cpu/math.cc b/paddle/tcmpt/cpu/math.cc
index 1c27c9e53005c..80dec2530f718 100644
--- a/paddle/tcmpt/cpu/math.cc
+++ b/paddle/tcmpt/cpu/math.cc
@@ -44,19 +44,6 @@ void Scale(const CPUContext& dev_ctx,
   eigen::Scale<CPUContext, T>(dev_ctx, x, scale, bias, bias_after_scale, out);
 }
 
-template <typename T>
-void ScaleSelectedRows(const CPUContext& dev_ctx,
-                       const SelectedRowsTensor& x,
-                       float scale,
-                       float bias,
-                       bool bias_after_scale,
-                       SelectedRowsTensor* out) {
-  out->set_rows(x.rows());
-  out->set_height(x.height());
-  Scale<T>(
-      dev_ctx, x.value(), scale, bias, bias_after_scale, out->mutable_value());
-}
-
 // TODO(chenweihang): now the ScaleTensor's dtype are same as x, so we cannot
 // register its dtype def
 template <typename T>
@@ -74,23 +61,6 @@ void ScaleHost(const CPUContext& dev_ctx,
                               out);
 }
 
-template <typename T>
-void ScaleSelectedRowsHost(const CPUContext& dev_ctx,
-                           const SelectedRowsTensor& x,
-                           const DenseTensor& scale,
-                           float bias,
-                           bool bias_after_scale,
-                           SelectedRowsTensor* out) {
-  out->set_rows(x.rows());
-  out->set_height(x.height());
-  Scale<T>(dev_ctx,
-           x.value(),
-           static_cast<float>(*scale.data<T>()),
-           bias,
-           bias_after_scale,
-           out->mutable_value());
-}
-
 }  // namespace pt
 
 // TODO(chenweihang): replace by better impl
@@ -113,18 +83,6 @@ PT_REGISTER_KERNEL("scale",
                    int16_t,
                    int,
                    int64_t) {}
-PT_REGISTER_KERNEL("scale.sr",
-                   CPU,
-                   NCHW,
-                   pt::ScaleSelectedRows,
-                   float,
-                   double,
-                   paddle::platform::bfloat16,
-                   uint8_t,
-                   int8_t,
-                   int16_t,
-                   int,
-                   int64_t) {}
 PT_REGISTER_KERNEL("scale.host",
                    CPU,
                    NCHW,
@@ -139,17 +97,3 @@ PT_REGISTER_KERNEL("scale.host",
                    int64_t) {
   kernel->InputAt(1).SetBackend(pt::Backend::kCPU);
 }
-PT_REGISTER_KERNEL("scale.sr.host",
-                   CPU,
-                   NCHW,
-                   pt::ScaleSelectedRowsHost,
-                   float,
-                   double,
-                   paddle::platform::bfloat16,
-                   uint8_t,
-                   int8_t,
-                   int16_t,
-                   int,
-                   int64_t) {
-  kernel->InputAt(1).SetBackend(pt::Backend::kCPU);
-}
diff --git a/paddle/tcmpt/cpu/math.h b/paddle/tcmpt/cpu/math.h
index e0694beafe4d5..3fb669b084095 100644
--- a/paddle/tcmpt/cpu/math.h
+++ b/paddle/tcmpt/cpu/math.h
@@ -16,7 +16,6 @@ limitations under the License. */
 
 #include "paddle/tcmpt/core/dense_tensor.h"
 #include "paddle/tcmpt/core/kernel_registry.h"
-#include "paddle/tcmpt/core/selected_rows_tensor.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
@@ -39,14 +38,6 @@ void Scale(const CPUContext& dev_ctx,
            bool bias_after_scale,
            DenseTensor* out);
 
-template <typename T>
-void ScaleSelectedRows(const CPUContext& dev_ctx,
-                       const SelectedRowsTensor& x,
-                       float scale,
-                       float bias,
-                       bool bias_after_scale,
-                       SelectedRowsTensor* out);
-
 template <typename T>
 void ScaleHost(const CPUContext& dev_ctx,
                const DenseTensor& x,
@@ -55,12 +46,4 @@ void ScaleHost(const CPUContext& dev_ctx,
                bool bias_after_scale,
                DenseTensor* out);
 
-template <typename T>
-void ScaleSelectedRowsHost(const CPUContext& dev_ctx,
-                           const SelectedRowsTensor& x,
-                           const DenseTensor& scale,
-                           float bias,
-                           bool bias_after_scale,
-                           SelectedRowsTensor* out);
-
 }  // namespace pt
diff --git a/paddle/tcmpt/cuda/math.cu b/paddle/tcmpt/cuda/math.cu
index 15aa8c6966977..293f0cf8bfc91 100644
--- a/paddle/tcmpt/cuda/math.cu
+++ b/paddle/tcmpt/cuda/math.cu
@@ -97,19 +97,6 @@ void Scale(const CUDAContext& dev_ctx,
   eigen::Scale<CUDAContext, T>(dev_ctx, x, scale, bias, bias_after_scale, out);
 }
 
-template <typename T>
-void ScaleSelectedRows(const CUDAContext& dev_ctx,
-                       const SelectedRowsTensor& x,
-                       float scale,
-                       float bias,
-                       bool bias_after_scale,
-                       SelectedRowsTensor* out) {
-  out->set_rows(x.rows());
-  out->set_height(x.height());
-  Scale<T>(
-      dev_ctx, x.value(), scale, bias, bias_after_scale, out->mutable_value());
-}
-
 template <typename T>
 void ScaleHost(const CUDAContext& dev_ctx,
                const DenseTensor& x,
@@ -128,23 +115,6 @@ void ScaleHost(const CUDAContext& dev_ctx,
                                out);
 }
 
-template <typename T>
-void ScaleSelectedRowsHost(const CUDAContext& dev_ctx,
-                           const SelectedRowsTensor& x,
-                           const DenseTensor& scale,
-                           float bias,
-                           bool bias_after_scale,
-                           SelectedRowsTensor* out) {
-  out->set_rows(x.rows());
-  out->set_height(x.height());
-  Scale<T>(dev_ctx,
-           x.value(),
-           static_cast<float>(*scale.data<T>()),
-           bias,
-           bias_after_scale,
-           out->mutable_value());
-}
-
 }  // namespace pt
 
 // TODO(chenweihang): replace by better impl
@@ -165,18 +135,6 @@ PT_REGISTER_KERNEL("scale",
                    int16_t,
                    int,
                    int64_t) {}
-PT_REGISTER_KERNEL("scale.sr",
-                   CUDA,
-                   NCHW,
-                   pt::ScaleSelectedRows,
-                   float,
-                   double,
-                   float16,
-                   uint8_t,
-                   int8_t,
-                   int16_t,
-                   int,
-                   int64_t) {}
 PT_REGISTER_KERNEL("scale.host",
                    CUDA,
                    NCHW,
@@ -191,17 +149,3 @@ PT_REGISTER_KERNEL("scale.host",
                    int64_t) {
   kernel->InputAt(1).SetBackend(pt::Backend::kCPU);
 }
-PT_REGISTER_KERNEL("scale.sr.host",
-                   CUDA,
-                   NCHW,
-                   pt::ScaleSelectedRowsHost,
-                   float,
-                   double,
-                   float16,
-                   uint8_t,
-                   int8_t,
-                   int16_t,
-                   int,
-                   int64_t) {
-  kernel->InputAt(1).SetBackend(pt::Backend::kCPU);
-}
diff --git a/paddle/tcmpt/cuda/math.h b/paddle/tcmpt/cuda/math.h
index 282803a54a292..dc8221d6345d6 100644
--- a/paddle/tcmpt/cuda/math.h
+++ b/paddle/tcmpt/cuda/math.h
@@ -18,7 +18,6 @@ limitations under the License. */
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 
 #include "paddle/tcmpt/core/dense_tensor.h"
-#include "paddle/tcmpt/core/selected_rows_tensor.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
@@ -41,14 +40,6 @@ void Scale(const CUDAContext& dev_ctx,
            bool bias_after_scale,
            DenseTensor* out);
 
-template <typename T>
-void ScaleSelectedRows(const CUDAContext& dev_ctx,
-                       const SelectedRowsTensor& x,
-                       float scale,
-                       float bias,
-                       bool bias_after_scale,
-                       SelectedRowsTensor* out);
-
 template <typename T>
 void ScaleHost(const CUDAContext& dev_ctx,
                const DenseTensor& x,
@@ -57,14 +48,6 @@ void ScaleHost(const CUDAContext& dev_ctx,
                bool bias_after_scale,
                DenseTensor* out);
 
-template <typename T>
-void ScaleSelectedRowsHost(const CUDAContext& dev_ctx,
-                           const SelectedRowsTensor& x,
-                           const DenseTensor& scale,
-                           float bias,
-                           bool bias_after_scale,
-                           SelectedRowsTensor* out);
-
 }  // namespace pt
 
 #endif

From 073aef32b6ab346d75982139afd8a66d62fb57d1 Mon Sep 17 00:00:00 2001
From: zyfncg <1370305206@qq.com>
Date: Thu, 14 Oct 2021 16:30:58 +0800
Subject: [PATCH 079/125] Support Scalar in Tensor Compute Library (#14)

* fill_any_like kernel refactor

* remove useless code of full_like c++ api

* Support Scalar in Tensor Compute Library

* add scalar in dygraph and static graph mode

* keep the basic type for attr, instead of using scalar for all

* merge the code
---
 paddle/fluid/framework/operator.cc           | 50 ++++++++++----
 paddle/fluid/imperative/prepared_operator.cc | 49 ++++++++++----
 paddle/tcmpt/api/include/core.h              |  1 +
 paddle/tcmpt/core/kernel_utils.h             |  2 +
 paddle/tcmpt/core/scalar.h                   | 63 ++++++++++++++++++
 paddle/tcmpt/cpu/creation.cc                 |  8 +--
 paddle/tcmpt/cpu/creation.h                  |  3 +-
 paddle/tcmpt/cuda/creation.cu                |  8 +--
 paddle/tcmpt/cuda/creation.h                 |  3 +-
 paddle/tcmpt/hapi/include/creation.h         | 10 ++-
 paddle/tcmpt/hapi/lib/creation.cc            | 14 +++-
 paddle/tcmpt/tests/test_fill_api.cc          | 68 +++++++++++++++++++-
 12 files changed, 235 insertions(+), 44 deletions(-)
 create mode 100644 paddle/tcmpt/core/scalar.h

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index eb1889ae1d8ef..213c7451b43dd 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1959,27 +1959,51 @@ pt::KernelContext OperatorWithKernel::ConstructPtKernelContext(
     op_kernel_ctx.EmplaceBackOutputs(tmp_outputs);
   }
 
-  for (size_t i = 0; i < attr_pairs.size(); ++i) {
-    // TODO(chenweihang): support other attrs
-    // In principle, the attr required by the dynamic mode should be
-    // passed in from the Python side, and there is no need to look up
-    // from the default_map, but now this nor work
-    switch (attr_pairs[i].second) {
-      case framework::proto::AttrType::INT:
+  for (size_t i = 0; i < attr_defs.size(); ++i) {
+    paddle::any attr_item;
+    if (attr_defs[i].type_index == std::type_index(typeid(pt::Scalar))) {
+      // TODO(chenweihang): support other attrs
+      // In principle, the attr required by the dynamic mode should be
+      // passed in from the Python side, and there is no need to look up
+      // from the default_map, but now this nor work
+      switch (attr_pairs[i].second) {
+        case framework::proto::AttrType::INT:
+          op_kernel_ctx.EmplaceBackAttr(
+              pt::Scalar(Attr<int>(attr_pairs[i].first)));
+          break;
+        case framework::proto::AttrType::FLOAT:
+          op_kernel_ctx.EmplaceBackAttr(
+              pt::Scalar(Attr<float>(attr_pairs[i].first)));
+          break;
+        case framework::proto::AttrType::BOOLEAN:
+          op_kernel_ctx.EmplaceBackAttr(
+              pt::Scalar(Attr<double>(attr_pairs[i].first)));
+          break;
+        default:
+          // TODO(chenweihang): support other attrs type
+          PADDLE_THROW(platform::errors::Unimplemented(
+              "unsupported cast op attribute `%s` when construct "
+              "KernelContext.",
+              attr_pairs[i].first));
+      }
+    } else {
+      // TODO(chenweihang): support other attrs
+      // In principle, the attr required by the dynamic mode should be
+      // passed in from the Python side, and there is no need to look up
+      // from the default_map, but now this nor work
+      if (attr_defs[i].type_index == std::type_index(typeid(int))) {
         op_kernel_ctx.EmplaceBackAttr(Attr<int>(attr_pairs[i].first));
-        break;
-      case framework::proto::AttrType::FLOAT:
+      } else if (attr_defs[i].type_index == std::type_index(typeid(float))) {
         op_kernel_ctx.EmplaceBackAttr(Attr<float>(attr_pairs[i].first));
-        break;
-      case framework::proto::AttrType::BOOLEAN:
+      } else if (attr_defs[i].type_index == std::type_index(typeid(bool))) {
         op_kernel_ctx.EmplaceBackAttr(Attr<bool>(attr_pairs[i].first));
-        break;
-      default:
+      } else {
         // TODO(chenweihang): support other attrs type
         PADDLE_THROW(platform::errors::Unimplemented(
             "unsupported cast op attribute `%s` when construct "
             "KernelContext.",
             attr_pairs[i].first));
+      }
     }
   }
 
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 645343316a5b9..6fcb3641ee7b0 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -365,30 +365,53 @@ static pt::KernelContext BuildDygraphKernelContext(
     op_kernel_ctx.EmplaceBackOutputs(tmp_outputs);
   }
 
-  for (size_t i = 0; i < attr_pairs.size(); ++i) {
-    // TODO(chenweihang): support other attrs
-    // In principle, the attr required by the dynamic mode should be
-    // passed in from the Python side, and there is no need to look up
-    // from the default_map, but now this nor work
-    switch (attr_pairs[i].second) {
-      case framework::proto::AttrType::INT:
+  for (size_t i = 0; i < attr_defs.size(); ++i) {
+    if (attr_defs[i].type_index == std::type_index(typeid(pt::Scalar))) {
+      // TODO(chenweihang): support other attrs
+      // In principle, the attr required by the dynamic mode should be
+      // passed in from the Python side, and there is no need to look up
+      // from the default_map, but now this nor work
+      switch (attr_pairs[i].second) {
+        case framework::proto::AttrType::INT:
+          op_kernel_ctx.EmplaceBackAttr(pt::Scalar(
+              GetAttr<int>(attrs, default_attrs, attr_pairs[i].first)));
+          break;
+        case framework::proto::AttrType::FLOAT:
+          op_kernel_ctx.EmplaceBackAttr(pt::Scalar(
+              GetAttr<float>(attrs, default_attrs, attr_pairs[i].first)));
+          break;
+        case framework::proto::AttrType::BOOLEAN:
+          op_kernel_ctx.EmplaceBackAttr(pt::Scalar(
+              GetAttr<bool>(attrs, default_attrs, attr_pairs[i].first)));
+          break;
+        default:
+          // TODO(chenweihang): support other attrs type
+          PADDLE_THROW(platform::errors::Unimplemented(
+              "unsupported cast op attribute `%s` when construct "
+              "KernelContext.",
+              attr_pairs[i].first));
+      }
+    } else {
+      // TODO(chenweihang): support other attrs
+      // In principle, the attr required by the dynamic mode should be
+      // passed in from the Python side, and there is no need to look up
+      // from the default_map, but now this nor work
+      if (attr_defs[i].type_index == std::type_index(typeid(int))) {
         op_kernel_ctx.EmplaceBackAttr(
             GetAttr<int>(attrs, default_attrs, attr_pairs[i].first));
-        break;
-      case framework::proto::AttrType::FLOAT:
+      } else if (attr_defs[i].type_index == std::type_index(typeid(float))) {
         op_kernel_ctx.EmplaceBackAttr(
             GetAttr<float>(attrs, default_attrs, attr_pairs[i].first));
-        break;
-      case framework::proto::AttrType::BOOLEAN:
+      } else if (attr_defs[i].type_index == std::type_index(typeid(bool))) {
         op_kernel_ctx.EmplaceBackAttr(
             GetAttr<bool>(attrs, default_attrs, attr_pairs[i].first));
-        break;
-      default:
+      } else {
         // TODO(chenweihang): support other attrs type
         PADDLE_THROW(platform::errors::Unimplemented(
             "unsupported cast op attribute `%s` when construct "
             "KernelContext.",
             attr_pairs[i].first));
+      }
     }
   }
 
diff --git a/paddle/tcmpt/api/include/core.h b/paddle/tcmpt/api/include/core.h
index 687dc72bb351f..7e02f600a5e7c 100644
--- a/paddle/tcmpt/api/include/core.h
+++ b/paddle/tcmpt/api/include/core.h
@@ -20,4 +20,5 @@ limitations under the License. */
 #include "paddle/tcmpt/core/kernel_context.h"
 #include "paddle/tcmpt/core/kernel_factory.h"
 #include "paddle/tcmpt/core/mkldnn_dense_tensor.h"
+#include "paddle/tcmpt/core/scalar.h"
 #include "paddle/tcmpt/core/selected_rows_tensor.h"
diff --git a/paddle/tcmpt/core/kernel_utils.h b/paddle/tcmpt/core/kernel_utils.h
index 05503dbd36116..a25c5a71c8c67 100644
--- a/paddle/tcmpt/core/kernel_utils.h
+++ b/paddle/tcmpt/core/kernel_utils.h
@@ -17,6 +17,7 @@
 #include "paddle/tcmpt/core/dense_tensor.h"
 #include "paddle/tcmpt/core/kernel_context.h"
 #include "paddle/tcmpt/core/kernel_def.h"
+#include "paddle/tcmpt/core/scalar.h"
 #include "paddle/tcmpt/core/selected_rows_tensor.h"
 
 // See Note [ Why still include the fluid headers? ]
@@ -162,6 +163,7 @@ struct KernelImpl<Return (*)(Args...), kernel_fn> {
   PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int);
   PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int64_t);
   PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(paddle::platform::float16);
+  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const pt::Scalar&);
 
   /* Output Helpers */
 
diff --git a/paddle/tcmpt/core/scalar.h b/paddle/tcmpt/core/scalar.h
new file mode 100644
index 0000000000000..8f30d81bcfb28
--- /dev/null
+++ b/paddle/tcmpt/core/scalar.h
@@ -0,0 +1,63 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+namespace pt {
+
+class Scalar {
+ public:
+  // Constructor support implicit
+  Scalar(float val) : tag(Tag::HAS_F) { data_.f = val; }  // NOLINT
+
+  Scalar(double val) : tag(Tag::HAS_D) { data_.d = val; }  // NOLINT
+
+  Scalar(int32_t val) : tag(Tag::HAS_I32) { data_.i32 = val; }  // NOLINT
+
+  Scalar(int64_t val) : tag(Tag::HAS_I64) { data_.i64 = val; }  // NOLINT
+
+  Scalar(bool val) : tag(Tag::HAS_B) { data_.b = val; }  // NOLINT
+
+  template <typename T>
+  inline T to() const {
+    switch (tag) {
+      case Tag::HAS_F:
+        return static_cast<T>(data_.f);
+      case Tag::HAS_D:
+        return static_cast<T>(data_.d);
+      case Tag::HAS_I32:
+        return static_cast<T>(data_.i32);
+      case Tag::HAS_I64:
+        return static_cast<T>(data_.i64);
+      case Tag::HAS_B:
+        return static_cast<T>(data_.b);
+      default:
+        throw std::runtime_error("Invalid Scalar type.");
+    }
+  }
+
+ private:
+  enum class Tag { HAS_F, HAS_D, HAS_I32, HAS_I64, HAS_B };
+  Tag tag;
+
+  union data {
+    float f;
+    double d;
+    int32_t i32;
+    int64_t i64;
+    bool b;
+  } data_;
+};
+
+}  // namespace pt
diff --git a/paddle/tcmpt/cpu/creation.cc b/paddle/tcmpt/cpu/creation.cc
index b117209fd35b0..8e4399c41bf17 100644
--- a/paddle/tcmpt/cpu/creation.cc
+++ b/paddle/tcmpt/cpu/creation.cc
@@ -22,13 +22,9 @@ namespace pt {
 template <typename T>
 void FillAnyLike(const CPUContext& dev_ctx,
                  const DenseTensor& x,
-                 float val,
+                 const Scalar& val,
                  DenseTensor* out) {
-  PADDLE_ENFORCE_EQ(
-      std::isnan(val),
-      false,
-      paddle::platform::errors::InvalidArgument("The filled value is NaN."));
-  eigen::fill<CPUContext, T>(dev_ctx, out, val);
+  eigen::fill<CPUContext, T>(dev_ctx, out, val.to<T>());
 }
 
 }  // namespace pt
diff --git a/paddle/tcmpt/cpu/creation.h b/paddle/tcmpt/cpu/creation.h
index 090112911bbab..2c67945892b82 100644
--- a/paddle/tcmpt/cpu/creation.h
+++ b/paddle/tcmpt/cpu/creation.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include "paddle/tcmpt/core/dense_tensor.h"
+#include "paddle/tcmpt/core/scalar.h"
 
 #include "paddle/fluid/platform/device_context.h"
 
@@ -25,7 +26,7 @@ using CPUContext = paddle::platform::CPUDeviceContext;
 template <typename T>
 void FillAnyLike(const CPUContext& dev_ctx,
                  const DenseTensor& x,
-                 float val,
+                 const Scalar& val,
                  DenseTensor* out);
 
 }  // namespace pt
diff --git a/paddle/tcmpt/cuda/creation.cu b/paddle/tcmpt/cuda/creation.cu
index 07fc5ee5f9b2b..cca9199b76cfd 100644
--- a/paddle/tcmpt/cuda/creation.cu
+++ b/paddle/tcmpt/cuda/creation.cu
@@ -22,13 +22,9 @@ namespace pt {
 template <typename T>
 void FillAnyLike(const CUDAContext& dev_ctx,
                  const DenseTensor& x,
-                 float val,
+                 const Scalar& val,
                  DenseTensor* out) {
-  PADDLE_ENFORCE_EQ(
-      std::isnan(val),
-      false,
-      paddle::platform::errors::InvalidArgument("The filled value is NaN."));
-  eigen::fill<CUDAContext, T>(dev_ctx, out, val);
+  eigen::fill<CUDAContext, T>(dev_ctx, out, val.to<T>());
 }
 
 }  // namespace pt
diff --git a/paddle/tcmpt/cuda/creation.h b/paddle/tcmpt/cuda/creation.h
index ff26ca11ca2a5..7de9ce1371fff 100644
--- a/paddle/tcmpt/cuda/creation.h
+++ b/paddle/tcmpt/cuda/creation.h
@@ -18,6 +18,7 @@
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 
 #include "paddle/tcmpt/core/dense_tensor.h"
+#include "paddle/tcmpt/core/scalar.h"
 
 #include "paddle/fluid/platform/device_context.h"
 
@@ -28,7 +29,7 @@ using CUDAContext = paddle::platform::CUDADeviceContext;
 template <typename T>
 void FillAnyLike(const CUDAContext& dev_ctx,
                  const DenseTensor& x,
-                 float val,
+                 const Scalar& val,
                  DenseTensor* out);
 
 }  // namespace pt
diff --git a/paddle/tcmpt/hapi/include/creation.h b/paddle/tcmpt/hapi/include/creation.h
index 98044636b12bb..f502adb2e2472 100644
--- a/paddle/tcmpt/hapi/include/creation.h
+++ b/paddle/tcmpt/hapi/include/creation.h
@@ -14,12 +14,20 @@
 
 #pragma once
 
+#include "paddle/tcmpt/core/dtype.h"
+#include "paddle/tcmpt/core/scalar.h"
 #include "paddle/tcmpt/hapi/include/tensor.h"
 
 namespace paddle {
 namespace experimental {
 
-Tensor full_like(const Tensor& x, float value);
+Tensor full_like(const Tensor& x,
+                 const pt::Scalar& value,
+                 pt::DataType dtype = pt::DataType::kUndef);
+
+Tensor ones_like(const Tensor& x, pt::DataType dtype = pt::DataType::kUndef);
+
+Tensor zeros_like(const Tensor& x, pt::DataType dtype = pt::DataType::kUndef);
 
 }  // namespace experimental
 }  // namespace paddle
diff --git a/paddle/tcmpt/hapi/lib/creation.cc b/paddle/tcmpt/hapi/lib/creation.cc
index e182a496df262..87fdd204dadd5 100644
--- a/paddle/tcmpt/hapi/lib/creation.cc
+++ b/paddle/tcmpt/hapi/lib/creation.cc
@@ -25,7 +25,7 @@ limitations under the License. */
 namespace paddle {
 namespace experimental {
 
-Tensor full_like(const Tensor& x, float value) {
+Tensor full_like(const Tensor& x, const pt::Scalar& value, pt::DataType dtype) {
   // 1. Get kernel signature and kernel
   auto kernel_signature = ParseKernelNameAndKeyByArgs("fill_any_like", x);
   VLOG(1) << kernel_signature.first;
@@ -52,6 +52,10 @@ Tensor full_like(const Tensor& x, float value) {
   // 5. Prepare outputs
   Tensor out;
   auto out_def = kernel.args_def().output_defs()[0];
+  // InferDataType
+  if (dtype != pt::DataType::kUndef) {
+    out_def.SetDataType(dtype);
+  }
   auto dense_out = std::make_shared<pt::DenseTensor>(
       pt::TensorMeta(out_dims, out_def.backend, out_def.dtype, out_def.layout),
       pt::TensorStatus());
@@ -64,5 +68,13 @@ Tensor full_like(const Tensor& x, float value) {
   return out;
 }
 
+Tensor ones_like(const Tensor& x, pt::DataType dtype) {
+  return full_like(x, 1, dtype);
+}
+
+Tensor zeros_like(const Tensor& x, pt::DataType dtype) {
+  return full_like(x, 0, dtype);
+}
+
 }  // namespace experimental
 }  // namespace paddle
diff --git a/paddle/tcmpt/tests/test_fill_api.cc b/paddle/tcmpt/tests/test_fill_api.cc
index 39a23a44bfa59..0ed7248604654 100644
--- a/paddle/tcmpt/tests/test_fill_api.cc
+++ b/paddle/tcmpt/tests/test_fill_api.cc
@@ -29,7 +29,7 @@ PT_DECLARE_MODULE(CreationCUDA);
 namespace framework = paddle::framework;
 using DDim = paddle::framework::DDim;
 
-TEST(API, fill) {
+TEST(API, full_like) {
   // 1. create tensor
   auto dense_x = std::make_shared<pt::DenseTensor>(
       pt::TensorMeta(framework::make_ddim({3, 2}),
@@ -45,7 +45,7 @@ TEST(API, fill) {
   paddle::experimental::Tensor x(dense_x);
 
   // 2. test API
-  auto out = paddle::experimental::full_like(x, val);
+  auto out = paddle::experimental::full_like(x, val, pt::DataType::kFLOAT32);
 
   // 3. check result
   ASSERT_EQ(out.shape().size(), 2);
@@ -62,3 +62,67 @@ TEST(API, fill) {
     ASSERT_NEAR(actual_result[i], val, 1e-6f);
   }
 }
+
+TEST(API, zeros_like) {
+  // 1. create tensor
+  auto dense_x = std::make_shared<pt::DenseTensor>(
+      pt::TensorMeta(framework::make_ddim({3, 2}),
+                     pt::Backend::kCPU,
+                     pt::DataType::kFLOAT32,
+                     pt::DataLayout::kNCHW),
+      pt::TensorStatus());
+  auto* dense_x_data = dense_x->mutable_data<float>();
+  dense_x_data[0] = 1;
+
+  paddle::experimental::Tensor x(dense_x);
+
+  // 2. test API
+  auto out = paddle::experimental::zeros_like(x, pt::DataType::kFLOAT32);
+
+  // 3. check result
+  ASSERT_EQ(out.shape().size(), 2);
+  ASSERT_EQ(out.shape()[0], 3);
+  ASSERT_EQ(out.numel(), 6);
+  ASSERT_EQ(out.is_cpu(), true);
+  ASSERT_EQ(out.type(), pt::DataType::kFLOAT32);
+  ASSERT_EQ(out.layout(), pt::DataLayout::kNCHW);
+  ASSERT_EQ(out.initialized(), true);
+
+  auto dense_out = std::dynamic_pointer_cast<pt::DenseTensor>(out.impl());
+  auto* actual_result = dense_out->data<float>();
+  for (auto i = 0; i < 6; i++) {
+    ASSERT_NEAR(actual_result[i], 0, 1e-6f);
+  }
+}
+
+TEST(API, ones_like) {
+  // 1. create tensor
+  auto dense_x = std::make_shared<pt::DenseTensor>(
+      pt::TensorMeta(framework::make_ddim({3, 2}),
+                     pt::Backend::kCPU,
+                     pt::DataType::kFLOAT32,
+                     pt::DataLayout::kNCHW),
+      pt::TensorStatus());
+  auto* dense_x_data = dense_x->mutable_data<float>();
+  dense_x_data[0] = 0;
+
+  paddle::experimental::Tensor x(dense_x);
+
+  // 2. test API
+  auto out = paddle::experimental::ones_like(x, pt::DataType::kINT32);
+
+  // 3. check result
+  ASSERT_EQ(out.shape().size(), 2);
+  ASSERT_EQ(out.shape()[0], 3);
+  ASSERT_EQ(out.numel(), 6);
+  ASSERT_EQ(out.is_cpu(), true);
+  ASSERT_EQ(out.type(), pt::DataType::kINT32);
+  ASSERT_EQ(out.layout(), pt::DataLayout::kNCHW);
+  ASSERT_EQ(out.initialized(), true);
+
+  auto dense_out = std::dynamic_pointer_cast<pt::DenseTensor>(out.impl());
+  auto* actual_result = dense_out->data<float>();
+  for (auto i = 0; i < 6; i++) {
+    ASSERT_EQ(actual_result[i], 1);
+  }
+}

From 3f5f789ed8e2f64c83c672f5ec842332879f1c04 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Thu, 14 Oct 2021 12:32:29 +0000
Subject: [PATCH 080/125] remove mkldnn tensor & polish details

---
 cmake/generic.cmake                           |   2 +-
 cmake/tcmpt.cmake                             |   9 +-
 paddle/fluid/framework/eigen.h                |  44 -----
 ...est_reference_count_pass_last_lived_ops.cc |   2 +-
 paddle/fluid/framework/operator.cc            |  15 --
 paddle/fluid/framework/tcmpt_utils.cc         |  38 +---
 paddle/fluid/framework/type_defs.h            |   2 -
 paddle/fluid/imperative/prepared_operator.cc  |  15 --
 .../pscore/heter_listen_and_server_test.cc    |   2 +-
 .../operators/pscore/heter_server_test.cc     |   2 +-
 paddle/fluid/operators/scale_op_xpu.cc        |   1 -
 paddle/fluid/operators/sign_op.cc             |   3 +-
 paddle/tcmpt/api/include/core.h               |   1 -
 paddle/tcmpt/core/mkldnn_dense_tensor.h       |  56 ------
 paddle/tcmpt/cpu/CMakeLists.txt               |   1 +
 paddle/tcmpt/cuda/CMakeLists.txt              |   1 +
 paddle/tcmpt/cuda/linalg.cu                   |  20 +--
 paddle/tcmpt/eigen/common.h                   | 170 ++++++++++++++++++
 paddle/tcmpt/eigen/dot.h                      |  50 ++++++
 paddle/tcmpt/eigen/fill.h                     |   5 +-
 paddle/tcmpt/eigen/mean.h                     |   6 +-
 paddle/tcmpt/eigen/scale.h                    |   6 +-
 paddle/tcmpt/eigen/sign.h                     |   6 +-
 23 files changed, 249 insertions(+), 208 deletions(-)
 delete mode 100644 paddle/tcmpt/core/mkldnn_dense_tensor.h
 create mode 100644 paddle/tcmpt/eigen/common.h
 create mode 100644 paddle/tcmpt/eigen/dot.h

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 7390bd17e386e..12b4530a77a4c 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -117,7 +117,7 @@ function(find_fluid_modules TARGET_NAME)
 endfunction(find_fluid_modules)
 
 set_property(GLOBAL PROPERTY TCMPT_MODULES "")
-# find all top modules is used for paddle static library
+# find all tcmpt modules is used for paddle static library
 # for building inference libs
 function(find_tcmpt_modules TARGET_NAME)
   get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
diff --git a/cmake/tcmpt.cmake b/cmake/tcmpt.cmake
index 3ffc168c6bed0..819cd42287974 100644
--- a/cmake/tcmpt.cmake
+++ b/cmake/tcmpt.cmake
@@ -1,4 +1,10 @@
-# TODO(chenweihang): keep message comment for debuging, remove it if needless
+# `kernel_instantiate` functionis used to declare the template instantiation of
+# the Kernel function generated through code analysis, only for windows
+# (because the windows platform msvc compiler cannot automatically instantiate
+# the template function through decltype)
+# TODO(chenweihang): keep message comment for debuging, it is still useful,
+# I will remove it if needless later
+
 function(kernel_instantiate TARGET)
     set(target_file ${CURRENT_BINARY_DIR}/${TARGET}.tmp CACHE INTERNAL "${CURRENT_BINARY_DIR}/${TARGET} file")
     set(target_file_final ${CURRENT_BINARY_DIR}/${TARGET})
@@ -36,7 +42,6 @@ function(kernel_instantiate TARGET)
     endforeach()
     # message(STATUS "INST CONTENT: ${instantiate_context}")
     file(APPEND ${target_file} "${instantiate_context}\n")
-    # copy_if_different(${target_file} ${target_file_final})
     string(REPLACE "." "_" cmd_name ${TARGET})
     # this is a dummy target for custom command, should always be run firstly to update ${target_file_final}
     # TODO(chenweihang): nameing rule need to enchance
diff --git a/paddle/fluid/framework/eigen.h b/paddle/fluid/framework/eigen.h
index 56843b9aa6853..a6abda8a83bc8 100644
--- a/paddle/fluid/framework/eigen.h
+++ b/paddle/fluid/framework/eigen.h
@@ -19,8 +19,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "unsupported/Eigen/CXX11/Tensor"
 
-#include "paddle/tcmpt/core/dense_tensor.h"
-
 namespace paddle {
 namespace framework {
 
@@ -69,28 +67,6 @@ struct EigenTensor {
   static ConstType From(const Tensor& tensor) {
     return From(tensor, tensor.dims_);
   }
-
-  // for pt::DenseTensor
-  static Type From(pt::DenseTensor& tensor, DDim dims) {  // NOLINT
-    // why tensor.data<T>() not work?
-    // return Type(const_cast<T*>(reinterpret_cast<const T*>(tensor.data())),
-    // EigenDim<D>::From(dims));
-    return Type(const_cast<T*>(tensor.data<T>()), EigenDim<D>::From(dims));
-  }
-
-  static Type From(pt::DenseTensor& tensor) {  // NOLINT
-    return From(tensor, tensor.dims());
-  }  // NOLINT
-
-  static ConstType From(const pt::DenseTensor& tensor, DDim dims) {
-    // return ConstType(reinterpret_cast<const T*>(tensor.data()),
-    // EigenDim<D>::From(dims));
-    return ConstType(tensor.data<T>(), EigenDim<D>::From(dims));
-  }
-
-  static ConstType From(const pt::DenseTensor& tensor) {
-    return From(tensor, tensor.dims());
-  }
 };
 
 template <typename T, int MajorType = Eigen::RowMajor,
@@ -133,17 +109,6 @@ struct EigenVector : public EigenTensor<T, 1, MajorType, IndexType> {
       const Tensor& tensor) {  // NOLINT
     return EigenVector::From(tensor, {product(tensor.dims_)});
   }
-
-  // for pt::DenseTensor
-  static typename EigenVector::Type Flatten(
-      pt::DenseTensor& tensor) {  // NOLINT
-    return EigenVector::From(tensor, {product(tensor.dims())});
-  }
-
-  static typename EigenVector::ConstType Flatten(
-      const pt::DenseTensor& tensor) {  // NOLINT
-    return EigenVector::From(tensor, {product(tensor.dims())});
-  }
 };
 
 template <typename T, int MajorType = Eigen::RowMajor,
@@ -160,15 +125,6 @@ struct EigenScalar {
   static ConstType From(const Tensor& tensor) {
     return ConstType(tensor.data<T>());
   }
-
-  // for pt::DenseTensor
-  static Type From(pt::DenseTensor& tensor) {  // NOLINT
-    return Type(const_cast<T*>(tensor.data<T>()));
-  }
-
-  static ConstType From(const pt::DenseTensor& tensor) {
-    return ConstType(tensor.data<T>());
-  }
 };
 
 // Define Tensor with 32-bit index.
diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc
index 8cf541637557b..f410171f99896 100644
--- a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc
+++ b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc
@@ -21,7 +21,7 @@
 #include "paddle/fluid/framework/parallel_executor.h"
 #include "paddle/fluid/framework/program_desc.h"
 
-USE_NO_KERNEL_OP(scale);
+USE_OP(scale);
 USE_OP(elementwise_mul);
 USE_OP(elementwise_add);
 USE_OP(elementwise_add_grad);
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 1b0cf462479d2..a47089ecba5cd 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1155,7 +1155,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   // and RCOM backend, the XPU, NPU and MKLDNN will be supported in the second
   // phase
 
-  // VLOG(1) << "Pt KernelFactory: " << pt::KernelFactory::Instance();
   if (FLAGS_use_pt_kernel &&
       pt::KernelFactory::Instance().ContainsKernel(type_.c_str())) {
     if (pt_kernel_key_.get() == nullptr || pt_kernel_.get() == nullptr) {
@@ -1263,17 +1262,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   }
 }
 
-static bool ContainSelectedRows(const VariableValueMap& inputs) {
-  for (auto& var_pair : inputs) {
-    for (auto* var : var_pair.second) {
-      if (var->IsType<SelectedRows>()) {
-        return true;
-      }
-    }
-  }
-  return false;
-}
-
 // TODO(chenweihang): now only check single var input
 static bool IsValidVar(const std::string& name,
                        const VariableValueMap& inputs) {
@@ -1303,9 +1291,6 @@ static pt::KernelName ConstructPtKernelName(const std::string& op_type,
                                             const VariableValueMap& inputs) {
   std::string overload_name;
   // TODO(chenweihang): adapt SelectedRows by xiaowei's design
-  // if (ContainSelectedRows(inputs)) {
-  //   overload_name = pt::kContainSelectedRowsSuffix;
-  // }
   if (ContainHostTensor(op_proto, inputs)) {
     if (overload_name != "") {
       overload_name += ".";
diff --git a/paddle/fluid/framework/tcmpt_utils.cc b/paddle/fluid/framework/tcmpt_utils.cc
index f83f6b593a60d..71ef2d3450ae9 100644
--- a/paddle/fluid/framework/tcmpt_utils.cc
+++ b/paddle/fluid/framework/tcmpt_utils.cc
@@ -13,18 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/framework/tcmpt_utils.h"
+
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/selected_rows.h"
-
 #include "paddle/fluid/framework/variable.h"
-#include "paddle/tcmpt/api/include/core.h"
-#include "paddle/tcmpt/api/include/symbols.h"
 
 namespace paddle {
 namespace framework {
 
 // TODO(chenweihang, shixiaowei): adapt SelectedRows
-
 template <>
 std::shared_ptr<pt::DenseTensor> MakeTensorImpl<pt::DenseTensor, LoDTensor>(
     const LoDTensor& tensor, pt::Backend backend, pt::DataType dtype,
@@ -167,38 +164,5 @@ std::shared_ptr<pt::TensorInterface> OutputVariableToPtTensor(
   return nullptr;
 }
 
-/* For MKLDNNDenseTensor (move this part into a single file later) */
-#ifdef PADDLE_WITH_MKLDNN
-
-template <>
-std::shared_ptr<pt::MKLDNNDenseTensor> MakeTensorImpl<pt::MKLDNNDenseTensor>(
-    const Tensor& tensor, const platform::Place& place,
-    proto::VarType::Type type) {
-  auto holder = tensor.Holder();
-  auto tensor_impl = std::make_shared<pt::MKLDNNDenseTensor>(
-      pt::TensorMeta(tensor.dims(), pt::TransToPtBackend(place),
-                     pt::TransToPtDataType(type),
-                     pt::TransToPtLayout(tensor.layout()), tensor.offset()),
-      pt::TensorStatus());
-
-  if (holder != nullptr) {
-    tensor_impl->ShareAllocation(tensor.Holder());
-  } else {
-    VLOG(1) << "Old MKLDNN Tensor holder is nullptr.";
-  }
-
-  tensor_impl->set_format(tensor.format());
-  return tensor_impl;
-}
-
-template <>
-void ShareTensorImpl(pt::MKLDNNDenseTensor* tensor_impl, Tensor* out) {
-  out->ResetHolderWithType(tensor_impl->allocation(),
-                           pt::TransToProtoVarType(tensor_impl->type()));
-  out->set_format(tensor_impl->format());
-}
-
-#endif
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h
index 9d19d0bce6071..1c5469d02c3ef 100644
--- a/paddle/fluid/framework/type_defs.h
+++ b/paddle/fluid/framework/type_defs.h
@@ -33,7 +33,6 @@ class BlockDesc;
 class Variable;
 class InferNoNeedBufferVarsFN;
 
-// TODO(chenweihang): AttirbuteMap also need to be ordered
 // TODO(panyx0718): Replace vector with something like gtl::Vector.
 using VariableNameMap = std::map<std::string, std::vector<std::string>>;
 using VariableValueMap = std::map<std::string, std::vector<Variable*>>;
@@ -44,7 +43,6 @@ using Attribute = boost::variant<
     std::vector<std::string>, bool, std::vector<bool>, BlockDesc*, int64_t,
     std::vector<BlockDesc*>, std::vector<int64_t>, std::vector<double>>;
 
-// TODO(chenweihang): AttirbuteMap also need to be ordered
 using AttributeMap = std::unordered_map<std::string, Attribute>;
 
 #ifdef PADDLE_WITH_ASCEND_CL
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index c3cda9e8e992c..f7e57bec1da9e 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -137,18 +137,6 @@ static framework::VariableValueMap BuildInputMap(
   return inputs;
 }
 
-template <typename VarType>
-static bool ContainSelectedRows(const NameVarMap<VarType>& inputs) {
-  for (auto& var_pair : inputs) {
-    for (auto& var : var_pair.second) {
-      if (var->Var().template IsType<framework::SelectedRows>()) {
-        return true;
-      }
-    }
-  }
-  return false;
-}
-
 // TODO(chenweihang): enhance rules, not all dispensable inputs
 // are host tensor, now only for scale kernel verify
 template <typename VarType>
@@ -169,9 +157,6 @@ static pt::KernelName ConstructPtKernelName(
     const NameVarMap<VarType>& inputs) {
   std::string overload_name;
   // TODO(chenweihang): adapt SelectedRows by xiaowei's design
-  // if (ContainSelectedRows<VarType>(inputs)) {
-  //   overload_name = pt::kContainSelectedRowsSuffix;
-  // }
   if (ContainHostTensor<VarType>(op_proto, inputs)) {
     if (overload_name != "") {
       overload_name += ".";
diff --git a/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc b/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc
index bbc7f01597900..3b005e10d9b98 100644
--- a/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc
+++ b/paddle/fluid/operators/pscore/heter_listen_and_server_test.cc
@@ -32,7 +32,7 @@ using MultiVarMsg = ::paddle::distributed::MultiVariableMessage;
 using VarMsg = ::paddle::distributed::VariableMessage;
 DECLARE_double(eager_delete_tensor_gb);
 
-USE_NO_KERNEL_OP(scale);
+USE_OP(scale);
 USE_NO_KERNEL_OP(heter_listen_and_serv);
 
 framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) {
diff --git a/paddle/fluid/operators/pscore/heter_server_test.cc b/paddle/fluid/operators/pscore/heter_server_test.cc
index 3e6897073e129..df2eb70b144e4 100644
--- a/paddle/fluid/operators/pscore/heter_server_test.cc
+++ b/paddle/fluid/operators/pscore/heter_server_test.cc
@@ -29,7 +29,7 @@ namespace distributed = paddle::distributed;
 using MultiVarMsg = ::paddle::distributed::MultiVariableMessage;
 using VarMsg = ::paddle::distributed::VariableMessage;
 
-USE_NO_KERNEL_OP(scale);
+USE_OP(scale);
 
 std::shared_ptr<distributed::HeterServer> b_rpc_service;
 
diff --git a/paddle/fluid/operators/scale_op_xpu.cc b/paddle/fluid/operators/scale_op_xpu.cc
index c467f3f89d064..e0dfad91570ad 100644
--- a/paddle/fluid/operators/scale_op_xpu.cc
+++ b/paddle/fluid/operators/scale_op_xpu.cc
@@ -20,7 +20,6 @@ limitations under the License. */
 
 namespace paddle {
 namespace operators {
-
 template <typename DeviceContext, typename T>
 class ScaleXPUKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/fluid/operators/sign_op.cc b/paddle/fluid/operators/sign_op.cc
index a491da3931964..6207c33f9d629 100644
--- a/paddle/fluid/operators/sign_op.cc
+++ b/paddle/fluid/operators/sign_op.cc
@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <memory>
-
 #include "paddle/fluid/operators/sign_op.h"
+#include <memory>
 #include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
diff --git a/paddle/tcmpt/api/include/core.h b/paddle/tcmpt/api/include/core.h
index d6b73dcbee66e..fd863186abb30 100644
--- a/paddle/tcmpt/api/include/core.h
+++ b/paddle/tcmpt/api/include/core.h
@@ -19,5 +19,4 @@ limitations under the License. */
 #include "paddle/tcmpt/core/dense_tensor.h"
 #include "paddle/tcmpt/core/kernel_context.h"
 #include "paddle/tcmpt/core/kernel_factory.h"
-#include "paddle/tcmpt/core/mkldnn_dense_tensor.h"
 #include "paddle/tcmpt/core/scalar.h"
diff --git a/paddle/tcmpt/core/mkldnn_dense_tensor.h b/paddle/tcmpt/core/mkldnn_dense_tensor.h
deleted file mode 100644
index 0aea392fce93d..0000000000000
--- a/paddle/tcmpt/core/mkldnn_dense_tensor.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#ifdef PADDLE_WITH_MKLDNN
-
-#include "mkldnn.hpp"
-
-#include "paddle/tcmpt/core/dense_tensor.h"
-
-namespace pt {
-
-class MKLDNNDenseTensor : public DenseTensor {
- public:
-  // Not allowed to initialize a tensor without descriptive metadata
-  MKLDNNDenseTensor() = delete;
-
-  MKLDNNDenseTensor(const MKLDNNDenseTensor&) = delete;
-  MKLDNNDenseTensor& operator=(const MKLDNNDenseTensor&) = delete;
-  MKLDNNDenseTensor(MKLDNNDenseTensor&&) = delete;
-  MKLDNNDenseTensor& operator=(MKLDNNDenseTensor&&) = delete;
-
-  MKLDNNDenseTensor(const TensorMeta& meta, const TensorStatus& status)
-      : DenseTensor(meta, status) {}
-
-  mkldnn::memory::format_tag format() const { return format_; }
-
-  void set_format(const mkldnn::memory::format_tag format) { format_ = format; }
-
- private:
-  /**
-   * @brief the detail format of memory block which have layout as kMKLDNN
-   *
-   * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C,
-   *       nChw16c, etc. For a MKLDNN memory block, layout will be set as
-   *       DataLayout::kMKLDNN meanwhile detail memory format will be kept in
-   *       this field.
-   */
-  mkldnn::memory::format_tag format_ = mkldnn::memory::format_tag::undef;
-};
-
-}  // namespace pt
-
-#endif
diff --git a/paddle/tcmpt/cpu/CMakeLists.txt b/paddle/tcmpt/cpu/CMakeLists.txt
index fbb0a45266003..3480ebba53155 100644
--- a/paddle/tcmpt/cpu/CMakeLists.txt
+++ b/paddle/tcmpt/cpu/CMakeLists.txt
@@ -1,5 +1,6 @@
 if(WIN32)
     set(CURRENT_BINARY_DIR ${PADDLE_BINARY_DIR}/paddle/tcmpt/cpu)
+    kernel_instantiate(creation.cc)
     kernel_instantiate(math.cc)
     kernel_instantiate(linalg.cc)
 endif()
diff --git a/paddle/tcmpt/cuda/CMakeLists.txt b/paddle/tcmpt/cuda/CMakeLists.txt
index 94de051e2e3a4..458d93529f435 100644
--- a/paddle/tcmpt/cuda/CMakeLists.txt
+++ b/paddle/tcmpt/cuda/CMakeLists.txt
@@ -1,5 +1,6 @@
 if(WIN32)
     set(CURRENT_BINARY_DIR ${PADDLE_BINARY_DIR}/paddle/tcmpt/cuda)
+    kernel_instantiate(creation.cu)
     kernel_instantiate(math.cu)
     kernel_instantiate(linalg.cu)
 endif()
diff --git a/paddle/tcmpt/cuda/linalg.cu b/paddle/tcmpt/cuda/linalg.cu
index acfdf59b27441..118d3326e5fb5 100644
--- a/paddle/tcmpt/cuda/linalg.cu
+++ b/paddle/tcmpt/cuda/linalg.cu
@@ -15,10 +15,9 @@
 #include "paddle/tcmpt/cuda/linalg.h"
 
 #include "paddle/tcmpt/core/kernel_registry.h"
+#include "paddle/tcmpt/eigen/dot.h"
 
 // See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/operators/eigen/eigen_function.h"
 #include "paddle/fluid/platform/complex.h"
 
 namespace pt {
@@ -28,22 +27,7 @@ void Dot(const CUDAContext& dev_ctx,
          const DenseTensor& x,
          const DenseTensor& y,
          DenseTensor* out) {
-  out->mutable_data();
-  if (1 == out->dims().size()) {
-    auto eigen_out = paddle::framework::EigenScalar<T>::From(*out);
-    auto eigen_x = paddle::framework::EigenVector<T>::Flatten(x);
-    auto eigen_y = paddle::framework::EigenVector<T>::Flatten(y);
-
-    auto& dev = *dev_ctx.eigen_device();
-    eigen_out.device(dev) = (eigen_x * eigen_y).sum();
-  } else {
-    auto eigen_out = paddle::framework::EigenMatrix<T>::From(*out);
-    auto eigen_x = paddle::framework::EigenMatrix<T>::From(x);
-    auto eigen_y = paddle::framework::EigenMatrix<T>::From(y);
-
-    auto& dev = *dev_ctx.eigen_device();
-    eigen_out.device(dev) = (eigen_x * eigen_y).sum(Eigen::DSizes<int, 1>(1));
-  }
+  eigen::Dot<CUDAContext, T>(dev_ctx, x, y, out);
 }
 
 }  // namespace pt
diff --git a/paddle/tcmpt/eigen/common.h b/paddle/tcmpt/eigen/common.h
new file mode 100644
index 0000000000000..37bed55a7d97a
--- /dev/null
+++ b/paddle/tcmpt/eigen/common.h
@@ -0,0 +1,170 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stdint.h>
+
+#include "paddle/tcmpt/core/dense_tensor.h"
+#include "unsupported/Eigen/CXX11/Tensor"
+
+namespace pt {
+
+// EigenDim converts paddle::platform::DDim into Eigen::DSizes.
+template <int D>
+struct EigenDim {
+  using Type = Eigen::DSizes<Eigen::DenseIndex, D>;
+
+  static Type From(const DDim& dims) {
+    PADDLE_ENFORCE_EQ(arity(dims),
+                      D,
+                      paddle::platform::errors::InvalidArgument(
+                          "Input dimension size should be equal to %d, but "
+                          "received dimension size is %d.",
+                          arity(dims),
+                          D));
+    Type ret;
+    for (int64_t d = 0; d < arity(dims); d++) {
+      ret[d] = dims[d];
+    }
+    return ret;
+  }
+};
+
+// Interpret paddle::platform::Tensor as EigenTensor and EigenConstTensor.
+template <typename T,
+          size_t D,
+          int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+struct EigenTensor {
+  // TODO(qijun) Now, default type in unaligned, and we will make a benchmark on
+  // the speed of aligned and unaligned version in future.
+  using Type = Eigen::TensorMap<Eigen::Tensor<T, D, MajorType, IndexType>>;
+
+  using ConstType =
+      Eigen::TensorMap<Eigen::Tensor<const T, D, MajorType, IndexType>>;
+
+  static Type From(pt::DenseTensor& tensor, DDim dims) {  // NOLINT
+    // why tensor.data<T>() not work?
+    // return Type(const_cast<T*>(reinterpret_cast<const T*>(tensor.data())),
+    // EigenDim<D>::From(dims));
+    return Type(const_cast<T*>(tensor.data<T>()), EigenDim<D>::From(dims));
+  }
+
+  static Type From(pt::DenseTensor& tensor) {  // NOLINT
+    return From(tensor, tensor.dims());
+  }  // NOLINT
+
+  static ConstType From(const pt::DenseTensor& tensor, DDim dims) {
+    // return ConstType(reinterpret_cast<const T*>(tensor.data()),
+    // EigenDim<D>::From(dims));
+    return ConstType(tensor.data<T>(), EigenDim<D>::From(dims));
+  }
+
+  static ConstType From(const pt::DenseTensor& tensor) {
+    return From(tensor, tensor.dims());
+  }
+};
+
+template <typename T,
+          int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+struct EigenMatrix : public EigenTensor<T, 2, MajorType, IndexType> {
+  static typename EigenMatrix::Type Reshape(pt::DenseTensor& tensor,  // NOLINT
+                                            int num_col_dims) {
+    int rank = tensor.dims().size();
+    PADDLE_ENFORCE_EQ((num_col_dims > 0 && num_col_dims < rank),
+                      true,
+                      paddle::platform::errors::InvalidArgument(
+                          "Input dimension number(num_col_dims) must be "
+                          "between 0 and %d, but received number is %d.",
+                          rank,
+                          num_col_dims));
+    return EigenMatrix::From(tensor,
+                             flatten_to_2d(tensor.dims(), num_col_dims));
+  }
+
+  static typename EigenMatrix::ConstType Reshape(const pt::DenseTensor& tensor,
+                                                 int num_col_dims) {
+    int rank = tensor.dims().size();
+    PADDLE_ENFORCE_EQ((num_col_dims > 0 && num_col_dims < rank),
+                      true,
+                      paddle::platform::errors::InvalidArgument(
+                          "Input dimension number(num_col_dims) must be "
+                          "between 0 and %d, but received number is %d.",
+                          rank,
+                          num_col_dims));
+    return EigenMatrix::From(tensor,
+                             flatten_to_2d(tensor.dims(), num_col_dims));
+  }
+};
+
+template <typename T,
+          int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+struct EigenVector : public EigenTensor<T, 1, MajorType, IndexType> {
+  // Flatten reshapes a Tensor into an EigenVector.
+  static typename EigenVector::Type Flatten(
+      pt::DenseTensor& tensor) {  // NOLINT
+    return EigenVector::From(tensor, {product(tensor.dims())});
+  }
+
+  static typename EigenVector::ConstType Flatten(
+      const pt::DenseTensor& tensor) {  // NOLINT
+    return EigenVector::From(tensor, {product(tensor.dims())});
+  }
+};
+
+template <typename T,
+          int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+struct EigenScalar {
+  // Scalar tensor (implemented as a rank-0 tensor) of scalar type T.
+  using Type = Eigen::TensorMap<
+      Eigen::TensorFixedSize<T, Eigen::Sizes<>, MajorType, IndexType>>;
+  using ConstType = Eigen::TensorMap<
+      Eigen::TensorFixedSize<const T, Eigen::Sizes<>, MajorType, IndexType>>;
+
+  static Type From(pt::DenseTensor& tensor) {  // NOLINT
+    return Type(const_cast<T*>(tensor.data<T>()));
+  }
+
+  static ConstType From(const pt::DenseTensor& tensor) {
+    return ConstType(tensor.data<T>());
+  }
+};
+
+// Define Tensor with 32-bit index.
+template <typename T, int D, int MajorType = Eigen::RowMajor>
+using Tensor32BitIndex =
+    Eigen::TensorMap<Eigen::Tensor<T, D, MajorType, int>, Eigen::Aligned>;
+
+template <typename DSizes>
+Eigen::DSizes<int, DSizes::count> To32BitDims(const DSizes& in) {
+  Eigen::DSizes<int, DSizes::count> out;
+  for (int i = 0; i < DSizes::count; ++i) {
+    out[i] = in[i];
+  }
+  return out;
+}
+
+template <typename EigenTensor>
+Tensor32BitIndex<typename EigenTensor::Scalar, EigenTensor::NumIndices>
+To32BitIndex(EigenTensor in) {
+  using RetType =
+      Tensor32BitIndex<typename EigenTensor::Scalar, EigenTensor::NumIndices>;
+  return RetType(in.data(), To32BitDims(in.dimensions()));
+}
+
+}  // namespace pt
diff --git a/paddle/tcmpt/eigen/dot.h b/paddle/tcmpt/eigen/dot.h
new file mode 100644
index 0000000000000..5e323e4448409
--- /dev/null
+++ b/paddle/tcmpt/eigen/dot.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/tcmpt/core/dense_tensor.h"
+#include "paddle/tcmpt/eigen/common.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/operators/eigen/eigen_function.h"
+
+namespace pt {
+namespace eigen {
+
+template <typename DevCtx, typename T>
+void Dot(const DevCtx& dev_ctx,
+         const DenseTensor& x,
+         const DenseTensor& y,
+         DenseTensor* out) {
+  out->mutable_data();
+  if (1 == out->dims().size()) {
+    auto eigen_out = pt::EigenScalar<T>::From(*out);
+    auto eigen_x = pt::EigenVector<T>::Flatten(x);
+    auto eigen_y = pt::EigenVector<T>::Flatten(y);
+
+    auto& dev = *dev_ctx.eigen_device();
+    eigen_out.device(dev) = (eigen_x * eigen_y).sum();
+  } else {
+    auto eigen_out = pt::EigenMatrix<T>::From(*out);
+    auto eigen_x = pt::EigenMatrix<T>::From(x);
+    auto eigen_y = pt::EigenMatrix<T>::From(y);
+
+    auto& dev = *dev_ctx.eigen_device();
+    eigen_out.device(dev) = (eigen_x * eigen_y).sum(Eigen::DSizes<int, 1>(1));
+  }
+}
+
+}  // namespace eigen
+}  // namespace pt
diff --git a/paddle/tcmpt/eigen/fill.h b/paddle/tcmpt/eigen/fill.h
index 6a21ca6932cd5..fb56ccdd8e125 100644
--- a/paddle/tcmpt/eigen/fill.h
+++ b/paddle/tcmpt/eigen/fill.h
@@ -15,8 +15,9 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/tcmpt/core/dense_tensor.h"
+#include "paddle/tcmpt/eigen/common.h"
 
-#include "paddle/fluid/framework/eigen.h"
+// See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/operators/eigen/eigen_function.h"
 
 namespace pt {
@@ -50,7 +51,7 @@ void fill(const DeviceContext& context, DenseTensor* tensor, VType val) {
           static_cast<CommonType>(std::numeric_limits<T>::max()),
           static_cast<float>(val)));
 
-  auto t = paddle::framework::EigenVector<T>::Flatten(*tensor);
+  auto t = pt::EigenVector<T>::Flatten(*tensor);
   t.device(*context.eigen_device()) = t.constant(static_cast<T>(val));
 }
 
diff --git a/paddle/tcmpt/eigen/mean.h b/paddle/tcmpt/eigen/mean.h
index bd2c5ad2bf219..e70870e7954b7 100644
--- a/paddle/tcmpt/eigen/mean.h
+++ b/paddle/tcmpt/eigen/mean.h
@@ -15,9 +15,9 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/tcmpt/core/dense_tensor.h"
+#include "paddle/tcmpt/eigen/common.h"
 
 // See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/operators/eigen/eigen_function.h"
 
 namespace pt {
@@ -30,8 +30,8 @@ void Mean(const DevCtx& dev_ctx, const DenseTensor& x, DenseTensor* out) {
   // TODO(chenweihang): if we design new tensor, we should support
   // the low-level calc functor use new tensor as input,
   // which may be a big project!
-  auto eigen_x = paddle::framework::EigenVector<T>::Flatten(x);
-  auto eigen_out = paddle::framework::EigenScalar<T>::From(*out);
+  auto eigen_x = pt::EigenVector<T>::Flatten(x);
+  auto eigen_out = pt::EigenScalar<T>::From(*out);
 
   auto& dev = *dev_ctx.eigen_device();
   eigen_out.device(dev) = eigen_x.mean();
diff --git a/paddle/tcmpt/eigen/scale.h b/paddle/tcmpt/eigen/scale.h
index 5bea4fb300af4..152cb61800c8b 100644
--- a/paddle/tcmpt/eigen/scale.h
+++ b/paddle/tcmpt/eigen/scale.h
@@ -15,9 +15,9 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/tcmpt/core/dense_tensor.h"
+#include "paddle/tcmpt/eigen/common.h"
 
 // See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/operators/eigen/eigen_function.h"
 
 namespace pt {
@@ -32,8 +32,8 @@ void Scale(const DevCtx& dev_ctx,
            DenseTensor* out) {
   // calc
   out->mutable_data<T>();
-  auto eigen_out = paddle::framework::EigenVector<T>::Flatten(*out);
-  auto eigen_x = paddle::framework::EigenVector<T>::Flatten(x);
+  auto eigen_out = pt::EigenVector<T>::Flatten(*out);
+  auto eigen_x = pt::EigenVector<T>::Flatten(x);
   auto& dev = *dev_ctx.eigen_device();
   // TODO(chenweihang): now the eigen function here need the dtype of scale,
   // eigen_x, bias should be same, so here need cast for two scalar arg,
diff --git a/paddle/tcmpt/eigen/sign.h b/paddle/tcmpt/eigen/sign.h
index b138123e81ee0..d41702576b3a1 100644
--- a/paddle/tcmpt/eigen/sign.h
+++ b/paddle/tcmpt/eigen/sign.h
@@ -15,9 +15,9 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/tcmpt/core/dense_tensor.h"
+#include "paddle/tcmpt/eigen/common.h"
 
 // See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/operators/eigen/eigen_function.h"
 
 namespace pt {
@@ -33,8 +33,8 @@ void Sign(const DevCtx& dev_ctx, const DenseTensor& x, DenseTensor* out) {
   // TODO(chenweihang): if we design new tensor, we should support
   // the low-level calc functor use new tensor as input,
   // which may be a big project!
-  auto eigen_out = paddle::framework::EigenVector<T>::Flatten(*out);
-  auto eigen_x = paddle::framework::EigenVector<T>::Flatten(x);
+  auto eigen_out = pt::EigenVector<T>::Flatten(*out);
+  auto eigen_x = pt::EigenVector<T>::Flatten(x);
 
   auto& dev = *dev_ctx.eigen_device();
   paddle::operators::EigenSign<std::decay_t<decltype(dev)>, T>::Eval(

From 23091495cfdd3df8cc1be592d30f09ea66a7c72b Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Fri, 15 Oct 2021 03:48:54 +0000
Subject: [PATCH 081/125] use flat_hash_map and small_vector in kernel factory

---
 paddle/tcmpt/core/kernel_factory.h | 36 +++++++++++++++++-------------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/paddle/tcmpt/core/kernel_factory.h b/paddle/tcmpt/core/kernel_factory.h
index 180f0ce2c6b87..db1f0df76e6ba 100644
--- a/paddle/tcmpt/core/kernel_factory.h
+++ b/paddle/tcmpt/core/kernel_factory.h
@@ -16,13 +16,14 @@
 
 #include <ostream>
 #include <string>
-#include <unordered_map>
 #include <utility>
 
 #include "paddle/tcmpt/core/backend.h"
 #include "paddle/tcmpt/core/dtype.h"
 #include "paddle/tcmpt/core/kernel_def.h"
 #include "paddle/tcmpt/core/layout.h"
+#include "paddle/utils/flat_hash_map.h"
+#include "paddle/utils/small_vector.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/enforce.h"
@@ -209,25 +210,30 @@ class KernelArgsDef {
     attribute_defs_.emplace_back(AttributeArgDef(type_index));
   }
 
-  const std::vector<TensorArgDef>& input_defs() const { return input_defs_; }
+  const paddle::SmallVector<TensorArgDef>& input_defs() const {
+    return input_defs_;
+  }
 
-  const std::vector<TensorArgDef>& output_defs() const { return output_defs_; }
+  const paddle::SmallVector<TensorArgDef>& output_defs() const {
+    return output_defs_;
+  }
 
-  const std::vector<AttributeArgDef>& attribute_defs() const {
+  const paddle::SmallVector<AttributeArgDef>& attribute_defs() const {
     return attribute_defs_;
   }
 
-  std::vector<TensorArgDef>& input_defs() { return input_defs_; }
+  paddle::SmallVector<TensorArgDef>& input_defs() { return input_defs_; }
 
-  std::vector<TensorArgDef>& output_defs() { return output_defs_; }
+  paddle::SmallVector<TensorArgDef>& output_defs() { return output_defs_; }
 
-  std::vector<AttributeArgDef>& attribute_defs() { return attribute_defs_; }
+  paddle::SmallVector<AttributeArgDef>& attribute_defs() {
+    return attribute_defs_;
+  }
 
  private:
-  // TODO(chenweihang): replaced by paddle::small_vector
-  std::vector<TensorArgDef> input_defs_{{}};
-  std::vector<TensorArgDef> output_defs_{{}};
-  std::vector<AttributeArgDef> attribute_defs_{{}};
+  paddle::SmallVector<TensorArgDef> input_defs_{{}};
+  paddle::SmallVector<TensorArgDef> output_defs_{{}};
+  paddle::SmallVector<AttributeArgDef> attribute_defs_{{}};
 };
 
 class Kernel {
@@ -263,10 +269,10 @@ class Kernel {
 class KernelFactory {
  public:
   // replaced by paddle::flat_hash_map later
-  using KernelMap =
-      std::unordered_map<KernelName,
-                         std::unordered_map<KernelKey, Kernel, KernelKey::Hash>,
-                         KernelName::Hash>;
+  using KernelMap = paddle::flat_hash_map<
+      KernelName,
+      paddle::flat_hash_map<KernelKey, Kernel, KernelKey::Hash>,
+      KernelName::Hash>;
 
   static KernelFactory& Instance();
 

From 6ce92e532ccfd3906925b65e386674b6181eb978 Mon Sep 17 00:00:00 2001
From: YuanRisheng <yuanrisheng@baidu.com>
Date: Fri, 15 Oct 2021 13:37:40 +0800
Subject: [PATCH 082/125] Refactor flatten kernel (#12)

* refactor flatten kernel

* update infershape function

* fix compile bugs

* fix bugs when merge

* fix compiler bugs

* fix bugs when run test_flatten_api

* fix bugs when run test
---
 paddle/fluid/framework/operator.cc       |  24 ++-
 paddle/tcmpt/api/CMakeLists.txt          |   5 +-
 paddle/tcmpt/api/all.h                   |   1 +
 paddle/tcmpt/api/include/infershape.h    |   1 +
 paddle/tcmpt/api/include/manipulation.h  |  19 ++
 paddle/tcmpt/core/dtype.cc               |   6 +
 paddle/tcmpt/core/dtype.h                |   4 +-
 paddle/tcmpt/core/kernel_def.h           |   2 +
 paddle/tcmpt/core/kernel_registry.h      |  73 ++++++++
 paddle/tcmpt/core/layout.cc              |   5 +
 paddle/tcmpt/core/layout.h               |   2 +
 paddle/tcmpt/core/tensor_meta.h          |   2 +-
 paddle/tcmpt/cpu/CMakeLists.txt          |   2 +
 paddle/tcmpt/cpu/manipulation.cc         |  81 ++++++++
 paddle/tcmpt/cpu/manipulation.h          |  34 ++++
 paddle/tcmpt/cpu/utils.cc                |  58 ++++++
 paddle/tcmpt/cpu/utils.h                 |  28 +++
 paddle/tcmpt/cuda/CMakeLists.txt         |   4 +
 paddle/tcmpt/cuda/manipulation.cu        |  83 +++++++++
 paddle/tcmpt/cuda/manipulation.h         |  38 ++++
 paddle/tcmpt/cuda/utils.cu               | 223 +++++++++++++++++++++++
 paddle/tcmpt/cuda/utils.h                |  28 +++
 paddle/tcmpt/hapi/include/manipulation.h |  25 +++
 paddle/tcmpt/hapi/lib/CMakeLists.txt     |   1 +
 paddle/tcmpt/hapi/lib/creation.cc        |  10 +-
 paddle/tcmpt/hapi/lib/linalg.cc          |  12 +-
 paddle/tcmpt/hapi/lib/manipulation.cc    |  67 +++++++
 paddle/tcmpt/hapi/lib/math.cc            |   9 +-
 paddle/tcmpt/infershape/CMakeLists.txt   |   2 +
 paddle/tcmpt/infershape/binary.cc        |  62 +++++++
 paddle/tcmpt/infershape/binary.h         |  35 ++++
 paddle/tcmpt/infershape/unary.cc         |  77 ++++++++
 paddle/tcmpt/infershape/unary.h          |  36 ++--
 paddle/tcmpt/tests/CMakeLists.txt        |   2 +
 paddle/tcmpt/tests/test_copy_api.cc      |  64 +++++++
 paddle/tcmpt/tests/test_flatten_api.cc   |  69 +++++++
 36 files changed, 1154 insertions(+), 40 deletions(-)
 create mode 100644 paddle/tcmpt/api/include/manipulation.h
 create mode 100644 paddle/tcmpt/cpu/manipulation.cc
 create mode 100644 paddle/tcmpt/cpu/manipulation.h
 create mode 100644 paddle/tcmpt/cpu/utils.cc
 create mode 100644 paddle/tcmpt/cpu/utils.h
 create mode 100644 paddle/tcmpt/cuda/manipulation.cu
 create mode 100644 paddle/tcmpt/cuda/manipulation.h
 create mode 100644 paddle/tcmpt/cuda/utils.cu
 create mode 100644 paddle/tcmpt/cuda/utils.h
 create mode 100644 paddle/tcmpt/hapi/include/manipulation.h
 create mode 100644 paddle/tcmpt/hapi/lib/manipulation.cc
 create mode 100644 paddle/tcmpt/infershape/binary.cc
 create mode 100644 paddle/tcmpt/infershape/binary.h
 create mode 100644 paddle/tcmpt/infershape/unary.cc
 create mode 100644 paddle/tcmpt/tests/test_copy_api.cc
 create mode 100644 paddle/tcmpt/tests/test_flatten_api.cc

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index a47089ecba5cd..32fc10f38bd48 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1286,9 +1286,23 @@ static bool ContainHostTensor(const proto::OpProto& op_proto,
   return false;
 }
 
+// TODO(yuanrisheng): enhance rules, for get kernel that contains Intermediate
+// Tensor
+static bool ContainMidOutputTensor(const proto::OpProto& op_proto,
+                                   const VariableValueMap& outputs) {
+  for (int i = 0; i < op_proto.outputs_size(); ++i) {
+    auto output = op_proto.outputs()[i];
+    if (output.has_intermediate() && output.intermediate()) {
+      return IsValidVar(output.name(), outputs);
+    }
+  }
+  return false;
+}
+
 static pt::KernelName ConstructPtKernelName(const std::string& op_type,
                                             const proto::OpProto& op_proto,
-                                            const VariableValueMap& inputs) {
+                                            const VariableValueMap& inputs,
+                                            const VariableValueMap& outputs) {
   std::string overload_name;
   // TODO(chenweihang): adapt SelectedRows by xiaowei's design
   if (ContainHostTensor(op_proto, inputs)) {
@@ -1297,6 +1311,12 @@ static pt::KernelName ConstructPtKernelName(const std::string& op_type,
     }
     overload_name += pt::kContainHostTensorSuffix;
   }
+  if (ContainMidOutputTensor(op_proto, outputs)) {
+    if (overload_name != "") {
+      overload_name += ".";
+    }
+    overload_name += pt::kContainMidOutputTensorSuffix;
+  }
   return pt::KernelName(op_type, overload_name);
 }
 
@@ -1305,7 +1325,7 @@ void OperatorWithKernel::ChoosePtKernel(
   // 1. construct operation name
   // TODO(chenweihang): add rules for construct op name
   auto kernel_name =
-      ConstructPtKernelName(Type(), *(Info().proto_), ctx.inputs);
+      ConstructPtKernelName(Type(), *(Info().proto_), ctx.inputs, ctx.outputs);
 
   // 2. construct op kernel key
   pt_kernel_key_.reset(new pt::KernelKey(
diff --git a/paddle/tcmpt/api/CMakeLists.txt b/paddle/tcmpt/api/CMakeLists.txt
index 54a48ca6a57a0..bf4d163a62bfc 100644
--- a/paddle/tcmpt/api/CMakeLists.txt
+++ b/paddle/tcmpt/api/CMakeLists.txt
@@ -12,9 +12,10 @@
 # declare_module(MathCUDA)
 
 set(TCMPT_DEPS convert_utils dense_tensor kernel_factory kernel_context)
-set(TCMPT_DEPS ${TCMPT_DEPS} math_cpu linalg_cpu creation_cpu)
+set(TCMPT_DEPS ${TCMPT_DEPS} math_cpu linalg_cpu creation_cpu manipulation_cpu)
+set(TCMPT_DEPS ${TCMPT_DEPS} unary binary)
 if(WITH_GPU OR WITH_ROCM)
-  set(TCMPT_DEPS ${TCMPT_DEPS} math_cuda linalg_cuda creation_cuda)
+  set(TCMPT_DEPS ${TCMPT_DEPS} math_cuda linalg_cuda creation_cuda manipulation_cuda)
 endif()
 
 cc_library(tcmpt SRCS all.cc DEPS ${TCMPT_DEPS})
diff --git a/paddle/tcmpt/api/all.h b/paddle/tcmpt/api/all.h
index 42079764bfe83..0f47f75f8a7fc 100644
--- a/paddle/tcmpt/api/all.h
+++ b/paddle/tcmpt/api/all.h
@@ -19,4 +19,5 @@ limitations under the License. */
 #include "paddle/tcmpt/api/include/creation.h"
 #include "paddle/tcmpt/api/include/infershape.h"
 #include "paddle/tcmpt/api/include/linalg.h"
+#include "paddle/tcmpt/api/include/manipulation.h"
 #include "paddle/tcmpt/api/include/math.h"
diff --git a/paddle/tcmpt/api/include/infershape.h b/paddle/tcmpt/api/include/infershape.h
index 3ac4d37459e71..01ed351fb59b2 100644
--- a/paddle/tcmpt/api/include/infershape.h
+++ b/paddle/tcmpt/api/include/infershape.h
@@ -15,4 +15,5 @@ limitations under the License. */
 #pragma once
 
 // See Note: [ How do we organize the kernel directory ]
+#include "paddle/tcmpt/infershape/binary.h"
 #include "paddle/tcmpt/infershape/unary.h"
diff --git a/paddle/tcmpt/api/include/manipulation.h b/paddle/tcmpt/api/include/manipulation.h
new file mode 100644
index 0000000000000..b44e53c01384b
--- /dev/null
+++ b/paddle/tcmpt/api/include/manipulation.h
@@ -0,0 +1,19 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+// See Note: [ How do we organize the kernel directory ]
+#include "paddle/tcmpt/cpu/manipulation.h"
+#include "paddle/tcmpt/cuda/manipulation.h"
diff --git a/paddle/tcmpt/core/dtype.cc b/paddle/tcmpt/core/dtype.cc
index f1de29f184fc4..c9fefc6a69080 100644
--- a/paddle/tcmpt/core/dtype.cc
+++ b/paddle/tcmpt/core/dtype.cc
@@ -64,4 +64,10 @@ std::ostream& operator<<(std::ostream& os, DataType dtype) {
   return os;
 }
 
+DataType& operator++(DataType& dtype, int) {
+  dtype =
+      DataType(static_cast<std::underlying_type<DataType>::type>(dtype) + 1);
+  return dtype;
+}
+
 }  // namespace pt
diff --git a/paddle/tcmpt/core/dtype.h b/paddle/tcmpt/core/dtype.h
index d7a0b3c007db4..1b5c1b8037a21 100644
--- a/paddle/tcmpt/core/dtype.h
+++ b/paddle/tcmpt/core/dtype.h
@@ -55,11 +55,13 @@ enum class DataType {
   kFLOAT64,
   kCOMPLEX64,
   kCOMPLEX128,
-  kNumDataTypes,
+  kNumDataTypes
 };
 
 std::ostream& operator<<(std::ostream& os, DataType dtype);
 
+DataType& operator++(DataType& dtype, int);
+
 #define PT_FOR_EACH_DATA_TYPE(_)     \
   _(bool, DataType::kBOOL)           \
   _(int8_t, DataType::kINT8)         \
diff --git a/paddle/tcmpt/core/kernel_def.h b/paddle/tcmpt/core/kernel_def.h
index 073d57269c321..70b8be19aaeea 100644
--- a/paddle/tcmpt/core/kernel_def.h
+++ b/paddle/tcmpt/core/kernel_def.h
@@ -37,4 +37,6 @@ constexpr char kContainHostTensorSuffix[] = "host";
 // For kernels with SelectedRowsTensor input and output
 constexpr char kContainSelectedRowsSuffix[] = "sr";
 
+// For kernels with intermediate output
+constexpr char kContainMidOutputTensorSuffix[] = "mid";
 }  // namespace pt
diff --git a/paddle/tcmpt/core/kernel_registry.h b/paddle/tcmpt/core/kernel_registry.h
index d31cb9b692184..40ee968dd987c 100644
--- a/paddle/tcmpt/core/kernel_registry.h
+++ b/paddle/tcmpt/core/kernel_registry.h
@@ -84,6 +84,58 @@ struct KernelRegistrar {
                   KernelArgsParseFn args_parse_fn,
                   KernelArgsDefFn args_def_fn,
                   KernelFn kernel_fn) {
+    ConstructKernel(kernel_name_cstr,
+                    backend,
+                    layout,
+                    dtype,
+                    args_parse_fn,
+                    args_def_fn,
+                    kernel_fn);
+  }
+
+  KernelRegistrar(const char* kernel_name_cstr,
+                  Backend backend,
+                  DataLayout layout,
+                  KernelArgsParseFn args_parse_fn,
+                  KernelArgsDefFn args_def_fn,
+                  KernelFn kernel_fn) {
+    if (layout == DataLayout::kAny) {
+      for (DataLayout layout_iter = DataLayout::kNHWC;
+           layout_iter != DataLayout::kNumLayouts;
+           layout_iter++) {
+        for (DataType dtype = DataType::kBOOL; dtype != DataType::kNumDataTypes;
+             dtype++) {
+          ConstructKernel(kernel_name_cstr,
+                          backend,
+                          layout_iter,
+                          dtype,
+                          args_parse_fn,
+                          args_def_fn,
+                          kernel_fn);
+        }
+      }
+    } else {
+      for (DataType dtype = DataType::kBOOL; dtype != DataType::kNumDataTypes;
+           dtype++) {
+        ConstructKernel(kernel_name_cstr,
+                        backend,
+                        layout,
+                        static_cast<DataType>(dtype),
+                        args_parse_fn,
+                        args_def_fn,
+                        kernel_fn);
+      }
+    }
+  }
+
+ private:
+  void ConstructKernel(const char* kernel_name_cstr,
+                       Backend backend,
+                       DataLayout layout,
+                       DataType dtype,
+                       KernelArgsParseFn args_parse_fn,
+                       KernelArgsDefFn args_def_fn,
+                       KernelFn kernel_fn) {
     KernelName kernel_name(kernel_name_cstr);
     KernelKey kernel_key(backend, layout, dtype);
     Kernel kernel(kernel_fn);
@@ -549,4 +601,25 @@ struct KernelRegistrar {
   void PT_CONCATENATE(__PT_KERNEL_for_test_args_def_FN_,                   \
                       func_id)(::pt::Kernel * kernel)
 
+#define PT_REGISTER_KERNEL_WITH_NO_TYPE(          \
+    kernel_name, backend, layout, meta_kernel_fn) \
+  _PT_REGISTER_KERNEL_WITH_NO_TYPE(               \
+      kernel_name, PT_ID, backend, layout, meta_kernel_fn)
+
+#define _PT_REGISTER_KERNEL_WITH_NO_TYPE(                             \
+    kernel_name, func_id, backend, layout, meta_kernel_fn)            \
+  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                  \
+      PT_CONCATENATE(pt_op_kernel_ns_check_, func_id),                \
+      "PT_REGISTER_KERNEL must be called in global namespace.");      \
+  decltype(meta_kernel_fn) meta_kernel_fn;                            \
+  static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_,                \
+                             func_id)(::pt::Kernel*);                 \
+  static const ::pt::KernelRegistrar __reg_pt_op_kernel_##func_id(    \
+      kernel_name,                                                    \
+      BACKEND(backend),                                               \
+      DATALAYOUT(layout),                                             \
+      ::pt::KernelArgsParseFunctor<decltype(&meta_kernel_fn)>::Parse, \
+      &PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id),             \
+      PT_KERNEL(meta_kernel_fn));                                     \
+  void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id)(::pt::Kernel * kernel)
 }  // namespace pt
diff --git a/paddle/tcmpt/core/layout.cc b/paddle/tcmpt/core/layout.cc
index 5c09e67a79856..4f4fd972516da 100644
--- a/paddle/tcmpt/core/layout.cc
+++ b/paddle/tcmpt/core/layout.cc
@@ -40,4 +40,9 @@ std::ostream& operator<<(std::ostream& os, DataLayout dtype) {
   return os;
 }
 
+DataLayout& operator++(DataLayout& layout, int) {
+  layout = DataLayout(
+      static_cast<std::underlying_type<DataLayout>::type>(layout) + 1);
+  return layout;
+}
 }  // namespace pt
diff --git a/paddle/tcmpt/core/layout.h b/paddle/tcmpt/core/layout.h
index 6a5cdb1c5e8cd..4a8a223b62f84 100644
--- a/paddle/tcmpt/core/layout.h
+++ b/paddle/tcmpt/core/layout.h
@@ -38,4 +38,6 @@ enum class DataLayout {
 
 std::ostream& operator<<(std::ostream& os, DataLayout dtype);
 
+DataLayout& operator++(DataLayout& layout, int);
+
 }  // namespace pt
diff --git a/paddle/tcmpt/core/tensor_meta.h b/paddle/tcmpt/core/tensor_meta.h
index 5789e9a459e0b..bd3319cf4fdad 100644
--- a/paddle/tcmpt/core/tensor_meta.h
+++ b/paddle/tcmpt/core/tensor_meta.h
@@ -48,7 +48,7 @@ namespace pt {
  */
 // using LoD = std::vector<paddle::framework::Vector<size_t>>;
 using LoD = std::vector<std::vector<size_t>>;
-
+using DDim = paddle::framework::DDim;
 /**
  * The Meta data member of DenseTensor.
  *
diff --git a/paddle/tcmpt/cpu/CMakeLists.txt b/paddle/tcmpt/cpu/CMakeLists.txt
index 3480ebba53155..cf3204bc5bcb0 100644
--- a/paddle/tcmpt/cpu/CMakeLists.txt
+++ b/paddle/tcmpt/cpu/CMakeLists.txt
@@ -8,3 +8,5 @@ endif()
 cc_library(math_cpu SRCS math.cc DEPS dense_tensor kernel_context kernel_factory eigen_function)
 cc_library(linalg_cpu SRCS linalg.cc DEPS dense_tensor kernel_context kernel_factory)
 cc_library(creation_cpu SRCS creation.cc DEPS dense_tensor kernel_context kernel_factory eigen_function)
+cc_library(utils_cpu SRCS utils.cc DEPS dense_tensor kernel_context kernel_factory memory)
+cc_library(manipulation_cpu SRCS manipulation.cc DEPS dense_tensor kernel_context kernel_factory utils_cpu unary)
diff --git a/paddle/tcmpt/cpu/manipulation.cc b/paddle/tcmpt/cpu/manipulation.cc
new file mode 100644
index 0000000000000..d2964c5b533a9
--- /dev/null
+++ b/paddle/tcmpt/cpu/manipulation.cc
@@ -0,0 +1,81 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/tcmpt/cpu/manipulation.h"
+#include "paddle/tcmpt/cpu/utils.h"
+#include "paddle/tcmpt/infershape/unary.h"
+
+namespace pt {
+
+template <typename T>
+void Flatten(const CPUContext& dev_ctx,
+             const DenseTensor& x,
+             int start_axis,
+             int stop_axis,
+             DenseTensor* out) {
+  auto out_meta = FlattenInferShape(x.meta(), start_axis, stop_axis);
+  pt::Copy(dev_ctx, x, out);
+  out->mutable_meta()->lod = out_meta.lod;
+  out->Resize(out_meta.dims);
+}
+
+// TODO(yuanrisheng): this kernel is for training and xshape is a Intermediate
+// Output Tensor，
+// is there a more flexible way to deal with this case?
+template <typename T>
+void FlattenWithXShape(const CPUContext& dev_ctx,
+                       const DenseTensor& x,
+                       int start_axis,
+                       int stop_axis,
+                       DenseTensor* out,
+                       DenseTensor* xshape) {
+  Flatten<T>(dev_ctx, x, start_axis, stop_axis, out);
+  const auto& in_dims = x.meta().dims;
+  std::vector<int64_t> xshape_dims(in_dims.size() + 1);
+  xshape_dims[0] = 0;
+  for (int i = 0; i < in_dims.size(); ++i) {
+    xshape_dims[i + 1] = in_dims[i];
+  }
+  xshape->mutable_meta()->dims = paddle::framework::make_ddim(xshape_dims);
+  xshape->mutable_meta()->lod = x.meta().lod;
+}
+
+}  // namespace pt
+
+// TODO(chenweihang): replace by better impl
+PT_REGISTER_MODULE(ManipulationCPU);
+
+// TODO(yuanrisheng): "flatten_contiguous_range" is compatible with old kernel
+// architecture, kernel_name should be "flatten".
+PT_REGISTER_KERNEL("flatten_contiguous_range",
+                   CPU,
+                   NCHW,
+                   pt::Flatten,
+                   float,
+                   double,
+                   uint8_t,
+                   int8_t,
+                   int,
+                   int64_t) {}
+
+PT_REGISTER_KERNEL("flatten_contiguous_range.mid",
+                   CPU,
+                   NCHW,
+                   pt::FlattenWithXShape,
+                   float,
+                   double,
+                   uint8_t,
+                   int8_t,
+                   int,
+                   int64_t) {}
diff --git a/paddle/tcmpt/cpu/manipulation.h b/paddle/tcmpt/cpu/manipulation.h
new file mode 100644
index 0000000000000..0147dca441b25
--- /dev/null
+++ b/paddle/tcmpt/cpu/manipulation.h
@@ -0,0 +1,34 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/tcmpt/core/dense_tensor.h"
+#include "paddle/tcmpt/core/kernel_registry.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/device_context.h"
+
+namespace pt {
+
+using CPUContext = paddle::platform::CPUDeviceContext;
+
+template <typename T>
+void Flatten(const CPUContext& dev_ctx,
+             const DenseTensor& x,
+             int start_axis,
+             int stop_axis,
+             DenseTensor* out);
+
+}  // namespace pt
diff --git a/paddle/tcmpt/cpu/utils.cc b/paddle/tcmpt/cpu/utils.cc
new file mode 100644
index 0000000000000..86b074e49b362
--- /dev/null
+++ b/paddle/tcmpt/cpu/utils.cc
@@ -0,0 +1,58 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/tcmpt/cpu/utils.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/tcmpt/core/convert_utils.h"
+#include "paddle/tcmpt/core/dtype.h"
+
+namespace pt {
+
+void Copy(const CPUContext& dev_ctx, const DenseTensor& src, DenseTensor* dst) {
+  auto* src_ptr = src.data();
+  auto* dst_ptr = dst->mutable_data();
+  const auto& src_place = src.place();
+  const auto& dst_place = dst->place();
+  src.CheckMemorySize();
+
+  if (src_ptr == dst_ptr && src_place == dst_place) {
+    VLOG(3) << "Skip copy the same data async from " << src_place << " to "
+            << dst_place;
+    return;
+  }
+  VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr;
+
+  VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
+          << dst_place;
+  dst->Resize(src.dims());
+  dst->mutable_meta()->layout = src.meta().layout;
+  auto size = src.numel() *
+              paddle::framework::SizeOfType(TransToProtoVarType(src.type()));
+
+  if (paddle::platform::is_cpu_place(src_place) &&
+      paddle::platform::is_cpu_place(dst_place)) {
+    paddle::memory::Copy(BOOST_GET_CONST(paddle::platform::CPUPlace, dst_place),
+                         dst_ptr,
+                         BOOST_GET_CONST(paddle::platform::CPUPlace, src_place),
+                         src_ptr,
+                         size);
+  }
+}
+
+}  // namespace pt
+
+// TODO(chenweihang): replace by better impl
+PT_REGISTER_MODULE(UtilsCPU);
+
+PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", CPU, Any, pt::Copy) {}
diff --git a/paddle/tcmpt/cpu/utils.h b/paddle/tcmpt/cpu/utils.h
new file mode 100644
index 0000000000000..95ec606cc37d1
--- /dev/null
+++ b/paddle/tcmpt/cpu/utils.h
@@ -0,0 +1,28 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/tcmpt/core/dense_tensor.h"
+#include "paddle/tcmpt/core/kernel_registry.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/device_context.h"
+namespace pt {
+
+using CPUContext = paddle::platform::CPUDeviceContext;
+
+void Copy(const CPUContext& dev_ctx, const DenseTensor& src, DenseTensor* dst);
+
+}  // namespace pt
diff --git a/paddle/tcmpt/cuda/CMakeLists.txt b/paddle/tcmpt/cuda/CMakeLists.txt
index 458d93529f435..9e56e1a3be82a 100644
--- a/paddle/tcmpt/cuda/CMakeLists.txt
+++ b/paddle/tcmpt/cuda/CMakeLists.txt
@@ -9,8 +9,12 @@ if(WITH_GPU)
   nv_library(math_cuda SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory)
   nv_library(linalg_cuda SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
   nv_library(creation_cuda SRCS creation.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
+  nv_library(utils_cuda SRCS utils.cu DEPS dense_tensor kernel_context kernel_factory memory)
+  nv_library(manipulation_cuda SRCS manipulation.cu DEPS dense_tensor kernel_context kernel_factory utils_cuda unary)
 elseif(WITH_ROCM)
   hip_library(math_cuda SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory)
   hip_library(linalg_cuda SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
   hip_library(creation_cuda SRCS creation.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
+  hip_library(utils_cuda SRCS utils.cu DEPS dense_tensor kernel_context kernel_factory memory)
+  hip_library(manipulation_cuda SRCS manipulation.cu DEPS dense_tensor kernel_context kernel_factory utils_cuda unary)
 endif()
diff --git a/paddle/tcmpt/cuda/manipulation.cu b/paddle/tcmpt/cuda/manipulation.cu
new file mode 100644
index 0000000000000..91f69b2fe33d7
--- /dev/null
+++ b/paddle/tcmpt/cuda/manipulation.cu
@@ -0,0 +1,83 @@
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/tcmpt/cuda/manipulation.h"
+#include "paddle/tcmpt/cuda/utils.h"
+#include "paddle/tcmpt/infershape/unary.h"
+
+namespace pt {
+
+template <typename T>
+void Flatten(const CUDAContext& dev_ctx,
+             const DenseTensor& x,
+             int start_axis,
+             int stop_axis,
+             DenseTensor* out) {
+  auto out_meta = FlattenInferShape(x.meta(), start_axis, stop_axis);
+  pt::Copy(dev_ctx, x, out);
+  out->mutable_meta()->lod = out_meta.lod;
+  out->Resize(out_meta.dims);
+}
+
+// TODO(yuanrisheng): this kernel is for training and xshape is a Intermediate
+// Output Tensor，
+// is there a more flexible way to deal with this case?
+template <typename T>
+void FlattenWithXShape(const CUDAContext& dev_ctx,
+                       const DenseTensor& x,
+                       int start_axis,
+                       int stop_axis,
+                       DenseTensor* out,
+                       DenseTensor* xshape) {
+  Flatten<T>(dev_ctx, x, start_axis, stop_axis, out);
+  const auto& in_dims = x.meta().dims;
+  std::vector<int64_t> xshape_dims(in_dims.size() + 1);
+  xshape_dims[0] = 0;
+  for (int i = 0; i < in_dims.size(); ++i) {
+    xshape_dims[i + 1] = in_dims[i];
+  }
+  xshape->mutable_meta()->dims = paddle::framework::make_ddim(xshape_dims);
+  xshape->mutable_meta()->lod = x.meta().lod;
+}
+
+}  // namespace pt
+
+// TODO(chenweihang): replace by better impl
+PT_REGISTER_MODULE(ManipulationCUDA);
+
+using float16 = paddle::platform::float16;
+// TODO(yuanrisheng): "flatten_contiguous_range" is compatible with old kernel
+// architecture, kernel_name should be "flatten".
+PT_REGISTER_KERNEL("flatten_contiguous_range",
+                   CUDA,
+                   NCHW,
+                   pt::Flatten,
+                   float,
+                   float16,
+                   double,
+                   uint8_t,
+                   int8_t,
+                   int,
+                   int64_t) {}
+
+PT_REGISTER_KERNEL("flatten_contiguous_range.mid",
+                   CUDA,
+                   NCHW,
+                   pt::FlattenWithXShape,
+                   float,
+                   double,
+                   uint8_t,
+                   int8_t,
+                   int,
+                   int64_t) {}
diff --git a/paddle/tcmpt/cuda/manipulation.h b/paddle/tcmpt/cuda/manipulation.h
new file mode 100644
index 0000000000000..ca958eab8fa47
--- /dev/null
+++ b/paddle/tcmpt/cuda/manipulation.h
@@ -0,0 +1,38 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+// CUDA and HIP use same api
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+
+#include "paddle/tcmpt/core/dense_tensor.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/device_context.h"
+
+namespace pt {
+
+using CUDAContext = paddle::platform::CUDADeviceContext;
+
+template <typename T>
+void Flatten(const CUDAContext& dev_ctx,
+             const DenseTensor& x,
+             int start_axis,
+             int stop_axis,
+             DenseTensor* out);
+
+}  // namespace pt
+
+#endif
diff --git a/paddle/tcmpt/cuda/utils.cu b/paddle/tcmpt/cuda/utils.cu
new file mode 100644
index 0000000000000..40b93f3534c1a
--- /dev/null
+++ b/paddle/tcmpt/cuda/utils.cu
@@ -0,0 +1,223 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/tcmpt/core/convert_utils.h"
+#include "paddle/tcmpt/core/dtype.h"
+#include "paddle/tcmpt/core/kernel_registry.h"
+#include "paddle/tcmpt/cuda/utils.h"
+
+namespace pt {
+
+void Copy(const CUDAContext& dev_ctx,
+          const DenseTensor& src,
+          DenseTensor* dst) {
+  auto* src_ptr = src.data();
+  auto* dst_ptr = dst->mutable_data();
+  const auto& src_place = src.place();
+  const auto& dst_place = dst->place();
+  src.CheckMemorySize();
+
+  if (src_ptr == dst_ptr && src_place == dst_place) {
+    VLOG(3) << "Skip copy the same data async from " << src_place << " to "
+            << dst_place;
+    return;
+  }
+  VLOG(4) << "src:" << src_ptr << ", dst:" << dst_ptr;
+
+  VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
+          << dst_place;
+  dst->Resize(src.dims());
+  dst->mutable_meta()->layout = src.meta().layout;
+  auto size = src.numel() *
+              paddle::framework::SizeOfType(TransToProtoVarType(src.type()));
+
+  if (paddle::platform::is_cuda_pinned_place(src_place) &&  // NOLINT
+      paddle::platform::is_cuda_pinned_place(dst_place)) {
+    paddle::memory::Copy(
+        BOOST_GET_CONST(paddle::platform::CUDAPinnedPlace, dst_place),
+        dst_ptr,
+        BOOST_GET_CONST(paddle::platform::CUDAPinnedPlace, src_place),
+        src_ptr,
+        size);
+  } else if (paddle::platform::is_cuda_pinned_place(src_place) &&  // NOLINT
+             paddle::platform::is_cpu_place(dst_place)) {
+    paddle::memory::Copy(
+        BOOST_GET_CONST(paddle::platform::CPUPlace, dst_place),
+        dst_ptr,
+        BOOST_GET_CONST(paddle::platform::CUDAPinnedPlace, src_place),
+        src_ptr,
+        size);
+  } else if (paddle::platform::is_cpu_place(src_place) &&  // NOLINT
+             paddle::platform::is_cuda_pinned_place(dst_place)) {
+    paddle::memory::Copy(
+        BOOST_GET_CONST(paddle::platform::CUDAPinnedPlace, dst_place),
+        dst_ptr,
+        BOOST_GET_CONST(paddle::platform::CPUPlace, src_place),
+        src_ptr,
+        size);
+  } else if (paddle::platform::is_gpu_place(src_place) &&  // NOLINT
+             paddle::platform::is_cpu_place(dst_place)) {
+    auto src_gpu_place =
+        BOOST_GET_CONST(paddle::platform::CUDAPlace, src_place);
+    auto dst_cpu_place = BOOST_GET_CONST(paddle::platform::CPUPlace, dst_place);
+    auto ctx_place = dev_ctx.GetPlace();
+    PADDLE_ENFORCE_EQ(
+        paddle::platform::is_gpu_place(ctx_place),
+        true,
+        paddle::platform::errors::PreconditionNotMet(
+            "Context place error, excepted GPUPlace, but actually %s.",
+            ctx_place));
+    auto ctx_gpu_place =
+        BOOST_GET_CONST(paddle::platform::CUDAPlace, ctx_place);
+    PADDLE_ENFORCE_EQ(src_gpu_place,
+                      ctx_gpu_place,
+                      paddle::platform::errors::Unavailable(
+                          "Source place and context place do not match, source "
+                          "place is %s, context place is %s.",
+                          src_gpu_place,
+                          ctx_gpu_place));
+    auto stream =
+        reinterpret_cast<const paddle::platform::CUDADeviceContext&>(dev_ctx)
+            .stream();
+    paddle::memory::Copy(
+        dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
+  } else if (paddle::platform::is_cpu_place(src_place) &&  // NOLINT
+             paddle::platform::is_gpu_place(dst_place)) {
+    auto src_cpu_place = BOOST_GET_CONST(paddle::platform::CPUPlace, src_place);
+    auto dst_gpu_place =
+        BOOST_GET_CONST(paddle::platform::CUDAPlace, dst_place);
+    auto ctx_place = dev_ctx.GetPlace();
+    PADDLE_ENFORCE_EQ(
+        paddle::platform::is_gpu_place(ctx_place),
+        true,
+        paddle::platform::errors::PreconditionNotMet(
+            "Context place error, excepted GPUPlace, but actually %s.",
+            ctx_place));
+    auto ctx_gpu_place =
+        BOOST_GET_CONST(paddle::platform::CUDAPlace, ctx_place);
+    PADDLE_ENFORCE_EQ(dst_gpu_place,
+                      ctx_gpu_place,
+                      paddle::platform::errors::Unavailable(
+                          "Destination place and context place do not match, "
+                          "destination place is %s, context place is %s.",
+                          dst_gpu_place,
+                          ctx_gpu_place));
+    auto stream =
+        reinterpret_cast<const paddle::platform::CUDADeviceContext&>(dev_ctx)
+            .stream();
+    paddle::memory::Copy(
+        dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
+  } else if (paddle::platform::is_gpu_place(src_place) &&  // NOLINT
+             paddle::platform::is_cuda_pinned_place(dst_place)) {
+    auto src_gpu_place =
+        BOOST_GET_CONST(paddle::platform::CUDAPlace, src_place);
+    auto dst_cuda_pinned_place =
+        BOOST_GET_CONST(paddle::platform::CUDAPinnedPlace, dst_place);
+    auto ctx_place = dev_ctx.GetPlace();
+    PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(ctx_place),
+                      true,
+                      paddle::platform::errors::PreconditionNotMet(
+                          "Device context place mismatch. When copying Tensor "
+                          "data from GPU memory to CUDA Pinned memory, current "
+                          "device context place should be GPU."));
+    auto ctx_gpu_place =
+        BOOST_GET_CONST(paddle::platform::CUDAPlace, ctx_place);
+    PADDLE_ENFORCE_EQ(src_gpu_place,
+                      ctx_gpu_place,
+                      paddle::platform::errors::PreconditionNotMet(
+                          "The source GPU device and current device context do "
+                          "not match. The source GPU device number is %d, but "
+                          "device context GPU number is %d.",
+                          src_gpu_place.device,
+                          ctx_gpu_place.device));
+    auto stream =
+        reinterpret_cast<const paddle::platform::CUDADeviceContext&>(dev_ctx)
+            .stream();
+    paddle::memory::Copy(
+        dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
+  } else if (paddle::platform::is_cuda_pinned_place(src_place) &&  // NOLINT
+             paddle::platform::is_gpu_place(dst_place)) {
+    auto src_cuda_pinned_place =
+        BOOST_GET_CONST(paddle::platform::CUDAPinnedPlace, src_place);
+    auto dst_gpu_place =
+        BOOST_GET_CONST(paddle::platform::CUDAPlace, dst_place);
+    auto ctx_place = dev_ctx.GetPlace();
+    PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(ctx_place),
+                      true,
+                      paddle::platform::errors::PreconditionNotMet(
+                          "Device context place mismatch. When copying Tensor "
+                          "data from CUDA Pinned memory to GPU memory, current "
+                          "device context place should be GPU."));
+    auto ctx_gpu_place =
+        BOOST_GET_CONST(paddle::platform::CUDAPlace, ctx_place);
+    PADDLE_ENFORCE_EQ(dst_gpu_place,
+                      ctx_gpu_place,
+                      paddle::platform::errors::PreconditionNotMet(
+                          "The target GPU device and current device context do "
+                          "not match. The target GPU device number is %d, but "
+                          "device context GPU number is %d.",
+                          dst_gpu_place.device,
+                          ctx_gpu_place.device));
+    auto stream =
+        reinterpret_cast<const paddle::platform::CUDADeviceContext&>(dev_ctx)
+            .stream();
+    paddle::memory::Copy(
+        dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size, stream);
+  } else if (paddle::platform::is_gpu_place(src_place) &&  // NOLINT
+             paddle::platform::is_gpu_place(dst_place)) {
+    auto src_gpu_place =
+        BOOST_GET_CONST(paddle::platform::CUDAPlace, src_place);
+    auto dst_gpu_place =
+        BOOST_GET_CONST(paddle::platform::CUDAPlace, dst_place);
+    auto ctx_place = dev_ctx.GetPlace();
+    PADDLE_ENFORCE_EQ(
+        paddle::platform::is_gpu_place(ctx_place),
+        true,
+        paddle::platform::errors::PreconditionNotMet(
+            "Context place error, excepted GPUPlace, but actually %s.",
+            ctx_place));
+    auto stream =
+        reinterpret_cast<const paddle::platform::CUDADeviceContext&>(dev_ctx)
+            .stream();
+    if (paddle::platform::is_same_place(src_place, dst_place)) {
+      paddle::memory::Copy(
+          dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
+    } else {
+      if (paddle::platform::is_same_place(ctx_place, src_place)) {
+        paddle::memory::Copy(
+            dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
+        paddle::platform::DeviceContextPool::Instance()
+            .Get(src.place())
+            ->Wait();
+      } else if (paddle::platform::is_same_place(ctx_place, dst_place)) {
+        paddle::platform::DeviceContextPool::Instance()
+            .Get(src.place())
+            ->Wait();
+        paddle::memory::Copy(
+            dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
+      } else {
+        PADDLE_THROW(paddle::platform::errors::Unavailable(
+            "Context place dose not match the source and destination place."));
+      }
+    }
+  }
+}
+
+}  // namespace pt
+
+// TODO(chenweihang): replace by better impl
+PT_REGISTER_MODULE(UtilsCUDA);
+
+PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", CUDA, Any, pt::Copy) {}
diff --git a/paddle/tcmpt/cuda/utils.h b/paddle/tcmpt/cuda/utils.h
new file mode 100644
index 0000000000000..4d3196b2f877b
--- /dev/null
+++ b/paddle/tcmpt/cuda/utils.h
@@ -0,0 +1,28 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/tcmpt/core/dense_tensor.h"
+#include "paddle/tcmpt/core/kernel_registry.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/device_context.h"
+namespace pt {
+
+using CUDAContext = paddle::platform::CUDADeviceContext;
+
+void Copy(const CUDAContext& dev_ctx, const DenseTensor& src, DenseTensor* dst);
+
+}  // namespace pt
diff --git a/paddle/tcmpt/hapi/include/manipulation.h b/paddle/tcmpt/hapi/include/manipulation.h
new file mode 100644
index 0000000000000..35695f4f6d8b6
--- /dev/null
+++ b/paddle/tcmpt/hapi/include/manipulation.h
@@ -0,0 +1,25 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/tcmpt/hapi/include/tensor.h"
+
+namespace paddle {
+namespace experimental {
+
+Tensor flatten(const Tensor& x, int start_axis, int stop_axis);
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/tcmpt/hapi/lib/CMakeLists.txt b/paddle/tcmpt/hapi/lib/CMakeLists.txt
index c9f0fe2691a92..74467603c62b6 100644
--- a/paddle/tcmpt/hapi/lib/CMakeLists.txt
+++ b/paddle/tcmpt/hapi/lib/CMakeLists.txt
@@ -1,3 +1,4 @@
 cc_library(math_api SRCS math.cc DEPS tcmpt)
 cc_library(linalg_api SRCS linalg.cc DEPS tcmpt)
 cc_library(creation_api SRCS creation.cc DEPS tcmpt)
+cc_library(manipulation_api SRCS manipulation.cc DEPS tcmpt)
diff --git a/paddle/tcmpt/hapi/lib/creation.cc b/paddle/tcmpt/hapi/lib/creation.cc
index 87fdd204dadd5..057855a3dba4c 100644
--- a/paddle/tcmpt/hapi/lib/creation.cc
+++ b/paddle/tcmpt/hapi/lib/creation.cc
@@ -47,18 +47,16 @@ Tensor full_like(const Tensor& x, const pt::Scalar& value, pt::DataType dtype) {
   kernel_context.EmplaceBackAttr(value);
 
   // 4. InferShape
-  auto out_dims = pt::UnchangedInferShape(dense_x->dims());
+  auto out_meta = UnchangedInferShape(dense_x->meta());
 
   // 5. Prepare outputs
   Tensor out;
-  auto out_def = kernel.args_def().output_defs()[0];
   // InferDataType
   if (dtype != pt::DataType::kUndef) {
-    out_def.SetDataType(dtype);
+    out_meta.type = dtype;
   }
-  auto dense_out = std::make_shared<pt::DenseTensor>(
-      pt::TensorMeta(out_dims, out_def.backend, out_def.dtype, out_def.layout),
-      pt::TensorStatus());
+  auto dense_out =
+      std::make_shared<pt::DenseTensor>(out_meta, pt::TensorStatus());
   kernel_context.EmplaceBackOutput(dense_out);
   out.set_impl(dense_out);
 
diff --git a/paddle/tcmpt/hapi/lib/linalg.cc b/paddle/tcmpt/hapi/lib/linalg.cc
index c21f37ead223a..dc11bae3e37b7 100644
--- a/paddle/tcmpt/hapi/lib/linalg.cc
+++ b/paddle/tcmpt/hapi/lib/linalg.cc
@@ -20,7 +20,11 @@ limitations under the License. */
 
 #include "paddle/tcmpt/api/include/core.h"
 #include "paddle/tcmpt/api/include/infershape.h"
+#include "paddle/tcmpt/core/convert_utils.h"
+#include "paddle/tcmpt/core/dense_tensor.h"
+#include "paddle/tcmpt/core/kernel_context.h"
 #include "paddle/tcmpt/hapi/lib/kernel_generate.h"
+#include "paddle/tcmpt/infershape/binary.h"
 
 namespace paddle {
 namespace experimental {
@@ -49,15 +53,13 @@ Tensor dot(const Tensor& x, const Tensor& y) {
 
   // 4. InferShape
   // TODO(chenweihang): how to auto selected infershape?
-  auto out_dims = pt::DotInferShape(dense_x->dims());
+  auto out_meta = DotInferShape(dense_x->meta(), dense_y->meta());
 
   // 5. Prepare outputs
   Tensor out;
   // TODO(chenweihang): deal with multiple outputs
-  auto out_def = kernel.args_def().output_defs()[0];
-  auto dense_out = std::make_shared<pt::DenseTensor>(
-      pt::TensorMeta(out_dims, out_def.backend, out_def.dtype, out_def.layout),
-      pt::TensorStatus());
+  auto dense_out =
+      std::make_shared<pt::DenseTensor>(out_meta, pt::TensorStatus());
   kernel_context.EmplaceBackOutput(dense_out);
   out.set_impl(dense_out);
 
diff --git a/paddle/tcmpt/hapi/lib/manipulation.cc b/paddle/tcmpt/hapi/lib/manipulation.cc
new file mode 100644
index 0000000000000..c8448eecfe2de
--- /dev/null
+++ b/paddle/tcmpt/hapi/lib/manipulation.cc
@@ -0,0 +1,67 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/tcmpt/hapi/include/manipulation.h"
+
+#include <memory>
+
+#include "glog/logging.h"
+#include "paddle/tcmpt/api/include/core.h"
+#include "paddle/tcmpt/hapi/lib/kernel_generate.h"
+#include "paddle/tcmpt/infershape/unary.h"
+
+namespace paddle {
+namespace experimental {
+
+Tensor flatten(const Tensor& x, int start_axis, int stop_axis) {
+  // 1. Get kernel signature and kernel
+  auto kernel_signature =
+      ParseKernelNameAndKeyByArgs("flatten_contiguous_range", x);
+  VLOG(1) << kernel_signature.first;
+  VLOG(1) << kernel_signature.second;
+  VLOG(1) << pt::KernelFactory::Instance();
+
+  auto kernel = pt::KernelFactory::Instance().SelectKernelOrThrowError(
+      kernel_signature.first, kernel_signature.second);
+  VLOG(1) << kernel;
+
+  // 2. Get Device Context
+  auto* dev_ctx = GetDeviceContextByBackend(kernel_signature.second.backend());
+  auto kernel_context = pt::KernelContext(*dev_ctx);
+
+  // 3. Auto data transform
+  auto dense_x = std::dynamic_pointer_cast<pt::DenseTensor>(x.impl());
+  kernel_context.EmplaceBackInput(dense_x);
+  kernel_context.EmplaceBackAttr(start_axis);
+  kernel_context.EmplaceBackAttr(stop_axis);
+
+  // 4. InferShape
+  // TODO(chenweihang): how to auto selected infershape?
+  auto out_meta = FlattenInferShape(dense_x->meta(), start_axis, stop_axis);
+
+  // 5. Prepare outputs
+  Tensor out;
+  // TODO(chenweihang): deal with multiple outputs
+  auto dense_out =
+      std::make_shared<pt::DenseTensor>(out_meta, pt::TensorStatus());
+  kernel_context.EmplaceBackOutput(dense_out);
+  out.set_impl(dense_out);
+
+  // 6. Call kernel
+  kernel(&kernel_context);
+
+  return out;
+}
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/tcmpt/hapi/lib/math.cc b/paddle/tcmpt/hapi/lib/math.cc
index 6088b24f2eda9..531e85298758c 100644
--- a/paddle/tcmpt/hapi/lib/math.cc
+++ b/paddle/tcmpt/hapi/lib/math.cc
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/tcmpt/api/include/core.h"
 #include "paddle/tcmpt/api/include/infershape.h"
 #include "paddle/tcmpt/hapi/lib/kernel_generate.h"
+#include "paddle/tcmpt/infershape/unary.h"
 
 namespace paddle {
 namespace experimental {
@@ -47,15 +48,13 @@ Tensor mean(const Tensor& x) {
 
   // 4. InferShape
   // TODO(chenweihang): how to auto selected infershape?
-  auto out_dims = pt::MeanInferShape(dense_x->dims());
+  auto out_meta = ReductionInferShape(dense_x->meta());
 
   // 5. Prepare outputs
   Tensor out;
   // TODO(chenweihang): deal with multiple outputs
-  auto out_def = kernel.args_def().output_defs()[0];
-  auto dense_out = std::make_shared<pt::DenseTensor>(
-      pt::TensorMeta(out_dims, out_def.backend, out_def.dtype, out_def.layout),
-      pt::TensorStatus());
+  auto dense_out =
+      std::make_shared<pt::DenseTensor>(out_meta, pt::TensorStatus());
   kernel_context.EmplaceBackOutput(dense_out);
   out.set_impl(dense_out);
 
diff --git a/paddle/tcmpt/infershape/CMakeLists.txt b/paddle/tcmpt/infershape/CMakeLists.txt
index e69de29bb2d1d..0b3771df3574a 100644
--- a/paddle/tcmpt/infershape/CMakeLists.txt
+++ b/paddle/tcmpt/infershape/CMakeLists.txt
@@ -0,0 +1,2 @@
+cc_library(unary SRCS unary.cc DEPS convert_utils)
+cc_library(binary SRCS binary.cc DEPS convert_utils)
diff --git a/paddle/tcmpt/infershape/binary.cc b/paddle/tcmpt/infershape/binary.cc
new file mode 100644
index 0000000000000..936af8767ca62
--- /dev/null
+++ b/paddle/tcmpt/infershape/binary.cc
@@ -0,0 +1,62 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/tcmpt/infershape/binary.h"
+
+namespace pt {
+
+TensorMeta DotInferShape(const TensorMeta& x_meta, const TensorMeta& y_meta) {
+  auto x_dims = x_meta.dims;
+  auto x_rank = static_cast<size_t>(x_dims.size());
+  PADDLE_ENFORCE_EQ(true,
+                    1 == x_rank || 2 == x_rank,
+                    paddle::platform::errors::PreconditionNotMet(
+                        "ShapeError: The dimensions of input tensor X (%s) "
+                        "should be 1 or 2",
+                        x_dims.to_str()));
+
+  auto y_dims = y_meta.dims;
+  PADDLE_ENFORCE_EQ(
+      true,
+      x_rank == (size_t)y_dims.size(),
+      paddle::platform::errors::PreconditionNotMet(
+          "ShapeError: The shape of input tensor Y: %s should match with "
+          "input tenosr X: %s",
+          y_dims.to_str(),
+          x_dims.to_str()));
+  bool shape_match = true;
+  for (size_t i = 0; i < x_rank; ++i) {
+    if (x_dims[i] != y_dims[i]) {
+      shape_match = false;
+      break;
+    }
+  }
+
+  PADDLE_ENFORCE_EQ(true,
+                    shape_match,
+                    paddle::platform::errors::PreconditionNotMet(
+                        "ShapeError: The shape of input tensor X: %s should "
+                        "be exactly the same "
+                        "with input tensor Y: %s",
+                        x_dims.to_str(),
+                        y_dims.to_str()));
+
+  x_dims[x_dims.size() - 1] = 1;
+  TensorMeta return_meta(
+      x_dims, x_meta.backend, x_meta.type, x_meta.layout, x_meta.offset);
+  return return_meta;
+}
+
+}  // namespace pt
diff --git a/paddle/tcmpt/infershape/binary.h b/paddle/tcmpt/infershape/binary.h
new file mode 100644
index 0000000000000..816963a277ade
--- /dev/null
+++ b/paddle/tcmpt/infershape/binary.h
@@ -0,0 +1,35 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/tcmpt/core/tensor_meta.h"
+
+namespace pt {
+
+// Common InferShape Functions for binary operators, The format like:
+//
+//   1. TensorMeta [OpName]InferShape(const TensorMeta& x_meta, ...) {}
+//   2. std::pair<TensorMeta, TensorMeta> [OpName]InferShape(const TensorMeta&
+//   x_meta, ...) {}
+//   3. std::tuple<TensorMeta, TensorMeta, TensorMeta> [OpName]InferShape(const
+//   TensorMeta& x_meta, ...)
+//  NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good.
+//  Because functions in this file
+//  not only can infer shape, but alse need infer lod or other useful data.
+
+TensorMeta DotInferShape(const TensorMeta& x_meta, const TensorMeta& y_meta);
+
+}  // namespace pt
diff --git a/paddle/tcmpt/infershape/unary.cc b/paddle/tcmpt/infershape/unary.cc
new file mode 100644
index 0000000000000..3e4a633fa7a7c
--- /dev/null
+++ b/paddle/tcmpt/infershape/unary.cc
@@ -0,0 +1,77 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/tcmpt/infershape/unary.h"
+
+namespace pt {
+
+TensorMeta UnchangedInferShape(const TensorMeta& x_meta) { return x_meta; }
+
+TensorMeta ReductionInferShape(const TensorMeta& x_meta) {
+  const auto& out_dims = paddle::framework::make_ddim({1});
+  TensorMeta return_meta(
+      out_dims, x_meta.backend, x_meta.type, x_meta.layout, x_meta.offset);
+  return return_meta;
+}
+
+TensorMeta FlattenInferShape(const TensorMeta& x_meta,
+                             int start_axis,
+                             int stop_axis) {
+  auto& x_dims = x_meta.dims;
+  int in_dims_size = x_dims.size();
+  if (start_axis < 0) {
+    start_axis = start_axis + in_dims_size;
+  }
+  if (stop_axis < 0) {
+    stop_axis = stop_axis + in_dims_size;
+  }
+  PADDLE_ENFORCE_GE(stop_axis,
+                    start_axis,
+                    paddle::platform::errors::InvalidArgument(
+                        "The stop_axis should be greater"
+                        "than or equal to start_axis."));
+
+  int64_t outer = 1;
+  std::vector<int32_t> out_shape;
+  out_shape.reserve(in_dims_size - stop_axis + start_axis);
+
+  for (int i = 0; i < start_axis; ++i) {
+    out_shape.push_back(x_dims[i]);
+  }
+  for (int i = start_axis; i <= stop_axis; i++) {
+    if (x_dims[i] == -1 || outer == -1) {
+      outer = -1;
+    } else {
+      outer *= x_dims[i];
+    }
+  }
+  out_shape.push_back(outer);
+  for (int i = stop_axis + 1; i < in_dims_size; i++) {
+    out_shape.push_back(x_dims[i]);
+  }
+  const auto& out_dims = paddle::framework::make_ddim(out_shape);
+  TensorMeta return_meta(
+      out_dims, x_meta.backend, x_meta.type, x_meta.layout, x_meta.offset);
+
+  if (x_dims[0] == return_meta.dims[0]) {
+    // Only pass LoD when the first dimension of output and Input(X)
+    // are the same.
+    return_meta.lod = x_meta.lod;
+  }
+
+  return return_meta;
+}
+
+}  // namespace pt
diff --git a/paddle/tcmpt/infershape/unary.h b/paddle/tcmpt/infershape/unary.h
index 64a735c060edc..b835ec4bcfa72 100644
--- a/paddle/tcmpt/infershape/unary.h
+++ b/paddle/tcmpt/infershape/unary.h
@@ -15,27 +15,27 @@ limitations under the License. */
 #pragma once
 
 // See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/framework/ddim.h"
+#include "paddle/tcmpt/core/tensor_meta.h"
 
 namespace pt {
 
-using DDim = paddle::framework::DDim;
-
-// Common InferShape Functions, The format like:
+// Common InferShape Functions for unary operators, The format like:
 //
-//   1. DDim [OpName]InferShape(const DDim& x_dim, ...) {}
-//   2. std::pair<DDim, DDim> [OpName]InferShape(const DDim& x_dim, ...) {}
-//   3. std::tuple<DDim, DDim, DDim> [OpName]InferShape(const DDim& x_dim, ...)
-//   {}
-
-DDim UnchangedInferShape(const DDim& x_dim) { return x_dim; }
-
-DDim MeanInferShape(const DDim& x_dim) { return {1}; }
-
-DDim DotInferShape(const DDim& x_dim) {
-  auto dims = paddle::framework::vectorize(x_dim);
-  dims[dims.size() - 1] = 1;
-  return paddle::framework::make_ddim(dims);
-}
+//   1. TensorMeta [OpName]InferShape(const TensorMeta& x_meta, ...) {}
+//   2. std::pair<TensorMeta, TensorMeta> [OpName]InferShape(const TensorMeta&
+//   x_meta, ...) {}
+//   3. std::tuple<TensorMeta, TensorMeta, TensorMeta> [OpName]InferShape(const
+//   TensorMeta& x_meta, ...)
+//  NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good.
+//  Because functions in this file
+//  not only can infer shape, but alse need infer lod or other useful data.
+
+TensorMeta UnchangedInferShape(const TensorMeta& x_meta);
+
+TensorMeta ReductionInferShape(const TensorMeta& x_meta);
+
+TensorMeta FlattenInferShape(const TensorMeta& x_meta,
+                             int start_axis,
+                             int stop_axis);
 
 }  // namespace pt
diff --git a/paddle/tcmpt/tests/CMakeLists.txt b/paddle/tcmpt/tests/CMakeLists.txt
index acf1624bc7e12..5cc7a3f4cc77e 100644
--- a/paddle/tcmpt/tests/CMakeLists.txt
+++ b/paddle/tcmpt/tests/CMakeLists.txt
@@ -3,3 +3,5 @@ cc_test(kernel_factory_test SRCS kernel_factory_test.cc DEPS kernel_factory)
 cc_test(test_mean_api SRCS test_mean_api.cc DEPS math_api)
 cc_test(test_dot_api SRCS test_dot_api.cc DEPS linalg_api)
 cc_test(test_fill_api SRCS test_fill_api.cc DEPS creation_api)
+cc_test(test_copy_api SRCS test_copy_api.cc DEPS utils_cpu)
+cc_test(test_flatten_api SRCS test_flatten_api.cc DEPS utils_cpu manipulation_api)
diff --git a/paddle/tcmpt/tests/test_copy_api.cc b/paddle/tcmpt/tests/test_copy_api.cc
new file mode 100644
index 0000000000000..7f1158912ebfb
--- /dev/null
+++ b/paddle/tcmpt/tests/test_copy_api.cc
@@ -0,0 +1,64 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <memory>
+
+#include "paddle/tcmpt/core/kernel_registry.h"
+#include "paddle/tcmpt/cpu/utils.h"
+
+#include "paddle/tcmpt/core/dense_tensor.h"
+
+PT_DECLARE_MODULE(UtilsCPU);
+
+namespace framework = paddle::framework;
+using DDim = paddle::framework::DDim;
+
+// TODO(YuanRisheng): This TEST file need to be refactored after 'copy' realized
+// in
+// 'paddle/api',
+TEST(API, copy) {
+  // 1. create tensor
+  auto dense_src = std::make_shared<pt::DenseTensor>(
+      pt::TensorMeta(framework::make_ddim({2, 3}),
+                     pt::Backend::kCPU,
+                     pt::DataType::kFLOAT32,
+                     pt::DataLayout::kNCHW),
+      pt::TensorStatus());
+  auto* dense_x_data = dense_src->mutable_data<float>();
+
+  auto dense_dst = std::make_shared<pt::DenseTensor>(
+      pt::TensorMeta(framework::make_ddim({2, 3}),
+                     pt::Backend::kCPU,
+                     pt::DataType::kFLOAT32,
+                     pt::DataLayout::kNCHW),
+      pt::TensorStatus());
+
+  for (size_t i = 0; i < 2; ++i) {
+    for (size_t j = 0; j < 3; ++j) {
+      dense_x_data[i * 3 + j] = (i * 3 + j) * 1.0;
+    }
+  }
+  const auto& a = paddle::platform::CPUPlace();
+  std::cout << typeid(a).name() << std::endl;
+  // 2. test API
+  auto& pool = paddle::platform::DeviceContextPool::Instance();
+  auto* dev_ctx = pool.GetByPlace(paddle::platform::CPUPlace());
+  pt::Copy(*dev_ctx, *(dense_src.get()), dense_dst.get());
+
+  // 3. check result
+  for (int64_t i = 0; i < dense_src->numel(); i++) {
+    ASSERT_EQ(dense_src->data<float>()[i], dense_dst->data<float>()[i]);
+  }
+}
diff --git a/paddle/tcmpt/tests/test_flatten_api.cc b/paddle/tcmpt/tests/test_flatten_api.cc
new file mode 100644
index 0000000000000..d2e3ee4278e1d
--- /dev/null
+++ b/paddle/tcmpt/tests/test_flatten_api.cc
@@ -0,0 +1,69 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <memory>
+
+#include "paddle/tcmpt/hapi/include/manipulation.h"
+
+#include "paddle/tcmpt/core/dense_tensor.h"
+#include "paddle/tcmpt/core/kernel_registry.h"
+
+PT_DECLARE_MODULE(ManipulationCPU);
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PT_DECLARE_MODULE(ManipulationCUDA);
+#endif
+
+namespace framework = paddle::framework;
+using DDim = paddle::framework::DDim;
+
+TEST(API, flatten) {
+  // 1. create tensor
+  auto dense_x = std::make_shared<pt::DenseTensor>(
+      pt::TensorMeta(framework::make_ddim({3, 2, 2, 3}),
+                     pt::Backend::kCPU,
+                     pt::DataType::kFLOAT32,
+                     pt::DataLayout::kNCHW),
+      pt::TensorStatus());
+  auto* dense_x_data = dense_x->mutable_data<float>();
+
+  for (int i = 0; i < dense_x->numel(); i++) {
+    dense_x_data[i] = i;
+  }
+
+  paddle::experimental::Tensor x(dense_x);
+  int start_axis = 1, stop_axis = 2;
+  // 2. test API
+  auto out = paddle::experimental::flatten(x, start_axis, stop_axis);
+
+  // 3. check result
+  std::vector<int> expect_shape = {3, 4, 3};
+  ASSERT_EQ(out.shape()[0], expect_shape[0]);
+  ASSERT_EQ(out.shape()[1], expect_shape[1]);
+  ASSERT_EQ(out.shape()[2], expect_shape[2]);
+  ASSERT_EQ(out.numel(), 36);
+  ASSERT_EQ(out.is_cpu(), true);
+  ASSERT_EQ(out.type(), pt::DataType::kFLOAT32);
+  ASSERT_EQ(out.layout(), pt::DataLayout::kNCHW);
+  ASSERT_EQ(out.initialized(), true);
+  bool value_equal = true;
+  auto dense_out = std::dynamic_pointer_cast<pt::DenseTensor>(out.impl());
+  auto* dense_out_data = dense_out->data<float>();
+  for (int i = 0; i < dense_x->numel(); i++) {
+    if (std::abs(dense_x_data[i] - dense_out_data[i]) > 1e-6f)
+      value_equal = false;
+  }
+  ASSERT_EQ(value_equal, true);
+}

From e0322d5086e9605a33a271ff9c08c4a025b19771 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Fri, 15 Oct 2021 07:04:38 +0000
Subject: [PATCH 083/125] Revert "use flat_hash_map and small_vector in kernel
 factory"

This reverts commit 23091495cfdd3df8cc1be592d30f09ea66a7c72b.
---
 paddle/tcmpt/core/kernel_factory.h | 36 +++++++++++++-----------------
 1 file changed, 15 insertions(+), 21 deletions(-)

diff --git a/paddle/tcmpt/core/kernel_factory.h b/paddle/tcmpt/core/kernel_factory.h
index db1f0df76e6ba..180f0ce2c6b87 100644
--- a/paddle/tcmpt/core/kernel_factory.h
+++ b/paddle/tcmpt/core/kernel_factory.h
@@ -16,14 +16,13 @@
 
 #include <ostream>
 #include <string>
+#include <unordered_map>
 #include <utility>
 
 #include "paddle/tcmpt/core/backend.h"
 #include "paddle/tcmpt/core/dtype.h"
 #include "paddle/tcmpt/core/kernel_def.h"
 #include "paddle/tcmpt/core/layout.h"
-#include "paddle/utils/flat_hash_map.h"
-#include "paddle/utils/small_vector.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/enforce.h"
@@ -210,30 +209,25 @@ class KernelArgsDef {
     attribute_defs_.emplace_back(AttributeArgDef(type_index));
   }
 
-  const paddle::SmallVector<TensorArgDef>& input_defs() const {
-    return input_defs_;
-  }
+  const std::vector<TensorArgDef>& input_defs() const { return input_defs_; }
 
-  const paddle::SmallVector<TensorArgDef>& output_defs() const {
-    return output_defs_;
-  }
+  const std::vector<TensorArgDef>& output_defs() const { return output_defs_; }
 
-  const paddle::SmallVector<AttributeArgDef>& attribute_defs() const {
+  const std::vector<AttributeArgDef>& attribute_defs() const {
     return attribute_defs_;
   }
 
-  paddle::SmallVector<TensorArgDef>& input_defs() { return input_defs_; }
+  std::vector<TensorArgDef>& input_defs() { return input_defs_; }
 
-  paddle::SmallVector<TensorArgDef>& output_defs() { return output_defs_; }
+  std::vector<TensorArgDef>& output_defs() { return output_defs_; }
 
-  paddle::SmallVector<AttributeArgDef>& attribute_defs() {
-    return attribute_defs_;
-  }
+  std::vector<AttributeArgDef>& attribute_defs() { return attribute_defs_; }
 
  private:
-  paddle::SmallVector<TensorArgDef> input_defs_{{}};
-  paddle::SmallVector<TensorArgDef> output_defs_{{}};
-  paddle::SmallVector<AttributeArgDef> attribute_defs_{{}};
+  // TODO(chenweihang): replaced by paddle::small_vector
+  std::vector<TensorArgDef> input_defs_{{}};
+  std::vector<TensorArgDef> output_defs_{{}};
+  std::vector<AttributeArgDef> attribute_defs_{{}};
 };
 
 class Kernel {
@@ -269,10 +263,10 @@ class Kernel {
 class KernelFactory {
  public:
   // replaced by paddle::flat_hash_map later
-  using KernelMap = paddle::flat_hash_map<
-      KernelName,
-      paddle::flat_hash_map<KernelKey, Kernel, KernelKey::Hash>,
-      KernelName::Hash>;
+  using KernelMap =
+      std::unordered_map<KernelName,
+                         std::unordered_map<KernelKey, Kernel, KernelKey::Hash>,
+                         KernelName::Hash>;
 
   static KernelFactory& Instance();
 

From d3ab6553260b8113346cf9d080a73ef2ff0ad1d9 Mon Sep 17 00:00:00 2001
From: zyfncg <1370305206@qq.com>
Date: Fri, 15 Oct 2021 22:19:38 +0800
Subject: [PATCH 084/125] Move cpu, cuda and other device code into kernels
 (#15)

* fill_any_like kernel refactor

* remove useless code of full_like c++ api

* Support Scalar in Tensor Compute Library

* add scalar in dygraph and static graph mode

* keep the basic type for attr, instead of using scalar for all

* merge the code

* start refactor matmul

* move cpu, cuda and other device modules into kernels

* merge code

* polish code in operator.cc
---
 paddle/fluid/framework/operator.cc            |  1 -
 paddle/tcmpt/CMakeLists.txt                   | 21 +------------------
 paddle/tcmpt/api/include/creation.h           |  4 ++--
 paddle/tcmpt/api/include/linalg.h             |  4 ++--
 paddle/tcmpt/api/include/manipulation.h       |  4 ++--
 paddle/tcmpt/api/include/math.h               |  4 ++--
 paddle/tcmpt/hapi/include/linalg.h            |  5 +++++
 paddle/tcmpt/kernels/CMakeLists.txt           | 18 ++++++++++++++++
 .../{ => kernels/common}/eigen/CMakeLists.txt |  0
 .../tcmpt/{ => kernels/common}/eigen/common.h |  0
 paddle/tcmpt/{ => kernels/common}/eigen/dot.h |  2 +-
 .../tcmpt/{ => kernels/common}/eigen/fill.h   |  2 +-
 .../tcmpt/{ => kernels/common}/eigen/mean.h   |  2 +-
 .../tcmpt/{ => kernels/common}/eigen/scale.h  |  2 +-
 .../tcmpt/{ => kernels/common}/eigen/sign.h   |  2 +-
 paddle/tcmpt/{ => kernels}/cpu/CMakeLists.txt |  2 +-
 paddle/tcmpt/{ => kernels}/cpu/creation.cc    |  4 ++--
 paddle/tcmpt/{ => kernels}/cpu/creation.h     |  0
 paddle/tcmpt/{ => kernels}/cpu/linalg.cc      | 11 +++++++++-
 paddle/tcmpt/{ => kernels}/cpu/linalg.h       |  8 +++++++
 .../tcmpt/{ => kernels}/cpu/manipulation.cc   |  4 ++--
 paddle/tcmpt/{ => kernels}/cpu/manipulation.h |  0
 paddle/tcmpt/{ => kernels}/cpu/math.cc        |  8 +++----
 paddle/tcmpt/{ => kernels}/cpu/math.h         |  0
 paddle/tcmpt/{ => kernels}/cpu/utils.cc       |  2 +-
 paddle/tcmpt/{ => kernels}/cpu/utils.h        |  0
 .../tcmpt/{ => kernels}/cuda/CMakeLists.txt   |  2 +-
 paddle/tcmpt/{ => kernels}/cuda/creation.cu   |  4 ++--
 paddle/tcmpt/{ => kernels}/cuda/creation.h    |  0
 paddle/tcmpt/{ => kernels}/cuda/linalg.cu     |  4 ++--
 paddle/tcmpt/{ => kernels}/cuda/linalg.h      |  0
 .../tcmpt/{ => kernels}/cuda/manipulation.cu  |  4 ++--
 .../tcmpt/{ => kernels}/cuda/manipulation.h   |  0
 paddle/tcmpt/{ => kernels}/cuda/math.cu       |  8 +++----
 paddle/tcmpt/{ => kernels}/cuda/math.h        |  0
 paddle/tcmpt/{ => kernels}/cuda/utils.cu      |  2 +-
 paddle/tcmpt/{ => kernels}/cuda/utils.h       |  0
 .../tcmpt/{ => kernels}/mkldnn/CMakeLists.txt |  0
 paddle/tcmpt/{ => kernels}/npu/CMakeLists.txt |  0
 paddle/tcmpt/{ => kernels}/xpu/CMakeLists.txt |  0
 paddle/tcmpt/tests/test_copy_api.cc           |  2 +-
 41 files changed, 78 insertions(+), 58 deletions(-)
 create mode 100644 paddle/tcmpt/kernels/CMakeLists.txt
 rename paddle/tcmpt/{ => kernels/common}/eigen/CMakeLists.txt (100%)
 rename paddle/tcmpt/{ => kernels/common}/eigen/common.h (100%)
 rename paddle/tcmpt/{ => kernels/common}/eigen/dot.h (96%)
 rename paddle/tcmpt/{ => kernels/common}/eigen/fill.h (97%)
 rename paddle/tcmpt/{ => kernels/common}/eigen/mean.h (96%)
 rename paddle/tcmpt/{ => kernels/common}/eigen/scale.h (96%)
 rename paddle/tcmpt/{ => kernels/common}/eigen/sign.h (96%)
 rename paddle/tcmpt/{ => kernels}/cpu/CMakeLists.txt (89%)
 rename paddle/tcmpt/{ => kernels}/cpu/creation.cc (92%)
 rename paddle/tcmpt/{ => kernels}/cpu/creation.h (100%)
 rename paddle/tcmpt/{ => kernels}/cpu/linalg.cc (86%)
 rename paddle/tcmpt/{ => kernels}/cpu/linalg.h (82%)
 rename paddle/tcmpt/{ => kernels}/cpu/manipulation.cc (96%)
 rename paddle/tcmpt/{ => kernels}/cpu/manipulation.h (100%)
 rename paddle/tcmpt/{ => kernels}/cpu/math.cc (93%)
 rename paddle/tcmpt/{ => kernels}/cpu/math.h (100%)
 rename paddle/tcmpt/{ => kernels}/cpu/utils.cc (97%)
 rename paddle/tcmpt/{ => kernels}/cpu/utils.h (100%)
 rename paddle/tcmpt/{ => kernels}/cuda/CMakeLists.txt (94%)
 rename paddle/tcmpt/{ => kernels}/cuda/creation.cu (92%)
 rename paddle/tcmpt/{ => kernels}/cuda/creation.h (100%)
 rename paddle/tcmpt/{ => kernels}/cuda/linalg.cu (93%)
 rename paddle/tcmpt/{ => kernels}/cuda/linalg.h (100%)
 rename paddle/tcmpt/{ => kernels}/cuda/manipulation.cu (96%)
 rename paddle/tcmpt/{ => kernels}/cuda/manipulation.h (100%)
 rename paddle/tcmpt/{ => kernels}/cuda/math.cu (95%)
 rename paddle/tcmpt/{ => kernels}/cuda/math.h (100%)
 rename paddle/tcmpt/{ => kernels}/cuda/utils.cu (99%)
 rename paddle/tcmpt/{ => kernels}/cuda/utils.h (100%)
 rename paddle/tcmpt/{ => kernels}/mkldnn/CMakeLists.txt (100%)
 rename paddle/tcmpt/{ => kernels}/npu/CMakeLists.txt (100%)
 rename paddle/tcmpt/{ => kernels}/xpu/CMakeLists.txt (100%)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 32fc10f38bd48..2ea761944671b 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1966,7 +1966,6 @@ pt::KernelContext OperatorWithKernel::ConstructPtKernelContext(
   }
 
   for (size_t i = 0; i < attr_defs.size(); ++i) {
-    paddle::any attr_item;
     if (attr_defs[i].type_index == std::type_index(typeid(pt::Scalar))) {
       // TODO(chenweihang): support other attrs
       // In principle, the attr required by the dynamic mode should be
diff --git a/paddle/tcmpt/CMakeLists.txt b/paddle/tcmpt/CMakeLists.txt
index c21428ef4715b..0187a63c2ff6d 100644
--- a/paddle/tcmpt/CMakeLists.txt
+++ b/paddle/tcmpt/CMakeLists.txt
@@ -5,27 +5,8 @@ add_subdirectory(api)
 add_subdirectory(hapi)
 # tcmpt core components
 add_subdirectory(core)
-# tcmpt eigne functors, now paddle must compiled with eigen, but eigen just is
-# one backend dtype, we should support cropping it for lite
-add_subdirectory(eigen)
 # tcmpt kernels for diff device
-add_subdirectory(cpu)
-if(WITH_GPU OR WITH_ROCM)
-  # TODO(chenweihang): if hip can split from cuda impl, we should add hip dir
-  add_subdirectory(cuda)
-endif()
-# TODO(chenweihang): migrate MKLDNN Kernel in the second phase of the project
-if(WITH_MKLDNN)
-  add_subdirectory(mkldnn)
-endif()
-# TODO(chenweihang): migrate NPU Kernel in the second phase of the project
-if(WITH_ASCEND_CL)
-  add_subdirectory(npu)
-endif()
-# TODO(chenweihang): migrate XPU Kernel in the second phase of the project
-if(WITH_XPU)
-  add_subdirectory(xpu)
-endif()
+add_subdirectory(kernels)
 # tcmpt infershape
 add_subdirectory(infershape)
 # TODO(xingfeng): tcmpt inner module API designed by a high-performance team
diff --git a/paddle/tcmpt/api/include/creation.h b/paddle/tcmpt/api/include/creation.h
index e0ef25d202c6e..2a87453b32154 100644
--- a/paddle/tcmpt/api/include/creation.h
+++ b/paddle/tcmpt/api/include/creation.h
@@ -14,5 +14,5 @@
 
 #pragma once
 
-#include "paddle/tcmpt/cpu/creation.h"
-#include "paddle/tcmpt/cuda/creation.h"
+#include "paddle/tcmpt/kernels/cpu/creation.h"
+#include "paddle/tcmpt/kernels/cuda/creation.h"
diff --git a/paddle/tcmpt/api/include/linalg.h b/paddle/tcmpt/api/include/linalg.h
index 46acfaea32163..81ea68abcd0bb 100644
--- a/paddle/tcmpt/api/include/linalg.h
+++ b/paddle/tcmpt/api/include/linalg.h
@@ -15,5 +15,5 @@
 #pragma once
 
 // See Note: [ How do we organize the kernel directory ]
-#include "paddle/tcmpt/cpu/linalg.h"
-#include "paddle/tcmpt/cuda/linalg.h"
+#include "paddle/tcmpt/kernels/cpu/linalg.h"
+#include "paddle/tcmpt/kernels/cuda/linalg.h"
diff --git a/paddle/tcmpt/api/include/manipulation.h b/paddle/tcmpt/api/include/manipulation.h
index b44e53c01384b..1746929ca181d 100644
--- a/paddle/tcmpt/api/include/manipulation.h
+++ b/paddle/tcmpt/api/include/manipulation.h
@@ -15,5 +15,5 @@
 #pragma once
 
 // See Note: [ How do we organize the kernel directory ]
-#include "paddle/tcmpt/cpu/manipulation.h"
-#include "paddle/tcmpt/cuda/manipulation.h"
+#include "paddle/tcmpt/kernels/cpu/manipulation.h"
+#include "paddle/tcmpt/kernels/cuda/manipulation.h"
diff --git a/paddle/tcmpt/api/include/math.h b/paddle/tcmpt/api/include/math.h
index 2f1a04d16f8ac..ab3c229806990 100644
--- a/paddle/tcmpt/api/include/math.h
+++ b/paddle/tcmpt/api/include/math.h
@@ -15,5 +15,5 @@ limitations under the License. */
 #pragma once
 
 // See Note: [ How do we organize the kernel directory ]
-#include "paddle/tcmpt/cpu/math.h"
-#include "paddle/tcmpt/cuda/math.h"
+#include "paddle/tcmpt/kernels/cpu/math.h"
+#include "paddle/tcmpt/kernels/cuda/math.h"
diff --git a/paddle/tcmpt/hapi/include/linalg.h b/paddle/tcmpt/hapi/include/linalg.h
index 5e27fecd58a4e..df709b6a3c50f 100644
--- a/paddle/tcmpt/hapi/include/linalg.h
+++ b/paddle/tcmpt/hapi/include/linalg.h
@@ -21,5 +21,10 @@ namespace experimental {
 
 Tensor dot(const Tensor& x, const Tensor& y);
 
+Tensor matmul(const Tensor& x,
+              const Tensor& y,
+              bool transpose_x,
+              bool transpose_y);
+
 }  // namespace experimental
 }  // namespace paddle
diff --git a/paddle/tcmpt/kernels/CMakeLists.txt b/paddle/tcmpt/kernels/CMakeLists.txt
new file mode 100644
index 0000000000000..26b5e16d4428d
--- /dev/null
+++ b/paddle/tcmpt/kernels/CMakeLists.txt
@@ -0,0 +1,18 @@
+# tcmpt kernels for diff device
+add_subdirectory(cpu)
+if(WITH_GPU OR WITH_ROCM)
+  # TODO(chenweihang): if hip can split from cuda impl, we should add hip dir
+  add_subdirectory(cuda)
+endif()
+# TODO(chenweihang): migrate MKLDNN Kernel in the second phase of the project
+if(WITH_MKLDNN)
+  add_subdirectory(mkldnn)
+endif()
+# TODO(chenweihang): migrate NPU Kernel in the second phase of the project
+if(WITH_ASCEND_CL)
+  add_subdirectory(npu)
+endif()
+# TODO(chenweihang): migrate XPU Kernel in the second phase of the project
+if(WITH_XPU)
+  add_subdirectory(xpu)
+endif()
diff --git a/paddle/tcmpt/eigen/CMakeLists.txt b/paddle/tcmpt/kernels/common/eigen/CMakeLists.txt
similarity index 100%
rename from paddle/tcmpt/eigen/CMakeLists.txt
rename to paddle/tcmpt/kernels/common/eigen/CMakeLists.txt
diff --git a/paddle/tcmpt/eigen/common.h b/paddle/tcmpt/kernels/common/eigen/common.h
similarity index 100%
rename from paddle/tcmpt/eigen/common.h
rename to paddle/tcmpt/kernels/common/eigen/common.h
diff --git a/paddle/tcmpt/eigen/dot.h b/paddle/tcmpt/kernels/common/eigen/dot.h
similarity index 96%
rename from paddle/tcmpt/eigen/dot.h
rename to paddle/tcmpt/kernels/common/eigen/dot.h
index 5e323e4448409..32c1e1439fac7 100644
--- a/paddle/tcmpt/eigen/dot.h
+++ b/paddle/tcmpt/kernels/common/eigen/dot.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/tcmpt/core/dense_tensor.h"
-#include "paddle/tcmpt/eigen/common.h"
+#include "paddle/tcmpt/kernels/common/eigen/common.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/operators/eigen/eigen_function.h"
diff --git a/paddle/tcmpt/eigen/fill.h b/paddle/tcmpt/kernels/common/eigen/fill.h
similarity index 97%
rename from paddle/tcmpt/eigen/fill.h
rename to paddle/tcmpt/kernels/common/eigen/fill.h
index fb56ccdd8e125..186163c3fedc4 100644
--- a/paddle/tcmpt/eigen/fill.h
+++ b/paddle/tcmpt/kernels/common/eigen/fill.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/tcmpt/core/dense_tensor.h"
-#include "paddle/tcmpt/eigen/common.h"
+#include "paddle/tcmpt/kernels/common/eigen/common.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/operators/eigen/eigen_function.h"
diff --git a/paddle/tcmpt/eigen/mean.h b/paddle/tcmpt/kernels/common/eigen/mean.h
similarity index 96%
rename from paddle/tcmpt/eigen/mean.h
rename to paddle/tcmpt/kernels/common/eigen/mean.h
index e70870e7954b7..2b1ea95940727 100644
--- a/paddle/tcmpt/eigen/mean.h
+++ b/paddle/tcmpt/kernels/common/eigen/mean.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/tcmpt/core/dense_tensor.h"
-#include "paddle/tcmpt/eigen/common.h"
+#include "paddle/tcmpt/kernels/common/eigen/common.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/operators/eigen/eigen_function.h"
diff --git a/paddle/tcmpt/eigen/scale.h b/paddle/tcmpt/kernels/common/eigen/scale.h
similarity index 96%
rename from paddle/tcmpt/eigen/scale.h
rename to paddle/tcmpt/kernels/common/eigen/scale.h
index 152cb61800c8b..0f3e92d9db787 100644
--- a/paddle/tcmpt/eigen/scale.h
+++ b/paddle/tcmpt/kernels/common/eigen/scale.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/tcmpt/core/dense_tensor.h"
-#include "paddle/tcmpt/eigen/common.h"
+#include "paddle/tcmpt/kernels/common/eigen/common.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/operators/eigen/eigen_function.h"
diff --git a/paddle/tcmpt/eigen/sign.h b/paddle/tcmpt/kernels/common/eigen/sign.h
similarity index 96%
rename from paddle/tcmpt/eigen/sign.h
rename to paddle/tcmpt/kernels/common/eigen/sign.h
index d41702576b3a1..3980976ac9cf5 100644
--- a/paddle/tcmpt/eigen/sign.h
+++ b/paddle/tcmpt/kernels/common/eigen/sign.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/tcmpt/core/dense_tensor.h"
-#include "paddle/tcmpt/eigen/common.h"
+#include "paddle/tcmpt/kernels/common/eigen/common.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/operators/eigen/eigen_function.h"
diff --git a/paddle/tcmpt/cpu/CMakeLists.txt b/paddle/tcmpt/kernels/cpu/CMakeLists.txt
similarity index 89%
rename from paddle/tcmpt/cpu/CMakeLists.txt
rename to paddle/tcmpt/kernels/cpu/CMakeLists.txt
index cf3204bc5bcb0..b70c5f9ec81f0 100644
--- a/paddle/tcmpt/cpu/CMakeLists.txt
+++ b/paddle/tcmpt/kernels/cpu/CMakeLists.txt
@@ -1,5 +1,5 @@
 if(WIN32)
-    set(CURRENT_BINARY_DIR ${PADDLE_BINARY_DIR}/paddle/tcmpt/cpu)
+    set(CURRENT_BINARY_DIR ${PADDLE_BINARY_DIR}/paddle/tcmpt/kernels/cpu)
     kernel_instantiate(creation.cc)
     kernel_instantiate(math.cc)
     kernel_instantiate(linalg.cc)
diff --git a/paddle/tcmpt/cpu/creation.cc b/paddle/tcmpt/kernels/cpu/creation.cc
similarity index 92%
rename from paddle/tcmpt/cpu/creation.cc
rename to paddle/tcmpt/kernels/cpu/creation.cc
index 8e4399c41bf17..4871e11da2112 100644
--- a/paddle/tcmpt/cpu/creation.cc
+++ b/paddle/tcmpt/kernels/cpu/creation.cc
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/tcmpt/cpu/creation.h"
+#include "paddle/tcmpt/kernels/cpu/creation.h"
 
 #include "paddle/tcmpt/core/kernel_registry.h"
-#include "paddle/tcmpt/eigen/fill.h"
+#include "paddle/tcmpt/kernels/common/eigen/fill.h"
 
 namespace pt {
 
diff --git a/paddle/tcmpt/cpu/creation.h b/paddle/tcmpt/kernels/cpu/creation.h
similarity index 100%
rename from paddle/tcmpt/cpu/creation.h
rename to paddle/tcmpt/kernels/cpu/creation.h
diff --git a/paddle/tcmpt/cpu/linalg.cc b/paddle/tcmpt/kernels/cpu/linalg.cc
similarity index 86%
rename from paddle/tcmpt/cpu/linalg.cc
rename to paddle/tcmpt/kernels/cpu/linalg.cc
index 96c1a4e937fce..8b63219fdd2db 100644
--- a/paddle/tcmpt/cpu/linalg.cc
+++ b/paddle/tcmpt/kernels/cpu/linalg.cc
@@ -12,12 +12,13 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/tcmpt/cpu/linalg.h"
+#include "paddle/tcmpt/kernels/cpu/linalg.h"
 
 #include "paddle/tcmpt/core/kernel_registry.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/complex.h"
 
 namespace pt {
@@ -44,6 +45,14 @@ void Dot(const CPUContext& dev_ctx,
   }
 }
 
+template <typename T>
+void matmul(const CPUContext& dev_ctx,
+            const DenseTensor& x,
+            const DenseTensor& y,
+            bool transpose_x,
+            bool transpose_y,
+            DenseTensor* out) {}
+
 }  // namespace pt
 
 PT_REGISTER_MODULE(LinalgCPU);
diff --git a/paddle/tcmpt/cpu/linalg.h b/paddle/tcmpt/kernels/cpu/linalg.h
similarity index 82%
rename from paddle/tcmpt/cpu/linalg.h
rename to paddle/tcmpt/kernels/cpu/linalg.h
index c457943538761..6d9550b2882b2 100644
--- a/paddle/tcmpt/cpu/linalg.h
+++ b/paddle/tcmpt/kernels/cpu/linalg.h
@@ -29,4 +29,12 @@ void Dot(const CPUContext& dev_ctx,
          const DenseTensor& y,
          DenseTensor* out);
 
+template <typename T>
+void matmul(const CPUContext& dev_ctx,
+            const DenseTensor& x,
+            const DenseTensor& y,
+            bool transpose_x,
+            bool transpose_y,
+            DenseTensor* out);
+
 }  // namespace pt
diff --git a/paddle/tcmpt/cpu/manipulation.cc b/paddle/tcmpt/kernels/cpu/manipulation.cc
similarity index 96%
rename from paddle/tcmpt/cpu/manipulation.cc
rename to paddle/tcmpt/kernels/cpu/manipulation.cc
index d2964c5b533a9..91f1e941cd028 100644
--- a/paddle/tcmpt/cpu/manipulation.cc
+++ b/paddle/tcmpt/kernels/cpu/manipulation.cc
@@ -12,9 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/tcmpt/cpu/manipulation.h"
-#include "paddle/tcmpt/cpu/utils.h"
+#include "paddle/tcmpt/kernels/cpu/manipulation.h"
 #include "paddle/tcmpt/infershape/unary.h"
+#include "paddle/tcmpt/kernels/cpu/utils.h"
 
 namespace pt {
 
diff --git a/paddle/tcmpt/cpu/manipulation.h b/paddle/tcmpt/kernels/cpu/manipulation.h
similarity index 100%
rename from paddle/tcmpt/cpu/manipulation.h
rename to paddle/tcmpt/kernels/cpu/manipulation.h
diff --git a/paddle/tcmpt/cpu/math.cc b/paddle/tcmpt/kernels/cpu/math.cc
similarity index 93%
rename from paddle/tcmpt/cpu/math.cc
rename to paddle/tcmpt/kernels/cpu/math.cc
index 80dec2530f718..d304db0a9a34e 100644
--- a/paddle/tcmpt/cpu/math.cc
+++ b/paddle/tcmpt/kernels/cpu/math.cc
@@ -12,11 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/tcmpt/cpu/math.h"
+#include "paddle/tcmpt/kernels/cpu/math.h"
 
-#include "paddle/tcmpt/eigen/mean.h"
-#include "paddle/tcmpt/eigen/scale.h"
-#include "paddle/tcmpt/eigen/sign.h"
+#include "paddle/tcmpt/kernels/common/eigen/mean.h"
+#include "paddle/tcmpt/kernels/common/eigen/scale.h"
+#include "paddle/tcmpt/kernels/common/eigen/sign.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/eigen.h"
diff --git a/paddle/tcmpt/cpu/math.h b/paddle/tcmpt/kernels/cpu/math.h
similarity index 100%
rename from paddle/tcmpt/cpu/math.h
rename to paddle/tcmpt/kernels/cpu/math.h
diff --git a/paddle/tcmpt/cpu/utils.cc b/paddle/tcmpt/kernels/cpu/utils.cc
similarity index 97%
rename from paddle/tcmpt/cpu/utils.cc
rename to paddle/tcmpt/kernels/cpu/utils.cc
index 86b074e49b362..7550934d70be4 100644
--- a/paddle/tcmpt/cpu/utils.cc
+++ b/paddle/tcmpt/kernels/cpu/utils.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/tcmpt/cpu/utils.h"
+#include "paddle/tcmpt/kernels/cpu/utils.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/tcmpt/core/convert_utils.h"
 #include "paddle/tcmpt/core/dtype.h"
diff --git a/paddle/tcmpt/cpu/utils.h b/paddle/tcmpt/kernels/cpu/utils.h
similarity index 100%
rename from paddle/tcmpt/cpu/utils.h
rename to paddle/tcmpt/kernels/cpu/utils.h
diff --git a/paddle/tcmpt/cuda/CMakeLists.txt b/paddle/tcmpt/kernels/cuda/CMakeLists.txt
similarity index 94%
rename from paddle/tcmpt/cuda/CMakeLists.txt
rename to paddle/tcmpt/kernels/cuda/CMakeLists.txt
index 9e56e1a3be82a..e243bad09563b 100644
--- a/paddle/tcmpt/cuda/CMakeLists.txt
+++ b/paddle/tcmpt/kernels/cuda/CMakeLists.txt
@@ -1,5 +1,5 @@
 if(WIN32)
-    set(CURRENT_BINARY_DIR ${PADDLE_BINARY_DIR}/paddle/tcmpt/cuda)
+    set(CURRENT_BINARY_DIR ${PADDLE_BINARY_DIR}/paddle/tcmpt/kernels/cuda)
     kernel_instantiate(creation.cu)
     kernel_instantiate(math.cu)
     kernel_instantiate(linalg.cu)
diff --git a/paddle/tcmpt/cuda/creation.cu b/paddle/tcmpt/kernels/cuda/creation.cu
similarity index 92%
rename from paddle/tcmpt/cuda/creation.cu
rename to paddle/tcmpt/kernels/cuda/creation.cu
index cca9199b76cfd..7f082400eaaf7 100644
--- a/paddle/tcmpt/cuda/creation.cu
+++ b/paddle/tcmpt/kernels/cuda/creation.cu
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/tcmpt/cuda/creation.h"
+#include "paddle/tcmpt/kernels/cuda/creation.h"
 
 #include "paddle/tcmpt/core/kernel_registry.h"
-#include "paddle/tcmpt/eigen/fill.h"
+#include "paddle/tcmpt/kernels/common/eigen/fill.h"
 
 namespace pt {
 
diff --git a/paddle/tcmpt/cuda/creation.h b/paddle/tcmpt/kernels/cuda/creation.h
similarity index 100%
rename from paddle/tcmpt/cuda/creation.h
rename to paddle/tcmpt/kernels/cuda/creation.h
diff --git a/paddle/tcmpt/cuda/linalg.cu b/paddle/tcmpt/kernels/cuda/linalg.cu
similarity index 93%
rename from paddle/tcmpt/cuda/linalg.cu
rename to paddle/tcmpt/kernels/cuda/linalg.cu
index 118d3326e5fb5..25d1df5cbc65a 100644
--- a/paddle/tcmpt/cuda/linalg.cu
+++ b/paddle/tcmpt/kernels/cuda/linalg.cu
@@ -12,10 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/tcmpt/cuda/linalg.h"
+#include "paddle/tcmpt/kernels/cuda/linalg.h"
 
 #include "paddle/tcmpt/core/kernel_registry.h"
-#include "paddle/tcmpt/eigen/dot.h"
+#include "paddle/tcmpt/kernels/common/eigen/dot.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/complex.h"
diff --git a/paddle/tcmpt/cuda/linalg.h b/paddle/tcmpt/kernels/cuda/linalg.h
similarity index 100%
rename from paddle/tcmpt/cuda/linalg.h
rename to paddle/tcmpt/kernels/cuda/linalg.h
diff --git a/paddle/tcmpt/cuda/manipulation.cu b/paddle/tcmpt/kernels/cuda/manipulation.cu
similarity index 96%
rename from paddle/tcmpt/cuda/manipulation.cu
rename to paddle/tcmpt/kernels/cuda/manipulation.cu
index 91f69b2fe33d7..bb4a2cc9a677b 100644
--- a/paddle/tcmpt/cuda/manipulation.cu
+++ b/paddle/tcmpt/kernels/cuda/manipulation.cu
@@ -12,9 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/tcmpt/cuda/manipulation.h"
-#include "paddle/tcmpt/cuda/utils.h"
 #include "paddle/tcmpt/infershape/unary.h"
+#include "paddle/tcmpt/kernels/cuda/manipulation.h"
+#include "paddle/tcmpt/kernels/cuda/utils.h"
 
 namespace pt {
 
diff --git a/paddle/tcmpt/cuda/manipulation.h b/paddle/tcmpt/kernels/cuda/manipulation.h
similarity index 100%
rename from paddle/tcmpt/cuda/manipulation.h
rename to paddle/tcmpt/kernels/cuda/manipulation.h
diff --git a/paddle/tcmpt/cuda/math.cu b/paddle/tcmpt/kernels/cuda/math.cu
similarity index 95%
rename from paddle/tcmpt/cuda/math.cu
rename to paddle/tcmpt/kernels/cuda/math.cu
index 293f0cf8bfc91..743615d70f996 100644
--- a/paddle/tcmpt/cuda/math.cu
+++ b/paddle/tcmpt/kernels/cuda/math.cu
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/tcmpt/cuda/math.h"
+#include "paddle/tcmpt/kernels/cuda/math.h"
 
-#include "paddle/tcmpt/eigen/mean.h"
-#include "paddle/tcmpt/eigen/scale.h"
-#include "paddle/tcmpt/eigen/sign.h"
+#include "paddle/tcmpt/kernels/common/eigen/mean.h"
+#include "paddle/tcmpt/kernels/common/eigen/scale.h"
+#include "paddle/tcmpt/kernels/common/eigen/sign.h"
 
 #ifdef __NVCC__
 #include "cub/cub.cuh"
diff --git a/paddle/tcmpt/cuda/math.h b/paddle/tcmpt/kernels/cuda/math.h
similarity index 100%
rename from paddle/tcmpt/cuda/math.h
rename to paddle/tcmpt/kernels/cuda/math.h
diff --git a/paddle/tcmpt/cuda/utils.cu b/paddle/tcmpt/kernels/cuda/utils.cu
similarity index 99%
rename from paddle/tcmpt/cuda/utils.cu
rename to paddle/tcmpt/kernels/cuda/utils.cu
index 40b93f3534c1a..b8483d17cfc24 100644
--- a/paddle/tcmpt/cuda/utils.cu
+++ b/paddle/tcmpt/kernels/cuda/utils.cu
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/tcmpt/core/convert_utils.h"
 #include "paddle/tcmpt/core/dtype.h"
 #include "paddle/tcmpt/core/kernel_registry.h"
-#include "paddle/tcmpt/cuda/utils.h"
+#include "paddle/tcmpt/kernels/cuda/utils.h"
 
 namespace pt {
 
diff --git a/paddle/tcmpt/cuda/utils.h b/paddle/tcmpt/kernels/cuda/utils.h
similarity index 100%
rename from paddle/tcmpt/cuda/utils.h
rename to paddle/tcmpt/kernels/cuda/utils.h
diff --git a/paddle/tcmpt/mkldnn/CMakeLists.txt b/paddle/tcmpt/kernels/mkldnn/CMakeLists.txt
similarity index 100%
rename from paddle/tcmpt/mkldnn/CMakeLists.txt
rename to paddle/tcmpt/kernels/mkldnn/CMakeLists.txt
diff --git a/paddle/tcmpt/npu/CMakeLists.txt b/paddle/tcmpt/kernels/npu/CMakeLists.txt
similarity index 100%
rename from paddle/tcmpt/npu/CMakeLists.txt
rename to paddle/tcmpt/kernels/npu/CMakeLists.txt
diff --git a/paddle/tcmpt/xpu/CMakeLists.txt b/paddle/tcmpt/kernels/xpu/CMakeLists.txt
similarity index 100%
rename from paddle/tcmpt/xpu/CMakeLists.txt
rename to paddle/tcmpt/kernels/xpu/CMakeLists.txt
diff --git a/paddle/tcmpt/tests/test_copy_api.cc b/paddle/tcmpt/tests/test_copy_api.cc
index 7f1158912ebfb..2d70e37d051d9 100644
--- a/paddle/tcmpt/tests/test_copy_api.cc
+++ b/paddle/tcmpt/tests/test_copy_api.cc
@@ -16,7 +16,7 @@ limitations under the License. */
 #include <memory>
 
 #include "paddle/tcmpt/core/kernel_registry.h"
-#include "paddle/tcmpt/cpu/utils.h"
+#include "paddle/tcmpt/kernels/cpu/utils.h"
 
 #include "paddle/tcmpt/core/dense_tensor.h"
 

From ddc7de85e15c6ad0e3309c6a77b1ee6c4b9c0ba8 Mon Sep 17 00:00:00 2001
From: YuanRisheng <yuanrisheng@baidu.com>
Date: Mon, 18 Oct 2021 11:53:39 +0800
Subject: [PATCH 085/125] Perfect unitests (#16)

* perfect unittest

* update license
---
 paddle/fluid/framework/CMakeLists.txt      |  3 +-
 paddle/fluid/framework/tcmpt_utils.cc      | 14 -----
 paddle/fluid/framework/tcmpt_utils.h       |  6 ---
 paddle/fluid/framework/tcmpt_utils_test.cc | 62 ++++++++++++++++++++++
 4 files changed, 64 insertions(+), 21 deletions(-)
 create mode 100644 paddle/fluid/framework/tcmpt_utils_test.cc

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index c1285f5d3eb93..27f83a266ec9c 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -390,7 +390,7 @@ cc_library(save_load_util SRCS save_load_util.cc DEPS tensor scope layer)
 cc_test(save_load_util_test SRCS save_load_util_test.cc DEPS save_load_util tensor scope layer)
 cc_library(generator SRCS generator.cc DEPS enforce place)
 
-cc_library(tcmpt_utils SRCS tcmpt_utils.cc DEPS lod_tensor selected_rows place tcmpt)
+cc_library(tcmpt_utils SRCS tcmpt_utils.cc DEPS lod_tensor selected_rows place tcmpt var_type_traits)
 
 # Get the current working branch
 execute_process(
@@ -454,3 +454,4 @@ if(WITH_TESTING AND TEST selected_rows_test)
 endif()
 
 cc_test(scope_guard_test SRCS scope_guard_test.cc)
+cc_test(tcmpt_utils_test SRCS tcmpt_utils_test.cc DEPS tcmpt_utils)
diff --git a/paddle/fluid/framework/tcmpt_utils.cc b/paddle/fluid/framework/tcmpt_utils.cc
index 71ef2d3450ae9..e065199d62b7a 100644
--- a/paddle/fluid/framework/tcmpt_utils.cc
+++ b/paddle/fluid/framework/tcmpt_utils.cc
@@ -74,20 +74,6 @@ std::shared_ptr<pt::DenseTensor> MakeTensorImpl<pt::DenseTensor>(
       pt::TransToPtLayout(tensor.layout()));
 }
 
-template <>
-void ShareTensorImpl<pt::DenseTensor>(pt::DenseTensor* tensor_impl,
-                                      LoDTensor* out) {
-  out->ResetHolderWithType(tensor_impl->allocation(),
-                           pt::TransToProtoVarType(tensor_impl->type()));
-}
-
-template <>
-void ShareTensorImpl<pt::DenseTensor>(pt::DenseTensor* tensor_impl,
-                                      Tensor* out) {
-  out->ResetHolderWithType(tensor_impl->allocation(),
-                           pt::TransToProtoVarType(tensor_impl->type()));
-}
-
 std::shared_ptr<pt::TensorInterface> InputVariableToPtTensor(
     const framework::Variable& variable, const pt::TensorArgDef& arg_def) {
   auto expected_place = pt::TransToFluidPlace(arg_def.backend);
diff --git a/paddle/fluid/framework/tcmpt_utils.h b/paddle/fluid/framework/tcmpt_utils.h
index 0af8cd30bd34d..d41b05a57d9b8 100644
--- a/paddle/fluid/framework/tcmpt_utils.h
+++ b/paddle/fluid/framework/tcmpt_utils.h
@@ -38,12 +38,6 @@ std::shared_ptr<PtTensorImplT> MakeTensorImpl(const Tensor& tensor,
                                               const platform::Place& place,
                                               proto::VarType::Type type);
 
-template <typename PtTensorImplT>
-void ShareTensorImpl(PtTensorImplT* tensor_impl, LoDTensor* out);
-
-template <typename PtTensorImplT>
-void ShareTensorImpl(PtTensorImplT* tensor_impl, Tensor* out);
-
 std::shared_ptr<pt::TensorInterface> InputVariableToPtTensor(
     const framework::Variable& variable, const pt::TensorArgDef& arg_def);
 std::shared_ptr<pt::TensorInterface> OutputVariableToPtTensor(
diff --git a/paddle/fluid/framework/tcmpt_utils_test.cc b/paddle/fluid/framework/tcmpt_utils_test.cc
new file mode 100644
index 0000000000000..c5af18f6f65aa
--- /dev/null
+++ b/paddle/fluid/framework/tcmpt_utils_test.cc
@@ -0,0 +1,62 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/tcmpt_utils.h"
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/selected_rows.h"
+#include "paddle/fluid/framework/variable.h"
+
+namespace paddle {
+namespace framework {
+
+TEST(TcmptUtils, MakeTensor) {
+  // 1. create tensor
+  LoDTensor x;
+  Tensor x2;
+  x.Resize({2});
+  x.mutable_data<float>(platform::CPUPlace());
+  x.data<float>()[0] = 0.2;
+  x.data<float>()[1] = 0.5;
+
+  // 2. test API
+  auto dense_x = MakeTensorImpl<pt::DenseTensor>(x, x.place(), x.type());
+
+  // 3. check result
+  std::vector<float> expect_value = {0.2, 0.5};
+  ASSERT_EQ(dense_x->data<float>()[0], expect_value[0]);
+  ASSERT_EQ(dense_x->data<float>()[1], expect_value[1]);
+  ASSERT_EQ(dense_x->backend(), pt::Backend::kCPU);
+  ASSERT_EQ(dense_x->type(), pt::DataType::kFLOAT32);
+}
+
+TEST(TcmptUtils, VarToPtTensor) {
+  // 1. create Variable
+  Variable v;
+  auto selected_rows = v.GetMutable<SelectedRows>();
+  Tensor* value = selected_rows->mutable_value();
+  auto* data =
+      value->mutable_data<int>(make_ddim({1, 1}), paddle::platform::CPUPlace());
+  data[0] = 123;
+  auto tensor_def = pt::TensorArgDef(pt::Backend::kCUDA, pt::DataLayout::kNCHW,
+                                     pt::DataType::kINT32);
+  // 2. test API
+  auto tensor_x = InputVariableToPtTensor(v, tensor_def);
+  // 3. check result
+  ASSERT_EQ(tensor_x->backend(), pt::Backend::kCUDA);
+  ASSERT_EQ(tensor_x->type(), pt::DataType::kINT32);
+}
+
+}  // namespace framework
+}  // namespace paddle

From 37791f7cb8378f72200762b82266c56153c9d866 Mon Sep 17 00:00:00 2001
From: chentianyu03 <ctychentianyu@gmail.com>
Date: Mon, 18 Oct 2021 17:12:09 +0800
Subject: [PATCH 086/125] replace with flat_hash_map, small_vector (#19)

* fix small_vector build error on windows platform

* replace with flat_hash_map, small_vector

* remove todo
---
 paddle/tcmpt/core/kernel_factory.h | 35 ++++++++++++++++++------------
 paddle/utils/small_vector.h        | 12 +++++-----
 2 files changed, 28 insertions(+), 19 deletions(-)

diff --git a/paddle/tcmpt/core/kernel_factory.h b/paddle/tcmpt/core/kernel_factory.h
index 180f0ce2c6b87..5978264c9ef26 100644
--- a/paddle/tcmpt/core/kernel_factory.h
+++ b/paddle/tcmpt/core/kernel_factory.h
@@ -26,6 +26,8 @@
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/utils/flat_hash_map.h"
+#include "paddle/utils/small_vector.h"
 
 namespace pt {
 
@@ -209,25 +211,30 @@ class KernelArgsDef {
     attribute_defs_.emplace_back(AttributeArgDef(type_index));
   }
 
-  const std::vector<TensorArgDef>& input_defs() const { return input_defs_; }
+  const paddle::SmallVector<TensorArgDef>& input_defs() const {
+    return input_defs_;
+  }
 
-  const std::vector<TensorArgDef>& output_defs() const { return output_defs_; }
+  const paddle::SmallVector<TensorArgDef>& output_defs() const {
+    return output_defs_;
+  }
 
-  const std::vector<AttributeArgDef>& attribute_defs() const {
+  const paddle::SmallVector<AttributeArgDef>& attribute_defs() const {
     return attribute_defs_;
   }
 
-  std::vector<TensorArgDef>& input_defs() { return input_defs_; }
+  paddle::SmallVector<TensorArgDef>& input_defs() { return input_defs_; }
 
-  std::vector<TensorArgDef>& output_defs() { return output_defs_; }
+  paddle::SmallVector<TensorArgDef>& output_defs() { return output_defs_; }
 
-  std::vector<AttributeArgDef>& attribute_defs() { return attribute_defs_; }
+  paddle::SmallVector<AttributeArgDef>& attribute_defs() {
+    return attribute_defs_;
+  }
 
  private:
-  // TODO(chenweihang): replaced by paddle::small_vector
-  std::vector<TensorArgDef> input_defs_{{}};
-  std::vector<TensorArgDef> output_defs_{{}};
-  std::vector<AttributeArgDef> attribute_defs_{{}};
+  paddle::SmallVector<TensorArgDef> input_defs_{{}};
+  paddle::SmallVector<TensorArgDef> output_defs_{{}};
+  paddle::SmallVector<AttributeArgDef> attribute_defs_{{}};
 };
 
 class Kernel {
@@ -263,10 +270,10 @@ class Kernel {
 class KernelFactory {
  public:
   // replaced by paddle::flat_hash_map later
-  using KernelMap =
-      std::unordered_map<KernelName,
-                         std::unordered_map<KernelKey, Kernel, KernelKey::Hash>,
-                         KernelName::Hash>;
+  using KernelMap = paddle::flat_hash_map<
+      KernelName,
+      paddle::flat_hash_map<KernelKey, Kernel, KernelKey::Hash>,
+      KernelName::Hash>;
 
   static KernelFactory& Instance();
 
diff --git a/paddle/utils/small_vector.h b/paddle/utils/small_vector.h
index f51a3b623ce3b..e9e7996babcf7 100644
--- a/paddle/utils/small_vector.h
+++ b/paddle/utils/small_vector.h
@@ -3,6 +3,8 @@
 // 1. remove  macro
 // 2. remove LLVM_LIKELY and LLVM_UNLIKELY
 // 3. add at(index) method for small vector
+// 4. wrap the call to max and min with parenthesis to prevent the macro
+// expansion to fix the build error on windows platform
 
 //===- llvm/ADT/SmallVector.h - 'Normally small' vectors --------*- C++ -*-===//
 //
@@ -90,7 +92,7 @@ class SmallVectorBase {
 
   /// The maximum value of the Size_T used.
   static constexpr size_t SizeTypeMax() {
-    return std::numeric_limits<Size_T>::max();
+    return (std::numeric_limits<Size_T>::max)();
   }
 
   SmallVectorBase() = delete;
@@ -309,7 +311,7 @@ class SmallVectorTemplateCommon
 
   size_type size_in_bytes() const { return size() * sizeof(T); }
   size_type max_size() const {
-    return std::min(this->SizeTypeMax(), size_type(-1) / sizeof(T));
+    return (std::min)(this->SizeTypeMax(), size_type(-1) / sizeof(T));
   }
 
   size_t capacity_in_bytes() const { return capacity() * sizeof(T); }
@@ -727,7 +729,7 @@ class SmallVectorImpl : public SmallVectorTemplateBase<T> {
     }
 
     // Assign over existing elements.
-    std::fill_n(this->begin(), std::min(NumElts, this->size()), Elt);
+    std::fill_n(this->begin(), (std::min)(NumElts, this->size()), Elt);
     if (NumElts > this->size())
       std::uninitialized_fill_n(this->end(), NumElts - this->size(), Elt);
     else if (NumElts < this->size())
@@ -1393,7 +1395,7 @@ static void report_at_maximum_capacity(size_t MaxSize) {
 // Note: Moving this function into the header may cause performance regression.
 template <class Size_T>
 static size_t getNewCapacity(size_t MinSize, size_t TSize, size_t OldCapacity) {
-  constexpr size_t MaxSize = std::numeric_limits<Size_T>::max();
+  constexpr size_t MaxSize = (std::numeric_limits<Size_T>::max)();
 
   // Ensure we can fit the new capacity.
   // This is only going to be applicable when the capacity is 32 bit.
@@ -1408,7 +1410,7 @@ static size_t getNewCapacity(size_t MinSize, size_t TSize, size_t OldCapacity) {
   // In theory 2*capacity can overflow if the capacity is 64 bit, but the
   // original capacity would never be large enough for this to be a problem.
   size_t NewCapacity = 2 * OldCapacity + 1;  // Always grow.
-  return std::min(std::max(NewCapacity, MinSize), MaxSize);
+  return (std::min)((std::max)(NewCapacity, MinSize), MaxSize);
 }
 
 // Note: Moving this function into the header may cause performance regression.

From 28a637415e288f71f23a4006e99767623e0294b8 Mon Sep 17 00:00:00 2001
From: YuanRisheng <yuanrisheng@baidu.com>
Date: Mon, 18 Oct 2021 20:23:30 +0800
Subject: [PATCH 087/125] Perfect unitests (#20)

* perfect unittest

* update license

* fix bug when run tcmpt_utils_test
---
 paddle/fluid/framework/tcmpt_utils_test.cc | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/framework/tcmpt_utils_test.cc b/paddle/fluid/framework/tcmpt_utils_test.cc
index c5af18f6f65aa..f1966789c1dde 100644
--- a/paddle/fluid/framework/tcmpt_utils_test.cc
+++ b/paddle/fluid/framework/tcmpt_utils_test.cc
@@ -49,12 +49,17 @@ TEST(TcmptUtils, VarToPtTensor) {
   auto* data =
       value->mutable_data<int>(make_ddim({1, 1}), paddle::platform::CPUPlace());
   data[0] = 123;
-  auto tensor_def = pt::TensorArgDef(pt::Backend::kCUDA, pt::DataLayout::kNCHW,
+  pt::Backend expect_backend = pt::Backend::kCPU;
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  expect_backend = pt::Backend::kCUDA;
+#endif
+  auto tensor_def = pt::TensorArgDef(expect_backend, pt::DataLayout::kNCHW,
                                      pt::DataType::kINT32);
   // 2. test API
   auto tensor_x = InputVariableToPtTensor(v, tensor_def);
   // 3. check result
-  ASSERT_EQ(tensor_x->backend(), pt::Backend::kCUDA);
+  ASSERT_EQ(tensor_x->backend(), expect_backend);
   ASSERT_EQ(tensor_x->type(), pt::DataType::kINT32);
 }
 

From e3e2b5071e24ee894fd12d11e3c41e3035ea7c69 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Mon, 18 Oct 2021 14:33:15 +0000
Subject: [PATCH 088/125] refactor execution adapting impl

---
 paddle/fluid/framework/operator.cc            | 319 +++++-------------
 paddle/fluid/framework/operator.h             |  36 +-
 paddle/fluid/framework/tcmpt_utils.cc         | 117 ++++++-
 paddle/fluid/framework/tcmpt_utils.h          |  84 ++++-
 paddle/fluid/framework/type_defs.h            |  10 +
 .../imperative/kernel_args_names_maker.h      | 165 ---------
 paddle/fluid/imperative/prepared_operator.cc  | 229 +++++--------
 paddle/fluid/imperative/prepared_operator.h   |  14 +-
 paddle/fluid/imperative/type_defs.h           |  11 -
 paddle/fluid/operators/fill_any_like_op.cc    |   9 +
 paddle/fluid/operators/scale_op.cc            |  18 +
 paddle/fluid/platform/flags.cc                |   6 +-
 paddle/tcmpt/core/convert_utils.cc            |   2 +-
 paddle/tcmpt/core/convert_utils.h             |   2 +-
 paddle/tcmpt/core/kernel_factory.cc           |   5 +
 paddle/tcmpt/core/kernel_registry.h           |  11 +-
 paddle/tcmpt/cpu/creation.cc                  |   4 +-
 paddle/tcmpt/cpu/linalg.cc                    |   2 +-
 paddle/tcmpt/cpu/manipulation.cc              |   4 +-
 paddle/tcmpt/cpu/math.cc                      |   8 +-
 paddle/tcmpt/cuda/creation.cu                 |   2 +-
 paddle/tcmpt/cuda/linalg.cu                   |   2 +-
 paddle/tcmpt/cuda/manipulation.cu             |   4 +-
 paddle/tcmpt/cuda/math.cu                     |   8 +-
 24 files changed, 476 insertions(+), 596 deletions(-)
 delete mode 100644 paddle/fluid/imperative/kernel_args_names_maker.h

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 32fc10f38bd48..7cadf53cc5299 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -28,7 +28,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/transfer_scope_cache.h"
 #include "paddle/fluid/framework/unused_var_check.h"
 #include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/imperative/kernel_args_names_maker.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -51,7 +50,7 @@ DECLARE_bool(check_nan_inf);
 DECLARE_bool(enable_unused_var_check);
 PADDLE_DEFINE_EXPORTED_int32(inner_op_parallelism, 0,
                              "number of threads for inner op");
-DECLARE_bool(use_pt_kernel);
+DECLARE_bool(run_pt_kernel);
 
 namespace paddle {
 namespace framework {
@@ -1077,22 +1076,6 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
   this->InferShape(&infer_shape_ctx);
 }
 
-OpKernelType TransPtKernelKeyToOpKernelType(const pt::KernelKey& kernel_key) {
-  proto::VarType::Type data_type = pt::TransToProtoVarType(kernel_key.dtype());
-  platform::Place place = pt::TransToFluidPlace(kernel_key.backend());
-  DataLayout data_layout = pt::TransToFluidDataLayout(kernel_key.layout());
-  LibraryType library_type = LibraryType::kPlain;
-  if (kernel_key.backend() == pt::Backend::kMKLDNN) {
-    library_type = LibraryType::kMKLDNN;
-  } else if (kernel_key.backend() == pt::Backend::kCUDNN) {
-    library_type = LibraryType::kCUDNN;
-  } else {
-    // do nothing
-  }
-  // TODO(chenweihang): the customized_type_value is lost
-  return OpKernelType(data_type, place, data_layout, library_type);
-}
-
 static std::string RuntimeContextDebugString(const RuntimeContext& ctx) {
   std::stringstream ss;
   ss << "RuntimeContext(Inputs: ";
@@ -1149,22 +1132,23 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   }
 #endif
 
+  auto exe_ctx = ExecutionContext(*this, scope, *dev_ctx, *runtime_ctx);
+
   // TODO(chenweihang): Now we are still reusing a lot of the original fluid
   // implementation, this is a gradual replacement process
   // TODO(chenweihang): in the first phase of project, we only support CPU, CUDA
   // and RCOM backend, the XPU, NPU and MKLDNN will be supported in the second
   // phase
-
-  if (FLAGS_use_pt_kernel &&
+  if (FLAGS_run_pt_kernel &&
       pt::KernelFactory::Instance().ContainsKernel(type_.c_str())) {
-    if (pt_kernel_key_.get() == nullptr || pt_kernel_.get() == nullptr) {
-      ChoosePtKernel(*runtime_ctx, *dev_ctx);
+    if (pt_kernel_signature_.get() == nullptr || pt_kernel_.get() == nullptr) {
+      ChoosePtKernel(exe_ctx);
     }
     run_pt_kernel_ = pt_kernel_->IsValid();
   }
   if (!run_pt_kernel_) {
     if (kernel_type_.get() == nullptr || kernel_func_.get() == nullptr) {
-      ChooseKernel(*runtime_ctx, scope, place);
+      ChooseKernel(exe_ctx);
     }
   }
 
@@ -1175,10 +1159,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
     platform::RecordEvent record_event("prepare_data",
                                        platform::EventRole::kInnerOp);
     if (need_prepare_data_) {
-      if (run_pt_kernel_) {
-        kernel_type_.reset(
-            new OpKernelType(TransPtKernelKeyToOpKernelType(*pt_kernel_key_)));
-      }
       transfer_scope = PrepareData(scope, *kernel_type_,
                                    &transfered_inplace_vars, runtime_ctx);
     }
@@ -1208,8 +1188,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
     platform::RecordEvent record_event("compute",
                                        platform::EventRole::kInnerOp);
     if (run_pt_kernel_) {
-      // TODO(chenweihang): here will intrduce copy
-      auto op_kernel_ctx = ConstructPtKernelContext(*runtime_ctx, *dev_ctx);
+      auto op_kernel_ctx = BuildPtKernelContext(*runtime_ctx, *dev_ctx);
       (*pt_kernel_)(&op_kernel_ctx);
     } else {
       (*kernel_func_)(
@@ -1262,104 +1241,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   }
 }
 
-// TODO(chenweihang): now only check single var input
-static bool IsValidVar(const std::string& name,
-                       const VariableValueMap& inputs) {
-  auto it = inputs.find(name);
-  if (it == inputs.end()) {
-    return false;
-  }
-  auto* var = it->second.empty() ? nullptr : it->second[0];
-  return var != nullptr;
-}
-
-// TODO(chenweihang): enhance rules, not all dispensable inputs
-// are host tensor, now only for scale kernel verify
-static bool ContainHostTensor(const proto::OpProto& op_proto,
-                              const VariableValueMap& inputs) {
-  for (int i = 0; i < op_proto.inputs_size(); ++i) {
-    auto in = op_proto.inputs()[i];
-    if (in.has_dispensable() && in.dispensable()) {
-      return IsValidVar(in.name(), inputs);
-    }
-  }
-  return false;
-}
-
-// TODO(yuanrisheng): enhance rules, for get kernel that contains Intermediate
-// Tensor
-static bool ContainMidOutputTensor(const proto::OpProto& op_proto,
-                                   const VariableValueMap& outputs) {
-  for (int i = 0; i < op_proto.outputs_size(); ++i) {
-    auto output = op_proto.outputs()[i];
-    if (output.has_intermediate() && output.intermediate()) {
-      return IsValidVar(output.name(), outputs);
-    }
-  }
-  return false;
-}
-
-static pt::KernelName ConstructPtKernelName(const std::string& op_type,
-                                            const proto::OpProto& op_proto,
-                                            const VariableValueMap& inputs,
-                                            const VariableValueMap& outputs) {
-  std::string overload_name;
-  // TODO(chenweihang): adapt SelectedRows by xiaowei's design
-  if (ContainHostTensor(op_proto, inputs)) {
-    if (overload_name != "") {
-      overload_name += ".";
-    }
-    overload_name += pt::kContainHostTensorSuffix;
-  }
-  if (ContainMidOutputTensor(op_proto, outputs)) {
-    if (overload_name != "") {
-      overload_name += ".";
-    }
-    overload_name += pt::kContainMidOutputTensorSuffix;
-  }
-  return pt::KernelName(op_type, overload_name);
-}
-
-void OperatorWithKernel::ChoosePtKernel(
-    const RuntimeContext& ctx, const platform::DeviceContext& dev_ctx) const {
-  // 1. construct operation name
-  // TODO(chenweihang): add rules for construct op name
-  auto kernel_name =
-      ConstructPtKernelName(Type(), *(Info().proto_), ctx.inputs, ctx.outputs);
-
-  // 2. construct op kernel key
-  pt_kernel_key_.reset(new pt::KernelKey(
-      ConstructPtKernelKey(ctx.inputs, Attrs(), dev_ctx.GetPlace())));
-
-  // 3. selecte op kernel
-  pt_kernel_.reset(new pt::Kernel(pt::KernelFactory::Instance().SelectKernel(
-      kernel_name, *pt_kernel_key_)));
-
-  // for debug
-  VLOG(1) << "ChoosePtKernel - kernel name: " << kernel_name
-          << " | kernel key: " << *pt_kernel_key_
-          << " | kernel: " << *pt_kernel_;
-}
-
-void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
-                                      const Scope& scope,
-                                      const platform::Place& place) const {
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  auto* dev_ctx = pool.Get(place);
-
-  // check if op[type] has kernel registered.
-  auto& all_op_kernels = AllOpKernels();
-  auto kernels_iter = all_op_kernels.find(type_);
-  PADDLE_ENFORCE_NE(
-      kernels_iter, all_op_kernels.end(),
-      platform::errors::Unavailable(
-          "There are no kernels which are registered in the %s operator.",
-          type_));
-
-  OpKernelMap& kernels = kernels_iter->second;
+OpKernelType OperatorWithKernel::InnerGetExpectedKernelType(
+    const ExecutionContext& ctx) const {
+  auto& dev_ctx = ctx.device_context();
 
-  auto expected_kernel_key = this->GetExpectedKernelType(
-      ExecutionContext(*this, scope, *dev_ctx, ctx));
+  auto expected_kernel_key = this->GetExpectedKernelType(ctx);
   if (HasAttr("op_device")) {
     if (Attr<std::string>("op_device") == "cpu") {
       expected_kernel_key.place_ = platform::CPUPlace();
@@ -1376,9 +1262,9 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
       // when the Op that only has CPUKernel is assigned to GPU, the CPUKernel
       // will be executed and a warning will be given at the same time.
       if (SupportGPU()) {
-        expected_kernel_key.place_ = dev_ctx->GetPlace();
+        expected_kernel_key.place_ = dev_ctx.GetPlace();
       } else if (SupportNPU()) {
-        expected_kernel_key.place_ = dev_ctx->GetPlace();
+        expected_kernel_key.place_ = dev_ctx.GetPlace();
       } else {
         expected_kernel_key.place_ = platform::CPUPlace();
         LOG_FIRST_N(WARNING, 1)
@@ -1389,6 +1275,45 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
   }
   VLOG(3) << "op type:" << type_
           << ", expected_kernel_key:" << expected_kernel_key;
+  return expected_kernel_key;
+}
+
+void OperatorWithKernel::ChoosePtKernel(const ExecutionContext& ctx) const {
+  pt_kernel_signature_.reset(
+      new KernelSignature(this->GetExpectedPtKernelArgs(ctx)));
+
+  VLOG(1) << KernelSignatureToString(*pt_kernel_signature_.get());
+
+  kernel_type_.reset(new OpKernelType(InnerGetExpectedKernelType(ctx)));
+
+  auto pt_kernel_name = pt::KernelName(pt_kernel_signature_->first);
+  auto pt_kernel_key = TransOpKernelTypeToPtKernelKey(*kernel_type_.get());
+  pt_kernel_.reset(new pt::Kernel(pt::KernelFactory::Instance().SelectKernel(
+      pt_kernel_name, pt_kernel_key)));
+
+  if (pt_kernel_->IsValid()) {
+    VLOG(1) << "Static mode ChoosePtKernel - kernel name: " << pt_kernel_name
+            << " | kernel key: " << pt_kernel_key
+            << " | kernel: " << *pt_kernel_;
+  } else {
+    VLOG(1) << "Static mode ChoosePtKernel - kernel `" << pt_kernel_name
+            << "` not found.";
+  }
+}
+
+void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
+  // check if op[type] has kernel registered.
+  auto& all_op_kernels = AllOpKernels();
+  auto kernels_iter = all_op_kernels.find(type_);
+  PADDLE_ENFORCE_NE(
+      kernels_iter, all_op_kernels.end(),
+      platform::errors::Unavailable(
+          "There are no kernels which are registered in the %s operator.",
+          type_));
+
+  OpKernelMap& kernels = kernels_iter->second;
+
+  auto expected_kernel_key = InnerGetExpectedKernelType(ctx);
 
   auto kernel_iter = kernels.find(expected_kernel_key);
 #ifdef PADDLE_WITH_MKLDNN
@@ -1844,60 +1769,23 @@ OpKernelType OperatorWithKernel::GetKernelTypeForVar(
                       tensor.layout());
 }
 
-pt::KernelKey OperatorWithKernel::ConstructPtKernelKey(
-    const VariableValueMap& inputs, const AttributeMap& attrs,
-    const platform::Place& ctx_place) const {
-  // 1. get backend based place and attrs
-  auto attr_reader = AttrReader(attrs);
-  pt::Backend backend = pt::TransToPtBackend(ctx_place);
-  if (attrs.count("use_mkldnn") != 0 &&
-      attr_reader.Get<bool>("use_mkldnn") == true) {
-    backend = pt::Backend::kMKLDNN;
-  } else if (attrs.count("use_cudnn") != 0 &&
-             attr_reader.Get<bool>("use_cudnn") == true) {
-    backend = pt::Backend::kCUDNN;
+KernelSignature OperatorWithKernel::GetExpectedPtKernelArgs(
+    const ExecutionContext& ctx) const {
+  if (KernelSignatureMap::Instance().Has(Type())) {
+    return *(KernelSignatureMap::Instance().GetNullable(Type()));
   } else {
-    // do nothing
+    KernelArgsNameMakerByOpProto maker(Info().proto_);
+    auto signature = maker.GetKernelSignature();
+    KernelSignatureMap::Instance().Insert(Type(), signature);
+    return signature;
   }
-  // TODO(chenweihang): add more rules
-  // if (HasAttr("op_device"))
-
-  // 2. get layout
-  // default layout same as tensor default layout, need futher check
-  pt::DataLayout layout = pt::DataLayout::kNCHW;
-  if (backend == pt::Backend::kMKLDNN) {
-    layout = pt::DataLayout::kMKLDNN;
-  }
-
-  // 3. parse data_type form inputs
-  proto::VarType::Type dafault_data_type =
-      static_cast<proto::VarType::Type>(-1);
-  proto::VarType::Type data_type = dafault_data_type;
-  for (auto& var_pair : inputs) {
-    ParseInputDataType(var_pair.second, var_pair.first, &data_type);
-  }
-  PADDLE_ENFORCE_NE(
-      data_type, dafault_data_type,
-      platform::errors::NotFound(
-          "DataType should be indicated by input Variable at %s.", Type()));
-  pt::DataType dtype = pt::TransToPtDataType(data_type);
-
-  // TODO(chenweihang): polish special dtype rules
-  if (attrs.count("dtype") != 0 &&
-      attr_reader.Get<int>("dtype") != static_cast<int>(data_type)) {
-    dtype = pt::TransToPtDataType(static_cast<framework::proto::VarType::Type>(
-        attr_reader.Get<int>("dtype")));
-  }
-
-  // 4. build pt KernelKey
-  return pt::KernelKey(backend, layout, dtype);
 }
 
-pt::KernelContext OperatorWithKernel::ConstructPtKernelContext(
+pt::KernelContext OperatorWithKernel::BuildPtKernelContext(
     const RuntimeContext& ctx, const platform::DeviceContext& dev_ctx) const {
   VLOG(1) << RuntimeContextDebugString(ctx);
 
-  // TODO(chenweihang): now only work for very simple case (sign op),
+  // TODO(chenweihang): now only work for very simple case,
   // many cases need to be deal with later:
   // 1. the input and output are not tensor
   // 2. the dispensbale, duplicable input and output
@@ -1905,42 +1793,36 @@ pt::KernelContext OperatorWithKernel::ConstructPtKernelContext(
   // 4. use pt Tensor directly
   // 5. kernel input is not DenseTensor
   pt::KernelContext op_kernel_ctx(dev_ctx);
-  auto input_defs = pt_kernel_->args_def().input_defs();
-  auto output_defs = pt_kernel_->args_def().output_defs();
-  auto attr_defs = pt_kernel_->args_def().attribute_defs();
-
-  // TODO(chenweihang): use ordered_map for VariableNameMap and VariableValueMap
-  // If we the VariableValueMap are ordered, we can get tensor by iter the map,
-  // and its order is same as OpProto
 
-  paddle::imperative::KernelArgsNameMakerByOpProto<Variable> argMaker(
-      Info().proto_, &ctx.inputs, &ctx.outputs);
+  auto& input_names = std::get<0>(pt_kernel_signature_->second);
+  auto& attr_names = std::get<1>(pt_kernel_signature_->second);
+  auto& output_names = std::get<2>(pt_kernel_signature_->second);
 
-  auto& input_names = argMaker.GetInputArgsNames();
-  auto& output_names = argMaker.GetOutputArgsNames();
-  auto& attr_pairs = argMaker.GetAttrsArgsNamesAndTypes();
+  auto input_defs = pt_kernel_->args_def().input_defs();
+  auto attr_defs = pt_kernel_->args_def().attribute_defs();
+  auto output_defs = pt_kernel_->args_def().output_defs();
 
   PADDLE_ENFORCE_EQ(input_names.size(), input_defs.size(),
                     platform::errors::InvalidArgument(
-                        "the size of inputs_args names (%d) must be equal to "
+                        "The size of inputs_args names (%d) must be equal to "
                         "the size of kernel input_defs (%d).",
                         input_names.size(), input_defs.size()));
 
   PADDLE_ENFORCE_EQ(output_names.size(), output_defs.size(),
                     platform::errors::InvalidArgument(
-                        "the size of outputs_args names (%d) must be equal to "
+                        "The size of outputs_args names (%d) must be equal to "
                         "the size of kernel output_defs (%d).",
                         output_names.size(), output_defs.size()));
 
-  PADDLE_ENFORCE_EQ(attr_pairs.size(), attr_defs.size(),
+  PADDLE_ENFORCE_EQ(attr_names.size(), attr_defs.size(),
                     platform::errors::InvalidArgument(
-                        "the size of attribute_args names (%d) must be equal "
+                        "The size of attribute_args names (%d) must be equal "
                         "to the size of kernel attribute_defs (%d).",
-                        attr_pairs.size(), attr_defs.size()));
+                        attr_names.size(), attr_defs.size()));
 
   for (size_t i = 0; i < input_names.size(); ++i) {
     auto in_def = input_defs.at(i);
-    VLOG(1) << "in_def: " << in_def.backend << ", " << in_def.dtype << ", "
+    VLOG(2) << "in_def: " << in_def.backend << ", " << in_def.dtype << ", "
             << in_def.layout;
 
     auto ins_vector = ctx.inputs.at(input_names[i]);
@@ -1965,50 +1847,33 @@ pt::KernelContext OperatorWithKernel::ConstructPtKernelContext(
     op_kernel_ctx.EmplaceBackOutputs(tmp_outputs);
   }
 
-  for (size_t i = 0; i < attr_defs.size(); ++i) {
-    paddle::any attr_item;
+  for (size_t i = 0; i < attr_names.size(); ++i) {
+    auto& attr = Attrs().at(attr_names[i]);
     if (attr_defs[i].type_index == std::type_index(typeid(pt::Scalar))) {
-      // TODO(chenweihang): support other attrs
-      // In principle, the attr required by the dynamic mode should be
-      // passed in from the Python side, and there is no need to look up
-      // from the default_map, but now this nor work
-      switch (attr_pairs[i].second) {
-        case framework::proto::AttrType::INT:
-          op_kernel_ctx.EmplaceBackAttr(
-              pt::Scalar(Attr<int>(attr_pairs[i].first)));
-          break;
-        case framework::proto::AttrType::FLOAT:
-          op_kernel_ctx.EmplaceBackAttr(
-              pt::Scalar(Attr<float>(attr_pairs[i].first)));
-          break;
-        case framework::proto::AttrType::BOOLEAN:
-          op_kernel_ctx.EmplaceBackAttr(
-              pt::Scalar(Attr<double>(attr_pairs[i].first)));
-          break;
-        default:
-          // TODO(chenweihang): support other attrs type
-          PADDLE_THROW(platform::errors::Unimplemented(
-              "unsupported cast op attribute `%s` when construct "
-              "KernelContext.",
-              attr_pairs[i].first));
+      // TODO(chenweihang): support other attrs later
+      // TODO(zhangyunfei): Scalar should hold scaler type, and we should check
+      // attribtue type by attr_defs
+      if (std::type_index(attr.type()) == std::type_index(typeid(float))) {
+        op_kernel_ctx.EmplaceBackAttr(pt::Scalar(BOOST_GET_CONST(float, attr)));
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "unsupported cast op attribute `%s` to Scalar when construct "
+            "KernelContext.",
+            attr_names[i]));
       }
     } else {
-      // TODO(chenweihang): support other attrs
-      // In principle, the attr required by the dynamic mode should be
-      // passed in from the Python side, and there is no need to look up
-      // from the default_map, but now this nor work
+      // TODO(chenweihang): support other attrs later
       if (attr_defs[i].type_index == std::type_index(typeid(int))) {
-        op_kernel_ctx.EmplaceBackAttr(Attr<int>(attr_pairs[i].first));
+        op_kernel_ctx.EmplaceBackAttr(BOOST_GET_CONST(int, attr));
       } else if (attr_defs[i].type_index == std::type_index(typeid(float))) {
-        op_kernel_ctx.EmplaceBackAttr(Attr<float>(attr_pairs[i].first));
+        op_kernel_ctx.EmplaceBackAttr(BOOST_GET_CONST(float, attr));
       } else if (attr_defs[i].type_index == std::type_index(typeid(bool))) {
-        op_kernel_ctx.EmplaceBackAttr(Attr<bool>(attr_pairs[i].first));
+        op_kernel_ctx.EmplaceBackAttr(BOOST_GET_CONST(bool, attr));
       } else {
-        // TODO(chenweihang): support other attrs type
         PADDLE_THROW(platform::errors::Unimplemented(
             "unsupported cast op attribute `%s` when construct "
             "KernelContext.",
-            attr_pairs[i].first));
+            attr_names[i]));
       }
     }
   }
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index b844c2cf61407..7581b65e3b68b 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -116,8 +116,6 @@ inline std::string GradOriginalVarName(const std::string& grad_var_name) {
 const Tensor* GetLoDTensorOrSelectedRowsValueFromVar(const Variable& var);
 Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var);
 
-OpKernelType TransPtKernelKeyToOpKernelType(const pt::KernelKey& kernel_key);
-
 class ExecutionContext;
 class OperatorBase;
 
@@ -534,13 +532,15 @@ class OperatorWithKernel : public OperatorBase {
   }
 
   /* member functions for adapting to tcmpt lib */
-  // TODO(chenweihang): Temporarily as a class method
-  virtual pt::KernelKey ConstructPtKernelKey(
-      const VariableValueMap& inputs, const AttributeMap& attrs,
-      const platform::Place& ctx_place) const;
-
-  virtual pt::KernelContext ConstructPtKernelContext(
-      const RuntimeContext& ctx, const platform::DeviceContext& dev_ctx) const;
+  /** In the Tensor calculation library, the new Kernel adopts a clearer and
+    * more streamlined design. The arguments of the Kernel and the input and
+    * output arguments registered in the original OpMaker do not match in some
+    * cases, so we use map to record the arguments required by the kernel.
+    * When selecting Kernel during Op execution, select the arguments of the
+    * original Op according to the GetExpectedPtKernelArgs returned arguments.
+    */
+  virtual KernelSignature GetExpectedPtKernelArgs(
+      const ExecutionContext& ctx) const;
 
  private:
   void RunImpl(const Scope& scope, const platform::Place& place) const final;
@@ -563,8 +563,9 @@ class OperatorWithKernel : public OperatorBase {
                                const std::vector<std::string>& inplace_vars,
                                const Scope& exec_scope) const;
 
-  void ChooseKernel(const RuntimeContext& ctx, const Scope& scope,
-                    const platform::Place& place) const;
+  OpKernelType InnerGetExpectedKernelType(const ExecutionContext& ctx) const;
+
+  void ChooseKernel(const ExecutionContext& ctx) const;
 
   void HandleComplexGradToRealGrad(const Scope& scope,
                                    RuntimeContext* ctx) const;
@@ -582,8 +583,10 @@ class OperatorWithKernel : public OperatorBase {
                                    const std::string& name) const;
 
   /* member functions for adapting to tcmpt lib */
-  void ChoosePtKernel(const RuntimeContext& ctx,
-                      const platform::DeviceContext& dev_ctx) const;
+  void ChoosePtKernel(const ExecutionContext& ctx) const;
+
+  pt::KernelContext BuildPtKernelContext(
+      const RuntimeContext& ctx, const platform::DeviceContext& dev_ctx) const;
 
  protected:
   mutable std::unique_ptr<OpKernelType> kernel_type_;
@@ -595,10 +598,11 @@ class OperatorWithKernel : public OperatorBase {
   mutable bool all_kernels_must_compute_runtime_shape_ = false;
   mutable std::mutex cache_update_mutex_;
   mutable bool enable_cache_transfer_scope_ = false;
-  // TODO(chenweihang): Similar duplicate members are used for new tcmpt lib,
-  // maybe we have better impl methods
+  // NOTE(chenweihang): Similar op members are used to adapt to
+  // new tcmpt kernel, if there is a better design in the future,
+  // we may polish the implementation here
   mutable bool run_pt_kernel_ = false;
-  mutable std::unique_ptr<pt::KernelKey> pt_kernel_key_;
+  mutable std::unique_ptr<KernelSignature> pt_kernel_signature_;
   mutable std::unique_ptr<pt::Kernel> pt_kernel_;
 };
 
diff --git a/paddle/fluid/framework/tcmpt_utils.cc b/paddle/fluid/framework/tcmpt_utils.cc
index 71ef2d3450ae9..7f8c7af609d65 100644
--- a/paddle/fluid/framework/tcmpt_utils.cc
+++ b/paddle/fluid/framework/tcmpt_utils.cc
@@ -12,11 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <sstream>
+
 #include "paddle/fluid/framework/tcmpt_utils.h"
 
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/string/string_helper.h"
 
 namespace paddle {
 namespace framework {
@@ -62,7 +65,7 @@ std::shared_ptr<pt::DenseTensor> MakeTensorImpl<pt::DenseTensor>(
     proto::VarType::Type type) {
   return MakeTensorImpl<pt::DenseTensor, LoDTensor>(
       tensor, pt::TransToPtBackend(place), pt::TransToPtDataType(type),
-      pt::TransToPtLayout(tensor.layout()));
+      pt::TransToPtDataLayout(tensor.layout()));
 }
 
 template <>
@@ -71,7 +74,7 @@ std::shared_ptr<pt::DenseTensor> MakeTensorImpl<pt::DenseTensor>(
     proto::VarType::Type type) {
   return MakeTensorImpl<pt::DenseTensor, Tensor>(
       tensor, pt::TransToPtBackend(place), pt::TransToPtDataType(type),
-      pt::TransToPtLayout(tensor.layout()));
+      pt::TransToPtDataLayout(tensor.layout()));
 }
 
 template <>
@@ -164,5 +167,115 @@ std::shared_ptr<pt::TensorInterface> OutputVariableToPtTensor(
   return nullptr;
 }
 
+OpKernelType TransPtKernelKeyToOpKernelType(const pt::KernelKey& kernel_key) {
+  proto::VarType::Type data_type = pt::TransToProtoVarType(kernel_key.dtype());
+  platform::Place place = pt::TransToFluidPlace(kernel_key.backend());
+  DataLayout data_layout = pt::TransToFluidDataLayout(kernel_key.layout());
+  LibraryType library_type = LibraryType::kPlain;
+  if (kernel_key.backend() == pt::Backend::kMKLDNN) {
+    library_type = LibraryType::kMKLDNN;
+  } else if (kernel_key.backend() == pt::Backend::kCUDNN) {
+    library_type = LibraryType::kCUDNN;
+  } else {
+    // do nothing
+  }
+  // TODO(chenweihang): the customized_type_value is lost
+  return OpKernelType(data_type, place, data_layout, library_type);
+}
+
+pt::KernelKey TransOpKernelTypeToPtKernelKey(const OpKernelType& kernel_type) {
+  pt::Backend backend = pt::TransToPtBackend(kernel_type.place_);
+  if (kernel_type.library_type_ == LibraryType::kMKLDNN) {
+    backend = pt::Backend::kMKLDNN;
+  } else if (kernel_type.library_type_ == LibraryType::kCUDNN) {
+    backend = pt::Backend::kCUDNN;
+  } else {
+    // do
+  }
+  pt::DataLayout layout = pt::TransToPtDataLayout(kernel_type.data_layout_);
+  pt::DataType dtype = pt::TransToPtDataType(kernel_type.data_type_);
+  return pt::KernelKey(backend, layout, dtype);
+}
+
+KernelSignatureMap& KernelSignatureMap::Instance() {
+  static KernelSignatureMap g_kernel_signature_map;
+  return g_kernel_signature_map;
+}
+
+const paddle::SmallVector<std::string>&
+KernelArgsNameMakerByOpProto::GetInputArgsNames() {
+  for (int i = 0; i < op_proto_->inputs_size(); ++i) {
+    auto& in = op_proto_->inputs()[i];
+    auto& in_name = in.name();
+    if ((in.has_extra() && in.extra()) || (in.has_quant() && in.quant())) {
+      VLOG(1) << "Parse PtKernel input: skip extra & quant input - " << in_name;
+      continue;
+    }
+    // If contains dispensable input, we should override the
+    // GetExpectedPtKernelArgs method self
+    if (in.has_dispensable() && in.dispensable()) {
+      VLOG(1) << "Parse PtKernel input: skip dispensable input - " << in_name;
+      continue;
+    }
+    VLOG(1) << "Parse PtKernel input: " << in_name;
+    input_names_.emplace_back(in_name);
+  }
+  return input_names_;
+}
+
+const paddle::SmallVector<std::string>&
+KernelArgsNameMakerByOpProto::GetOutputArgsNames() {
+  for (int i = 0; i < op_proto_->outputs_size(); ++i) {
+    auto& out = op_proto_->outputs()[i];
+    auto& out_name = out.name();
+    // TODO(chenweihang): outputs also need skip some cases
+    VLOG(1) << "Parse PtKernel output: " << out_name;
+    output_names_.emplace_back(out_name);
+  }
+  return output_names_;
+}
+
+const paddle::SmallVector<std::string>&
+KernelArgsNameMakerByOpProto::GetAttrsArgsNames() {
+  for (int i = 0; i < op_proto_->attrs_size(); ++i) {
+    auto& attr = op_proto_->attrs()[i];
+    auto& attr_name = attr.name();
+    if (attr_name == "use_mkldnn" || attr_name == "op_role" ||
+        attr_name == "op_role_var" || attr_name == "op_namescope" ||
+        attr_name == "op_callstack" || attr_name == "op_device") {
+      VLOG(1) << "Parse PtKernel attribute: skip needless attr - " << attr_name;
+      continue;
+    }
+    if ((attr.has_extra() && attr.extra()) ||
+        (attr.has_quant() && attr.quant())) {
+      VLOG(1) << "Parse PtKernel attribute: skip extra & quant attr - "
+              << attr_name;
+      continue;
+    }
+    VLOG(1) << "Parse PtKernel attribute: " << attr_name;
+    attr_names_.emplace_back(attr_name);
+  }
+
+  return attr_names_;
+}
+
+KernelSignature KernelArgsNameMakerByOpProto::GetKernelSignature() {
+  return std::make_pair(
+      op_proto_->type(),
+      std::make_tuple(GetInputArgsNames(), GetAttrsArgsNames(),
+                      GetOutputArgsNames()));
+}
+
+std::string KernelSignatureToString(const KernelSignature& signature) {
+  std::stringstream os;
+  os << "Kernel Signature - name: " << signature.first << "; inputs: "
+     << string::join_strings(std::get<0>(signature.second), ", ")
+     << "; attributes: "
+     << string::join_strings(std::get<1>(signature.second), ", ")
+     << "; outputs: "
+     << string::join_strings(std::get<2>(signature.second), ", ");
+  return os.str();
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/tcmpt_utils.h b/paddle/fluid/framework/tcmpt_utils.h
index 0af8cd30bd34d..8618a3a570302 100644
--- a/paddle/fluid/framework/tcmpt_utils.h
+++ b/paddle/fluid/framework/tcmpt_utils.h
@@ -14,14 +14,25 @@ limitations under the License. */
 
 #pragma once
 
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/op_kernel_type.h"
 #include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/imperative/type_defs.h"
+#include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/place.h"
-
 #include "paddle/tcmpt/api/include/core.h"
+#include "paddle/utils/flat_hash_map.h"
+#include "paddle/utils/small_vector.h"
 
 namespace paddle {
 namespace framework {
 
+/* tensor translate */
+
 template <typename PtTensorImplT, typename VariableT>
 std::shared_ptr<PtTensorImplT> MakeTensorImpl(const VariableT& tensor,
                                               pt::Backend backend,
@@ -49,5 +60,76 @@ std::shared_ptr<pt::TensorInterface> InputVariableToPtTensor(
 std::shared_ptr<pt::TensorInterface> OutputVariableToPtTensor(
     framework::Variable* variable, const pt::TensorArgDef& arg_def);
 
+/* Kernel Key translate */
+
+OpKernelType TransPtKernelKeyToOpKernelType(const pt::KernelKey& kernel_key);
+pt::KernelKey TransOpKernelTypeToPtKernelKey(const OpKernelType& kernel_type);
+
+/* Kernel Args parse */
+
+// TODO(chenweihang): we can generate this map by proto info in compile time
+class KernelSignatureMap {
+ public:
+  static KernelSignatureMap& Instance();
+
+  bool Has(const std::string& op_type) const {
+    return map_.find(op_type) != map_.end();
+  }
+
+  void Insert(const std::string& op_type, const KernelSignature& signature) {
+    PADDLE_ENFORCE_NE(
+        Has(op_type), true,
+        platform::errors::AlreadyExists(
+            "Operator (%s)'s Kernel Signature has been registered.", op_type));
+    map_.insert({op_type, signature});
+  }
+
+  const KernelSignature* GetNullable(const std::string& op_type) const {
+    auto it = map_.find(op_type);
+    if (it == map_.end()) {
+      return nullptr;
+    } else {
+      return &it->second;
+    }
+  }
+
+ private:
+  KernelSignatureMap() = default;
+  paddle::flat_hash_map<std::string, KernelSignature> map_;
+
+  DISABLE_COPY_AND_ASSIGN(KernelSignatureMap);
+};
+
+class KernelArgsNameMaker {
+ public:
+  virtual ~KernelArgsNameMaker() {}
+  virtual const paddle::SmallVector<std::string>& GetInputArgsNames() = 0;
+  virtual const paddle::SmallVector<std::string>& GetOutputArgsNames() = 0;
+  virtual const paddle::SmallVector<std::string>& GetAttrsArgsNames() = 0;
+};
+
+class KernelArgsNameMakerByOpProto : public KernelArgsNameMaker {
+ public:
+  explicit KernelArgsNameMakerByOpProto(framework::proto::OpProto* op_proto)
+      : op_proto_(op_proto) {}
+
+  ~KernelArgsNameMakerByOpProto() {}
+
+  const paddle::SmallVector<std::string>& GetInputArgsNames() override;
+  const paddle::SmallVector<std::string>& GetOutputArgsNames() override;
+  const paddle::SmallVector<std::string>& GetAttrsArgsNames() override;
+
+  KernelSignature GetKernelSignature();
+
+ private:
+  framework::proto::OpProto* op_proto_;
+
+  paddle::SmallVector<std::string> input_names_;
+  paddle::SmallVector<std::string> output_names_;
+  paddle::SmallVector<std::string> attr_names_;
+};
+
+std::string KernelSignatureToString(const KernelSignature& signature);
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h
index 1c5469d02c3ef..d0d1b915f2317 100644
--- a/paddle/fluid/framework/type_defs.h
+++ b/paddle/fluid/framework/type_defs.h
@@ -17,11 +17,13 @@ limitations under the License. */
 #include <map>
 #include <memory>
 #include <string>
+#include <tuple>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
 #include "paddle/fluid/imperative/type_defs.h"
 #include "paddle/fluid/platform/variant.h"
+#include "paddle/utils/small_vector.h"
 
 namespace paddle {
 namespace framework {
@@ -82,5 +84,13 @@ using InferShapeFN = std::function<void(InferShapeContext*)>;
 using InplacePair = std::unordered_map<std::string, std::string>;
 using InferInplaceOpFN = std::function<InplacePair(bool /*use_cuda*/)>;
 
+// tuple(input_names, attr_names, output_names)
+using KernelArgsTuple = std::tuple<paddle::SmallVector<std::string>,
+                                   paddle::SmallVector<std::string>,
+                                   paddle::SmallVector<std::string>>;
+// TODD(yuanrisheng): impl implicit overload signature, use KernelArgsTuple
+// directly
+using KernelSignature = std::pair<std::string, KernelArgsTuple>;
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/imperative/kernel_args_names_maker.h b/paddle/fluid/imperative/kernel_args_names_maker.h
deleted file mode 100644
index 5863f3cae95c2..0000000000000
--- a/paddle/fluid/imperative/kernel_args_names_maker.h
+++ /dev/null
@@ -1,165 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "glog/logging.h"
-
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/imperative/type_defs.h"
-#include "paddle/utils/small_vector.h"
-
-namespace paddle {
-namespace imperative {
-// TODO(chenweihang): now only check single var input
-template <typename VarType>
-static bool IsValidVar(const std::string& name,
-                       const NameVarMap<VarType>& inputs) {
-  auto it = inputs.find(name);
-  if (it == inputs.end()) {
-    return false;
-  }
-  if (it->second.empty()) {
-    return false;
-  }
-  return it->second[0] != nullptr;
-}
-
-class KernelArgsNameMaker {
- public:
-  virtual ~KernelArgsNameMaker() {}
-  virtual const paddle::SmallVector<std::string>& GetInputArgsNames() = 0;
-  virtual const paddle::SmallVector<std::string>& GetOutputArgsNames() = 0;
-  virtual const paddle::SmallVector<
-      std::pair<std::string, framework::proto::AttrType>>&
-  GetAttrsArgsNamesAndTypes() = 0;
-};
-
-template <typename VarType>
-class KernelArgsNameMakerByOpProto : public KernelArgsNameMaker {
- public:
-  KernelArgsNameMakerByOpProto(framework::proto::OpProto* op_proto,
-                               const imperative::NameVarMap<VarType>* inputs,
-                               const imperative::NameVarMap<VarType>* outputs)
-      : op_proto_(op_proto), inputs_(inputs), outputs_(outputs) {}
-
-  ~KernelArgsNameMakerByOpProto() {}
-
-  const paddle::SmallVector<std::string>& GetInputArgsNames() override {
-    for (int i = 0; i < op_proto_->inputs_size(); ++i) {
-      auto in = op_proto_->inputs()[i];
-
-      // TODO(chenweihang): deal with diff param in vector
-      if ((in.has_extra() && in.extra()) || (in.has_quant() && in.quant())) {
-        VLOG(1) << "Dygraph PtKernel input: skip extra & quant input - "
-                << in.name();
-        continue;
-      }
-
-      std::string in_name = in.name();
-      if (in.has_dispensable() && in.dispensable()) {
-        if (this->contain_host_tensor_flags.count(in_name) > 0 &&
-            IsValidVar<VarType>(in_name, *inputs_)) {
-          VLOG(1) << "Dygraph PtKernel input: contain host input - " << in_name;
-          this->contain_host_tensor_flags[in_name] = true;
-        } else {
-          VLOG(1) << "Dygraph PtKernel input: skip dispensable input - "
-                  << in_name;
-          continue;
-        }
-      }
-
-      input_names.emplace_back(in.name());
-    }
-    return input_names;
-  }
-
-  const paddle::SmallVector<std::string>& GetOutputArgsNames() override {
-    for (int i = 0; i < op_proto_->outputs_size(); ++i) {
-      auto out_name = op_proto_->outputs()[i].name();
-      VLOG(1) << "Dygraph PtKernel output: " << out_name;
-      // TODO(chenweihang): outputs also need skip some cases
-
-      output_names.emplace_back(out_name);
-    }
-    return output_names;
-  }
-
-  const paddle::SmallVector<std::pair<std::string, framework::proto::AttrType>>&
-  GetAttrsArgsNamesAndTypes() override {
-    for (int i = 0; i < op_proto_->attrs_size(); ++i) {
-      auto attr = op_proto_->attrs()[i];
-      if (attr.name() == "use_mkldnn" || attr.name() == "op_role" ||
-          attr.name() == "op_role_var" || attr.name() == "op_namescope" ||
-          attr.name() == "op_callstack" || attr.name() == "op_device") {
-        VLOG(1) << "Dygraph PtKernel attribute: skip needless attr - "
-                << attr.name();
-        continue;
-      }
-      if ((attr.has_extra() && attr.extra()) ||
-          (attr.has_quant() && attr.quant())) {
-        VLOG(1) << "Dygraph PtKernel attribute: skip extra & quant attr - "
-                << attr.name();
-        continue;
-      }
-      if (attr_to_host_tensor.count(attr.name()) > 0 &&
-          contain_host_tensor_flags.at(attr_to_host_tensor.at(attr.name())) ==
-              true) {
-        VLOG(1) << "Dygraph PtKernel attribute: skip dynaimc attr - "
-                << attr.name() << ", because "
-                << attr_to_host_tensor.at(attr.name()) << " exists.";
-        continue;
-      }
-      // TODO(chenweihang): we need better methods to deal with special cases
-      if (attr.name() == "dtype") {
-        VLOG(1) << "Dygraph PtKernel attribute: skip " << op_proto_->type()
-                << "'s dtype attr.";
-        continue;
-      }
-      VLOG(1) << "Dygraph PtKernel attribute: " << attr.name();
-      attr_names.emplace_back(
-          std::pair<std::string, framework::proto::AttrType>(attr.name(),
-                                                             attr.type()));
-    }
-
-    return attr_names;
-  }
-
- private:
-  framework::proto::OpProto* op_proto_;
-
-  const imperative::NameVarMap<VarType>* inputs_;
-  const imperative::NameVarMap<VarType>* outputs_;
-
-  paddle::SmallVector<std::string> input_names;
-  paddle::SmallVector<std::string> output_names;
-  paddle::SmallVector<std::pair<std::string, framework::proto::AttrType>>
-      attr_names;
-
-  // TODO(chenweihang): For scale op, when the input has a `ScaleTensor`,
-  // the following scale attribute should be skipped, and there are many
-  // such ops, which require certain rules to process, now only for verify
-  // scale op
-  std::unordered_map<std::string, bool> contain_host_tensor_flags{
-      {"ScaleTensor", false}};
-  std::unordered_map<std::string, std::string> attr_to_host_tensor{
-      {"scale", "ScaleTensor"}};
-};
-
-}  // namespace imperative
-}  // namespace paddle
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index f7e57bec1da9e..87e7e754e3ee8 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -22,7 +22,7 @@
 #include "paddle/fluid/platform/xpu/xpu_op_list.h"
 #endif
 DECLARE_bool(check_nan_inf);
-DECLARE_bool(use_pt_kernel);
+DECLARE_bool(run_pt_kernel);
 
 namespace paddle {
 namespace imperative {
@@ -47,10 +47,9 @@ const framework::Tensor* GetTensorFromVar(const framework::Variable& var) {
   }
 }
 
-template <typename T>
-static const T& GetAttr(const framework::AttributeMap& attrs,
-                        const framework::AttributeMap& default_attrs,
-                        const std::string& name) {
+static const framework::Attribute& GetAttr(
+    const framework::AttributeMap& attrs,
+    const framework::AttributeMap& default_attrs, const std::string& name) {
   auto it = attrs.find(name);
   bool found = it != attrs.end();
   if (!found) {
@@ -60,7 +59,7 @@ static const T& GetAttr(const framework::AttributeMap& attrs,
   PADDLE_ENFORCE_EQ(
       found, true,
       platform::errors::NotFound("(%s) is not found in AttributeMap.", name));
-  return BOOST_GET_CONST(T, it->second);
+  return it->second;
 }
 
 template <typename VarType>
@@ -108,63 +107,18 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op,
 
 PreparedOp::PreparedOp(const framework::OperatorBase& op,
                        const framework::RuntimeContext& ctx,
-                       const pt::KernelKey& pt_kernel_key,
+                       const framework::OpKernelType& kernel_type,
+                       const framework::KernelSignature& kernel_signature,
                        const pt::Kernel& pt_kernel,
                        platform::DeviceContext* dev_ctx)
     : op_(op),
       ctx_(ctx),
-      kernel_type_(framework::OpKernelType(framework::proto::VarType::RAW,
-                                           platform::CPUPlace())),
+      kernel_type_(kernel_type),
       func_(nullptr),
       dev_ctx_(dev_ctx),
       run_pt_kernel_(true),
-      pt_kernel_key_(pt_kernel_key),
-      pt_kernel_(pt_kernel) {
-  // TODO(chenweihang): PrepareData still use old impl, so here need save
-  // old kernel type, trans it later
-  kernel_type_ = framework::TransPtKernelKeyToOpKernelType(pt_kernel_key_);
-}
-
-template <typename VarType>
-static framework::VariableValueMap BuildInputMap(
-    const NameVarMap<VarType>& ins) {
-  framework::VariableValueMap inputs;
-  for (auto& var_pair : ins) {
-    for (auto& var : var_pair.second) {
-      inputs[var_pair.first].emplace_back(var->MutableVar());
-    }
-  }
-  return inputs;
-}
-
-// TODO(chenweihang): enhance rules, not all dispensable inputs
-// are host tensor, now only for scale kernel verify
-template <typename VarType>
-static bool ContainHostTensor(const framework::proto::OpProto& op_proto,
-                              const NameVarMap<VarType>& inputs) {
-  for (int i = 0; i < op_proto.inputs_size(); ++i) {
-    auto in = op_proto.inputs()[i];
-    if (in.has_dispensable() && in.dispensable()) {
-      return IsValidVar<VarType>(in.name(), inputs);
-    }
-  }
-  return false;
-}
-
-template <typename VarType>
-static pt::KernelName ConstructPtKernelName(
-    const std::string& op_type, const framework::proto::OpProto& op_proto,
-    const NameVarMap<VarType>& inputs) {
-  std::string overload_name;
-  // TODO(chenweihang): adapt SelectedRows by xiaowei's design
-  if (ContainHostTensor<VarType>(op_proto, inputs)) {
-    if (overload_name != "") {
-      overload_name += ".";
-    }
-    overload_name += pt::kContainHostTensorSuffix;
-  }
-  return pt::KernelName(op_type, overload_name);
-}
+      pt_kernel_signature_(kernel_signature),
+      pt_kernel_(pt_kernel) {}
 
 template <typename VarType>
 PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
@@ -192,30 +146,36 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
 #endif
 
   // 1. get expected kernel key
-  if (FLAGS_use_pt_kernel &&
+  auto dygraph_exe_ctx = DygraphExecutionContext<VarType>(
+      op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs, default_attrs);
+  auto expected_kernel_key = op.GetExpectedKernelType(dygraph_exe_ctx);
+  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
+
+  if (FLAGS_run_pt_kernel &&
       pt::KernelFactory::Instance().ContainsKernel(op.Type().c_str())) {
-    auto kernel_name =
-        ConstructPtKernelName<VarType>(op.Type(), (*op.Info().proto_), ins);
-    auto inputs = BuildInputMap<VarType>(ins);
-    // we only need attrs here
-    // auto final_attrs = BuildAttrMap(attrs, default_attrs);
-    auto pt_kernel_key = op.ConstructPtKernelKey(inputs, attrs, place);
-    auto pt_kernel =
-        pt::KernelFactory::Instance().SelectKernel(kernel_name, pt_kernel_key);
-    // for debug
-    VLOG(1) << "PrepareImpl - kernel name: " << kernel_name
-            << " | kernel key: " << pt_kernel_key << " | kernel: " << pt_kernel;
+    auto pt_kernel_signature = op.GetExpectedPtKernelArgs(dygraph_exe_ctx);
+
+    VLOG(1) << framework::KernelSignatureToString(pt_kernel_signature);
+
+    auto pt_kernel_name = pt::KernelName(pt_kernel_signature.first);
+    auto pt_kernel_key = TransOpKernelTypeToPtKernelKey(expected_kernel_key);
+    auto pt_kernel = pt::KernelFactory::Instance().SelectKernel(pt_kernel_name,
+                                                                pt_kernel_key);
+
     if (pt_kernel.IsValid()) {
+      VLOG(1) << "Dynamic mode PrepareImpl - kernel name: " << pt_kernel_name
+              << " | kernel key: " << pt_kernel_key
+              << " | kernel: " << pt_kernel;
+
       // TODO(chenweihang): using CPUKernel when miss device kernel case
-      return PreparedOp(op, ctx, pt_kernel_key, pt_kernel, dev_ctx);
+      return PreparedOp(op, ctx, expected_kernel_key, pt_kernel_signature,
+                        pt_kernel, dev_ctx);
+    } else {
+      VLOG(1) << "Dynamic mode ChoosePtKernel - kernel `" << pt_kernel_name
+              << "` not found.";
     }
   }
 
-  auto expected_kernel_key = op.GetExpectedKernelType(
-      DygraphExecutionContext<VarType>(op, framework::Scope(), *dev_ctx, ctx,
-                                       ins, outs, attrs, default_attrs));
-  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
-
   // 2. check if op[type] has kernel registered.
   auto& all_op_kernels = op.AllOpKernels();
   auto kernels_iter = all_op_kernels.find(op.Type());
@@ -283,13 +243,13 @@ PreparedOp PreparedOp::Prepare(const NameVarMap<VariableWrapper>& ins,
 }
 
 template <typename VarType>
-static pt::KernelContext BuildDygraphKernelContext(
-    const pt::Kernel& pt_kernel, KernelArgsNameMaker* argsNameMaker,
-    const NameVarMap<VarType>& ins, const NameVarMap<VarType>& outs,
-    const framework::AttributeMap& attrs,
+static pt::KernelContext BuildDygraphPtKernelContext(
+    const framework::KernelSignature& pt_kernel_signature,
+    const pt::Kernel& pt_kernel, const NameVarMap<VarType>& ins,
+    const NameVarMap<VarType>& outs, const framework::AttributeMap& attrs,
     const framework::AttributeMap& default_attrs,
     const platform::DeviceContext& dev_ctx) {
-  // TODO(chenweihang): now only work for very simple case (sign op),
+  // TODO(chenweihang): now only work for very simple case,
   // many cases need to be deal with later:
   // 1. the input and output are not tensor
   // 2. the dispensbale, duplicable input and output
@@ -297,14 +257,15 @@ static pt::KernelContext BuildDygraphKernelContext(
   // 4. use pt Tensor directly
   // 5. kernel input is not DenseTensor
   pt::KernelContext op_kernel_ctx(dev_ctx);
+
+  auto& input_names = std::get<0>(pt_kernel_signature.second);
+  auto& attr_names = std::get<1>(pt_kernel_signature.second);
+  auto& output_names = std::get<2>(pt_kernel_signature.second);
+
   auto input_defs = pt_kernel.args_def().input_defs();
   auto output_defs = pt_kernel.args_def().output_defs();
   auto attr_defs = pt_kernel.args_def().attribute_defs();
 
-  auto& input_names = argsNameMaker->GetInputArgsNames();
-  auto& output_names = argsNameMaker->GetOutputArgsNames();
-  auto& attr_pairs = argsNameMaker->GetAttrsArgsNamesAndTypes();
-
   PADDLE_ENFORCE_EQ(input_names.size(), input_defs.size(),
                     platform::errors::InvalidArgument(
                         "the size of inputs_args names (%d) must be equal to "
@@ -317,16 +278,16 @@ static pt::KernelContext BuildDygraphKernelContext(
                         "the size of kernel output_defs (%d).",
                         output_names.size(), output_defs.size()));
 
-  PADDLE_ENFORCE_EQ(attr_pairs.size(), attr_defs.size(),
+  PADDLE_ENFORCE_EQ(attr_names.size(), attr_defs.size(),
                     platform::errors::InvalidArgument(
                         "the size of attribute_args names (%d) must be equal "
                         "to the size of kernel attribute_defs (%d).",
-                        attr_pairs.size(), attr_defs.size()));
+                        attr_names.size(), attr_defs.size()));
 
   for (size_t i = 0; i < input_names.size(); ++i) {
-    auto in_def = input_defs.at(i);
+    auto& in_def = input_defs.at(i);
+    auto& ins_vector = ins.at(input_names[i]);
 
-    auto ins_vector = ins.at(input_names[i]);
     std::vector<std::shared_ptr<pt::TensorInterface>> tmp_inputs;
     for (auto var : ins_vector) {
       const auto& variable = var->Var();
@@ -338,12 +299,12 @@ static pt::KernelContext BuildDygraphKernelContext(
   }
 
   for (size_t i = 0; i < output_names.size(); ++i) {
-    auto out_def = output_defs.at(i);
-    auto outs_vector = outs.at(output_names[i]);
+    auto& out_def = output_defs.at(i);
+    auto& outs_vector = outs.at(output_names[i]);
 
     std::vector<std::shared_ptr<pt::TensorInterface>> tmp_outputs;
     for (auto var : outs_vector) {
-      auto variable = var->MutableVar();
+      auto* variable = var->MutableVar();
 
       auto pt_out = framework::OutputVariableToPtTensor(variable, out_def);
       tmp_outputs.emplace_back(pt_out);
@@ -351,52 +312,33 @@ static pt::KernelContext BuildDygraphKernelContext(
     op_kernel_ctx.EmplaceBackOutputs(tmp_outputs);
   }
 
-  for (size_t i = 0; i < attr_defs.size(); ++i) {
+  for (size_t i = 0; i < attr_names.size(); ++i) {
+    auto& attr = GetAttr(attrs, default_attrs, attr_names[i]);
     if (attr_defs[i].type_index == std::type_index(typeid(pt::Scalar))) {
-      // TODO(chenweihang): support other attrs
-      // In principle, the attr required by the dynamic mode should be
-      // passed in from the Python side, and there is no need to look up
-      // from the default_map, but now this nor work
-      switch (attr_pairs[i].second) {
-        case framework::proto::AttrType::INT:
-          op_kernel_ctx.EmplaceBackAttr(pt::Scalar(
-              GetAttr<int>(attrs, default_attrs, attr_pairs[i].first)));
-          break;
-        case framework::proto::AttrType::FLOAT:
-          op_kernel_ctx.EmplaceBackAttr(pt::Scalar(
-              GetAttr<float>(attrs, default_attrs, attr_pairs[i].first)));
-          break;
-        case framework::proto::AttrType::BOOLEAN:
-          op_kernel_ctx.EmplaceBackAttr(pt::Scalar(
-              GetAttr<bool>(attrs, default_attrs, attr_pairs[i].first)));
-          break;
-        default:
-          // TODO(chenweihang): support other attrs type
-          PADDLE_THROW(platform::errors::Unimplemented(
-              "unsupported cast op attribute `%s` when construct "
-              "KernelContext.",
-              attr_pairs[i].first));
+      // TODO(chenweihang): support other attrs later
+      // TODO(zhangyunfei): Scalar should hold scaler type, and we should check
+      // attribtue type by attr_defs
+      if (std::type_index(attr.type()) == std::type_index(typeid(float))) {
+        op_kernel_ctx.EmplaceBackAttr(pt::Scalar(BOOST_GET_CONST(float, attr)));
+      } else {
+        PADDLE_THROW(platform::errors::Unimplemented(
+            "unsupported cast op attribute `%s` to Scalar when construct "
+            "KernelContext in dygraph.",
+            attr_names[i]));
       }
     } else {
-      // TODO(chenweihang): support other attrs
-      // In principle, the attr required by the dynamic mode should be
-      // passed in from the Python side, and there is no need to look up
-      // from the default_map, but now this nor work
+      // TODO(chenweihang): support other attrs later
       if (attr_defs[i].type_index == std::type_index(typeid(int))) {
-        op_kernel_ctx.EmplaceBackAttr(
-            GetAttr<int>(attrs, default_attrs, attr_pairs[i].first));
+        op_kernel_ctx.EmplaceBackAttr(BOOST_GET_CONST(int, attr));
       } else if (attr_defs[i].type_index == std::type_index(typeid(float))) {
-        op_kernel_ctx.EmplaceBackAttr(
-            GetAttr<float>(attrs, default_attrs, attr_pairs[i].first));
+        op_kernel_ctx.EmplaceBackAttr(BOOST_GET_CONST(float, attr));
       } else if (attr_defs[i].type_index == std::type_index(typeid(bool))) {
-        op_kernel_ctx.EmplaceBackAttr(
-            GetAttr<bool>(attrs, default_attrs, attr_pairs[i].first));
+        op_kernel_ctx.EmplaceBackAttr(BOOST_GET_CONST(bool, attr));
       } else {
-        // TODO(chenweihang): support other attrs type
         PADDLE_THROW(platform::errors::Unimplemented(
             "unsupported cast op attribute `%s` when construct "
-            "KernelContext.",
-            attr_pairs[i].first));
+            "KernelContext in dygraph.",
+            attr_names[i]));
       }
     }
   }
@@ -446,27 +388,26 @@ static void PreparedOpRunImpl(
 }
 
 template <typename VarType>
-static void PreparedOpRunPtImpl(const framework::OperatorBase& op,
-                                const pt::KernelKey& pt_kernel_key,
-                                const pt::Kernel& pt_kernel,
-                                platform::DeviceContext* dev_ctx,
-                                const NameVarMap<VarType>& ins,
-                                const NameVarMap<VarType>& outs,
-                                const framework::AttributeMap& attrs,
-                                const framework::AttributeMap& default_attrs) {
+static void PreparedOpRunPtImpl(
+    const framework::OperatorBase& op,
+    const framework::KernelSignature& pt_kernel_signature,
+    const pt::Kernel& pt_kernel, platform::DeviceContext* dev_ctx,
+    const NameVarMap<VarType>& ins, const NameVarMap<VarType>& outs,
+    const framework::AttributeMap& attrs,
+    const framework::AttributeMap& default_attrs) {
   DygraphInferShapeContext<VarType> infer_shape_ctx(&ins, &outs, &attrs,
                                                     &default_attrs, op.Type());
   static_cast<const framework::OperatorWithKernel&>(op).InferShape(
       &infer_shape_ctx);
 
-  paddle::imperative::KernelArgsNameMakerByOpProto<VarType> argMaker(
-      op.Info().proto_, &ins, &outs);
-  auto op_kernel_ctx = BuildDygraphKernelContext<VarType>(
-      pt_kernel, &argMaker, ins, outs, attrs, default_attrs, *dev_ctx);
+  auto op_kernel_ctx = BuildDygraphPtKernelContext<VarType>(
+      pt_kernel_signature, pt_kernel, ins, outs, attrs, default_attrs,
+      *dev_ctx);
+
   pt_kernel(&op_kernel_ctx);
 
-  // TODO(chenweihang): add flags
-  // TODO(chenweihang): deal with complex cases
+  // TODO(chenweihang): add debug flags later
+  // TODO(chenweihang): deal with complex cases later
 }
 
 void PreparedOp::Run(const NameVarMap<VarBase>& ins,
@@ -474,8 +415,8 @@ void PreparedOp::Run(const NameVarMap<VarBase>& ins,
                      const framework::AttributeMap& attrs,
                      const framework::AttributeMap& default_attrs) {
   if (run_pt_kernel_) {
-    PreparedOpRunPtImpl<VarBase>(op_, pt_kernel_key_, pt_kernel_, dev_ctx_, ins,
-                                 outs, attrs, default_attrs);
+    PreparedOpRunPtImpl<VarBase>(op_, pt_kernel_signature_, pt_kernel_,
+                                 dev_ctx_, ins, outs, attrs, default_attrs);
   } else {
     PreparedOpRunImpl<VarBase>(op_, ctx_, kernel_type_, func_, dev_ctx_, ins,
                                outs, attrs, default_attrs);
@@ -487,7 +428,7 @@ void PreparedOp::Run(const NameVarMap<VariableWrapper>& ins,
                      const framework::AttributeMap& attrs,
                      const framework::AttributeMap& default_attrs) {
   if (run_pt_kernel_) {
-    PreparedOpRunPtImpl<VariableWrapper>(op_, pt_kernel_key_, pt_kernel_,
+    PreparedOpRunPtImpl<VariableWrapper>(op_, pt_kernel_signature_, pt_kernel_,
                                          dev_ctx_, ins, outs, attrs,
                                          default_attrs);
   } else {
diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h
index d6ea055cecff2..d1a47117f389b 100644
--- a/paddle/fluid/imperative/prepared_operator.h
+++ b/paddle/fluid/imperative/prepared_operator.h
@@ -21,11 +21,11 @@
 #include "paddle/fluid/framework/data_transform.h"
 #include "paddle/fluid/framework/op_kernel_type.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/type_defs.h"
 #include "paddle/fluid/imperative/execution_context.h"
 #include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/imperative/type_defs.h"
 
-#include "paddle/fluid/imperative/kernel_args_names_maker.h"
 #include "paddle/tcmpt/api/include/core.h"
 
 DECLARE_bool(use_mkldnn);
@@ -152,8 +152,9 @@ class PreparedOp {
 
   PreparedOp(const framework::OperatorBase& op,
              const framework::RuntimeContext& ctx,
-             const pt::KernelKey& pt_kernel_key, const pt::Kernel& pt_kernel,
-             platform::DeviceContext* dev_ctx);
+             const framework::OpKernelType& kernel_type,
+             const framework::KernelSignature& kernel_signature,
+             const pt::Kernel& pt_kernel, platform::DeviceContext* dev_ctx);
 
   static PreparedOp Prepare(const NameVarMap<VarBase>& ins,
                             const NameVarMap<VarBase>& outs,
@@ -186,10 +187,11 @@ class PreparedOp {
   framework::OpKernelType kernel_type_;
   framework::OperatorWithKernel::OpKernelFunc func_;
   platform::DeviceContext* dev_ctx_;
-  // TODo(chenweihang): Similar duplicate members are used for new tcmpt lib,
-  // maybe we have better impl methods
+  // NOTE(chenweihang): Similar op members are used to adapt to
+  // new tcmpt kernel, if there is a better design in the future,
+  // we may polish the implementation here
   bool run_pt_kernel_{false};
-  pt::KernelKey pt_kernel_key_;
+  framework::KernelSignature pt_kernel_signature_;
   pt::Kernel pt_kernel_;
 };
 
diff --git a/paddle/fluid/imperative/type_defs.h b/paddle/fluid/imperative/type_defs.h
index fdbbc586979cd..74fd152e72a57 100644
--- a/paddle/fluid/imperative/type_defs.h
+++ b/paddle/fluid/imperative/type_defs.h
@@ -20,11 +20,6 @@ limitations under the License. */
 #include <vector>
 
 namespace paddle {
-
-namespace framework {
-class Variable;
-}  // namespace framework
-
 namespace imperative {
 
 class VariableWrapper;
@@ -50,12 +45,6 @@ template <>
 struct NameVarMapTrait<VariableWrapper> {
   using Type = std::map<std::string, SavedVariableWrapperList>;
 };
-
-template <>
-struct NameVarMapTrait<paddle::framework::Variable> {
-  using Type = std::map<std::string, std::vector<paddle::framework::Variable*>>;
-};
-
 }  // namespace details
 
 template <typename T>
diff --git a/paddle/fluid/operators/fill_any_like_op.cc b/paddle/fluid/operators/fill_any_like_op.cc
index 1e908d5ead9c6..b46a1c3c89b6a 100644
--- a/paddle/fluid/operators/fill_any_like_op.cc
+++ b/paddle/fluid/operators/fill_any_like_op.cc
@@ -47,6 +47,15 @@ class FillAnyLikeOp : public framework::OperatorWithKernel {
                                    expected_kernel_type.place_,
                                    tensor.layout());
   }
+
+  framework::KernelSignature GetExpectedPtKernelArgs(
+      const framework::ExecutionContext &ctx) const override {
+    return std::make_pair(
+        "fill_any_like",
+        std::make_tuple(paddle::SmallVector<std::string>({"X"}),
+                        paddle::SmallVector<std::string>({"value"}),
+                        paddle::SmallVector<std::string>({"Out"})));
+  }
 };
 
 class FillAnyLikeOpMaker : public framework::OpProtoAndCheckerMaker {
diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc
index a195452791048..329a649a5a34d 100644
--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -70,6 +70,24 @@ class ScaleOp : public framework::OperatorWithKernel {
 #endif
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
+
+  framework::KernelSignature GetExpectedPtKernelArgs(
+      const framework::ExecutionContext &ctx) const override {
+    if (ctx.HasInput("ScaleTensor")) {
+      return std::make_pair(
+          "scale.host",
+          std::make_tuple(
+              paddle::SmallVector<std::string>({"X", "ScaleTensor"}),
+              paddle::SmallVector<std::string>({"bias", "bias_after_scale"}),
+              paddle::SmallVector<std::string>({"Out"})));
+    } else {
+      return std::make_pair(
+          "scale", std::make_tuple(paddle::SmallVector<std::string>({"X"}),
+                                   paddle::SmallVector<std::string>(
+                                       {"scale", "bias", "bias_after_scale"}),
+                                   paddle::SmallVector<std::string>({"Out"})));
+    }
+  }
 };
 
 class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
index b9c87c672df6e..c3d63f6eb2745 100644
--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -683,16 +683,16 @@ PADDLE_DEFINE_EXPORTED_bool(
 
 /**
  * Pt kernel related FLAG
- * Name: FLAGS_use_pt_kernel
+ * Name: FLAGS_run_pt_kernel
  * Since Version: 2.2.0
  * Value Range: bool, default=false
- * Example: FLAGS_use_pt_kernel=true would use the pt kernel to compute in the
+ * Example: FLAGS_run_pt_kernel=true would use the pt kernel to compute in the
  * Op.
  * Note:
  */
 // TODO(chentianyu03): change default value to false before merge into develop
 // branch
-PADDLE_DEFINE_EXPORTED_bool(use_pt_kernel, true,
+PADDLE_DEFINE_EXPORTED_bool(run_pt_kernel, true,
                             "It controls whether to use pt kernel");
 
 /**
diff --git a/paddle/tcmpt/core/convert_utils.cc b/paddle/tcmpt/core/convert_utils.cc
index d393dcf51c61b..e5b8acba19cf0 100644
--- a/paddle/tcmpt/core/convert_utils.cc
+++ b/paddle/tcmpt/core/convert_utils.cc
@@ -72,7 +72,7 @@ pt::DataType TransToPtDataType(
   }
 }
 
-DataLayout TransToPtLayout(const paddle::framework::DataLayout& layout) {
+DataLayout TransToPtDataLayout(const paddle::framework::DataLayout& layout) {
   switch (layout) {
     case paddle::framework::DataLayout::kNHWC:
       return DataLayout::kNHWC;
diff --git a/paddle/tcmpt/core/convert_utils.h b/paddle/tcmpt/core/convert_utils.h
index 9e8d85c7cfa92..a567775811349 100644
--- a/paddle/tcmpt/core/convert_utils.h
+++ b/paddle/tcmpt/core/convert_utils.h
@@ -32,7 +32,7 @@ namespace pt {
 Backend TransToPtBackend(const paddle::platform::Place& place);
 DataType TransToPtDataType(
     const paddle::framework::proto::VarType::Type& dtype);
-DataLayout TransToPtLayout(const paddle::framework::DataLayout& layout);
+DataLayout TransToPtDataLayout(const paddle::framework::DataLayout& layout);
 
 paddle::platform::Place TransToFluidPlace(const Backend& backend);
 paddle::framework::proto::VarType::Type TransToProtoVarType(
diff --git a/paddle/tcmpt/core/kernel_factory.cc b/paddle/tcmpt/core/kernel_factory.cc
index 3c6daaa776742..a301d6a995ce7 100644
--- a/paddle/tcmpt/core/kernel_factory.cc
+++ b/paddle/tcmpt/core/kernel_factory.cc
@@ -51,6 +51,11 @@ const Kernel& KernelFactory::SelectKernelOrThrowError(
                         "The kernel `%s` is not registered.", kernel_name));
 
   auto kernel_iter = iter->second.find(kernel_key);
+  if (kernel_key.layout() != pt::DataLayout::kAny) {
+    pt::KernelKey any_layout_kernel_key(
+        kernel_key.backend(), pt::DataLayout::kAny, kernel_key.dtype());
+    kernel_iter = iter->second.find(any_layout_kernel_key);
+  }
   PADDLE_ENFORCE_NE(
       kernel_iter,
       iter->second.end(),
diff --git a/paddle/tcmpt/core/kernel_registry.h b/paddle/tcmpt/core/kernel_registry.h
index 40ee968dd987c..661d387e9b8e2 100644
--- a/paddle/tcmpt/core/kernel_registry.h
+++ b/paddle/tcmpt/core/kernel_registry.h
@@ -42,6 +42,13 @@ struct KernelArgsParseFunctor<Return_ (*)(Args_...)> {
   using Arg = typename std::tuple_element<Index, Args>::type;
 
   static void Parse(const KernelKey& default_key, KernelArgsDef* args_def) {
+    // TODO(chenweihang): The fluid Tensor's default layout is NCHW,
+    // it is not same as kernel's layout, we should fix this error on
+    // fluid Tensor
+    auto default_tensor_layout = pt::DataLayout::kNCHW;
+    if (default_key.layout() != pt::DataLayout::kAny) {
+      default_tensor_layout = default_key.layout();
+    }
     auto args_type = ParseArgType(Indices{});
     for (auto arg_type : args_type) {
       if (arg_type == std::type_index(typeid(const CPUContext&))
@@ -54,10 +61,10 @@ struct KernelArgsParseFunctor<Return_ (*)(Args_...)> {
         // do nothing, skip context arg now
       } else if (arg_type == std::type_index(typeid(const DenseTensor&))) {
         args_def->AppendInput(
-            default_key.backend(), default_key.layout(), default_key.dtype());
+            default_key.backend(), default_tensor_layout, default_key.dtype());
       } else if (arg_type == std::type_index(typeid(DenseTensor*))) {
         args_def->AppendOutput(
-            default_key.backend(), default_key.layout(), default_key.dtype());
+            default_key.backend(), default_tensor_layout, default_key.dtype());
       } else {
         // Attribute deal with
         // TODO(chenweihang): now here allow any types of attribute, maybe
diff --git a/paddle/tcmpt/cpu/creation.cc b/paddle/tcmpt/cpu/creation.cc
index 8e4399c41bf17..617168d8359e3 100644
--- a/paddle/tcmpt/cpu/creation.cc
+++ b/paddle/tcmpt/cpu/creation.cc
@@ -24,7 +24,7 @@ void FillAnyLike(const CPUContext& dev_ctx,
                  const DenseTensor& x,
                  const Scalar& val,
                  DenseTensor* out) {
-  eigen::fill<CPUContext, T>(dev_ctx, out, val.to<T>());
+  eigen::fill<CPUContext, T>(dev_ctx, out, val.to<float>());
 }
 
 }  // namespace pt
@@ -33,7 +33,7 @@ PT_REGISTER_MODULE(CreationCPU);
 
 PT_REGISTER_KERNEL("fill_any_like",
                    CPU,
-                   NCHW,
+                   Any,
                    pt::FillAnyLike,
                    float,
                    double,
diff --git a/paddle/tcmpt/cpu/linalg.cc b/paddle/tcmpt/cpu/linalg.cc
index 96c1a4e937fce..7d2d8de5287d3 100644
--- a/paddle/tcmpt/cpu/linalg.cc
+++ b/paddle/tcmpt/cpu/linalg.cc
@@ -53,7 +53,7 @@ using complex128 = ::paddle::platform::complex<double>;
 
 PT_REGISTER_KERNEL("dot",
                    CPU,
-                   NCHW,
+                   Any,
                    pt::Dot,
                    float,
                    double,
diff --git a/paddle/tcmpt/cpu/manipulation.cc b/paddle/tcmpt/cpu/manipulation.cc
index d2964c5b533a9..b73c02ad8f26c 100644
--- a/paddle/tcmpt/cpu/manipulation.cc
+++ b/paddle/tcmpt/cpu/manipulation.cc
@@ -60,7 +60,7 @@ PT_REGISTER_MODULE(ManipulationCPU);
 // architecture, kernel_name should be "flatten".
 PT_REGISTER_KERNEL("flatten_contiguous_range",
                    CPU,
-                   NCHW,
+                   Any,
                    pt::Flatten,
                    float,
                    double,
@@ -71,7 +71,7 @@ PT_REGISTER_KERNEL("flatten_contiguous_range",
 
 PT_REGISTER_KERNEL("flatten_contiguous_range.mid",
                    CPU,
-                   NCHW,
+                   Any,
                    pt::FlattenWithXShape,
                    float,
                    double,
diff --git a/paddle/tcmpt/cpu/math.cc b/paddle/tcmpt/cpu/math.cc
index 80dec2530f718..47d59af29dab2 100644
--- a/paddle/tcmpt/cpu/math.cc
+++ b/paddle/tcmpt/cpu/math.cc
@@ -69,11 +69,11 @@ PT_REGISTER_MODULE(MathCPU);
 // NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16
 // using bfloat16 = ::paddle::platform::bfloat16;
 
-PT_REGISTER_KERNEL("sign", CPU, NCHW, pt::Sign, float, double) {}
-PT_REGISTER_KERNEL("mean", CPU, NCHW, pt::Mean, float, double) {}
+PT_REGISTER_KERNEL("sign", CPU, Any, pt::Sign, float, double) {}
+PT_REGISTER_KERNEL("mean", CPU, Any, pt::Mean, float, double) {}
 PT_REGISTER_KERNEL("scale",
                    CPU,
-                   NCHW,
+                   Any,
                    pt::Scale,
                    float,
                    double,
@@ -85,7 +85,7 @@ PT_REGISTER_KERNEL("scale",
                    int64_t) {}
 PT_REGISTER_KERNEL("scale.host",
                    CPU,
-                   NCHW,
+                   Any,
                    pt::ScaleHost,
                    float,
                    double,
diff --git a/paddle/tcmpt/cuda/creation.cu b/paddle/tcmpt/cuda/creation.cu
index cca9199b76cfd..2aea68d72dbd5 100644
--- a/paddle/tcmpt/cuda/creation.cu
+++ b/paddle/tcmpt/cuda/creation.cu
@@ -33,7 +33,7 @@ PT_REGISTER_MODULE(CreationCUDA);
 
 PT_REGISTER_KERNEL("fill_any_like",
                    CUDA,
-                   NCHW,
+                   Any,
                    pt::FillAnyLike,
                    float,
                    double,
diff --git a/paddle/tcmpt/cuda/linalg.cu b/paddle/tcmpt/cuda/linalg.cu
index 118d3326e5fb5..b9ad00b403278 100644
--- a/paddle/tcmpt/cuda/linalg.cu
+++ b/paddle/tcmpt/cuda/linalg.cu
@@ -39,7 +39,7 @@ using complex128 = ::paddle::platform::complex<double>;
 
 PT_REGISTER_KERNEL("dot",
                    CUDA,
-                   NCHW,
+                   Any,
                    pt::Dot,
                    float,
                    double,
diff --git a/paddle/tcmpt/cuda/manipulation.cu b/paddle/tcmpt/cuda/manipulation.cu
index 91f69b2fe33d7..80649dc79ca3f 100644
--- a/paddle/tcmpt/cuda/manipulation.cu
+++ b/paddle/tcmpt/cuda/manipulation.cu
@@ -61,7 +61,7 @@ using float16 = paddle::platform::float16;
 // architecture, kernel_name should be "flatten".
 PT_REGISTER_KERNEL("flatten_contiguous_range",
                    CUDA,
-                   NCHW,
+                   Any,
                    pt::Flatten,
                    float,
                    float16,
@@ -73,7 +73,7 @@ PT_REGISTER_KERNEL("flatten_contiguous_range",
 
 PT_REGISTER_KERNEL("flatten_contiguous_range.mid",
                    CUDA,
-                   NCHW,
+                   Any,
                    pt::FlattenWithXShape,
                    float,
                    double,
diff --git a/paddle/tcmpt/cuda/math.cu b/paddle/tcmpt/cuda/math.cu
index 293f0cf8bfc91..4b3a0b365bb44 100644
--- a/paddle/tcmpt/cuda/math.cu
+++ b/paddle/tcmpt/cuda/math.cu
@@ -121,11 +121,11 @@ void ScaleHost(const CUDAContext& dev_ctx,
 PT_REGISTER_MODULE(MathCUDA);
 
 using float16 = paddle::platform::float16;
-PT_REGISTER_KERNEL("sign", CUDA, NCHW, pt::Sign, float, double, float16) {}
-PT_REGISTER_KERNEL("mean", CUDA, NCHW, pt::Mean, float, double, float16) {}
+PT_REGISTER_KERNEL("sign", CUDA, Any, pt::Sign, float, double, float16) {}
+PT_REGISTER_KERNEL("mean", CUDA, Any, pt::Mean, float, double, float16) {}
 PT_REGISTER_KERNEL("scale",
                    CUDA,
-                   NCHW,
+                   Any,
                    pt::Scale,
                    float,
                    double,
@@ -137,7 +137,7 @@ PT_REGISTER_KERNEL("scale",
                    int64_t) {}
 PT_REGISTER_KERNEL("scale.host",
                    CUDA,
-                   NCHW,
+                   Any,
                    pt::ScaleHost,
                    float,
                    double,

From ff19bd001904ac04990d0708208478d05031ea87 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Tue, 19 Oct 2021 02:21:59 +0000
Subject: [PATCH 089/125] fix insert conflit

---
 paddle/fluid/framework/tcmpt_utils.h | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/framework/tcmpt_utils.h b/paddle/fluid/framework/tcmpt_utils.h
index 5ec5476f2b8e5..27c2c8e9b5dec 100644
--- a/paddle/fluid/framework/tcmpt_utils.h
+++ b/paddle/fluid/framework/tcmpt_utils.h
@@ -71,11 +71,9 @@ class KernelSignatureMap {
   }
 
   void Insert(const std::string& op_type, const KernelSignature& signature) {
-    PADDLE_ENFORCE_NE(
-        Has(op_type), true,
-        platform::errors::AlreadyExists(
-            "Operator (%s)'s Kernel Signature has been registered.", op_type));
-    map_.insert({op_type, signature});
+    if (!Has(op_type)) {
+      map_.insert({op_type, signature});
+    }
   }
 
   const KernelSignature* GetNullable(const std::string& op_type) const {

From 1dd01453d75ad5d0382c3ba23a0e36d3ba1ef7c6 Mon Sep 17 00:00:00 2001
From: zyfncg <1370305206@qq.com>
Date: Tue, 19 Oct 2021 10:26:42 +0800
Subject: [PATCH 090/125] Fix CI bug of test_yolov3 (#21)

* fill_any_like kernel refactor

* remove useless code of full_like c++ api

* Support Scalar in Tensor Compute Library

* add scalar in dygraph and static graph mode

* keep the basic type for attr, instead of using scalar for all

* merge the code

* start refactor matmul

* move cpu, cuda and other device modules into kernels

* merge code

* polish code in operator.cc

* Fix CI bug of test_yolov3
---
 paddle/tcmpt/core/tensor_meta.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/tcmpt/core/tensor_meta.h b/paddle/tcmpt/core/tensor_meta.h
index bd3319cf4fdad..de564a44de36e 100644
--- a/paddle/tcmpt/core/tensor_meta.h
+++ b/paddle/tcmpt/core/tensor_meta.h
@@ -95,7 +95,7 @@ struct TensorMeta {
         offset(offset),
         lod(lod) {
     int64_t init_numel = paddle::framework::product(dims);
-    if (init_numel > 0) {
+    if (init_numel >= 0) {
       numel = init_numel;
     }
   }

From b77d1eee4ab23812749dd4c275a786a15971b82a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?=
 <39303645+Shixiaowei02@users.noreply.github.com>
Date: Tue, 19 Oct 2021 16:55:03 +0800
Subject: [PATCH 091/125] add the tensor base class, test=develop (#17)

* update the tensor base class, test=develop

* remove two funcs, test=develop

* update the error msg, test=develop

Co-authored-by: Chen Weihang <chenweihang@baidu.com>
---
 paddle/fluid/framework/operator.cc            |   4 +-
 paddle/fluid/framework/tcmpt_utils.cc         |   4 +-
 paddle/fluid/framework/tcmpt_utils.h          |  10 +-
 paddle/fluid/framework/tcmpt_utils_test.cc    |   4 +-
 paddle/fluid/imperative/prepared_operator.cc  |   4 +-
 paddle/tcmpt/common/data_type.h               | 181 ++++++++
 .../tcmpt/{core/layout.cc => common/layout.h} |  26 +-
 paddle/tcmpt/core/CMakeLists.txt              |  10 +-
 paddle/tcmpt/core/allocator.cc                |  19 +
 paddle/tcmpt/core/allocator.h                 | 159 +++++++
 paddle/tcmpt/core/convert_utils.h             |   7 +-
 paddle/tcmpt/core/dense_tensor.cc             |   4 +-
 paddle/tcmpt/core/dense_tensor.h              |  21 +-
 paddle/tcmpt/core/dtype.cc                    |  73 ----
 paddle/tcmpt/core/dtype.h                     | 105 -----
 paddle/tcmpt/core/kernel_context.h            |  22 +-
 paddle/tcmpt/core/kernel_factory.h            |   7 +-
 paddle/tcmpt/core/kernel_registry.h           | 400 +++++++++---------
 paddle/tcmpt/core/layout.h                    |  43 --
 paddle/tcmpt/core/spatial_tensor.h            |   4 +-
 paddle/tcmpt/core/storage.cc                  |  27 ++
 paddle/tcmpt/core/storage.h                   |  78 ++++
 paddle/tcmpt/core/tensor_base.cc              |  20 +
 paddle/tcmpt/core/tensor_base.h               |  78 ++++
 paddle/tcmpt/core/tensor_interface.h          |  77 ----
 paddle/tcmpt/core/tensor_meta.h               |   7 +-
 paddle/tcmpt/core/tensor_status.h             |   4 +-
 paddle/tcmpt/core/utils/CMakeLists.txt        |   0
 paddle/tcmpt/core/utils/intrusive_ptr.h       | 160 +++++++
 .../tcmpt/core/utils/intrusive_ref_counter.h  |  66 +++
 paddle/tcmpt/core/utils/type_info.h           |  61 +++
 paddle/tcmpt/core/utils/type_registry.h       |  86 ++++
 paddle/tcmpt/hapi/include/creation.h          |   2 +-
 paddle/tcmpt/hapi/include/tensor.h            |  24 +-
 paddle/tcmpt/kernels/cpu/utils.cc             |   6 +-
 paddle/tcmpt/kernels/cuda/math.cu             |   2 +-
 paddle/tcmpt/kernels/cuda/utils.cu            |   6 +-
 paddle/tcmpt/tests/dense_tensor_test.cc       |   2 +-
 38 files changed, 1242 insertions(+), 571 deletions(-)
 create mode 100644 paddle/tcmpt/common/data_type.h
 rename paddle/tcmpt/{core/layout.cc => common/layout.h} (75%)
 create mode 100644 paddle/tcmpt/core/allocator.cc
 create mode 100644 paddle/tcmpt/core/allocator.h
 delete mode 100644 paddle/tcmpt/core/dtype.cc
 delete mode 100644 paddle/tcmpt/core/dtype.h
 delete mode 100644 paddle/tcmpt/core/layout.h
 create mode 100644 paddle/tcmpt/core/storage.cc
 create mode 100644 paddle/tcmpt/core/storage.h
 create mode 100644 paddle/tcmpt/core/tensor_base.cc
 create mode 100644 paddle/tcmpt/core/tensor_base.h
 delete mode 100644 paddle/tcmpt/core/tensor_interface.h
 create mode 100644 paddle/tcmpt/core/utils/CMakeLists.txt
 create mode 100644 paddle/tcmpt/core/utils/intrusive_ptr.h
 create mode 100644 paddle/tcmpt/core/utils/intrusive_ref_counter.h
 create mode 100644 paddle/tcmpt/core/utils/type_info.h
 create mode 100644 paddle/tcmpt/core/utils/type_registry.h

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 7cadf53cc5299..5a1c03327d592 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1826,7 +1826,7 @@ pt::KernelContext OperatorWithKernel::BuildPtKernelContext(
             << in_def.layout;
 
     auto ins_vector = ctx.inputs.at(input_names[i]);
-    std::vector<std::shared_ptr<pt::TensorInterface>> tmp_inputs;
+    std::vector<std::shared_ptr<tcmpt::TensorBase>> tmp_inputs;
 
     for (auto var : ins_vector) {
       auto pt_in = framework::InputVariableToPtTensor(*var, in_def);
@@ -1839,7 +1839,7 @@ pt::KernelContext OperatorWithKernel::BuildPtKernelContext(
     auto out_def = output_defs.at(i);
     auto outs_vector = ctx.outputs.at(output_names[i]);
 
-    std::vector<std::shared_ptr<pt::TensorInterface>> tmp_outputs;
+    std::vector<std::shared_ptr<tcmpt::TensorBase>> tmp_outputs;
     for (auto var : outs_vector) {
       auto pt_out = framework::OutputVariableToPtTensor(var, out_def);
       tmp_outputs.emplace_back(pt_out);
diff --git a/paddle/fluid/framework/tcmpt_utils.cc b/paddle/fluid/framework/tcmpt_utils.cc
index a39e653d0349e..fc38eb42d74c7 100644
--- a/paddle/fluid/framework/tcmpt_utils.cc
+++ b/paddle/fluid/framework/tcmpt_utils.cc
@@ -77,7 +77,7 @@ std::shared_ptr<pt::DenseTensor> MakeTensorImpl<pt::DenseTensor>(
       pt::TransToPtDataLayout(tensor.layout()));
 }
 
-std::shared_ptr<pt::TensorInterface> InputVariableToPtTensor(
+std::shared_ptr<tcmpt::TensorBase> InputVariableToPtTensor(
     const framework::Variable& variable, const pt::TensorArgDef& arg_def) {
   auto expected_place = pt::TransToFluidPlace(arg_def.backend);
 
@@ -122,7 +122,7 @@ std::shared_ptr<pt::TensorInterface> InputVariableToPtTensor(
   return nullptr;
 }
 
-std::shared_ptr<pt::TensorInterface> OutputVariableToPtTensor(
+std::shared_ptr<tcmpt::TensorBase> OutputVariableToPtTensor(
     framework::Variable* variable, const pt::TensorArgDef& arg_def) {
   // mutable_data before run kernel, to avoid share output form
   // KernelContext to original tensor
diff --git a/paddle/fluid/framework/tcmpt_utils.h b/paddle/fluid/framework/tcmpt_utils.h
index 27c2c8e9b5dec..4d08692bd9c26 100644
--- a/paddle/fluid/framework/tcmpt_utils.h
+++ b/paddle/fluid/framework/tcmpt_utils.h
@@ -49,9 +49,15 @@ std::shared_ptr<PtTensorImplT> MakeTensorImpl(const Tensor& tensor,
                                               const platform::Place& place,
                                               proto::VarType::Type type);
 
-std::shared_ptr<pt::TensorInterface> InputVariableToPtTensor(
+template <typename PtTensorImplT>
+void ShareTensorImpl(PtTensorImplT* tensor_impl, LoDTensor* out);
+
+template <typename PtTensorImplT>
+void ShareTensorImpl(PtTensorImplT* tensor_impl, Tensor* out);
+
+std::shared_ptr<tcmpt::TensorBase> InputVariableToPtTensor(
     const framework::Variable& variable, const pt::TensorArgDef& arg_def);
-std::shared_ptr<pt::TensorInterface> OutputVariableToPtTensor(
+std::shared_ptr<tcmpt::TensorBase> OutputVariableToPtTensor(
     framework::Variable* variable, const pt::TensorArgDef& arg_def);
 
 /* Kernel Key translate */
diff --git a/paddle/fluid/framework/tcmpt_utils_test.cc b/paddle/fluid/framework/tcmpt_utils_test.cc
index f1966789c1dde..200bd5429cd46 100644
--- a/paddle/fluid/framework/tcmpt_utils_test.cc
+++ b/paddle/fluid/framework/tcmpt_utils_test.cc
@@ -38,7 +38,7 @@ TEST(TcmptUtils, MakeTensor) {
   ASSERT_EQ(dense_x->data<float>()[0], expect_value[0]);
   ASSERT_EQ(dense_x->data<float>()[1], expect_value[1]);
   ASSERT_EQ(dense_x->backend(), pt::Backend::kCPU);
-  ASSERT_EQ(dense_x->type(), pt::DataType::kFLOAT32);
+  ASSERT_EQ(dense_x->data_type(), pt::DataType::kFLOAT32);
 }
 
 TEST(TcmptUtils, VarToPtTensor) {
@@ -60,7 +60,7 @@ TEST(TcmptUtils, VarToPtTensor) {
   auto tensor_x = InputVariableToPtTensor(v, tensor_def);
   // 3. check result
   ASSERT_EQ(tensor_x->backend(), expect_backend);
-  ASSERT_EQ(tensor_x->type(), pt::DataType::kINT32);
+  ASSERT_EQ(tensor_x->data_type(), pt::DataType::kINT32);
 }
 
 }  // namespace framework
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 87e7e754e3ee8..f65b799e150fc 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -288,7 +288,7 @@ static pt::KernelContext BuildDygraphPtKernelContext(
     auto& in_def = input_defs.at(i);
     auto& ins_vector = ins.at(input_names[i]);
 
-    std::vector<std::shared_ptr<pt::TensorInterface>> tmp_inputs;
+    std::vector<std::shared_ptr<tcmpt::TensorBase>> tmp_inputs;
     for (auto var : ins_vector) {
       const auto& variable = var->Var();
 
@@ -302,7 +302,7 @@ static pt::KernelContext BuildDygraphPtKernelContext(
     auto& out_def = output_defs.at(i);
     auto& outs_vector = outs.at(output_names[i]);
 
-    std::vector<std::shared_ptr<pt::TensorInterface>> tmp_outputs;
+    std::vector<std::shared_ptr<tcmpt::TensorBase>> tmp_outputs;
     for (auto var : outs_vector) {
       auto* variable = var->MutableVar();
 
diff --git a/paddle/tcmpt/common/data_type.h b/paddle/tcmpt/common/data_type.h
new file mode 100644
index 0000000000000..03881e6bda1ca
--- /dev/null
+++ b/paddle/tcmpt/common/data_type.h
@@ -0,0 +1,181 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/bfloat16.h"
+#include "paddle/fluid/platform/complex.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/errors.h"
+#include "paddle/fluid/platform/float16.h"
+
+namespace paddle {
+namespace experimental {
+
+using complex64 = ::paddle::platform::complex<float>;
+using complex128 = ::paddle::platform::complex<double>;
+using float16 = ::paddle::platform::float16;
+using bfloat16 = ::paddle::platform::bfloat16;
+
+enum class DataType {
+  kUndef = 0,
+  kBOOL,
+  kINT8,   // Char
+  kUINT8,  // BYte
+  kINT16,
+  kINT32,
+  kUINT32,
+  kINT64,
+  kUINT64,
+  kBFLOAT16,
+  kFLOAT16,
+  kUINT16,
+  kFLOAT32,
+  kFLOAT64,
+  kCOMPLEX64,
+  kCOMPLEX128,
+  kNumDataTypes
+};
+
+inline size_t SizeOf(DataType data_type) {
+  switch (data_type) {
+    case DataType::kBOOL:
+    case DataType::kUINT8:
+    case DataType::kINT8:
+      return 1;
+    case DataType::kFLOAT16:
+    case DataType::kINT16:
+    case DataType::kUINT16:
+      return 2;
+    case DataType::kFLOAT32:
+    case DataType::kINT32:
+    case DataType::kUINT32:
+      return 4;
+    case DataType::kFLOAT64:
+    case DataType::kINT64:
+    case DataType::kUINT64:
+      return 8;
+    case DataType::kUndef:
+    case DataType::kBFLOAT16:
+    case DataType::kCOMPLEX64:
+    case DataType::kCOMPLEX128:
+    case DataType::kNumDataTypes:
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Data type %d is not supported by tensor.",
+          static_cast<int>(data_type)));
+      return 0;
+  }
+}
+
+#define PT_FOR_EACH_DATA_TYPE(_)     \
+  _(bool, DataType::kBOOL)           \
+  _(int8_t, DataType::kINT8)         \
+  _(uint8_t, DataType::kUINT8)       \
+  _(int16_t, DataType::kINT16)       \
+  _(int, DataType::kINT32)           \
+  _(int64_t, DataType::kINT64)       \
+  _(bfloat16, DataType::kBFLOAT16)   \
+  _(float16, DataType::kFLOAT16)     \
+  _(float, DataType::kFLOAT32)       \
+  _(double, DataType::kFLOAT64)      \
+  _(complex64, DataType::kCOMPLEX64) \
+  _(complex128, DataType::kCOMPLEX128)
+
+template <DataType T>
+struct DataTypeToCppType;
+
+template <typename T>
+struct CppTypeToDataType;
+
+#define PT_SPECIALIZE_DataTypeToCppType(cpp_type, data_type) \
+  template <>                                                \
+  struct DataTypeToCppType<data_type> {                      \
+    using type = cpp_type;                                   \
+  };
+
+PT_FOR_EACH_DATA_TYPE(PT_SPECIALIZE_DataTypeToCppType)
+
+#undef PT_SPECIALIZE_DataTypeToCppType
+
+#define PT_SPECIALIZE_CppTypeToDataType(cpp_type, data_type) \
+  template <>                                                \
+  struct CppTypeToDataType<cpp_type> {                       \
+    constexpr static DataType Type() { return data_type; }   \
+  };
+
+PT_FOR_EACH_DATA_TYPE(PT_SPECIALIZE_CppTypeToDataType)
+
+#undef PT_SPECIALIZE_CppTypeToDataType
+
+inline std::ostream& operator<<(std::ostream& os, DataType dtype) {
+  switch (dtype) {
+    case DataType::kUndef:
+      os << "Undefined";
+      break;
+    case DataType::kBOOL:
+      os << "bool";
+      break;
+    case DataType::kINT8:
+      os << "int8";
+      break;
+    case DataType::kUINT8:
+      os << "uint8";
+      break;
+    case DataType::kINT16:
+      os << "int16";
+      break;
+    case DataType::kINT32:
+      os << "int32";
+      break;
+    case DataType::kINT64:
+      os << "int64";
+      break;
+    case DataType::kBFLOAT16:
+      os << "bfloat16";
+      break;
+    case DataType::kFLOAT16:
+      os << "float16";
+      break;
+    case DataType::kFLOAT32:
+      os << "float32";
+      break;
+    case DataType::kFLOAT64:
+      os << "float64";
+      break;
+    case DataType::kCOMPLEX64:
+      os << "complex64";
+      break;
+    case DataType::kCOMPLEX128:
+      os << "complex128";
+      break;
+    default:
+      // TODO(chenweihang): change to enforce later
+      throw std::runtime_error("Invalid DataType type.");
+  }
+  return os;
+}
+
+inline DataType& operator++(DataType& dtype, int) {
+  dtype =
+      DataType(static_cast<std::underlying_type<DataType>::type>(dtype) + 1);
+  return dtype;
+}
+
+}  // namespace experimental
+}  // namespace paddle
+
+namespace pt {
+using DataType = paddle::experimental::DataType;
+}
diff --git a/paddle/tcmpt/core/layout.cc b/paddle/tcmpt/common/layout.h
similarity index 75%
rename from paddle/tcmpt/core/layout.cc
rename to paddle/tcmpt/common/layout.h
index 4f4fd972516da..ae4e43a9f7197 100644
--- a/paddle/tcmpt/core/layout.cc
+++ b/paddle/tcmpt/common/layout.h
@@ -12,11 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/tcmpt/core/layout.h"
+#pragma once
 
-namespace pt {
+namespace paddle {
+namespace experimental {
+
+enum class DataLayout {
+  kUndef = 0,
+  kAny,
+  kNHWC,
+  kNCHW,
+  kMKLDNN,
+  kNumLayouts,
+};
 
-std::ostream& operator<<(std::ostream& os, DataLayout dtype) {
+inline std::ostream& operator<<(std::ostream& os, DataLayout dtype) {
   switch (dtype) {
     case DataLayout::kUndef:
       os << "Undefined";
@@ -40,9 +50,15 @@ std::ostream& operator<<(std::ostream& os, DataLayout dtype) {
   return os;
 }
 
-DataLayout& operator++(DataLayout& layout, int) {
+inline DataLayout& operator++(DataLayout& layout, int) {
   layout = DataLayout(
       static_cast<std::underlying_type<DataLayout>::type>(layout) + 1);
   return layout;
 }
-}  // namespace pt
+
+}  // namespace experimental
+}  // namespace paddle
+
+namespace pt {
+using DataLayout = paddle::experimental::DataLayout;
+}
diff --git a/paddle/tcmpt/core/CMakeLists.txt b/paddle/tcmpt/core/CMakeLists.txt
index 5eadf3db39a64..88573c729c3f2 100644
--- a/paddle/tcmpt/core/CMakeLists.txt
+++ b/paddle/tcmpt/core/CMakeLists.txt
@@ -5,17 +5,15 @@ ELSE()
 ENDIF()
 
 cc_library(backend SRCS backend.cc)
-cc_library(dtype SRCS dtype.cc)
-cc_library(layout SRCS layout.cc)
 
 if(WITH_GPU)
-    cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place backend dtype layout gpu_info)
+    cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place backend gpu_info)
 elseif(WITH_ROCM)
-    cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place backend dtype layout gpu_info)
+    cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place backend gpu_info)
 else()
-    cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place backend dtype layout)
+    cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place backend)
 endif()
 cc_library(dense_tensor SRCS dense_tensor.cc DEPS enforce data_type ddim allocator place convert_utils ${MKLDNN_CTX_DEPS})
 
-cc_library(kernel_factory SRCS kernel_factory.cc DEPS enforce backend dtype layout)
+cc_library(kernel_factory SRCS kernel_factory.cc DEPS enforce backend)
 cc_library(kernel_context SRCS kernel_context.cc DEPS enforce device_context)
diff --git a/paddle/tcmpt/core/allocator.cc b/paddle/tcmpt/core/allocator.cc
new file mode 100644
index 0000000000000..da1576f81ad71
--- /dev/null
+++ b/paddle/tcmpt/core/allocator.cc
@@ -0,0 +1,19 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/tcmpt/core/allocator.h"
+
+namespace paddle {
+namespace tcmpt {}  // namespace tcmpt
+}  // namespace paddle
diff --git a/paddle/tcmpt/core/allocator.h b/paddle/tcmpt/core/allocator.h
new file mode 100644
index 0000000000000..592f7a4078f80
--- /dev/null
+++ b/paddle/tcmpt/core/allocator.h
@@ -0,0 +1,159 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cstdint>
+#include "paddle/fluid/platform/place.h"
+
+namespace paddle {
+namespace tcmpt {
+
+/// \brief Encapsulates strategies for access/addressing, allocation/
+/// deallocation and construction/destruction of objects.
+class RawAllocator {
+ public:
+  /// \brief Default destructor.
+  virtual ~RawAllocator() = default;
+
+  /// \brief Allocates storage suitable for an array object of n bytes
+  /// and creates the array, but does not construct array elements.
+  /// May throw exceptions.
+  /// \param bytes_size The number of bytes to allocate.
+  /// \return The first address allocated.
+  virtual void* Allocate(size_t bytes_size) = 0;
+
+  /// \brief Deallocates storage pointed to ptr, which must be a value
+  /// returned by a previous call to allocate that has not been
+  /// invalidated by an intervening call to deallocate. The bytes_size
+  /// must match the value previously passed to allocate.
+  /// \param ptr The first address to deallocate.
+  /// \param bytes_size The number of bytes to deallocate.
+  virtual void Deallocate(void* ptr, size_t bytes_size) = 0;
+
+  /// \brief Get the place value of the allocator and the allocation.
+  /// \return The place value of the allocator and the allocation.
+  virtual const platform::Place& place() const = 0;
+};
+
+/// \brief Fancy pointer with context. The use of this data type
+/// is to be compatible with allocators from different frameworks
+/// without significant performance loss. This class does not
+/// support being inherited.
+class Allocation final {
+ public:
+  using DeleterFnPtr = void (*)(void*);
+
+  Allocation() = default;
+  Allocation(Allocation&&) = default;
+  Allocation& operator=(Allocation&&) = default;
+
+  Allocation(void* data, const platform::Place& place)
+      : data_(data), place_(place) {}
+
+  Allocation(void* data,
+             void* ctx,
+             DeleterFnPtr ctx_deleter,
+             const platform::Place& place)
+      : data_(data), ctx_(ctx, ctx_deleter), place_(place) {}
+
+  void* operator->() const noexcept { return data_; }
+  operator bool() const noexcept { return data_ || ctx_.Get(); }
+  const platform::Place& place() const noexcept { return place_; }
+
+  void Clear() noexcept {
+    data_ = nullptr;
+    ctx_.Clear();
+  }
+
+  /// \brief Statically cast the void pointer of the context object to
+  /// the primitive type. Conversion of any pointer to void* and back
+  /// to pointer to the original cv type preserves its original value.
+  /// \param T The primitive type name of the context pointer.
+  /// \param expected_deleter The destructor passed in to enhance type
+  /// safety checking.
+  template <typename T>
+  T* CastContext(DeleterFnPtr expected_deleter) const noexcept {
+    if (ctx_.deleter() != expected_deleter) {
+      return nullptr;
+    }
+    return static_cast<T*>(ctx_.Get());
+  }
+
+ public:
+  class Context {
+   public:
+    Context() = default;
+    Context(void* ctx, DeleterFnPtr deleter) noexcept : ctx_(ctx),
+                                                        deleter_(deleter) {}
+    Context(Context&& other) noexcept {
+      // Exchange them explicitly to avoid moving is equivalent
+      // to copying.
+      swap(*this, other);
+    }
+    Context& operator=(Context&& other) noexcept {
+      swap(*this, other);
+      return *this;
+    }
+    ~Context() {
+      if (deleter_) {
+        deleter_(ctx_);
+      }
+    }
+    void Clear() noexcept {
+      ctx_ = nullptr;
+      deleter_ = nullptr;
+    }
+    void* Get() const noexcept { return ctx_; }
+    DeleterFnPtr deleter() const noexcept { return deleter_; }
+    void* Release() noexcept {
+      deleter_ = nullptr;
+      return ctx_;
+    }
+    friend void swap(Context& a, Context& b) noexcept;
+
+   private:
+    void* ctx_{nullptr};
+    DeleterFnPtr deleter_{nullptr};
+  };
+
+ private:
+  void* data_{nullptr};
+  Context ctx_;
+  // TODO(Shixiaowei02): Enum needs to be used instead to reduce
+  // the construction overhead by more than 50%.
+  platform::Place place_;
+};
+
+inline void swap(Allocation::Context& a, Allocation::Context& b) noexcept {
+  ::std::swap(a.ctx_, b.ctx_);
+  ::std::swap(a.deleter_, b.deleter_);
+}
+
+/// \brief Context compatible allocator interface. This allocator is
+/// mainly used for general data structures such as Tensor. The raw
+/// allocator is more universal and efficient.
+class Allocator {
+ public:
+  virtual ~Allocator() = default;
+  virtual Allocation Allocate(size_t bytes_size) = 0;
+};
+
+inline Allocation Allocate(const std::shared_ptr<Allocator>& a, size_t n) {
+  CHECK(a);
+  return a->Allocate(n);
+}
+
+}  // namespace tcmpt
+}  // namespace paddle
diff --git a/paddle/tcmpt/core/convert_utils.h b/paddle/tcmpt/core/convert_utils.h
index a567775811349..011652bdc9572 100644
--- a/paddle/tcmpt/core/convert_utils.h
+++ b/paddle/tcmpt/core/convert_utils.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/tcmpt/common/data_type.h"
+#include "paddle/tcmpt/common/layout.h"
 #include "paddle/tcmpt/core/backend.h"
-#include "paddle/tcmpt/core/dtype.h"
-#include "paddle/tcmpt/core/layout.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/data_layout.h"
@@ -27,6 +27,9 @@ limitations under the License. */
 
 namespace pt {
 
+using DataType = paddle::experimental::DataType;
+using DataLayout = paddle::experimental::DataLayout;
+
 // TODO(chenweihang): Use the original var type as much as possible
 // to avoid transform, such as DataLayout, VarType
 Backend TransToPtBackend(const paddle::platform::Place& place);
diff --git a/paddle/tcmpt/core/dense_tensor.cc b/paddle/tcmpt/core/dense_tensor.cc
index 921f0ee8d9102..9c34b5823d590 100644
--- a/paddle/tcmpt/core/dense_tensor.cc
+++ b/paddle/tcmpt/core/dense_tensor.cc
@@ -31,7 +31,7 @@ using XPUPlace = paddle::platform::XPUPlace;
 using NPUPlace = paddle::platform::NPUPlace;
 using NPUPinnedPlace = paddle::platform::NPUPinnedPlace;
 
-Place DenseTensor::place() const {
+const paddle::platform::Place& DenseTensor::place() const {
   PADDLE_ENFORCE_NOT_NULL(
       allocation_,
       paddle::platform::errors::PreconditionNotMet(
@@ -52,7 +52,7 @@ void DenseTensor::ShareAllocation(
 }
 
 // TODO(chenweihang): Add other place branchs
-Place DenseTensor::GetPlaceByBackend() const {
+paddle::platform::Place DenseTensor::GetPlaceByBackend() const {
   switch (meta_.backend) {
     case Backend::kCPU:
       return CPUPlace();
diff --git a/paddle/tcmpt/core/dense_tensor.h b/paddle/tcmpt/core/dense_tensor.h
index d7853e7cba201..a0d195b740bed 100644
--- a/paddle/tcmpt/core/dense_tensor.h
+++ b/paddle/tcmpt/core/dense_tensor.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include <memory>
 
-#include "paddle/tcmpt/core/tensor_interface.h"
+#include "paddle/tcmpt/core/tensor_base.h"
 #include "paddle/tcmpt/core/tensor_meta.h"
 #include "paddle/tcmpt/core/tensor_status.h"
 
@@ -30,6 +30,9 @@ class Allocation;
 
 namespace pt {
 
+using TensorBase = paddle::tcmpt::TensorBase;
+using DataType = paddle::experimental::DataType;
+
 // TODO(chenweihang): Allocation still link to framework, Redesign and
 // decoupled Allocation and Allocator?
 using Allocation = paddle::memory::allocation::Allocation;
@@ -47,9 +50,9 @@ using Allocation = paddle::memory::allocation::Allocation;
  *
  * If the memory layout is different, it cannot be described based on the
  * general Allocation, and it needs to be directly inherited from
- * TensorInterface.
+ * TensorBase.
  */
-class DenseTensor : public TensorInterface {
+class DenseTensor : public TensorBase {
  public:
   // Not allowed to initialize a tensor without descriptive metadata
   DenseTensor() = delete;
@@ -71,20 +74,20 @@ class DenseTensor : public TensorInterface {
   DenseTensor(TensorMeta&& meta, TensorStatus&& status)
       : meta_(std::move(meta)), status_(std::move(status)) {}
 
-  ~DenseTensor() override {}
-
   int64_t numel() const override { return meta_.numel; }
 
-  DDim dims() const override { return meta_.dims; }
+  const paddle::framework::DDim& dims() const override { return meta_.dims; }
 
-  DataType type() const override { return meta_.type; }
+  DataType data_type() const override { return meta_.type; }
 
   DataLayout layout() const override { return meta_.layout; }
 
-  Place place() const override;
+  const paddle::platform::Place& place() const override;
 
   Backend backend() const override { return meta_.backend; }
 
+  bool valid() const override { return allocation_ != nullptr; }
+
   bool initialized() const override { return allocation_ != nullptr; }
 
   /* member methods */
@@ -130,7 +133,7 @@ class DenseTensor : public TensorInterface {
 
   void ShareAllocation(const std::shared_ptr<Allocation>& allocation);
 
-  Place GetPlaceByBackend() const;
+  paddle::platform::Place GetPlaceByBackend() const;
 
   size_t MemorySize() const;
 
diff --git a/paddle/tcmpt/core/dtype.cc b/paddle/tcmpt/core/dtype.cc
deleted file mode 100644
index c9fefc6a69080..0000000000000
--- a/paddle/tcmpt/core/dtype.cc
+++ /dev/null
@@ -1,73 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/tcmpt/core/dtype.h"
-
-namespace pt {
-
-std::ostream& operator<<(std::ostream& os, DataType dtype) {
-  switch (dtype) {
-    case DataType::kUndef:
-      os << "Undefined";
-      break;
-    case DataType::kBOOL:
-      os << "bool";
-      break;
-    case DataType::kINT8:
-      os << "int8";
-      break;
-    case DataType::kUINT8:
-      os << "uint8";
-      break;
-    case DataType::kINT16:
-      os << "int16";
-      break;
-    case DataType::kINT32:
-      os << "int32";
-      break;
-    case DataType::kINT64:
-      os << "int64";
-      break;
-    case DataType::kBFLOAT16:
-      os << "bfloat16";
-      break;
-    case DataType::kFLOAT16:
-      os << "float16";
-      break;
-    case DataType::kFLOAT32:
-      os << "float32";
-      break;
-    case DataType::kFLOAT64:
-      os << "float64";
-      break;
-    case DataType::kCOMPLEX64:
-      os << "complex64";
-      break;
-    case DataType::kCOMPLEX128:
-      os << "complex128";
-      break;
-    default:
-      // TODO(chenweihang): change to enforce later
-      throw std::runtime_error("Invalid DataType type.");
-  }
-  return os;
-}
-
-DataType& operator++(DataType& dtype, int) {
-  dtype =
-      DataType(static_cast<std::underlying_type<DataType>::type>(dtype) + 1);
-  return dtype;
-}
-
-}  // namespace pt
diff --git a/paddle/tcmpt/core/dtype.h b/paddle/tcmpt/core/dtype.h
deleted file mode 100644
index 1b5c1b8037a21..0000000000000
--- a/paddle/tcmpt/core/dtype.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <ostream>
-
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/platform/bfloat16.h"
-#include "paddle/fluid/platform/complex.h"
-#include "paddle/fluid/platform/float16.h"
-
-namespace pt {
-
-using complex64 = paddle::platform::complex<float>;
-using complex128 = paddle::platform::complex<double>;
-using float16 = paddle::platform::float16;
-using bfloat16 = paddle::platform::bfloat16;
-
-/**
- * [ Why need new data type? ]
- *
- * The Var data type design in framework.proto is confusing, maybe we need
- * polish the VarType in framework.proto.
- *
- * We need to ensure that the operator library is relatively independent
- * and does not depend on the framework. Therefore, before calling the kernel
- * in the Tensor Compute library inside the framework, the internal
- * data type needs to be converted to the data type in the Tensor Compute
- * library.
- *
- */
-enum class DataType {
-  kUndef = 0,
-  kBOOL,
-  kINT8,   // Char
-  kUINT8,  // BYte
-  kINT16,
-  kINT32,
-  kINT64,
-  kBFLOAT16,
-  kFLOAT16,
-  kFLOAT32,
-  kFLOAT64,
-  kCOMPLEX64,
-  kCOMPLEX128,
-  kNumDataTypes
-};
-
-std::ostream& operator<<(std::ostream& os, DataType dtype);
-
-DataType& operator++(DataType& dtype, int);
-
-#define PT_FOR_EACH_DATA_TYPE(_)     \
-  _(bool, DataType::kBOOL)           \
-  _(int8_t, DataType::kINT8)         \
-  _(uint8_t, DataType::kUINT8)       \
-  _(int16_t, DataType::kINT16)       \
-  _(int, DataType::kINT32)           \
-  _(int64_t, DataType::kINT64)       \
-  _(bfloat16, DataType::kBFLOAT16)   \
-  _(float16, DataType::kFLOAT16)     \
-  _(float, DataType::kFLOAT32)       \
-  _(double, DataType::kFLOAT64)      \
-  _(complex64, DataType::kCOMPLEX64) \
-  _(complex128, DataType::kCOMPLEX128)
-
-template <pt::DataType T>
-struct DataTypeToCppType;
-
-template <typename T>
-struct CppTypeToDataType;
-
-#define PT_SPECIALIZE_DataTypeToCppType(cpp_type, data_type) \
-  template <>                                                \
-  struct DataTypeToCppType<data_type> {                      \
-    using type = cpp_type;                                   \
-  };
-
-PT_FOR_EACH_DATA_TYPE(PT_SPECIALIZE_DataTypeToCppType)
-
-#undef PT_SPECIALIZE_DataTypeToCppType
-
-#define PT_SPECIALIZE_CppTypeToDataType(cpp_type, data_type) \
-  template <>                                                \
-  struct CppTypeToDataType<cpp_type> {                       \
-    constexpr static DataType Type() { return data_type; }   \
-  };
-
-PT_FOR_EACH_DATA_TYPE(PT_SPECIALIZE_CppTypeToDataType)
-
-#undef PT_SPECIALIZE_CppTypeToDataType
-
-}  // namespace pt
diff --git a/paddle/tcmpt/core/kernel_context.h b/paddle/tcmpt/core/kernel_context.h
index 057cbc11689f1..022d8a6713155 100644
--- a/paddle/tcmpt/core/kernel_context.h
+++ b/paddle/tcmpt/core/kernel_context.h
@@ -16,7 +16,7 @@
 
 #include <utility>
 
-#include "paddle/tcmpt/core/tensor_interface.h"
+#include "paddle/tcmpt/core/tensor_base.h"
 #include "paddle/utils/any.h"
 
 // See Note [ Why still include the fluid headers? ]
@@ -26,6 +26,9 @@
 namespace pt {
 
 using DeviceContext = paddle::platform::DeviceContext;
+using TensorBase = paddle::tcmpt::TensorBase;
+using DataType = paddle::experimental::DataType;
+using DataLayout = paddle::experimental::DataLayout;
 
 /**
  * Note: KernelContext doesn't manage the life if DeviceContext and Tensor
@@ -38,8 +41,8 @@ class KernelContext {
  public:
   explicit KernelContext(const DeviceContext& dev_ctx) : dev_ctx_(dev_ctx) {}
   KernelContext(const DeviceContext& dev_ctx,
-                const std::vector<std::shared_ptr<TensorInterface>>& inputs,
-                const std::vector<std::shared_ptr<TensorInterface>>& outputs,
+                const std::vector<std::shared_ptr<TensorBase>>& inputs,
+                const std::vector<std::shared_ptr<TensorBase>>& outputs,
                 const std::vector<paddle::any>& attrs)
       : dev_ctx_(dev_ctx), inputs_(inputs), outputs_(outputs), attrs_(attrs) {}
 
@@ -48,14 +51,14 @@ class KernelContext {
     return static_cast<const CtxType&>(dev_ctx_);
   }
 
-  void EmplaceBackInput(std::shared_ptr<TensorInterface> input) {
+  void EmplaceBackInput(std::shared_ptr<TensorBase> input) {
     inputs_.emplace_back(input);
     // Record the start and end index of the input
     int index = inputs_.size();
     input_range_.emplace_back(std::pair<int, int>(index, index + 1));
   }
 
-  void EmplaceBackInputs(std::vector<std::shared_ptr<TensorInterface>> inputs) {
+  void EmplaceBackInputs(std::vector<std::shared_ptr<TensorBase>> inputs) {
     for (auto in : inputs) {
       inputs_.emplace_back(in);
     }
@@ -65,15 +68,14 @@ class KernelContext {
         std::pair<int, int>(index, index + inputs.size()));
   }
 
-  void EmplaceBackOutput(std::shared_ptr<TensorInterface> output) {
+  void EmplaceBackOutput(std::shared_ptr<TensorBase> output) {
     outputs_.emplace_back(output);
     // Record the start and end index of the input
     int index = outputs_.size();
     output_range_.emplace_back(std::pair<int, int>(index, index + 1));
   }
 
-  void EmplaceBackOutputs(
-      std::vector<std::shared_ptr<TensorInterface>> outputs) {
+  void EmplaceBackOutputs(std::vector<std::shared_ptr<TensorBase>> outputs) {
     for (auto out : outputs) {
       outputs_.emplace_back(out);
     }
@@ -115,8 +117,8 @@ class KernelContext {
   // TODO(chenweihang): replaced by small_vector
   // TODO(chenweihang): Tensor -> Tensor*, Tensor should by managed `scope`
   // Note: can't use API Tensor here, the inference don't use this API Tensor
-  std::vector<std::shared_ptr<TensorInterface>> inputs_{};
-  std::vector<std::shared_ptr<TensorInterface>> outputs_{};
+  std::vector<std::shared_ptr<TensorBase>> inputs_{};
+  std::vector<std::shared_ptr<TensorBase>> outputs_{};
   std::vector<paddle::any> attrs_{};
 
   // Only contains input like list[Tensor] need `range`
diff --git a/paddle/tcmpt/core/kernel_factory.h b/paddle/tcmpt/core/kernel_factory.h
index 5978264c9ef26..6e4a3fa86dfda 100644
--- a/paddle/tcmpt/core/kernel_factory.h
+++ b/paddle/tcmpt/core/kernel_factory.h
@@ -19,10 +19,10 @@
 #include <unordered_map>
 #include <utility>
 
+#include "paddle/tcmpt/common/data_type.h"
+#include "paddle/tcmpt/common/layout.h"
 #include "paddle/tcmpt/core/backend.h"
-#include "paddle/tcmpt/core/dtype.h"
 #include "paddle/tcmpt/core/kernel_def.h"
-#include "paddle/tcmpt/core/layout.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/enforce.h"
@@ -31,6 +31,9 @@
 
 namespace pt {
 
+using DataType = paddle::experimental::DataType;
+using DataLayout = paddle::experimental::DataLayout;
+
 /**
  * [ Naming considerations ]
  *
diff --git a/paddle/tcmpt/core/kernel_registry.h b/paddle/tcmpt/core/kernel_registry.h
index 661d387e9b8e2..caa42546ab054 100644
--- a/paddle/tcmpt/core/kernel_registry.h
+++ b/paddle/tcmpt/core/kernel_registry.h
@@ -336,213 +336,213 @@ struct KernelRegistrar {
 
 // clang-format on
 
-#define _PT_KERNEL_REGISTRAR_INIT_1(kernel_name,      \
-                                    func_id,          \
-                                    registrar_id,     \
-                                    backend,          \
-                                    layout,           \
-                                    args_def_fn,      \
-                                    meta_kernel_fn,   \
-                                    cpp_dtype,        \
-                                    ...)              \
-  static const ::pt::KernelRegistrar PT_CONCATENATE(  \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)( \
-      kernel_name,                                    \
-      BACKEND(backend),                               \
-      DATALAYOUT(layout),                             \
-      ::pt::CppTypeToDataType<cpp_dtype>::Type(),     \
-      ::pt::KernelArgsParseFunctor<decltype(          \
-          &meta_kernel_fn<cpp_dtype>)>::Parse,        \
-      args_def_fn,                                    \
+#define _PT_KERNEL_REGISTRAR_INIT_1(kernel_name,                    \
+                                    func_id,                        \
+                                    registrar_id,                   \
+                                    backend,                        \
+                                    layout,                         \
+                                    args_def_fn,                    \
+                                    meta_kernel_fn,                 \
+                                    cpp_dtype,                      \
+                                    ...)                            \
+  static const ::pt::KernelRegistrar PT_CONCATENATE(                \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(               \
+      kernel_name,                                                  \
+      BACKEND(backend),                                             \
+      DATALAYOUT(layout),                                           \
+      ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
+      ::pt::KernelArgsParseFunctor<decltype(                        \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,                      \
+      args_def_fn,                                                  \
       PT_KERNEL(meta_kernel_fn<cpp_dtype>));
-#define _PT_KERNEL_REGISTRAR_INIT_2(kernel_name,        \
-                                    func_id,            \
-                                    registrar_id,       \
-                                    backend,            \
-                                    layout,             \
-                                    args_def_fn,        \
-                                    meta_kernel_fn,     \
-                                    cpp_dtype,          \
-                                    ...)                \
-  static const ::pt::KernelRegistrar PT_CONCATENATE(    \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)(   \
-      kernel_name,                                      \
-      BACKEND(backend),                                 \
-      DATALAYOUT(layout),                               \
-      ::pt::CppTypeToDataType<cpp_dtype>::Type(),       \
-      ::pt::KernelArgsParseFunctor<decltype(            \
-          &meta_kernel_fn<cpp_dtype>)>::Parse,          \
-      args_def_fn,                                      \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype>));            \
-  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_1(kernel_name,    \
-                                        func_id,        \
-                                        PT_ID,          \
-                                        backend,        \
-                                        layout,         \
-                                        args_def_fn,    \
-                                        meta_kernel_fn, \
+#define _PT_KERNEL_REGISTRAR_INIT_2(kernel_name,                    \
+                                    func_id,                        \
+                                    registrar_id,                   \
+                                    backend,                        \
+                                    layout,                         \
+                                    args_def_fn,                    \
+                                    meta_kernel_fn,                 \
+                                    cpp_dtype,                      \
+                                    ...)                            \
+  static const ::pt::KernelRegistrar PT_CONCATENATE(                \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(               \
+      kernel_name,                                                  \
+      BACKEND(backend),                                             \
+      DATALAYOUT(layout),                                           \
+      ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
+      ::pt::KernelArgsParseFunctor<decltype(                        \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,                      \
+      args_def_fn,                                                  \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));                        \
+  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_1(kernel_name,                \
+                                        func_id,                    \
+                                        PT_ID,                      \
+                                        backend,                    \
+                                        layout,                     \
+                                        args_def_fn,                \
+                                        meta_kernel_fn,             \
                                         __VA_ARGS__))
-#define _PT_KERNEL_REGISTRAR_INIT_3(kernel_name,        \
-                                    func_id,            \
-                                    registrar_id,       \
-                                    backend,            \
-                                    layout,             \
-                                    args_def_fn,        \
-                                    meta_kernel_fn,     \
-                                    cpp_dtype,          \
-                                    ...)                \
-  static const ::pt::KernelRegistrar PT_CONCATENATE(    \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)(   \
-      kernel_name,                                      \
-      BACKEND(backend),                                 \
-      DATALAYOUT(layout),                               \
-      ::pt::CppTypeToDataType<cpp_dtype>::Type(),       \
-      ::pt::KernelArgsParseFunctor<decltype(            \
-          &meta_kernel_fn<cpp_dtype>)>::Parse,          \
-      args_def_fn,                                      \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype>));            \
-  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_2(kernel_name,    \
-                                        func_id,        \
-                                        PT_ID,          \
-                                        backend,        \
-                                        layout,         \
-                                        args_def_fn,    \
-                                        meta_kernel_fn, \
+#define _PT_KERNEL_REGISTRAR_INIT_3(kernel_name,                    \
+                                    func_id,                        \
+                                    registrar_id,                   \
+                                    backend,                        \
+                                    layout,                         \
+                                    args_def_fn,                    \
+                                    meta_kernel_fn,                 \
+                                    cpp_dtype,                      \
+                                    ...)                            \
+  static const ::pt::KernelRegistrar PT_CONCATENATE(                \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(               \
+      kernel_name,                                                  \
+      BACKEND(backend),                                             \
+      DATALAYOUT(layout),                                           \
+      ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
+      ::pt::KernelArgsParseFunctor<decltype(                        \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,                      \
+      args_def_fn,                                                  \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));                        \
+  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_2(kernel_name,                \
+                                        func_id,                    \
+                                        PT_ID,                      \
+                                        backend,                    \
+                                        layout,                     \
+                                        args_def_fn,                \
+                                        meta_kernel_fn,             \
                                         __VA_ARGS__))
-#define _PT_KERNEL_REGISTRAR_INIT_4(kernel_name,        \
-                                    func_id,            \
-                                    registrar_id,       \
-                                    backend,            \
-                                    layout,             \
-                                    args_def_fn,        \
-                                    meta_kernel_fn,     \
-                                    cpp_dtype,          \
-                                    ...)                \
-  static const ::pt::KernelRegistrar PT_CONCATENATE(    \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)(   \
-      kernel_name,                                      \
-      BACKEND(backend),                                 \
-      DATALAYOUT(layout),                               \
-      ::pt::CppTypeToDataType<cpp_dtype>::Type(),       \
-      ::pt::KernelArgsParseFunctor<decltype(            \
-          &meta_kernel_fn<cpp_dtype>)>::Parse,          \
-      args_def_fn,                                      \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype>));            \
-  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_3(kernel_name,    \
-                                        func_id,        \
-                                        PT_ID,          \
-                                        backend,        \
-                                        layout,         \
-                                        args_def_fn,    \
-                                        meta_kernel_fn, \
+#define _PT_KERNEL_REGISTRAR_INIT_4(kernel_name,                    \
+                                    func_id,                        \
+                                    registrar_id,                   \
+                                    backend,                        \
+                                    layout,                         \
+                                    args_def_fn,                    \
+                                    meta_kernel_fn,                 \
+                                    cpp_dtype,                      \
+                                    ...)                            \
+  static const ::pt::KernelRegistrar PT_CONCATENATE(                \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(               \
+      kernel_name,                                                  \
+      BACKEND(backend),                                             \
+      DATALAYOUT(layout),                                           \
+      ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
+      ::pt::KernelArgsParseFunctor<decltype(                        \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,                      \
+      args_def_fn,                                                  \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));                        \
+  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_3(kernel_name,                \
+                                        func_id,                    \
+                                        PT_ID,                      \
+                                        backend,                    \
+                                        layout,                     \
+                                        args_def_fn,                \
+                                        meta_kernel_fn,             \
                                         __VA_ARGS__))
-#define _PT_KERNEL_REGISTRAR_INIT_5(kernel_name,        \
-                                    func_id,            \
-                                    registrar_id,       \
-                                    backend,            \
-                                    layout,             \
-                                    args_def_fn,        \
-                                    meta_kernel_fn,     \
-                                    cpp_dtype,          \
-                                    ...)                \
-  static const ::pt::KernelRegistrar PT_CONCATENATE(    \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)(   \
-      kernel_name,                                      \
-      BACKEND(backend),                                 \
-      DATALAYOUT(layout),                               \
-      ::pt::CppTypeToDataType<cpp_dtype>::Type(),       \
-      ::pt::KernelArgsParseFunctor<decltype(            \
-          &meta_kernel_fn<cpp_dtype>)>::Parse,          \
-      args_def_fn,                                      \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype>));            \
-  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_4(kernel_name,    \
-                                        func_id,        \
-                                        PT_ID,          \
-                                        backend,        \
-                                        layout,         \
-                                        args_def_fn,    \
-                                        meta_kernel_fn, \
+#define _PT_KERNEL_REGISTRAR_INIT_5(kernel_name,                    \
+                                    func_id,                        \
+                                    registrar_id,                   \
+                                    backend,                        \
+                                    layout,                         \
+                                    args_def_fn,                    \
+                                    meta_kernel_fn,                 \
+                                    cpp_dtype,                      \
+                                    ...)                            \
+  static const ::pt::KernelRegistrar PT_CONCATENATE(                \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(               \
+      kernel_name,                                                  \
+      BACKEND(backend),                                             \
+      DATALAYOUT(layout),                                           \
+      ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
+      ::pt::KernelArgsParseFunctor<decltype(                        \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,                      \
+      args_def_fn,                                                  \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));                        \
+  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_4(kernel_name,                \
+                                        func_id,                    \
+                                        PT_ID,                      \
+                                        backend,                    \
+                                        layout,                     \
+                                        args_def_fn,                \
+                                        meta_kernel_fn,             \
                                         __VA_ARGS__))
-#define _PT_KERNEL_REGISTRAR_INIT_6(kernel_name,        \
-                                    func_id,            \
-                                    registrar_id,       \
-                                    backend,            \
-                                    layout,             \
-                                    args_def_fn,        \
-                                    meta_kernel_fn,     \
-                                    cpp_dtype,          \
-                                    ...)                \
-  static const ::pt::KernelRegistrar PT_CONCATENATE(    \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)(   \
-      kernel_name,                                      \
-      BACKEND(backend),                                 \
-      DATALAYOUT(layout),                               \
-      ::pt::CppTypeToDataType<cpp_dtype>::Type(),       \
-      ::pt::KernelArgsParseFunctor<decltype(            \
-          &meta_kernel_fn<cpp_dtype>)>::Parse,          \
-      args_def_fn,                                      \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype>));            \
-  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_5(kernel_name,    \
-                                        func_id,        \
-                                        PT_ID,          \
-                                        backend,        \
-                                        layout,         \
-                                        args_def_fn,    \
-                                        meta_kernel_fn, \
+#define _PT_KERNEL_REGISTRAR_INIT_6(kernel_name,                    \
+                                    func_id,                        \
+                                    registrar_id,                   \
+                                    backend,                        \
+                                    layout,                         \
+                                    args_def_fn,                    \
+                                    meta_kernel_fn,                 \
+                                    cpp_dtype,                      \
+                                    ...)                            \
+  static const ::pt::KernelRegistrar PT_CONCATENATE(                \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(               \
+      kernel_name,                                                  \
+      BACKEND(backend),                                             \
+      DATALAYOUT(layout),                                           \
+      ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
+      ::pt::KernelArgsParseFunctor<decltype(                        \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,                      \
+      args_def_fn,                                                  \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));                        \
+  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_5(kernel_name,                \
+                                        func_id,                    \
+                                        PT_ID,                      \
+                                        backend,                    \
+                                        layout,                     \
+                                        args_def_fn,                \
+                                        meta_kernel_fn,             \
                                         __VA_ARGS__))
-#define _PT_KERNEL_REGISTRAR_INIT_7(kernel_name,        \
-                                    func_id,            \
-                                    registrar_id,       \
-                                    backend,            \
-                                    layout,             \
-                                    args_def_fn,        \
-                                    meta_kernel_fn,     \
-                                    cpp_dtype,          \
-                                    ...)                \
-  static const ::pt::KernelRegistrar PT_CONCATENATE(    \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)(   \
-      kernel_name,                                      \
-      BACKEND(backend),                                 \
-      DATALAYOUT(layout),                               \
-      ::pt::CppTypeToDataType<cpp_dtype>::Type(),       \
-      ::pt::KernelArgsParseFunctor<decltype(            \
-          &meta_kernel_fn<cpp_dtype>)>::Parse,          \
-      args_def_fn,                                      \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype>));            \
-  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_6(kernel_name,    \
-                                        func_id,        \
-                                        PT_ID,          \
-                                        backend,        \
-                                        layout,         \
-                                        args_def_fn,    \
-                                        meta_kernel_fn, \
+#define _PT_KERNEL_REGISTRAR_INIT_7(kernel_name,                    \
+                                    func_id,                        \
+                                    registrar_id,                   \
+                                    backend,                        \
+                                    layout,                         \
+                                    args_def_fn,                    \
+                                    meta_kernel_fn,                 \
+                                    cpp_dtype,                      \
+                                    ...)                            \
+  static const ::pt::KernelRegistrar PT_CONCATENATE(                \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(               \
+      kernel_name,                                                  \
+      BACKEND(backend),                                             \
+      DATALAYOUT(layout),                                           \
+      ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
+      ::pt::KernelArgsParseFunctor<decltype(                        \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,                      \
+      args_def_fn,                                                  \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));                        \
+  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_6(kernel_name,                \
+                                        func_id,                    \
+                                        PT_ID,                      \
+                                        backend,                    \
+                                        layout,                     \
+                                        args_def_fn,                \
+                                        meta_kernel_fn,             \
                                         __VA_ARGS__))
-#define _PT_KERNEL_REGISTRAR_INIT_8(kernel_name,        \
-                                    func_id,            \
-                                    registrar_id,       \
-                                    backend,            \
-                                    layout,             \
-                                    args_def_fn,        \
-                                    meta_kernel_fn,     \
-                                    cpp_dtype,          \
-                                    ...)                \
-  static const ::pt::KernelRegistrar PT_CONCATENATE(    \
-      __reg_pt_op_kernel_##func_id##_, registrar_id)(   \
-      kernel_name,                                      \
-      BACKEND(backend),                                 \
-      DATALAYOUT(layout),                               \
-      ::pt::CppTypeToDataType<cpp_dtype>::Type(),       \
-      ::pt::KernelArgsParseFunctor<decltype(            \
-          &meta_kernel_fn<cpp_dtype>)>::Parse,          \
-      args_def_fn,                                      \
-      PT_KERNEL(meta_kernel_fn<cpp_dtype>));            \
-  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_7(kernel_name,    \
-                                        func_id,        \
-                                        PT_ID,          \
-                                        backend,        \
-                                        layout,         \
-                                        args_def_fn,    \
-                                        meta_kernel_fn, \
+#define _PT_KERNEL_REGISTRAR_INIT_8(kernel_name,                    \
+                                    func_id,                        \
+                                    registrar_id,                   \
+                                    backend,                        \
+                                    layout,                         \
+                                    args_def_fn,                    \
+                                    meta_kernel_fn,                 \
+                                    cpp_dtype,                      \
+                                    ...)                            \
+  static const ::pt::KernelRegistrar PT_CONCATENATE(                \
+      __reg_pt_op_kernel_##func_id##_, registrar_id)(               \
+      kernel_name,                                                  \
+      BACKEND(backend),                                             \
+      DATALAYOUT(layout),                                           \
+      ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
+      ::pt::KernelArgsParseFunctor<decltype(                        \
+          &meta_kernel_fn<cpp_dtype>)>::Parse,                      \
+      args_def_fn,                                                  \
+      PT_KERNEL(meta_kernel_fn<cpp_dtype>));                        \
+  PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_7(kernel_name,                \
+                                        func_id,                    \
+                                        PT_ID,                      \
+                                        backend,                    \
+                                        layout,                     \
+                                        args_def_fn,                \
+                                        meta_kernel_fn,             \
                                         __VA_ARGS__))
 
 #define PT_REGISTER_KERNEL_STANDARD(                \
diff --git a/paddle/tcmpt/core/layout.h b/paddle/tcmpt/core/layout.h
deleted file mode 100644
index 4a8a223b62f84..0000000000000
--- a/paddle/tcmpt/core/layout.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <ostream>
-
-namespace pt {
-
-/**
- * We need to ensure that the operator library is relatively independent
- * and does not depend on the framework. Therefore, before calling the kernel
- * in the Tensor Compute library inside the framework, the internal
- * layout needs to be converted to the data type in the Tensor Compute
- * library.
- *
- * Here we also can use the DataLayout in framework, they are all enum classes.
- */
-enum class DataLayout {
-  kUndef = 0,
-  kAny,
-  kNHWC,
-  kNCHW,
-  kMKLDNN,
-  kNumLayouts,
-};
-
-std::ostream& operator<<(std::ostream& os, DataLayout dtype);
-
-DataLayout& operator++(DataLayout& layout, int);
-
-}  // namespace pt
diff --git a/paddle/tcmpt/core/spatial_tensor.h b/paddle/tcmpt/core/spatial_tensor.h
index 5e51322bb8339..0e5bdd8be50a3 100644
--- a/paddle/tcmpt/core/spatial_tensor.h
+++ b/paddle/tcmpt/core/spatial_tensor.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/tcmpt/core/tensor_interface.h"
+#include "paddle/tcmpt/core/tensor_base.h"
 
 namespace pt {
 
@@ -27,7 +27,7 @@ namespace pt {
  */
 
 template <typename AllocationType>
-class SpatialTensor : public TensorInterface {
+class SpatialTensor : public TensorBase {
  public:
   SpatialTensor(std::shared_ptr<AllocationType> allocation,
                 std::unique_ptr<TensorMeta> meta,
diff --git a/paddle/tcmpt/core/storage.cc b/paddle/tcmpt/core/storage.cc
new file mode 100644
index 0000000000000..02fbea8d0b3a1
--- /dev/null
+++ b/paddle/tcmpt/core/storage.cc
@@ -0,0 +1,27 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/tcmpt/core/storage.h"
+
+namespace paddle {
+namespace tcmpt {
+
+void TensorStorage::Realloc(size_t size) {
+  data_.Clear();
+  data_ = Allocate(alloc_, size);
+  size_ = size;
+}
+
+}  // namespace tcmpt
+}  // namespace paddle
diff --git a/paddle/tcmpt/core/storage.h b/paddle/tcmpt/core/storage.h
new file mode 100644
index 0000000000000..d838d0cd1c957
--- /dev/null
+++ b/paddle/tcmpt/core/storage.h
@@ -0,0 +1,78 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cstddef>
+
+#include "boost/intrusive_ptr.hpp"
+#include "paddle/tcmpt/core/utils/intrusive_ptr.h"
+#include "paddle/tcmpt/core/utils/intrusive_ref_counter.h"
+
+#include "paddle/fluid/platform/place.h"
+#include "paddle/tcmpt/core/allocator.h"
+
+namespace paddle {
+namespace tcmpt {
+
+/// \brief The interface of contiguous storage used for the dense tensor.
+/// It should be used in conjunction with the intrusive pointer. We prohibit
+/// all default copy operations to ensure the integrity of the package.
+class Storage : public intrusive_ref_counter<Storage> {
+ public:
+  Storage() = default;
+  Storage(const Storage&) = delete;
+
+  explicit Storage(Allocation&& data) : data_(std::move(data)) {}
+
+  virtual ~Storage() = default;
+
+  /// \brief Get the mutable data pointer of the storage.
+  /// This function is set to inline to improve performance.
+  /// \return The mutable data pointer of the storage.
+  void* data() const noexcept { return data_.operator->(); }
+
+  virtual size_t size() const = 0;
+  virtual const platform::Place& place() const = 0;
+  virtual bool OwnsMemory() const = 0;
+  virtual void Realloc(size_t n) = 0;
+
+ protected:
+  Allocation data_;
+};
+
+class TensorStorage : public Storage {
+ public:
+  explicit TensorStorage(const std::shared_ptr<Allocator>& a) : alloc_(a) {}
+  TensorStorage(const std::shared_ptr<Allocator>& a, size_t size)
+      : Storage(Allocate(a, size)), alloc_(a), size_(size) {}
+
+  ~TensorStorage() = default;
+
+  void Realloc(size_t size) override;
+
+  size_t size() const noexcept override { return size_; }
+  const platform::Place& place() const override { return data_.place(); }
+  bool OwnsMemory() const noexcept override { return true; }
+  const std::shared_ptr<Allocator>& allocator() const noexcept {
+    return alloc_;
+  }
+
+ private:
+  const std::shared_ptr<Allocator> alloc_;
+  int64_t size_{0};
+};
+
+}  // namespace tcmpt
+}  // namespace paddle
diff --git a/paddle/tcmpt/core/tensor_base.cc b/paddle/tcmpt/core/tensor_base.cc
new file mode 100644
index 0000000000000..05dba1206075d
--- /dev/null
+++ b/paddle/tcmpt/core/tensor_base.cc
@@ -0,0 +1,20 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/tcmpt/core/tensor_base.h"
+#include "paddle/tcmpt/core/utils/type_registry.h"
+
+namespace paddle {
+namespace tcmpt {}
+}
diff --git a/paddle/tcmpt/core/tensor_base.h b/paddle/tcmpt/core/tensor_base.h
new file mode 100644
index 0000000000000..240808e3cc492
--- /dev/null
+++ b/paddle/tcmpt/core/tensor_base.h
@@ -0,0 +1,78 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/tcmpt/common/data_type.h"
+#include "paddle/tcmpt/common/layout.h"
+#include "paddle/tcmpt/core/storage.h"
+#include "paddle/tcmpt/core/utils/type_registry.h"
+
+#include "paddle/tcmpt/core/backend.h"
+
+namespace paddle {
+namespace tcmpt {
+
+class TensorBase {
+ public:
+  using DataType = experimental::DataType;
+  using DataLayout = experimental::DataLayout;
+
+  virtual ~TensorBase() = default;
+
+  /// \brief Returns the number of elements contained in tensor.
+  /// \return The number of elements contained in tensor.
+  virtual int64_t numel() const = 0;
+
+  /// \brief Returns the dims of the tensor.
+  /// \return The dims of the tensor.
+  virtual const paddle::framework::DDim& dims() const = 0;
+
+  /// \brief Returns the data type of the tensor.
+  /// \return The data type of the tensor.
+  virtual DataType data_type() const = 0;
+
+  /// \brief Returns the data layout of the tensor.
+  /// \return The data layout of the tensor.
+  virtual DataLayout layout() const = 0;
+
+  /// \brief Returns the data place of the tensor.
+  /// \return The data place of the tensor.
+  virtual const platform::Place& place() const = 0;
+
+  /// \brief Test whether the metadata is valid.
+  /// \return Whether the metadata is valid.
+  virtual bool valid() const = 0;
+
+  /// \brief Test whether the storage is allocated.
+  /// return Whether the storage is allocated.
+  virtual bool initialized() const = 0;
+
+  virtual pt::Backend backend() const = 0;
+
+  /// \brief Return the type information of the derived class to support
+  /// safely downcast in non-rtti environment.
+  /// return The type information of the derived class.
+  TypeInfo<TensorBase> type_info() const { return type_info_; }
+
+ private:
+  template <typename T, typename U>
+  friend class TypeInfoTraits;
+  TypeInfo<TensorBase> type_info_{TypeInfo<TensorBase>::kUnknownType};
+};
+
+}  // namespace tcmpt
+}  // namespace paddle
diff --git a/paddle/tcmpt/core/tensor_interface.h b/paddle/tcmpt/core/tensor_interface.h
deleted file mode 100644
index 6991c0d7f7f71..0000000000000
--- a/paddle/tcmpt/core/tensor_interface.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/tcmpt/core/backend.h"
-#include "paddle/tcmpt/core/dtype.h"
-#include "paddle/tcmpt/core/layout.h"
-
-namespace paddle {
-namespace framework {
-class DDim;
-}
-namespace platform {
-class Place;
-}
-}
-
-namespace pt {
-
-// TODO(shixiaowei): replace by new DDim
-using DDim = paddle::framework::DDim;
-
-// TODO(shixiaowei): replace by new Place?
-using Place = paddle::platform::Place;
-
-/**
- * The abstract class of Tensor implemention, it needs to define its basic
- * behavior through inherited classes.
- *
- * TensorInterface allows Tensor to uniformly access various different
- * TensorImpls within the framework. It will not be used as a kernel argument,
- * but only contains the interfaces supported by various TensorImpls.
- * In extreme cases, it can be an empty base class.
- *
- * If we don't use TensorInterface, we may need to use shared_ptr<void>
- * to unify Tensor's API.
- */
-class TensorInterface {
- public:
-  // Not allowed to initialize a tensor without descriptive metadata
-  TensorInterface() = default;
-
-  TensorInterface(const TensorInterface&) = delete;
-  TensorInterface& operator=(const TensorInterface&) = delete;
-  TensorInterface(TensorInterface&&) = delete;
-  TensorInterface& operator=(TensorInterface&&) = delete;
-
-  virtual ~TensorInterface() {}
-
-  virtual int64_t numel() const = 0;
-
-  virtual DDim dims() const = 0;
-
-  virtual DataType type() const = 0;
-
-  virtual DataLayout layout() const = 0;
-
-  virtual Place place() const = 0;
-
-  virtual Backend backend() const = 0;
-
-  virtual bool initialized() const = 0;
-};
-
-}  // namespace pt
diff --git a/paddle/tcmpt/core/tensor_meta.h b/paddle/tcmpt/core/tensor_meta.h
index de564a44de36e..3cc557e05b4c1 100644
--- a/paddle/tcmpt/core/tensor_meta.h
+++ b/paddle/tcmpt/core/tensor_meta.h
@@ -16,9 +16,9 @@ limitations under the License. */
 
 #include <vector>
 
+#include "paddle/tcmpt/common/data_type.h"
+#include "paddle/tcmpt/common/layout.h"
 #include "paddle/tcmpt/core/backend.h"
-#include "paddle/tcmpt/core/dtype.h"
-#include "paddle/tcmpt/core/layout.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/ddim.h"
@@ -28,6 +28,9 @@ limitations under the License. */
 
 namespace pt {
 
+using DataType = paddle::experimental::DataType;
+using DataLayout = paddle::experimental::DataLayout;
+
 // template <typename T>
 // using Vector = paddle::framework::Vector<T>;
 
diff --git a/paddle/tcmpt/core/tensor_status.h b/paddle/tcmpt/core/tensor_status.h
index 1328c88dd014a..1eb56397414b5 100644
--- a/paddle/tcmpt/core/tensor_status.h
+++ b/paddle/tcmpt/core/tensor_status.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/tcmpt/common/data_type.h"
+#include "paddle/tcmpt/common/layout.h"
 #include "paddle/tcmpt/core/backend.h"
-#include "paddle/tcmpt/core/dtype.h"
-#include "paddle/tcmpt/core/layout.h"
 
 namespace pt {
 
diff --git a/paddle/tcmpt/core/utils/CMakeLists.txt b/paddle/tcmpt/core/utils/CMakeLists.txt
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/paddle/tcmpt/core/utils/intrusive_ptr.h b/paddle/tcmpt/core/utils/intrusive_ptr.h
new file mode 100644
index 0000000000000..f368d05cb47db
--- /dev/null
+++ b/paddle/tcmpt/core/utils/intrusive_ptr.h
@@ -0,0 +1,160 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <utility>
+#include "glog/logging.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace tcmpt {
+
+template <typename T>
+class intrusive_ptr {
+ public:
+  using this_type = intrusive_ptr;
+  constexpr intrusive_ptr() noexcept = default;
+
+  ~intrusive_ptr() {
+    if (px) {
+      intrusive_ptr_release(px);
+    }
+  }
+
+  intrusive_ptr(intrusive_ptr&& rhs) noexcept : px(rhs.px) { rhs.px = nullptr; }
+
+  template <typename U,
+            typename = std::enable_if_t<std::is_base_of<T, U>::value>>
+  intrusive_ptr(intrusive_ptr<U>&& rhs) noexcept : px(rhs.get()) {
+    rhs.reset();
+  }
+
+  void reset() { this_type().swap(*this); }
+
+  void reset(T* rhs) { this_type(rhs).swap(*this); }
+
+  void reset(T* rhs, bool add_ref) { this_type(rhs, add_ref).swap(*this); }
+
+  T* get() const noexcept { return px; }
+
+  T* detach() noexcept {
+    T* ret = px;
+    px = nullptr;
+    return ret;
+  }
+
+  T& operator*() const {
+    PADDLE_ENFORCE_NOT_NULL(
+        px,
+        platform::errors::PreconditionNotMet(
+            "The pointer must be non-null before the dereference operation."));
+    return *px;
+  }
+
+  T* operator->() const {
+    PADDLE_ENFORCE_NOT_NULL(
+        px,
+        platform::errors::PreconditionNotMet(
+            "The pointer must be non-null before the dereference operation."));
+    return px;
+  }
+
+  void swap(intrusive_ptr& rhs) noexcept {
+    T* tmp = px;
+    px = rhs.px;
+    rhs.px = tmp;
+  }
+
+ private:
+  template <typename U,
+            typename = std::enable_if_t<std::is_base_of<T, U>::value>>
+  explicit intrusive_ptr(U* p, bool add_ref = true) : px(p) {
+    if (px && add_ref) {
+      intrusive_ptr_add_ref(px);
+    }
+  }
+
+  template <typename R, typename... Args>
+  friend intrusive_ptr<R> make_intrusive(Args&&...);
+  template <typename R>
+  friend intrusive_ptr<R> copy_intrusive(const intrusive_ptr<R>&);
+
+  T* px{nullptr};
+};
+
+template <typename T, typename U>
+inline bool operator==(const intrusive_ptr<T>& a,
+                       const intrusive_ptr<U>& b) noexcept {
+  return a.get() == b.get();
+}
+
+template <typename T, typename U>
+inline bool operator!=(const intrusive_ptr<T>& a,
+                       const intrusive_ptr<U>& b) noexcept {
+  return a.get() != b.get();
+}
+
+template <typename T, typename U>
+inline bool operator==(const intrusive_ptr<T>& a, U* b) noexcept {
+  return a.get() == b;
+}
+
+template <typename T, typename U>
+inline bool operator!=(const intrusive_ptr<T>& a, U* b) noexcept {
+  return a.get() != b;
+}
+
+template <typename T, typename U>
+inline bool operator==(T* a, const intrusive_ptr<U>& b) noexcept {
+  return a == b.get();
+}
+
+template <typename T, typename U>
+inline bool operator!=(T* a, const intrusive_ptr<U>& b) noexcept {
+  return a != b.get();
+}
+
+template <typename T>
+inline bool operator==(const intrusive_ptr<T>& p, std::nullptr_t) noexcept {
+  return p.get() == nullptr;
+}
+
+template <typename T>
+inline bool operator==(std::nullptr_t, const intrusive_ptr<T>& p) noexcept {
+  return p.get() == nullptr;
+}
+
+template <typename T>
+inline bool operator!=(const intrusive_ptr<T>& p, std::nullptr_t) noexcept {
+  return p.get() != nullptr;
+}
+
+template <typename T>
+inline bool operator!=(std::nullptr_t, const intrusive_ptr<T>& p) noexcept {
+  return p.get() != nullptr;
+}
+
+template <typename T, typename... Args>
+inline intrusive_ptr<T> make_intrusive(Args&&... args) {
+  return intrusive_ptr<T>(new T(std::forward<Args>(args)...), false);
+}
+
+template <typename T>
+inline intrusive_ptr<T> copy_intrusive(const intrusive_ptr<T>& rhs) {
+  return intrusive_ptr<T>(rhs.get(), true);
+}
+
+}  // namespace tcmpt
+}  // namespace paddle
diff --git a/paddle/tcmpt/core/utils/intrusive_ref_counter.h b/paddle/tcmpt/core/utils/intrusive_ref_counter.h
new file mode 100644
index 0000000000000..1c93bede71df1
--- /dev/null
+++ b/paddle/tcmpt/core/utils/intrusive_ref_counter.h
@@ -0,0 +1,66 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <atomic>
+
+namespace paddle {
+namespace tcmpt {
+
+template <typename DerivedT>
+class intrusive_ref_counter;
+template <typename DerivedT>
+void intrusive_ptr_add_ref(const intrusive_ref_counter<DerivedT>* p) noexcept;
+template <typename DerivedT>
+void intrusive_ptr_release(const intrusive_ref_counter<DerivedT>* p) noexcept;
+
+template <typename DerivedT>
+class intrusive_ref_counter {
+ public:
+  constexpr intrusive_ref_counter() noexcept : ref_(1) {}
+  virtual ~intrusive_ref_counter() = default;
+
+  unsigned int use_count() const noexcept { return ref_.load(); }
+
+ protected:
+  intrusive_ref_counter(const intrusive_ref_counter&) = delete;
+  intrusive_ref_counter& operator=(const intrusive_ref_counter&) = delete;
+
+  friend void intrusive_ptr_add_ref<DerivedT>(
+      const intrusive_ref_counter<DerivedT>* p) noexcept;
+  friend void intrusive_ptr_release<DerivedT>(
+      const intrusive_ref_counter<DerivedT>* p) noexcept;
+
+ private:
+  mutable std::atomic_int_fast32_t ref_;
+};
+
+template <typename DerivedT>
+inline void intrusive_ptr_add_ref(
+    const intrusive_ref_counter<DerivedT>* p) noexcept {
+  p->ref_.fetch_add(1, std::memory_order_relaxed);
+}
+
+template <typename DerivedT>
+inline void intrusive_ptr_release(
+    const intrusive_ref_counter<DerivedT>* p) noexcept {
+  if (p->ref_.load(std::memory_order_acquire) == 0 ||
+      p->ref_.fetch_sub(1) == 0) {
+    delete static_cast<const DerivedT*>(p);
+  }
+}
+
+}  // namespace tcmpt
+}  // namespace paddle
diff --git a/paddle/tcmpt/core/utils/type_info.h b/paddle/tcmpt/core/utils/type_info.h
new file mode 100644
index 0000000000000..ba5bc641b94b2
--- /dev/null
+++ b/paddle/tcmpt/core/utils/type_info.h
@@ -0,0 +1,61 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+
+namespace paddle {
+namespace tcmpt {
+
+template <typename BaseT>
+class TypeRegistry;
+
+template <typename BaseT>
+class TypeInfo {
+ public:
+  const std::string& name() const;
+
+  int8_t id() const { return id_; }
+
+  bool operator==(TypeInfo other) const { return id_ == other.id(); }
+  bool operator!=(TypeInfo other) const { return id_ != other.id(); }
+
+  static const TypeInfo kUnknownType;
+
+ private:
+  friend class TypeRegistry<BaseT>;
+  explicit TypeInfo(int8_t id) : id_(id) {}
+  int8_t id_;
+};
+
+template <typename BaseT, typename DerivedT>
+class TypeInfoTraits {
+ public:
+  static const TypeInfo<BaseT> kType;
+  TypeInfoTraits() {
+    static_cast<BaseT*>(static_cast<DerivedT*>(this))->type_info_ = kType;
+  }
+  static bool classof(const BaseT* obj) { return obj->type_info() == kType; }
+};
+
+template <typename BaseT>
+TypeInfo<BaseT> RegisterStaticType(const std::string& type);
+
+template <typename BaseT, typename DerivedT>
+const TypeInfo<BaseT> TypeInfoTraits<BaseT, DerivedT>::kType =
+    RegisterStaticType<BaseT>(DerivedT::name());
+
+}  // namespace tcmpt
+}  // namespace paddle
diff --git a/paddle/tcmpt/core/utils/type_registry.h b/paddle/tcmpt/core/utils/type_registry.h
new file mode 100644
index 0000000000000..52b699a0dd413
--- /dev/null
+++ b/paddle/tcmpt/core/utils/type_registry.h
@@ -0,0 +1,86 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cassert>
+#include <mutex>
+#include <string>
+
+#include "paddle/tcmpt/core/utils/type_info.h"
+
+namespace paddle {
+namespace tcmpt {
+
+template <typename BaseT>
+class TypeRegistry {
+ public:
+  TypeRegistry(const TypeRegistry&) = delete;
+  TypeRegistry& operator=(const TypeRegistry&) = delete;
+
+  static TypeRegistry& GetInstance();
+
+  TypeInfo<BaseT> RegisterType(const std::string& type);
+  const std::string& GetTypeName(TypeInfo<BaseT> info) const;
+
+ private:
+  TypeRegistry() = default;
+  mutable std::mutex mutex_;
+  std::vector<std::string> names_;
+  std::map<std::string, int8_t> name_to_id_;
+};
+
+template <typename BaseT>
+TypeRegistry<BaseT>& TypeRegistry<BaseT>::GetInstance() {
+  static TypeRegistry<BaseT> registry;
+  return registry;
+}
+
+template <typename BaseT>
+TypeInfo<BaseT> TypeRegistry<BaseT>::RegisterType(const std::string& type) {
+  std::lock_guard<std::mutex> guard(mutex_);
+  assert(name_to_id_.find(type) == name_to_id_.end());
+  assert(names_.size() < std::numeric_limits<int8_t>::max());
+  int8_t id = names_.size();
+  names_.emplace_back(type);
+  name_to_id_[type] = id;
+  return TypeInfo<BaseT>(id);
+}
+
+template <typename BaseT>
+const std::string& TypeRegistry<BaseT>::GetTypeName(
+    TypeInfo<BaseT> info) const {
+  std::lock_guard<std::mutex> guard(mutex_);
+  int8_t id = info.id();
+  assert(id >= 0);
+  assert(static_cast<size_t>(id) < names_.size());
+  return names_[id];
+}
+
+template <typename BaseT>
+TypeInfo<BaseT> RegisterStaticType(const std::string& type) {
+  return TypeRegistry<BaseT>::GetInstance().RegisterType(type);
+}
+
+template <typename BaseT>
+const std::string& TypeInfo<BaseT>::name() const {
+  return TypeRegistry<BaseT>::GetInstance().GetTypeName(*this);
+}
+
+template <typename BaseT>
+const TypeInfo<BaseT> TypeInfo<BaseT>::kUnknownType =
+    RegisterStaticType<BaseT>("Unknown");
+
+}  // namespace tcmpt
+}  // namespace paddle
diff --git a/paddle/tcmpt/hapi/include/creation.h b/paddle/tcmpt/hapi/include/creation.h
index f502adb2e2472..d2d68e3bb7e61 100644
--- a/paddle/tcmpt/hapi/include/creation.h
+++ b/paddle/tcmpt/hapi/include/creation.h
@@ -14,7 +14,7 @@
 
 #pragma once
 
-#include "paddle/tcmpt/core/dtype.h"
+#include "paddle/tcmpt/common/data_type.h"
 #include "paddle/tcmpt/core/scalar.h"
 #include "paddle/tcmpt/hapi/include/tensor.h"
 
diff --git a/paddle/tcmpt/hapi/include/tensor.h b/paddle/tcmpt/hapi/include/tensor.h
index eb64d66435c90..ccca911cf8c86 100644
--- a/paddle/tcmpt/hapi/include/tensor.h
+++ b/paddle/tcmpt/hapi/include/tensor.h
@@ -18,7 +18,7 @@ limitations under the License. */
 #include <memory>
 #include <utility>
 
-#include "paddle/tcmpt/core/tensor_interface.h"
+#include "paddle/tcmpt/core/tensor_base.h"
 
 /**
  * [ Why still include the fluid headers? ]
@@ -73,7 +73,7 @@ class AutogradMetaInterface {
  * letters and underscores.
  *
  * Note: Tensor cannot be inherited. The heterogeneous Tensor implementation
- * can be achieved by inheriting the underlying TensorInterface.
+ * can be achieved by inheriting the underlying TensorBase.
  *
  * Note: This Tensor API is suitable for training and custom operators,
  * another simple Tensor design may be required for inference.
@@ -88,10 +88,10 @@ class Tensor final {
 
   /**
    * @description: Use a TensorImpl pointer to construct a Tensor
-   * @param {shared_ptr<TensorInterface>} tensor_impl
+   * @param {shared_ptr<TensorBase>} tensor_impl
    * @return {Tensor}
    */
-  explicit Tensor(std::shared_ptr<pt::TensorInterface> tensor_impl)
+  explicit Tensor(std::shared_ptr<tcmpt::TensorBase> tensor_impl)
       : impl_(std::move(tensor_impl)) {
     if (impl_.get() == nullptr) {
       throw std::runtime_error("TensorImpl with nullptr is not supported");
@@ -111,14 +111,14 @@ class Tensor final {
    * @param None
    * @return {DDim}
    */
-  pt::DDim shape() const { return impl_->dims(); }
+  paddle::framework::DDim shape() const { return impl_->dims(); }
 
   /**
    * @description: Return the data type of current Tensor.
    * @param None
    * @return {DataType}
    */
-  pt::DataType type() const { return impl_->type(); }
+  pt::DataType type() const { return impl_->data_type(); }
 
   /**
    * @description: Return the layout of current Tensor.
@@ -133,7 +133,7 @@ class Tensor final {
    * @param None
    * @return {Place}
    */
-  pt::Place place() const { return impl_->place(); }
+  paddle::platform::Place place() const { return impl_->place(); }
 
   /**
    * Backend judgment APIs, shield the concept of Backend.
@@ -163,16 +163,16 @@ class Tensor final {
   /**
    * @description: Return the implemention of current Tensor.
    * @param None
-   * @return {std::shared_ptr<TensorInterface>}
+   * @return {std::shared_ptr<TensorBase>}
    */
-  std::shared_ptr<pt::TensorInterface> impl() const { return impl_; }
+  std::shared_ptr<tcmpt::TensorBase> impl() const { return impl_; }
 
   /**
    * @description: Set the implemention of current Tensor.
-   * @param {std::shared_ptr<TensorInterface>}
+   * @param {std::shared_ptr<TensorBase>}
    * @return None
    */
-  void set_impl(const std::shared_ptr<pt::TensorInterface>& impl) {
+  void set_impl(const std::shared_ptr<tcmpt::TensorBase>& impl) {
     impl_ = impl;
   }
 
@@ -245,7 +245,7 @@ class Tensor final {
    * heterogeneous Tensor implementation, so that the API level can be unified
    * to one `Tensor`.
    */
-  std::shared_ptr<pt::TensorInterface> impl_;
+  std::shared_ptr<tcmpt::TensorBase> impl_;
 
   /**
    * [ Why need abstract AutogradMetaInterface here? ]
diff --git a/paddle/tcmpt/kernels/cpu/utils.cc b/paddle/tcmpt/kernels/cpu/utils.cc
index 7550934d70be4..a50cfad481693 100644
--- a/paddle/tcmpt/kernels/cpu/utils.cc
+++ b/paddle/tcmpt/kernels/cpu/utils.cc
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #include "paddle/tcmpt/kernels/cpu/utils.h"
 #include "paddle/fluid/memory/memcpy.h"
+#include "paddle/tcmpt/common/data_type.h"
 #include "paddle/tcmpt/core/convert_utils.h"
-#include "paddle/tcmpt/core/dtype.h"
 
 namespace pt {
 
@@ -37,8 +37,8 @@ void Copy(const CPUContext& dev_ctx, const DenseTensor& src, DenseTensor* dst) {
           << dst_place;
   dst->Resize(src.dims());
   dst->mutable_meta()->layout = src.meta().layout;
-  auto size = src.numel() *
-              paddle::framework::SizeOfType(TransToProtoVarType(src.type()));
+  auto size = src.numel() * paddle::framework::SizeOfType(
+                                TransToProtoVarType(src.data_type()));
 
   if (paddle::platform::is_cpu_place(src_place) &&
       paddle::platform::is_cpu_place(dst_place)) {
diff --git a/paddle/tcmpt/kernels/cuda/math.cu b/paddle/tcmpt/kernels/cuda/math.cu
index f0d76744f68bd..113971126a71f 100644
--- a/paddle/tcmpt/kernels/cuda/math.cu
+++ b/paddle/tcmpt/kernels/cuda/math.cu
@@ -78,7 +78,7 @@ void Mean(const CUDAContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
       TensorMeta(paddle::framework::make_ddim(
                      {static_cast<int64_t>(temp_storage_bytes)}),
                  pt::TransToPtBackend(dev_ctx.GetPlace()),
-                 x.type(),
+                 x.data_type(),
                  x.layout()),
       TensorStatus());
   auto* temp_storage = tmp.mutable_data<uint8_t>();
diff --git a/paddle/tcmpt/kernels/cuda/utils.cu b/paddle/tcmpt/kernels/cuda/utils.cu
index b8483d17cfc24..00b32e2fbb10a 100644
--- a/paddle/tcmpt/kernels/cuda/utils.cu
+++ b/paddle/tcmpt/kernels/cuda/utils.cu
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/memory/memcpy.h"
+#include "paddle/tcmpt/common/data_type.h"
 #include "paddle/tcmpt/core/convert_utils.h"
-#include "paddle/tcmpt/core/dtype.h"
 #include "paddle/tcmpt/core/kernel_registry.h"
 #include "paddle/tcmpt/kernels/cuda/utils.h"
 
@@ -40,8 +40,8 @@ void Copy(const CUDAContext& dev_ctx,
           << dst_place;
   dst->Resize(src.dims());
   dst->mutable_meta()->layout = src.meta().layout;
-  auto size = src.numel() *
-              paddle::framework::SizeOfType(TransToProtoVarType(src.type()));
+  auto size = src.numel() * paddle::framework::SizeOfType(
+                                TransToProtoVarType(src.data_type()));
 
   if (paddle::platform::is_cuda_pinned_place(src_place) &&  // NOLINT
       paddle::platform::is_cuda_pinned_place(dst_place)) {
diff --git a/paddle/tcmpt/tests/dense_tensor_test.cc b/paddle/tcmpt/tests/dense_tensor_test.cc
index 633e787159444..138ef1e30e76e 100644
--- a/paddle/tcmpt/tests/dense_tensor_test.cc
+++ b/paddle/tcmpt/tests/dense_tensor_test.cc
@@ -28,7 +28,7 @@ TEST(DenseTensor, Constructor) {
                          pt::TensorStatus());
   ASSERT_EQ(tensor.dims().size(), 2);
   ASSERT_EQ(tensor.backend(), pt::Backend::kCPU);
-  ASSERT_EQ(tensor.type(), pt::DataType::kFLOAT32);
+  ASSERT_EQ(tensor.data_type(), pt::DataType::kFLOAT32);
   ASSERT_EQ(tensor.layout(), pt::DataLayout::kNCHW);
 }
 

From 320b5f136f0101e0aef71ec5d34484844c50018e Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Tue, 19 Oct 2021 09:43:24 +0000
Subject: [PATCH 092/125] [no-verify] commit backend and tensor signature
 changes

---
 paddle/tcmpt/core/CMakeLists.txt             |   1 -
 paddle/tcmpt/core/backend.cc                 |  58 --------
 paddle/tcmpt/core/backend.h                  |  48 -------
 paddle/tcmpt/core/dense_tensor.h             |   2 -
 paddle/tcmpt/core/tensor_interface.h         |   2 -
 paddle/tcmpt/core/tensor_meta.h              |  10 +-
 paddle/tcmpt/hapi/include/backend.h          | 135 +++++++++++++++++++
 paddle/tcmpt/hapi/include/tensor.h           |  21 ++-
 paddle/tcmpt/hapi/include/tensor_signature.h |  44 ++++++
 9 files changed, 199 insertions(+), 122 deletions(-)
 delete mode 100644 paddle/tcmpt/core/backend.cc
 delete mode 100644 paddle/tcmpt/core/backend.h
 create mode 100644 paddle/tcmpt/hapi/include/backend.h
 create mode 100644 paddle/tcmpt/hapi/include/tensor_signature.h

diff --git a/paddle/tcmpt/core/CMakeLists.txt b/paddle/tcmpt/core/CMakeLists.txt
index 5eadf3db39a64..7f0cbf88ebc98 100644
--- a/paddle/tcmpt/core/CMakeLists.txt
+++ b/paddle/tcmpt/core/CMakeLists.txt
@@ -4,7 +4,6 @@ ELSE()
     set(MKLDNN_CTX_DEPS)
 ENDIF()
 
-cc_library(backend SRCS backend.cc)
 cc_library(dtype SRCS dtype.cc)
 cc_library(layout SRCS layout.cc)
 
diff --git a/paddle/tcmpt/core/backend.cc b/paddle/tcmpt/core/backend.cc
deleted file mode 100644
index 68c7adfcc2810..0000000000000
--- a/paddle/tcmpt/core/backend.cc
+++ /dev/null
@@ -1,58 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/tcmpt/core/backend.h"
-
-namespace pt {
-
-std::ostream& operator<<(std::ostream& os, Backend backend) {
-  switch (backend) {
-    case Backend::kUndef:
-      os << "Undefined";
-      break;
-    case Backend::kCPU:
-      os << "CPU";
-      break;
-    case Backend::kCUDA:
-      os << "CUDA";
-      break;
-    case Backend::kCUDAPinned:
-      os << "CUDAPinned";
-      break;
-    case Backend::kHIP:
-      os << "HIP";
-      break;
-    case Backend::kXPU:
-      os << "XPU";
-      break;
-    case Backend::kNPU:
-      os << "NPU";
-      break;
-    case Backend::kNPUPinned:
-      os << "NPUPinned";
-      break;
-    case Backend::kMKLDNN:
-      os << "MKLDNN";
-      break;
-    case Backend::kCUDNN:
-      os << "CUDNN";
-      break;
-    default:
-      // TODO(chenweihang): change to enforce later
-      throw std::runtime_error("Invalid Backend type.");
-  }
-  return os;
-}
-
-}  // namespace pt
diff --git a/paddle/tcmpt/core/backend.h b/paddle/tcmpt/core/backend.h
deleted file mode 100644
index b1ee09c177f29..0000000000000
--- a/paddle/tcmpt/core/backend.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <ostream>
-namespace pt {
-
-/**
- * [ Why need Backend? ]
- *
- * Backend not only means place. Backend is a superset of place.
- *
- * Place cannot indicate the difference in calculation methods on the device,
- * but in order to make the boundary of the kernel clearer and the function
- * more specific, we need to distinguish the calculation method.
- *
- * Such as the kernel for CUDA device, it is native CUDA kernel, or kernel
- * by calling CUDNN library.
- */
-enum class Backend {
-  kUndef = 0,
-  kCPU,
-  kCUDA,
-  kCUDAPinned,  // TODO(chenweihang): need to be removed
-  kHIP,         // TODO(chenweihang): hip is not need now
-  kXPU,
-  kNPU,
-  kNPUPinned,  // TODO(chenweihang): need to be removed
-  kMKLDNN,
-  kCUDNN,
-  kNumBackends,
-};
-
-std::ostream& operator<<(std::ostream& os, Backend backend);
-
-}  // namespace pt
diff --git a/paddle/tcmpt/core/dense_tensor.h b/paddle/tcmpt/core/dense_tensor.h
index d7853e7cba201..167b86e54efef 100644
--- a/paddle/tcmpt/core/dense_tensor.h
+++ b/paddle/tcmpt/core/dense_tensor.h
@@ -83,8 +83,6 @@ class DenseTensor : public TensorInterface {
 
   Place place() const override;
 
-  Backend backend() const override { return meta_.backend; }
-
   bool initialized() const override { return allocation_ != nullptr; }
 
   /* member methods */
diff --git a/paddle/tcmpt/core/tensor_interface.h b/paddle/tcmpt/core/tensor_interface.h
index 6991c0d7f7f71..c88c63b179d09 100644
--- a/paddle/tcmpt/core/tensor_interface.h
+++ b/paddle/tcmpt/core/tensor_interface.h
@@ -69,8 +69,6 @@ class TensorInterface {
 
   virtual Place place() const = 0;
 
-  virtual Backend backend() const = 0;
-
   virtual bool initialized() const = 0;
 };
 
diff --git a/paddle/tcmpt/core/tensor_meta.h b/paddle/tcmpt/core/tensor_meta.h
index de564a44de36e..35d636fde175d 100644
--- a/paddle/tcmpt/core/tensor_meta.h
+++ b/paddle/tcmpt/core/tensor_meta.h
@@ -71,16 +71,13 @@ struct TensorMeta {
 
   TensorMeta(TensorMeta&& meta)
       : dims(meta.dims),
-        backend(meta.backend),
+        backend_set(meta.backend_set),
         type(meta.type),
         layout(meta.layout),
         numel(meta.numel),
         offset(meta.offset),
         lod(meta.lod) {}
 
-  // Bad constructor, may introduce bug
-  // explicit TensorMeta(DDim dims) : dims(dims) {}
-
   // Compatible Contructor
   TensorMeta(const DDim& dims,
              Backend backend,
@@ -89,7 +86,7 @@ struct TensorMeta {
              size_t offset = 0UL,
              const LoD& lod = {})
       : dims(dims),
-        backend(backend),
+        backend_set(backend),
         type(type),
         layout(layout),
         offset(offset),
@@ -104,7 +101,8 @@ struct TensorMeta {
 
   DDim dims;
 
-  Backend backend{Backend::kCPU};
+  BackendSet backend_set{Backend::CPU};
+
   DataType type{DataType::kFLOAT32};
   DataLayout layout{DataLayout::kNCHW};
 
diff --git a/paddle/tcmpt/hapi/include/backend.h b/paddle/tcmpt/hapi/include/backend.h
new file mode 100644
index 0000000000000..b86029551d1b6
--- /dev/null
+++ b/paddle/tcmpt/hapi/include/backend.h
@@ -0,0 +1,135 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <ostream>
+
+namespace paddle {
+namespace experimental {
+
+/**
+ * [ Why need Backend? ]
+ *
+ * Backend not only means place. Backend is a superset of place.
+ *
+ * Place cannot indicate the difference in calculation methods on the device,
+ * but in order to make the boundary of the kernel clearer and the function
+ * more specific, we need to distinguish the calculation method.
+ *
+ * Such as the kernel for CUDA device, it can be a native CUDA kernel,
+ * or a kernel implemented by CUDNN library.
+ *
+ * Note(chenweihang): HIP is not needed now, we can added it if needed
+ * in the future
+ */
+enum class Backend : uint8_t {
+  // kernel backend cannot be undefined
+  UNDEFINED = 0,
+
+  // basic kernel backend
+  CPU,
+
+  // various acceleration devices' backends
+  CUDA,
+  XPU,  // XPU currently does not exist at the same time as CUDA
+  NPU,  // NPU currently does not exist at the same time as CUDA
+
+  // the third library backend
+  MKLDNN,
+  CUDNN,
+
+  // end of backend types
+  kNumBackends,
+};
+
+/**
+ * We use the backend to form a bit set to assist the runtime kernel selection,
+ * and the higher backend bit has a higher priority.
+ *
+ * A Tensor may belong to multiple backends at the same time, such CUDNN and
+ * CUDA. Only one backend value cannot
+ */
+class BackendSet final {
+ public:
+  constexpr BackendSet() : bitset_(0) {}
+  explicit constexpr BackendSet(Backend b)
+      : bitset_(b == Backend::UNDEFINED ? 0 : 1ULL << (static_cast<uint8_t>(b) -
+                                                       1)) {}
+
+  uint64_t bitset() const { return bitset_; }
+
+  bool inline Has(Backend b) const {
+    // TODO(chenweihang): replace by internal assert method later
+    if (b == Backend::UNDEFINED) {
+      throw std::runtime_error("Backend argument can't be UNDEFINED.");
+    }
+    return static_cast<bool>(bitset_ & BackendSet(b).bitset())
+  }
+  bool IsEmpty() const { return bitset_ == 0; }
+
+  BackendSet operator|(const BackendSet& other) const {
+    return BackendSet(bitset_ | other.bitset());
+  }
+  BackendSet operator&(const BackendSet& other) const {
+    return BackendSet(bitset_ & other.bitset());
+  }
+  BackendSet operator-(const BackendSet& other) const {
+    return BackendSet(bitset_ & ~other.bitset());
+  }
+  BackendSet operator^(const BackendSet& other) const {
+    return BackendSet(bitset_ ^ other.bitset());
+  }
+
+  bool operator==(const BackendSet& other) const {
+    return bitset_ == other.bitset();
+  }
+
+ private:
+  constexpr BackendSet(uint64_t bitset) : bitset_(bitset) {}
+  uint64_t bitset_;
+};
+
+std::ostream& operator<<(std::ostream& os, Backend backend) {
+  switch (backend) {
+    case Backend::UNDEFINED:
+      os << "Undefined";
+      break;
+    case Backend::CPU:
+      os << "CPU";
+      break;
+    case Backend::CUDA:
+      os << "CUDA";
+      break;
+    case Backend::XPU:
+      os << "XPU";
+      break;
+    case Backend::NPU:
+      os << "NPU";
+      break;
+    case Backend::MKLDNN:
+      os << "MKLDNN";
+      break;
+    case Backend::CUDNN:
+      os << "CUDNN";
+      break;
+    default:
+      // TODO(chenweihang): replace by internal enforce method later
+      throw std::runtime_error("Invalid Backend type.");
+  }
+  return os;
+}
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/tcmpt/hapi/include/tensor.h b/paddle/tcmpt/hapi/include/tensor.h
index eb64d66435c90..3c4c8728c6c11 100644
--- a/paddle/tcmpt/hapi/include/tensor.h
+++ b/paddle/tcmpt/hapi/include/tensor.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <utility>
 
 #include "paddle/tcmpt/core/tensor_interface.h"
+#include "paddle/tcmpt/core/tensor_signature.h"
 
 /**
  * [ Why still include the fluid headers? ]
@@ -138,16 +139,16 @@ class Tensor final {
   /**
    * Backend judgment APIs, shield the concept of Backend.
    */
-  bool is_cpu() const { return impl_->backend() == pt::Backend::kCPU; }
-  bool is_cuda() const { return impl_->backend() == pt::Backend::kCUDA; }
+  BackendSet backend_set() const { return signature_->backend_set; }
+
+  bool is_cpu() const;
+  bool is_cuda() const;
   bool is_hip() const;
   bool is_xpu() const;
   bool is_npu() const;
   bool is_mkldnn() const;
   bool is_cudnn() const;
 
-  bool is_selected_rows() const;
-
   /**
    * Backend convert APIs.
    */
@@ -258,7 +259,17 @@ class Tensor final {
    *    information, not Tensor data description-related information.
    * 2. Kernel calculation does not require AutogradMeta.
    */
-  std::shared_ptr<AutogradMetaInterface> autograd_meta_ = nullptr;
+  std::shared_ptr<AutogradMetaInterface> autograd_meta_{nullptr};
+
+  /**
+   * TensorSignature is used to store auxiliary description information
+   * needed by Tensor.
+   *
+   * The currently stored information includes:
+   * 1. name: used for Debug analysis in the development of new dygraph.
+   * 2. backend_set: used by the API to determine the kernel backend.
+   */
+  std::shared_ptr<TensorSignature> signature_{nullptr};
 };
 
 }  // namespace experimental
diff --git a/paddle/tcmpt/hapi/include/tensor_signature.h b/paddle/tcmpt/hapi/include/tensor_signature.h
new file mode 100644
index 0000000000000..31076758c0944
--- /dev/null
+++ b/paddle/tcmpt/hapi/include/tensor_signature.h
@@ -0,0 +1,44 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+
+#include "paddle/tcmpt/hapi/include/backend.h"
+
+namespace paddle {
+namespace experimental {
+
+struct TensorSignature final {
+ public:
+  TensorSignature() = default;
+  TensorSignature& operator=(const TensorSignature&) = delete;
+  TensorSignature& operator=(TensorSignature&&) = delete;
+  TensorSignature(const TensorSignature&) = delete;
+  TensorSignature(TensorSignature&&) = delete;
+
+  TensorSignature(const std::string& t_name) : name(t_name) {}
+  TensorSignature(const BackendSet& t_backend_set)
+      : backend_set(t_backend_set) {}
+  TensorSignature(const std::string& t_name, const BackendSet& t_backend_set)
+      : name(t_name), backend_set(t_backend_set) {}
+
+ private:
+  std::string name{""};
+  BackendSet backend_set{Backend::CPU};
+};
+
+}  // namespace experimental
+}  // namespace paddle

From 466ce03d3e718e81a066f85e97e789d94c14d636 Mon Sep 17 00:00:00 2001
From: zyfncg <1370305206@qq.com>
Date: Wed, 20 Oct 2021 10:31:22 +0800
Subject: [PATCH 093/125] Rename tcmpt to pten (#23)

* rename tcmpt to pten

* update omitted files for rename to pten

* update omitted file for rename to pten
---
 cmake/generic.cmake                           |  22 ++--
 cmake/{tcmpt.cmake => pten.cmake}             |  10 +-
 paddle/CMakeLists.txt                         |   2 +-
 paddle/fluid/framework/CMakeLists.txt         |   8 +-
 paddle/fluid/framework/operator.cc            |  24 ++--
 paddle/fluid/framework/operator.h             |  12 +-
 .../{tcmpt_utils.cc => pten_utils.cc}         | 107 +++++++++--------
 .../framework/{tcmpt_utils.h => pten_utils.h} |  22 ++--
 ...tcmpt_utils_test.cc => pten_utils_test.cc} |  19 +--
 paddle/fluid/imperative/CMakeLists.txt        |   4 +-
 paddle/fluid/imperative/prepared_operator.cc  |  29 ++---
 paddle/fluid/imperative/prepared_operator.h   |   8 +-
 paddle/fluid/inference/CMakeLists.txt         |   8 +-
 paddle/fluid/operators/CMakeLists.txt         |   4 +-
 paddle/fluid/operators/dot_op.h               |  18 +--
 paddle/fluid/operators/fill_any_like_op.h     |  16 +--
 paddle/fluid/operators/mean_op.h              |  14 +--
 paddle/fluid/operators/scale_op.h             |  16 +--
 paddle/fluid/operators/sign_op.h              |  16 +--
 paddle/fluid/platform/CMakeLists.txt          |   2 +-
 paddle/fluid/pybind/op_function_generator.cc  |   4 +-
 paddle/pten/CMakeLists.txt                    |  15 +++
 paddle/pten/api/CMakeLists.txt                |  21 ++++
 paddle/{tcmpt => pten}/api/all.cc             |   4 +-
 paddle/{tcmpt => pten}/api/all.h              |  12 +-
 paddle/{tcmpt => pten}/api/include/core.h     |  10 +-
 paddle/{tcmpt => pten}/api/include/creation.h |   4 +-
 .../{tcmpt => pten}/api/include/infershape.h  |   4 +-
 paddle/{tcmpt => pten}/api/include/linalg.h   |   4 +-
 .../api/include/manipulation.h                |   4 +-
 paddle/{tcmpt => pten}/api/include/math.h     |   4 +-
 paddle/{tcmpt => pten}/api/include/symbols.h  |   2 +-
 paddle/{tcmpt => pten}/common/data_type.h     |   2 +-
 paddle/{tcmpt => pten}/common/layout.h        |   2 +-
 paddle/{tcmpt => pten}/core/CMakeLists.txt    |   0
 paddle/{tcmpt => pten}/core/allocator.cc      |   6 +-
 paddle/{tcmpt => pten}/core/allocator.h       |  16 ++-
 paddle/{tcmpt => pten}/core/backend.cc        |   6 +-
 paddle/{tcmpt => pten}/core/backend.h         |   4 +-
 paddle/{tcmpt => pten}/core/convert_utils.cc  |  22 ++--
 paddle/{tcmpt => pten}/core/convert_utils.h   |  10 +-
 paddle/{tcmpt => pten}/core/dense_tensor.cc   |  10 +-
 paddle/{tcmpt => pten}/core/dense_tensor.h    |  25 ++--
 paddle/{tcmpt => pten}/core/kernel_context.cc |   4 +-
 paddle/{tcmpt => pten}/core/kernel_context.h  |   7 +-
 paddle/{tcmpt => pten}/core/kernel_def.h      |   4 +-
 paddle/{tcmpt => pten}/core/kernel_factory.cc |  14 ++-
 paddle/{tcmpt => pten}/core/kernel_factory.h  |  12 +-
 paddle/{tcmpt => pten}/core/kernel_registry.h | 109 +++++++++---------
 paddle/{tcmpt => pten}/core/kernel_utils.h    |  16 +--
 paddle/{tcmpt => pten}/core/scalar.h          |   4 +-
 paddle/{tcmpt => pten}/core/spatial_tensor.h  |   6 +-
 paddle/{tcmpt => pten}/core/storage.cc        |   8 +-
 paddle/{tcmpt => pten}/core/storage.h         |  18 +--
 paddle/{tcmpt => pten}/core/tensor_base.cc    |   8 +-
 paddle/{tcmpt => pten}/core/tensor_base.h     |  24 ++--
 paddle/{tcmpt => pten}/core/tensor_meta.h     |  10 +-
 paddle/{tcmpt => pten}/core/tensor_status.h   |  10 +-
 .../{tcmpt => pten}/core/utils/CMakeLists.txt |   0
 .../core/utils/intrusive_ptr.h                |  10 +-
 .../core/utils/intrusive_ref_counter.h        |   6 +-
 paddle/{tcmpt => pten}/core/utils/type_info.h |   6 +-
 .../core/utils/type_registry.h                |   8 +-
 paddle/pten/hapi/CMakeLists.txt               |   3 +
 paddle/{tcmpt => pten}/hapi/all.cc            |   2 +-
 paddle/{tcmpt => pten}/hapi/all.h             |   8 +-
 .../{tcmpt => pten}/hapi/include/creation.h   |  19 +--
 paddle/{tcmpt => pten}/hapi/include/linalg.h  |   2 +-
 .../hapi/include/manipulation.h               |   2 +-
 paddle/{tcmpt => pten}/hapi/include/math.h    |   2 +-
 paddle/{tcmpt => pten}/hapi/include/tensor.h  |  24 ++--
 paddle/pten/hapi/lib/CMakeLists.txt           |   4 +
 paddle/{tcmpt => pten}/hapi/lib/creation.cc   |  28 ++---
 .../hapi/lib/kernel_generate.h                |  24 ++--
 paddle/{tcmpt => pten}/hapi/lib/linalg.cc     |  28 ++---
 .../{tcmpt => pten}/hapi/lib/manipulation.cc  |  18 +--
 paddle/{tcmpt => pten}/hapi/lib/math.cc       |  20 ++--
 .../{tcmpt => pten}/infershape/CMakeLists.txt |   0
 paddle/{tcmpt => pten}/infershape/binary.cc   |   6 +-
 paddle/{tcmpt => pten}/infershape/binary.h    |   6 +-
 paddle/{tcmpt => pten}/infershape/unary.cc    |   6 +-
 paddle/{tcmpt => pten}/infershape/unary.h     |   6 +-
 paddle/{tcmpt => pten}/kernels/CMakeLists.txt |   2 +-
 .../kernels/common/eigen/CMakeLists.txt       |   0
 .../kernels/common/eigen/common.h             |  31 ++---
 .../kernels/common/eigen/dot.h                |  20 ++--
 .../kernels/common/eigen/fill.h               |  10 +-
 .../kernels/common/eigen/mean.h               |  12 +-
 .../kernels/common/eigen/scale.h              |  12 +-
 .../kernels/common/eigen/sign.h               |  12 +-
 .../kernels/cpu/CMakeLists.txt                |   2 +-
 .../{tcmpt => pten}/kernels/cpu/creation.cc   |  12 +-
 paddle/{tcmpt => pten}/kernels/cpu/creation.h |   8 +-
 paddle/{tcmpt => pten}/kernels/cpu/linalg.cc  |  10 +-
 paddle/{tcmpt => pten}/kernels/cpu/linalg.h   |   6 +-
 .../kernels/cpu/manipulation.cc               |  16 +--
 .../kernels/cpu/manipulation.h                |   8 +-
 paddle/{tcmpt => pten}/kernels/cpu/math.cc    |  22 ++--
 paddle/{tcmpt => pten}/kernels/cpu/math.h     |   8 +-
 paddle/{tcmpt => pten}/kernels/cpu/utils.cc   |  12 +-
 paddle/{tcmpt => pten}/kernels/cpu/utils.h    |   8 +-
 .../kernels/cuda/CMakeLists.txt               |   2 +-
 .../{tcmpt => pten}/kernels/cuda/creation.cu  |  12 +-
 .../{tcmpt => pten}/kernels/cuda/creation.h   |   8 +-
 paddle/{tcmpt => pten}/kernels/cuda/linalg.cu |  12 +-
 paddle/{tcmpt => pten}/kernels/cuda/linalg.h  |   6 +-
 .../kernels/cuda/manipulation.cu              |  16 +--
 .../kernels/cuda/manipulation.h               |   6 +-
 paddle/{tcmpt => pten}/kernels/cuda/math.cu   |  30 ++---
 paddle/{tcmpt => pten}/kernels/cuda/math.h    |   6 +-
 paddle/{tcmpt => pten}/kernels/cuda/utils.cu  |  14 +--
 paddle/{tcmpt => pten}/kernels/cuda/utils.h   |   8 +-
 .../kernels/mkldnn/CMakeLists.txt             |   0
 .../kernels/npu/CMakeLists.txt                |   0
 .../kernels/xpu/CMakeLists.txt                |   0
 paddle/{tcmpt => pten}/module/CMakeLists.txt  |   0
 paddle/{tcmpt => pten}/tests/CMakeLists.txt   |   0
 paddle/{tcmpt => pten}/tests/backend_test.cc  |   2 +-
 .../tests/dense_tensor_test.cc                |  21 ++--
 paddle/{tcmpt => pten}/tests/dtype_test.cc    |   0
 .../tests/kernel_factory_test.cc              |   7 +-
 paddle/{tcmpt => pten}/tests/layout_test.cc   |   0
 paddle/{tcmpt => pten}/tests/test_copy_api.cc |  32 ++---
 paddle/{tcmpt => pten}/tests/test_dot_api.cc  |  36 +++---
 paddle/{tcmpt => pten}/tests/test_fill_api.cc |  69 +++++------
 .../{tcmpt => pten}/tests/test_flatten_api.cc |  24 ++--
 paddle/{tcmpt => pten}/tests/test_mean_api.cc |  24 ++--
 paddle/tcmpt/CMakeLists.txt                   |  15 ---
 paddle/tcmpt/api/CMakeLists.txt               |  21 ----
 paddle/tcmpt/hapi/CMakeLists.txt              |   3 -
 paddle/tcmpt/hapi/lib/CMakeLists.txt          |   4 -
 131 files changed, 820 insertions(+), 813 deletions(-)
 rename cmake/{tcmpt.cmake => pten.cmake} (84%)
 rename paddle/fluid/framework/{tcmpt_utils.cc => pten_utils.cc} (68%)
 rename paddle/fluid/framework/{tcmpt_utils.h => pten_utils.h} (83%)
 rename paddle/fluid/framework/{tcmpt_utils_test.cc => pten_utils_test.cc} (73%)
 create mode 100644 paddle/pten/CMakeLists.txt
 create mode 100644 paddle/pten/api/CMakeLists.txt
 rename paddle/{tcmpt => pten}/api/all.cc (89%)
 rename paddle/{tcmpt => pten}/api/all.h (69%)
 rename paddle/{tcmpt => pten}/api/include/core.h (75%)
 rename paddle/{tcmpt => pten}/api/include/creation.h (87%)
 rename paddle/{tcmpt => pten}/api/include/infershape.h (88%)
 rename paddle/{tcmpt => pten}/api/include/linalg.h (88%)
 rename paddle/{tcmpt => pten}/api/include/manipulation.h (87%)
 rename paddle/{tcmpt => pten}/api/include/math.h (88%)
 rename paddle/{tcmpt => pten}/api/include/symbols.h (94%)
 rename paddle/{tcmpt => pten}/common/data_type.h (99%)
 rename paddle/{tcmpt => pten}/common/layout.h (98%)
 rename paddle/{tcmpt => pten}/core/CMakeLists.txt (100%)
 rename paddle/{tcmpt => pten}/core/allocator.cc (82%)
 rename paddle/{tcmpt => pten}/core/allocator.h (93%)
 rename paddle/{tcmpt => pten}/core/backend.cc (94%)
 rename paddle/{tcmpt => pten}/core/backend.h (97%)
 rename paddle/{tcmpt => pten}/core/convert_utils.cc (94%)
 rename paddle/{tcmpt => pten}/core/convert_utils.h (90%)
 rename paddle/{tcmpt => pten}/core/dense_tensor.cc (95%)
 rename paddle/{tcmpt => pten}/core/dense_tensor.h (88%)
 rename paddle/{tcmpt => pten}/core/kernel_context.cc (88%)
 rename paddle/{tcmpt => pten}/core/kernel_context.h (97%)
 rename paddle/{tcmpt => pten}/core/kernel_def.h (97%)
 rename paddle/{tcmpt => pten}/core/kernel_factory.cc (91%)
 rename paddle/{tcmpt => pten}/core/kernel_factory.h (97%)
 rename paddle/{tcmpt => pten}/core/kernel_registry.h (91%)
 rename paddle/{tcmpt => pten}/core/kernel_utils.h (96%)
 rename paddle/{tcmpt => pten}/core/scalar.h (97%)
 rename paddle/{tcmpt => pten}/core/spatial_tensor.h (95%)
 rename paddle/{tcmpt => pten}/core/storage.cc (85%)
 rename paddle/{tcmpt => pten}/core/storage.h (85%)
 rename paddle/{tcmpt => pten}/core/tensor_base.cc (81%)
 rename paddle/{tcmpt => pten}/core/tensor_base.h (81%)
 rename paddle/{tcmpt => pten}/core/tensor_meta.h (96%)
 rename paddle/{tcmpt => pten}/core/tensor_status.h (92%)
 rename paddle/{tcmpt => pten}/core/utils/CMakeLists.txt (100%)
 rename paddle/{tcmpt => pten}/core/utils/intrusive_ptr.h (95%)
 rename paddle/{tcmpt => pten}/core/utils/intrusive_ref_counter.h (96%)
 rename paddle/{tcmpt => pten}/core/utils/type_info.h (95%)
 rename paddle/{tcmpt => pten}/core/utils/type_registry.h (94%)
 create mode 100644 paddle/pten/hapi/CMakeLists.txt
 rename paddle/{tcmpt => pten}/hapi/all.cc (95%)
 rename paddle/{tcmpt => pten}/hapi/all.h (77%)
 rename paddle/{tcmpt => pten}/hapi/include/creation.h (56%)
 rename paddle/{tcmpt => pten}/hapi/include/linalg.h (95%)
 rename paddle/{tcmpt => pten}/hapi/include/manipulation.h (94%)
 rename paddle/{tcmpt => pten}/hapi/include/math.h (94%)
 rename paddle/{tcmpt => pten}/hapi/include/tensor.h (91%)
 create mode 100644 paddle/pten/hapi/lib/CMakeLists.txt
 rename paddle/{tcmpt => pten}/hapi/lib/creation.cc (65%)
 rename paddle/{tcmpt => pten}/hapi/lib/kernel_generate.h (86%)
 rename paddle/{tcmpt => pten}/hapi/lib/linalg.cc (69%)
 rename paddle/{tcmpt => pten}/hapi/lib/manipulation.cc (77%)
 rename paddle/{tcmpt => pten}/hapi/lib/math.cc (75%)
 rename paddle/{tcmpt => pten}/infershape/CMakeLists.txt (100%)
 rename paddle/{tcmpt => pten}/infershape/binary.cc (96%)
 rename paddle/{tcmpt => pten}/infershape/binary.h (94%)
 rename paddle/{tcmpt => pten}/infershape/unary.cc (96%)
 rename paddle/{tcmpt => pten}/infershape/unary.h (94%)
 rename paddle/{tcmpt => pten}/kernels/CMakeLists.txt (94%)
 rename paddle/{tcmpt => pten}/kernels/common/eigen/CMakeLists.txt (100%)
 rename paddle/{tcmpt => pten}/kernels/common/eigen/common.h (86%)
 rename paddle/{tcmpt => pten}/kernels/common/eigen/dot.h (72%)
 rename paddle/{tcmpt => pten}/kernels/common/eigen/fill.h (91%)
 rename paddle/{tcmpt => pten}/kernels/common/eigen/mean.h (82%)
 rename paddle/{tcmpt => pten}/kernels/common/eigen/scale.h (85%)
 rename paddle/{tcmpt => pten}/kernels/common/eigen/sign.h (84%)
 rename paddle/{tcmpt => pten}/kernels/cpu/CMakeLists.txt (89%)
 rename paddle/{tcmpt => pten}/kernels/cpu/creation.cc (84%)
 rename paddle/{tcmpt => pten}/kernels/cpu/creation.h (88%)
 rename paddle/{tcmpt => pten}/kernels/cpu/linalg.cc (92%)
 rename paddle/{tcmpt => pten}/kernels/cpu/linalg.h (93%)
 rename paddle/{tcmpt => pten}/kernels/cpu/manipulation.cc (89%)
 rename paddle/{tcmpt => pten}/kernels/cpu/manipulation.h (88%)
 rename paddle/{tcmpt => pten}/kernels/cpu/math.cc (85%)
 rename paddle/{tcmpt => pten}/kernels/cpu/math.h (91%)
 rename paddle/{tcmpt => pten}/kernels/cpu/utils.cc (89%)
 rename paddle/{tcmpt => pten}/kernels/cpu/utils.h (87%)
 rename paddle/{tcmpt => pten}/kernels/cuda/CMakeLists.txt (94%)
 rename paddle/{tcmpt => pten}/kernels/cuda/creation.cu (84%)
 rename paddle/{tcmpt => pten}/kernels/cuda/creation.h (89%)
 rename paddle/{tcmpt => pten}/kernels/cuda/linalg.cu (86%)
 rename paddle/{tcmpt => pten}/kernels/cuda/linalg.h (92%)
 rename paddle/{tcmpt => pten}/kernels/cuda/manipulation.cu (90%)
 rename paddle/{tcmpt => pten}/kernels/cuda/manipulation.h (93%)
 rename paddle/{tcmpt => pten}/kernels/cuda/math.cu (85%)
 rename paddle/{tcmpt => pten}/kernels/cuda/math.h (94%)
 rename paddle/{tcmpt => pten}/kernels/cuda/utils.cu (97%)
 rename paddle/{tcmpt => pten}/kernels/cuda/utils.h (87%)
 rename paddle/{tcmpt => pten}/kernels/mkldnn/CMakeLists.txt (100%)
 rename paddle/{tcmpt => pten}/kernels/npu/CMakeLists.txt (100%)
 rename paddle/{tcmpt => pten}/kernels/xpu/CMakeLists.txt (100%)
 rename paddle/{tcmpt => pten}/module/CMakeLists.txt (100%)
 rename paddle/{tcmpt => pten}/tests/CMakeLists.txt (100%)
 rename paddle/{tcmpt => pten}/tests/backend_test.cc (94%)
 rename paddle/{tcmpt => pten}/tests/dense_tensor_test.cc (62%)
 rename paddle/{tcmpt => pten}/tests/dtype_test.cc (100%)
 rename paddle/{tcmpt => pten}/tests/kernel_factory_test.cc (75%)
 rename paddle/{tcmpt => pten}/tests/layout_test.cc (100%)
 rename paddle/{tcmpt => pten}/tests/test_copy_api.cc (64%)
 rename paddle/{tcmpt => pten}/tests/test_dot_api.cc (67%)
 rename paddle/{tcmpt => pten}/tests/test_fill_api.cc (54%)
 rename paddle/{tcmpt => pten}/tests/test_flatten_api.cc (72%)
 rename paddle/{tcmpt => pten}/tests/test_mean_api.cc (69%)
 delete mode 100644 paddle/tcmpt/CMakeLists.txt
 delete mode 100644 paddle/tcmpt/api/CMakeLists.txt
 delete mode 100644 paddle/tcmpt/hapi/CMakeLists.txt
 delete mode 100644 paddle/tcmpt/hapi/lib/CMakeLists.txt

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 12b4530a77a4c..2004abcbfa1f2 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -116,19 +116,19 @@ function(find_fluid_modules TARGET_NAME)
   endif()
 endfunction(find_fluid_modules)
 
-set_property(GLOBAL PROPERTY TCMPT_MODULES "")
-# find all tcmpt modules is used for paddle static library
+set_property(GLOBAL PROPERTY PTEN_MODULES "")
+# find all pten modules is used for paddle static library
 # for building inference libs
-function(find_tcmpt_modules TARGET_NAME)
+function(find_pten_modules TARGET_NAME)
   get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
   string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
-  string(FIND "${__target_path}" "tcmpt" pos)
+  string(FIND "${__target_path}" "pten" pos)
   if(pos GREATER 1)
-    get_property(tcmpt_modules GLOBAL PROPERTY TCMPT_MODULES)
-    set(tcmpt_modules ${tcmpt_modules} ${TARGET_NAME})
-    set_property(GLOBAL PROPERTY TCMPT_MODULES "${tcmpt_modules}")
+    get_property(pten_modules GLOBAL PROPERTY PTEN_MODULES)
+    set(pten_modules ${pten_modules} ${TARGET_NAME})
+    set_property(GLOBAL PROPERTY PTEN_MODULES "${pten_modules}")
   endif()
-endfunction(find_tcmpt_modules)
+endfunction(find_pten_modules)
 
 function(common_link TARGET_NAME)
   if (WITH_PROFILER)
@@ -324,7 +324,7 @@ function(cc_library TARGET_NAME)
       else()
         add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
         find_fluid_modules(${TARGET_NAME})
-        find_tcmpt_modules(${TARGET_NAME})
+        find_pten_modules(${TARGET_NAME})
       endif()
     if(cc_library_DEPS)
       # Don't need link libwarpctc.so
@@ -497,7 +497,7 @@ function(nv_library TARGET_NAME)
       else()
         add_library(${TARGET_NAME} STATIC ${nv_library_SRCS})
         find_fluid_modules(${TARGET_NAME})
-        find_tcmpt_modules(${TARGET_NAME})
+        find_pten_modules(${TARGET_NAME})
       endif()
       if (nv_library_DEPS)
         add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
@@ -588,7 +588,7 @@ function(hip_library TARGET_NAME)
       else()
         hip_add_library(${TARGET_NAME} STATIC ${hip_library_SRCS})
         find_fluid_modules(${TARGET_NAME})
-        find_tcmpt_modules(${TARGET_NAME})
+        find_pten_modules(${TARGET_NAME})
       endif()
       if (hip_library_DEPS)
         add_dependencies(${TARGET_NAME} ${hip_library_DEPS})
diff --git a/cmake/tcmpt.cmake b/cmake/pten.cmake
similarity index 84%
rename from cmake/tcmpt.cmake
rename to cmake/pten.cmake
index 819cd42287974..bfe75475edcc0 100644
--- a/cmake/tcmpt.cmake
+++ b/cmake/pten.cmake
@@ -29,13 +29,13 @@ function(kernel_instantiate TARGET)
             string(REGEX MATCH "[A-Z][A-Za-z0-9]+\\(" func_name ${signature})
             string(REPLACE "(" "" func_name ${func_name})
             # message(STATUS "FUNC NAME: ${func_name}")
-            string(REGEX REPLACE "${func_name}" "pt::${func_name}<${dtype}>" inst_signature ${signature})
+            string(REGEX REPLACE "${func_name}" "pten::${func_name}<${dtype}>" inst_signature ${signature})
             # append namespace
-            string(REPLACE "CPUContext" "pt::CPUContext" inst_signature ${inst_signature})
-            string(REPLACE "CUDAContext" "pt::CUDAContext" inst_signature ${inst_signature})
-            string(REPLACE "DenseTensor" "pt::DenseTensor" inst_signature ${inst_signature})
+            string(REPLACE "CPUContext" "pten::CPUContext" inst_signature ${inst_signature})
+            string(REPLACE "CUDAContext" "pten::CUDAContext" inst_signature ${inst_signature})
+            string(REPLACE "DenseTensor" "pten::DenseTensor" inst_signature ${inst_signature})
             # TODO(chenweihang): adapt SelectedRows after adding it
-            # string(REPLACE "SelectedRowsTensor" "pt::SelectedRowsTensor" inst_signature ${inst_signature})
+            # string(REPLACE "SelectedRowsTensor" "pten::SelectedRowsTensor" inst_signature ${inst_signature})
             # message(STATUS "INST FUNC: ${inst_signature}")
             string(APPEND instantiate_context "template ${inst_signature};\n")
         endforeach()
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index ce3f6973e7a68..b3a1b2e8c9587 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -1,5 +1,5 @@
 add_subdirectory(scripts)
 add_subdirectory(testing)
 set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory")
-add_subdirectory(tcmpt)
+add_subdirectory(pten)
 add_subdirectory(fluid)
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 27f83a266ec9c..b1f23e50d31d2 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -193,10 +193,10 @@ cc_library(unused_var_check SRCS unused_var_check.cc DEPS glog no_need_buffer_va
 
 IF(WITH_XPU)
 cc_library(operator SRCS operator.cc DEPS xpu_op_list op_info device_context tensor scope glog trainer_desc_proto data_feed_proto
-    shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils tcmpt tcmpt_utils)
+    shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils pten pten_utils)
 ELSE()
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog trainer_desc_proto data_feed_proto
-    shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils tcmpt tcmpt_utils)
+    shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils pten pten_utils)
 ENDIF()
 
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
@@ -390,7 +390,7 @@ cc_library(save_load_util SRCS save_load_util.cc DEPS tensor scope layer)
 cc_test(save_load_util_test SRCS save_load_util_test.cc DEPS save_load_util tensor scope layer)
 cc_library(generator SRCS generator.cc DEPS enforce place)
 
-cc_library(tcmpt_utils SRCS tcmpt_utils.cc DEPS lod_tensor selected_rows place tcmpt var_type_traits)
+cc_library(pten_utils SRCS pten_utils.cc DEPS lod_tensor selected_rows place pten var_type_traits)
 
 # Get the current working branch
 execute_process(
@@ -454,4 +454,4 @@ if(WITH_TESTING AND TEST selected_rows_test)
 endif()
 
 cc_test(scope_guard_test SRCS scope_guard_test.cc)
-cc_test(tcmpt_utils_test SRCS tcmpt_utils_test.cc DEPS tcmpt_utils)
+cc_test(pten_utils_test SRCS pten_utils_test.cc DEPS pten_utils)
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 5a1c03327d592..d2704f046cb36 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -23,8 +23,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/details/nan_inf_utils.h"
 #include "paddle/fluid/framework/op_call_stack.h"
+#include "paddle/fluid/framework/pten_utils.h"
 #include "paddle/fluid/framework/shape_inference.h"
-#include "paddle/fluid/framework/tcmpt_utils.h"
 #include "paddle/fluid/framework/transfer_scope_cache.h"
 #include "paddle/fluid/framework/unused_var_check.h"
 #include "paddle/fluid/framework/var_type.h"
@@ -1140,7 +1140,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   // and RCOM backend, the XPU, NPU and MKLDNN will be supported in the second
   // phase
   if (FLAGS_run_pt_kernel &&
-      pt::KernelFactory::Instance().ContainsKernel(type_.c_str())) {
+      pten::KernelFactory::Instance().ContainsKernel(type_.c_str())) {
     if (pt_kernel_signature_.get() == nullptr || pt_kernel_.get() == nullptr) {
       ChoosePtKernel(exe_ctx);
     }
@@ -1286,10 +1286,11 @@ void OperatorWithKernel::ChoosePtKernel(const ExecutionContext& ctx) const {
 
   kernel_type_.reset(new OpKernelType(InnerGetExpectedKernelType(ctx)));
 
-  auto pt_kernel_name = pt::KernelName(pt_kernel_signature_->first);
+  auto pt_kernel_name = pten::KernelName(pt_kernel_signature_->first);
   auto pt_kernel_key = TransOpKernelTypeToPtKernelKey(*kernel_type_.get());
-  pt_kernel_.reset(new pt::Kernel(pt::KernelFactory::Instance().SelectKernel(
-      pt_kernel_name, pt_kernel_key)));
+  pt_kernel_.reset(
+      new pten::Kernel(pten::KernelFactory::Instance().SelectKernel(
+          pt_kernel_name, pt_kernel_key)));
 
   if (pt_kernel_->IsValid()) {
     VLOG(1) << "Static mode ChoosePtKernel - kernel name: " << pt_kernel_name
@@ -1781,7 +1782,7 @@ KernelSignature OperatorWithKernel::GetExpectedPtKernelArgs(
   }
 }
 
-pt::KernelContext OperatorWithKernel::BuildPtKernelContext(
+pten::KernelContext OperatorWithKernel::BuildPtKernelContext(
     const RuntimeContext& ctx, const platform::DeviceContext& dev_ctx) const {
   VLOG(1) << RuntimeContextDebugString(ctx);
 
@@ -1792,7 +1793,7 @@ pt::KernelContext OperatorWithKernel::BuildPtKernelContext(
   // 3. needless attributes remove
   // 4. use pt Tensor directly
   // 5. kernel input is not DenseTensor
-  pt::KernelContext op_kernel_ctx(dev_ctx);
+  pten::KernelContext op_kernel_ctx(dev_ctx);
 
   auto& input_names = std::get<0>(pt_kernel_signature_->second);
   auto& attr_names = std::get<1>(pt_kernel_signature_->second);
@@ -1826,7 +1827,7 @@ pt::KernelContext OperatorWithKernel::BuildPtKernelContext(
             << in_def.layout;
 
     auto ins_vector = ctx.inputs.at(input_names[i]);
-    std::vector<std::shared_ptr<tcmpt::TensorBase>> tmp_inputs;
+    std::vector<std::shared_ptr<pten::TensorBase>> tmp_inputs;
 
     for (auto var : ins_vector) {
       auto pt_in = framework::InputVariableToPtTensor(*var, in_def);
@@ -1839,7 +1840,7 @@ pt::KernelContext OperatorWithKernel::BuildPtKernelContext(
     auto out_def = output_defs.at(i);
     auto outs_vector = ctx.outputs.at(output_names[i]);
 
-    std::vector<std::shared_ptr<tcmpt::TensorBase>> tmp_outputs;
+    std::vector<std::shared_ptr<pten::TensorBase>> tmp_outputs;
     for (auto var : outs_vector) {
       auto pt_out = framework::OutputVariableToPtTensor(var, out_def);
       tmp_outputs.emplace_back(pt_out);
@@ -1849,12 +1850,13 @@ pt::KernelContext OperatorWithKernel::BuildPtKernelContext(
 
   for (size_t i = 0; i < attr_names.size(); ++i) {
     auto& attr = Attrs().at(attr_names[i]);
-    if (attr_defs[i].type_index == std::type_index(typeid(pt::Scalar))) {
+    if (attr_defs[i].type_index == std::type_index(typeid(pten::Scalar))) {
       // TODO(chenweihang): support other attrs later
       // TODO(zhangyunfei): Scalar should hold scaler type, and we should check
       // attribtue type by attr_defs
       if (std::type_index(attr.type()) == std::type_index(typeid(float))) {
-        op_kernel_ctx.EmplaceBackAttr(pt::Scalar(BOOST_GET_CONST(float, attr)));
+        op_kernel_ctx.EmplaceBackAttr(
+            pten::Scalar(BOOST_GET_CONST(float, attr)));
       } else {
         PADDLE_THROW(platform::errors::Unimplemented(
             "unsupported cast op attribute `%s` to Scalar when construct "
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 7581b65e3b68b..29c60877b8116 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -39,7 +39,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/variant.h"
 #include "paddle/utils/flat_hash_map.h"
 
-#include "paddle/tcmpt/api/include/core.h"
+#include "paddle/pten/api/include/core.h"
 
 namespace paddle {
 namespace framework {
@@ -531,7 +531,7 @@ class OperatorWithKernel : public OperatorBase {
     return kernel_type_->place_;
   }
 
-  /* member functions for adapting to tcmpt lib */
+  /* member functions for adapting to pten lib */
   /** In the Tensor calculation library, the new Kernel adopts a clearer and
     * more streamlined design. The arguments of the Kernel and the input and
     * output arguments registered in the original OpMaker do not match in some
@@ -582,10 +582,10 @@ class OperatorWithKernel : public OperatorBase {
   Tensor* GetTensorFormInputSafely(const ExecutionContext& ctx,
                                    const std::string& name) const;
 
-  /* member functions for adapting to tcmpt lib */
+  /* member functions for adapting to pten lib */
   void ChoosePtKernel(const ExecutionContext& ctx) const;
 
-  pt::KernelContext BuildPtKernelContext(
+  pten::KernelContext BuildPtKernelContext(
       const RuntimeContext& ctx, const platform::DeviceContext& dev_ctx) const;
 
  protected:
@@ -599,11 +599,11 @@ class OperatorWithKernel : public OperatorBase {
   mutable std::mutex cache_update_mutex_;
   mutable bool enable_cache_transfer_scope_ = false;
   // NOTE(chenweihang): Similar op members are used to adapt to
-  // new tcmpt kernel, if there is a better design in the future,
+  // new pten kernel, if there is a better design in the future,
   // we may polish the implementation here
   mutable bool run_pt_kernel_ = false;
   mutable std::unique_ptr<KernelSignature> pt_kernel_signature_;
-  mutable std::unique_ptr<pt::Kernel> pt_kernel_;
+  mutable std::unique_ptr<pten::Kernel> pt_kernel_;
 };
 
 extern bool OpSupportGPU(const std::string& op_type);
diff --git a/paddle/fluid/framework/tcmpt_utils.cc b/paddle/fluid/framework/pten_utils.cc
similarity index 68%
rename from paddle/fluid/framework/tcmpt_utils.cc
rename to paddle/fluid/framework/pten_utils.cc
index fc38eb42d74c7..22d07e0d38fdb 100644
--- a/paddle/fluid/framework/tcmpt_utils.cc
+++ b/paddle/fluid/framework/pten_utils.cc
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include <sstream>
 
-#include "paddle/fluid/framework/tcmpt_utils.h"
+#include "paddle/fluid/framework/pten_utils.h"
 
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/selected_rows.h"
@@ -26,13 +26,14 @@ namespace framework {
 
 // TODO(chenweihang, shixiaowei): adapt SelectedRows
 template <>
-std::shared_ptr<pt::DenseTensor> MakeTensorImpl<pt::DenseTensor, LoDTensor>(
-    const LoDTensor& tensor, pt::Backend backend, pt::DataType dtype,
-    pt::DataLayout layout) {
+std::shared_ptr<pten::DenseTensor> MakeTensorImpl<pten::DenseTensor, LoDTensor>(
+    const LoDTensor& tensor, pten::Backend backend,
+    paddle::experimental::DataType dtype,
+    paddle::experimental::DataLayout layout) {
   auto holder = tensor.Holder();
-  auto tensor_impl = std::make_shared<pt::DenseTensor>(
-      pt::TensorMeta(tensor.dims(), backend, dtype, layout, tensor.offset()),
-      pt::TensorStatus());
+  auto tensor_impl = std::make_shared<pten::DenseTensor>(
+      pten::TensorMeta(tensor.dims(), backend, dtype, layout, tensor.offset()),
+      pten::TensorStatus());
 
   if (holder != nullptr) {
     tensor_impl->ShareAllocation(tensor.Holder());
@@ -43,13 +44,14 @@ std::shared_ptr<pt::DenseTensor> MakeTensorImpl<pt::DenseTensor, LoDTensor>(
 }
 
 template <>
-std::shared_ptr<pt::DenseTensor> MakeTensorImpl<pt::DenseTensor, Tensor>(
-    const Tensor& tensor, pt::Backend backend, pt::DataType dtype,
-    pt::DataLayout layout) {
+std::shared_ptr<pten::DenseTensor> MakeTensorImpl<pten::DenseTensor, Tensor>(
+    const Tensor& tensor, pten::Backend backend,
+    paddle::experimental::DataType dtype,
+    paddle::experimental::DataLayout layout) {
   auto holder = tensor.Holder();
-  auto tensor_impl = std::make_shared<pt::DenseTensor>(
-      pt::TensorMeta(tensor.dims(), backend, dtype, layout, tensor.offset()),
-      pt::TensorStatus());
+  auto tensor_impl = std::make_shared<pten::DenseTensor>(
+      pten::TensorMeta(tensor.dims(), backend, dtype, layout, tensor.offset()),
+      pten::TensorStatus());
 
   if (holder != nullptr) {
     tensor_impl->ShareAllocation(tensor.Holder());
@@ -60,26 +62,26 @@ std::shared_ptr<pt::DenseTensor> MakeTensorImpl<pt::DenseTensor, Tensor>(
 }
 
 template <>
-std::shared_ptr<pt::DenseTensor> MakeTensorImpl<pt::DenseTensor>(
+std::shared_ptr<pten::DenseTensor> MakeTensorImpl<pten::DenseTensor>(
     const LoDTensor& tensor, const platform::Place& place,
     proto::VarType::Type type) {
-  return MakeTensorImpl<pt::DenseTensor, LoDTensor>(
-      tensor, pt::TransToPtBackend(place), pt::TransToPtDataType(type),
-      pt::TransToPtDataLayout(tensor.layout()));
+  return MakeTensorImpl<pten::DenseTensor, LoDTensor>(
+      tensor, pten::TransToPtBackend(place), pten::TransToPtDataType(type),
+      pten::TransToPtDataLayout(tensor.layout()));
 }
 
 template <>
-std::shared_ptr<pt::DenseTensor> MakeTensorImpl<pt::DenseTensor>(
+std::shared_ptr<pten::DenseTensor> MakeTensorImpl<pten::DenseTensor>(
     const Tensor& tensor, const platform::Place& place,
     proto::VarType::Type type) {
-  return MakeTensorImpl<pt::DenseTensor, Tensor>(
-      tensor, pt::TransToPtBackend(place), pt::TransToPtDataType(type),
-      pt::TransToPtDataLayout(tensor.layout()));
+  return MakeTensorImpl<pten::DenseTensor, Tensor>(
+      tensor, pten::TransToPtBackend(place), pten::TransToPtDataType(type),
+      pten::TransToPtDataLayout(tensor.layout()));
 }
 
-std::shared_ptr<tcmpt::TensorBase> InputVariableToPtTensor(
-    const framework::Variable& variable, const pt::TensorArgDef& arg_def) {
-  auto expected_place = pt::TransToFluidPlace(arg_def.backend);
+std::shared_ptr<pten::TensorBase> InputVariableToPtTensor(
+    const framework::Variable& variable, const pten::TensorArgDef& arg_def) {
+  auto expected_place = pten::TransToFluidPlace(arg_def.backend);
 
   if (variable.template IsType<framework::LoDTensor>()) {
     const auto& tensor = variable.template Get<framework::LoDTensor>();
@@ -87,12 +89,12 @@ std::shared_ptr<tcmpt::TensorBase> InputVariableToPtTensor(
       framework::LoDTensor tmp_tensor;
       framework::TensorCopySync(tensor, expected_place, &tmp_tensor);
       auto pt_in =
-          framework::MakeTensorImpl<pt::DenseTensor, framework::LoDTensor>(
+          framework::MakeTensorImpl<pten::DenseTensor, framework::LoDTensor>(
               tmp_tensor, arg_def.backend, arg_def.dtype, arg_def.layout);
       return pt_in;
     } else {
       auto pt_in =
-          framework::MakeTensorImpl<pt::DenseTensor, framework::LoDTensor>(
+          framework::MakeTensorImpl<pten::DenseTensor, framework::LoDTensor>(
               tensor, arg_def.backend, arg_def.dtype, arg_def.layout);
       return pt_in;
     }
@@ -105,12 +107,12 @@ std::shared_ptr<tcmpt::TensorBase> InputVariableToPtTensor(
       TensorCopySync(tensor.value(), expected_place, &tmp_tensor);
       // TODO(chenweihang): adapt SelectedRows by xiaowei's design
       auto pt_in =
-          framework::MakeTensorImpl<pt::DenseTensor, framework::Tensor>(
+          framework::MakeTensorImpl<pten::DenseTensor, framework::Tensor>(
               tmp_tensor, arg_def.backend, arg_def.dtype, arg_def.layout);
       return pt_in;
     } else {
       auto pt_in =
-          framework::MakeTensorImpl<pt::DenseTensor, framework::Tensor>(
+          framework::MakeTensorImpl<pten::DenseTensor, framework::Tensor>(
               tensor.value(), arg_def.backend, arg_def.dtype, arg_def.layout);
       return pt_in;
     }
@@ -122,27 +124,28 @@ std::shared_ptr<tcmpt::TensorBase> InputVariableToPtTensor(
   return nullptr;
 }
 
-std::shared_ptr<tcmpt::TensorBase> OutputVariableToPtTensor(
-    framework::Variable* variable, const pt::TensorArgDef& arg_def) {
+std::shared_ptr<pten::TensorBase> OutputVariableToPtTensor(
+    framework::Variable* variable, const pten::TensorArgDef& arg_def) {
   // mutable_data before run kernel, to avoid share output form
   // KernelContext to original tensor
   if (variable->template IsType<framework::LoDTensor>()) {
     auto* tensor = variable->template GetMutable<framework::LoDTensor>();
-    tensor->mutable_data(pt::TransToFluidPlace(arg_def.backend),
-                         pt::TransToProtoVarType(arg_def.dtype));
+    tensor->mutable_data(pten::TransToFluidPlace(arg_def.backend),
+                         pten::TransToProtoVarType(arg_def.dtype));
     auto pt_out =
-        framework::MakeTensorImpl<pt::DenseTensor, framework::LoDTensor>(
+        framework::MakeTensorImpl<pten::DenseTensor, framework::LoDTensor>(
             *tensor, arg_def.backend, arg_def.dtype, arg_def.layout);
     return pt_out;
   } else if (variable->template IsType<framework::SelectedRows>()) {
     auto* tensor = variable->template GetMutable<framework::SelectedRows>();
     tensor->mutable_value()->mutable_data(
-        pt::TransToFluidPlace(arg_def.backend),
-        pt::TransToProtoVarType(arg_def.dtype));
+        pten::TransToFluidPlace(arg_def.backend),
+        pten::TransToProtoVarType(arg_def.dtype));
     // TODO(chenweihang): adapt SelectedRows by xiaowei's design,
     // here the row and height will lost in output!
-    auto pt_out = framework::MakeTensorImpl<pt::DenseTensor, framework::Tensor>(
-        tensor->value(), arg_def.backend, arg_def.dtype, arg_def.layout);
+    auto pt_out =
+        framework::MakeTensorImpl<pten::DenseTensor, framework::Tensor>(
+            tensor->value(), arg_def.backend, arg_def.dtype, arg_def.layout);
     return pt_out;
   } else {
     PADDLE_THROW(platform::errors::Unimplemented(
@@ -153,14 +156,15 @@ std::shared_ptr<tcmpt::TensorBase> OutputVariableToPtTensor(
   return nullptr;
 }
 
-OpKernelType TransPtKernelKeyToOpKernelType(const pt::KernelKey& kernel_key) {
-  proto::VarType::Type data_type = pt::TransToProtoVarType(kernel_key.dtype());
-  platform::Place place = pt::TransToFluidPlace(kernel_key.backend());
-  DataLayout data_layout = pt::TransToFluidDataLayout(kernel_key.layout());
+OpKernelType TransPtKernelKeyToOpKernelType(const pten::KernelKey& kernel_key) {
+  proto::VarType::Type data_type =
+      pten::TransToProtoVarType(kernel_key.dtype());
+  platform::Place place = pten::TransToFluidPlace(kernel_key.backend());
+  DataLayout data_layout = pten::TransToFluidDataLayout(kernel_key.layout());
   LibraryType library_type = LibraryType::kPlain;
-  if (kernel_key.backend() == pt::Backend::kMKLDNN) {
+  if (kernel_key.backend() == pten::Backend::kMKLDNN) {
     library_type = LibraryType::kMKLDNN;
-  } else if (kernel_key.backend() == pt::Backend::kCUDNN) {
+  } else if (kernel_key.backend() == pten::Backend::kCUDNN) {
     library_type = LibraryType::kCUDNN;
   } else {
     // do nothing
@@ -169,18 +173,21 @@ OpKernelType TransPtKernelKeyToOpKernelType(const pt::KernelKey& kernel_key) {
   return OpKernelType(data_type, place, data_layout, library_type);
 }
 
-pt::KernelKey TransOpKernelTypeToPtKernelKey(const OpKernelType& kernel_type) {
-  pt::Backend backend = pt::TransToPtBackend(kernel_type.place_);
+pten::KernelKey TransOpKernelTypeToPtKernelKey(
+    const OpKernelType& kernel_type) {
+  pten::Backend backend = pten::TransToPtBackend(kernel_type.place_);
   if (kernel_type.library_type_ == LibraryType::kMKLDNN) {
-    backend = pt::Backend::kMKLDNN;
+    backend = pten::Backend::kMKLDNN;
   } else if (kernel_type.library_type_ == LibraryType::kCUDNN) {
-    backend = pt::Backend::kCUDNN;
+    backend = pten::Backend::kCUDNN;
   } else {
     // do
   }
-  pt::DataLayout layout = pt::TransToPtDataLayout(kernel_type.data_layout_);
-  pt::DataType dtype = pt::TransToPtDataType(kernel_type.data_type_);
-  return pt::KernelKey(backend, layout, dtype);
+  paddle::experimental::DataLayout layout =
+      pten::TransToPtDataLayout(kernel_type.data_layout_);
+  paddle::experimental::DataType dtype =
+      pten::TransToPtDataType(kernel_type.data_type_);
+  return pten::KernelKey(backend, layout, dtype);
 }
 
 KernelSignatureMap& KernelSignatureMap::Instance() {
diff --git a/paddle/fluid/framework/tcmpt_utils.h b/paddle/fluid/framework/pten_utils.h
similarity index 83%
rename from paddle/fluid/framework/tcmpt_utils.h
rename to paddle/fluid/framework/pten_utils.h
index 4d08692bd9c26..14dbe933195be 100644
--- a/paddle/fluid/framework/tcmpt_utils.h
+++ b/paddle/fluid/framework/pten_utils.h
@@ -24,7 +24,7 @@ limitations under the License. */
 #include "paddle/fluid/imperative/type_defs.h"
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/place.h"
-#include "paddle/tcmpt/api/include/core.h"
+#include "paddle/pten/api/include/core.h"
 #include "paddle/utils/flat_hash_map.h"
 #include "paddle/utils/small_vector.h"
 
@@ -34,10 +34,10 @@ namespace framework {
 /* tensor translate */
 
 template <typename PtTensorImplT, typename VariableT>
-std::shared_ptr<PtTensorImplT> MakeTensorImpl(const VariableT& tensor,
-                                              pt::Backend backend,
-                                              pt::DataType dtype,
-                                              pt::DataLayout layout);
+std::shared_ptr<PtTensorImplT> MakeTensorImpl(
+    const VariableT& tensor, pten::Backend backend,
+    paddle::experimental::DataType dtype,
+    paddle::experimental::DataLayout layout);
 
 template <typename PtTensorImplT>
 std::shared_ptr<PtTensorImplT> MakeTensorImpl(const LoDTensor& tensor,
@@ -55,15 +55,15 @@ void ShareTensorImpl(PtTensorImplT* tensor_impl, LoDTensor* out);
 template <typename PtTensorImplT>
 void ShareTensorImpl(PtTensorImplT* tensor_impl, Tensor* out);
 
-std::shared_ptr<tcmpt::TensorBase> InputVariableToPtTensor(
-    const framework::Variable& variable, const pt::TensorArgDef& arg_def);
-std::shared_ptr<tcmpt::TensorBase> OutputVariableToPtTensor(
-    framework::Variable* variable, const pt::TensorArgDef& arg_def);
+std::shared_ptr<pten::TensorBase> InputVariableToPtTensor(
+    const framework::Variable& variable, const pten::TensorArgDef& arg_def);
+std::shared_ptr<pten::TensorBase> OutputVariableToPtTensor(
+    framework::Variable* variable, const pten::TensorArgDef& arg_def);
 
 /* Kernel Key translate */
 
-OpKernelType TransPtKernelKeyToOpKernelType(const pt::KernelKey& kernel_key);
-pt::KernelKey TransOpKernelTypeToPtKernelKey(const OpKernelType& kernel_type);
+OpKernelType TransPtKernelKeyToOpKernelType(const pten::KernelKey& kernel_key);
+pten::KernelKey TransOpKernelTypeToPtKernelKey(const OpKernelType& kernel_type);
 
 /* Kernel Args parse */
 
diff --git a/paddle/fluid/framework/tcmpt_utils_test.cc b/paddle/fluid/framework/pten_utils_test.cc
similarity index 73%
rename from paddle/fluid/framework/tcmpt_utils_test.cc
rename to paddle/fluid/framework/pten_utils_test.cc
index 200bd5429cd46..96f75ac0c1121 100644
--- a/paddle/fluid/framework/tcmpt_utils_test.cc
+++ b/paddle/fluid/framework/pten_utils_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/framework/tcmpt_utils.h"
+#include "paddle/fluid/framework/pten_utils.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/selected_rows.h"
@@ -31,14 +31,14 @@ TEST(TcmptUtils, MakeTensor) {
   x.data<float>()[1] = 0.5;
 
   // 2. test API
-  auto dense_x = MakeTensorImpl<pt::DenseTensor>(x, x.place(), x.type());
+  auto dense_x = MakeTensorImpl<pten::DenseTensor>(x, x.place(), x.type());
 
   // 3. check result
   std::vector<float> expect_value = {0.2, 0.5};
   ASSERT_EQ(dense_x->data<float>()[0], expect_value[0]);
   ASSERT_EQ(dense_x->data<float>()[1], expect_value[1]);
-  ASSERT_EQ(dense_x->backend(), pt::Backend::kCPU);
-  ASSERT_EQ(dense_x->data_type(), pt::DataType::kFLOAT32);
+  ASSERT_EQ(dense_x->backend(), pten::Backend::kCPU);
+  ASSERT_EQ(dense_x->data_type(), paddle::experimental::DataType::kFLOAT32);
 }
 
 TEST(TcmptUtils, VarToPtTensor) {
@@ -49,18 +49,19 @@ TEST(TcmptUtils, VarToPtTensor) {
   auto* data =
       value->mutable_data<int>(make_ddim({1, 1}), paddle::platform::CPUPlace());
   data[0] = 123;
-  pt::Backend expect_backend = pt::Backend::kCPU;
+  pten::Backend expect_backend = pten::Backend::kCPU;
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  expect_backend = pt::Backend::kCUDA;
+  expect_backend = pten::Backend::kCUDA;
 #endif
-  auto tensor_def = pt::TensorArgDef(expect_backend, pt::DataLayout::kNCHW,
-                                     pt::DataType::kINT32);
+  auto tensor_def = pten::TensorArgDef(expect_backend,
+                                       paddle::experimental::DataLayout::kNCHW,
+                                       paddle::experimental::DataType::kINT32);
   // 2. test API
   auto tensor_x = InputVariableToPtTensor(v, tensor_def);
   // 3. check result
   ASSERT_EQ(tensor_x->backend(), expect_backend);
-  ASSERT_EQ(tensor_x->data_type(), pt::DataType::kINT32);
+  ASSERT_EQ(tensor_x->data_type(), paddle::experimental::DataType::kINT32);
 }
 
 }  // namespace framework
diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt
index 617825870301b..c45f92496b3e8 100644
--- a/paddle/fluid/imperative/CMakeLists.txt
+++ b/paddle/fluid/imperative/CMakeLists.txt
@@ -1,9 +1,9 @@
 cc_library(imperative_flag SRCS flags.cc DEPS gflags flags)
 
 IF(WITH_XPU)
-cc_library(prepared_operator SRCS prepared_operator.cc DEPS xpu_op_list proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils tcmpt_utils)
+cc_library(prepared_operator SRCS prepared_operator.cc DEPS xpu_op_list proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils pten_utils)
 ELSE()
-cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils tcmpt_utils)
+cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows var_type_traits op_kernel_type data_transform nan_inf_utils pten_utils)
 ENDIF()
 cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry)
 add_subdirectory(jit)
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index f65b799e150fc..97d893babae18 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -16,7 +16,7 @@
 
 #include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/details/nan_inf_utils.h"
-#include "paddle/fluid/framework/tcmpt_utils.h"
+#include "paddle/fluid/framework/pten_utils.h"
 #include "paddle/fluid/imperative/infer_shape_context.h"
 #ifdef PADDLE_WITH_XPU
 #include "paddle/fluid/platform/xpu/xpu_op_list.h"
@@ -109,7 +109,7 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op,
                        const framework::RuntimeContext& ctx,
                        const framework::OpKernelType& kernel_type,
                        const framework::KernelSignature& kernel_signature,
-                       const pt::Kernel& pt_kernel,
+                       const pten::Kernel& pt_kernel,
                        platform::DeviceContext* dev_ctx)
     : op_(op),
       ctx_(ctx),
@@ -152,15 +152,15 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
   VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
 
   if (FLAGS_run_pt_kernel &&
-      pt::KernelFactory::Instance().ContainsKernel(op.Type().c_str())) {
+      pten::KernelFactory::Instance().ContainsKernel(op.Type().c_str())) {
     auto pt_kernel_signature = op.GetExpectedPtKernelArgs(dygraph_exe_ctx);
 
     VLOG(1) << framework::KernelSignatureToString(pt_kernel_signature);
 
-    auto pt_kernel_name = pt::KernelName(pt_kernel_signature.first);
+    auto pt_kernel_name = pten::KernelName(pt_kernel_signature.first);
     auto pt_kernel_key = TransOpKernelTypeToPtKernelKey(expected_kernel_key);
-    auto pt_kernel = pt::KernelFactory::Instance().SelectKernel(pt_kernel_name,
-                                                                pt_kernel_key);
+    auto pt_kernel = pten::KernelFactory::Instance().SelectKernel(
+        pt_kernel_name, pt_kernel_key);
 
     if (pt_kernel.IsValid()) {
       VLOG(1) << "Dynamic mode PrepareImpl - kernel name: " << pt_kernel_name
@@ -243,9 +243,9 @@ PreparedOp PreparedOp::Prepare(const NameVarMap<VariableWrapper>& ins,
 }
 
 template <typename VarType>
-static pt::KernelContext BuildDygraphPtKernelContext(
+static pten::KernelContext BuildDygraphPtKernelContext(
     const framework::KernelSignature& pt_kernel_signature,
-    const pt::Kernel& pt_kernel, const NameVarMap<VarType>& ins,
+    const pten::Kernel& pt_kernel, const NameVarMap<VarType>& ins,
     const NameVarMap<VarType>& outs, const framework::AttributeMap& attrs,
     const framework::AttributeMap& default_attrs,
     const platform::DeviceContext& dev_ctx) {
@@ -256,7 +256,7 @@ static pt::KernelContext BuildDygraphPtKernelContext(
   // 3. needless attributes remove
   // 4. use pt Tensor directly
   // 5. kernel input is not DenseTensor
-  pt::KernelContext op_kernel_ctx(dev_ctx);
+  pten::KernelContext op_kernel_ctx(dev_ctx);
 
   auto& input_names = std::get<0>(pt_kernel_signature.second);
   auto& attr_names = std::get<1>(pt_kernel_signature.second);
@@ -288,7 +288,7 @@ static pt::KernelContext BuildDygraphPtKernelContext(
     auto& in_def = input_defs.at(i);
     auto& ins_vector = ins.at(input_names[i]);
 
-    std::vector<std::shared_ptr<tcmpt::TensorBase>> tmp_inputs;
+    std::vector<std::shared_ptr<pten::TensorBase>> tmp_inputs;
     for (auto var : ins_vector) {
       const auto& variable = var->Var();
 
@@ -302,7 +302,7 @@ static pt::KernelContext BuildDygraphPtKernelContext(
     auto& out_def = output_defs.at(i);
     auto& outs_vector = outs.at(output_names[i]);
 
-    std::vector<std::shared_ptr<tcmpt::TensorBase>> tmp_outputs;
+    std::vector<std::shared_ptr<pten::TensorBase>> tmp_outputs;
     for (auto var : outs_vector) {
       auto* variable = var->MutableVar();
 
@@ -314,12 +314,13 @@ static pt::KernelContext BuildDygraphPtKernelContext(
 
   for (size_t i = 0; i < attr_names.size(); ++i) {
     auto& attr = GetAttr(attrs, default_attrs, attr_names[i]);
-    if (attr_defs[i].type_index == std::type_index(typeid(pt::Scalar))) {
+    if (attr_defs[i].type_index == std::type_index(typeid(pten::Scalar))) {
       // TODO(chenweihang): support other attrs later
       // TODO(zhangyunfei): Scalar should hold scaler type, and we should check
       // attribtue type by attr_defs
       if (std::type_index(attr.type()) == std::type_index(typeid(float))) {
-        op_kernel_ctx.EmplaceBackAttr(pt::Scalar(BOOST_GET_CONST(float, attr)));
+        op_kernel_ctx.EmplaceBackAttr(
+            pten::Scalar(BOOST_GET_CONST(float, attr)));
       } else {
         PADDLE_THROW(platform::errors::Unimplemented(
             "unsupported cast op attribute `%s` to Scalar when construct "
@@ -391,7 +392,7 @@ template <typename VarType>
 static void PreparedOpRunPtImpl(
     const framework::OperatorBase& op,
     const framework::KernelSignature& pt_kernel_signature,
-    const pt::Kernel& pt_kernel, platform::DeviceContext* dev_ctx,
+    const pten::Kernel& pt_kernel, platform::DeviceContext* dev_ctx,
     const NameVarMap<VarType>& ins, const NameVarMap<VarType>& outs,
     const framework::AttributeMap& attrs,
     const framework::AttributeMap& default_attrs) {
diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h
index d1a47117f389b..42bd581b9f24a 100644
--- a/paddle/fluid/imperative/prepared_operator.h
+++ b/paddle/fluid/imperative/prepared_operator.h
@@ -26,7 +26,7 @@
 #include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/imperative/type_defs.h"
 
-#include "paddle/tcmpt/api/include/core.h"
+#include "paddle/pten/api/include/core.h"
 
 DECLARE_bool(use_mkldnn);
 
@@ -154,7 +154,7 @@ class PreparedOp {
              const framework::RuntimeContext& ctx,
              const framework::OpKernelType& kernel_type,
              const framework::KernelSignature& kernel_signature,
-             const pt::Kernel& pt_kernel, platform::DeviceContext* dev_ctx);
+             const pten::Kernel& pt_kernel, platform::DeviceContext* dev_ctx);
 
   static PreparedOp Prepare(const NameVarMap<VarBase>& ins,
                             const NameVarMap<VarBase>& outs,
@@ -188,11 +188,11 @@ class PreparedOp {
   framework::OperatorWithKernel::OpKernelFunc func_;
   platform::DeviceContext* dev_ctx_;
   // NOTE(chenweihang): Similar op members are used to adapt to
-  // new tcmpt kernel, if there is a better design in the future,
+  // new pten kernel, if there is a better design in the future,
   // we may polish the implementation here
   bool run_pt_kernel_{false};
   framework::KernelSignature pt_kernel_signature_;
-  pt::Kernel pt_kernel_;
+  pten::Kernel pt_kernel_;
 };
 
 }  // namespace imperative
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 3357625b74c22..09c72cb13b803 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -35,7 +35,7 @@ endif()
 
 # fluid_modules exclude API-interface of inference/api and inference/capi_exp
 get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
-get_property(tcmpt_modules GLOBAL PROPERTY TCMPT_MODULES)
+get_property(pten_modules GLOBAL PROPERTY PTEN_MODULES)
 
 # Adapt to custom op mechanism: Include the header files related to the data type
 # to avoid exposing the path of the underlying file
@@ -51,9 +51,9 @@ set(STATIC_INFERENCE_API paddle_inference_api analysis_predictor
         analysis_config paddle_pass_builder activation_functions ${mkldnn_quantizer_cfg})
 #TODO(wilber, T8T9): Do we still need to support windows gpu static library?
 if(WIN32 AND WITH_GPU)
-  cc_library(paddle_inference DEPS ${fluid_modules} ${tcmpt_modules} ${STATIC_INFERENCE_API})
+  cc_library(paddle_inference DEPS ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API})
 else()
-  create_static_lib(paddle_inference ${fluid_modules} ${tcmpt_modules} ${STATIC_INFERENCE_API})
+  create_static_lib(paddle_inference ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API})
 endif()
 
 if(NOT APPLE)
@@ -83,7 +83,7 @@ set(SHARED_INFERENCE_SRCS
     ${PADDLE_CUSTOM_OP_SRCS})
 
 # shared inference library deps
-set(SHARED_INFERENCE_DEPS ${fluid_modules} ${tcmpt_modules} analysis_predictor)
+set(SHARED_INFERENCE_DEPS ${fluid_modules} ${pten_modules} analysis_predictor)
 
 if (WITH_CRYPTO) 
     set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} paddle_crypto)
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 1ce7fd8d0f91b..bfeb2db6d885b 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -78,8 +78,8 @@ if(WITH_UNITY_BUILD)
     include(unity_build_rule.cmake)
 endif()
 
-set(OP_HEADER_DEPS ${OP_HEADER_DEPS} tcmpt)
-set(OP_HEADER_DEPS ${OP_HEADER_DEPS} tcmpt_utils)
+set(OP_HEADER_DEPS ${OP_HEADER_DEPS} pten)
+set(OP_HEADER_DEPS ${OP_HEADER_DEPS} pten_utils)
 register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op sparse_attention_op lstm_op run_program_op eye_op recurrent_op
         sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
 
diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h
index a427da4f40f9f..641b0d653d5b0 100644
--- a/paddle/fluid/operators/dot_op.h
+++ b/paddle/fluid/operators/dot_op.h
@@ -16,13 +16,13 @@
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/tcmpt_utils.h"
+#include "paddle/fluid/framework/pten_utils.h"
 #include "paddle/fluid/operators/math/complex_functors.h"
 #include "paddle/fluid/platform/for_range.h"
 
-// only can include the headers in paddle/tcmpt/api dirs
-#include "paddle/tcmpt/api/include/core.h"
-#include "paddle/tcmpt/api/include/linalg.h"
+// only can include the headers in paddle/pten/api dirs
+#include "paddle/pten/api/include/core.h"
+#include "paddle/pten/api/include/linalg.h"
 
 namespace paddle {
 namespace operators {
@@ -245,14 +245,14 @@ class DotKernel : public framework::OpKernel<T> {
     out->mutable_data<T>(x->place());
 
     auto pt_x =
-        framework::MakeTensorImpl<pt::DenseTensor>(*x, x->place(), x->type());
+        framework::MakeTensorImpl<pten::DenseTensor>(*x, x->place(), x->type());
     auto pt_y =
-        framework::MakeTensorImpl<pt::DenseTensor>(*y, y->place(), y->type());
-    auto pt_out =
-        framework::MakeTensorImpl<pt::DenseTensor>(*out, x->place(), x->type());
+        framework::MakeTensorImpl<pten::DenseTensor>(*y, y->place(), y->type());
+    auto pt_out = framework::MakeTensorImpl<pten::DenseTensor>(*out, x->place(),
+                                                               x->type());
 
     // call new kernel
-    pt::Dot<T>(dev_ctx, *pt_x.get(), *pt_y.get(), pt_out.get());
+    pten::Dot<T>(dev_ctx, *pt_x.get(), *pt_y.get(), pt_out.get());
   }
 };
 
diff --git a/paddle/fluid/operators/fill_any_like_op.h b/paddle/fluid/operators/fill_any_like_op.h
index c1c7152581ce5..73170c6e2e277 100644
--- a/paddle/fluid/operators/fill_any_like_op.h
+++ b/paddle/fluid/operators/fill_any_like_op.h
@@ -17,10 +17,10 @@ limitations under the License. */
 #include <limits>
 #include <type_traits>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/tcmpt_utils.h"
+#include "paddle/fluid/framework/pten_utils.h"
 
-#include "paddle/tcmpt/api/include/core.h"
-#include "paddle/tcmpt/api/include/creation.h"
+#include "paddle/pten/api/include/core.h"
+#include "paddle/pten/api/include/creation.h"
 
 namespace paddle {
 namespace operators {
@@ -62,14 +62,14 @@ class FillAnyLikeKernel : public framework::OpKernel<T> {
         std::isnan(value), false,
         platform::errors::InvalidArgument("The filled value is NaN."));
 
-    auto pt_x = framework::MakeTensorImpl<pt::DenseTensor>(*in, in->place(),
-                                                           in->type());
-    auto pt_out = framework::MakeTensorImpl<pt::DenseTensor>(*out, out->place(),
-                                                             out->type());
+    auto pt_x = framework::MakeTensorImpl<pten::DenseTensor>(*in, in->place(),
+                                                             in->type());
+    auto pt_out = framework::MakeTensorImpl<pten::DenseTensor>(
+        *out, out->place(), out->type());
 
     const auto& dev_ctx = context.template device_context<DeviceContext>();
     // call new kernel
-    pt::FillAnyLike<T>(dev_ctx, *pt_x, value, pt_out.get());
+    pten::FillAnyLike<T>(dev_ctx, *pt_x, value, pt_out.get());
   }
 };
 
diff --git a/paddle/fluid/operators/mean_op.h b/paddle/fluid/operators/mean_op.h
index 1ae6f453a873e..661ff41f10f85 100644
--- a/paddle/fluid/operators/mean_op.h
+++ b/paddle/fluid/operators/mean_op.h
@@ -15,11 +15,11 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/tcmpt_utils.h"
+#include "paddle/fluid/framework/pten_utils.h"
 
 // only can include the headers in paddle/top/api dirs
-#include "paddle/tcmpt/api/include/core.h"
-#include "paddle/tcmpt/api/include/math.h"
+#include "paddle/pten/api/include/core.h"
+#include "paddle/pten/api/include/math.h"
 
 namespace paddle {
 namespace operators {
@@ -62,13 +62,13 @@ class MeanKernel : public framework::OpKernel<T> {
     out->mutable_data<T>(x->place());
 
     auto pt_x =
-        framework::MakeTensorImpl<pt::DenseTensor>(*x, x->place(), x->type());
-    auto pt_out =
-        framework::MakeTensorImpl<pt::DenseTensor>(*out, x->place(), x->type());
+        framework::MakeTensorImpl<pten::DenseTensor>(*x, x->place(), x->type());
+    auto pt_out = framework::MakeTensorImpl<pten::DenseTensor>(*out, x->place(),
+                                                               x->type());
 
     // call new kernel
     VLOG(1) << "chenweihang: call original mean kernel compute.";
-    pt::Mean<T>(dev_ctx, *pt_x.get(), pt_out.get());
+    pten::Mean<T>(dev_ctx, *pt_x.get(), pt_out.get());
   }
 };
 
diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h
index ffc2a49232cd8..9a043361678b2 100644
--- a/paddle/fluid/operators/scale_op.h
+++ b/paddle/fluid/operators/scale_op.h
@@ -15,11 +15,11 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/tcmpt_utils.h"
+#include "paddle/fluid/framework/pten_utils.h"
 
 // only can include the headers in paddle/top/api dirs
-#include "paddle/tcmpt/api/include/core.h"
-#include "paddle/tcmpt/api/include/math.h"
+#include "paddle/pten/api/include/core.h"
+#include "paddle/pten/api/include/math.h"
 
 namespace paddle {
 namespace operators {
@@ -66,14 +66,14 @@ class ScaleKernel : public framework::OpKernel<T> {
     out->mutable_data<T>(in->place());
     auto& dev_ctx = ctx.device_context<DeviceContext>();
 
-    auto pt_x = framework::MakeTensorImpl<pt::DenseTensor>(*in, in->place(),
-                                                           in->type());
-    auto pt_out = framework::MakeTensorImpl<pt::DenseTensor>(*out, in->place(),
+    auto pt_x = framework::MakeTensorImpl<pten::DenseTensor>(*in, in->place(),
                                                              in->type());
+    auto pt_out = framework::MakeTensorImpl<pten::DenseTensor>(
+        *out, in->place(), in->type());
 
     // call new kernel
-    pt::Scale<T>(dev_ctx, *pt_x.get(), scale, bias, bias_after_scale,
-                 pt_out.get());
+    pten::Scale<T>(dev_ctx, *pt_x.get(), scale, bias, bias_after_scale,
+                   pt_out.get());
   }
 };
 
diff --git a/paddle/fluid/operators/sign_op.h b/paddle/fluid/operators/sign_op.h
index bb439839bd330..f3083f4937875 100644
--- a/paddle/fluid/operators/sign_op.h
+++ b/paddle/fluid/operators/sign_op.h
@@ -16,12 +16,12 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/tcmpt_utils.h"
+#include "paddle/fluid/framework/pten_utils.h"
 #include "paddle/fluid/operators/eigen/eigen_function.h"
 
-// only can include the headers in paddle/tcmpt/api dirs
-#include "paddle/tcmpt/api/include/core.h"
-#include "paddle/tcmpt/api/include/math.h"
+// only can include the headers in paddle/pten/api dirs
+#include "paddle/pten/api/include/core.h"
+#include "paddle/pten/api/include/math.h"
 
 namespace paddle {
 namespace operators {
@@ -37,12 +37,12 @@ class SignKernel : public framework::OpKernel<T> {
     out->mutable_data<T>(x->place());
 
     auto pt_x =
-        framework::MakeTensorImpl<pt::DenseTensor>(*x, x->place(), x->type());
-    auto pt_out =
-        framework::MakeTensorImpl<pt::DenseTensor>(*out, x->place(), x->type());
+        framework::MakeTensorImpl<pten::DenseTensor>(*x, x->place(), x->type());
+    auto pt_out = framework::MakeTensorImpl<pten::DenseTensor>(*out, x->place(),
+                                                               x->type());
 
     // call new kernel
-    pt::Sign<T>(dev_ctx, *pt_x.get(), pt_out.get());
+    pten::Sign<T>(dev_ctx, *pt_x.get(), pt_out.get());
   }
 };
 
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 96bcbe7d0238e..54e73c5c1d9fa 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -169,7 +169,7 @@ if(WITH_GPU)
   nv_test(device_event_test SRCS device_event_test.cc DEPS device_event_gpu)
 
   nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
-  nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda tcmpt)
+  nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda pten)
   nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda)
   nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
 endif()
diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc
index c92173b230ae6..b8b0f65eaa1ce 100644
--- a/paddle/fluid/pybind/op_function_generator.cc
+++ b/paddle/fluid/pybind/op_function_generator.cc
@@ -554,9 +554,9 @@ GenerateOpFunctions() {
     auto& op_type = op_proto->type();
     // Skip ooerator which is not inherit form OperatorWithKernel, like while,
     // since only OperatorWithKernel can run in dygraph mode.
-    // if the tcmpt lib contains op kernel, we still generate ops method
+    // if the pten lib contains op kernel, we still generate ops method
     if (!all_kernels.count(op_type) &&
-        !pt::KernelFactory::Instance().ContainsKernel(op_type.c_str())) {
+        !pten::KernelFactory::Instance().ContainsKernel(op_type.c_str())) {
       continue;
     }
 
diff --git a/paddle/pten/CMakeLists.txt b/paddle/pten/CMakeLists.txt
new file mode 100644
index 0000000000000..3bf1e6759b35a
--- /dev/null
+++ b/paddle/pten/CMakeLists.txt
@@ -0,0 +1,15 @@
+include(pten)
+# pten api
+add_subdirectory(api)
+# pten high level api
+add_subdirectory(hapi)
+# pten core components
+add_subdirectory(core)
+# pten kernels for diff device
+add_subdirectory(kernels)
+# pten infershape
+add_subdirectory(infershape)
+# TODO(xingfeng): pten inner module API designed by a high-performance team
+add_subdirectory(module)
+# pten tests
+add_subdirectory(tests)
diff --git a/paddle/pten/api/CMakeLists.txt b/paddle/pten/api/CMakeLists.txt
new file mode 100644
index 0000000000000..aabef9185f6c1
--- /dev/null
+++ b/paddle/pten/api/CMakeLists.txt
@@ -0,0 +1,21 @@
+# set(declare_file ${PADDLE_BINARY_DIR}/paddle/pten/api/symbols.h.tmp CACHE INTERNAL "symbols.h file")
+# set(declare_file_final ${PADDLE_BINARY_DIR}/paddle/pten/api/symbols.h)
+# file(WRITE ${declare_file} "// Generated by the paddle/pten/api/CMakeLists.txt.  DO NOT EDIT!\n\n")
+
+# function(declare_module TARGTE)
+#     file(APPEND ${declare_file} "extern int RegisterSymbolsFor${TARGET}();\n")
+#     message(STATUS "")
+# endfunction()
+
+# TODO(chenweihang): unify decclare into **_library
+# declare_module(MathCPU)
+# declare_module(MathCUDA)
+
+set(PTEN_DEPS convert_utils dense_tensor kernel_factory kernel_context)
+set(PTEN_DEPS ${PTEN_DEPS} math_cpu linalg_cpu creation_cpu manipulation_cpu)
+set(PTEN_DEPS ${PTEN_DEPS} unary binary)
+if(WITH_GPU OR WITH_ROCM)
+  set(PTEN_DEPS ${PTEN_DEPS} math_cuda linalg_cuda creation_cuda manipulation_cuda)
+endif()
+
+cc_library(pten SRCS all.cc DEPS ${PTEN_DEPS})
diff --git a/paddle/tcmpt/api/all.cc b/paddle/pten/api/all.cc
similarity index 89%
rename from paddle/tcmpt/api/all.cc
rename to paddle/pten/api/all.cc
index 05922e02c4998..0704d6c516fa6 100644
--- a/paddle/tcmpt/api/all.cc
+++ b/paddle/pten/api/all.cc
@@ -12,6 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/tcmpt/api/all.h"
+#include "paddle/pten/api/all.h"
 
-namespace pt {}  // namespace pt
+namespace pten {}  // namespace pten
diff --git a/paddle/tcmpt/api/all.h b/paddle/pten/api/all.h
similarity index 69%
rename from paddle/tcmpt/api/all.h
rename to paddle/pten/api/all.h
index 0f47f75f8a7fc..c760960967d95 100644
--- a/paddle/tcmpt/api/all.h
+++ b/paddle/pten/api/all.h
@@ -15,9 +15,9 @@ limitations under the License. */
 #pragma once
 
 // develop apis
-#include "paddle/tcmpt/api/include/core.h"
-#include "paddle/tcmpt/api/include/creation.h"
-#include "paddle/tcmpt/api/include/infershape.h"
-#include "paddle/tcmpt/api/include/linalg.h"
-#include "paddle/tcmpt/api/include/manipulation.h"
-#include "paddle/tcmpt/api/include/math.h"
+#include "paddle/pten/api/include/core.h"
+#include "paddle/pten/api/include/creation.h"
+#include "paddle/pten/api/include/infershape.h"
+#include "paddle/pten/api/include/linalg.h"
+#include "paddle/pten/api/include/manipulation.h"
+#include "paddle/pten/api/include/math.h"
diff --git a/paddle/tcmpt/api/include/core.h b/paddle/pten/api/include/core.h
similarity index 75%
rename from paddle/tcmpt/api/include/core.h
rename to paddle/pten/api/include/core.h
index fd863186abb30..7872580ad8d7c 100644
--- a/paddle/tcmpt/api/include/core.h
+++ b/paddle/pten/api/include/core.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #pragma once
 
 // See Note: [ How do we organize the kernel directory ]
-#include "paddle/tcmpt/core/convert_utils.h"
-#include "paddle/tcmpt/core/dense_tensor.h"
-#include "paddle/tcmpt/core/kernel_context.h"
-#include "paddle/tcmpt/core/kernel_factory.h"
-#include "paddle/tcmpt/core/scalar.h"
+#include "paddle/pten/core/convert_utils.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_context.h"
+#include "paddle/pten/core/kernel_factory.h"
+#include "paddle/pten/core/scalar.h"
diff --git a/paddle/tcmpt/api/include/creation.h b/paddle/pten/api/include/creation.h
similarity index 87%
rename from paddle/tcmpt/api/include/creation.h
rename to paddle/pten/api/include/creation.h
index 2a87453b32154..d7311e6cd283b 100644
--- a/paddle/tcmpt/api/include/creation.h
+++ b/paddle/pten/api/include/creation.h
@@ -14,5 +14,5 @@
 
 #pragma once
 
-#include "paddle/tcmpt/kernels/cpu/creation.h"
-#include "paddle/tcmpt/kernels/cuda/creation.h"
+#include "paddle/pten/kernels/cpu/creation.h"
+#include "paddle/pten/kernels/cuda/creation.h"
diff --git a/paddle/tcmpt/api/include/infershape.h b/paddle/pten/api/include/infershape.h
similarity index 88%
rename from paddle/tcmpt/api/include/infershape.h
rename to paddle/pten/api/include/infershape.h
index 01ed351fb59b2..8c1bd43aaa24e 100644
--- a/paddle/tcmpt/api/include/infershape.h
+++ b/paddle/pten/api/include/infershape.h
@@ -15,5 +15,5 @@ limitations under the License. */
 #pragma once
 
 // See Note: [ How do we organize the kernel directory ]
-#include "paddle/tcmpt/infershape/binary.h"
-#include "paddle/tcmpt/infershape/unary.h"
+#include "paddle/pten/infershape/binary.h"
+#include "paddle/pten/infershape/unary.h"
diff --git a/paddle/tcmpt/api/include/linalg.h b/paddle/pten/api/include/linalg.h
similarity index 88%
rename from paddle/tcmpt/api/include/linalg.h
rename to paddle/pten/api/include/linalg.h
index 81ea68abcd0bb..d9798c3a2e0a8 100644
--- a/paddle/tcmpt/api/include/linalg.h
+++ b/paddle/pten/api/include/linalg.h
@@ -15,5 +15,5 @@
 #pragma once
 
 // See Note: [ How do we organize the kernel directory ]
-#include "paddle/tcmpt/kernels/cpu/linalg.h"
-#include "paddle/tcmpt/kernels/cuda/linalg.h"
+#include "paddle/pten/kernels/cpu/linalg.h"
+#include "paddle/pten/kernels/cuda/linalg.h"
diff --git a/paddle/tcmpt/api/include/manipulation.h b/paddle/pten/api/include/manipulation.h
similarity index 87%
rename from paddle/tcmpt/api/include/manipulation.h
rename to paddle/pten/api/include/manipulation.h
index 1746929ca181d..f2acad9649969 100644
--- a/paddle/tcmpt/api/include/manipulation.h
+++ b/paddle/pten/api/include/manipulation.h
@@ -15,5 +15,5 @@
 #pragma once
 
 // See Note: [ How do we organize the kernel directory ]
-#include "paddle/tcmpt/kernels/cpu/manipulation.h"
-#include "paddle/tcmpt/kernels/cuda/manipulation.h"
+#include "paddle/pten/kernels/cpu/manipulation.h"
+#include "paddle/pten/kernels/cuda/manipulation.h"
diff --git a/paddle/tcmpt/api/include/math.h b/paddle/pten/api/include/math.h
similarity index 88%
rename from paddle/tcmpt/api/include/math.h
rename to paddle/pten/api/include/math.h
index ab3c229806990..5145c823a5c6e 100644
--- a/paddle/tcmpt/api/include/math.h
+++ b/paddle/pten/api/include/math.h
@@ -15,5 +15,5 @@ limitations under the License. */
 #pragma once
 
 // See Note: [ How do we organize the kernel directory ]
-#include "paddle/tcmpt/kernels/cpu/math.h"
-#include "paddle/tcmpt/kernels/cuda/math.h"
+#include "paddle/pten/kernels/cpu/math.h"
+#include "paddle/pten/kernels/cuda/math.h"
diff --git a/paddle/tcmpt/api/include/symbols.h b/paddle/pten/api/include/symbols.h
similarity index 94%
rename from paddle/tcmpt/api/include/symbols.h
rename to paddle/pten/api/include/symbols.h
index 8dc75f859ce52..1ec14a41861d8 100644
--- a/paddle/tcmpt/api/include/symbols.h
+++ b/paddle/pten/api/include/symbols.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/tcmpt/core/kernel_registry.h"
+#include "paddle/pten/core/kernel_registry.h"
 
 // symbol declare
 PT_DECLARE_MODULE(MathCPU);
diff --git a/paddle/tcmpt/common/data_type.h b/paddle/pten/common/data_type.h
similarity index 99%
rename from paddle/tcmpt/common/data_type.h
rename to paddle/pten/common/data_type.h
index 03881e6bda1ca..bd33bf70541a8 100644
--- a/paddle/tcmpt/common/data_type.h
+++ b/paddle/pten/common/data_type.h
@@ -176,6 +176,6 @@ inline DataType& operator++(DataType& dtype, int) {
 }  // namespace experimental
 }  // namespace paddle
 
-namespace pt {
+namespace pten {
 using DataType = paddle::experimental::DataType;
 }
diff --git a/paddle/tcmpt/common/layout.h b/paddle/pten/common/layout.h
similarity index 98%
rename from paddle/tcmpt/common/layout.h
rename to paddle/pten/common/layout.h
index ae4e43a9f7197..da41aaaaed33a 100644
--- a/paddle/tcmpt/common/layout.h
+++ b/paddle/pten/common/layout.h
@@ -59,6 +59,6 @@ inline DataLayout& operator++(DataLayout& layout, int) {
 }  // namespace experimental
 }  // namespace paddle
 
-namespace pt {
+namespace pten {
 using DataLayout = paddle::experimental::DataLayout;
 }
diff --git a/paddle/tcmpt/core/CMakeLists.txt b/paddle/pten/core/CMakeLists.txt
similarity index 100%
rename from paddle/tcmpt/core/CMakeLists.txt
rename to paddle/pten/core/CMakeLists.txt
diff --git a/paddle/tcmpt/core/allocator.cc b/paddle/pten/core/allocator.cc
similarity index 82%
rename from paddle/tcmpt/core/allocator.cc
rename to paddle/pten/core/allocator.cc
index da1576f81ad71..bcf03ee5acf0a 100644
--- a/paddle/tcmpt/core/allocator.cc
+++ b/paddle/pten/core/allocator.cc
@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/tcmpt/core/allocator.h"
+#include "paddle/pten/core/allocator.h"
 
-namespace paddle {
-namespace tcmpt {}  // namespace tcmpt
-}  // namespace paddle
+namespace pten {}  // namespace pten
diff --git a/paddle/tcmpt/core/allocator.h b/paddle/pten/core/allocator.h
similarity index 93%
rename from paddle/tcmpt/core/allocator.h
rename to paddle/pten/core/allocator.h
index 592f7a4078f80..b96e695a4f8cf 100644
--- a/paddle/tcmpt/core/allocator.h
+++ b/paddle/pten/core/allocator.h
@@ -17,8 +17,7 @@ limitations under the License. */
 #include <cstdint>
 #include "paddle/fluid/platform/place.h"
 
-namespace paddle {
-namespace tcmpt {
+namespace pten {
 
 /// \brief Encapsulates strategies for access/addressing, allocation/
 /// deallocation and construction/destruction of objects.
@@ -44,7 +43,7 @@ class RawAllocator {
 
   /// \brief Get the place value of the allocator and the allocation.
   /// \return The place value of the allocator and the allocation.
-  virtual const platform::Place& place() const = 0;
+  virtual const paddle::platform::Place& place() const = 0;
 };
 
 /// \brief Fancy pointer with context. The use of this data type
@@ -59,18 +58,18 @@ class Allocation final {
   Allocation(Allocation&&) = default;
   Allocation& operator=(Allocation&&) = default;
 
-  Allocation(void* data, const platform::Place& place)
+  Allocation(void* data, const paddle::platform::Place& place)
       : data_(data), place_(place) {}
 
   Allocation(void* data,
              void* ctx,
              DeleterFnPtr ctx_deleter,
-             const platform::Place& place)
+             const paddle::platform::Place& place)
       : data_(data), ctx_(ctx, ctx_deleter), place_(place) {}
 
   void* operator->() const noexcept { return data_; }
   operator bool() const noexcept { return data_ || ctx_.Get(); }
-  const platform::Place& place() const noexcept { return place_; }
+  const paddle::platform::Place& place() const noexcept { return place_; }
 
   void Clear() noexcept {
     data_ = nullptr;
@@ -133,7 +132,7 @@ class Allocation final {
   Context ctx_;
   // TODO(Shixiaowei02): Enum needs to be used instead to reduce
   // the construction overhead by more than 50%.
-  platform::Place place_;
+  paddle::platform::Place place_;
 };
 
 inline void swap(Allocation::Context& a, Allocation::Context& b) noexcept {
@@ -155,5 +154,4 @@ inline Allocation Allocate(const std::shared_ptr<Allocator>& a, size_t n) {
   return a->Allocate(n);
 }
 
-}  // namespace tcmpt
-}  // namespace paddle
+}  // namespace pten
diff --git a/paddle/tcmpt/core/backend.cc b/paddle/pten/core/backend.cc
similarity index 94%
rename from paddle/tcmpt/core/backend.cc
rename to paddle/pten/core/backend.cc
index 68c7adfcc2810..0e4029cfc38e2 100644
--- a/paddle/tcmpt/core/backend.cc
+++ b/paddle/pten/core/backend.cc
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/tcmpt/core/backend.h"
+#include "paddle/pten/core/backend.h"
 
-namespace pt {
+namespace pten {
 
 std::ostream& operator<<(std::ostream& os, Backend backend) {
   switch (backend) {
@@ -55,4 +55,4 @@ std::ostream& operator<<(std::ostream& os, Backend backend) {
   return os;
 }
 
-}  // namespace pt
+}  // namespace pten
diff --git a/paddle/tcmpt/core/backend.h b/paddle/pten/core/backend.h
similarity index 97%
rename from paddle/tcmpt/core/backend.h
rename to paddle/pten/core/backend.h
index b1ee09c177f29..c10d4bd308331 100644
--- a/paddle/tcmpt/core/backend.h
+++ b/paddle/pten/core/backend.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <ostream>
-namespace pt {
+namespace pten {
 
 /**
  * [ Why need Backend? ]
@@ -45,4 +45,4 @@ enum class Backend {
 
 std::ostream& operator<<(std::ostream& os, Backend backend);
 
-}  // namespace pt
+}  // namespace pten
diff --git a/paddle/tcmpt/core/convert_utils.cc b/paddle/pten/core/convert_utils.cc
similarity index 94%
rename from paddle/tcmpt/core/convert_utils.cc
rename to paddle/pten/core/convert_utils.cc
index e5b8acba19cf0..2320fc632c936 100644
--- a/paddle/tcmpt/core/convert_utils.cc
+++ b/paddle/pten/core/convert_utils.cc
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/tcmpt/core/convert_utils.h"
+#include "paddle/pten/core/convert_utils.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/gpu_info.h"
 
-namespace pt {
+namespace pten {
 
 // TODO(chenweihang): Add other place branchs
 Backend TransToPtBackend(const paddle::platform::Place& place) {
@@ -38,7 +38,7 @@ Backend TransToPtBackend(const paddle::platform::Place& place) {
   }
 }
 
-pt::DataType TransToPtDataType(
+paddle::experimental::DataType TransToPtDataType(
     const paddle::framework::proto::VarType::Type& dtype) {
   // Set the order of case branches according to the frequency with
   // the data type is used
@@ -90,29 +90,29 @@ DataLayout TransToPtDataLayout(const paddle::framework::DataLayout& layout) {
 paddle::platform::Place TransToFluidPlace(const Backend& backend) {
   // TODO(chenweihang): add other trans cases
   switch (backend) {
-    case pt::Backend::kCPU:
+    case pten::Backend::kCPU:
       return paddle::platform::CPUPlace();
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    case pt::Backend::kCUDA:
+    case pten::Backend::kCUDA:
       return paddle::platform::CUDAPlace(
           paddle::platform::GetCurrentDeviceId());
 #endif
 #ifdef PADDLE_WITH_XPU
-    case pt::Backend::kXPU:
+    case pten::Backend::kXPU:
       // TODO(chenweihang): add device id
       return paddle::platform::XPUPlace();
 #endif
 #ifdef PADDLE_WITH_NPU
-    case pt::Backend::kNPU:
+    case pten::Backend::kNPU:
       // TODO(chenweihang): add device id
       return paddle::platform::NPUPlace();
 #endif
 #ifdef PADDLE_WITH_MKLDNN
-    case pt::Backend::kMKLDNN:
+    case pten::Backend::kMKLDNN:
       return paddle::platform::CPUPlace();
 #endif
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    case pt::Backend::kCUDNN:
+    case pten::Backend::kCUDNN:
       return paddle::platform::CUDAPlace(
           paddle::platform::GetCurrentDeviceId());
 #endif
@@ -124,7 +124,7 @@ paddle::platform::Place TransToFluidPlace(const Backend& backend) {
 }
 
 paddle::framework::proto::VarType::Type TransToProtoVarType(
-    const pt::DataType& dtype) {
+    const paddle::experimental::DataType& dtype) {
   // Set the order of case branches according to the frequency with
   // the data type is used
   switch (dtype) {
@@ -178,4 +178,4 @@ paddle::framework::DataLayout TransToFluidDataLayout(const DataLayout& layout) {
   }
 }
 
-}  // namespace pt
+}  // namespace pten
diff --git a/paddle/tcmpt/core/convert_utils.h b/paddle/pten/core/convert_utils.h
similarity index 90%
rename from paddle/tcmpt/core/convert_utils.h
rename to paddle/pten/core/convert_utils.h
index 011652bdc9572..2c7ad35881e7c 100644
--- a/paddle/tcmpt/core/convert_utils.h
+++ b/paddle/pten/core/convert_utils.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/tcmpt/common/data_type.h"
-#include "paddle/tcmpt/common/layout.h"
-#include "paddle/tcmpt/core/backend.h"
+#include "paddle/pten/common/data_type.h"
+#include "paddle/pten/common/layout.h"
+#include "paddle/pten/core/backend.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/data_layout.h"
@@ -25,7 +25,7 @@ limitations under the License. */
 
 // TODO(chenweihang): this file may need to be removed
 
-namespace pt {
+namespace pten {
 
 using DataType = paddle::experimental::DataType;
 using DataLayout = paddle::experimental::DataLayout;
@@ -42,4 +42,4 @@ paddle::framework::proto::VarType::Type TransToProtoVarType(
     const DataType& dtype);
 paddle::framework::DataLayout TransToFluidDataLayout(const DataLayout& layout);
 
-}  // namespace pt
+}  // namespace pten
diff --git a/paddle/tcmpt/core/dense_tensor.cc b/paddle/pten/core/dense_tensor.cc
similarity index 95%
rename from paddle/tcmpt/core/dense_tensor.cc
rename to paddle/pten/core/dense_tensor.cc
index 9c34b5823d590..022127773909d 100644
--- a/paddle/tcmpt/core/dense_tensor.cc
+++ b/paddle/pten/core/dense_tensor.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/tcmpt/core/dense_tensor.h"
-#include "paddle/tcmpt/core/convert_utils.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/convert_utils.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/data_type.h"
@@ -22,7 +22,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/platform/place.h"
 
-namespace pt {
+namespace pten {
 
 using CPUPlace = paddle::platform::CPUPlace;
 using CUDAPlace = paddle::platform::CUDAPlace;
@@ -43,7 +43,7 @@ const paddle::platform::Place& DenseTensor::place() const {
 // Inner methods
 
 void DenseTensor::ShareAllocation(
-    const std::shared_ptr<Allocation>& allocation) {
+    const std::shared_ptr<paddle::memory::allocation::Allocation>& allocation) {
   // This operation can be very slow!
   // std::shared_ptr reference count is atomic. increasing or decreasing
   // the reference count requires atomic increment or decrement.
@@ -137,4 +137,4 @@ void* DenseTensor::mutable_data() {
       reinterpret_cast<uintptr_t>(allocation_->ptr()) + meta_.offset);
 }
 
-}  // namespace pt
+}  // namespace pten
diff --git a/paddle/tcmpt/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h
similarity index 88%
rename from paddle/tcmpt/core/dense_tensor.h
rename to paddle/pten/core/dense_tensor.h
index a0d195b740bed..e913440a7e663 100644
--- a/paddle/tcmpt/core/dense_tensor.h
+++ b/paddle/pten/core/dense_tensor.h
@@ -16,9 +16,9 @@ limitations under the License. */
 
 #include <memory>
 
-#include "paddle/tcmpt/core/tensor_base.h"
-#include "paddle/tcmpt/core/tensor_meta.h"
-#include "paddle/tcmpt/core/tensor_status.h"
+#include "paddle/pten/core/tensor_base.h"
+#include "paddle/pten/core/tensor_meta.h"
+#include "paddle/pten/core/tensor_status.h"
 
 namespace paddle {
 namespace memory {
@@ -28,15 +28,10 @@ class Allocation;
 }
 }
 
-namespace pt {
+namespace pten {
 
-using TensorBase = paddle::tcmpt::TensorBase;
 using DataType = paddle::experimental::DataType;
 
-// TODO(chenweihang): Allocation still link to framework, Redesign and
-// decoupled Allocation and Allocator?
-using Allocation = paddle::memory::allocation::Allocation;
-
 /**
  * The implementation of general Tensor (For CPU, CUDA, HIP, etc.), similar
  * to the Tensor in fluid, contains a pointer to Allocation and a series of
@@ -92,7 +87,10 @@ class DenseTensor : public TensorBase {
 
   /* member methods */
 
-  const std::shared_ptr<Allocation>& allocation() const { return allocation_; }
+  const std::shared_ptr<paddle::memory::allocation::Allocation>& allocation()
+      const {
+    return allocation_;
+  }
 
   const TensorMeta& meta() const { return meta_; }
 
@@ -131,7 +129,8 @@ class DenseTensor : public TensorBase {
 
   void Resize(const DDim& dims) { meta_.dims = dims; }
 
-  void ShareAllocation(const std::shared_ptr<Allocation>& allocation);
+  void ShareAllocation(const std::shared_ptr<
+                       paddle::memory::allocation::Allocation>& allocation);
 
   paddle::platform::Place GetPlaceByBackend() const;
 
@@ -141,11 +140,11 @@ class DenseTensor : public TensorBase {
 
  private:
   // The actual Tensor storage holder
-  std::shared_ptr<Allocation> allocation_;
+  std::shared_ptr<paddle::memory::allocation::Allocation> allocation_;
   // The Tensor meta data
   TensorMeta meta_;
   // The Tensor status data
   TensorStatus status_;
 };
 
-}  // namespace pt
+}  // namespace pten
diff --git a/paddle/tcmpt/core/kernel_context.cc b/paddle/pten/core/kernel_context.cc
similarity index 88%
rename from paddle/tcmpt/core/kernel_context.cc
rename to paddle/pten/core/kernel_context.cc
index 5bfcaf137fedf..443990c07247d 100644
--- a/paddle/tcmpt/core/kernel_context.cc
+++ b/paddle/pten/core/kernel_context.cc
@@ -12,6 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/tcmpt/core/kernel_context.h"
+#include "paddle/pten/core/kernel_context.h"
 
-namespace pt {}  // namespace pt
+namespace pten {}  // namespace pten
diff --git a/paddle/tcmpt/core/kernel_context.h b/paddle/pten/core/kernel_context.h
similarity index 97%
rename from paddle/tcmpt/core/kernel_context.h
rename to paddle/pten/core/kernel_context.h
index 022d8a6713155..c17248831c10e 100644
--- a/paddle/tcmpt/core/kernel_context.h
+++ b/paddle/pten/core/kernel_context.h
@@ -16,17 +16,16 @@
 
 #include <utility>
 
-#include "paddle/tcmpt/core/tensor_base.h"
+#include "paddle/pten/core/tensor_base.h"
 #include "paddle/utils/any.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 
-namespace pt {
+namespace pten {
 
 using DeviceContext = paddle::platform::DeviceContext;
-using TensorBase = paddle::tcmpt::TensorBase;
 using DataType = paddle::experimental::DataType;
 using DataLayout = paddle::experimental::DataLayout;
 
@@ -132,4 +131,4 @@ class KernelContext {
   std::vector<std::string> output_names_{{}};
 };
 
-}  // namespace pt
+}  // namespace pten
diff --git a/paddle/tcmpt/core/kernel_def.h b/paddle/pten/core/kernel_def.h
similarity index 97%
rename from paddle/tcmpt/core/kernel_def.h
rename to paddle/pten/core/kernel_def.h
index 70b8be19aaeea..48a579cd02b51 100644
--- a/paddle/tcmpt/core/kernel_def.h
+++ b/paddle/pten/core/kernel_def.h
@@ -14,7 +14,7 @@
 
 #pragma once
 
-namespace pt {
+namespace pten {
 
 class Kernel;
 class KernelKey;
@@ -39,4 +39,4 @@ constexpr char kContainSelectedRowsSuffix[] = "sr";
 
 // For kernels with intermediate output
 constexpr char kContainMidOutputTensorSuffix[] = "mid";
-}  // namespace pt
+}  // namespace pten
diff --git a/paddle/tcmpt/core/kernel_factory.cc b/paddle/pten/core/kernel_factory.cc
similarity index 91%
rename from paddle/tcmpt/core/kernel_factory.cc
rename to paddle/pten/core/kernel_factory.cc
index a301d6a995ce7..243808c67b843 100644
--- a/paddle/tcmpt/core/kernel_factory.cc
+++ b/paddle/pten/core/kernel_factory.cc
@@ -12,12 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/tcmpt/core/kernel_factory.h"
+#include "paddle/pten/core/kernel_factory.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/enforce.h"
 
-namespace pt {
+namespace pten {
 
 KernelFactory& KernelFactory::Instance() {
   static KernelFactory g_op_kernel_factory;
@@ -51,9 +51,11 @@ const Kernel& KernelFactory::SelectKernelOrThrowError(
                         "The kernel `%s` is not registered.", kernel_name));
 
   auto kernel_iter = iter->second.find(kernel_key);
-  if (kernel_key.layout() != pt::DataLayout::kAny) {
-    pt::KernelKey any_layout_kernel_key(
-        kernel_key.backend(), pt::DataLayout::kAny, kernel_key.dtype());
+  if (kernel_key.layout() != paddle::experimental::DataLayout::kAny) {
+    pten::KernelKey any_layout_kernel_key(
+        kernel_key.backend(),
+        paddle::experimental::DataLayout::kAny,
+        kernel_key.dtype());
     kernel_iter = iter->second.find(any_layout_kernel_key);
   }
   PADDLE_ENFORCE_NE(
@@ -98,4 +100,4 @@ std::ostream& operator<<(std::ostream& os, KernelFactory& kernel_factory) {
   return os;
 }
 
-}  // namespace pt
+}  // namespace pten
diff --git a/paddle/tcmpt/core/kernel_factory.h b/paddle/pten/core/kernel_factory.h
similarity index 97%
rename from paddle/tcmpt/core/kernel_factory.h
rename to paddle/pten/core/kernel_factory.h
index 6e4a3fa86dfda..32c8462585878 100644
--- a/paddle/tcmpt/core/kernel_factory.h
+++ b/paddle/pten/core/kernel_factory.h
@@ -19,17 +19,17 @@
 #include <unordered_map>
 #include <utility>
 
-#include "paddle/tcmpt/common/data_type.h"
-#include "paddle/tcmpt/common/layout.h"
-#include "paddle/tcmpt/core/backend.h"
-#include "paddle/tcmpt/core/kernel_def.h"
+#include "paddle/pten/common/data_type.h"
+#include "paddle/pten/common/layout.h"
+#include "paddle/pten/core/backend.h"
+#include "paddle/pten/core/kernel_def.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/utils/flat_hash_map.h"
 #include "paddle/utils/small_vector.h"
 
-namespace pt {
+namespace pten {
 
 using DataType = paddle::experimental::DataType;
 using DataLayout = paddle::experimental::DataLayout;
@@ -323,4 +323,4 @@ std::ostream& operator<<(std::ostream& os, const Kernel& kernel);
 
 std::ostream& operator<<(std::ostream& os, KernelFactory& kernel_factory);
 
-}  // namespace pt
+}  // namespace pten
diff --git a/paddle/tcmpt/core/kernel_registry.h b/paddle/pten/core/kernel_registry.h
similarity index 91%
rename from paddle/tcmpt/core/kernel_registry.h
rename to paddle/pten/core/kernel_registry.h
index caa42546ab054..666b700a671b9 100644
--- a/paddle/tcmpt/core/kernel_registry.h
+++ b/paddle/pten/core/kernel_registry.h
@@ -20,15 +20,15 @@
 #include <typeinfo>
 #include <vector>
 
-#include "paddle/tcmpt/core/kernel_def.h"
-#include "paddle/tcmpt/core/kernel_factory.h"
-#include "paddle/tcmpt/core/kernel_utils.h"
+#include "paddle/pten/core/kernel_def.h"
+#include "paddle/pten/core/kernel_factory.h"
+#include "paddle/pten/core/kernel_utils.h"
 
-namespace pt {
+namespace pten {
 
-#define BACKEND(arg__) pt::Backend::k##arg__
-#define DATALAYOUT(arg__) pt::DataLayout::k##arg__
-#define DATATYPE(arg__) pt::DataType::k##arg__
+#define BACKEND(arg__) pten::Backend::k##arg__
+#define DATALAYOUT(arg__) paddle::experimental::DataLayout::k##arg__
+#define DATATYPE(arg__) paddle::experimental::DataType::k##arg__
 
 template <typename Func>
 struct KernelArgsParseFunctor;
@@ -45,8 +45,8 @@ struct KernelArgsParseFunctor<Return_ (*)(Args_...)> {
     // TODO(chenweihang): The fluid Tensor's default layout is NCHW,
     // it is not same as kernel's layout, we should fix this error on
     // fluid Tensor
-    auto default_tensor_layout = pt::DataLayout::kNCHW;
-    if (default_key.layout() != pt::DataLayout::kAny) {
+    auto default_tensor_layout = paddle::experimental::DataLayout::kNCHW;
+    if (default_key.layout() != paddle::experimental::DataLayout::kAny) {
       default_tensor_layout = default_key.layout();
     }
     auto args_type = ParseArgType(Indices{});
@@ -216,7 +216,7 @@ struct KernelRegistrar {
       "PT_REGISTER_KERNEL must be called in global namespace.");               \
   PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, __VA_ARGS__);             \
   static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_,                         \
-                             func_id)(::pt::Kernel*);                          \
+                             func_id)(::pten::Kernel*);                        \
   PT_KERNEL_REGISTRAR_INIT(kernel_name,                                        \
                            func_id,                                            \
                            backend,                                            \
@@ -225,7 +225,8 @@ struct KernelRegistrar {
                            meta_kernel_fn,                                     \
                            cpp_dtype,                                          \
                            __VA_ARGS__);                                       \
-  void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id)(::pt::Kernel * kernel)
+  void PT_CONCATENATE(__PT_KERNEL_args_def_FN_,                                \
+                      func_id)(::pten::Kernel * kernel)
 #else
 #define _PT_REGISTER_KERNEL(                                                   \
     kernel_name, func_id, backend, layout, meta_kernel_fn, cpp_dtype, ...)     \
@@ -233,7 +234,7 @@ struct KernelRegistrar {
       PT_CONCATENATE(pt_op_kernel_ns_check_, func_id),                         \
       "PT_REGISTER_KERNEL must be called in global namespace.");               \
   static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_,                         \
-                             func_id)(::pt::Kernel*);                          \
+                             func_id)(::pten::Kernel*);                        \
   PT_KERNEL_REGISTRAR_INIT(kernel_name,                                        \
                            func_id,                                            \
                            backend,                                            \
@@ -242,7 +243,8 @@ struct KernelRegistrar {
                            meta_kernel_fn,                                     \
                            cpp_dtype,                                          \
                            __VA_ARGS__);                                       \
-  void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id)(::pt::Kernel * kernel)
+  void PT_CONCATENATE(__PT_KERNEL_args_def_FN_,                                \
+                      func_id)(::pten::Kernel * kernel)
 #endif
 
 #define PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, ...) \
@@ -345,13 +347,13 @@ struct KernelRegistrar {
                                     meta_kernel_fn,                 \
                                     cpp_dtype,                      \
                                     ...)                            \
-  static const ::pt::KernelRegistrar PT_CONCATENATE(                \
+  static const ::pten::KernelRegistrar PT_CONCATENATE(              \
       __reg_pt_op_kernel_##func_id##_, registrar_id)(               \
       kernel_name,                                                  \
       BACKEND(backend),                                             \
       DATALAYOUT(layout),                                           \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
-      ::pt::KernelArgsParseFunctor<decltype(                        \
+      ::pten::KernelArgsParseFunctor<decltype(                      \
           &meta_kernel_fn<cpp_dtype>)>::Parse,                      \
       args_def_fn,                                                  \
       PT_KERNEL(meta_kernel_fn<cpp_dtype>));
@@ -364,13 +366,13 @@ struct KernelRegistrar {
                                     meta_kernel_fn,                 \
                                     cpp_dtype,                      \
                                     ...)                            \
-  static const ::pt::KernelRegistrar PT_CONCATENATE(                \
+  static const ::pten::KernelRegistrar PT_CONCATENATE(              \
       __reg_pt_op_kernel_##func_id##_, registrar_id)(               \
       kernel_name,                                                  \
       BACKEND(backend),                                             \
       DATALAYOUT(layout),                                           \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
-      ::pt::KernelArgsParseFunctor<decltype(                        \
+      ::pten::KernelArgsParseFunctor<decltype(                      \
           &meta_kernel_fn<cpp_dtype>)>::Parse,                      \
       args_def_fn,                                                  \
       PT_KERNEL(meta_kernel_fn<cpp_dtype>));                        \
@@ -391,13 +393,13 @@ struct KernelRegistrar {
                                     meta_kernel_fn,                 \
                                     cpp_dtype,                      \
                                     ...)                            \
-  static const ::pt::KernelRegistrar PT_CONCATENATE(                \
+  static const ::pten::KernelRegistrar PT_CONCATENATE(              \
       __reg_pt_op_kernel_##func_id##_, registrar_id)(               \
       kernel_name,                                                  \
       BACKEND(backend),                                             \
       DATALAYOUT(layout),                                           \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
-      ::pt::KernelArgsParseFunctor<decltype(                        \
+      ::pten::KernelArgsParseFunctor<decltype(                      \
           &meta_kernel_fn<cpp_dtype>)>::Parse,                      \
       args_def_fn,                                                  \
       PT_KERNEL(meta_kernel_fn<cpp_dtype>));                        \
@@ -418,13 +420,13 @@ struct KernelRegistrar {
                                     meta_kernel_fn,                 \
                                     cpp_dtype,                      \
                                     ...)                            \
-  static const ::pt::KernelRegistrar PT_CONCATENATE(                \
+  static const ::pten::KernelRegistrar PT_CONCATENATE(              \
       __reg_pt_op_kernel_##func_id##_, registrar_id)(               \
       kernel_name,                                                  \
       BACKEND(backend),                                             \
       DATALAYOUT(layout),                                           \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
-      ::pt::KernelArgsParseFunctor<decltype(                        \
+      ::pten::KernelArgsParseFunctor<decltype(                      \
           &meta_kernel_fn<cpp_dtype>)>::Parse,                      \
       args_def_fn,                                                  \
       PT_KERNEL(meta_kernel_fn<cpp_dtype>));                        \
@@ -445,13 +447,13 @@ struct KernelRegistrar {
                                     meta_kernel_fn,                 \
                                     cpp_dtype,                      \
                                     ...)                            \
-  static const ::pt::KernelRegistrar PT_CONCATENATE(                \
+  static const ::pten::KernelRegistrar PT_CONCATENATE(              \
       __reg_pt_op_kernel_##func_id##_, registrar_id)(               \
       kernel_name,                                                  \
       BACKEND(backend),                                             \
       DATALAYOUT(layout),                                           \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
-      ::pt::KernelArgsParseFunctor<decltype(                        \
+      ::pten::KernelArgsParseFunctor<decltype(                      \
           &meta_kernel_fn<cpp_dtype>)>::Parse,                      \
       args_def_fn,                                                  \
       PT_KERNEL(meta_kernel_fn<cpp_dtype>));                        \
@@ -472,13 +474,13 @@ struct KernelRegistrar {
                                     meta_kernel_fn,                 \
                                     cpp_dtype,                      \
                                     ...)                            \
-  static const ::pt::KernelRegistrar PT_CONCATENATE(                \
+  static const ::pten::KernelRegistrar PT_CONCATENATE(              \
       __reg_pt_op_kernel_##func_id##_, registrar_id)(               \
       kernel_name,                                                  \
       BACKEND(backend),                                             \
       DATALAYOUT(layout),                                           \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
-      ::pt::KernelArgsParseFunctor<decltype(                        \
+      ::pten::KernelArgsParseFunctor<decltype(                      \
           &meta_kernel_fn<cpp_dtype>)>::Parse,                      \
       args_def_fn,                                                  \
       PT_KERNEL(meta_kernel_fn<cpp_dtype>));                        \
@@ -499,13 +501,13 @@ struct KernelRegistrar {
                                     meta_kernel_fn,                 \
                                     cpp_dtype,                      \
                                     ...)                            \
-  static const ::pt::KernelRegistrar PT_CONCATENATE(                \
+  static const ::pten::KernelRegistrar PT_CONCATENATE(              \
       __reg_pt_op_kernel_##func_id##_, registrar_id)(               \
       kernel_name,                                                  \
       BACKEND(backend),                                             \
       DATALAYOUT(layout),                                           \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
-      ::pt::KernelArgsParseFunctor<decltype(                        \
+      ::pten::KernelArgsParseFunctor<decltype(                      \
           &meta_kernel_fn<cpp_dtype>)>::Parse,                      \
       args_def_fn,                                                  \
       PT_KERNEL(meta_kernel_fn<cpp_dtype>));                        \
@@ -526,13 +528,13 @@ struct KernelRegistrar {
                                     meta_kernel_fn,                 \
                                     cpp_dtype,                      \
                                     ...)                            \
-  static const ::pt::KernelRegistrar PT_CONCATENATE(                \
+  static const ::pten::KernelRegistrar PT_CONCATENATE(              \
       __reg_pt_op_kernel_##func_id##_, registrar_id)(               \
       kernel_name,                                                  \
       BACKEND(backend),                                             \
       DATALAYOUT(layout),                                           \
       ::paddle::experimental::CppTypeToDataType<cpp_dtype>::Type(), \
-      ::pt::KernelArgsParseFunctor<decltype(                        \
+      ::pten::KernelArgsParseFunctor<decltype(                      \
           &meta_kernel_fn<cpp_dtype>)>::Parse,                      \
       args_def_fn,                                                  \
       PT_KERNEL(meta_kernel_fn<cpp_dtype>));                        \
@@ -557,17 +559,17 @@ struct KernelRegistrar {
       "_PT_REGISTER_KERNEL_STANDARD must be called in global namespace."); \
   template decltype(kernel_fn) kernel_fn;                                  \
   static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_,                     \
-                             func_id)(::pt::Kernel*);                      \
-  static const ::pt::KernelRegistrar PT_CONCATENATE(__reg_pt_op_kernel_,   \
-                                                    func_id)(              \
+                             func_id)(::pten::Kernel*);                    \
+  static const ::pten::KernelRegistrar PT_CONCATENATE(__reg_pt_op_kernel_, \
+                                                      func_id)(            \
       kernel_name,                                                         \
       BACKEND(backend),                                                    \
       DATALAYOUT(layout),                                                  \
       DATATYPE(dtype),                                                     \
-      ::pt::KernelArgsParseFunctor<decltype(&kernel_fn)>::Parse,           \
+      ::pten::KernelArgsParseFunctor<decltype(&kernel_fn)>::Parse,         \
       args_def_fn,                                                         \
       PT_KERNEL(kernel_fn));                                               \
-  void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id)(::pt::Kernel*)
+  void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id)(::pten::Kernel*)
 
 // use to declare symbol
 #define PT_REGISTER_MODULE(name) \
@@ -595,7 +597,7 @@ struct KernelRegistrar {
       PT_CONCATENATE(pt_op_kernel_for_test_ns_check_, func_id),            \
       "PT_REGISTER_KERNEL must be called in global namespace.");           \
   static void PT_CONCATENATE(__PT_KERNEL_for_test_args_def_FN_,            \
-                             func_id)(::pt::Kernel*);                      \
+                             func_id)(::pten::Kernel*);                    \
   PT_KERNEL_REGISTRAR_INIT(                                                \
       kernel_name,                                                         \
       func_id,                                                             \
@@ -606,27 +608,28 @@ struct KernelRegistrar {
       cpp_dtype,                                                           \
       __VA_ARGS__);                                                        \
   void PT_CONCATENATE(__PT_KERNEL_for_test_args_def_FN_,                   \
-                      func_id)(::pt::Kernel * kernel)
+                      func_id)(::pten::Kernel * kernel)
 
 #define PT_REGISTER_KERNEL_WITH_NO_TYPE(          \
     kernel_name, backend, layout, meta_kernel_fn) \
   _PT_REGISTER_KERNEL_WITH_NO_TYPE(               \
       kernel_name, PT_ID, backend, layout, meta_kernel_fn)
 
-#define _PT_REGISTER_KERNEL_WITH_NO_TYPE(                             \
-    kernel_name, func_id, backend, layout, meta_kernel_fn)            \
-  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                  \
-      PT_CONCATENATE(pt_op_kernel_ns_check_, func_id),                \
-      "PT_REGISTER_KERNEL must be called in global namespace.");      \
-  decltype(meta_kernel_fn) meta_kernel_fn;                            \
-  static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_,                \
-                             func_id)(::pt::Kernel*);                 \
-  static const ::pt::KernelRegistrar __reg_pt_op_kernel_##func_id(    \
-      kernel_name,                                                    \
-      BACKEND(backend),                                               \
-      DATALAYOUT(layout),                                             \
-      ::pt::KernelArgsParseFunctor<decltype(&meta_kernel_fn)>::Parse, \
-      &PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id),             \
-      PT_KERNEL(meta_kernel_fn));                                     \
-  void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id)(::pt::Kernel * kernel)
-}  // namespace pt
+#define _PT_REGISTER_KERNEL_WITH_NO_TYPE(                               \
+    kernel_name, func_id, backend, layout, meta_kernel_fn)              \
+  PT_STATIC_ASSERT_GLOBAL_NAMESPACE(                                    \
+      PT_CONCATENATE(pt_op_kernel_ns_check_, func_id),                  \
+      "PT_REGISTER_KERNEL must be called in global namespace.");        \
+  decltype(meta_kernel_fn) meta_kernel_fn;                              \
+  static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_,                  \
+                             func_id)(::pten::Kernel*);                 \
+  static const ::pten::KernelRegistrar __reg_pt_op_kernel_##func_id(    \
+      kernel_name,                                                      \
+      BACKEND(backend),                                                 \
+      DATALAYOUT(layout),                                               \
+      ::pten::KernelArgsParseFunctor<decltype(&meta_kernel_fn)>::Parse, \
+      &PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id),               \
+      PT_KERNEL(meta_kernel_fn));                                       \
+  void PT_CONCATENATE(__PT_KERNEL_args_def_FN_,                         \
+                      func_id)(::pten::Kernel * kernel)
+}  // namespace pten
diff --git a/paddle/tcmpt/core/kernel_utils.h b/paddle/pten/core/kernel_utils.h
similarity index 96%
rename from paddle/tcmpt/core/kernel_utils.h
rename to paddle/pten/core/kernel_utils.h
index 54d3d373da7c7..3f8458aed6dfc 100644
--- a/paddle/tcmpt/core/kernel_utils.h
+++ b/paddle/pten/core/kernel_utils.h
@@ -14,16 +14,16 @@
 
 #pragma once
 
-#include "paddle/tcmpt/core/dense_tensor.h"
-#include "paddle/tcmpt/core/kernel_context.h"
-#include "paddle/tcmpt/core/kernel_def.h"
-#include "paddle/tcmpt/core/scalar.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_context.h"
+#include "paddle/pten/core/kernel_def.h"
+#include "paddle/pten/core/scalar.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 
-namespace pt {
+namespace pten {
 
 // TODO(shixiaowei): replaced by new DeviceContext later
 using CPUContext = paddle::platform::CPUDeviceContext;
@@ -41,7 +41,7 @@ using XPUContext = paddle::platform::XPUDeviceContext;
 #endif
 
 #define PT_KERNEL(...) \
-  ::pt::KernelImpl<decltype(&__VA_ARGS__), &__VA_ARGS__>::Compute
+  ::pten::KernelImpl<decltype(&__VA_ARGS__), &__VA_ARGS__>::Compute
 
 #define PT_SPECIALIZE_KernelCallHelper_FOR_DEVICE_CONTEXT(dev_ctx)           \
   template <typename... Tail>                                                \
@@ -163,7 +163,7 @@ struct KernelImpl<Return (*)(Args...), kernel_fn> {
   PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int);
   PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int64_t);
   PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(paddle::platform::float16);
-  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const pt::Scalar&);
+  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const pten::Scalar&);
 
   /* Output Helpers */
 
@@ -185,4 +185,4 @@ struct KernelImpl<Return (*)(Args...), kernel_fn> {
   };
 };
 
-}  // namespace pt
+}  // namespace pten
diff --git a/paddle/tcmpt/core/scalar.h b/paddle/pten/core/scalar.h
similarity index 97%
rename from paddle/tcmpt/core/scalar.h
rename to paddle/pten/core/scalar.h
index 8f30d81bcfb28..f8cdd43cc5e4c 100644
--- a/paddle/tcmpt/core/scalar.h
+++ b/paddle/pten/core/scalar.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-namespace pt {
+namespace pten {
 
 class Scalar {
  public:
@@ -60,4 +60,4 @@ class Scalar {
   } data_;
 };
 
-}  // namespace pt
+}  // namespace pten
diff --git a/paddle/tcmpt/core/spatial_tensor.h b/paddle/pten/core/spatial_tensor.h
similarity index 95%
rename from paddle/tcmpt/core/spatial_tensor.h
rename to paddle/pten/core/spatial_tensor.h
index 0e5bdd8be50a3..f1bd4add19771 100644
--- a/paddle/tcmpt/core/spatial_tensor.h
+++ b/paddle/pten/core/spatial_tensor.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/tcmpt/core/tensor_base.h"
+#include "paddle/pten/core/tensor_base.h"
 
-namespace pt {
+namespace pten {
 
 /**
  * SpatialTensor represents a Tensor whose memory layout is different from
@@ -48,4 +48,4 @@ class MetalTensor : public SpatialTensor<AllocationType> {};
 template <typename AllocationType>
 class OpenCLTensor : public SpatialTensor<AllocationType> {};
 
-}  // namespace pt
+}  // namespace pten
diff --git a/paddle/tcmpt/core/storage.cc b/paddle/pten/core/storage.cc
similarity index 85%
rename from paddle/tcmpt/core/storage.cc
rename to paddle/pten/core/storage.cc
index 02fbea8d0b3a1..5cac122b7dee6 100644
--- a/paddle/tcmpt/core/storage.cc
+++ b/paddle/pten/core/storage.cc
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/tcmpt/core/storage.h"
+#include "paddle/pten/core/storage.h"
 
-namespace paddle {
-namespace tcmpt {
+namespace pten {
 
 void TensorStorage::Realloc(size_t size) {
   data_.Clear();
@@ -23,5 +22,4 @@ void TensorStorage::Realloc(size_t size) {
   size_ = size;
 }
 
-}  // namespace tcmpt
-}  // namespace paddle
+}  // namespace pten
diff --git a/paddle/tcmpt/core/storage.h b/paddle/pten/core/storage.h
similarity index 85%
rename from paddle/tcmpt/core/storage.h
rename to paddle/pten/core/storage.h
index d838d0cd1c957..b1c6de7fff8f6 100644
--- a/paddle/tcmpt/core/storage.h
+++ b/paddle/pten/core/storage.h
@@ -17,14 +17,13 @@ limitations under the License. */
 #include <cstddef>
 
 #include "boost/intrusive_ptr.hpp"
-#include "paddle/tcmpt/core/utils/intrusive_ptr.h"
-#include "paddle/tcmpt/core/utils/intrusive_ref_counter.h"
+#include "paddle/pten/core/utils/intrusive_ptr.h"
+#include "paddle/pten/core/utils/intrusive_ref_counter.h"
 
 #include "paddle/fluid/platform/place.h"
-#include "paddle/tcmpt/core/allocator.h"
+#include "paddle/pten/core/allocator.h"
 
-namespace paddle {
-namespace tcmpt {
+namespace pten {
 
 /// \brief The interface of contiguous storage used for the dense tensor.
 /// It should be used in conjunction with the intrusive pointer. We prohibit
@@ -44,7 +43,7 @@ class Storage : public intrusive_ref_counter<Storage> {
   void* data() const noexcept { return data_.operator->(); }
 
   virtual size_t size() const = 0;
-  virtual const platform::Place& place() const = 0;
+  virtual const paddle::platform::Place& place() const = 0;
   virtual bool OwnsMemory() const = 0;
   virtual void Realloc(size_t n) = 0;
 
@@ -63,7 +62,9 @@ class TensorStorage : public Storage {
   void Realloc(size_t size) override;
 
   size_t size() const noexcept override { return size_; }
-  const platform::Place& place() const override { return data_.place(); }
+  const paddle::platform::Place& place() const override {
+    return data_.place();
+  }
   bool OwnsMemory() const noexcept override { return true; }
   const std::shared_ptr<Allocator>& allocator() const noexcept {
     return alloc_;
@@ -74,5 +75,4 @@ class TensorStorage : public Storage {
   int64_t size_{0};
 };
 
-}  // namespace tcmpt
-}  // namespace paddle
+}  // namespace pten
diff --git a/paddle/tcmpt/core/tensor_base.cc b/paddle/pten/core/tensor_base.cc
similarity index 81%
rename from paddle/tcmpt/core/tensor_base.cc
rename to paddle/pten/core/tensor_base.cc
index 05dba1206075d..f9169674a4bbe 100644
--- a/paddle/tcmpt/core/tensor_base.cc
+++ b/paddle/pten/core/tensor_base.cc
@@ -12,9 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/tcmpt/core/tensor_base.h"
-#include "paddle/tcmpt/core/utils/type_registry.h"
+#include "paddle/pten/core/tensor_base.h"
+#include "paddle/pten/core/utils/type_registry.h"
 
-namespace paddle {
-namespace tcmpt {}
-}
+namespace pten {}
diff --git a/paddle/tcmpt/core/tensor_base.h b/paddle/pten/core/tensor_base.h
similarity index 81%
rename from paddle/tcmpt/core/tensor_base.h
rename to paddle/pten/core/tensor_base.h
index 240808e3cc492..92b1ebaca4f1c 100644
--- a/paddle/tcmpt/core/tensor_base.h
+++ b/paddle/pten/core/tensor_base.h
@@ -16,20 +16,19 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/platform/place.h"
-#include "paddle/tcmpt/common/data_type.h"
-#include "paddle/tcmpt/common/layout.h"
-#include "paddle/tcmpt/core/storage.h"
-#include "paddle/tcmpt/core/utils/type_registry.h"
+#include "paddle/pten/common/data_type.h"
+#include "paddle/pten/common/layout.h"
+#include "paddle/pten/core/storage.h"
+#include "paddle/pten/core/utils/type_registry.h"
 
-#include "paddle/tcmpt/core/backend.h"
+#include "paddle/pten/core/backend.h"
 
-namespace paddle {
-namespace tcmpt {
+namespace pten {
 
 class TensorBase {
  public:
-  using DataType = experimental::DataType;
-  using DataLayout = experimental::DataLayout;
+  using DataType = paddle::experimental::DataType;
+  using DataLayout = paddle::experimental::DataLayout;
 
   virtual ~TensorBase() = default;
 
@@ -51,7 +50,7 @@ class TensorBase {
 
   /// \brief Returns the data place of the tensor.
   /// \return The data place of the tensor.
-  virtual const platform::Place& place() const = 0;
+  virtual const paddle::platform::Place& place() const = 0;
 
   /// \brief Test whether the metadata is valid.
   /// \return Whether the metadata is valid.
@@ -61,7 +60,7 @@ class TensorBase {
   /// return Whether the storage is allocated.
   virtual bool initialized() const = 0;
 
-  virtual pt::Backend backend() const = 0;
+  virtual pten::Backend backend() const = 0;
 
   /// \brief Return the type information of the derived class to support
   /// safely downcast in non-rtti environment.
@@ -74,5 +73,4 @@ class TensorBase {
   TypeInfo<TensorBase> type_info_{TypeInfo<TensorBase>::kUnknownType};
 };
 
-}  // namespace tcmpt
-}  // namespace paddle
+}  // namespace pten
diff --git a/paddle/tcmpt/core/tensor_meta.h b/paddle/pten/core/tensor_meta.h
similarity index 96%
rename from paddle/tcmpt/core/tensor_meta.h
rename to paddle/pten/core/tensor_meta.h
index 3cc557e05b4c1..c305ed2a850ee 100644
--- a/paddle/tcmpt/core/tensor_meta.h
+++ b/paddle/pten/core/tensor_meta.h
@@ -16,9 +16,9 @@ limitations under the License. */
 
 #include <vector>
 
-#include "paddle/tcmpt/common/data_type.h"
-#include "paddle/tcmpt/common/layout.h"
-#include "paddle/tcmpt/core/backend.h"
+#include "paddle/pten/common/data_type.h"
+#include "paddle/pten/common/layout.h"
+#include "paddle/pten/core/backend.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/ddim.h"
@@ -26,7 +26,7 @@ limitations under the License. */
 // used on CUDA device? Can we use small_vector here?
 // #include "paddle/fluid/framework/mixed_vector.h"
 
-namespace pt {
+namespace pten {
 
 using DataType = paddle::experimental::DataType;
 using DataLayout = paddle::experimental::DataLayout;
@@ -144,4 +144,4 @@ struct TensorMeta {
   LoD lod;
 };
 
-}  // namespace pt
+}  // namespace pten
diff --git a/paddle/tcmpt/core/tensor_status.h b/paddle/pten/core/tensor_status.h
similarity index 92%
rename from paddle/tcmpt/core/tensor_status.h
rename to paddle/pten/core/tensor_status.h
index 1eb56397414b5..2abc8ff1b1b92 100644
--- a/paddle/tcmpt/core/tensor_status.h
+++ b/paddle/pten/core/tensor_status.h
@@ -14,11 +14,11 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/tcmpt/common/data_type.h"
-#include "paddle/tcmpt/common/layout.h"
-#include "paddle/tcmpt/core/backend.h"
+#include "paddle/pten/common/data_type.h"
+#include "paddle/pten/common/layout.h"
+#include "paddle/pten/core/backend.h"
 
-namespace pt {
+namespace pten {
 
 class TensorInplaceVersion {
  public:
@@ -61,4 +61,4 @@ struct TensorStatus {
   bool is_scalar{false};
 };
 
-}  // namespace pt
+}  // namespace pten
diff --git a/paddle/tcmpt/core/utils/CMakeLists.txt b/paddle/pten/core/utils/CMakeLists.txt
similarity index 100%
rename from paddle/tcmpt/core/utils/CMakeLists.txt
rename to paddle/pten/core/utils/CMakeLists.txt
diff --git a/paddle/tcmpt/core/utils/intrusive_ptr.h b/paddle/pten/core/utils/intrusive_ptr.h
similarity index 95%
rename from paddle/tcmpt/core/utils/intrusive_ptr.h
rename to paddle/pten/core/utils/intrusive_ptr.h
index f368d05cb47db..f0e94fadac973 100644
--- a/paddle/tcmpt/core/utils/intrusive_ptr.h
+++ b/paddle/pten/core/utils/intrusive_ptr.h
@@ -18,8 +18,7 @@ limitations under the License. */
 #include "glog/logging.h"
 #include "paddle/fluid/platform/enforce.h"
 
-namespace paddle {
-namespace tcmpt {
+namespace pten {
 
 template <typename T>
 class intrusive_ptr {
@@ -58,7 +57,7 @@ class intrusive_ptr {
   T& operator*() const {
     PADDLE_ENFORCE_NOT_NULL(
         px,
-        platform::errors::PreconditionNotMet(
+        paddle::platform::errors::PreconditionNotMet(
             "The pointer must be non-null before the dereference operation."));
     return *px;
   }
@@ -66,7 +65,7 @@ class intrusive_ptr {
   T* operator->() const {
     PADDLE_ENFORCE_NOT_NULL(
         px,
-        platform::errors::PreconditionNotMet(
+        paddle::platform::errors::PreconditionNotMet(
             "The pointer must be non-null before the dereference operation."));
     return px;
   }
@@ -156,5 +155,4 @@ inline intrusive_ptr<T> copy_intrusive(const intrusive_ptr<T>& rhs) {
   return intrusive_ptr<T>(rhs.get(), true);
 }
 
-}  // namespace tcmpt
-}  // namespace paddle
+}  // namespace pten
diff --git a/paddle/tcmpt/core/utils/intrusive_ref_counter.h b/paddle/pten/core/utils/intrusive_ref_counter.h
similarity index 96%
rename from paddle/tcmpt/core/utils/intrusive_ref_counter.h
rename to paddle/pten/core/utils/intrusive_ref_counter.h
index 1c93bede71df1..8e18c82197eb6 100644
--- a/paddle/tcmpt/core/utils/intrusive_ref_counter.h
+++ b/paddle/pten/core/utils/intrusive_ref_counter.h
@@ -16,8 +16,7 @@ limitations under the License. */
 
 #include <atomic>
 
-namespace paddle {
-namespace tcmpt {
+namespace pten {
 
 template <typename DerivedT>
 class intrusive_ref_counter;
@@ -62,5 +61,4 @@ inline void intrusive_ptr_release(
   }
 }
 
-}  // namespace tcmpt
-}  // namespace paddle
+}  // namespace pten
diff --git a/paddle/tcmpt/core/utils/type_info.h b/paddle/pten/core/utils/type_info.h
similarity index 95%
rename from paddle/tcmpt/core/utils/type_info.h
rename to paddle/pten/core/utils/type_info.h
index ba5bc641b94b2..4e4084a4c785b 100644
--- a/paddle/tcmpt/core/utils/type_info.h
+++ b/paddle/pten/core/utils/type_info.h
@@ -16,8 +16,7 @@ limitations under the License. */
 
 #include <string>
 
-namespace paddle {
-namespace tcmpt {
+namespace pten {
 
 template <typename BaseT>
 class TypeRegistry;
@@ -57,5 +56,4 @@ template <typename BaseT, typename DerivedT>
 const TypeInfo<BaseT> TypeInfoTraits<BaseT, DerivedT>::kType =
     RegisterStaticType<BaseT>(DerivedT::name());
 
-}  // namespace tcmpt
-}  // namespace paddle
+}  // namespace pten
diff --git a/paddle/tcmpt/core/utils/type_registry.h b/paddle/pten/core/utils/type_registry.h
similarity index 94%
rename from paddle/tcmpt/core/utils/type_registry.h
rename to paddle/pten/core/utils/type_registry.h
index 52b699a0dd413..82eb9ae52bd7e 100644
--- a/paddle/tcmpt/core/utils/type_registry.h
+++ b/paddle/pten/core/utils/type_registry.h
@@ -18,10 +18,9 @@ limitations under the License. */
 #include <mutex>
 #include <string>
 
-#include "paddle/tcmpt/core/utils/type_info.h"
+#include "paddle/pten/core/utils/type_info.h"
 
-namespace paddle {
-namespace tcmpt {
+namespace pten {
 
 template <typename BaseT>
 class TypeRegistry {
@@ -82,5 +81,4 @@ template <typename BaseT>
 const TypeInfo<BaseT> TypeInfo<BaseT>::kUnknownType =
     RegisterStaticType<BaseT>("Unknown");
 
-}  // namespace tcmpt
-}  // namespace paddle
+}  // namespace pten
diff --git a/paddle/pten/hapi/CMakeLists.txt b/paddle/pten/hapi/CMakeLists.txt
new file mode 100644
index 0000000000000..8a33de85bddd3
--- /dev/null
+++ b/paddle/pten/hapi/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_subdirectory(lib)
+
+cc_library(pten_hapi SRCS all.cc DEPS math_api linalg_api creation_api)
diff --git a/paddle/tcmpt/hapi/all.cc b/paddle/pten/hapi/all.cc
similarity index 95%
rename from paddle/tcmpt/hapi/all.cc
rename to paddle/pten/hapi/all.cc
index f43cdb9f78b53..4ea6fabeecf2e 100644
--- a/paddle/tcmpt/hapi/all.cc
+++ b/paddle/pten/hapi/all.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/tcmpt/hapi/all.h"
+#include "paddle/pten/hapi/all.h"
 
 namespace paddle {
 namespace experimental {}  // namespace experimental
diff --git a/paddle/tcmpt/hapi/all.h b/paddle/pten/hapi/all.h
similarity index 77%
rename from paddle/tcmpt/hapi/all.h
rename to paddle/pten/hapi/all.h
index bd1c51fc49ed3..de2e14db421f6 100644
--- a/paddle/tcmpt/hapi/all.h
+++ b/paddle/pten/hapi/all.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 
 // user apis
-#include "paddle/tcmpt/hapi/include/creation.h"
-#include "paddle/tcmpt/hapi/include/linalg.h"
-#include "paddle/tcmpt/hapi/include/math.h"
-#include "paddle/tcmpt/hapi/include/tensor.h"
+#include "paddle/pten/hapi/include/creation.h"
+#include "paddle/pten/hapi/include/linalg.h"
+#include "paddle/pten/hapi/include/math.h"
+#include "paddle/pten/hapi/include/tensor.h"
diff --git a/paddle/tcmpt/hapi/include/creation.h b/paddle/pten/hapi/include/creation.h
similarity index 56%
rename from paddle/tcmpt/hapi/include/creation.h
rename to paddle/pten/hapi/include/creation.h
index d2d68e3bb7e61..3929d8d026e08 100644
--- a/paddle/tcmpt/hapi/include/creation.h
+++ b/paddle/pten/hapi/include/creation.h
@@ -14,20 +14,25 @@
 
 #pragma once
 
-#include "paddle/tcmpt/common/data_type.h"
-#include "paddle/tcmpt/core/scalar.h"
-#include "paddle/tcmpt/hapi/include/tensor.h"
+#include "paddle/pten/common/data_type.h"
+#include "paddle/pten/core/scalar.h"
+#include "paddle/pten/hapi/include/tensor.h"
 
 namespace paddle {
 namespace experimental {
 
 Tensor full_like(const Tensor& x,
-                 const pt::Scalar& value,
-                 pt::DataType dtype = pt::DataType::kUndef);
+                 const pten::Scalar& value,
+                 paddle::experimental::DataType dtype =
+                     paddle::experimental::DataType::kUndef);
 
-Tensor ones_like(const Tensor& x, pt::DataType dtype = pt::DataType::kUndef);
+Tensor ones_like(const Tensor& x,
+                 paddle::experimental::DataType dtype =
+                     paddle::experimental::DataType::kUndef);
 
-Tensor zeros_like(const Tensor& x, pt::DataType dtype = pt::DataType::kUndef);
+Tensor zeros_like(const Tensor& x,
+                  paddle::experimental::DataType dtype =
+                      paddle::experimental::DataType::kUndef);
 
 }  // namespace experimental
 }  // namespace paddle
diff --git a/paddle/tcmpt/hapi/include/linalg.h b/paddle/pten/hapi/include/linalg.h
similarity index 95%
rename from paddle/tcmpt/hapi/include/linalg.h
rename to paddle/pten/hapi/include/linalg.h
index df709b6a3c50f..6e78b50af11c3 100644
--- a/paddle/tcmpt/hapi/include/linalg.h
+++ b/paddle/pten/hapi/include/linalg.h
@@ -14,7 +14,7 @@
 
 #pragma once
 
-#include "paddle/tcmpt/hapi/include/tensor.h"
+#include "paddle/pten/hapi/include/tensor.h"
 
 namespace paddle {
 namespace experimental {
diff --git a/paddle/tcmpt/hapi/include/manipulation.h b/paddle/pten/hapi/include/manipulation.h
similarity index 94%
rename from paddle/tcmpt/hapi/include/manipulation.h
rename to paddle/pten/hapi/include/manipulation.h
index 35695f4f6d8b6..4622032f5ad54 100644
--- a/paddle/tcmpt/hapi/include/manipulation.h
+++ b/paddle/pten/hapi/include/manipulation.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/tcmpt/hapi/include/tensor.h"
+#include "paddle/pten/hapi/include/tensor.h"
 
 namespace paddle {
 namespace experimental {
diff --git a/paddle/tcmpt/hapi/include/math.h b/paddle/pten/hapi/include/math.h
similarity index 94%
rename from paddle/tcmpt/hapi/include/math.h
rename to paddle/pten/hapi/include/math.h
index 9245d1033c791..0b3dbab70e86f 100644
--- a/paddle/tcmpt/hapi/include/math.h
+++ b/paddle/pten/hapi/include/math.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/tcmpt/hapi/include/tensor.h"
+#include "paddle/pten/hapi/include/tensor.h"
 
 namespace paddle {
 namespace experimental {
diff --git a/paddle/tcmpt/hapi/include/tensor.h b/paddle/pten/hapi/include/tensor.h
similarity index 91%
rename from paddle/tcmpt/hapi/include/tensor.h
rename to paddle/pten/hapi/include/tensor.h
index ccca911cf8c86..1982483fe4119 100644
--- a/paddle/tcmpt/hapi/include/tensor.h
+++ b/paddle/pten/hapi/include/tensor.h
@@ -18,14 +18,14 @@ limitations under the License. */
 #include <memory>
 #include <utility>
 
-#include "paddle/tcmpt/core/tensor_base.h"
+#include "paddle/pten/core/tensor_base.h"
 
 /**
  * [ Why still include the fluid headers? ]
  *
  * We hope to organize the basic implementation of Tensor and the logic related
  * to Tensor computation into an independent library, which we call
- * [Tensor Compute Library, tcmpt], so we extract or rewrite the original
+ * [Tensor Compute Library, pten], so we extract or rewrite the original
  * Kernels.
  *
  * In the future, the training library, inference library and custom operators
@@ -54,7 +54,7 @@ class AutogradMetaInterface {
 
 /**
  * Tensor is the API description of the basic data structure in the
- * [ Paddle "Tensor CoMPuTe (tcmpt)" Library ].
+ * [ Paddle "Tensor CoMPuTe (pten)" Library ].
  *
  * It is not limited to a simple n-dimensional array.
  * It contains a smart pointer to `TensorImpl`. The data description contained
@@ -91,7 +91,7 @@ class Tensor final {
    * @param {shared_ptr<TensorBase>} tensor_impl
    * @return {Tensor}
    */
-  explicit Tensor(std::shared_ptr<tcmpt::TensorBase> tensor_impl)
+  explicit Tensor(std::shared_ptr<pten::TensorBase> tensor_impl)
       : impl_(std::move(tensor_impl)) {
     if (impl_.get() == nullptr) {
       throw std::runtime_error("TensorImpl with nullptr is not supported");
@@ -118,14 +118,14 @@ class Tensor final {
    * @param None
    * @return {DataType}
    */
-  pt::DataType type() const { return impl_->data_type(); }
+  paddle::experimental::DataType type() const { return impl_->data_type(); }
 
   /**
    * @description: Return the layout of current Tensor.
    * @param None
    * @return {DataLayout}
    */
-  pt::DataLayout layout() const { return impl_->layout(); }
+  paddle::experimental::DataLayout layout() const { return impl_->layout(); }
 
   /* Part 3: Device and Backend methods */
   /**
@@ -138,8 +138,8 @@ class Tensor final {
   /**
    * Backend judgment APIs, shield the concept of Backend.
    */
-  bool is_cpu() const { return impl_->backend() == pt::Backend::kCPU; }
-  bool is_cuda() const { return impl_->backend() == pt::Backend::kCUDA; }
+  bool is_cpu() const { return impl_->backend() == pten::Backend::kCPU; }
+  bool is_cuda() const { return impl_->backend() == pten::Backend::kCUDA; }
   bool is_hip() const;
   bool is_xpu() const;
   bool is_npu() const;
@@ -165,16 +165,14 @@ class Tensor final {
    * @param None
    * @return {std::shared_ptr<TensorBase>}
    */
-  std::shared_ptr<tcmpt::TensorBase> impl() const { return impl_; }
+  std::shared_ptr<pten::TensorBase> impl() const { return impl_; }
 
   /**
    * @description: Set the implemention of current Tensor.
    * @param {std::shared_ptr<TensorBase>}
    * @return None
    */
-  void set_impl(const std::shared_ptr<tcmpt::TensorBase>& impl) {
-    impl_ = impl;
-  }
+  void set_impl(const std::shared_ptr<pten::TensorBase>& impl) { impl_ = impl; }
 
   // TODO(chenweihang): Whether API Tensor need `data` and `mutable_data`?
 
@@ -245,7 +243,7 @@ class Tensor final {
    * heterogeneous Tensor implementation, so that the API level can be unified
    * to one `Tensor`.
    */
-  std::shared_ptr<tcmpt::TensorBase> impl_;
+  std::shared_ptr<pten::TensorBase> impl_;
 
   /**
    * [ Why need abstract AutogradMetaInterface here? ]
diff --git a/paddle/pten/hapi/lib/CMakeLists.txt b/paddle/pten/hapi/lib/CMakeLists.txt
new file mode 100644
index 0000000000000..54cabb7e69baa
--- /dev/null
+++ b/paddle/pten/hapi/lib/CMakeLists.txt
@@ -0,0 +1,4 @@
+cc_library(math_api SRCS math.cc DEPS pten)
+cc_library(linalg_api SRCS linalg.cc DEPS pten)
+cc_library(creation_api SRCS creation.cc DEPS pten)
+cc_library(manipulation_api SRCS manipulation.cc DEPS pten)
diff --git a/paddle/tcmpt/hapi/lib/creation.cc b/paddle/pten/hapi/lib/creation.cc
similarity index 65%
rename from paddle/tcmpt/hapi/lib/creation.cc
rename to paddle/pten/hapi/lib/creation.cc
index 057855a3dba4c..3004f935f4833 100644
--- a/paddle/tcmpt/hapi/lib/creation.cc
+++ b/paddle/pten/hapi/lib/creation.cc
@@ -12,36 +12,38 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/tcmpt/hapi/include/creation.h"
+#include "paddle/pten/hapi/include/creation.h"
 
 #include <memory>
 
 #include "glog/logging.h"
 
-#include "paddle/tcmpt/api/include/core.h"
-#include "paddle/tcmpt/api/include/infershape.h"
-#include "paddle/tcmpt/hapi/lib/kernel_generate.h"
+#include "paddle/pten/api/include/core.h"
+#include "paddle/pten/api/include/infershape.h"
+#include "paddle/pten/hapi/lib/kernel_generate.h"
 
 namespace paddle {
 namespace experimental {
 
-Tensor full_like(const Tensor& x, const pt::Scalar& value, pt::DataType dtype) {
+Tensor full_like(const Tensor& x,
+                 const pten::Scalar& value,
+                 paddle::experimental::DataType dtype) {
   // 1. Get kernel signature and kernel
   auto kernel_signature = ParseKernelNameAndKeyByArgs("fill_any_like", x);
   VLOG(1) << kernel_signature.first;
   VLOG(1) << kernel_signature.second;
-  VLOG(1) << pt::KernelFactory::Instance();
+  VLOG(1) << pten::KernelFactory::Instance();
 
-  auto kernel = pt::KernelFactory::Instance().SelectKernelOrThrowError(
+  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
       kernel_signature.first, kernel_signature.second);
   VLOG(1) << kernel;
 
   // 2. Get Device Context
   auto* dev_ctx = GetDeviceContextByBackend(kernel_signature.second.backend());
-  auto kernel_context = pt::KernelContext(*dev_ctx);
+  auto kernel_context = pten::KernelContext(*dev_ctx);
 
   // 3. Auto data transform
-  auto dense_x = std::dynamic_pointer_cast<pt::DenseTensor>(x.impl());
+  auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
   kernel_context.EmplaceBackInput(dense_x);
 
   kernel_context.EmplaceBackAttr(value);
@@ -52,11 +54,11 @@ Tensor full_like(const Tensor& x, const pt::Scalar& value, pt::DataType dtype) {
   // 5. Prepare outputs
   Tensor out;
   // InferDataType
-  if (dtype != pt::DataType::kUndef) {
+  if (dtype != paddle::experimental::DataType::kUndef) {
     out_meta.type = dtype;
   }
   auto dense_out =
-      std::make_shared<pt::DenseTensor>(out_meta, pt::TensorStatus());
+      std::make_shared<pten::DenseTensor>(out_meta, pten::TensorStatus());
   kernel_context.EmplaceBackOutput(dense_out);
   out.set_impl(dense_out);
 
@@ -66,11 +68,11 @@ Tensor full_like(const Tensor& x, const pt::Scalar& value, pt::DataType dtype) {
   return out;
 }
 
-Tensor ones_like(const Tensor& x, pt::DataType dtype) {
+Tensor ones_like(const Tensor& x, paddle::experimental::DataType dtype) {
   return full_like(x, 1, dtype);
 }
 
-Tensor zeros_like(const Tensor& x, pt::DataType dtype) {
+Tensor zeros_like(const Tensor& x, paddle::experimental::DataType dtype) {
   return full_like(x, 0, dtype);
 }
 
diff --git a/paddle/tcmpt/hapi/lib/kernel_generate.h b/paddle/pten/hapi/lib/kernel_generate.h
similarity index 86%
rename from paddle/tcmpt/hapi/lib/kernel_generate.h
rename to paddle/pten/hapi/lib/kernel_generate.h
index 1b5f9d7ae02ac..82214c96fb5c7 100644
--- a/paddle/tcmpt/hapi/lib/kernel_generate.h
+++ b/paddle/pten/hapi/lib/kernel_generate.h
@@ -17,10 +17,10 @@ limitations under the License. */
 #include <string>
 #include <utility>
 
-#include "paddle/tcmpt/hapi/include/tensor.h"
+#include "paddle/pten/hapi/include/tensor.h"
 
 // TODO(chenweihang): split KernelName, Key, Kernel, Factory into diff files
-#include "paddle/tcmpt/core/kernel_factory.h"
+#include "paddle/pten/core/kernel_factory.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
@@ -61,9 +61,9 @@ struct ArgsIterator {
 
 struct KernelNameAndKeyParser : ArgsIterator<KernelNameAndKeyParser> {
   std::string kernel_name;
-  pt::Backend backend;
-  pt::DataLayout layout;
-  pt::DataType dtype;
+  pten::Backend backend;
+  paddle::experimental::DataLayout layout;
+  paddle::experimental::DataType dtype;
 
   explicit KernelNameAndKeyParser(const std::string& name)
       : kernel_name(name) {}
@@ -72,9 +72,9 @@ struct KernelNameAndKeyParser : ArgsIterator<KernelNameAndKeyParser> {
   // TODO(chenweihang): deal with multiple diff input Tensors
   void operator()(const Tensor& x) {
     if (x.is_cpu()) {
-      backend = pt::Backend::kCPU;
+      backend = pten::Backend::kCPU;
     } else if (x.is_cuda()) {
-      backend = pt::Backend::kCUDA;
+      backend = pten::Backend::kCUDA;
     } else {
       throw std::runtime_error("Unsupported backend when parser args.");
     }
@@ -97,20 +97,20 @@ struct KernelNameAndKeyParser : ArgsIterator<KernelNameAndKeyParser> {
 // suffix on the basis of the function name, or the input contains HostTensor,
 // and the `host` suffix should be added on the basis of the function name.
 template <typename... Args>
-std::pair<pt::KernelName, pt::KernelKey> ParseKernelNameAndKeyByArgs(
+std::pair<pten::KernelName, pten::KernelKey> ParseKernelNameAndKeyByArgs(
     const std::string& fn_name, const Args&... args) {
   auto parser = detail::KernelNameAndKeyParser(fn_name);
   parser(args...);
   // TODO(chenweihang): polish design here
-  pt::KernelName kernel_name(parser.kernel_name);
-  pt::KernelKey kernel_key(parser.backend, parser.layout, parser.dtype);
+  pten::KernelName kernel_name(parser.kernel_name);
+  pten::KernelKey kernel_key(parser.backend, parser.layout, parser.dtype);
   return std::make_pair(kernel_name, kernel_key);
 }
 
 paddle::platform::DeviceContext* GetDeviceContextByBackend(
-    pt::Backend backend) {
+    pten::Backend backend) {
   auto& pool = paddle::platform::DeviceContextPool::Instance();
-  auto place = pt::TransToFluidPlace(backend);
+  auto place = pten::TransToFluidPlace(backend);
   // switch (backend) {
   //   case Backend::kCPU:
   //     return pool.GetByPlace(paddle::platform::CPUPlace());
diff --git a/paddle/tcmpt/hapi/lib/linalg.cc b/paddle/pten/hapi/lib/linalg.cc
similarity index 69%
rename from paddle/tcmpt/hapi/lib/linalg.cc
rename to paddle/pten/hapi/lib/linalg.cc
index dc11bae3e37b7..c8198052f43b0 100644
--- a/paddle/tcmpt/hapi/lib/linalg.cc
+++ b/paddle/pten/hapi/lib/linalg.cc
@@ -12,19 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/tcmpt/hapi/include/linalg.h"
+#include "paddle/pten/hapi/include/linalg.h"
 
 #include <memory>
 
 #include "glog/logging.h"
 
-#include "paddle/tcmpt/api/include/core.h"
-#include "paddle/tcmpt/api/include/infershape.h"
-#include "paddle/tcmpt/core/convert_utils.h"
-#include "paddle/tcmpt/core/dense_tensor.h"
-#include "paddle/tcmpt/core/kernel_context.h"
-#include "paddle/tcmpt/hapi/lib/kernel_generate.h"
-#include "paddle/tcmpt/infershape/binary.h"
+#include "paddle/pten/api/include/core.h"
+#include "paddle/pten/api/include/infershape.h"
+#include "paddle/pten/core/convert_utils.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_context.h"
+#include "paddle/pten/hapi/lib/kernel_generate.h"
+#include "paddle/pten/infershape/binary.h"
 
 namespace paddle {
 namespace experimental {
@@ -34,20 +34,20 @@ Tensor dot(const Tensor& x, const Tensor& y) {
   auto kernel_signature = ParseKernelNameAndKeyByArgs("dot", x);
   VLOG(1) << kernel_signature.first;
   VLOG(1) << kernel_signature.second;
-  VLOG(1) << pt::KernelFactory::Instance();
+  VLOG(1) << pten::KernelFactory::Instance();
 
-  auto kernel = pt::KernelFactory::Instance().SelectKernelOrThrowError(
+  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
       kernel_signature.first, kernel_signature.second);
   VLOG(1) << kernel;
 
   // 2. Get Device Context
   auto* dev_ctx = GetDeviceContextByBackend(kernel_signature.second.backend());
-  auto kernel_context = pt::KernelContext(*dev_ctx);
+  auto kernel_context = pten::KernelContext(*dev_ctx);
 
   // 3. Auto data transform
-  auto dense_x = std::dynamic_pointer_cast<pt::DenseTensor>(x.impl());
+  auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
   kernel_context.EmplaceBackInput(dense_x);
-  auto dense_y = std::dynamic_pointer_cast<pt::DenseTensor>(y.impl());
+  auto dense_y = std::dynamic_pointer_cast<pten::DenseTensor>(y.impl());
   kernel_context.EmplaceBackInput(dense_y);
   // TODO(chenweihang): add transform impl
 
@@ -59,7 +59,7 @@ Tensor dot(const Tensor& x, const Tensor& y) {
   Tensor out;
   // TODO(chenweihang): deal with multiple outputs
   auto dense_out =
-      std::make_shared<pt::DenseTensor>(out_meta, pt::TensorStatus());
+      std::make_shared<pten::DenseTensor>(out_meta, pten::TensorStatus());
   kernel_context.EmplaceBackOutput(dense_out);
   out.set_impl(dense_out);
 
diff --git a/paddle/tcmpt/hapi/lib/manipulation.cc b/paddle/pten/hapi/lib/manipulation.cc
similarity index 77%
rename from paddle/tcmpt/hapi/lib/manipulation.cc
rename to paddle/pten/hapi/lib/manipulation.cc
index c8448eecfe2de..8a64d0e9f4a45 100644
--- a/paddle/tcmpt/hapi/lib/manipulation.cc
+++ b/paddle/pten/hapi/lib/manipulation.cc
@@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/tcmpt/hapi/include/manipulation.h"
+#include "paddle/pten/hapi/include/manipulation.h"
 
 #include <memory>
 
 #include "glog/logging.h"
-#include "paddle/tcmpt/api/include/core.h"
-#include "paddle/tcmpt/hapi/lib/kernel_generate.h"
-#include "paddle/tcmpt/infershape/unary.h"
+#include "paddle/pten/api/include/core.h"
+#include "paddle/pten/hapi/lib/kernel_generate.h"
+#include "paddle/pten/infershape/unary.h"
 
 namespace paddle {
 namespace experimental {
@@ -30,18 +30,18 @@ Tensor flatten(const Tensor& x, int start_axis, int stop_axis) {
       ParseKernelNameAndKeyByArgs("flatten_contiguous_range", x);
   VLOG(1) << kernel_signature.first;
   VLOG(1) << kernel_signature.second;
-  VLOG(1) << pt::KernelFactory::Instance();
+  VLOG(1) << pten::KernelFactory::Instance();
 
-  auto kernel = pt::KernelFactory::Instance().SelectKernelOrThrowError(
+  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
       kernel_signature.first, kernel_signature.second);
   VLOG(1) << kernel;
 
   // 2. Get Device Context
   auto* dev_ctx = GetDeviceContextByBackend(kernel_signature.second.backend());
-  auto kernel_context = pt::KernelContext(*dev_ctx);
+  auto kernel_context = pten::KernelContext(*dev_ctx);
 
   // 3. Auto data transform
-  auto dense_x = std::dynamic_pointer_cast<pt::DenseTensor>(x.impl());
+  auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
   kernel_context.EmplaceBackInput(dense_x);
   kernel_context.EmplaceBackAttr(start_axis);
   kernel_context.EmplaceBackAttr(stop_axis);
@@ -54,7 +54,7 @@ Tensor flatten(const Tensor& x, int start_axis, int stop_axis) {
   Tensor out;
   // TODO(chenweihang): deal with multiple outputs
   auto dense_out =
-      std::make_shared<pt::DenseTensor>(out_meta, pt::TensorStatus());
+      std::make_shared<pten::DenseTensor>(out_meta, pten::TensorStatus());
   kernel_context.EmplaceBackOutput(dense_out);
   out.set_impl(dense_out);
 
diff --git a/paddle/tcmpt/hapi/lib/math.cc b/paddle/pten/hapi/lib/math.cc
similarity index 75%
rename from paddle/tcmpt/hapi/lib/math.cc
rename to paddle/pten/hapi/lib/math.cc
index 531e85298758c..764511702f0ea 100644
--- a/paddle/tcmpt/hapi/lib/math.cc
+++ b/paddle/pten/hapi/lib/math.cc
@@ -12,16 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/tcmpt/hapi/include/math.h"
+#include "paddle/pten/hapi/include/math.h"
 
 #include <memory>
 
 #include "glog/logging.h"
 
-#include "paddle/tcmpt/api/include/core.h"
-#include "paddle/tcmpt/api/include/infershape.h"
-#include "paddle/tcmpt/hapi/lib/kernel_generate.h"
-#include "paddle/tcmpt/infershape/unary.h"
+#include "paddle/pten/api/include/core.h"
+#include "paddle/pten/api/include/infershape.h"
+#include "paddle/pten/hapi/lib/kernel_generate.h"
+#include "paddle/pten/infershape/unary.h"
 
 namespace paddle {
 namespace experimental {
@@ -31,18 +31,18 @@ Tensor mean(const Tensor& x) {
   auto kernel_signature = ParseKernelNameAndKeyByArgs("mean", x);
   VLOG(1) << kernel_signature.first;
   VLOG(1) << kernel_signature.second;
-  VLOG(1) << pt::KernelFactory::Instance();
+  VLOG(1) << pten::KernelFactory::Instance();
 
-  auto kernel = pt::KernelFactory::Instance().SelectKernelOrThrowError(
+  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
       kernel_signature.first, kernel_signature.second);
   VLOG(1) << kernel;
 
   // 2. Get Device Context
   auto* dev_ctx = GetDeviceContextByBackend(kernel_signature.second.backend());
-  auto kernel_context = pt::KernelContext(*dev_ctx);
+  auto kernel_context = pten::KernelContext(*dev_ctx);
 
   // 3. Auto data transform
-  auto dense_x = std::dynamic_pointer_cast<pt::DenseTensor>(x.impl());
+  auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
   kernel_context.EmplaceBackInput(dense_x);
   // TODO(chenweihang): add transform impl
 
@@ -54,7 +54,7 @@ Tensor mean(const Tensor& x) {
   Tensor out;
   // TODO(chenweihang): deal with multiple outputs
   auto dense_out =
-      std::make_shared<pt::DenseTensor>(out_meta, pt::TensorStatus());
+      std::make_shared<pten::DenseTensor>(out_meta, pten::TensorStatus());
   kernel_context.EmplaceBackOutput(dense_out);
   out.set_impl(dense_out);
 
diff --git a/paddle/tcmpt/infershape/CMakeLists.txt b/paddle/pten/infershape/CMakeLists.txt
similarity index 100%
rename from paddle/tcmpt/infershape/CMakeLists.txt
rename to paddle/pten/infershape/CMakeLists.txt
diff --git a/paddle/tcmpt/infershape/binary.cc b/paddle/pten/infershape/binary.cc
similarity index 96%
rename from paddle/tcmpt/infershape/binary.cc
rename to paddle/pten/infershape/binary.cc
index 936af8767ca62..7d224835cc05a 100644
--- a/paddle/tcmpt/infershape/binary.cc
+++ b/paddle/pten/infershape/binary.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 // See Note [ Why still include the fluid headers? ]
-#include "paddle/tcmpt/infershape/binary.h"
+#include "paddle/pten/infershape/binary.h"
 
-namespace pt {
+namespace pten {
 
 TensorMeta DotInferShape(const TensorMeta& x_meta, const TensorMeta& y_meta) {
   auto x_dims = x_meta.dims;
@@ -59,4 +59,4 @@ TensorMeta DotInferShape(const TensorMeta& x_meta, const TensorMeta& y_meta) {
   return return_meta;
 }
 
-}  // namespace pt
+}  // namespace pten
diff --git a/paddle/tcmpt/infershape/binary.h b/paddle/pten/infershape/binary.h
similarity index 94%
rename from paddle/tcmpt/infershape/binary.h
rename to paddle/pten/infershape/binary.h
index 816963a277ade..8e44b520e0a9f 100644
--- a/paddle/tcmpt/infershape/binary.h
+++ b/paddle/pten/infershape/binary.h
@@ -15,9 +15,9 @@ limitations under the License. */
 #pragma once
 
 // See Note [ Why still include the fluid headers? ]
-#include "paddle/tcmpt/core/tensor_meta.h"
+#include "paddle/pten/core/tensor_meta.h"
 
-namespace pt {
+namespace pten {
 
 // Common InferShape Functions for binary operators, The format like:
 //
@@ -32,4 +32,4 @@ namespace pt {
 
 TensorMeta DotInferShape(const TensorMeta& x_meta, const TensorMeta& y_meta);
 
-}  // namespace pt
+}  // namespace pten
diff --git a/paddle/tcmpt/infershape/unary.cc b/paddle/pten/infershape/unary.cc
similarity index 96%
rename from paddle/tcmpt/infershape/unary.cc
rename to paddle/pten/infershape/unary.cc
index 3e4a633fa7a7c..57e74345b7d42 100644
--- a/paddle/tcmpt/infershape/unary.cc
+++ b/paddle/pten/infershape/unary.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 // See Note [ Why still include the fluid headers? ]
-#include "paddle/tcmpt/infershape/unary.h"
+#include "paddle/pten/infershape/unary.h"
 
-namespace pt {
+namespace pten {
 
 TensorMeta UnchangedInferShape(const TensorMeta& x_meta) { return x_meta; }
 
@@ -74,4 +74,4 @@ TensorMeta FlattenInferShape(const TensorMeta& x_meta,
   return return_meta;
 }
 
-}  // namespace pt
+}  // namespace pten
diff --git a/paddle/tcmpt/infershape/unary.h b/paddle/pten/infershape/unary.h
similarity index 94%
rename from paddle/tcmpt/infershape/unary.h
rename to paddle/pten/infershape/unary.h
index b835ec4bcfa72..1d8fac05d0eaa 100644
--- a/paddle/tcmpt/infershape/unary.h
+++ b/paddle/pten/infershape/unary.h
@@ -15,9 +15,9 @@ limitations under the License. */
 #pragma once
 
 // See Note [ Why still include the fluid headers? ]
-#include "paddle/tcmpt/core/tensor_meta.h"
+#include "paddle/pten/core/tensor_meta.h"
 
-namespace pt {
+namespace pten {
 
 // Common InferShape Functions for unary operators, The format like:
 //
@@ -38,4 +38,4 @@ TensorMeta FlattenInferShape(const TensorMeta& x_meta,
                              int start_axis,
                              int stop_axis);
 
-}  // namespace pt
+}  // namespace pten
diff --git a/paddle/tcmpt/kernels/CMakeLists.txt b/paddle/pten/kernels/CMakeLists.txt
similarity index 94%
rename from paddle/tcmpt/kernels/CMakeLists.txt
rename to paddle/pten/kernels/CMakeLists.txt
index 26b5e16d4428d..09f7a1b102436 100644
--- a/paddle/tcmpt/kernels/CMakeLists.txt
+++ b/paddle/pten/kernels/CMakeLists.txt
@@ -1,4 +1,4 @@
-# tcmpt kernels for diff device
+# pten kernels for diff device
 add_subdirectory(cpu)
 if(WITH_GPU OR WITH_ROCM)
   # TODO(chenweihang): if hip can split from cuda impl, we should add hip dir
diff --git a/paddle/tcmpt/kernels/common/eigen/CMakeLists.txt b/paddle/pten/kernels/common/eigen/CMakeLists.txt
similarity index 100%
rename from paddle/tcmpt/kernels/common/eigen/CMakeLists.txt
rename to paddle/pten/kernels/common/eigen/CMakeLists.txt
diff --git a/paddle/tcmpt/kernels/common/eigen/common.h b/paddle/pten/kernels/common/eigen/common.h
similarity index 86%
rename from paddle/tcmpt/kernels/common/eigen/common.h
rename to paddle/pten/kernels/common/eigen/common.h
index 37bed55a7d97a..f3a6f5fb51ff2 100644
--- a/paddle/tcmpt/kernels/common/eigen/common.h
+++ b/paddle/pten/kernels/common/eigen/common.h
@@ -16,10 +16,10 @@ limitations under the License. */
 
 #include <stdint.h>
 
-#include "paddle/tcmpt/core/dense_tensor.h"
+#include "paddle/pten/core/dense_tensor.h"
 #include "unsupported/Eigen/CXX11/Tensor"
 
-namespace pt {
+namespace pten {
 
 // EigenDim converts paddle::platform::DDim into Eigen::DSizes.
 template <int D>
@@ -55,24 +55,24 @@ struct EigenTensor {
   using ConstType =
       Eigen::TensorMap<Eigen::Tensor<const T, D, MajorType, IndexType>>;
 
-  static Type From(pt::DenseTensor& tensor, DDim dims) {  // NOLINT
+  static Type From(pten::DenseTensor& tensor, DDim dims) {  // NOLINT
     // why tensor.data<T>() not work?
     // return Type(const_cast<T*>(reinterpret_cast<const T*>(tensor.data())),
     // EigenDim<D>::From(dims));
     return Type(const_cast<T*>(tensor.data<T>()), EigenDim<D>::From(dims));
   }
 
-  static Type From(pt::DenseTensor& tensor) {  // NOLINT
+  static Type From(pten::DenseTensor& tensor) {  // NOLINT
     return From(tensor, tensor.dims());
   }  // NOLINT
 
-  static ConstType From(const pt::DenseTensor& tensor, DDim dims) {
+  static ConstType From(const pten::DenseTensor& tensor, DDim dims) {
     // return ConstType(reinterpret_cast<const T*>(tensor.data()),
     // EigenDim<D>::From(dims));
     return ConstType(tensor.data<T>(), EigenDim<D>::From(dims));
   }
 
-  static ConstType From(const pt::DenseTensor& tensor) {
+  static ConstType From(const pten::DenseTensor& tensor) {
     return From(tensor, tensor.dims());
   }
 };
@@ -81,8 +81,9 @@ template <typename T,
           int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 struct EigenMatrix : public EigenTensor<T, 2, MajorType, IndexType> {
-  static typename EigenMatrix::Type Reshape(pt::DenseTensor& tensor,  // NOLINT
-                                            int num_col_dims) {
+  static typename EigenMatrix::Type Reshape(
+      pten::DenseTensor& tensor,  // NOLINT
+      int num_col_dims) {
     int rank = tensor.dims().size();
     PADDLE_ENFORCE_EQ((num_col_dims > 0 && num_col_dims < rank),
                       true,
@@ -95,8 +96,8 @@ struct EigenMatrix : public EigenTensor<T, 2, MajorType, IndexType> {
                              flatten_to_2d(tensor.dims(), num_col_dims));
   }
 
-  static typename EigenMatrix::ConstType Reshape(const pt::DenseTensor& tensor,
-                                                 int num_col_dims) {
+  static typename EigenMatrix::ConstType Reshape(
+      const pten::DenseTensor& tensor, int num_col_dims) {
     int rank = tensor.dims().size();
     PADDLE_ENFORCE_EQ((num_col_dims > 0 && num_col_dims < rank),
                       true,
@@ -116,12 +117,12 @@ template <typename T,
 struct EigenVector : public EigenTensor<T, 1, MajorType, IndexType> {
   // Flatten reshapes a Tensor into an EigenVector.
   static typename EigenVector::Type Flatten(
-      pt::DenseTensor& tensor) {  // NOLINT
+      pten::DenseTensor& tensor) {  // NOLINT
     return EigenVector::From(tensor, {product(tensor.dims())});
   }
 
   static typename EigenVector::ConstType Flatten(
-      const pt::DenseTensor& tensor) {  // NOLINT
+      const pten::DenseTensor& tensor) {  // NOLINT
     return EigenVector::From(tensor, {product(tensor.dims())});
   }
 };
@@ -136,11 +137,11 @@ struct EigenScalar {
   using ConstType = Eigen::TensorMap<
       Eigen::TensorFixedSize<const T, Eigen::Sizes<>, MajorType, IndexType>>;
 
-  static Type From(pt::DenseTensor& tensor) {  // NOLINT
+  static Type From(pten::DenseTensor& tensor) {  // NOLINT
     return Type(const_cast<T*>(tensor.data<T>()));
   }
 
-  static ConstType From(const pt::DenseTensor& tensor) {
+  static ConstType From(const pten::DenseTensor& tensor) {
     return ConstType(tensor.data<T>());
   }
 };
@@ -167,4 +168,4 @@ To32BitIndex(EigenTensor in) {
   return RetType(in.data(), To32BitDims(in.dimensions()));
 }
 
-}  // namespace pt
+}  // namespace pten
diff --git a/paddle/tcmpt/kernels/common/eigen/dot.h b/paddle/pten/kernels/common/eigen/dot.h
similarity index 72%
rename from paddle/tcmpt/kernels/common/eigen/dot.h
rename to paddle/pten/kernels/common/eigen/dot.h
index 32c1e1439fac7..8a7789f3dfb64 100644
--- a/paddle/tcmpt/kernels/common/eigen/dot.h
+++ b/paddle/pten/kernels/common/eigen/dot.h
@@ -14,13 +14,13 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/tcmpt/core/dense_tensor.h"
-#include "paddle/tcmpt/kernels/common/eigen/common.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/kernels/common/eigen/common.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/operators/eigen/eigen_function.h"
 
-namespace pt {
+namespace pten {
 namespace eigen {
 
 template <typename DevCtx, typename T>
@@ -30,16 +30,16 @@ void Dot(const DevCtx& dev_ctx,
          DenseTensor* out) {
   out->mutable_data();
   if (1 == out->dims().size()) {
-    auto eigen_out = pt::EigenScalar<T>::From(*out);
-    auto eigen_x = pt::EigenVector<T>::Flatten(x);
-    auto eigen_y = pt::EigenVector<T>::Flatten(y);
+    auto eigen_out = pten::EigenScalar<T>::From(*out);
+    auto eigen_x = pten::EigenVector<T>::Flatten(x);
+    auto eigen_y = pten::EigenVector<T>::Flatten(y);
 
     auto& dev = *dev_ctx.eigen_device();
     eigen_out.device(dev) = (eigen_x * eigen_y).sum();
   } else {
-    auto eigen_out = pt::EigenMatrix<T>::From(*out);
-    auto eigen_x = pt::EigenMatrix<T>::From(x);
-    auto eigen_y = pt::EigenMatrix<T>::From(y);
+    auto eigen_out = pten::EigenMatrix<T>::From(*out);
+    auto eigen_x = pten::EigenMatrix<T>::From(x);
+    auto eigen_y = pten::EigenMatrix<T>::From(y);
 
     auto& dev = *dev_ctx.eigen_device();
     eigen_out.device(dev) = (eigen_x * eigen_y).sum(Eigen::DSizes<int, 1>(1));
@@ -47,4 +47,4 @@ void Dot(const DevCtx& dev_ctx,
 }
 
 }  // namespace eigen
-}  // namespace pt
+}  // namespace pten
diff --git a/paddle/tcmpt/kernels/common/eigen/fill.h b/paddle/pten/kernels/common/eigen/fill.h
similarity index 91%
rename from paddle/tcmpt/kernels/common/eigen/fill.h
rename to paddle/pten/kernels/common/eigen/fill.h
index 186163c3fedc4..df76194839ed7 100644
--- a/paddle/tcmpt/kernels/common/eigen/fill.h
+++ b/paddle/pten/kernels/common/eigen/fill.h
@@ -14,13 +14,13 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/tcmpt/core/dense_tensor.h"
-#include "paddle/tcmpt/kernels/common/eigen/common.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/kernels/common/eigen/common.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/operators/eigen/eigen_function.h"
 
-namespace pt {
+namespace pten {
 namespace eigen {
 
 template <typename DeviceContext, typename T, typename VType>
@@ -51,9 +51,9 @@ void fill(const DeviceContext& context, DenseTensor* tensor, VType val) {
           static_cast<CommonType>(std::numeric_limits<T>::max()),
           static_cast<float>(val)));
 
-  auto t = pt::EigenVector<T>::Flatten(*tensor);
+  auto t = pten::EigenVector<T>::Flatten(*tensor);
   t.device(*context.eigen_device()) = t.constant(static_cast<T>(val));
 }
 
 }  // namespace eigen
-}  // namespace pt
+}  // namespace pten
diff --git a/paddle/tcmpt/kernels/common/eigen/mean.h b/paddle/pten/kernels/common/eigen/mean.h
similarity index 82%
rename from paddle/tcmpt/kernels/common/eigen/mean.h
rename to paddle/pten/kernels/common/eigen/mean.h
index 2b1ea95940727..9ee5ab12c9332 100644
--- a/paddle/tcmpt/kernels/common/eigen/mean.h
+++ b/paddle/pten/kernels/common/eigen/mean.h
@@ -14,13 +14,13 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/tcmpt/core/dense_tensor.h"
-#include "paddle/tcmpt/kernels/common/eigen/common.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/kernels/common/eigen/common.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/operators/eigen/eigen_function.h"
 
-namespace pt {
+namespace pten {
 namespace eigen {
 
 template <typename DevCtx, typename T>
@@ -30,12 +30,12 @@ void Mean(const DevCtx& dev_ctx, const DenseTensor& x, DenseTensor* out) {
   // TODO(chenweihang): if we design new tensor, we should support
   // the low-level calc functor use new tensor as input,
   // which may be a big project!
-  auto eigen_x = pt::EigenVector<T>::Flatten(x);
-  auto eigen_out = pt::EigenScalar<T>::From(*out);
+  auto eigen_x = pten::EigenVector<T>::Flatten(x);
+  auto eigen_out = pten::EigenScalar<T>::From(*out);
 
   auto& dev = *dev_ctx.eigen_device();
   eigen_out.device(dev) = eigen_x.mean();
 }
 
 }  // namespace eigen
-}  // namespace pt
+}  // namespace pten
diff --git a/paddle/tcmpt/kernels/common/eigen/scale.h b/paddle/pten/kernels/common/eigen/scale.h
similarity index 85%
rename from paddle/tcmpt/kernels/common/eigen/scale.h
rename to paddle/pten/kernels/common/eigen/scale.h
index 0f3e92d9db787..fda15302e2971 100644
--- a/paddle/tcmpt/kernels/common/eigen/scale.h
+++ b/paddle/pten/kernels/common/eigen/scale.h
@@ -14,13 +14,13 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/tcmpt/core/dense_tensor.h"
-#include "paddle/tcmpt/kernels/common/eigen/common.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/kernels/common/eigen/common.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/operators/eigen/eigen_function.h"
 
-namespace pt {
+namespace pten {
 namespace eigen {
 
 template <typename DevCtx, typename T>
@@ -32,8 +32,8 @@ void Scale(const DevCtx& dev_ctx,
            DenseTensor* out) {
   // calc
   out->mutable_data<T>();
-  auto eigen_out = pt::EigenVector<T>::Flatten(*out);
-  auto eigen_x = pt::EigenVector<T>::Flatten(x);
+  auto eigen_out = pten::EigenVector<T>::Flatten(*out);
+  auto eigen_x = pten::EigenVector<T>::Flatten(x);
   auto& dev = *dev_ctx.eigen_device();
   // TODO(chenweihang): now the eigen function here need the dtype of scale,
   // eigen_x, bias should be same, so here need cast for two scalar arg,
@@ -48,4 +48,4 @@ void Scale(const DevCtx& dev_ctx,
 }
 
 }  // namespace eigen
-}  // namespace pt
+}  // namespace pten
diff --git a/paddle/tcmpt/kernels/common/eigen/sign.h b/paddle/pten/kernels/common/eigen/sign.h
similarity index 84%
rename from paddle/tcmpt/kernels/common/eigen/sign.h
rename to paddle/pten/kernels/common/eigen/sign.h
index 3980976ac9cf5..1e60965b1d91b 100644
--- a/paddle/tcmpt/kernels/common/eigen/sign.h
+++ b/paddle/pten/kernels/common/eigen/sign.h
@@ -14,13 +14,13 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/tcmpt/core/dense_tensor.h"
-#include "paddle/tcmpt/kernels/common/eigen/common.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/kernels/common/eigen/common.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/operators/eigen/eigen_function.h"
 
-namespace pt {
+namespace pten {
 namespace eigen {
 
 template <typename DevCtx, typename T>
@@ -33,8 +33,8 @@ void Sign(const DevCtx& dev_ctx, const DenseTensor& x, DenseTensor* out) {
   // TODO(chenweihang): if we design new tensor, we should support
   // the low-level calc functor use new tensor as input,
   // which may be a big project!
-  auto eigen_out = pt::EigenVector<T>::Flatten(*out);
-  auto eigen_x = pt::EigenVector<T>::Flatten(x);
+  auto eigen_out = pten::EigenVector<T>::Flatten(*out);
+  auto eigen_x = pten::EigenVector<T>::Flatten(x);
 
   auto& dev = *dev_ctx.eigen_device();
   paddle::operators::EigenSign<std::decay_t<decltype(dev)>, T>::Eval(
@@ -42,4 +42,4 @@ void Sign(const DevCtx& dev_ctx, const DenseTensor& x, DenseTensor* out) {
 }
 
 }  // namespace eigen
-}  // namespace pt
+}  // namespace pten
diff --git a/paddle/tcmpt/kernels/cpu/CMakeLists.txt b/paddle/pten/kernels/cpu/CMakeLists.txt
similarity index 89%
rename from paddle/tcmpt/kernels/cpu/CMakeLists.txt
rename to paddle/pten/kernels/cpu/CMakeLists.txt
index b70c5f9ec81f0..9536f7e7d50f5 100644
--- a/paddle/tcmpt/kernels/cpu/CMakeLists.txt
+++ b/paddle/pten/kernels/cpu/CMakeLists.txt
@@ -1,5 +1,5 @@
 if(WIN32)
-    set(CURRENT_BINARY_DIR ${PADDLE_BINARY_DIR}/paddle/tcmpt/kernels/cpu)
+    set(CURRENT_BINARY_DIR ${PADDLE_BINARY_DIR}/paddle/pten/kernels/cpu)
     kernel_instantiate(creation.cc)
     kernel_instantiate(math.cc)
     kernel_instantiate(linalg.cc)
diff --git a/paddle/tcmpt/kernels/cpu/creation.cc b/paddle/pten/kernels/cpu/creation.cc
similarity index 84%
rename from paddle/tcmpt/kernels/cpu/creation.cc
rename to paddle/pten/kernels/cpu/creation.cc
index 37b589d776822..c150a7f5ae442 100644
--- a/paddle/tcmpt/kernels/cpu/creation.cc
+++ b/paddle/pten/kernels/cpu/creation.cc
@@ -12,12 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/tcmpt/kernels/cpu/creation.h"
+#include "paddle/pten/kernels/cpu/creation.h"
 
-#include "paddle/tcmpt/core/kernel_registry.h"
-#include "paddle/tcmpt/kernels/common/eigen/fill.h"
+#include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/kernels/common/eigen/fill.h"
 
-namespace pt {
+namespace pten {
 
 template <typename T>
 void FillAnyLike(const CPUContext& dev_ctx,
@@ -27,14 +27,14 @@ void FillAnyLike(const CPUContext& dev_ctx,
   eigen::fill<CPUContext, T>(dev_ctx, out, val.to<float>());
 }
 
-}  // namespace pt
+}  // namespace pten
 
 PT_REGISTER_MODULE(CreationCPU);
 
 PT_REGISTER_KERNEL("fill_any_like",
                    CPU,
                    Any,
-                   pt::FillAnyLike,
+                   pten::FillAnyLike,
                    float,
                    double,
                    int,
diff --git a/paddle/tcmpt/kernels/cpu/creation.h b/paddle/pten/kernels/cpu/creation.h
similarity index 88%
rename from paddle/tcmpt/kernels/cpu/creation.h
rename to paddle/pten/kernels/cpu/creation.h
index 2c67945892b82..7674e6bb05157 100644
--- a/paddle/tcmpt/kernels/cpu/creation.h
+++ b/paddle/pten/kernels/cpu/creation.h
@@ -14,12 +14,12 @@
 
 #pragma once
 
-#include "paddle/tcmpt/core/dense_tensor.h"
-#include "paddle/tcmpt/core/scalar.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/scalar.h"
 
 #include "paddle/fluid/platform/device_context.h"
 
-namespace pt {
+namespace pten {
 
 using CPUContext = paddle::platform::CPUDeviceContext;
 
@@ -29,4 +29,4 @@ void FillAnyLike(const CPUContext& dev_ctx,
                  const Scalar& val,
                  DenseTensor* out);
 
-}  // namespace pt
+}  // namespace pten
diff --git a/paddle/tcmpt/kernels/cpu/linalg.cc b/paddle/pten/kernels/cpu/linalg.cc
similarity index 92%
rename from paddle/tcmpt/kernels/cpu/linalg.cc
rename to paddle/pten/kernels/cpu/linalg.cc
index 821cd5c092e85..5da375c99e91d 100644
--- a/paddle/tcmpt/kernels/cpu/linalg.cc
+++ b/paddle/pten/kernels/cpu/linalg.cc
@@ -12,16 +12,16 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/tcmpt/kernels/cpu/linalg.h"
+#include "paddle/pten/kernels/cpu/linalg.h"
 
-#include "paddle/tcmpt/core/kernel_registry.h"
+#include "paddle/pten/core/kernel_registry.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/complex.h"
 
-namespace pt {
+namespace pten {
 
 template <typename T>
 void Dot(const CPUContext& dev_ctx,
@@ -53,7 +53,7 @@ void matmul(const CPUContext& dev_ctx,
             bool transpose_y,
             DenseTensor* out) {}
 
-}  // namespace pt
+}  // namespace pten
 
 PT_REGISTER_MODULE(LinalgCPU);
 
@@ -63,7 +63,7 @@ using complex128 = ::paddle::platform::complex<double>;
 PT_REGISTER_KERNEL("dot",
                    CPU,
                    Any,
-                   pt::Dot,
+                   pten::Dot,
                    float,
                    double,
                    int,
diff --git a/paddle/tcmpt/kernels/cpu/linalg.h b/paddle/pten/kernels/cpu/linalg.h
similarity index 93%
rename from paddle/tcmpt/kernels/cpu/linalg.h
rename to paddle/pten/kernels/cpu/linalg.h
index 6d9550b2882b2..a9447be74934c 100644
--- a/paddle/tcmpt/kernels/cpu/linalg.h
+++ b/paddle/pten/kernels/cpu/linalg.h
@@ -14,12 +14,12 @@
 
 #pragma once
 
-#include "paddle/tcmpt/core/dense_tensor.h"
+#include "paddle/pten/core/dense_tensor.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
 
-namespace pt {
+namespace pten {
 
 using CPUContext = paddle::platform::CPUDeviceContext;
 
@@ -37,4 +37,4 @@ void matmul(const CPUContext& dev_ctx,
             bool transpose_y,
             DenseTensor* out);
 
-}  // namespace pt
+}  // namespace pten
diff --git a/paddle/tcmpt/kernels/cpu/manipulation.cc b/paddle/pten/kernels/cpu/manipulation.cc
similarity index 89%
rename from paddle/tcmpt/kernels/cpu/manipulation.cc
rename to paddle/pten/kernels/cpu/manipulation.cc
index edf7f5aff0389..8bc3fcc14cf7e 100644
--- a/paddle/tcmpt/kernels/cpu/manipulation.cc
+++ b/paddle/pten/kernels/cpu/manipulation.cc
@@ -12,11 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/tcmpt/kernels/cpu/manipulation.h"
-#include "paddle/tcmpt/infershape/unary.h"
-#include "paddle/tcmpt/kernels/cpu/utils.h"
+#include "paddle/pten/kernels/cpu/manipulation.h"
+#include "paddle/pten/infershape/unary.h"
+#include "paddle/pten/kernels/cpu/utils.h"
 
-namespace pt {
+namespace pten {
 
 template <typename T>
 void Flatten(const CPUContext& dev_ctx,
@@ -25,7 +25,7 @@ void Flatten(const CPUContext& dev_ctx,
              int stop_axis,
              DenseTensor* out) {
   auto out_meta = FlattenInferShape(x.meta(), start_axis, stop_axis);
-  pt::Copy(dev_ctx, x, out);
+  pten::Copy(dev_ctx, x, out);
   out->mutable_meta()->lod = out_meta.lod;
   out->Resize(out_meta.dims);
 }
@@ -51,7 +51,7 @@ void FlattenWithXShape(const CPUContext& dev_ctx,
   xshape->mutable_meta()->lod = x.meta().lod;
 }
 
-}  // namespace pt
+}  // namespace pten
 
 // TODO(chenweihang): replace by better impl
 PT_REGISTER_MODULE(ManipulationCPU);
@@ -61,7 +61,7 @@ PT_REGISTER_MODULE(ManipulationCPU);
 PT_REGISTER_KERNEL("flatten_contiguous_range",
                    CPU,
                    Any,
-                   pt::Flatten,
+                   pten::Flatten,
                    float,
                    double,
                    uint8_t,
@@ -72,7 +72,7 @@ PT_REGISTER_KERNEL("flatten_contiguous_range",
 PT_REGISTER_KERNEL("flatten_contiguous_range.mid",
                    CPU,
                    Any,
-                   pt::FlattenWithXShape,
+                   pten::FlattenWithXShape,
                    float,
                    double,
                    uint8_t,
diff --git a/paddle/tcmpt/kernels/cpu/manipulation.h b/paddle/pten/kernels/cpu/manipulation.h
similarity index 88%
rename from paddle/tcmpt/kernels/cpu/manipulation.h
rename to paddle/pten/kernels/cpu/manipulation.h
index 0147dca441b25..22dfb0d8fccba 100644
--- a/paddle/tcmpt/kernels/cpu/manipulation.h
+++ b/paddle/pten/kernels/cpu/manipulation.h
@@ -14,13 +14,13 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/tcmpt/core/dense_tensor.h"
-#include "paddle/tcmpt/core/kernel_registry.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
 
-namespace pt {
+namespace pten {
 
 using CPUContext = paddle::platform::CPUDeviceContext;
 
@@ -31,4 +31,4 @@ void Flatten(const CPUContext& dev_ctx,
              int stop_axis,
              DenseTensor* out);
 
-}  // namespace pt
+}  // namespace pten
diff --git a/paddle/tcmpt/kernels/cpu/math.cc b/paddle/pten/kernels/cpu/math.cc
similarity index 85%
rename from paddle/tcmpt/kernels/cpu/math.cc
rename to paddle/pten/kernels/cpu/math.cc
index 4fa14141209a1..4fbd7cf04bf45 100644
--- a/paddle/tcmpt/kernels/cpu/math.cc
+++ b/paddle/pten/kernels/cpu/math.cc
@@ -12,17 +12,17 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/tcmpt/kernels/cpu/math.h"
+#include "paddle/pten/kernels/cpu/math.h"
 
-#include "paddle/tcmpt/kernels/common/eigen/mean.h"
-#include "paddle/tcmpt/kernels/common/eigen/scale.h"
-#include "paddle/tcmpt/kernels/common/eigen/sign.h"
+#include "paddle/pten/kernels/common/eigen/mean.h"
+#include "paddle/pten/kernels/common/eigen/scale.h"
+#include "paddle/pten/kernels/common/eigen/sign.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/platform/bfloat16.h"
 
-namespace pt {
+namespace pten {
 
 template <typename T>
 void Sign(const CPUContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
@@ -61,7 +61,7 @@ void ScaleHost(const CPUContext& dev_ctx,
                               out);
 }
 
-}  // namespace pt
+}  // namespace pten
 
 // TODO(chenweihang): replace by better impl
 PT_REGISTER_MODULE(MathCPU);
@@ -69,12 +69,12 @@ PT_REGISTER_MODULE(MathCPU);
 // NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16
 // using bfloat16 = ::paddle::platform::bfloat16;
 
-PT_REGISTER_KERNEL("sign", CPU, Any, pt::Sign, float, double) {}
-PT_REGISTER_KERNEL("mean", CPU, Any, pt::Mean, float, double) {}
+PT_REGISTER_KERNEL("sign", CPU, Any, pten::Sign, float, double) {}
+PT_REGISTER_KERNEL("mean", CPU, Any, pten::Mean, float, double) {}
 PT_REGISTER_KERNEL("scale",
                    CPU,
                    Any,
-                   pt::Scale,
+                   pten::Scale,
                    float,
                    double,
                    paddle::platform::bfloat16,
@@ -86,7 +86,7 @@ PT_REGISTER_KERNEL("scale",
 PT_REGISTER_KERNEL("scale.host",
                    CPU,
                    Any,
-                   pt::ScaleHost,
+                   pten::ScaleHost,
                    float,
                    double,
                    paddle::platform::bfloat16,
@@ -95,5 +95,5 @@ PT_REGISTER_KERNEL("scale.host",
                    int16_t,
                    int,
                    int64_t) {
-  kernel->InputAt(1).SetBackend(pt::Backend::kCPU);
+  kernel->InputAt(1).SetBackend(pten::Backend::kCPU);
 }
diff --git a/paddle/tcmpt/kernels/cpu/math.h b/paddle/pten/kernels/cpu/math.h
similarity index 91%
rename from paddle/tcmpt/kernels/cpu/math.h
rename to paddle/pten/kernels/cpu/math.h
index 3fb669b084095..3013ad9d04d0b 100644
--- a/paddle/tcmpt/kernels/cpu/math.h
+++ b/paddle/pten/kernels/cpu/math.h
@@ -14,13 +14,13 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/tcmpt/core/dense_tensor.h"
-#include "paddle/tcmpt/core/kernel_registry.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
 
-namespace pt {
+namespace pten {
 
 using CPUContext = paddle::platform::CPUDeviceContext;
 
@@ -46,4 +46,4 @@ void ScaleHost(const CPUContext& dev_ctx,
                bool bias_after_scale,
                DenseTensor* out);
 
-}  // namespace pt
+}  // namespace pten
diff --git a/paddle/tcmpt/kernels/cpu/utils.cc b/paddle/pten/kernels/cpu/utils.cc
similarity index 89%
rename from paddle/tcmpt/kernels/cpu/utils.cc
rename to paddle/pten/kernels/cpu/utils.cc
index a50cfad481693..f79a0a34fa6fd 100644
--- a/paddle/tcmpt/kernels/cpu/utils.cc
+++ b/paddle/pten/kernels/cpu/utils.cc
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/tcmpt/kernels/cpu/utils.h"
+#include "paddle/pten/kernels/cpu/utils.h"
 #include "paddle/fluid/memory/memcpy.h"
-#include "paddle/tcmpt/common/data_type.h"
-#include "paddle/tcmpt/core/convert_utils.h"
+#include "paddle/pten/common/data_type.h"
+#include "paddle/pten/core/convert_utils.h"
 
-namespace pt {
+namespace pten {
 
 void Copy(const CPUContext& dev_ctx, const DenseTensor& src, DenseTensor* dst) {
   auto* src_ptr = src.data();
@@ -50,9 +50,9 @@ void Copy(const CPUContext& dev_ctx, const DenseTensor& src, DenseTensor* dst) {
   }
 }
 
-}  // namespace pt
+}  // namespace pten
 
 // TODO(chenweihang): replace by better impl
 PT_REGISTER_MODULE(UtilsCPU);
 
-PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", CPU, Any, pt::Copy) {}
+PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", CPU, Any, pten::Copy) {}
diff --git a/paddle/tcmpt/kernels/cpu/utils.h b/paddle/pten/kernels/cpu/utils.h
similarity index 87%
rename from paddle/tcmpt/kernels/cpu/utils.h
rename to paddle/pten/kernels/cpu/utils.h
index 95ec606cc37d1..38f601b4cf91f 100644
--- a/paddle/tcmpt/kernels/cpu/utils.h
+++ b/paddle/pten/kernels/cpu/utils.h
@@ -14,15 +14,15 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/tcmpt/core/dense_tensor.h"
-#include "paddle/tcmpt/core/kernel_registry.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
-namespace pt {
+namespace pten {
 
 using CPUContext = paddle::platform::CPUDeviceContext;
 
 void Copy(const CPUContext& dev_ctx, const DenseTensor& src, DenseTensor* dst);
 
-}  // namespace pt
+}  // namespace pten
diff --git a/paddle/tcmpt/kernels/cuda/CMakeLists.txt b/paddle/pten/kernels/cuda/CMakeLists.txt
similarity index 94%
rename from paddle/tcmpt/kernels/cuda/CMakeLists.txt
rename to paddle/pten/kernels/cuda/CMakeLists.txt
index e243bad09563b..1271d93558d5b 100644
--- a/paddle/tcmpt/kernels/cuda/CMakeLists.txt
+++ b/paddle/pten/kernels/cuda/CMakeLists.txt
@@ -1,5 +1,5 @@
 if(WIN32)
-    set(CURRENT_BINARY_DIR ${PADDLE_BINARY_DIR}/paddle/tcmpt/kernels/cuda)
+    set(CURRENT_BINARY_DIR ${PADDLE_BINARY_DIR}/paddle/pten/kernels/cuda)
     kernel_instantiate(creation.cu)
     kernel_instantiate(math.cu)
     kernel_instantiate(linalg.cu)
diff --git a/paddle/tcmpt/kernels/cuda/creation.cu b/paddle/pten/kernels/cuda/creation.cu
similarity index 84%
rename from paddle/tcmpt/kernels/cuda/creation.cu
rename to paddle/pten/kernels/cuda/creation.cu
index 54afec95735df..e0732269d874a 100644
--- a/paddle/tcmpt/kernels/cuda/creation.cu
+++ b/paddle/pten/kernels/cuda/creation.cu
@@ -12,12 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/tcmpt/kernels/cuda/creation.h"
+#include "paddle/pten/kernels/cuda/creation.h"
 
-#include "paddle/tcmpt/core/kernel_registry.h"
-#include "paddle/tcmpt/kernels/common/eigen/fill.h"
+#include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/kernels/common/eigen/fill.h"
 
-namespace pt {
+namespace pten {
 
 template <typename T>
 void FillAnyLike(const CUDAContext& dev_ctx,
@@ -27,14 +27,14 @@ void FillAnyLike(const CUDAContext& dev_ctx,
   eigen::fill<CUDAContext, T>(dev_ctx, out, val.to<float>());
 }
 
-}  // namespace pt
+}  // namespace pten
 
 PT_REGISTER_MODULE(CreationCUDA);
 
 PT_REGISTER_KERNEL("fill_any_like",
                    CUDA,
                    Any,
-                   pt::FillAnyLike,
+                   pten::FillAnyLike,
                    float,
                    double,
                    int,
diff --git a/paddle/tcmpt/kernels/cuda/creation.h b/paddle/pten/kernels/cuda/creation.h
similarity index 89%
rename from paddle/tcmpt/kernels/cuda/creation.h
rename to paddle/pten/kernels/cuda/creation.h
index 7de9ce1371fff..21772f1f98d07 100644
--- a/paddle/tcmpt/kernels/cuda/creation.h
+++ b/paddle/pten/kernels/cuda/creation.h
@@ -17,12 +17,12 @@
 // CUDA and HIP use same api
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 
-#include "paddle/tcmpt/core/dense_tensor.h"
-#include "paddle/tcmpt/core/scalar.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/scalar.h"
 
 #include "paddle/fluid/platform/device_context.h"
 
-namespace pt {
+namespace pten {
 
 using CUDAContext = paddle::platform::CUDADeviceContext;
 
@@ -32,6 +32,6 @@ void FillAnyLike(const CUDAContext& dev_ctx,
                  const Scalar& val,
                  DenseTensor* out);
 
-}  // namespace pt
+}  // namespace pten
 
 #endif
diff --git a/paddle/tcmpt/kernels/cuda/linalg.cu b/paddle/pten/kernels/cuda/linalg.cu
similarity index 86%
rename from paddle/tcmpt/kernels/cuda/linalg.cu
rename to paddle/pten/kernels/cuda/linalg.cu
index 77001d988038d..a57f230244dbb 100644
--- a/paddle/tcmpt/kernels/cuda/linalg.cu
+++ b/paddle/pten/kernels/cuda/linalg.cu
@@ -12,15 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/tcmpt/kernels/cuda/linalg.h"
+#include "paddle/pten/kernels/cuda/linalg.h"
 
-#include "paddle/tcmpt/core/kernel_registry.h"
-#include "paddle/tcmpt/kernels/common/eigen/dot.h"
+#include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/kernels/common/eigen/dot.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/complex.h"
 
-namespace pt {
+namespace pten {
 
 template <typename T>
 void Dot(const CUDAContext& dev_ctx,
@@ -30,7 +30,7 @@ void Dot(const CUDAContext& dev_ctx,
   eigen::Dot<CUDAContext, T>(dev_ctx, x, y, out);
 }
 
-}  // namespace pt
+}  // namespace pten
 
 PT_REGISTER_MODULE(LinalgCUDA);
 
@@ -40,7 +40,7 @@ using complex128 = ::paddle::platform::complex<double>;
 PT_REGISTER_KERNEL("dot",
                    CUDA,
                    Any,
-                   pt::Dot,
+                   pten::Dot,
                    float,
                    double,
                    int,
diff --git a/paddle/tcmpt/kernels/cuda/linalg.h b/paddle/pten/kernels/cuda/linalg.h
similarity index 92%
rename from paddle/tcmpt/kernels/cuda/linalg.h
rename to paddle/pten/kernels/cuda/linalg.h
index 20fe0d1a4f49a..ad38f71ec080a 100644
--- a/paddle/tcmpt/kernels/cuda/linalg.h
+++ b/paddle/pten/kernels/cuda/linalg.h
@@ -17,12 +17,12 @@
 // CUDA and HIP use same api
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 
-#include "paddle/tcmpt/core/dense_tensor.h"
+#include "paddle/pten/core/dense_tensor.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
 
-namespace pt {
+namespace pten {
 
 using CUDAContext = paddle::platform::CUDADeviceContext;
 
@@ -32,6 +32,6 @@ void Dot(const CUDAContext& dev_ctx,
          const DenseTensor& y,
          DenseTensor* out);
 
-}  // namespace pt
+}  // namespace pten
 
 #endif
diff --git a/paddle/tcmpt/kernels/cuda/manipulation.cu b/paddle/pten/kernels/cuda/manipulation.cu
similarity index 90%
rename from paddle/tcmpt/kernels/cuda/manipulation.cu
rename to paddle/pten/kernels/cuda/manipulation.cu
index 99ee2506fdf41..2b68d4a292017 100644
--- a/paddle/tcmpt/kernels/cuda/manipulation.cu
+++ b/paddle/pten/kernels/cuda/manipulation.cu
@@ -12,11 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/tcmpt/infershape/unary.h"
-#include "paddle/tcmpt/kernels/cuda/manipulation.h"
-#include "paddle/tcmpt/kernels/cuda/utils.h"
+#include "paddle/pten/infershape/unary.h"
+#include "paddle/pten/kernels/cuda/manipulation.h"
+#include "paddle/pten/kernels/cuda/utils.h"
 
-namespace pt {
+namespace pten {
 
 template <typename T>
 void Flatten(const CUDAContext& dev_ctx,
@@ -25,7 +25,7 @@ void Flatten(const CUDAContext& dev_ctx,
              int stop_axis,
              DenseTensor* out) {
   auto out_meta = FlattenInferShape(x.meta(), start_axis, stop_axis);
-  pt::Copy(dev_ctx, x, out);
+  pten::Copy(dev_ctx, x, out);
   out->mutable_meta()->lod = out_meta.lod;
   out->Resize(out_meta.dims);
 }
@@ -51,7 +51,7 @@ void FlattenWithXShape(const CUDAContext& dev_ctx,
   xshape->mutable_meta()->lod = x.meta().lod;
 }
 
-}  // namespace pt
+}  // namespace pten
 
 // TODO(chenweihang): replace by better impl
 PT_REGISTER_MODULE(ManipulationCUDA);
@@ -62,7 +62,7 @@ using float16 = paddle::platform::float16;
 PT_REGISTER_KERNEL("flatten_contiguous_range",
                    CUDA,
                    Any,
-                   pt::Flatten,
+                   pten::Flatten,
                    float,
                    float16,
                    double,
@@ -74,7 +74,7 @@ PT_REGISTER_KERNEL("flatten_contiguous_range",
 PT_REGISTER_KERNEL("flatten_contiguous_range.mid",
                    CUDA,
                    Any,
-                   pt::FlattenWithXShape,
+                   pten::FlattenWithXShape,
                    float,
                    double,
                    uint8_t,
diff --git a/paddle/tcmpt/kernels/cuda/manipulation.h b/paddle/pten/kernels/cuda/manipulation.h
similarity index 93%
rename from paddle/tcmpt/kernels/cuda/manipulation.h
rename to paddle/pten/kernels/cuda/manipulation.h
index ca958eab8fa47..ac1cb0324f4ec 100644
--- a/paddle/tcmpt/kernels/cuda/manipulation.h
+++ b/paddle/pten/kernels/cuda/manipulation.h
@@ -17,12 +17,12 @@
 // CUDA and HIP use same api
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 
-#include "paddle/tcmpt/core/dense_tensor.h"
+#include "paddle/pten/core/dense_tensor.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
 
-namespace pt {
+namespace pten {
 
 using CUDAContext = paddle::platform::CUDADeviceContext;
 
@@ -33,6 +33,6 @@ void Flatten(const CUDAContext& dev_ctx,
              int stop_axis,
              DenseTensor* out);
 
-}  // namespace pt
+}  // namespace pten
 
 #endif
diff --git a/paddle/tcmpt/kernels/cuda/math.cu b/paddle/pten/kernels/cuda/math.cu
similarity index 85%
rename from paddle/tcmpt/kernels/cuda/math.cu
rename to paddle/pten/kernels/cuda/math.cu
index 113971126a71f..8a2d1dff9a67b 100644
--- a/paddle/tcmpt/kernels/cuda/math.cu
+++ b/paddle/pten/kernels/cuda/math.cu
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/tcmpt/kernels/cuda/math.h"
+#include "paddle/pten/kernels/cuda/math.h"
 
-#include "paddle/tcmpt/kernels/common/eigen/mean.h"
-#include "paddle/tcmpt/kernels/common/eigen/scale.h"
-#include "paddle/tcmpt/kernels/common/eigen/sign.h"
+#include "paddle/pten/kernels/common/eigen/mean.h"
+#include "paddle/pten/kernels/common/eigen/scale.h"
+#include "paddle/pten/kernels/common/eigen/sign.h"
 
 #ifdef __NVCC__
 #include "cub/cub.cuh"
@@ -27,10 +27,10 @@ namespace cub = hipcub;
 #endif
 
 #include "paddle/fluid/platform/float16.h"
-#include "paddle/tcmpt/core/convert_utils.h"
-#include "paddle/tcmpt/core/kernel_registry.h"
+#include "paddle/pten/core/convert_utils.h"
+#include "paddle/pten/core/kernel_registry.h"
 
-namespace pt {
+namespace pten {
 
 /**
  * Util Functors
@@ -74,10 +74,10 @@ void Mean(const CUDAContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
       nullptr, temp_storage_bytes, trans_x, out_data, size_prob, stream);
   PADDLE_ENFORCE_CUDA_SUCCESS(err);
 
-  pt::DenseTensor tmp(
+  pten::DenseTensor tmp(
       TensorMeta(paddle::framework::make_ddim(
                      {static_cast<int64_t>(temp_storage_bytes)}),
-                 pt::TransToPtBackend(dev_ctx.GetPlace()),
+                 pten::TransToPtBackend(dev_ctx.GetPlace()),
                  x.data_type(),
                  x.layout()),
       TensorStatus());
@@ -115,18 +115,18 @@ void ScaleHost(const CUDAContext& dev_ctx,
                                out);
 }
 
-}  // namespace pt
+}  // namespace pten
 
 // TODO(chenweihang): replace by better impl
 PT_REGISTER_MODULE(MathCUDA);
 
 using float16 = paddle::platform::float16;
-PT_REGISTER_KERNEL("sign", CUDA, Any, pt::Sign, float, double, float16) {}
-PT_REGISTER_KERNEL("mean", CUDA, Any, pt::Mean, float, double, float16) {}
+PT_REGISTER_KERNEL("sign", CUDA, Any, pten::Sign, float, double, float16) {}
+PT_REGISTER_KERNEL("mean", CUDA, Any, pten::Mean, float, double, float16) {}
 PT_REGISTER_KERNEL("scale",
                    CUDA,
                    Any,
-                   pt::Scale,
+                   pten::Scale,
                    float,
                    double,
                    float16,
@@ -138,7 +138,7 @@ PT_REGISTER_KERNEL("scale",
 PT_REGISTER_KERNEL("scale.host",
                    CUDA,
                    Any,
-                   pt::ScaleHost,
+                   pten::ScaleHost,
                    float,
                    double,
                    float16,
@@ -147,5 +147,5 @@ PT_REGISTER_KERNEL("scale.host",
                    int16_t,
                    int,
                    int64_t) {
-  kernel->InputAt(1).SetBackend(pt::Backend::kCPU);
+  kernel->InputAt(1).SetBackend(pten::Backend::kCPU);
 }
diff --git a/paddle/tcmpt/kernels/cuda/math.h b/paddle/pten/kernels/cuda/math.h
similarity index 94%
rename from paddle/tcmpt/kernels/cuda/math.h
rename to paddle/pten/kernels/cuda/math.h
index dc8221d6345d6..65f4f41265836 100644
--- a/paddle/tcmpt/kernels/cuda/math.h
+++ b/paddle/pten/kernels/cuda/math.h
@@ -17,12 +17,12 @@ limitations under the License. */
 // CUDA and HIP use same api
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 
-#include "paddle/tcmpt/core/dense_tensor.h"
+#include "paddle/pten/core/dense_tensor.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
 
-namespace pt {
+namespace pten {
 
 using CUDAContext = paddle::platform::CUDADeviceContext;
 
@@ -48,6 +48,6 @@ void ScaleHost(const CUDAContext& dev_ctx,
                bool bias_after_scale,
                DenseTensor* out);
 
-}  // namespace pt
+}  // namespace pten
 
 #endif
diff --git a/paddle/tcmpt/kernels/cuda/utils.cu b/paddle/pten/kernels/cuda/utils.cu
similarity index 97%
rename from paddle/tcmpt/kernels/cuda/utils.cu
rename to paddle/pten/kernels/cuda/utils.cu
index 00b32e2fbb10a..0c83c1c5c3cae 100644
--- a/paddle/tcmpt/kernels/cuda/utils.cu
+++ b/paddle/pten/kernels/cuda/utils.cu
@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/memory/memcpy.h"
-#include "paddle/tcmpt/common/data_type.h"
-#include "paddle/tcmpt/core/convert_utils.h"
-#include "paddle/tcmpt/core/kernel_registry.h"
-#include "paddle/tcmpt/kernels/cuda/utils.h"
+#include "paddle/pten/common/data_type.h"
+#include "paddle/pten/core/convert_utils.h"
+#include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/kernels/cuda/utils.h"
 
-namespace pt {
+namespace pten {
 
 void Copy(const CUDAContext& dev_ctx,
           const DenseTensor& src,
@@ -215,9 +215,9 @@ void Copy(const CUDAContext& dev_ctx,
   }
 }
 
-}  // namespace pt
+}  // namespace pten
 
 // TODO(chenweihang): replace by better impl
 PT_REGISTER_MODULE(UtilsCUDA);
 
-PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", CUDA, Any, pt::Copy) {}
+PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", CUDA, Any, pten::Copy) {}
diff --git a/paddle/tcmpt/kernels/cuda/utils.h b/paddle/pten/kernels/cuda/utils.h
similarity index 87%
rename from paddle/tcmpt/kernels/cuda/utils.h
rename to paddle/pten/kernels/cuda/utils.h
index 4d3196b2f877b..a8a6838f4602a 100644
--- a/paddle/tcmpt/kernels/cuda/utils.h
+++ b/paddle/pten/kernels/cuda/utils.h
@@ -14,15 +14,15 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/tcmpt/core/dense_tensor.h"
-#include "paddle/tcmpt/core/kernel_registry.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
-namespace pt {
+namespace pten {
 
 using CUDAContext = paddle::platform::CUDADeviceContext;
 
 void Copy(const CUDAContext& dev_ctx, const DenseTensor& src, DenseTensor* dst);
 
-}  // namespace pt
+}  // namespace pten
diff --git a/paddle/tcmpt/kernels/mkldnn/CMakeLists.txt b/paddle/pten/kernels/mkldnn/CMakeLists.txt
similarity index 100%
rename from paddle/tcmpt/kernels/mkldnn/CMakeLists.txt
rename to paddle/pten/kernels/mkldnn/CMakeLists.txt
diff --git a/paddle/tcmpt/kernels/npu/CMakeLists.txt b/paddle/pten/kernels/npu/CMakeLists.txt
similarity index 100%
rename from paddle/tcmpt/kernels/npu/CMakeLists.txt
rename to paddle/pten/kernels/npu/CMakeLists.txt
diff --git a/paddle/tcmpt/kernels/xpu/CMakeLists.txt b/paddle/pten/kernels/xpu/CMakeLists.txt
similarity index 100%
rename from paddle/tcmpt/kernels/xpu/CMakeLists.txt
rename to paddle/pten/kernels/xpu/CMakeLists.txt
diff --git a/paddle/tcmpt/module/CMakeLists.txt b/paddle/pten/module/CMakeLists.txt
similarity index 100%
rename from paddle/tcmpt/module/CMakeLists.txt
rename to paddle/pten/module/CMakeLists.txt
diff --git a/paddle/tcmpt/tests/CMakeLists.txt b/paddle/pten/tests/CMakeLists.txt
similarity index 100%
rename from paddle/tcmpt/tests/CMakeLists.txt
rename to paddle/pten/tests/CMakeLists.txt
diff --git a/paddle/tcmpt/tests/backend_test.cc b/paddle/pten/tests/backend_test.cc
similarity index 94%
rename from paddle/tcmpt/tests/backend_test.cc
rename to paddle/pten/tests/backend_test.cc
index 026e94ec4d0e7..46e099e216c41 100644
--- a/paddle/tcmpt/tests/backend_test.cc
+++ b/paddle/pten/tests/backend_test.cc
@@ -12,6 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/tcmpt/core/backend.h"
+#include "paddle/pten/core/backend.h"
 
 #include <gtest/gtest.h>
diff --git a/paddle/tcmpt/tests/dense_tensor_test.cc b/paddle/pten/tests/dense_tensor_test.cc
similarity index 62%
rename from paddle/tcmpt/tests/dense_tensor_test.cc
rename to paddle/pten/tests/dense_tensor_test.cc
index 138ef1e30e76e..db747e15a8db7 100644
--- a/paddle/tcmpt/tests/dense_tensor_test.cc
+++ b/paddle/pten/tests/dense_tensor_test.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/tcmpt/core/dense_tensor.h"
+#include "paddle/pten/core/dense_tensor.h"
 
 #include <gtest/gtest.h>
 
@@ -20,16 +20,17 @@ namespace framework = paddle::framework;
 using DDim = paddle::framework::DDim;
 
 TEST(DenseTensor, Constructor) {
-  pt::DenseTensor tensor(pt::TensorMeta(framework::make_ddim({5, 10}),
-                                        pt::Backend::kCPU,
-                                        pt::DataType::kFLOAT32,
-                                        pt::DataLayout::kNCHW,
-                                        0UL),
-                         pt::TensorStatus());
+  pten::DenseTensor tensor(
+      pten::TensorMeta(framework::make_ddim({5, 10}),
+                       pten::Backend::kCPU,
+                       paddle::experimental::DataType::kFLOAT32,
+                       paddle::experimental::DataLayout::kNCHW,
+                       0UL),
+      pten::TensorStatus());
   ASSERT_EQ(tensor.dims().size(), 2);
-  ASSERT_EQ(tensor.backend(), pt::Backend::kCPU);
-  ASSERT_EQ(tensor.data_type(), pt::DataType::kFLOAT32);
-  ASSERT_EQ(tensor.layout(), pt::DataLayout::kNCHW);
+  ASSERT_EQ(tensor.backend(), pten::Backend::kCPU);
+  ASSERT_EQ(tensor.data_type(), paddle::experimental::DataType::kFLOAT32);
+  ASSERT_EQ(tensor.layout(), paddle::experimental::DataLayout::kNCHW);
 }
 
 TEST(DenseTensor, Dims) {
diff --git a/paddle/tcmpt/tests/dtype_test.cc b/paddle/pten/tests/dtype_test.cc
similarity index 100%
rename from paddle/tcmpt/tests/dtype_test.cc
rename to paddle/pten/tests/dtype_test.cc
diff --git a/paddle/tcmpt/tests/kernel_factory_test.cc b/paddle/pten/tests/kernel_factory_test.cc
similarity index 75%
rename from paddle/tcmpt/tests/kernel_factory_test.cc
rename to paddle/pten/tests/kernel_factory_test.cc
index 66ce7cd9892ef..a3ac561d6364a 100644
--- a/paddle/tcmpt/tests/kernel_factory_test.cc
+++ b/paddle/pten/tests/kernel_factory_test.cc
@@ -12,12 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/tcmpt/core/kernel_factory.h"
+#include "paddle/pten/core/kernel_factory.h"
 
 #include "gtest/gtest.h"
 
 TEST(KernelFactory, KernelKey) {
-  pt::KernelKey key(
-      pt::Backend::kCPU, pt::DataLayout::kNCHW, pt::DataType::kFLOAT32);
+  pten::KernelKey key(pten::Backend::kCPU,
+                      paddle::experimental::DataLayout::kNCHW,
+                      paddle::experimental::DataType::kFLOAT32);
   std::cout << key;
 }
diff --git a/paddle/tcmpt/tests/layout_test.cc b/paddle/pten/tests/layout_test.cc
similarity index 100%
rename from paddle/tcmpt/tests/layout_test.cc
rename to paddle/pten/tests/layout_test.cc
diff --git a/paddle/tcmpt/tests/test_copy_api.cc b/paddle/pten/tests/test_copy_api.cc
similarity index 64%
rename from paddle/tcmpt/tests/test_copy_api.cc
rename to paddle/pten/tests/test_copy_api.cc
index 2d70e37d051d9..3307ffeb1943b 100644
--- a/paddle/tcmpt/tests/test_copy_api.cc
+++ b/paddle/pten/tests/test_copy_api.cc
@@ -15,10 +15,10 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <memory>
 
-#include "paddle/tcmpt/core/kernel_registry.h"
-#include "paddle/tcmpt/kernels/cpu/utils.h"
+#include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/kernels/cpu/utils.h"
 
-#include "paddle/tcmpt/core/dense_tensor.h"
+#include "paddle/pten/core/dense_tensor.h"
 
 PT_DECLARE_MODULE(UtilsCPU);
 
@@ -30,20 +30,20 @@ using DDim = paddle::framework::DDim;
 // 'paddle/api',
 TEST(API, copy) {
   // 1. create tensor
-  auto dense_src = std::make_shared<pt::DenseTensor>(
-      pt::TensorMeta(framework::make_ddim({2, 3}),
-                     pt::Backend::kCPU,
-                     pt::DataType::kFLOAT32,
-                     pt::DataLayout::kNCHW),
-      pt::TensorStatus());
+  auto dense_src = std::make_shared<pten::DenseTensor>(
+      pten::TensorMeta(framework::make_ddim({2, 3}),
+                       pten::Backend::kCPU,
+                       paddle::experimental::DataType::kFLOAT32,
+                       paddle::experimental::DataLayout::kNCHW),
+      pten::TensorStatus());
   auto* dense_x_data = dense_src->mutable_data<float>();
 
-  auto dense_dst = std::make_shared<pt::DenseTensor>(
-      pt::TensorMeta(framework::make_ddim({2, 3}),
-                     pt::Backend::kCPU,
-                     pt::DataType::kFLOAT32,
-                     pt::DataLayout::kNCHW),
-      pt::TensorStatus());
+  auto dense_dst = std::make_shared<pten::DenseTensor>(
+      pten::TensorMeta(framework::make_ddim({2, 3}),
+                       pten::Backend::kCPU,
+                       paddle::experimental::DataType::kFLOAT32,
+                       paddle::experimental::DataLayout::kNCHW),
+      pten::TensorStatus());
 
   for (size_t i = 0; i < 2; ++i) {
     for (size_t j = 0; j < 3; ++j) {
@@ -55,7 +55,7 @@ TEST(API, copy) {
   // 2. test API
   auto& pool = paddle::platform::DeviceContextPool::Instance();
   auto* dev_ctx = pool.GetByPlace(paddle::platform::CPUPlace());
-  pt::Copy(*dev_ctx, *(dense_src.get()), dense_dst.get());
+  pten::Copy(*dev_ctx, *(dense_src.get()), dense_dst.get());
 
   // 3. check result
   for (int64_t i = 0; i < dense_src->numel(); i++) {
diff --git a/paddle/tcmpt/tests/test_dot_api.cc b/paddle/pten/tests/test_dot_api.cc
similarity index 67%
rename from paddle/tcmpt/tests/test_dot_api.cc
rename to paddle/pten/tests/test_dot_api.cc
index 8fdae5050e239..967f1a8f17c1c 100644
--- a/paddle/tcmpt/tests/test_dot_api.cc
+++ b/paddle/pten/tests/test_dot_api.cc
@@ -15,10 +15,10 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <memory>
 
-#include "paddle/tcmpt/hapi/include/linalg.h"
+#include "paddle/pten/hapi/include/linalg.h"
 
-#include "paddle/tcmpt/core/dense_tensor.h"
-#include "paddle/tcmpt/core/kernel_registry.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
 
 PT_DECLARE_MODULE(LinalgCPU);
 
@@ -31,20 +31,20 @@ using DDim = paddle::framework::DDim;
 
 TEST(API, dot) {
   // 1. create tensor
-  auto dense_x = std::make_shared<pt::DenseTensor>(
-      pt::TensorMeta(framework::make_ddim({3, 10}),
-                     pt::Backend::kCPU,
-                     pt::DataType::kFLOAT32,
-                     pt::DataLayout::kNCHW),
-      pt::TensorStatus());
+  auto dense_x = std::make_shared<pten::DenseTensor>(
+      pten::TensorMeta(framework::make_ddim({3, 10}),
+                       pten::Backend::kCPU,
+                       paddle::experimental::DataType::kFLOAT32,
+                       paddle::experimental::DataLayout::kNCHW),
+      pten::TensorStatus());
   auto* dense_x_data = dense_x->mutable_data<float>();
 
-  auto dense_y = std::make_shared<pt::DenseTensor>(
-      pt::TensorMeta(framework::make_ddim({3, 10}),
-                     pt::Backend::kCPU,
-                     pt::DataType::kFLOAT32,
-                     pt::DataLayout::kNCHW),
-      pt::TensorStatus());
+  auto dense_y = std::make_shared<pten::DenseTensor>(
+      pten::TensorMeta(framework::make_ddim({3, 10}),
+                       pten::Backend::kCPU,
+                       paddle::experimental::DataType::kFLOAT32,
+                       paddle::experimental::DataLayout::kNCHW),
+      pten::TensorStatus());
   auto* dense_y_data = dense_y->mutable_data<float>();
 
   float sum[3] = {0.0, 0.0, 0.0};
@@ -67,12 +67,12 @@ TEST(API, dot) {
   ASSERT_EQ(out.shape()[0], 3);
   ASSERT_EQ(out.numel(), 3);
   ASSERT_EQ(out.is_cpu(), true);
-  ASSERT_EQ(out.type(), pt::DataType::kFLOAT32);
-  ASSERT_EQ(out.layout(), pt::DataLayout::kNCHW);
+  ASSERT_EQ(out.type(), paddle::experimental::DataType::kFLOAT32);
+  ASSERT_EQ(out.layout(), paddle::experimental::DataLayout::kNCHW);
   ASSERT_EQ(out.initialized(), true);
 
   auto expect_result = sum;
-  auto dense_out = std::dynamic_pointer_cast<pt::DenseTensor>(out.impl());
+  auto dense_out = std::dynamic_pointer_cast<pten::DenseTensor>(out.impl());
   auto actual_result0 = dense_out->data<float>()[0];
   auto actual_result1 = dense_out->data<float>()[1];
   auto actual_result2 = dense_out->data<float>()[2];
diff --git a/paddle/tcmpt/tests/test_fill_api.cc b/paddle/pten/tests/test_fill_api.cc
similarity index 54%
rename from paddle/tcmpt/tests/test_fill_api.cc
rename to paddle/pten/tests/test_fill_api.cc
index 0ed7248604654..5c044f520af07 100644
--- a/paddle/tcmpt/tests/test_fill_api.cc
+++ b/paddle/pten/tests/test_fill_api.cc
@@ -15,10 +15,10 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <memory>
 
-#include "paddle/tcmpt/hapi/include/creation.h"
+#include "paddle/pten/hapi/include/creation.h"
 
-#include "paddle/tcmpt/core/dense_tensor.h"
-#include "paddle/tcmpt/core/kernel_registry.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
 
 PT_DECLARE_MODULE(CreationCPU);
 
@@ -31,12 +31,12 @@ using DDim = paddle::framework::DDim;
 
 TEST(API, full_like) {
   // 1. create tensor
-  auto dense_x = std::make_shared<pt::DenseTensor>(
-      pt::TensorMeta(framework::make_ddim({3, 2}),
-                     pt::Backend::kCPU,
-                     pt::DataType::kFLOAT32,
-                     pt::DataLayout::kNCHW),
-      pt::TensorStatus());
+  auto dense_x = std::make_shared<pten::DenseTensor>(
+      pten::TensorMeta(framework::make_ddim({3, 2}),
+                       pten::Backend::kCPU,
+                       paddle::experimental::DataType::kFLOAT32,
+                       paddle::experimental::DataLayout::kNCHW),
+      pten::TensorStatus());
   auto* dense_x_data = dense_x->mutable_data<float>();
   dense_x_data[0] = 0;
 
@@ -45,18 +45,19 @@ TEST(API, full_like) {
   paddle::experimental::Tensor x(dense_x);
 
   // 2. test API
-  auto out = paddle::experimental::full_like(x, val, pt::DataType::kFLOAT32);
+  auto out = paddle::experimental::full_like(
+      x, val, paddle::experimental::DataType::kFLOAT32);
 
   // 3. check result
   ASSERT_EQ(out.shape().size(), 2);
   ASSERT_EQ(out.shape()[0], 3);
   ASSERT_EQ(out.numel(), 6);
   ASSERT_EQ(out.is_cpu(), true);
-  ASSERT_EQ(out.type(), pt::DataType::kFLOAT32);
-  ASSERT_EQ(out.layout(), pt::DataLayout::kNCHW);
+  ASSERT_EQ(out.type(), paddle::experimental::DataType::kFLOAT32);
+  ASSERT_EQ(out.layout(), paddle::experimental::DataLayout::kNCHW);
   ASSERT_EQ(out.initialized(), true);
 
-  auto dense_out = std::dynamic_pointer_cast<pt::DenseTensor>(out.impl());
+  auto dense_out = std::dynamic_pointer_cast<pten::DenseTensor>(out.impl());
   auto* actual_result = dense_out->data<float>();
   for (auto i = 0; i < 6; i++) {
     ASSERT_NEAR(actual_result[i], val, 1e-6f);
@@ -65,30 +66,31 @@ TEST(API, full_like) {
 
 TEST(API, zeros_like) {
   // 1. create tensor
-  auto dense_x = std::make_shared<pt::DenseTensor>(
-      pt::TensorMeta(framework::make_ddim({3, 2}),
-                     pt::Backend::kCPU,
-                     pt::DataType::kFLOAT32,
-                     pt::DataLayout::kNCHW),
-      pt::TensorStatus());
+  auto dense_x = std::make_shared<pten::DenseTensor>(
+      pten::TensorMeta(framework::make_ddim({3, 2}),
+                       pten::Backend::kCPU,
+                       paddle::experimental::DataType::kFLOAT32,
+                       paddle::experimental::DataLayout::kNCHW),
+      pten::TensorStatus());
   auto* dense_x_data = dense_x->mutable_data<float>();
   dense_x_data[0] = 1;
 
   paddle::experimental::Tensor x(dense_x);
 
   // 2. test API
-  auto out = paddle::experimental::zeros_like(x, pt::DataType::kFLOAT32);
+  auto out = paddle::experimental::zeros_like(
+      x, paddle::experimental::DataType::kFLOAT32);
 
   // 3. check result
   ASSERT_EQ(out.shape().size(), 2);
   ASSERT_EQ(out.shape()[0], 3);
   ASSERT_EQ(out.numel(), 6);
   ASSERT_EQ(out.is_cpu(), true);
-  ASSERT_EQ(out.type(), pt::DataType::kFLOAT32);
-  ASSERT_EQ(out.layout(), pt::DataLayout::kNCHW);
+  ASSERT_EQ(out.type(), paddle::experimental::DataType::kFLOAT32);
+  ASSERT_EQ(out.layout(), paddle::experimental::DataLayout::kNCHW);
   ASSERT_EQ(out.initialized(), true);
 
-  auto dense_out = std::dynamic_pointer_cast<pt::DenseTensor>(out.impl());
+  auto dense_out = std::dynamic_pointer_cast<pten::DenseTensor>(out.impl());
   auto* actual_result = dense_out->data<float>();
   for (auto i = 0; i < 6; i++) {
     ASSERT_NEAR(actual_result[i], 0, 1e-6f);
@@ -97,30 +99,31 @@ TEST(API, zeros_like) {
 
 TEST(API, ones_like) {
   // 1. create tensor
-  auto dense_x = std::make_shared<pt::DenseTensor>(
-      pt::TensorMeta(framework::make_ddim({3, 2}),
-                     pt::Backend::kCPU,
-                     pt::DataType::kFLOAT32,
-                     pt::DataLayout::kNCHW),
-      pt::TensorStatus());
+  auto dense_x = std::make_shared<pten::DenseTensor>(
+      pten::TensorMeta(framework::make_ddim({3, 2}),
+                       pten::Backend::kCPU,
+                       paddle::experimental::DataType::kFLOAT32,
+                       paddle::experimental::DataLayout::kNCHW),
+      pten::TensorStatus());
   auto* dense_x_data = dense_x->mutable_data<float>();
   dense_x_data[0] = 0;
 
   paddle::experimental::Tensor x(dense_x);
 
   // 2. test API
-  auto out = paddle::experimental::ones_like(x, pt::DataType::kINT32);
+  auto out = paddle::experimental::ones_like(
+      x, paddle::experimental::DataType::kINT32);
 
   // 3. check result
   ASSERT_EQ(out.shape().size(), 2);
   ASSERT_EQ(out.shape()[0], 3);
   ASSERT_EQ(out.numel(), 6);
   ASSERT_EQ(out.is_cpu(), true);
-  ASSERT_EQ(out.type(), pt::DataType::kINT32);
-  ASSERT_EQ(out.layout(), pt::DataLayout::kNCHW);
+  ASSERT_EQ(out.type(), paddle::experimental::DataType::kINT32);
+  ASSERT_EQ(out.layout(), paddle::experimental::DataLayout::kNCHW);
   ASSERT_EQ(out.initialized(), true);
 
-  auto dense_out = std::dynamic_pointer_cast<pt::DenseTensor>(out.impl());
+  auto dense_out = std::dynamic_pointer_cast<pten::DenseTensor>(out.impl());
   auto* actual_result = dense_out->data<float>();
   for (auto i = 0; i < 6; i++) {
     ASSERT_EQ(actual_result[i], 1);
diff --git a/paddle/tcmpt/tests/test_flatten_api.cc b/paddle/pten/tests/test_flatten_api.cc
similarity index 72%
rename from paddle/tcmpt/tests/test_flatten_api.cc
rename to paddle/pten/tests/test_flatten_api.cc
index d2e3ee4278e1d..1deb41f3a6722 100644
--- a/paddle/tcmpt/tests/test_flatten_api.cc
+++ b/paddle/pten/tests/test_flatten_api.cc
@@ -15,10 +15,10 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <memory>
 
-#include "paddle/tcmpt/hapi/include/manipulation.h"
+#include "paddle/pten/hapi/include/manipulation.h"
 
-#include "paddle/tcmpt/core/dense_tensor.h"
-#include "paddle/tcmpt/core/kernel_registry.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
 
 PT_DECLARE_MODULE(ManipulationCPU);
 
@@ -31,12 +31,12 @@ using DDim = paddle::framework::DDim;
 
 TEST(API, flatten) {
   // 1. create tensor
-  auto dense_x = std::make_shared<pt::DenseTensor>(
-      pt::TensorMeta(framework::make_ddim({3, 2, 2, 3}),
-                     pt::Backend::kCPU,
-                     pt::DataType::kFLOAT32,
-                     pt::DataLayout::kNCHW),
-      pt::TensorStatus());
+  auto dense_x = std::make_shared<pten::DenseTensor>(
+      pten::TensorMeta(framework::make_ddim({3, 2, 2, 3}),
+                       pten::Backend::kCPU,
+                       paddle::experimental::DataType::kFLOAT32,
+                       paddle::experimental::DataLayout::kNCHW),
+      pten::TensorStatus());
   auto* dense_x_data = dense_x->mutable_data<float>();
 
   for (int i = 0; i < dense_x->numel(); i++) {
@@ -55,11 +55,11 @@ TEST(API, flatten) {
   ASSERT_EQ(out.shape()[2], expect_shape[2]);
   ASSERT_EQ(out.numel(), 36);
   ASSERT_EQ(out.is_cpu(), true);
-  ASSERT_EQ(out.type(), pt::DataType::kFLOAT32);
-  ASSERT_EQ(out.layout(), pt::DataLayout::kNCHW);
+  ASSERT_EQ(out.type(), paddle::experimental::DataType::kFLOAT32);
+  ASSERT_EQ(out.layout(), paddle::experimental::DataLayout::kNCHW);
   ASSERT_EQ(out.initialized(), true);
   bool value_equal = true;
-  auto dense_out = std::dynamic_pointer_cast<pt::DenseTensor>(out.impl());
+  auto dense_out = std::dynamic_pointer_cast<pten::DenseTensor>(out.impl());
   auto* dense_out_data = dense_out->data<float>();
   for (int i = 0; i < dense_x->numel(); i++) {
     if (std::abs(dense_x_data[i] - dense_out_data[i]) > 1e-6f)
diff --git a/paddle/tcmpt/tests/test_mean_api.cc b/paddle/pten/tests/test_mean_api.cc
similarity index 69%
rename from paddle/tcmpt/tests/test_mean_api.cc
rename to paddle/pten/tests/test_mean_api.cc
index 518a98738961c..fbcd375d51328 100644
--- a/paddle/tcmpt/tests/test_mean_api.cc
+++ b/paddle/pten/tests/test_mean_api.cc
@@ -15,10 +15,10 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <memory>
 
-#include "paddle/tcmpt/hapi/include/math.h"
+#include "paddle/pten/hapi/include/math.h"
 
-#include "paddle/tcmpt/core/dense_tensor.h"
-#include "paddle/tcmpt/core/kernel_registry.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
 
 PT_DECLARE_MODULE(MathCPU);
 
@@ -31,12 +31,12 @@ using DDim = paddle::framework::DDim;
 
 TEST(API, mean) {
   // 1. create tensor
-  auto dense_x = std::make_shared<pt::DenseTensor>(
-      pt::TensorMeta(framework::make_ddim({3, 4}),
-                     pt::Backend::kCPU,
-                     pt::DataType::kFLOAT32,
-                     pt::DataLayout::kNCHW),
-      pt::TensorStatus());
+  auto dense_x = std::make_shared<pten::DenseTensor>(
+      pten::TensorMeta(framework::make_ddim({3, 4}),
+                       pten::Backend::kCPU,
+                       paddle::experimental::DataType::kFLOAT32,
+                       paddle::experimental::DataLayout::kNCHW),
+      pten::TensorStatus());
   auto* dense_x_data = dense_x->mutable_data<float>();
 
   float sum = 0.0;
@@ -55,12 +55,12 @@ TEST(API, mean) {
   ASSERT_EQ(out.shape()[0], 1);
   ASSERT_EQ(out.numel(), 1);
   ASSERT_EQ(out.is_cpu(), true);
-  ASSERT_EQ(out.type(), pt::DataType::kFLOAT32);
-  ASSERT_EQ(out.layout(), pt::DataLayout::kNCHW);
+  ASSERT_EQ(out.type(), paddle::experimental::DataType::kFLOAT32);
+  ASSERT_EQ(out.layout(), paddle::experimental::DataLayout::kNCHW);
   ASSERT_EQ(out.initialized(), true);
 
   auto expect_result = sum / 12;
-  auto dense_out = std::dynamic_pointer_cast<pt::DenseTensor>(out.impl());
+  auto dense_out = std::dynamic_pointer_cast<pten::DenseTensor>(out.impl());
   auto actual_result = dense_out->data<float>()[0];
   ASSERT_NEAR(expect_result, actual_result, 1e-6f);
 }
diff --git a/paddle/tcmpt/CMakeLists.txt b/paddle/tcmpt/CMakeLists.txt
deleted file mode 100644
index 0187a63c2ff6d..0000000000000
--- a/paddle/tcmpt/CMakeLists.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-include(tcmpt)
-# tcmpt api
-add_subdirectory(api)
-# tcmpt high level api
-add_subdirectory(hapi)
-# tcmpt core components
-add_subdirectory(core)
-# tcmpt kernels for diff device
-add_subdirectory(kernels)
-# tcmpt infershape
-add_subdirectory(infershape)
-# TODO(xingfeng): tcmpt inner module API designed by a high-performance team
-add_subdirectory(module)
-# tcmpt tests
-add_subdirectory(tests)
diff --git a/paddle/tcmpt/api/CMakeLists.txt b/paddle/tcmpt/api/CMakeLists.txt
deleted file mode 100644
index bf4d163a62bfc..0000000000000
--- a/paddle/tcmpt/api/CMakeLists.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-# set(declare_file ${PADDLE_BINARY_DIR}/paddle/tcmpt/api/symbols.h.tmp CACHE INTERNAL "symbols.h file")
-# set(declare_file_final ${PADDLE_BINARY_DIR}/paddle/tcmpt/api/symbols.h)
-# file(WRITE ${declare_file} "// Generated by the paddle/tcmpt/api/CMakeLists.txt.  DO NOT EDIT!\n\n")
-
-# function(declare_module TARGTE)
-#     file(APPEND ${declare_file} "extern int RegisterSymbolsFor${TARGET}();\n")
-#     message(STATUS "")
-# endfunction()
-
-# TODO(chenweihang): unify decclare into **_library
-# declare_module(MathCPU)
-# declare_module(MathCUDA)
-
-set(TCMPT_DEPS convert_utils dense_tensor kernel_factory kernel_context)
-set(TCMPT_DEPS ${TCMPT_DEPS} math_cpu linalg_cpu creation_cpu manipulation_cpu)
-set(TCMPT_DEPS ${TCMPT_DEPS} unary binary)
-if(WITH_GPU OR WITH_ROCM)
-  set(TCMPT_DEPS ${TCMPT_DEPS} math_cuda linalg_cuda creation_cuda manipulation_cuda)
-endif()
-
-cc_library(tcmpt SRCS all.cc DEPS ${TCMPT_DEPS})
diff --git a/paddle/tcmpt/hapi/CMakeLists.txt b/paddle/tcmpt/hapi/CMakeLists.txt
deleted file mode 100644
index ebc247ef8a2e2..0000000000000
--- a/paddle/tcmpt/hapi/CMakeLists.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-add_subdirectory(lib)
-
-cc_library(tcmpt_hapi SRCS all.cc DEPS math_api linalg_api creation_api)
diff --git a/paddle/tcmpt/hapi/lib/CMakeLists.txt b/paddle/tcmpt/hapi/lib/CMakeLists.txt
deleted file mode 100644
index 74467603c62b6..0000000000000
--- a/paddle/tcmpt/hapi/lib/CMakeLists.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-cc_library(math_api SRCS math.cc DEPS tcmpt)
-cc_library(linalg_api SRCS linalg.cc DEPS tcmpt)
-cc_library(creation_api SRCS creation.cc DEPS tcmpt)
-cc_library(manipulation_api SRCS manipulation.cc DEPS tcmpt)

From beec280677aef38b181618cd94b3182d94a1f165 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Wed, 20 Oct 2021 03:35:39 +0000
Subject: [PATCH 094/125] remove k of all enum var

---
 paddle/fluid/framework/tcmpt_utils.cc         |   8 +-
 paddle/fluid/framework/tcmpt_utils_test.cc    |  14 +-
 paddle/tcmpt/api/CMakeLists.txt               |  13 --
 paddle/tcmpt/common/backend.h                 |  91 +++++++++++++
 paddle/tcmpt/common/data_type.h               | 120 +++++++++---------
 paddle/tcmpt/common/layout.h                  |  22 ++--
 paddle/tcmpt/core/CMakeLists.txt              |   8 +-
 paddle/tcmpt/core/convert_utils.cc            | 104 +++++++--------
 paddle/tcmpt/core/convert_utils.h             |   4 +-
 paddle/tcmpt/core/dense_tensor.cc             |  18 +--
 paddle/tcmpt/core/kernel_factory.cc           |   5 +-
 paddle/tcmpt/core/kernel_factory.h            |   8 +-
 paddle/tcmpt/core/kernel_registry.h           |  20 +--
 paddle/tcmpt/core/tensor_base.h               |   2 +-
 paddle/tcmpt/core/tensor_meta.h               |  16 +--
 paddle/tcmpt/core/tensor_status.h             |   2 +-
 .../hapi/include/{backend.h => backend_set.h} |  70 +---------
 paddle/tcmpt/hapi/include/creation.h          |   7 +-
 paddle/tcmpt/hapi/include/tensor.h            |  23 ++--
 paddle/tcmpt/hapi/include/tensor_signature.h  |  17 +--
 paddle/tcmpt/hapi/lib/creation.cc             |  18 +--
 .../{kernel_generate.h => kernel_dispatch.h}  |  93 +++++++-------
 paddle/tcmpt/hapi/lib/linalg.cc               |  19 +--
 paddle/tcmpt/hapi/lib/manipulation.cc         |  18 +--
 paddle/tcmpt/hapi/lib/math.cc                 |  19 +--
 paddle/tcmpt/kernels/cpu/creation.cc          |   2 +-
 paddle/tcmpt/kernels/cpu/linalg.cc            |   2 +-
 paddle/tcmpt/kernels/cpu/manipulation.cc      |   4 +-
 paddle/tcmpt/kernels/cpu/math.cc              |  10 +-
 paddle/tcmpt/kernels/cpu/utils.cc             |   2 +-
 paddle/tcmpt/kernels/cuda/creation.cu         |   2 +-
 paddle/tcmpt/kernels/cuda/linalg.cu           |   2 +-
 paddle/tcmpt/kernels/cuda/manipulation.cu     |   4 +-
 paddle/tcmpt/kernels/cuda/math.cu             |  10 +-
 paddle/tcmpt/kernels/cuda/utils.cu            |   2 +-
 paddle/tcmpt/tests/backend_test.cc            |   2 +-
 paddle/tcmpt/tests/dense_tensor_test.cc       |  12 +-
 paddle/tcmpt/tests/kernel_factory_test.cc     |   2 +-
 paddle/tcmpt/tests/test_copy_api.cc           |  12 +-
 paddle/tcmpt/tests/test_dot_api.cc            |  16 +--
 paddle/tcmpt/tests/test_fill_api.cc           |  36 +++---
 paddle/tcmpt/tests/test_flatten_api.cc        |  10 +-
 paddle/tcmpt/tests/test_mean_api.cc           |  10 +-
 43 files changed, 420 insertions(+), 459 deletions(-)
 create mode 100644 paddle/tcmpt/common/backend.h
 rename paddle/tcmpt/hapi/include/{backend.h => backend_set.h} (57%)
 rename paddle/tcmpt/hapi/lib/{kernel_generate.h => kernel_dispatch.h} (54%)

diff --git a/paddle/fluid/framework/tcmpt_utils.cc b/paddle/fluid/framework/tcmpt_utils.cc
index fc38eb42d74c7..81ad798c9686c 100644
--- a/paddle/fluid/framework/tcmpt_utils.cc
+++ b/paddle/fluid/framework/tcmpt_utils.cc
@@ -158,9 +158,9 @@ OpKernelType TransPtKernelKeyToOpKernelType(const pt::KernelKey& kernel_key) {
   platform::Place place = pt::TransToFluidPlace(kernel_key.backend());
   DataLayout data_layout = pt::TransToFluidDataLayout(kernel_key.layout());
   LibraryType library_type = LibraryType::kPlain;
-  if (kernel_key.backend() == pt::Backend::kMKLDNN) {
+  if (kernel_key.backend() == pt::Backend::MKLDNN) {
     library_type = LibraryType::kMKLDNN;
-  } else if (kernel_key.backend() == pt::Backend::kCUDNN) {
+  } else if (kernel_key.backend() == pt::Backend::CUDNN) {
     library_type = LibraryType::kCUDNN;
   } else {
     // do nothing
@@ -172,9 +172,9 @@ OpKernelType TransPtKernelKeyToOpKernelType(const pt::KernelKey& kernel_key) {
 pt::KernelKey TransOpKernelTypeToPtKernelKey(const OpKernelType& kernel_type) {
   pt::Backend backend = pt::TransToPtBackend(kernel_type.place_);
   if (kernel_type.library_type_ == LibraryType::kMKLDNN) {
-    backend = pt::Backend::kMKLDNN;
+    backend = pt::Backend::MKLDNN;
   } else if (kernel_type.library_type_ == LibraryType::kCUDNN) {
-    backend = pt::Backend::kCUDNN;
+    backend = pt::Backend::CUDNN;
   } else {
     // do
   }
diff --git a/paddle/fluid/framework/tcmpt_utils_test.cc b/paddle/fluid/framework/tcmpt_utils_test.cc
index 200bd5429cd46..c2b31b01716af 100644
--- a/paddle/fluid/framework/tcmpt_utils_test.cc
+++ b/paddle/fluid/framework/tcmpt_utils_test.cc
@@ -37,8 +37,8 @@ TEST(TcmptUtils, MakeTensor) {
   std::vector<float> expect_value = {0.2, 0.5};
   ASSERT_EQ(dense_x->data<float>()[0], expect_value[0]);
   ASSERT_EQ(dense_x->data<float>()[1], expect_value[1]);
-  ASSERT_EQ(dense_x->backend(), pt::Backend::kCPU);
-  ASSERT_EQ(dense_x->data_type(), pt::DataType::kFLOAT32);
+  ASSERT_EQ(dense_x->backend(), pt::Backend::CPU);
+  ASSERT_EQ(dense_x->data_type(), pt::DataType::FLOAT32);
 }
 
 TEST(TcmptUtils, VarToPtTensor) {
@@ -49,18 +49,18 @@ TEST(TcmptUtils, VarToPtTensor) {
   auto* data =
       value->mutable_data<int>(make_ddim({1, 1}), paddle::platform::CPUPlace());
   data[0] = 123;
-  pt::Backend expect_backend = pt::Backend::kCPU;
+  pt::Backend expect_backend = pt::Backend::CPU;
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  expect_backend = pt::Backend::kCUDA;
+  expect_backend = pt::Backend::CUDA;
 #endif
-  auto tensor_def = pt::TensorArgDef(expect_backend, pt::DataLayout::kNCHW,
-                                     pt::DataType::kINT32);
+  auto tensor_def = pt::TensorArgDef(expect_backend, pt::DataLayout::NCHW,
+                                     pt::DataType::INT32);
   // 2. test API
   auto tensor_x = InputVariableToPtTensor(v, tensor_def);
   // 3. check result
   ASSERT_EQ(tensor_x->backend(), expect_backend);
-  ASSERT_EQ(tensor_x->data_type(), pt::DataType::kINT32);
+  ASSERT_EQ(tensor_x->data_type(), pt::DataType::INT32);
 }
 
 }  // namespace framework
diff --git a/paddle/tcmpt/api/CMakeLists.txt b/paddle/tcmpt/api/CMakeLists.txt
index bf4d163a62bfc..0616aa3dfc578 100644
--- a/paddle/tcmpt/api/CMakeLists.txt
+++ b/paddle/tcmpt/api/CMakeLists.txt
@@ -1,16 +1,3 @@
-# set(declare_file ${PADDLE_BINARY_DIR}/paddle/tcmpt/api/symbols.h.tmp CACHE INTERNAL "symbols.h file")
-# set(declare_file_final ${PADDLE_BINARY_DIR}/paddle/tcmpt/api/symbols.h)
-# file(WRITE ${declare_file} "// Generated by the paddle/tcmpt/api/CMakeLists.txt.  DO NOT EDIT!\n\n")
-
-# function(declare_module TARGTE)
-#     file(APPEND ${declare_file} "extern int RegisterSymbolsFor${TARGET}();\n")
-#     message(STATUS "")
-# endfunction()
-
-# TODO(chenweihang): unify decclare into **_library
-# declare_module(MathCPU)
-# declare_module(MathCUDA)
-
 set(TCMPT_DEPS convert_utils dense_tensor kernel_factory kernel_context)
 set(TCMPT_DEPS ${TCMPT_DEPS} math_cpu linalg_cpu creation_cpu manipulation_cpu)
 set(TCMPT_DEPS ${TCMPT_DEPS} unary binary)
diff --git a/paddle/tcmpt/common/backend.h b/paddle/tcmpt/common/backend.h
new file mode 100644
index 0000000000000..c4bb334f86c6d
--- /dev/null
+++ b/paddle/tcmpt/common/backend.h
@@ -0,0 +1,91 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <ostream>
+
+namespace paddle {
+namespace experimental {
+
+/**
+ * [ Why need Backend? ]
+ *
+ * Backend not only means place. Backend is a superset of place.
+ *
+ * Place cannot indicate the difference in calculation methods on the device,
+ * but in order to make the boundary of the kernel clearer and the function
+ * more specific, we need to distinguish the calculation method.
+ *
+ * Such as the kernel for CUDA device, it can be a native CUDA kernel,
+ * or a kernel implemented by CUDNN library.
+ *
+ * Note(chenweihang): HIP is not needed now, we can added it if needed
+ * in the future
+ */
+enum class Backend : uint8_t {
+  // kernel backend cannot be undefined
+  UNDEFINED = 0,
+
+  // basic kernel backend
+  CPU,
+
+  // various acceleration devices' backends
+  CUDA,
+  XPU,  // XPU currently does not exist at the same time as CUDA
+  NPU,  // NPU currently does not exist at the same time as CUDA
+
+  // the third library backend
+  MKLDNN,
+  CUDNN,
+
+  // end of backend types
+  NUM_BACKENDS,
+};
+
+inline std::ostream& operator<<(std::ostream& os, Backend backend) {
+  switch (backend) {
+    case Backend::UNDEFINED:
+      os << "Undefined";
+      break;
+    case Backend::CPU:
+      os << "CPU";
+      break;
+    case Backend::CUDA:
+      os << "CUDA";
+      break;
+    case Backend::XPU:
+      os << "XPU";
+      break;
+    case Backend::NPU:
+      os << "NPU";
+      break;
+    case Backend::MKLDNN:
+      os << "MKLDNN";
+      break;
+    case Backend::CUDNN:
+      os << "CUDNN";
+      break;
+    default:
+      throw std::runtime_error("Invalid Backend type.");
+  }
+  return os;
+}
+
+}  // namespace experimental
+}  // namespace paddle
+
+namespace pt {
+using Backend = paddle::experimental::Backend;
+}
diff --git a/paddle/tcmpt/common/data_type.h b/paddle/tcmpt/common/data_type.h
index 03881e6bda1ca..195a0fceef6dd 100644
--- a/paddle/tcmpt/common/data_type.h
+++ b/paddle/tcmpt/common/data_type.h
@@ -30,48 +30,48 @@ using float16 = ::paddle::platform::float16;
 using bfloat16 = ::paddle::platform::bfloat16;
 
 enum class DataType {
-  kUndef = 0,
-  kBOOL,
-  kINT8,   // Char
-  kUINT8,  // BYte
-  kINT16,
-  kINT32,
-  kUINT32,
-  kINT64,
-  kUINT64,
-  kBFLOAT16,
-  kFLOAT16,
-  kUINT16,
-  kFLOAT32,
-  kFLOAT64,
-  kCOMPLEX64,
-  kCOMPLEX128,
-  kNumDataTypes
+  UNDEFINED = 0,
+  BOOL,
+  INT8,   // Char
+  UINT8,  // BYte
+  INT16,
+  INT32,
+  UINT32,
+  INT64,
+  UINT64,
+  BFLOAT16,
+  FLOAT16,
+  UINT16,
+  FLOAT32,
+  FLOAT64,
+  COMPLEX64,
+  COMPLEX128,
+  NUM_DATA_TYPES
 };
 
 inline size_t SizeOf(DataType data_type) {
   switch (data_type) {
-    case DataType::kBOOL:
-    case DataType::kUINT8:
-    case DataType::kINT8:
+    case DataType::BOOL:
+    case DataType::UINT8:
+    case DataType::INT8:
       return 1;
-    case DataType::kFLOAT16:
-    case DataType::kINT16:
-    case DataType::kUINT16:
+    case DataType::FLOAT16:
+    case DataType::INT16:
+    case DataType::UINT16:
       return 2;
-    case DataType::kFLOAT32:
-    case DataType::kINT32:
-    case DataType::kUINT32:
+    case DataType::FLOAT32:
+    case DataType::INT32:
+    case DataType::UINT32:
       return 4;
-    case DataType::kFLOAT64:
-    case DataType::kINT64:
-    case DataType::kUINT64:
+    case DataType::FLOAT64:
+    case DataType::INT64:
+    case DataType::UINT64:
       return 8;
-    case DataType::kUndef:
-    case DataType::kBFLOAT16:
-    case DataType::kCOMPLEX64:
-    case DataType::kCOMPLEX128:
-    case DataType::kNumDataTypes:
+    case DataType::UNDEFINED:
+    case DataType::BFLOAT16:
+    case DataType::COMPLEX64:
+    case DataType::COMPLEX128:
+    case DataType::NUM_DATA_TYPES:
       PADDLE_THROW(platform::errors::Unimplemented(
           "Data type %d is not supported by tensor.",
           static_cast<int>(data_type)));
@@ -79,19 +79,19 @@ inline size_t SizeOf(DataType data_type) {
   }
 }
 
-#define PT_FOR_EACH_DATA_TYPE(_)     \
-  _(bool, DataType::kBOOL)           \
-  _(int8_t, DataType::kINT8)         \
-  _(uint8_t, DataType::kUINT8)       \
-  _(int16_t, DataType::kINT16)       \
-  _(int, DataType::kINT32)           \
-  _(int64_t, DataType::kINT64)       \
-  _(bfloat16, DataType::kBFLOAT16)   \
-  _(float16, DataType::kFLOAT16)     \
-  _(float, DataType::kFLOAT32)       \
-  _(double, DataType::kFLOAT64)      \
-  _(complex64, DataType::kCOMPLEX64) \
-  _(complex128, DataType::kCOMPLEX128)
+#define PT_FOR_EACH_DATA_TYPE(_)    \
+  _(bool, DataType::BOOL)           \
+  _(int8_t, DataType::INT8)         \
+  _(uint8_t, DataType::UINT8)       \
+  _(int16_t, DataType::INT16)       \
+  _(int, DataType::INT32)           \
+  _(int64_t, DataType::INT64)       \
+  _(bfloat16, DataType::BFLOAT16)   \
+  _(float16, DataType::FLOAT16)     \
+  _(float, DataType::FLOAT32)       \
+  _(double, DataType::FLOAT64)      \
+  _(complex64, DataType::COMPLEX64) \
+  _(complex128, DataType::COMPLEX128)
 
 template <DataType T>
 struct DataTypeToCppType;
@@ -121,43 +121,43 @@ PT_FOR_EACH_DATA_TYPE(PT_SPECIALIZE_CppTypeToDataType)
 
 inline std::ostream& operator<<(std::ostream& os, DataType dtype) {
   switch (dtype) {
-    case DataType::kUndef:
+    case DataType::UNDEFINED:
       os << "Undefined";
       break;
-    case DataType::kBOOL:
+    case DataType::BOOL:
       os << "bool";
       break;
-    case DataType::kINT8:
+    case DataType::INT8:
       os << "int8";
       break;
-    case DataType::kUINT8:
+    case DataType::UINT8:
       os << "uint8";
       break;
-    case DataType::kINT16:
+    case DataType::INT16:
       os << "int16";
       break;
-    case DataType::kINT32:
+    case DataType::INT32:
       os << "int32";
       break;
-    case DataType::kINT64:
+    case DataType::INT64:
       os << "int64";
       break;
-    case DataType::kBFLOAT16:
+    case DataType::BFLOAT16:
       os << "bfloat16";
       break;
-    case DataType::kFLOAT16:
+    case DataType::FLOAT16:
       os << "float16";
       break;
-    case DataType::kFLOAT32:
+    case DataType::FLOAT32:
       os << "float32";
       break;
-    case DataType::kFLOAT64:
+    case DataType::FLOAT64:
       os << "float64";
       break;
-    case DataType::kCOMPLEX64:
+    case DataType::COMPLEX64:
       os << "complex64";
       break;
-    case DataType::kCOMPLEX128:
+    case DataType::COMPLEX128:
       os << "complex128";
       break;
     default:
diff --git a/paddle/tcmpt/common/layout.h b/paddle/tcmpt/common/layout.h
index ae4e43a9f7197..b99dae4d031c6 100644
--- a/paddle/tcmpt/common/layout.h
+++ b/paddle/tcmpt/common/layout.h
@@ -18,29 +18,29 @@ namespace paddle {
 namespace experimental {
 
 enum class DataLayout {
-  kUndef = 0,
-  kAny,
-  kNHWC,
-  kNCHW,
-  kMKLDNN,
-  kNumLayouts,
+  UNDEFINED = 0,
+  ANY,
+  NHWC,
+  NCHW,
+  MKLDNN,
+  NUM_DATA_LAYOUTS,
 };
 
 inline std::ostream& operator<<(std::ostream& os, DataLayout dtype) {
   switch (dtype) {
-    case DataLayout::kUndef:
+    case DataLayout::UNDEFINED:
       os << "Undefined";
       break;
-    case DataLayout::kAny:
+    case DataLayout::ANY:
       os << "Any";
       break;
-    case DataLayout::kNHWC:
+    case DataLayout::NHWC:
       os << "NHWC";
       break;
-    case DataLayout::kNCHW:
+    case DataLayout::NCHW:
       os << "NCHW";
       break;
-    case DataLayout::kMKLDNN:
+    case DataLayout::MKLDNN:
       os << "MKLDNN";
       break;
     default:
diff --git a/paddle/tcmpt/core/CMakeLists.txt b/paddle/tcmpt/core/CMakeLists.txt
index 3e6a26fa27c2b..448f7123c38b9 100644
--- a/paddle/tcmpt/core/CMakeLists.txt
+++ b/paddle/tcmpt/core/CMakeLists.txt
@@ -5,13 +5,13 @@ ELSE()
 ENDIF()
 
 if(WITH_GPU)
-    cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place backend gpu_info)
+    cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place gpu_info)
 elseif(WITH_ROCM)
-    cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place backend gpu_info)
+    cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place gpu_info)
 else()
-    cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place backend)
+    cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place)
 endif()
 cc_library(dense_tensor SRCS dense_tensor.cc DEPS enforce data_type ddim allocator place convert_utils ${MKLDNN_CTX_DEPS})
 
-cc_library(kernel_factory SRCS kernel_factory.cc DEPS enforce backend)
+cc_library(kernel_factory SRCS kernel_factory.cc DEPS enforce)
 cc_library(kernel_context SRCS kernel_context.cc DEPS enforce device_context)
diff --git a/paddle/tcmpt/core/convert_utils.cc b/paddle/tcmpt/core/convert_utils.cc
index e5b8acba19cf0..c615bd3bfaa7f 100644
--- a/paddle/tcmpt/core/convert_utils.cc
+++ b/paddle/tcmpt/core/convert_utils.cc
@@ -19,22 +19,14 @@ limitations under the License. */
 
 namespace pt {
 
-// TODO(chenweihang): Add other place branchs
+// TODO(chenweihang): Add other place trans cases later
 Backend TransToPtBackend(const paddle::platform::Place& place) {
   if (paddle::platform::is_cpu_place(place)) {
-    return Backend::kCPU;
+    return Backend::CPU;
   } else if (paddle::platform::is_gpu_place(place)) {
-    return Backend::kCUDA;
-  } else if (paddle::platform::is_cuda_pinned_place(place)) {
-    return Backend::kCUDAPinned;
-  } else if (paddle::platform::is_xpu_place(place)) {
-    return Backend::kXPU;
-  } else if (paddle::platform::is_npu_place(place)) {
-    return Backend::kNPU;
-  } else if (paddle::platform::is_npu_pinned_place(place)) {
-    return Backend::kNPUPinned;
+    return Backend::CUDA;
   } else {
-    return Backend::kUndef;
+    return Backend::UNDEFINED;
   }
 }
 
@@ -44,75 +36,65 @@ pt::DataType TransToPtDataType(
   // the data type is used
   switch (dtype) {
     case paddle::framework::proto::VarType::FP32:
-      return DataType::kFLOAT32;
+      return DataType::FLOAT32;
     case paddle::framework::proto::VarType::FP64:
-      return DataType::kFLOAT64;
+      return DataType::FLOAT64;
     case paddle::framework::proto::VarType::INT64:
-      return DataType::kINT64;
+      return DataType::INT64;
     case paddle::framework::proto::VarType::INT32:
-      return DataType::kINT32;
+      return DataType::INT32;
     case paddle::framework::proto::VarType::INT8:
-      return DataType::kINT8;
+      return DataType::INT8;
     case paddle::framework::proto::VarType::UINT8:
-      return DataType::kUINT8;
+      return DataType::UINT8;
     case paddle::framework::proto::VarType::INT16:
-      return DataType::kINT16;
+      return DataType::INT16;
     case paddle::framework::proto::VarType::COMPLEX64:
-      return DataType::kCOMPLEX64;
+      return DataType::COMPLEX64;
     case paddle::framework::proto::VarType::COMPLEX128:
-      return DataType::kCOMPLEX128;
+      return DataType::COMPLEX128;
     case paddle::framework::proto::VarType::FP16:
-      return DataType::kFLOAT16;
+      return DataType::FLOAT16;
     case paddle::framework::proto::VarType::BF16:
-      return DataType::kBFLOAT16;
+      return DataType::BFLOAT16;
     case paddle::framework::proto::VarType::BOOL:
-      return DataType::kBOOL;
+      return DataType::BOOL;
     default:
-      return DataType::kUndef;
+      return DataType::UNDEFINED;
   }
 }
 
 DataLayout TransToPtDataLayout(const paddle::framework::DataLayout& layout) {
   switch (layout) {
     case paddle::framework::DataLayout::kNHWC:
-      return DataLayout::kNHWC;
+      return DataLayout::NHWC;
     case paddle::framework::DataLayout::kNCHW:
-      return DataLayout::kNCHW;
+      return DataLayout::NCHW;
     case paddle::framework::DataLayout::kAnyLayout:
-      return DataLayout::kAny;
+      return DataLayout::ANY;
     case paddle::framework::DataLayout::kMKLDNN:
-      return DataLayout::kMKLDNN;
+      return DataLayout::MKLDNN;
     default:
-      return DataLayout::kUndef;
+      return DataLayout::UNDEFINED;
   }
 }
 
 paddle::platform::Place TransToFluidPlace(const Backend& backend) {
-  // TODO(chenweihang): add other trans cases
+  // TODO(chenweihang): add other trans cases later
   switch (backend) {
-    case pt::Backend::kCPU:
+    case pt::Backend::CPU:
       return paddle::platform::CPUPlace();
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    case pt::Backend::kCUDA:
+    case pt::Backend::CUDA:
       return paddle::platform::CUDAPlace(
           paddle::platform::GetCurrentDeviceId());
 #endif
-#ifdef PADDLE_WITH_XPU
-    case pt::Backend::kXPU:
-      // TODO(chenweihang): add device id
-      return paddle::platform::XPUPlace();
-#endif
-#ifdef PADDLE_WITH_NPU
-    case pt::Backend::kNPU:
-      // TODO(chenweihang): add device id
-      return paddle::platform::NPUPlace();
-#endif
 #ifdef PADDLE_WITH_MKLDNN
-    case pt::Backend::kMKLDNN:
+    case pt::Backend::MKLDNN:
       return paddle::platform::CPUPlace();
 #endif
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    case pt::Backend::kCUDNN:
+    case pt::Backend::CUDNN:
       return paddle::platform::CUDAPlace(
           paddle::platform::GetCurrentDeviceId());
 #endif
@@ -128,29 +110,29 @@ paddle::framework::proto::VarType::Type TransToProtoVarType(
   // Set the order of case branches according to the frequency with
   // the data type is used
   switch (dtype) {
-    case DataType::kFLOAT32:
+    case DataType::FLOAT32:
       return paddle::framework::proto::VarType::FP32;
-    case DataType::kFLOAT64:
+    case DataType::FLOAT64:
       return paddle::framework::proto::VarType::FP64;
-    case DataType::kINT64:
+    case DataType::INT64:
       return paddle::framework::proto::VarType::INT64;
-    case DataType::kINT32:
+    case DataType::INT32:
       return paddle::framework::proto::VarType::INT32;
-    case DataType::kINT8:
+    case DataType::INT8:
       return paddle::framework::proto::VarType::INT8;
-    case DataType::kUINT8:
+    case DataType::UINT8:
       return paddle::framework::proto::VarType::UINT8;
-    case DataType::kINT16:
+    case DataType::INT16:
       return paddle::framework::proto::VarType::INT16;
-    case DataType::kCOMPLEX64:
+    case DataType::COMPLEX64:
       return paddle::framework::proto::VarType::COMPLEX64;
-    case DataType::kCOMPLEX128:
+    case DataType::COMPLEX128:
       return paddle::framework::proto::VarType::COMPLEX128;
-    case DataType::kFLOAT16:
+    case DataType::FLOAT16:
       return paddle::framework::proto::VarType::FP16;
-    case DataType::kBFLOAT16:
+    case DataType::BFLOAT16:
       return paddle::framework::proto::VarType::BF16;
-    case DataType::kBOOL:
+    case DataType::BOOL:
       return paddle::framework::proto::VarType::BOOL;
     default:
       PADDLE_THROW(paddle::platform::errors::Unimplemented(
@@ -162,13 +144,13 @@ paddle::framework::proto::VarType::Type TransToProtoVarType(
 
 paddle::framework::DataLayout TransToFluidDataLayout(const DataLayout& layout) {
   switch (layout) {
-    case DataLayout::kNHWC:
+    case DataLayout::NHWC:
       return paddle::framework::DataLayout::kNHWC;
-    case DataLayout::kNCHW:
+    case DataLayout::NCHW:
       return paddle::framework::DataLayout::kNCHW;
-    case DataLayout::kAny:
+    case DataLayout::ANY:
       return paddle::framework::DataLayout::kAnyLayout;
-    case DataLayout::kMKLDNN:
+    case DataLayout::MKLDNN:
       return paddle::framework::DataLayout::kMKLDNN;
     default:
       PADDLE_THROW(paddle::platform::errors::Unimplemented(
diff --git a/paddle/tcmpt/core/convert_utils.h b/paddle/tcmpt/core/convert_utils.h
index 011652bdc9572..8fbacc8f663b0 100644
--- a/paddle/tcmpt/core/convert_utils.h
+++ b/paddle/tcmpt/core/convert_utils.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/tcmpt/common/backend.h"
 #include "paddle/tcmpt/common/data_type.h"
 #include "paddle/tcmpt/common/layout.h"
-#include "paddle/tcmpt/core/backend.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/data_layout.h"
@@ -30,8 +30,6 @@ namespace pt {
 using DataType = paddle::experimental::DataType;
 using DataLayout = paddle::experimental::DataLayout;
 
-// TODO(chenweihang): Use the original var type as much as possible
-// to avoid transform, such as DataLayout, VarType
 Backend TransToPtBackend(const paddle::platform::Place& place);
 DataType TransToPtDataType(
     const paddle::framework::proto::VarType::Type& dtype);
diff --git a/paddle/tcmpt/core/dense_tensor.cc b/paddle/tcmpt/core/dense_tensor.cc
index 9c34b5823d590..806a5fb938419 100644
--- a/paddle/tcmpt/core/dense_tensor.cc
+++ b/paddle/tcmpt/core/dense_tensor.cc
@@ -54,25 +54,11 @@ void DenseTensor::ShareAllocation(
 // TODO(chenweihang): Add other place branchs
 paddle::platform::Place DenseTensor::GetPlaceByBackend() const {
   switch (meta_.backend) {
-    case Backend::kCPU:
+    case Backend::CPU:
       return CPUPlace();
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    case Backend::kCUDA:
+    case Backend::CUDA:
       return CUDAPlace(paddle::platform::GetCurrentDeviceId());
-    case Backend::kCUDAPinned:
-      return CUDAPinnedPlace();
-#endif
-#ifdef PADDLE_WITH_XPU
-    case Backend::kXPU:
-      // TODO(chenweihang): add device id
-      return XPUPlace();
-#endif
-#ifdef PADDLE_WITH_NPU
-    case Backend::kNPU:
-      // TODO(chenweihang): add device id
-      return NPUPlace();
-    case Backend::kNPUPinned:
-      return NPUPinnedPlace();
 #endif
     default:
       PADDLE_THROW(paddle::platform::errors::Unimplemented(
diff --git a/paddle/tcmpt/core/kernel_factory.cc b/paddle/tcmpt/core/kernel_factory.cc
index a301d6a995ce7..75df74fb31ad1 100644
--- a/paddle/tcmpt/core/kernel_factory.cc
+++ b/paddle/tcmpt/core/kernel_factory.cc
@@ -51,9 +51,10 @@ const Kernel& KernelFactory::SelectKernelOrThrowError(
                         "The kernel `%s` is not registered.", kernel_name));
 
   auto kernel_iter = iter->second.find(kernel_key);
-  if (kernel_key.layout() != pt::DataLayout::kAny) {
+  // TODO(chenweihang): polish refind impl here
+  if (kernel_key.layout() != pt::DataLayout::ANY) {
     pt::KernelKey any_layout_kernel_key(
-        kernel_key.backend(), pt::DataLayout::kAny, kernel_key.dtype());
+        kernel_key.backend(), pt::DataLayout::ANY, kernel_key.dtype());
     kernel_iter = iter->second.find(any_layout_kernel_key);
   }
   PADDLE_ENFORCE_NE(
diff --git a/paddle/tcmpt/core/kernel_factory.h b/paddle/tcmpt/core/kernel_factory.h
index 6e4a3fa86dfda..e11cf2cee0c2a 100644
--- a/paddle/tcmpt/core/kernel_factory.h
+++ b/paddle/tcmpt/core/kernel_factory.h
@@ -19,9 +19,9 @@
 #include <unordered_map>
 #include <utility>
 
+#include "paddle/tcmpt/common/backend.h"
 #include "paddle/tcmpt/common/data_type.h"
 #include "paddle/tcmpt/common/layout.h"
-#include "paddle/tcmpt/core/backend.h"
 #include "paddle/tcmpt/core/kernel_def.h"
 
 // See Note [ Why still include the fluid headers? ]
@@ -155,9 +155,9 @@ class KernelKey {
   constexpr static int kDataLayoutBitLength = 4;
   constexpr static int kDataTypeBitLength = 8;
 
-  Backend backend_{Backend::kUndef};
-  DataLayout layout_{DataLayout::kUndef};
-  DataType dtype_{DataType::kUndef};
+  Backend backend_{Backend::UNDEFINED};
+  DataLayout layout_{DataLayout::UNDEFINED};
+  DataType dtype_{DataType::UNDEFINED};
 
   // Avoid calculating Hash value at runtime.
   // Note: Now the number of bits we need does not exceed 32 bits, so there is
diff --git a/paddle/tcmpt/core/kernel_registry.h b/paddle/tcmpt/core/kernel_registry.h
index caa42546ab054..2664288ebcc5b 100644
--- a/paddle/tcmpt/core/kernel_registry.h
+++ b/paddle/tcmpt/core/kernel_registry.h
@@ -26,9 +26,9 @@
 
 namespace pt {
 
-#define BACKEND(arg__) pt::Backend::k##arg__
-#define DATALAYOUT(arg__) pt::DataLayout::k##arg__
-#define DATATYPE(arg__) pt::DataType::k##arg__
+#define BACKEND(arg__) pt::Backend::arg__
+#define DATALAYOUT(arg__) pt::DataLayout::arg__
+#define DATATYPE(arg__) pt::DataType::arg__
 
 template <typename Func>
 struct KernelArgsParseFunctor;
@@ -45,8 +45,8 @@ struct KernelArgsParseFunctor<Return_ (*)(Args_...)> {
     // TODO(chenweihang): The fluid Tensor's default layout is NCHW,
     // it is not same as kernel's layout, we should fix this error on
     // fluid Tensor
-    auto default_tensor_layout = pt::DataLayout::kNCHW;
-    if (default_key.layout() != pt::DataLayout::kAny) {
+    auto default_tensor_layout = pt::DataLayout::NCHW;
+    if (default_key.layout() != pt::DataLayout::ANY) {
       default_tensor_layout = default_key.layout();
     }
     auto args_type = ParseArgType(Indices{});
@@ -106,11 +106,11 @@ struct KernelRegistrar {
                   KernelArgsParseFn args_parse_fn,
                   KernelArgsDefFn args_def_fn,
                   KernelFn kernel_fn) {
-    if (layout == DataLayout::kAny) {
-      for (DataLayout layout_iter = DataLayout::kNHWC;
-           layout_iter != DataLayout::kNumLayouts;
+    if (layout == DataLayout::ANY) {
+      for (DataLayout layout_iter = DataLayout::NHWC;
+           layout_iter != DataLayout::NUM_DATA_LAYOUTS;
            layout_iter++) {
-        for (DataType dtype = DataType::kBOOL; dtype != DataType::kNumDataTypes;
+        for (DataType dtype = DataType::BOOL; dtype != DataType::NUM_DATA_TYPES;
              dtype++) {
           ConstructKernel(kernel_name_cstr,
                           backend,
@@ -122,7 +122,7 @@ struct KernelRegistrar {
         }
       }
     } else {
-      for (DataType dtype = DataType::kBOOL; dtype != DataType::kNumDataTypes;
+      for (DataType dtype = DataType::BOOL; dtype != DataType::NUM_DATA_TYPES;
            dtype++) {
         ConstructKernel(kernel_name_cstr,
                         backend,
diff --git a/paddle/tcmpt/core/tensor_base.h b/paddle/tcmpt/core/tensor_base.h
index 240808e3cc492..a4e67d88303db 100644
--- a/paddle/tcmpt/core/tensor_base.h
+++ b/paddle/tcmpt/core/tensor_base.h
@@ -21,7 +21,7 @@ limitations under the License. */
 #include "paddle/tcmpt/core/storage.h"
 #include "paddle/tcmpt/core/utils/type_registry.h"
 
-#include "paddle/tcmpt/core/backend.h"
+#include "paddle/tcmpt/common/backend.h"
 
 namespace paddle {
 namespace tcmpt {
diff --git a/paddle/tcmpt/core/tensor_meta.h b/paddle/tcmpt/core/tensor_meta.h
index e875c73d980b7..0612e58350ab5 100644
--- a/paddle/tcmpt/core/tensor_meta.h
+++ b/paddle/tcmpt/core/tensor_meta.h
@@ -16,9 +16,9 @@ limitations under the License. */
 
 #include <vector>
 
+#include "paddle/tcmpt/common/backend.h"
 #include "paddle/tcmpt/common/data_type.h"
 #include "paddle/tcmpt/common/layout.h"
-#include "paddle/tcmpt/core/backend.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/ddim.h"
@@ -28,9 +28,6 @@ limitations under the License. */
 
 namespace pt {
 
-using DataType = paddle::experimental::DataType;
-using DataLayout = paddle::experimental::DataLayout;
-
 // template <typename T>
 // using Vector = paddle::framework::Vector<T>;
 
@@ -74,7 +71,7 @@ struct TensorMeta {
 
   TensorMeta(TensorMeta&& meta)
       : dims(meta.dims),
-        backend_set(meta.backend_set),
+        backend(meta.backend),
         type(meta.type),
         layout(meta.layout),
         numel(meta.numel),
@@ -89,7 +86,7 @@ struct TensorMeta {
              size_t offset = 0UL,
              const LoD& lod = {})
       : dims(dims),
-        backend_set(backend),
+        backend(backend),
         type(type),
         layout(layout),
         offset(offset),
@@ -104,10 +101,9 @@ struct TensorMeta {
 
   DDim dims;
 
-  BackendSet backend_set{Backend::CPU};
-
-  DataType type{DataType::kFLOAT32};
-  DataLayout layout{DataLayout::kNCHW};
+  Backend backend{Backend::CPU};
+  DataType type{DataType::FLOAT32};
+  DataLayout layout{DataLayout::NCHW};
 
   /**
    * [ Why not calculate numel based on dims? ]
diff --git a/paddle/tcmpt/core/tensor_status.h b/paddle/tcmpt/core/tensor_status.h
index 1eb56397414b5..2e934f7a667f6 100644
--- a/paddle/tcmpt/core/tensor_status.h
+++ b/paddle/tcmpt/core/tensor_status.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/tcmpt/common/backend.h"
 #include "paddle/tcmpt/common/data_type.h"
 #include "paddle/tcmpt/common/layout.h"
-#include "paddle/tcmpt/core/backend.h"
 
 namespace pt {
 
diff --git a/paddle/tcmpt/hapi/include/backend.h b/paddle/tcmpt/hapi/include/backend_set.h
similarity index 57%
rename from paddle/tcmpt/hapi/include/backend.h
rename to paddle/tcmpt/hapi/include/backend_set.h
index b86029551d1b6..39b9cab56053e 100644
--- a/paddle/tcmpt/hapi/include/backend.h
+++ b/paddle/tcmpt/hapi/include/backend_set.h
@@ -16,44 +16,12 @@ limitations under the License. */
 
 #include <ostream>
 
+// TODO(chenweihang): move this file into hapi/include when compile
+#include "paddle/tcmpt/common/backend.h"
+
 namespace paddle {
 namespace experimental {
 
-/**
- * [ Why need Backend? ]
- *
- * Backend not only means place. Backend is a superset of place.
- *
- * Place cannot indicate the difference in calculation methods on the device,
- * but in order to make the boundary of the kernel clearer and the function
- * more specific, we need to distinguish the calculation method.
- *
- * Such as the kernel for CUDA device, it can be a native CUDA kernel,
- * or a kernel implemented by CUDNN library.
- *
- * Note(chenweihang): HIP is not needed now, we can added it if needed
- * in the future
- */
-enum class Backend : uint8_t {
-  // kernel backend cannot be undefined
-  UNDEFINED = 0,
-
-  // basic kernel backend
-  CPU,
-
-  // various acceleration devices' backends
-  CUDA,
-  XPU,  // XPU currently does not exist at the same time as CUDA
-  NPU,  // NPU currently does not exist at the same time as CUDA
-
-  // the third library backend
-  MKLDNN,
-  CUDNN,
-
-  // end of backend types
-  kNumBackends,
-};
-
 /**
  * We use the backend to form a bit set to assist the runtime kernel selection,
  * and the higher backend bit has a higher priority.
@@ -75,7 +43,7 @@ class BackendSet final {
     if (b == Backend::UNDEFINED) {
       throw std::runtime_error("Backend argument can't be UNDEFINED.");
     }
-    return static_cast<bool>(bitset_ & BackendSet(b).bitset())
+    return static_cast<bool>(bitset_ & BackendSet(b).bitset());
   }
   bool IsEmpty() const { return bitset_ == 0; }
 
@@ -101,35 +69,5 @@ class BackendSet final {
   uint64_t bitset_;
 };
 
-std::ostream& operator<<(std::ostream& os, Backend backend) {
-  switch (backend) {
-    case Backend::UNDEFINED:
-      os << "Undefined";
-      break;
-    case Backend::CPU:
-      os << "CPU";
-      break;
-    case Backend::CUDA:
-      os << "CUDA";
-      break;
-    case Backend::XPU:
-      os << "XPU";
-      break;
-    case Backend::NPU:
-      os << "NPU";
-      break;
-    case Backend::MKLDNN:
-      os << "MKLDNN";
-      break;
-    case Backend::CUDNN:
-      os << "CUDNN";
-      break;
-    default:
-      // TODO(chenweihang): replace by internal enforce method later
-      throw std::runtime_error("Invalid Backend type.");
-  }
-  return os;
-}
-
 }  // namespace experimental
 }  // namespace paddle
diff --git a/paddle/tcmpt/hapi/include/creation.h b/paddle/tcmpt/hapi/include/creation.h
index d2d68e3bb7e61..a3875d99afa0a 100644
--- a/paddle/tcmpt/hapi/include/creation.h
+++ b/paddle/tcmpt/hapi/include/creation.h
@@ -23,11 +23,12 @@ namespace experimental {
 
 Tensor full_like(const Tensor& x,
                  const pt::Scalar& value,
-                 pt::DataType dtype = pt::DataType::kUndef);
+                 pt::DataType dtype = pt::DataType::UNDEFINED);
 
-Tensor ones_like(const Tensor& x, pt::DataType dtype = pt::DataType::kUndef);
+Tensor ones_like(const Tensor& x, pt::DataType dtype = pt::DataType::UNDEFINED);
 
-Tensor zeros_like(const Tensor& x, pt::DataType dtype = pt::DataType::kUndef);
+Tensor zeros_like(const Tensor& x,
+                  pt::DataType dtype = pt::DataType::UNDEFINED);
 
 }  // namespace experimental
 }  // namespace paddle
diff --git a/paddle/tcmpt/hapi/include/tensor.h b/paddle/tcmpt/hapi/include/tensor.h
index 50fcc00966a6f..95cccd2feb38e 100644
--- a/paddle/tcmpt/hapi/include/tensor.h
+++ b/paddle/tcmpt/hapi/include/tensor.h
@@ -19,7 +19,7 @@ limitations under the License. */
 #include <utility>
 
 #include "paddle/tcmpt/core/tensor_base.h"
-#include "paddle/tcmpt/core/tensor_signature.h"
+#include "paddle/tcmpt/hapi/include/tensor_signature.h"
 
 /**
  * [ Why still include the fluid headers? ]
@@ -97,6 +97,7 @@ class Tensor final {
     if (impl_.get() == nullptr) {
       throw std::runtime_error("TensorImpl with nullptr is not supported");
     }
+    signature_.reset(new TensorSignature(impl_->backend()));
   }
 
   /* Part 2: Dimension, DataType and DataLayout methods */
@@ -140,25 +141,21 @@ class Tensor final {
    * Backend judgment APIs, shield the concept of Backend.
    */
   BackendSet backend_set() const { return signature_->backend_set; }
+  void set_backend_set(const BackendSet& backend_set) {
+    if (signature_ == nullptr) {
+      signature_.reset(new TensorSignature());
+    }
+    signature_->backend_set = backend_set;
+  }
 
-  bool is_cpu() const;
-  bool is_cuda() const;
-  bool is_hip() const;
-  bool is_xpu() const;
-  bool is_npu() const;
-  bool is_mkldnn() const;
-  bool is_cudnn() const;
+  bool is_cpu() const { return signature_->backend_set.Has(Backend::CPU); }
+  bool is_cuda() const { return signature_->backend_set.Has(Backend::CUDA); }
 
   /**
    * Backend convert APIs.
    */
   Tensor cpu() const;
   Tensor cuda() const;
-  Tensor hip() const;
-  Tensor xpu() const;
-  Tensor npu() const;
-  Tensor mkldnn() const;
-  Tensor cudnn() const;
 
   /* Part 4: Data Access methods */
   /**
diff --git a/paddle/tcmpt/hapi/include/tensor_signature.h b/paddle/tcmpt/hapi/include/tensor_signature.h
index 31076758c0944..80558bd7885ca 100644
--- a/paddle/tcmpt/hapi/include/tensor_signature.h
+++ b/paddle/tcmpt/hapi/include/tensor_signature.h
@@ -16,28 +16,29 @@ limitations under the License. */
 
 #include <string>
 
-#include "paddle/tcmpt/hapi/include/backend.h"
+#include "paddle/tcmpt/hapi/include/backend_set.h"
 
 namespace paddle {
 namespace experimental {
 
 struct TensorSignature final {
- public:
+  std::string name{""};
+  BackendSet backend_set{Backend::CPU};
+
   TensorSignature() = default;
+
+  // open default methods if needed
   TensorSignature& operator=(const TensorSignature&) = delete;
   TensorSignature& operator=(TensorSignature&&) = delete;
   TensorSignature(const TensorSignature&) = delete;
   TensorSignature(TensorSignature&&) = delete;
 
-  TensorSignature(const std::string& t_name) : name(t_name) {}
-  TensorSignature(const BackendSet& t_backend_set)
+  explicit TensorSignature(const std::string& t_name) : name(t_name) {}
+  explicit TensorSignature(const Backend& t_backend) : backend_set(t_backend) {}
+  explicit TensorSignature(const BackendSet& t_backend_set)
       : backend_set(t_backend_set) {}
   TensorSignature(const std::string& t_name, const BackendSet& t_backend_set)
       : name(t_name), backend_set(t_backend_set) {}
-
- private:
-  std::string name{""};
-  BackendSet backend_set{Backend::CPU};
 };
 
 }  // namespace experimental
diff --git a/paddle/tcmpt/hapi/lib/creation.cc b/paddle/tcmpt/hapi/lib/creation.cc
index 057855a3dba4c..0566e8a68b5af 100644
--- a/paddle/tcmpt/hapi/lib/creation.cc
+++ b/paddle/tcmpt/hapi/lib/creation.cc
@@ -20,30 +20,25 @@ limitations under the License. */
 
 #include "paddle/tcmpt/api/include/core.h"
 #include "paddle/tcmpt/api/include/infershape.h"
-#include "paddle/tcmpt/hapi/lib/kernel_generate.h"
+#include "paddle/tcmpt/hapi/lib/kernel_dispatch.h"
 
 namespace paddle {
 namespace experimental {
 
 Tensor full_like(const Tensor& x, const pt::Scalar& value, pt::DataType dtype) {
   // 1. Get kernel signature and kernel
-  auto kernel_signature = ParseKernelNameAndKeyByArgs("fill_any_like", x);
-  VLOG(1) << kernel_signature.first;
-  VLOG(1) << kernel_signature.second;
-  VLOG(1) << pt::KernelFactory::Instance();
-
+  auto kernel_key_set = ParseKernelKeyByInputArgs(x);
+  auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
   auto kernel = pt::KernelFactory::Instance().SelectKernelOrThrowError(
-      kernel_signature.first, kernel_signature.second);
-  VLOG(1) << kernel;
+      "fill_any_like", kernel_key);
 
   // 2. Get Device Context
-  auto* dev_ctx = GetDeviceContextByBackend(kernel_signature.second.backend());
+  auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
   auto kernel_context = pt::KernelContext(*dev_ctx);
 
   // 3. Auto data transform
   auto dense_x = std::dynamic_pointer_cast<pt::DenseTensor>(x.impl());
   kernel_context.EmplaceBackInput(dense_x);
-
   kernel_context.EmplaceBackAttr(value);
 
   // 4. InferShape
@@ -52,13 +47,14 @@ Tensor full_like(const Tensor& x, const pt::Scalar& value, pt::DataType dtype) {
   // 5. Prepare outputs
   Tensor out;
   // InferDataType
-  if (dtype != pt::DataType::kUndef) {
+  if (dtype != pt::DataType::UNDEFINED) {
     out_meta.type = dtype;
   }
   auto dense_out =
       std::make_shared<pt::DenseTensor>(out_meta, pt::TensorStatus());
   kernel_context.EmplaceBackOutput(dense_out);
   out.set_impl(dense_out);
+  out.set_backend_set(x.backend_set());
 
   // 6. Call kernel
   kernel(&kernel_context);
diff --git a/paddle/tcmpt/hapi/lib/kernel_generate.h b/paddle/tcmpt/hapi/lib/kernel_dispatch.h
similarity index 54%
rename from paddle/tcmpt/hapi/lib/kernel_generate.h
rename to paddle/tcmpt/hapi/lib/kernel_dispatch.h
index 1b5f9d7ae02ac..7c53b573d796f 100644
--- a/paddle/tcmpt/hapi/lib/kernel_generate.h
+++ b/paddle/tcmpt/hapi/lib/kernel_dispatch.h
@@ -14,9 +14,13 @@ limitations under the License. */
 
 #pragma once
 
+#include <limits>
 #include <string>
 #include <utility>
 
+#include "paddle/tcmpt/common/data_type.h"
+#include "paddle/tcmpt/common/layout.h"
+#include "paddle/tcmpt/hapi/include/backend_set.h"
 #include "paddle/tcmpt/hapi/include/tensor.h"
 
 // TODO(chenweihang): split KernelName, Key, Kernel, Factory into diff files
@@ -34,6 +38,39 @@ using CPUContext = paddle::platform::CPUDeviceContext;
 using CUDAContext = paddle::platform::CUDADeviceContext;
 #endif
 
+namespace detail {
+std::size_t CountLeadingZeros(uint64_t val) {
+  if (val == 0) {
+    return 64;
+  }
+  std::size_t zero_bits = 0;
+  for (std::size_t shift = 64 >> 1; shift; shift >>= 1) {
+    uint64_t tmp = val >> shift;
+    if (tmp) {
+      val = tmp;
+    } else {
+      zero_bits |= shift;
+    }
+  }
+  return zero_bits;
+}
+}  // namespace detail
+
+// TODO(chenweihang): support DataLayout and DataType selected
+struct KernelKeySet {
+  BackendSet backend_set{Backend::UNDEFINED};
+  DataLayout layout{DataLayout::UNDEFINED};
+  DataType dtype{DataType::UNDEFINED};
+
+  // TODO(chenweihang): iterate all kernelkey for kernel selection
+  pt::KernelKey GetHigestPriorityKernelKey() {
+    return pt::KernelKey(static_cast<Backend>(64 - detail::CountLeadingZeros(
+                                                       backend_set.bitset())),
+                         layout,
+                         dtype);
+  }
+};
+
 namespace detail {
 
 template <typename Functor>
@@ -46,7 +83,7 @@ struct ArgsIterator {
   template <typename T, typename... Args>
   inline Functor& apply(T&& arg, Args&&... args) {
     self()(std::forward<T>(arg));
-    if (self().short_circurt()) {
+    if (self().short_circuit()) {
       return self();
     } else {
       return apply(std::forward<Args>(args)...);
@@ -59,30 +96,19 @@ struct ArgsIterator {
   inline Functor& self() { return *static_cast<Functor*>(this); }
 };
 
-struct KernelNameAndKeyParser : ArgsIterator<KernelNameAndKeyParser> {
-  std::string kernel_name;
-  pt::Backend backend;
-  pt::DataLayout layout;
-  pt::DataType dtype;
+struct KernelKeyParser : ArgsIterator<KernelKeyParser> {
+  KernelKeySet key_set;
 
-  explicit KernelNameAndKeyParser(const std::string& name)
-      : kernel_name(name) {}
-
-  // TODO(chenweihang): use bit set here
   // TODO(chenweihang): deal with multiple diff input Tensors
+  // TODO(chenweihang): add global device guard method to set backend
   void operator()(const Tensor& x) {
-    if (x.is_cpu()) {
-      backend = pt::Backend::kCPU;
-    } else if (x.is_cuda()) {
-      backend = pt::Backend::kCUDA;
-    } else {
-      throw std::runtime_error("Unsupported backend when parser args.");
-    }
-    layout = x.layout();
-    dtype = x.type();
+    key_set.backend_set = key_set.backend_set | x.backend_set();
+    // TODO(chenweihang): selecte multi layout and dtype
+    key_set.layout = x.layout();
+    key_set.dtype = x.type();
   }
 
-  // skip other type args
+  // skip other type args, these args don't used in kernel selection
   template <typename T>
   void operator()(const T& x) {
     // do nothing
@@ -91,36 +117,15 @@ struct KernelNameAndKeyParser : ArgsIterator<KernelNameAndKeyParser> {
 
 }  // namespace detail
 
-// TODO(chenweihang): Determine the Kernel name and key according to the
-// function name and the input Tensor parameters. For example, if the input
-// x holds SelectedRows, then the Kernel name should be added with the `sr`
-// suffix on the basis of the function name, or the input contains HostTensor,
-// and the `host` suffix should be added on the basis of the function name.
 template <typename... Args>
-std::pair<pt::KernelName, pt::KernelKey> ParseKernelNameAndKeyByArgs(
-    const std::string& fn_name, const Args&... args) {
-  auto parser = detail::KernelNameAndKeyParser(fn_name);
-  parser(args...);
-  // TODO(chenweihang): polish design here
-  pt::KernelName kernel_name(parser.kernel_name);
-  pt::KernelKey kernel_key(parser.backend, parser.layout, parser.dtype);
-  return std::make_pair(kernel_name, kernel_key);
+KernelKeySet ParseKernelKeyByInputArgs(const Args&... args) {
+  return detail::KernelKeyParser().apply(args...).key_set;
 }
 
 paddle::platform::DeviceContext* GetDeviceContextByBackend(
     pt::Backend backend) {
   auto& pool = paddle::platform::DeviceContextPool::Instance();
-  auto place = pt::TransToFluidPlace(backend);
-  // switch (backend) {
-  //   case Backend::kCPU:
-  //     return pool.GetByPlace(paddle::platform::CPUPlace());
-  //   case Backend::kCUDA:
-  //     return pool.GetByPlace(paddle::platform::CUDAPlace());
-  //   default:
-  //     throw std::runtime_error(
-  //       "Unsupported backend when getting device context.");
-  // }
-  return pool.Get(place);
+  return pool.Get(pt::TransToFluidPlace(backend));
 }
 
 }  // namespace experimental
diff --git a/paddle/tcmpt/hapi/lib/linalg.cc b/paddle/tcmpt/hapi/lib/linalg.cc
index dc11bae3e37b7..f045ae82bffa6 100644
--- a/paddle/tcmpt/hapi/lib/linalg.cc
+++ b/paddle/tcmpt/hapi/lib/linalg.cc
@@ -23,7 +23,7 @@ limitations under the License. */
 #include "paddle/tcmpt/core/convert_utils.h"
 #include "paddle/tcmpt/core/dense_tensor.h"
 #include "paddle/tcmpt/core/kernel_context.h"
-#include "paddle/tcmpt/hapi/lib/kernel_generate.h"
+#include "paddle/tcmpt/hapi/lib/kernel_dispatch.h"
 #include "paddle/tcmpt/infershape/binary.h"
 
 namespace paddle {
@@ -31,17 +31,13 @@ namespace experimental {
 
 Tensor dot(const Tensor& x, const Tensor& y) {
   // 1. Get kernel signature and kernel
-  auto kernel_signature = ParseKernelNameAndKeyByArgs("dot", x);
-  VLOG(1) << kernel_signature.first;
-  VLOG(1) << kernel_signature.second;
-  VLOG(1) << pt::KernelFactory::Instance();
-
-  auto kernel = pt::KernelFactory::Instance().SelectKernelOrThrowError(
-      kernel_signature.first, kernel_signature.second);
-  VLOG(1) << kernel;
+  auto kernel_key_set = ParseKernelKeyByInputArgs(x);
+  auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
+  auto kernel =
+      pt::KernelFactory::Instance().SelectKernelOrThrowError("dot", kernel_key);
 
   // 2. Get Device Context
-  auto* dev_ctx = GetDeviceContextByBackend(kernel_signature.second.backend());
+  auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
   auto kernel_context = pt::KernelContext(*dev_ctx);
 
   // 3. Auto data transform
@@ -52,16 +48,15 @@ Tensor dot(const Tensor& x, const Tensor& y) {
   // TODO(chenweihang): add transform impl
 
   // 4. InferShape
-  // TODO(chenweihang): how to auto selected infershape?
   auto out_meta = DotInferShape(dense_x->meta(), dense_y->meta());
 
   // 5. Prepare outputs
   Tensor out;
-  // TODO(chenweihang): deal with multiple outputs
   auto dense_out =
       std::make_shared<pt::DenseTensor>(out_meta, pt::TensorStatus());
   kernel_context.EmplaceBackOutput(dense_out);
   out.set_impl(dense_out);
+  out.set_backend_set(x.backend_set());
 
   // 6. Call kernel
   kernel(&kernel_context);
diff --git a/paddle/tcmpt/hapi/lib/manipulation.cc b/paddle/tcmpt/hapi/lib/manipulation.cc
index c8448eecfe2de..fd4f51c991354 100644
--- a/paddle/tcmpt/hapi/lib/manipulation.cc
+++ b/paddle/tcmpt/hapi/lib/manipulation.cc
@@ -18,7 +18,7 @@ limitations under the License. */
 
 #include "glog/logging.h"
 #include "paddle/tcmpt/api/include/core.h"
-#include "paddle/tcmpt/hapi/lib/kernel_generate.h"
+#include "paddle/tcmpt/hapi/lib/kernel_dispatch.h"
 #include "paddle/tcmpt/infershape/unary.h"
 
 namespace paddle {
@@ -26,18 +26,13 @@ namespace experimental {
 
 Tensor flatten(const Tensor& x, int start_axis, int stop_axis) {
   // 1. Get kernel signature and kernel
-  auto kernel_signature =
-      ParseKernelNameAndKeyByArgs("flatten_contiguous_range", x);
-  VLOG(1) << kernel_signature.first;
-  VLOG(1) << kernel_signature.second;
-  VLOG(1) << pt::KernelFactory::Instance();
-
+  auto kernel_key_set = ParseKernelKeyByInputArgs(x);
+  auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
   auto kernel = pt::KernelFactory::Instance().SelectKernelOrThrowError(
-      kernel_signature.first, kernel_signature.second);
-  VLOG(1) << kernel;
+      "flatten_contiguous_range", kernel_key);
 
   // 2. Get Device Context
-  auto* dev_ctx = GetDeviceContextByBackend(kernel_signature.second.backend());
+  auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
   auto kernel_context = pt::KernelContext(*dev_ctx);
 
   // 3. Auto data transform
@@ -47,16 +42,15 @@ Tensor flatten(const Tensor& x, int start_axis, int stop_axis) {
   kernel_context.EmplaceBackAttr(stop_axis);
 
   // 4. InferShape
-  // TODO(chenweihang): how to auto selected infershape?
   auto out_meta = FlattenInferShape(dense_x->meta(), start_axis, stop_axis);
 
   // 5. Prepare outputs
   Tensor out;
-  // TODO(chenweihang): deal with multiple outputs
   auto dense_out =
       std::make_shared<pt::DenseTensor>(out_meta, pt::TensorStatus());
   kernel_context.EmplaceBackOutput(dense_out);
   out.set_impl(dense_out);
+  out.set_backend_set(x.backend_set());
 
   // 6. Call kernel
   kernel(&kernel_context);
diff --git a/paddle/tcmpt/hapi/lib/math.cc b/paddle/tcmpt/hapi/lib/math.cc
index 531e85298758c..b21a06581e82a 100644
--- a/paddle/tcmpt/hapi/lib/math.cc
+++ b/paddle/tcmpt/hapi/lib/math.cc
@@ -20,7 +20,7 @@ limitations under the License. */
 
 #include "paddle/tcmpt/api/include/core.h"
 #include "paddle/tcmpt/api/include/infershape.h"
-#include "paddle/tcmpt/hapi/lib/kernel_generate.h"
+#include "paddle/tcmpt/hapi/lib/kernel_dispatch.h"
 #include "paddle/tcmpt/infershape/unary.h"
 
 namespace paddle {
@@ -28,38 +28,31 @@ namespace experimental {
 
 Tensor mean(const Tensor& x) {
   // 1. Get kernel signature and kernel
-  auto kernel_signature = ParseKernelNameAndKeyByArgs("mean", x);
-  VLOG(1) << kernel_signature.first;
-  VLOG(1) << kernel_signature.second;
-  VLOG(1) << pt::KernelFactory::Instance();
-
+  auto kernel_key_set = ParseKernelKeyByInputArgs(x);
+  auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
   auto kernel = pt::KernelFactory::Instance().SelectKernelOrThrowError(
-      kernel_signature.first, kernel_signature.second);
-  VLOG(1) << kernel;
+      "mean", kernel_key);
 
   // 2. Get Device Context
-  auto* dev_ctx = GetDeviceContextByBackend(kernel_signature.second.backend());
+  auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
   auto kernel_context = pt::KernelContext(*dev_ctx);
 
   // 3. Auto data transform
   auto dense_x = std::dynamic_pointer_cast<pt::DenseTensor>(x.impl());
   kernel_context.EmplaceBackInput(dense_x);
-  // TODO(chenweihang): add transform impl
 
   // 4. InferShape
-  // TODO(chenweihang): how to auto selected infershape?
   auto out_meta = ReductionInferShape(dense_x->meta());
 
   // 5. Prepare outputs
   Tensor out;
-  // TODO(chenweihang): deal with multiple outputs
   auto dense_out =
       std::make_shared<pt::DenseTensor>(out_meta, pt::TensorStatus());
   kernel_context.EmplaceBackOutput(dense_out);
   out.set_impl(dense_out);
+  out.set_backend_set(x.backend_set());
 
   // 6. Call kernel
-  // TODO(chenweihang): finally, we may call the function directly,
   kernel(&kernel_context);
 
   return out;
diff --git a/paddle/tcmpt/kernels/cpu/creation.cc b/paddle/tcmpt/kernels/cpu/creation.cc
index 37b589d776822..134badd40f985 100644
--- a/paddle/tcmpt/kernels/cpu/creation.cc
+++ b/paddle/tcmpt/kernels/cpu/creation.cc
@@ -33,7 +33,7 @@ PT_REGISTER_MODULE(CreationCPU);
 
 PT_REGISTER_KERNEL("fill_any_like",
                    CPU,
-                   Any,
+                   ANY,
                    pt::FillAnyLike,
                    float,
                    double,
diff --git a/paddle/tcmpt/kernels/cpu/linalg.cc b/paddle/tcmpt/kernels/cpu/linalg.cc
index 821cd5c092e85..93f0ef4303862 100644
--- a/paddle/tcmpt/kernels/cpu/linalg.cc
+++ b/paddle/tcmpt/kernels/cpu/linalg.cc
@@ -62,7 +62,7 @@ using complex128 = ::paddle::platform::complex<double>;
 
 PT_REGISTER_KERNEL("dot",
                    CPU,
-                   Any,
+                   ANY,
                    pt::Dot,
                    float,
                    double,
diff --git a/paddle/tcmpt/kernels/cpu/manipulation.cc b/paddle/tcmpt/kernels/cpu/manipulation.cc
index edf7f5aff0389..3ddae94e47cd1 100644
--- a/paddle/tcmpt/kernels/cpu/manipulation.cc
+++ b/paddle/tcmpt/kernels/cpu/manipulation.cc
@@ -60,7 +60,7 @@ PT_REGISTER_MODULE(ManipulationCPU);
 // architecture, kernel_name should be "flatten".
 PT_REGISTER_KERNEL("flatten_contiguous_range",
                    CPU,
-                   Any,
+                   ANY,
                    pt::Flatten,
                    float,
                    double,
@@ -71,7 +71,7 @@ PT_REGISTER_KERNEL("flatten_contiguous_range",
 
 PT_REGISTER_KERNEL("flatten_contiguous_range.mid",
                    CPU,
-                   Any,
+                   ANY,
                    pt::FlattenWithXShape,
                    float,
                    double,
diff --git a/paddle/tcmpt/kernels/cpu/math.cc b/paddle/tcmpt/kernels/cpu/math.cc
index 4fa14141209a1..afb3ab7d6e63d 100644
--- a/paddle/tcmpt/kernels/cpu/math.cc
+++ b/paddle/tcmpt/kernels/cpu/math.cc
@@ -69,11 +69,11 @@ PT_REGISTER_MODULE(MathCPU);
 // NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16
 // using bfloat16 = ::paddle::platform::bfloat16;
 
-PT_REGISTER_KERNEL("sign", CPU, Any, pt::Sign, float, double) {}
-PT_REGISTER_KERNEL("mean", CPU, Any, pt::Mean, float, double) {}
+PT_REGISTER_KERNEL("sign", CPU, ANY, pt::Sign, float, double) {}
+PT_REGISTER_KERNEL("mean", CPU, ANY, pt::Mean, float, double) {}
 PT_REGISTER_KERNEL("scale",
                    CPU,
-                   Any,
+                   ANY,
                    pt::Scale,
                    float,
                    double,
@@ -85,7 +85,7 @@ PT_REGISTER_KERNEL("scale",
                    int64_t) {}
 PT_REGISTER_KERNEL("scale.host",
                    CPU,
-                   Any,
+                   ANY,
                    pt::ScaleHost,
                    float,
                    double,
@@ -95,5 +95,5 @@ PT_REGISTER_KERNEL("scale.host",
                    int16_t,
                    int,
                    int64_t) {
-  kernel->InputAt(1).SetBackend(pt::Backend::kCPU);
+  kernel->InputAt(1).SetBackend(pt::Backend::CPU);
 }
diff --git a/paddle/tcmpt/kernels/cpu/utils.cc b/paddle/tcmpt/kernels/cpu/utils.cc
index a50cfad481693..02b0b5a752708 100644
--- a/paddle/tcmpt/kernels/cpu/utils.cc
+++ b/paddle/tcmpt/kernels/cpu/utils.cc
@@ -55,4 +55,4 @@ void Copy(const CPUContext& dev_ctx, const DenseTensor& src, DenseTensor* dst) {
 // TODO(chenweihang): replace by better impl
 PT_REGISTER_MODULE(UtilsCPU);
 
-PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", CPU, Any, pt::Copy) {}
+PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", CPU, ANY, pt::Copy) {}
diff --git a/paddle/tcmpt/kernels/cuda/creation.cu b/paddle/tcmpt/kernels/cuda/creation.cu
index 54afec95735df..9791dbcc3d2cc 100644
--- a/paddle/tcmpt/kernels/cuda/creation.cu
+++ b/paddle/tcmpt/kernels/cuda/creation.cu
@@ -33,7 +33,7 @@ PT_REGISTER_MODULE(CreationCUDA);
 
 PT_REGISTER_KERNEL("fill_any_like",
                    CUDA,
-                   Any,
+                   ANY,
                    pt::FillAnyLike,
                    float,
                    double,
diff --git a/paddle/tcmpt/kernels/cuda/linalg.cu b/paddle/tcmpt/kernels/cuda/linalg.cu
index 77001d988038d..a1df291db1967 100644
--- a/paddle/tcmpt/kernels/cuda/linalg.cu
+++ b/paddle/tcmpt/kernels/cuda/linalg.cu
@@ -39,7 +39,7 @@ using complex128 = ::paddle::platform::complex<double>;
 
 PT_REGISTER_KERNEL("dot",
                    CUDA,
-                   Any,
+                   ANY,
                    pt::Dot,
                    float,
                    double,
diff --git a/paddle/tcmpt/kernels/cuda/manipulation.cu b/paddle/tcmpt/kernels/cuda/manipulation.cu
index 99ee2506fdf41..d4b6d2d872a96 100644
--- a/paddle/tcmpt/kernels/cuda/manipulation.cu
+++ b/paddle/tcmpt/kernels/cuda/manipulation.cu
@@ -61,7 +61,7 @@ using float16 = paddle::platform::float16;
 // architecture, kernel_name should be "flatten".
 PT_REGISTER_KERNEL("flatten_contiguous_range",
                    CUDA,
-                   Any,
+                   ANY,
                    pt::Flatten,
                    float,
                    float16,
@@ -73,7 +73,7 @@ PT_REGISTER_KERNEL("flatten_contiguous_range",
 
 PT_REGISTER_KERNEL("flatten_contiguous_range.mid",
                    CUDA,
-                   Any,
+                   ANY,
                    pt::FlattenWithXShape,
                    float,
                    double,
diff --git a/paddle/tcmpt/kernels/cuda/math.cu b/paddle/tcmpt/kernels/cuda/math.cu
index 113971126a71f..2bc8501f46822 100644
--- a/paddle/tcmpt/kernels/cuda/math.cu
+++ b/paddle/tcmpt/kernels/cuda/math.cu
@@ -121,11 +121,11 @@ void ScaleHost(const CUDAContext& dev_ctx,
 PT_REGISTER_MODULE(MathCUDA);
 
 using float16 = paddle::platform::float16;
-PT_REGISTER_KERNEL("sign", CUDA, Any, pt::Sign, float, double, float16) {}
-PT_REGISTER_KERNEL("mean", CUDA, Any, pt::Mean, float, double, float16) {}
+PT_REGISTER_KERNEL("sign", CUDA, ANY, pt::Sign, float, double, float16) {}
+PT_REGISTER_KERNEL("mean", CUDA, ANY, pt::Mean, float, double, float16) {}
 PT_REGISTER_KERNEL("scale",
                    CUDA,
-                   Any,
+                   ANY,
                    pt::Scale,
                    float,
                    double,
@@ -137,7 +137,7 @@ PT_REGISTER_KERNEL("scale",
                    int64_t) {}
 PT_REGISTER_KERNEL("scale.host",
                    CUDA,
-                   Any,
+                   ANY,
                    pt::ScaleHost,
                    float,
                    double,
@@ -147,5 +147,5 @@ PT_REGISTER_KERNEL("scale.host",
                    int16_t,
                    int,
                    int64_t) {
-  kernel->InputAt(1).SetBackend(pt::Backend::kCPU);
+  kernel->InputAt(1).SetBackend(pt::Backend::CPU);
 }
diff --git a/paddle/tcmpt/kernels/cuda/utils.cu b/paddle/tcmpt/kernels/cuda/utils.cu
index 00b32e2fbb10a..a90df3f14640b 100644
--- a/paddle/tcmpt/kernels/cuda/utils.cu
+++ b/paddle/tcmpt/kernels/cuda/utils.cu
@@ -220,4 +220,4 @@ void Copy(const CUDAContext& dev_ctx,
 // TODO(chenweihang): replace by better impl
 PT_REGISTER_MODULE(UtilsCUDA);
 
-PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", CUDA, Any, pt::Copy) {}
+PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", CUDA, ANY, pt::Copy) {}
diff --git a/paddle/tcmpt/tests/backend_test.cc b/paddle/tcmpt/tests/backend_test.cc
index 026e94ec4d0e7..af102d8e7388c 100644
--- a/paddle/tcmpt/tests/backend_test.cc
+++ b/paddle/tcmpt/tests/backend_test.cc
@@ -12,6 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/tcmpt/core/backend.h"
+#include "paddle/tcmpt/common/backend.h"
 
 #include <gtest/gtest.h>
diff --git a/paddle/tcmpt/tests/dense_tensor_test.cc b/paddle/tcmpt/tests/dense_tensor_test.cc
index 138ef1e30e76e..7117fdba6dc2a 100644
--- a/paddle/tcmpt/tests/dense_tensor_test.cc
+++ b/paddle/tcmpt/tests/dense_tensor_test.cc
@@ -21,15 +21,15 @@ using DDim = paddle::framework::DDim;
 
 TEST(DenseTensor, Constructor) {
   pt::DenseTensor tensor(pt::TensorMeta(framework::make_ddim({5, 10}),
-                                        pt::Backend::kCPU,
-                                        pt::DataType::kFLOAT32,
-                                        pt::DataLayout::kNCHW,
+                                        pt::Backend::CPU,
+                                        pt::DataType::FLOAT32,
+                                        pt::DataLayout::NCHW,
                                         0UL),
                          pt::TensorStatus());
   ASSERT_EQ(tensor.dims().size(), 2);
-  ASSERT_EQ(tensor.backend(), pt::Backend::kCPU);
-  ASSERT_EQ(tensor.data_type(), pt::DataType::kFLOAT32);
-  ASSERT_EQ(tensor.layout(), pt::DataLayout::kNCHW);
+  ASSERT_EQ(tensor.backend(), pt::Backend::CPU);
+  ASSERT_EQ(tensor.data_type(), pt::DataType::FLOAT32);
+  ASSERT_EQ(tensor.layout(), pt::DataLayout::NCHW);
 }
 
 TEST(DenseTensor, Dims) {
diff --git a/paddle/tcmpt/tests/kernel_factory_test.cc b/paddle/tcmpt/tests/kernel_factory_test.cc
index 66ce7cd9892ef..71634484a4e58 100644
--- a/paddle/tcmpt/tests/kernel_factory_test.cc
+++ b/paddle/tcmpt/tests/kernel_factory_test.cc
@@ -18,6 +18,6 @@ limitations under the License. */
 
 TEST(KernelFactory, KernelKey) {
   pt::KernelKey key(
-      pt::Backend::kCPU, pt::DataLayout::kNCHW, pt::DataType::kFLOAT32);
+      pt::Backend::CPU, pt::DataLayout::NCHW, pt::DataType::FLOAT32);
   std::cout << key;
 }
diff --git a/paddle/tcmpt/tests/test_copy_api.cc b/paddle/tcmpt/tests/test_copy_api.cc
index 2d70e37d051d9..4345b8dc31863 100644
--- a/paddle/tcmpt/tests/test_copy_api.cc
+++ b/paddle/tcmpt/tests/test_copy_api.cc
@@ -32,17 +32,17 @@ TEST(API, copy) {
   // 1. create tensor
   auto dense_src = std::make_shared<pt::DenseTensor>(
       pt::TensorMeta(framework::make_ddim({2, 3}),
-                     pt::Backend::kCPU,
-                     pt::DataType::kFLOAT32,
-                     pt::DataLayout::kNCHW),
+                     pt::Backend::CPU,
+                     pt::DataType::FLOAT32,
+                     pt::DataLayout::NCHW),
       pt::TensorStatus());
   auto* dense_x_data = dense_src->mutable_data<float>();
 
   auto dense_dst = std::make_shared<pt::DenseTensor>(
       pt::TensorMeta(framework::make_ddim({2, 3}),
-                     pt::Backend::kCPU,
-                     pt::DataType::kFLOAT32,
-                     pt::DataLayout::kNCHW),
+                     pt::Backend::CPU,
+                     pt::DataType::FLOAT32,
+                     pt::DataLayout::NCHW),
       pt::TensorStatus());
 
   for (size_t i = 0; i < 2; ++i) {
diff --git a/paddle/tcmpt/tests/test_dot_api.cc b/paddle/tcmpt/tests/test_dot_api.cc
index 8fdae5050e239..c3bea2570730e 100644
--- a/paddle/tcmpt/tests/test_dot_api.cc
+++ b/paddle/tcmpt/tests/test_dot_api.cc
@@ -33,17 +33,17 @@ TEST(API, dot) {
   // 1. create tensor
   auto dense_x = std::make_shared<pt::DenseTensor>(
       pt::TensorMeta(framework::make_ddim({3, 10}),
-                     pt::Backend::kCPU,
-                     pt::DataType::kFLOAT32,
-                     pt::DataLayout::kNCHW),
+                     pt::Backend::CPU,
+                     pt::DataType::FLOAT32,
+                     pt::DataLayout::NCHW),
       pt::TensorStatus());
   auto* dense_x_data = dense_x->mutable_data<float>();
 
   auto dense_y = std::make_shared<pt::DenseTensor>(
       pt::TensorMeta(framework::make_ddim({3, 10}),
-                     pt::Backend::kCPU,
-                     pt::DataType::kFLOAT32,
-                     pt::DataLayout::kNCHW),
+                     pt::Backend::CPU,
+                     pt::DataType::FLOAT32,
+                     pt::DataLayout::NCHW),
       pt::TensorStatus());
   auto* dense_y_data = dense_y->mutable_data<float>();
 
@@ -67,8 +67,8 @@ TEST(API, dot) {
   ASSERT_EQ(out.shape()[0], 3);
   ASSERT_EQ(out.numel(), 3);
   ASSERT_EQ(out.is_cpu(), true);
-  ASSERT_EQ(out.type(), pt::DataType::kFLOAT32);
-  ASSERT_EQ(out.layout(), pt::DataLayout::kNCHW);
+  ASSERT_EQ(out.type(), pt::DataType::FLOAT32);
+  ASSERT_EQ(out.layout(), pt::DataLayout::NCHW);
   ASSERT_EQ(out.initialized(), true);
 
   auto expect_result = sum;
diff --git a/paddle/tcmpt/tests/test_fill_api.cc b/paddle/tcmpt/tests/test_fill_api.cc
index 0ed7248604654..653cf6b7ceb3f 100644
--- a/paddle/tcmpt/tests/test_fill_api.cc
+++ b/paddle/tcmpt/tests/test_fill_api.cc
@@ -33,9 +33,9 @@ TEST(API, full_like) {
   // 1. create tensor
   auto dense_x = std::make_shared<pt::DenseTensor>(
       pt::TensorMeta(framework::make_ddim({3, 2}),
-                     pt::Backend::kCPU,
-                     pt::DataType::kFLOAT32,
-                     pt::DataLayout::kNCHW),
+                     pt::Backend::CPU,
+                     pt::DataType::FLOAT32,
+                     pt::DataLayout::NCHW),
       pt::TensorStatus());
   auto* dense_x_data = dense_x->mutable_data<float>();
   dense_x_data[0] = 0;
@@ -45,15 +45,15 @@ TEST(API, full_like) {
   paddle::experimental::Tensor x(dense_x);
 
   // 2. test API
-  auto out = paddle::experimental::full_like(x, val, pt::DataType::kFLOAT32);
+  auto out = paddle::experimental::full_like(x, val, pt::DataType::FLOAT32);
 
   // 3. check result
   ASSERT_EQ(out.shape().size(), 2);
   ASSERT_EQ(out.shape()[0], 3);
   ASSERT_EQ(out.numel(), 6);
   ASSERT_EQ(out.is_cpu(), true);
-  ASSERT_EQ(out.type(), pt::DataType::kFLOAT32);
-  ASSERT_EQ(out.layout(), pt::DataLayout::kNCHW);
+  ASSERT_EQ(out.type(), pt::DataType::FLOAT32);
+  ASSERT_EQ(out.layout(), pt::DataLayout::NCHW);
   ASSERT_EQ(out.initialized(), true);
 
   auto dense_out = std::dynamic_pointer_cast<pt::DenseTensor>(out.impl());
@@ -67,9 +67,9 @@ TEST(API, zeros_like) {
   // 1. create tensor
   auto dense_x = std::make_shared<pt::DenseTensor>(
       pt::TensorMeta(framework::make_ddim({3, 2}),
-                     pt::Backend::kCPU,
-                     pt::DataType::kFLOAT32,
-                     pt::DataLayout::kNCHW),
+                     pt::Backend::CPU,
+                     pt::DataType::FLOAT32,
+                     pt::DataLayout::NCHW),
       pt::TensorStatus());
   auto* dense_x_data = dense_x->mutable_data<float>();
   dense_x_data[0] = 1;
@@ -77,15 +77,15 @@ TEST(API, zeros_like) {
   paddle::experimental::Tensor x(dense_x);
 
   // 2. test API
-  auto out = paddle::experimental::zeros_like(x, pt::DataType::kFLOAT32);
+  auto out = paddle::experimental::zeros_like(x, pt::DataType::FLOAT32);
 
   // 3. check result
   ASSERT_EQ(out.shape().size(), 2);
   ASSERT_EQ(out.shape()[0], 3);
   ASSERT_EQ(out.numel(), 6);
   ASSERT_EQ(out.is_cpu(), true);
-  ASSERT_EQ(out.type(), pt::DataType::kFLOAT32);
-  ASSERT_EQ(out.layout(), pt::DataLayout::kNCHW);
+  ASSERT_EQ(out.type(), pt::DataType::FLOAT32);
+  ASSERT_EQ(out.layout(), pt::DataLayout::NCHW);
   ASSERT_EQ(out.initialized(), true);
 
   auto dense_out = std::dynamic_pointer_cast<pt::DenseTensor>(out.impl());
@@ -99,9 +99,9 @@ TEST(API, ones_like) {
   // 1. create tensor
   auto dense_x = std::make_shared<pt::DenseTensor>(
       pt::TensorMeta(framework::make_ddim({3, 2}),
-                     pt::Backend::kCPU,
-                     pt::DataType::kFLOAT32,
-                     pt::DataLayout::kNCHW),
+                     pt::Backend::CPU,
+                     pt::DataType::FLOAT32,
+                     pt::DataLayout::NCHW),
       pt::TensorStatus());
   auto* dense_x_data = dense_x->mutable_data<float>();
   dense_x_data[0] = 0;
@@ -109,15 +109,15 @@ TEST(API, ones_like) {
   paddle::experimental::Tensor x(dense_x);
 
   // 2. test API
-  auto out = paddle::experimental::ones_like(x, pt::DataType::kINT32);
+  auto out = paddle::experimental::ones_like(x, pt::DataType::INT32);
 
   // 3. check result
   ASSERT_EQ(out.shape().size(), 2);
   ASSERT_EQ(out.shape()[0], 3);
   ASSERT_EQ(out.numel(), 6);
   ASSERT_EQ(out.is_cpu(), true);
-  ASSERT_EQ(out.type(), pt::DataType::kINT32);
-  ASSERT_EQ(out.layout(), pt::DataLayout::kNCHW);
+  ASSERT_EQ(out.type(), pt::DataType::INT32);
+  ASSERT_EQ(out.layout(), pt::DataLayout::NCHW);
   ASSERT_EQ(out.initialized(), true);
 
   auto dense_out = std::dynamic_pointer_cast<pt::DenseTensor>(out.impl());
diff --git a/paddle/tcmpt/tests/test_flatten_api.cc b/paddle/tcmpt/tests/test_flatten_api.cc
index d2e3ee4278e1d..061c43ae6cb4d 100644
--- a/paddle/tcmpt/tests/test_flatten_api.cc
+++ b/paddle/tcmpt/tests/test_flatten_api.cc
@@ -33,9 +33,9 @@ TEST(API, flatten) {
   // 1. create tensor
   auto dense_x = std::make_shared<pt::DenseTensor>(
       pt::TensorMeta(framework::make_ddim({3, 2, 2, 3}),
-                     pt::Backend::kCPU,
-                     pt::DataType::kFLOAT32,
-                     pt::DataLayout::kNCHW),
+                     pt::Backend::CPU,
+                     pt::DataType::FLOAT32,
+                     pt::DataLayout::NCHW),
       pt::TensorStatus());
   auto* dense_x_data = dense_x->mutable_data<float>();
 
@@ -55,8 +55,8 @@ TEST(API, flatten) {
   ASSERT_EQ(out.shape()[2], expect_shape[2]);
   ASSERT_EQ(out.numel(), 36);
   ASSERT_EQ(out.is_cpu(), true);
-  ASSERT_EQ(out.type(), pt::DataType::kFLOAT32);
-  ASSERT_EQ(out.layout(), pt::DataLayout::kNCHW);
+  ASSERT_EQ(out.type(), pt::DataType::FLOAT32);
+  ASSERT_EQ(out.layout(), pt::DataLayout::NCHW);
   ASSERT_EQ(out.initialized(), true);
   bool value_equal = true;
   auto dense_out = std::dynamic_pointer_cast<pt::DenseTensor>(out.impl());
diff --git a/paddle/tcmpt/tests/test_mean_api.cc b/paddle/tcmpt/tests/test_mean_api.cc
index 518a98738961c..91e847c5ffeed 100644
--- a/paddle/tcmpt/tests/test_mean_api.cc
+++ b/paddle/tcmpt/tests/test_mean_api.cc
@@ -33,9 +33,9 @@ TEST(API, mean) {
   // 1. create tensor
   auto dense_x = std::make_shared<pt::DenseTensor>(
       pt::TensorMeta(framework::make_ddim({3, 4}),
-                     pt::Backend::kCPU,
-                     pt::DataType::kFLOAT32,
-                     pt::DataLayout::kNCHW),
+                     pt::Backend::CPU,
+                     pt::DataType::FLOAT32,
+                     pt::DataLayout::NCHW),
       pt::TensorStatus());
   auto* dense_x_data = dense_x->mutable_data<float>();
 
@@ -55,8 +55,8 @@ TEST(API, mean) {
   ASSERT_EQ(out.shape()[0], 1);
   ASSERT_EQ(out.numel(), 1);
   ASSERT_EQ(out.is_cpu(), true);
-  ASSERT_EQ(out.type(), pt::DataType::kFLOAT32);
-  ASSERT_EQ(out.layout(), pt::DataLayout::kNCHW);
+  ASSERT_EQ(out.type(), pt::DataType::FLOAT32);
+  ASSERT_EQ(out.layout(), pt::DataLayout::NCHW);
   ASSERT_EQ(out.initialized(), true);
 
   auto expect_result = sum / 12;

From ce210b4ba42520461b5036bd5a5e1bc5daa2b945 Mon Sep 17 00:00:00 2001
From: chentianyu03 <ctychentianyu@gmail.com>
Date: Wed, 20 Oct 2021 16:33:32 +0800
Subject: [PATCH 095/125] remove kernel_instantiate (#26)

---
 cmake/pten.cmake                        | 54 -------------------------
 paddle/pten/CMakeLists.txt              |  1 -
 paddle/pten/kernels/cpu/CMakeLists.txt  |  7 ----
 paddle/pten/kernels/cuda/CMakeLists.txt |  7 ----
 4 files changed, 69 deletions(-)
 delete mode 100644 cmake/pten.cmake

diff --git a/cmake/pten.cmake b/cmake/pten.cmake
deleted file mode 100644
index bfe75475edcc0..0000000000000
--- a/cmake/pten.cmake
+++ /dev/null
@@ -1,54 +0,0 @@
-# `kernel_instantiate` functionis used to declare the template instantiation of
-# the Kernel function generated through code analysis, only for windows
-# (because the windows platform msvc compiler cannot automatically instantiate
-# the template function through decltype)
-# TODO(chenweihang): keep message comment for debuging, it is still useful,
-# I will remove it if needless later
-
-function(kernel_instantiate TARGET)
-    set(target_file ${CURRENT_BINARY_DIR}/${TARGET}.tmp CACHE INTERNAL "${CURRENT_BINARY_DIR}/${TARGET} file")
-    set(target_file_final ${CURRENT_BINARY_DIR}/${TARGET})
-    file(READ ${TARGET} TARGET_CONTENT)
-    file(WRITE ${target_file} ${TARGET_CONTENT})
-    string(REGEX MATCHALL "void [A-Z][A-Za-z0-9_]+\\(.[^\\)]+\\)" func_signatures ${TARGET_CONTENT})
-    # message(STATUS "FUNCS: ${func_signatures}")
-    string(REGEX MATCHALL "PT_REGISTER_KERNEL\\(.[^\\)]+\\) \\{" func_registrars ${TARGET_CONTENT})
-    # message(STATUS "REGISTRARS: ${func_registrars}")
-    set(instantiate_context "")
-    foreach(signature ${func_signatures})
-        # message(STATUS "FUNC: ${signature}")
-        list(POP_FRONT func_registrars registrar)
-        # message(STATUS "REG: ${registrar}")
-        string(REGEX MATCHALL "[a-z0-9_:]+(,|\\))" dtypes ${registrar})
-        # message(STATUS "DTYPES: ${dtypes}")
-        list(REMOVE_AT dtypes 0)
-        # message(STATUS "REMOVED DTYPES: ${dtypes}")
-        foreach(dtype ${dtypes})
-            string(REGEX REPLACE ",|\\)" "" dtype ${dtype})
-            # message(STATUS "DTYPE: ${dtype}")
-            string(REGEX MATCH "[A-Z][A-Za-z0-9]+\\(" func_name ${signature})
-            string(REPLACE "(" "" func_name ${func_name})
-            # message(STATUS "FUNC NAME: ${func_name}")
-            string(REGEX REPLACE "${func_name}" "pten::${func_name}<${dtype}>" inst_signature ${signature})
-            # append namespace
-            string(REPLACE "CPUContext" "pten::CPUContext" inst_signature ${inst_signature})
-            string(REPLACE "CUDAContext" "pten::CUDAContext" inst_signature ${inst_signature})
-            string(REPLACE "DenseTensor" "pten::DenseTensor" inst_signature ${inst_signature})
-            # TODO(chenweihang): adapt SelectedRows after adding it
-            # string(REPLACE "SelectedRowsTensor" "pten::SelectedRowsTensor" inst_signature ${inst_signature})
-            # message(STATUS "INST FUNC: ${inst_signature}")
-            string(APPEND instantiate_context "template ${inst_signature};\n")
-        endforeach()
-    endforeach()
-    # message(STATUS "INST CONTENT: ${instantiate_context}")
-    file(APPEND ${target_file} "${instantiate_context}\n")
-    string(REPLACE "." "_" cmd_name ${TARGET})
-    # this is a dummy target for custom command, should always be run firstly to update ${target_file_final}
-    # TODO(chenweihang): nameing rule need to enchance
-    add_custom_target(copy_${cmd_name}_command ALL
-        COMMAND ${CMAKE_COMMAND} -E copy_if_different ${target_file} ${target_file_final}
-        COMMENT "copy_if_different ${target_file_final}"
-        VERBATIM
-    )
-    add_dependencies(extern_glog copy_${cmd_name}_command)
-endfunction()
\ No newline at end of file
diff --git a/paddle/pten/CMakeLists.txt b/paddle/pten/CMakeLists.txt
index 3bf1e6759b35a..4fc1c7f18e54f 100644
--- a/paddle/pten/CMakeLists.txt
+++ b/paddle/pten/CMakeLists.txt
@@ -1,4 +1,3 @@
-include(pten)
 # pten api
 add_subdirectory(api)
 # pten high level api
diff --git a/paddle/pten/kernels/cpu/CMakeLists.txt b/paddle/pten/kernels/cpu/CMakeLists.txt
index 9536f7e7d50f5..ad18a2f555265 100644
--- a/paddle/pten/kernels/cpu/CMakeLists.txt
+++ b/paddle/pten/kernels/cpu/CMakeLists.txt
@@ -1,10 +1,3 @@
-if(WIN32)
-    set(CURRENT_BINARY_DIR ${PADDLE_BINARY_DIR}/paddle/pten/kernels/cpu)
-    kernel_instantiate(creation.cc)
-    kernel_instantiate(math.cc)
-    kernel_instantiate(linalg.cc)
-endif()
-
 cc_library(math_cpu SRCS math.cc DEPS dense_tensor kernel_context kernel_factory eigen_function)
 cc_library(linalg_cpu SRCS linalg.cc DEPS dense_tensor kernel_context kernel_factory)
 cc_library(creation_cpu SRCS creation.cc DEPS dense_tensor kernel_context kernel_factory eigen_function)
diff --git a/paddle/pten/kernels/cuda/CMakeLists.txt b/paddle/pten/kernels/cuda/CMakeLists.txt
index 1271d93558d5b..54df37ecb5e26 100644
--- a/paddle/pten/kernels/cuda/CMakeLists.txt
+++ b/paddle/pten/kernels/cuda/CMakeLists.txt
@@ -1,10 +1,3 @@
-if(WIN32)
-    set(CURRENT_BINARY_DIR ${PADDLE_BINARY_DIR}/paddle/pten/kernels/cuda)
-    kernel_instantiate(creation.cu)
-    kernel_instantiate(math.cu)
-    kernel_instantiate(linalg.cu)
-endif()
-
 if(WITH_GPU)
   nv_library(math_cuda SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory)
   nv_library(linalg_cuda SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)

From 4e71d151930f6a34dad776d55696029be60aedab Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Wed, 20 Oct 2021 14:09:54 +0000
Subject: [PATCH 096/125] remove symbols and spatial_tensor

---
 paddle/pten/api/include/symbols.h | 28 -----------------
 paddle/pten/core/spatial_tensor.h | 51 -------------------------------
 2 files changed, 79 deletions(-)
 delete mode 100644 paddle/pten/api/include/symbols.h
 delete mode 100644 paddle/pten/core/spatial_tensor.h

diff --git a/paddle/pten/api/include/symbols.h b/paddle/pten/api/include/symbols.h
deleted file mode 100644
index 1ec14a41861d8..0000000000000
--- a/paddle/pten/api/include/symbols.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/pten/core/kernel_registry.h"
-
-// symbol declare
-PT_DECLARE_MODULE(MathCPU);
-PT_DECLARE_MODULE(LinalgCPU);
-PT_DECLARE_MODULE(CreationCPU);
-
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-PT_DECLARE_MODULE(MathCUDA);
-PT_DECLARE_MODULE(LinalgCUDA);
-PT_DECLARE_MODULE(CreationCUDA);
-#endif
diff --git a/paddle/pten/core/spatial_tensor.h b/paddle/pten/core/spatial_tensor.h
deleted file mode 100644
index f1bd4add19771..0000000000000
--- a/paddle/pten/core/spatial_tensor.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/pten/core/tensor_base.h"
-
-namespace pten {
-
-/**
- * SpatialTensor represents a Tensor whose memory layout is different from
- * the typical Allocation (size+ptr).
- *
- * It needs to pass in a specific Allocation implementation when it is
- * instantiated.
- */
-
-template <typename AllocationType>
-class SpatialTensor : public TensorBase {
- public:
-  SpatialTensor(std::shared_ptr<AllocationType> allocation,
-                std::unique_ptr<TensorMeta> meta,
-                std::unique_ptr<TensorStatus> status)
-      : allocation_(std::move(allocation)),
-        meta_(std::move(meta)),
-        status_(std::move(status)) {}
-
- private:
-  std::shared_ptr<AllocationType> allocation_;
-  std::unique_ptr<TensorMeta> meta_;
-  std::unique_ptr<TensorStatus> status_;
-};
-
-template <typename AllocationType>
-class MetalTensor : public SpatialTensor<AllocationType> {};
-
-template <typename AllocationType>
-class OpenCLTensor : public SpatialTensor<AllocationType> {};
-
-}  // namespace pten

From 04cf058682b3161b547fe2e7db80284f74ec7f4f Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Wed, 20 Oct 2021 14:16:17 +0000
Subject: [PATCH 097/125] change common to functions

---
 paddle/pten/kernels/cpu/creation.cc                         | 2 +-
 paddle/pten/kernels/cpu/math.cc                             | 6 +++---
 paddle/pten/kernels/cuda/creation.cu                        | 2 +-
 paddle/pten/kernels/cuda/linalg.cu                          | 2 +-
 paddle/pten/kernels/cuda/math.cu                            | 6 +++---
 .../pten/kernels/{common => functions}/eigen/CMakeLists.txt | 0
 paddle/pten/kernels/{common => functions}/eigen/common.h    | 0
 paddle/pten/kernels/{common => functions}/eigen/dot.h       | 2 +-
 paddle/pten/kernels/{common => functions}/eigen/fill.h      | 2 +-
 paddle/pten/kernels/{common => functions}/eigen/mean.h      | 2 +-
 paddle/pten/kernels/{common => functions}/eigen/scale.h     | 2 +-
 paddle/pten/kernels/{common => functions}/eigen/sign.h      | 2 +-
 12 files changed, 14 insertions(+), 14 deletions(-)
 rename paddle/pten/kernels/{common => functions}/eigen/CMakeLists.txt (100%)
 rename paddle/pten/kernels/{common => functions}/eigen/common.h (100%)
 rename paddle/pten/kernels/{common => functions}/eigen/dot.h (96%)
 rename paddle/pten/kernels/{common => functions}/eigen/fill.h (97%)
 rename paddle/pten/kernels/{common => functions}/eigen/mean.h (95%)
 rename paddle/pten/kernels/{common => functions}/eigen/scale.h (96%)
 rename paddle/pten/kernels/{common => functions}/eigen/sign.h (96%)

diff --git a/paddle/pten/kernels/cpu/creation.cc b/paddle/pten/kernels/cpu/creation.cc
index fd8e053ba1113..c3986c985bd0a 100644
--- a/paddle/pten/kernels/cpu/creation.cc
+++ b/paddle/pten/kernels/cpu/creation.cc
@@ -15,7 +15,7 @@
 #include "paddle/pten/kernels/cpu/creation.h"
 
 #include "paddle/pten/core/kernel_registry.h"
-#include "paddle/pten/kernels/common/eigen/fill.h"
+#include "paddle/pten/kernels/functions/eigen/fill.h"
 
 namespace pten {
 
diff --git a/paddle/pten/kernels/cpu/math.cc b/paddle/pten/kernels/cpu/math.cc
index 6b9506acbfd60..0682479993f35 100644
--- a/paddle/pten/kernels/cpu/math.cc
+++ b/paddle/pten/kernels/cpu/math.cc
@@ -14,9 +14,9 @@
 
 #include "paddle/pten/kernels/cpu/math.h"
 
-#include "paddle/pten/kernels/common/eigen/mean.h"
-#include "paddle/pten/kernels/common/eigen/scale.h"
-#include "paddle/pten/kernels/common/eigen/sign.h"
+#include "paddle/pten/kernels/functions/eigen/mean.h"
+#include "paddle/pten/kernels/functions/eigen/scale.h"
+#include "paddle/pten/kernels/functions/eigen/sign.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/framework/eigen.h"
diff --git a/paddle/pten/kernels/cuda/creation.cu b/paddle/pten/kernels/cuda/creation.cu
index d1f682ff98c17..40e965e5aaca1 100644
--- a/paddle/pten/kernels/cuda/creation.cu
+++ b/paddle/pten/kernels/cuda/creation.cu
@@ -15,7 +15,7 @@
 #include "paddle/pten/kernels/cuda/creation.h"
 
 #include "paddle/pten/core/kernel_registry.h"
-#include "paddle/pten/kernels/common/eigen/fill.h"
+#include "paddle/pten/kernels/functions/eigen/fill.h"
 
 namespace pten {
 
diff --git a/paddle/pten/kernels/cuda/linalg.cu b/paddle/pten/kernels/cuda/linalg.cu
index 0dad40a76893d..928a09a4edbff 100644
--- a/paddle/pten/kernels/cuda/linalg.cu
+++ b/paddle/pten/kernels/cuda/linalg.cu
@@ -15,7 +15,7 @@
 #include "paddle/pten/kernels/cuda/linalg.h"
 
 #include "paddle/pten/core/kernel_registry.h"
-#include "paddle/pten/kernels/common/eigen/dot.h"
+#include "paddle/pten/kernels/functions/eigen/dot.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/complex.h"
diff --git a/paddle/pten/kernels/cuda/math.cu b/paddle/pten/kernels/cuda/math.cu
index f2ee5b91a3b1c..b9230dbf47a1f 100644
--- a/paddle/pten/kernels/cuda/math.cu
+++ b/paddle/pten/kernels/cuda/math.cu
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #include "paddle/pten/kernels/cuda/math.h"
 
-#include "paddle/pten/kernels/common/eigen/mean.h"
-#include "paddle/pten/kernels/common/eigen/scale.h"
-#include "paddle/pten/kernels/common/eigen/sign.h"
+#include "paddle/pten/kernels/functions/eigen/mean.h"
+#include "paddle/pten/kernels/functions/eigen/scale.h"
+#include "paddle/pten/kernels/functions/eigen/sign.h"
 
 #ifdef __NVCC__
 #include "cub/cub.cuh"
diff --git a/paddle/pten/kernels/common/eigen/CMakeLists.txt b/paddle/pten/kernels/functions/eigen/CMakeLists.txt
similarity index 100%
rename from paddle/pten/kernels/common/eigen/CMakeLists.txt
rename to paddle/pten/kernels/functions/eigen/CMakeLists.txt
diff --git a/paddle/pten/kernels/common/eigen/common.h b/paddle/pten/kernels/functions/eigen/common.h
similarity index 100%
rename from paddle/pten/kernels/common/eigen/common.h
rename to paddle/pten/kernels/functions/eigen/common.h
diff --git a/paddle/pten/kernels/common/eigen/dot.h b/paddle/pten/kernels/functions/eigen/dot.h
similarity index 96%
rename from paddle/pten/kernels/common/eigen/dot.h
rename to paddle/pten/kernels/functions/eigen/dot.h
index 8a7789f3dfb64..605517bad6a9a 100644
--- a/paddle/pten/kernels/common/eigen/dot.h
+++ b/paddle/pten/kernels/functions/eigen/dot.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/kernels/common/eigen/common.h"
+#include "paddle/pten/kernels/functions/eigen/common.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/operators/eigen/eigen_function.h"
diff --git a/paddle/pten/kernels/common/eigen/fill.h b/paddle/pten/kernels/functions/eigen/fill.h
similarity index 97%
rename from paddle/pten/kernels/common/eigen/fill.h
rename to paddle/pten/kernels/functions/eigen/fill.h
index df76194839ed7..3897da415c638 100644
--- a/paddle/pten/kernels/common/eigen/fill.h
+++ b/paddle/pten/kernels/functions/eigen/fill.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/kernels/common/eigen/common.h"
+#include "paddle/pten/kernels/functions/eigen/common.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/operators/eigen/eigen_function.h"
diff --git a/paddle/pten/kernels/common/eigen/mean.h b/paddle/pten/kernels/functions/eigen/mean.h
similarity index 95%
rename from paddle/pten/kernels/common/eigen/mean.h
rename to paddle/pten/kernels/functions/eigen/mean.h
index 9ee5ab12c9332..574a1957ae558 100644
--- a/paddle/pten/kernels/common/eigen/mean.h
+++ b/paddle/pten/kernels/functions/eigen/mean.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/kernels/common/eigen/common.h"
+#include "paddle/pten/kernels/functions/eigen/common.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/operators/eigen/eigen_function.h"
diff --git a/paddle/pten/kernels/common/eigen/scale.h b/paddle/pten/kernels/functions/eigen/scale.h
similarity index 96%
rename from paddle/pten/kernels/common/eigen/scale.h
rename to paddle/pten/kernels/functions/eigen/scale.h
index fda15302e2971..49ee561df50ec 100644
--- a/paddle/pten/kernels/common/eigen/scale.h
+++ b/paddle/pten/kernels/functions/eigen/scale.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/kernels/common/eigen/common.h"
+#include "paddle/pten/kernels/functions/eigen/common.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/operators/eigen/eigen_function.h"
diff --git a/paddle/pten/kernels/common/eigen/sign.h b/paddle/pten/kernels/functions/eigen/sign.h
similarity index 96%
rename from paddle/pten/kernels/common/eigen/sign.h
rename to paddle/pten/kernels/functions/eigen/sign.h
index 1e60965b1d91b..13c8d3f3cfe8c 100644
--- a/paddle/pten/kernels/common/eigen/sign.h
+++ b/paddle/pten/kernels/functions/eigen/sign.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/kernels/common/eigen/common.h"
+#include "paddle/pten/kernels/functions/eigen/common.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/operators/eigen/eigen_function.h"

From ab8db2d3d6d330036895afa1db466e3c81b5300d Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Wed, 20 Oct 2021 14:22:59 +0000
Subject: [PATCH 098/125] readd share tensor impl methods

---
 paddle/fluid/framework/pten_utils.cc | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/pten_utils.cc
index 9965085cdbb52..fbe9a4759bbf1 100644
--- a/paddle/fluid/framework/pten_utils.cc
+++ b/paddle/fluid/framework/pten_utils.cc
@@ -79,6 +79,20 @@ std::shared_ptr<pten::DenseTensor> MakeTensorImpl<pten::DenseTensor>(
       pten::TransToPtDataLayout(tensor.layout()));
 }
 
+template <>
+void ShareTensorImpl<pten::DenseTensor>(pten::DenseTensor* tensor_impl,
+                                        LoDTensor* out) {
+  out->ResetHolderWithType(tensor_impl->allocation(),
+                           pten::TransToProtoVarType(tensor_impl->data_type()));
+}
+
+template <>
+void ShareTensorImpl<pten::DenseTensor>(pten::DenseTensor* tensor_impl,
+                                        Tensor* out) {
+  out->ResetHolderWithType(tensor_impl->allocation(),
+                           pten::TransToProtoVarType(tensor_impl->data_type()));
+}
+
 std::shared_ptr<pten::TensorBase> InputVariableToPtTensor(
     const framework::Variable& variable, const pten::TensorArgDef& arg_def) {
   auto expected_place = pten::TransToFluidPlace(arg_def.backend);

From f1c9661ce07b329a1aa5bbb5f56c1e6f117b9ebe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?=
 <39303645+Shixiaowei02@users.noreply.github.com>
Date: Wed, 20 Oct 2021 22:49:54 +0800
Subject: [PATCH 099/125] add a candidate dense tensor class, test=develop
 (#28)

---
 paddle/pten/common/data_type.h                |   7 +-
 paddle/pten/core/CMakeLists.txt               |   4 +
 paddle/pten/core/allocator.h                  |  14 +-
 paddle/pten/core/candidate/CMakeLists.txt     |   1 +
 paddle/pten/core/candidate/dense_tensor.cc    | 145 ++++++++++++++
 paddle/pten/core/candidate/dense_tensor.h     | 188 ++++++++++++++++++
 paddle/pten/core/storage.h                    |  12 +-
 paddle/pten/core/tensor_base.h                |   8 +-
 paddle/pten/core/utils/CMakeLists.txt         |   0
 paddle/pten/hapi/lib/CMakeLists.txt           |   2 +
 paddle/pten/hapi/lib/utils/CMakeLists.txt     |   3 +
 paddle/pten/hapi/lib/utils/allocator.cc       |  23 +++
 paddle/pten/hapi/lib/utils/allocator.h        |  47 +++++
 paddle/pten/hapi/lib/utils/storage.cc         |  39 ++++
 paddle/pten/hapi/lib/utils/storage.h          |  91 +++++++++
 paddle/pten/hapi/lib/utils/tensor_utils.cc    |  19 ++
 paddle/pten/hapi/lib/utils/tensor_utils.h     |  80 ++++++++
 .../pten/hapi/lib/utils/tests/CMakeLists.txt  |   2 +
 .../pten/hapi/lib/utils/tests/test_storage.cc |  65 ++++++
 .../hapi/lib/utils/tests/test_tensor_utils.cc | 103 ++++++++++
 20 files changed, 838 insertions(+), 15 deletions(-)
 create mode 100644 paddle/pten/core/candidate/CMakeLists.txt
 create mode 100644 paddle/pten/core/candidate/dense_tensor.cc
 create mode 100644 paddle/pten/core/candidate/dense_tensor.h
 delete mode 100644 paddle/pten/core/utils/CMakeLists.txt
 create mode 100644 paddle/pten/hapi/lib/utils/CMakeLists.txt
 create mode 100644 paddle/pten/hapi/lib/utils/allocator.cc
 create mode 100644 paddle/pten/hapi/lib/utils/allocator.h
 create mode 100644 paddle/pten/hapi/lib/utils/storage.cc
 create mode 100644 paddle/pten/hapi/lib/utils/storage.h
 create mode 100644 paddle/pten/hapi/lib/utils/tensor_utils.cc
 create mode 100644 paddle/pten/hapi/lib/utils/tensor_utils.h
 create mode 100644 paddle/pten/hapi/lib/utils/tests/CMakeLists.txt
 create mode 100644 paddle/pten/hapi/lib/utils/tests/test_storage.cc
 create mode 100644 paddle/pten/hapi/lib/utils/tests/test_tensor_utils.cc

diff --git a/paddle/pten/common/data_type.h b/paddle/pten/common/data_type.h
index 2c0bd96429aa6..f9c6d032f71ed 100644
--- a/paddle/pten/common/data_type.h
+++ b/paddle/pten/common/data_type.h
@@ -75,8 +75,8 @@ inline size_t SizeOf(DataType data_type) {
       PADDLE_THROW(platform::errors::Unimplemented(
           "Data type %d is not supported by tensor.",
           static_cast<int>(data_type)));
-      return 0;
   }
+  return 0;
 }
 
 #define PT_FOR_EACH_DATA_TYPE(_)    \
@@ -84,8 +84,11 @@ inline size_t SizeOf(DataType data_type) {
   _(int8_t, DataType::INT8)         \
   _(uint8_t, DataType::UINT8)       \
   _(int16_t, DataType::INT16)       \
-  _(int, DataType::INT32)           \
+  _(uint16_t, DataType::UINT16)     \
+  _(int32_t, DataType::INT32)       \
+  _(uint32_t, DataType::UINT32)     \
   _(int64_t, DataType::INT64)       \
+  _(uint64_t, DataType::UINT64)     \
   _(bfloat16, DataType::BFLOAT16)   \
   _(float16, DataType::FLOAT16)     \
   _(float, DataType::FLOAT32)       \
diff --git a/paddle/pten/core/CMakeLists.txt b/paddle/pten/core/CMakeLists.txt
index 448f7123c38b9..ca562332bb79f 100644
--- a/paddle/pten/core/CMakeLists.txt
+++ b/paddle/pten/core/CMakeLists.txt
@@ -1,3 +1,5 @@
+add_subdirectory(candidate)
+
 IF(WITH_MKLDNN)
     set(MKLDNN_CTX_DEPS mkldnn)
 ELSE()
@@ -15,3 +17,5 @@ cc_library(dense_tensor SRCS dense_tensor.cc DEPS enforce data_type ddim allocat
 
 cc_library(kernel_factory SRCS kernel_factory.cc DEPS enforce)
 cc_library(kernel_context SRCS kernel_context.cc DEPS enforce device_context)
+
+cc_library(tensor_base SRCS tensor_base.cc allocator.cc storage.cc DEPS enforce)
diff --git a/paddle/pten/core/allocator.h b/paddle/pten/core/allocator.h
index b96e695a4f8cf..c16c4ffaa6a37 100644
--- a/paddle/pten/core/allocator.h
+++ b/paddle/pten/core/allocator.h
@@ -23,6 +23,8 @@ namespace pten {
 /// deallocation and construction/destruction of objects.
 class RawAllocator {
  public:
+  using Place = paddle::platform::Place;
+
   /// \brief Default destructor.
   virtual ~RawAllocator() = default;
 
@@ -43,7 +45,7 @@ class RawAllocator {
 
   /// \brief Get the place value of the allocator and the allocation.
   /// \return The place value of the allocator and the allocation.
-  virtual const paddle::platform::Place& place() const = 0;
+  virtual const Place& place() const = 0;
 };
 
 /// \brief Fancy pointer with context. The use of this data type
@@ -52,24 +54,24 @@ class RawAllocator {
 /// support being inherited.
 class Allocation final {
  public:
+  using Place = paddle::platform::Place;
   using DeleterFnPtr = void (*)(void*);
 
   Allocation() = default;
   Allocation(Allocation&&) = default;
   Allocation& operator=(Allocation&&) = default;
 
-  Allocation(void* data, const paddle::platform::Place& place)
-      : data_(data), place_(place) {}
+  Allocation(void* data, const Place& place) : data_(data), place_(place) {}
 
   Allocation(void* data,
              void* ctx,
              DeleterFnPtr ctx_deleter,
-             const paddle::platform::Place& place)
+             const Place& place)
       : data_(data), ctx_(ctx, ctx_deleter), place_(place) {}
 
   void* operator->() const noexcept { return data_; }
   operator bool() const noexcept { return data_ || ctx_.Get(); }
-  const paddle::platform::Place& place() const noexcept { return place_; }
+  const Place& place() const noexcept { return place_; }
 
   void Clear() noexcept {
     data_ = nullptr;
@@ -132,7 +134,7 @@ class Allocation final {
   Context ctx_;
   // TODO(Shixiaowei02): Enum needs to be used instead to reduce
   // the construction overhead by more than 50%.
-  paddle::platform::Place place_;
+  Place place_;
 };
 
 inline void swap(Allocation::Context& a, Allocation::Context& b) noexcept {
diff --git a/paddle/pten/core/candidate/CMakeLists.txt b/paddle/pten/core/candidate/CMakeLists.txt
new file mode 100644
index 0000000000000..dd670abdba1c1
--- /dev/null
+++ b/paddle/pten/core/candidate/CMakeLists.txt
@@ -0,0 +1 @@
+cc_library(pten_dense_tensor SRCS dense_tensor.cc DEPS tensor_base)
diff --git a/paddle/pten/core/candidate/dense_tensor.cc b/paddle/pten/core/candidate/dense_tensor.cc
new file mode 100644
index 0000000000000..325edd1ba077f
--- /dev/null
+++ b/paddle/pten/core/candidate/dense_tensor.cc
@@ -0,0 +1,145 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/core/candidate/dense_tensor.h"
+
+namespace pten {
+namespace candidate {
+
+DenseTensorMeta::DenseTensorMeta(DataType type, const DDim& dims)
+    : dims(dims), type(type) {}
+DenseTensorMeta::DenseTensorMeta(DataType type,
+                                 const DDim& dims,
+                                 DataLayout layout)
+    : dims(dims), type(type), layout(layout) {}
+DenseTensorMeta::DenseTensorMeta(DataType type,
+                                 const DDim& dims,
+                                 DataLayout layout,
+                                 const std::vector<std::vector<size_t>>& lod)
+    : dims(dims), type(type), layout(layout), lod(lod) {}
+
+bool DenseTensorMeta::valid() const noexcept {
+  bool valid{true};
+  valid = valid && (type != DataType::UNDEFINED);
+  valid = valid && (layout != DataLayout::UNDEFINED);
+  valid = valid && (is_scalar || product(dims));
+  return valid;
+}
+
+DenseTensor::DenseTensor(const std::shared_ptr<Allocator>& a,
+                         const DenseTensorMeta& meta)
+    : meta_(meta),
+      storage_(
+          make_intrusive<TensorStorage>(a, SizeOf(data_type()) * numel())) {}
+
+DenseTensor::DenseTensor(const std::shared_ptr<Allocator>& a,
+                         DenseTensorMeta&& meta)
+    : meta_(std::move(meta)),
+      storage_(
+          make_intrusive<TensorStorage>(a, SizeOf(data_type()) * numel())) {}
+
+DenseTensor::DenseTensor(intrusive_ptr<Storage> storage,
+                         const DenseTensorMeta& meta)
+    : meta_(meta), storage_(std::move(storage)) {}
+
+DenseTensor::DenseTensor(intrusive_ptr<Storage> storage, DenseTensorMeta&& meta)
+    : meta_(std::move(meta)), storage_(std::move(storage)) {}
+
+int64_t DenseTensor::numel() const {
+  if (meta_.is_scalar) {
+    return 1;
+  }
+  return product(meta_.dims);
+}
+
+bool DenseTensor::SharesStorageWith(const DenseTensor& b) const {
+  return storage_.get() == b.storage_.get() && storage_.get() != nullptr;
+}
+
+template <typename T>
+T* DenseTensor::mutable_data(size_t request_bytes) {
+  PADDLE_ENFORCE(
+      valid(),
+      paddle::platform::errors::PreconditionNotMet(
+          "The meta data must be valid when call the mutable data function."));
+  PADDLE_ENFORCE_NOT_NULL(
+      storage_,
+      paddle::platform::errors::PreconditionNotMet(
+          "The storage must be valid when call the mutable data function."));
+  PADDLE_ENFORCE(
+      (data_type() == paddle::experimental::CppTypeToDataType<T>::Type()),
+      paddle::platform::errors::PreconditionNotMet(
+          "The type of data we are trying to retrieve does not match the "
+          "type of data currently contained in the container."));
+  size_t bytes = numel() * SizeOf(data_type());
+  if (request_bytes) {
+    PADDLE_ENFORCE_GE(request_bytes,
+                      bytes,
+                      paddle::platform::errors::InvalidArgument(
+                          "The reserved size %d should be enough to meet the "
+                          "volume required by metadata %d.",
+                          request_bytes,
+                          bytes));
+    bytes = request_bytes;
+  }
+  if (storage_->size() < bytes) {
+    storage_->Realloc(bytes);
+  }
+  return static_cast<T*>(storage_->data());
+}
+
+template <typename T>
+const T* DenseTensor::data() const {
+  PADDLE_ENFORCE_NOT_NULL(
+      storage_,
+      paddle::platform::errors::PreconditionNotMet(
+          "The storage must be valid when call the mutable data function."));
+  PADDLE_ENFORCE(
+      (data_type() == paddle::experimental::CppTypeToDataType<T>::Type()),
+      paddle::platform::errors::PreconditionNotMet(
+          "The type of data we are trying to retrieve does not match the "
+          "type of data currently contained in the container."));
+  return static_cast<const T*>(storage_->data());
+}
+
+void DenseTensor::check_memory_size() const {
+  size_t bytes = numel() * SizeOf(data_type());
+  PADDLE_ENFORCE_GE(memory_size(),
+                    bytes,
+                    paddle::platform::errors::InvalidArgument(
+                        "The memory size %d should be enough to meet the "
+                        "volume required by metadata %d.",
+                        memory_size(),
+                        bytes));
+}
+
+#define DATA_MEMBER_FUNC_INSTANTIATION(dtype)                      \
+  template dtype* DenseTensor::mutable_data(size_t request_bytes); \
+  template const dtype* DenseTensor::data() const;
+
+DATA_MEMBER_FUNC_INSTANTIATION(int8_t);
+DATA_MEMBER_FUNC_INSTANTIATION(uint8_t);
+DATA_MEMBER_FUNC_INSTANTIATION(int16_t);
+DATA_MEMBER_FUNC_INSTANTIATION(uint16_t);
+DATA_MEMBER_FUNC_INSTANTIATION(int32_t);
+DATA_MEMBER_FUNC_INSTANTIATION(uint32_t);
+DATA_MEMBER_FUNC_INSTANTIATION(int64_t);
+DATA_MEMBER_FUNC_INSTANTIATION(uint64_t);
+DATA_MEMBER_FUNC_INSTANTIATION(float);
+DATA_MEMBER_FUNC_INSTANTIATION(double);
+
+#undef DATA_MEMBER_FUNC_INSTANTIATION
+
+}  // namespace candidate
+}  // namespace pten
diff --git a/paddle/pten/core/candidate/dense_tensor.h b/paddle/pten/core/candidate/dense_tensor.h
new file mode 100644
index 0000000000000..21a093439529f
--- /dev/null
+++ b/paddle/pten/core/candidate/dense_tensor.h
@@ -0,0 +1,188 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/pten/common/data_type.h"
+#include "paddle/pten/core/allocator.h"
+#include "paddle/pten/core/storage.h"
+#include "paddle/pten/core/tensor_base.h"
+
+namespace pten {
+namespace candidate {
+
+using DDim = paddle::framework::DDim;
+
+/// \brief The meta data of dense tensor. Take the structure type
+/// and use all default operations.
+///
+struct DenseTensorMeta {
+  using DataType = paddle::experimental::DataType;
+  using DataLayout = paddle::experimental::DataLayout;
+
+  DenseTensorMeta() = default;
+  DenseTensorMeta(DataType type, const DDim& dims);
+  DenseTensorMeta(DataType type, const DDim& dims, DataLayout layout);
+  DenseTensorMeta(DataType type,
+                  const DDim& dims,
+                  DataLayout layout,
+                  const std::vector<std::vector<size_t>>& lod);
+
+  /// \brief Test whether the metadata is valid. Does not throw exceptions.
+  /// \return Whether the metadata is valid.
+  bool valid() const noexcept;
+
+  /// During the entire life cycle of a DenseTensor, the following attributes
+  /// marked with `const` are expected to remain unchanged.
+  const bool is_scalar{false};
+  DDim dims;
+  const DataType type{DataType::FLOAT32};
+  const DataLayout layout{DataLayout::NCHW};
+  std::vector<std::vector<size_t>> lod;
+};
+
+/// \brief The Dense tensor store values in a contiguous sequential block
+/// of memory where all values are represented. Tensors or multi-dimensional
+/// arrays are used in math operators.
+/// During the entire life cycle of a DenseTensor, its device type and key
+/// metadata are set unchanged.
+class DenseTensor : public TensorBase,
+                    public TypeInfoTraits<TensorBase, DenseTensor> {
+ public:
+  /// \brief Construct a dense tensor and allocate space.
+  /// \param a The allocator used to allocate space.
+  /// \param meta The meta data of dense tensor.
+  DenseTensor(const std::shared_ptr<Allocator>& a, const DenseTensorMeta& meta);
+
+  /// \brief Construct a dense tensor and allocate space.
+  /// \param a The allocator used to allocate space.
+  /// \param meta The meta data of dense tensor.
+  DenseTensor(const std::shared_ptr<Allocator>& a, DenseTensorMeta&& meta);
+
+  /// \brief Use existing storage space to create dense tensor. This interface
+  /// can be used to deliberately create an uninitialized dense tensor.
+  /// \param storage The existing storage.
+  /// \param meta The meta data of dense tensor.
+  DenseTensor(intrusive_ptr<Storage> storage, const DenseTensorMeta& meta);
+
+  /// \brief Use existing storage space to create dense tensor. This interface
+  /// can be used to deliberately create an uninitialized dense tensor.
+  /// \param storage The existing storage.
+  /// \param meta The meta data of dense tensor.
+  DenseTensor(intrusive_ptr<Storage> storage, DenseTensorMeta&& meta);
+
+  /// \brief Because dense tensor is a kind of container, we give a default
+  /// constructor to use for stl container. But the dense tensor created with
+  /// the default constructor is not practical.
+  DenseTensor() = default;
+
+  /// \brief Because dense tensor is a resource handle, we provide a default
+  /// move constructor to support move semantics.
+  DenseTensor(DenseTensor&& other) = default;
+
+  /// \brief We do not recommend deep copy of dense tensor because of its
+  /// efficiency and complexity across devices. The operation is disabled here.
+  DenseTensor(const DenseTensor& other) = delete;
+
+  /// \brief Destroy the tensor object and release exclusive resources.
+  virtual ~DenseTensor() = default;
+
+ public:
+  /// \brief Returns the name of the class for type traits.
+  /// \return The name of the class.
+  static const char* name() { return "DenseTensor"; }
+
+  /// \brief Returns the number of elements contained in tensor.
+  /// \return The number of elements contained in tensor.
+  int64_t numel() const;
+
+  /// \brief Returns the dims of the tensor.
+  /// \return The dims of the tensor.
+  const DDim& dims() const noexcept { return meta_.dims; }
+
+  /// \brief Returns the lod of the tensor.
+  /// \return The lod of the tensor.
+  const std::vector<std::vector<size_t>>& lod() const noexcept {
+    return meta_.lod;
+  }
+
+  /// \brief Returns the data type of the tensor.
+  /// \return The data type of the tensor.
+  DataType data_type() const noexcept { return meta_.type; }
+
+  /// \brief Returns the data layout of the tensor.
+  /// \return The data layout of the tensor.
+  DataLayout layout() const noexcept { return meta_.layout; }
+
+  /// \brief Returns the data place of the tensor.
+  /// \return The data place of the tensor.
+  const Place& place() const { return storage_->place(); }
+
+  /// \brief Test whether the metadata is valid.
+  /// \return Whether the metadata is valid.
+  bool valid() const noexcept { return meta_.valid(); }
+
+  /// \brief Test whether the storage is allocated.
+  /// return Whether the storage is allocated.
+  bool initialized() const { return storage_->data(); }
+
+  /// \brief Check if storage is shared with other objects.
+  /// \return Whether the storage is shared with other objects.
+  bool SharesStorageWith(const DenseTensor& b) const;
+
+  /// \brief Change the dims information in the metadata, and the corresponding
+  /// memory allocation will occur when the `mutable_data` is called.
+  /// \param dims The new dims of the dense tensor.
+  void Resize(const DDim& dims) noexcept { meta_.dims = dims; }
+
+  /// \brief Returns the actual storage size occupied by tensor, may be larger
+  /// than its shape dims.
+  /// \return The actual storage size occupied by tensor.
+  size_t memory_size() const { return storage_->size(); }
+
+  /// \brief Check that the storage area is large enough to hold the data of the
+  /// metadata size, and throw an exception if the conditions are not met.
+  void check_memory_size() const;
+
+  /// \brief Release the storage area for other purposes. Because of the
+  /// destruction of encapsulation, we do not support two dense tensors directly
+  /// sharing the same intrusive pointer.
+  /// \return The rvalue of instrusize pointer releated to the released storage.
+  intrusive_ptr<Storage> release() { return std::move(storage_); }
+
+  /// \brief Get the mutable data pointer value of type T.
+  /// Memory allocation may occur when calling this interface:
+  /// 1. When the storage size is not enough to meet the current shape of the
+  /// data.
+  /// 2. When more request_bytes parameters are used to reserve the data
+  /// storage.
+  /// param request_bytes The bytes to reserve the data storage.
+  /// \return The mutable data pointer value of type T.
+  template <typename T>
+  T* mutable_data(size_t request_bytes = 0);
+
+  /// \brief Get the const data pointer value of type T.
+  /// \return The const data pointer value of type T.
+  template <typename T>
+  const T* data() const;
+
+ private:
+  DenseTensorMeta meta_;
+  intrusive_ptr<Storage> storage_;
+};
+
+}  // namespace candidate
+}  // namespace pten
diff --git a/paddle/pten/core/storage.h b/paddle/pten/core/storage.h
index b1c6de7fff8f6..430572e253d6e 100644
--- a/paddle/pten/core/storage.h
+++ b/paddle/pten/core/storage.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "boost/intrusive_ptr.hpp"
 #include "paddle/pten/core/utils/intrusive_ptr.h"
 #include "paddle/pten/core/utils/intrusive_ref_counter.h"
+#include "paddle/pten/core/utils/type_info.h"
 
 #include "paddle/fluid/platform/place.h"
 #include "paddle/pten/core/allocator.h"
@@ -30,6 +31,7 @@ namespace pten {
 /// all default copy operations to ensure the integrity of the package.
 class Storage : public intrusive_ref_counter<Storage> {
  public:
+  using Place = paddle::platform::Place;
   Storage() = default;
   Storage(const Storage&) = delete;
 
@@ -43,7 +45,7 @@ class Storage : public intrusive_ref_counter<Storage> {
   void* data() const noexcept { return data_.operator->(); }
 
   virtual size_t size() const = 0;
-  virtual const paddle::platform::Place& place() const = 0;
+  virtual const Place& place() const = 0;
   virtual bool OwnsMemory() const = 0;
   virtual void Realloc(size_t n) = 0;
 
@@ -53,18 +55,20 @@ class Storage : public intrusive_ref_counter<Storage> {
 
 class TensorStorage : public Storage {
  public:
+  using Place = paddle::platform::Place;
+
   explicit TensorStorage(const std::shared_ptr<Allocator>& a) : alloc_(a) {}
   TensorStorage(const std::shared_ptr<Allocator>& a, size_t size)
       : Storage(Allocate(a, size)), alloc_(a), size_(size) {}
 
   ~TensorStorage() = default;
 
+  static const char* name() { return "TensorStorage"; }
+
   void Realloc(size_t size) override;
 
   size_t size() const noexcept override { return size_; }
-  const paddle::platform::Place& place() const override {
-    return data_.place();
-  }
+  const Place& place() const override { return data_.place(); }
   bool OwnsMemory() const noexcept override { return true; }
   const std::shared_ptr<Allocator>& allocator() const noexcept {
     return alloc_;
diff --git a/paddle/pten/core/tensor_base.h b/paddle/pten/core/tensor_base.h
index 58d6975d96900..74cc082646fe2 100644
--- a/paddle/pten/core/tensor_base.h
+++ b/paddle/pten/core/tensor_base.h
@@ -28,6 +28,8 @@ class TensorBase {
  public:
   using DataType = paddle::experimental::DataType;
   using DataLayout = paddle::experimental::DataLayout;
+  using DDim = paddle::framework::DDim;
+  using Place = paddle::platform::Place;
 
   virtual ~TensorBase() = default;
 
@@ -37,7 +39,7 @@ class TensorBase {
 
   /// \brief Returns the dims of the tensor.
   /// \return The dims of the tensor.
-  virtual const paddle::framework::DDim& dims() const = 0;
+  virtual const DDim& dims() const = 0;
 
   /// \brief Returns the data type of the tensor.
   /// \return The data type of the tensor.
@@ -49,7 +51,7 @@ class TensorBase {
 
   /// \brief Returns the data place of the tensor.
   /// \return The data place of the tensor.
-  virtual const paddle::platform::Place& place() const = 0;
+  virtual const Place& place() const = 0;
 
   /// \brief Test whether the metadata is valid.
   /// \return Whether the metadata is valid.
@@ -59,7 +61,7 @@ class TensorBase {
   /// return Whether the storage is allocated.
   virtual bool initialized() const = 0;
 
-  virtual pten::Backend backend() const = 0;
+  virtual paddle::experimental::Backend backend() const { return {}; }
 
   /// \brief Return the type information of the derived class to support
   /// safely downcast in non-rtti environment.
diff --git a/paddle/pten/core/utils/CMakeLists.txt b/paddle/pten/core/utils/CMakeLists.txt
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/paddle/pten/hapi/lib/CMakeLists.txt b/paddle/pten/hapi/lib/CMakeLists.txt
index 54cabb7e69baa..a4726b3d426f6 100644
--- a/paddle/pten/hapi/lib/CMakeLists.txt
+++ b/paddle/pten/hapi/lib/CMakeLists.txt
@@ -1,3 +1,5 @@
+add_subdirectory(utils)
+
 cc_library(math_api SRCS math.cc DEPS pten)
 cc_library(linalg_api SRCS linalg.cc DEPS pten)
 cc_library(creation_api SRCS creation.cc DEPS pten)
diff --git a/paddle/pten/hapi/lib/utils/CMakeLists.txt b/paddle/pten/hapi/lib/utils/CMakeLists.txt
new file mode 100644
index 0000000000000..4ab33a10dcdc4
--- /dev/null
+++ b/paddle/pten/hapi/lib/utils/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_subdirectory(tests)
+
+cc_library(pten_hapi_utils SRCS allocator.cc storage tensor_utils DEPS tensor_base pten_dense_tensor pten_utils)
diff --git a/paddle/pten/hapi/lib/utils/allocator.cc b/paddle/pten/hapi/lib/utils/allocator.cc
new file mode 100644
index 0000000000000..0c364c97e4d1c
--- /dev/null
+++ b/paddle/pten/hapi/lib/utils/allocator.cc
@@ -0,0 +1,23 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/hapi/lib/utils/allocator.h"
+
+namespace paddle {
+namespace experimental {
+
+memory::Allocator::AllocationDeleter DefaultAllocator::deleter_;
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/hapi/lib/utils/allocator.h b/paddle/pten/hapi/lib/utils/allocator.h
new file mode 100644
index 0000000000000..8a8569c73edae
--- /dev/null
+++ b/paddle/pten/hapi/lib/utils/allocator.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/memory/allocation/allocator.h"
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/pten/core/allocator.h"
+#include "paddle/pten/core/storage.h"
+
+namespace paddle {
+namespace experimental {
+
+class DefaultAllocator : public pten::Allocator {
+ public:
+  using Allocation = pten::Allocation;
+  explicit DefaultAllocator(const paddle::platform::Place& place)
+      : place_(place) {}
+
+  static void Delete(void* data) {
+    deleter_(static_cast<paddle::memory::Allocation*>(data));
+  }
+
+  Allocation Allocate(size_t bytes_size) override {
+    paddle::memory::AllocationPtr a = memory::Alloc(place_, bytes_size);
+    void* ptr = a->ptr();
+    return Allocation(ptr, a.release(), &Delete, place_);
+  }
+
+ private:
+  paddle::platform::Place place_;
+  static paddle::memory::Allocator::AllocationDeleter deleter_;
+};
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/hapi/lib/utils/storage.cc b/paddle/pten/hapi/lib/utils/storage.cc
new file mode 100644
index 0000000000000..0682b25c6e0dd
--- /dev/null
+++ b/paddle/pten/hapi/lib/utils/storage.cc
@@ -0,0 +1,39 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/hapi/lib/utils/storage.h"
+
+namespace paddle {
+namespace experimental {
+
+ExternalStorage::ExternalStorage(void* ptr,
+                                 size_t size,
+                                 const paddle::platform::Place& place)
+    : pten::Storage(pten::Allocation(ptr, place)), size_(size) {}
+
+ExternalStorage::ExternalStorage(const pten::intrusive_ptr<pten::Storage>& root,
+                                 size_t delta,
+                                 size_t size)
+    : Storage(pten::Allocation(static_cast<uint8_t*>(root->data()) + delta,
+                               root->place())),
+      size_(size) {
+  PADDLE_ENFORCE_LE(static_cast<size_t>(delta + size),
+                    root->size(),
+                    paddle::platform::errors::InvalidArgument(
+                        "The size of the external storage does "
+                        "not meet the metadata requirements."));
+}
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/hapi/lib/utils/storage.h b/paddle/pten/hapi/lib/utils/storage.h
new file mode 100644
index 0000000000000..996e98416336b
--- /dev/null
+++ b/paddle/pten/hapi/lib/utils/storage.h
@@ -0,0 +1,91 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/memory/malloc.h"
+#include "paddle/pten/core/storage.h"
+
+namespace paddle {
+namespace experimental {
+
+class ExternalStorage : public pten::Storage {
+ public:
+  ExternalStorage(void* ptr, size_t size, const paddle::platform::Place& place);
+  ExternalStorage(const pten::intrusive_ptr<pten::Storage>& root,
+                  size_t delta,
+                  size_t size);
+
+  static const char* name() { return "ExternalStorage"; }
+
+  void Realloc(size_t n) override {
+    PADDLE_THROW(paddle::platform::errors::Unavailable(
+        "The external shared storage cannot be reallocated."));
+  }
+
+  size_t size() const noexcept override { return size_; }
+  const paddle::platform::Place& place() const override {
+    return data_.place();
+  }
+  bool OwnsMemory() const noexcept override { return false; }
+
+ private:
+  const int64_t size_{0};
+};
+
+class SharedStorage : public pten::Storage {
+ public:
+  explicit SharedStorage(
+      const std::shared_ptr<paddle::memory::Allocation>& allocation)
+      : allocation_(allocation) {
+    CHECK(allocation);
+    data_ = pten::Allocation(allocation->ptr(), allocation->place());
+    size_ = allocation->size();
+  }
+
+  static const char* name() { return "SharedStorage"; }
+
+  void Realloc(size_t n) override {
+    PADDLE_THROW(paddle::platform::errors::Unavailable(
+        "The external shared storage cannot be reallocated."));
+  }
+
+  size_t size() const noexcept override { return size_; }
+  const paddle::platform::Place& place() const override {
+    return data_.place();
+  }
+  bool OwnsMemory() const noexcept override { return false; }
+
+  const std::shared_ptr<paddle::memory::Allocation>& GetAllocation() {
+    return allocation_;
+  }
+
+ private:
+  int64_t size_{0};
+  std::shared_ptr<paddle::memory::Allocation> allocation_;
+};
+
+class TensorStorage : public paddle::memory::allocation::Allocation {
+ public:
+  explicit TensorStorage(pten::intrusive_ptr<pten::Storage> storage)
+      : paddle::memory::allocation::Allocation(
+            storage->data(), storage->size(), storage->place()),
+        storage_(std::move(storage)) {}
+
+ private:
+  pten::intrusive_ptr<pten::Storage> storage_;
+};
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/hapi/lib/utils/tensor_utils.cc b/paddle/pten/hapi/lib/utils/tensor_utils.cc
new file mode 100644
index 0000000000000..be7feebe8c206
--- /dev/null
+++ b/paddle/pten/hapi/lib/utils/tensor_utils.cc
@@ -0,0 +1,19 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/hapi/lib/utils/tensor_utils.h"
+
+namespace paddle {
+namespace experimental {}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/hapi/lib/utils/tensor_utils.h b/paddle/pten/hapi/lib/utils/tensor_utils.h
new file mode 100644
index 0000000000000..9c726260139e3
--- /dev/null
+++ b/paddle/pten/hapi/lib/utils/tensor_utils.h
@@ -0,0 +1,80 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+
+#include "paddle/fluid/framework/lod_tensor.h"
+
+#include "paddle/pten/core/candidate/dense_tensor.h"
+#include "paddle/pten/core/convert_utils.h"
+#include "paddle/pten/hapi/lib/utils/allocator.h"
+#include "paddle/pten/hapi/lib/utils/storage.h"
+
+namespace paddle {
+namespace experimental {
+
+using namespace pten::candidate;  // NOLINT
+
+template <typename DstLoD, typename SrcLoD>
+void SetLoD(DstLoD* dst, const SrcLoD& src) {
+  dst->reserve(src.size());
+  dst->clear();
+  for (auto&& v : src) {
+    dst->emplace_back(v);
+  }
+}
+
+std::shared_ptr<DenseTensor> MakeSharedDenseTensor(
+    const paddle::framework::Tensor& src) {
+  DenseTensorMeta meta{pten::TransToPtDataType(src.type()),
+                       src.dims(),
+                       pten::TransToPtDataLayout(src.layout())};
+  auto shared_storage = pten::make_intrusive<SharedStorage>(src.Holder());
+  return std::make_shared<DenseTensor>(std::move(shared_storage),
+                                       std::move(meta));
+}
+
+std::shared_ptr<DenseTensor> MakeSharedDenseTensor(
+    const paddle::framework::LoDTensor& src) {
+  DenseTensorMeta meta{pten::TransToPtDataType(src.type()),
+                       src.dims(),
+                       pten::TransToPtDataLayout(src.layout())};
+  SetLoD(&meta.lod, src.lod());
+  auto shared_storage = pten::make_intrusive<SharedStorage>(src.Holder());
+  return std::make_shared<DenseTensor>(std::move(shared_storage),
+                                       std::move(meta));
+}
+
+void MovesStorage(DenseTensor* src, paddle::framework::Tensor* dst) {
+  CHECK(src);
+  CHECK(dst);
+  dst->Resize(src->dims());
+  auto storage = src->release();
+  CHECK(storage->OwnsMemory());
+  std::shared_ptr<paddle::memory::allocation::Allocation> holder(
+      new TensorStorage(std::move(storage)));
+  dst->ResetHolderWithType(holder, pten::TransToProtoVarType(src->data_type()));
+}
+
+void MovesStorage(DenseTensor* src, paddle::framework::LoDTensor* dst) {
+  CHECK(src);
+  CHECK(dst);
+  SetLoD(dst->mutable_lod(), src->lod());
+  MovesStorage(src, static_cast<paddle::framework::Tensor*>(dst));
+}
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/hapi/lib/utils/tests/CMakeLists.txt b/paddle/pten/hapi/lib/utils/tests/CMakeLists.txt
new file mode 100644
index 0000000000000..8ac30a1fa6909
--- /dev/null
+++ b/paddle/pten/hapi/lib/utils/tests/CMakeLists.txt
@@ -0,0 +1,2 @@
+cc_test(test_framework_storage SRCS test_storage.cc DEPS pten_hapi_utils)
+cc_test(test_framework_tensor_utils SRCS test_tensor_utils.cc DEPS pten_hapi_utils)
diff --git a/paddle/pten/hapi/lib/utils/tests/test_storage.cc b/paddle/pten/hapi/lib/utils/tests/test_storage.cc
new file mode 100644
index 0000000000000..fbbcd2a3ee0e5
--- /dev/null
+++ b/paddle/pten/hapi/lib/utils/tests/test_storage.cc
@@ -0,0 +1,65 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "paddle/pten/hapi/lib/utils/allocator.h"
+#include "paddle/pten/hapi/lib/utils/storage.h"
+
+namespace paddle {
+namespace experimental {
+namespace tests {
+
+TEST(host_storage, external_stroage) {
+  const size_t size{100};
+  const auto a =
+      std::make_shared<DefaultAllocator>(paddle::platform::CPUPlace());
+  pten::intrusive_ptr<pten::Storage> in_storage =
+      pten::make_intrusive<pten::TensorStorage>(a, size);
+  char* data = static_cast<char*>(in_storage->data());
+  for (size_t i = 0; i < size; ++i) {
+    data[i] = i;
+  }
+  const size_t delta{1};
+  const size_t n{10};
+  auto ex_storage = pten::make_intrusive<ExternalStorage>(in_storage, delta, n);
+  CHECK_EQ(ex_storage->size(), n);
+  CHECK(paddle::platform::is_cpu_place(ex_storage->place()));
+  CHECK(!ex_storage->OwnsMemory());
+  for (size_t i = delta; i < delta + n; ++i) {
+    CHECK_EQ(data[i], static_cast<char>(i));
+  }
+}
+
+TEST(host_storage, external_vector) {
+  std::vector<char> data(100);
+  for (size_t i = 0; i < data.size(); ++i) {
+    data[i] = i;
+  }
+  const size_t delta{1};
+  const size_t n{10};
+  auto ex_storage = pten::make_intrusive<ExternalStorage>(
+      data.data(), n, paddle::platform::CPUPlace());
+  CHECK_EQ(ex_storage->size(), n);
+  CHECK(paddle::platform::is_cpu_place(ex_storage->place()));
+  CHECK(!ex_storage->OwnsMemory());
+  for (size_t i = delta; i < delta + n; ++i) {
+    CHECK_EQ(data[i], static_cast<char>(i));
+  }
+}
+}  // namespace tests
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/hapi/lib/utils/tests/test_tensor_utils.cc b/paddle/pten/hapi/lib/utils/tests/test_tensor_utils.cc
new file mode 100644
index 0000000000000..64ef1972d8d5a
--- /dev/null
+++ b/paddle/pten/hapi/lib/utils/tests/test_tensor_utils.cc
@@ -0,0 +1,103 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "gtest/gtest.h"
+
+#include "paddle/pten/hapi/lib/utils/tensor_utils.h"
+
+namespace paddle {
+namespace experimental {
+namespace tests {
+
+using DDim = paddle::framework::DDim;
+using DataType = paddle::experimental::DataType;
+using DataLayout = paddle::experimental::DataLayout;
+
+using DenseTensor = pten::candidate::DenseTensor;
+using DenseTensorMeta = pten::candidate::DenseTensorMeta;
+
+TEST(tensor_utils, dense_tensor_to_lod_tensor) {
+  const DDim dims({2, 1});
+  const DataType dtype{DataType::FLOAT32};
+  const DataLayout layout{DataLayout::NCHW};
+  const std::vector<std::vector<size_t>> lod{{0, 2}};
+  DenseTensorMeta meta(dtype, dims, layout, lod);
+
+  auto alloc = std::make_shared<DefaultAllocator>(platform::CPUPlace());
+
+  DenseTensor dense_tensor(alloc, meta);
+  float* data = dense_tensor.mutable_data<float>();
+  data[0] = 1.0f;
+  data[1] = 2.1f;
+
+  framework::LoDTensor lod_tensor;
+  MovesStorage(&dense_tensor, &lod_tensor);
+
+  CHECK(dense_tensor.lod().size() == lod_tensor.lod().size());
+  CHECK(dense_tensor.lod()[0] ==
+        static_cast<std::vector<size_t>>((lod_tensor.lod()[0])));
+  CHECK(dense_tensor.data_type() == pten::TransToPtDataType(lod_tensor.type()));
+  CHECK(dense_tensor.layout() ==
+        pten::TransToPtDataLayout(lod_tensor.layout()));
+  CHECK(platform::is_cpu_place(lod_tensor.place()));
+
+  CHECK(lod_tensor.data<float>()[0] == 1.0f);
+  CHECK(lod_tensor.data<float>()[1] == 2.1f);
+
+  auto dense_tensor_1 = MakeSharedDenseTensor(lod_tensor);
+  CHECK(dense_tensor_1->dims() == dims);
+  CHECK(dense_tensor_1->data_type() == dtype);
+  CHECK(dense_tensor_1->layout() == layout);
+  CHECK(dense_tensor_1->lod().size() == lod.size());
+  CHECK(dense_tensor_1->lod()[0] == lod[0]);
+  const float* data_1 = dense_tensor_1->data<float>();
+  CHECK(data_1[0] == 1.0f);
+  CHECK(data_1[1] == 2.1f);
+}
+
+TEST(tensor_utils, dense_tensor_to_tensor) {
+  const DDim dims({2, 1});
+  const DataType dtype{DataType::FLOAT32};
+  const DataLayout layout{DataLayout::NCHW};
+  DenseTensorMeta meta(dtype, dims, layout);
+
+  auto alloc = std::make_shared<DefaultAllocator>(platform::CPUPlace());
+
+  DenseTensor dense_tensor(alloc, meta);
+  float* data = dense_tensor.mutable_data<float>();
+  data[0] = 1.0f;
+  data[1] = 2.1f;
+
+  framework::Tensor tensor;
+  MovesStorage(&dense_tensor, &tensor);
+
+  CHECK(dense_tensor.data_type() == pten::TransToPtDataType(tensor.type()));
+  CHECK(dense_tensor.layout() == pten::TransToPtDataLayout(tensor.layout()));
+  CHECK(platform::is_cpu_place(tensor.place()));
+
+  CHECK(tensor.data<float>()[0] == 1.0f);
+  CHECK(tensor.data<float>()[1] == 2.1f);
+
+  auto dense_tensor_1 = MakeSharedDenseTensor(tensor);
+  CHECK(dense_tensor_1->dims() == dims);
+  CHECK(dense_tensor_1->data_type() == dtype);
+  CHECK(dense_tensor_1->layout() == layout);
+  const float* data_1 = dense_tensor_1->data<float>();
+  CHECK(data_1[0] == 1.0f);
+  CHECK(data_1[1] == 2.1f);
+}
+
+}  // namespace tests
+}  // namespace experimental
+}  // namespace paddle

From d3674e9671d8ff70ed19c708214bdd91c16ebd4a Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Wed, 20 Oct 2021 15:03:40 +0000
Subject: [PATCH 100/125] change all Pt to Pten

---
 paddle/fluid/framework/operator.cc           | 22 +++++------
 paddle/fluid/framework/operator.h            |  8 ++--
 paddle/fluid/framework/pten_utils.cc         | 41 +++++++++++---------
 paddle/fluid/framework/pten_utils.h          | 38 +++++++++---------
 paddle/fluid/framework/pten_utils_test.cc    |  4 +-
 paddle/fluid/imperative/prepared_operator.cc | 14 +++----
 paddle/fluid/operators/fill_any_like_op.cc   |  2 +-
 paddle/fluid/operators/scale_op.cc           |  2 +-
 paddle/pten/core/convert_utils.cc            |  6 +--
 paddle/pten/core/convert_utils.h             |  6 +--
 paddle/pten/kernels/cuda/math.cu             |  2 +-
 11 files changed, 75 insertions(+), 70 deletions(-)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 01d8a3771b100..fed4541ee9f2c 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1146,7 +1146,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   if (FLAGS_run_pt_kernel &&
       pten::KernelFactory::Instance().ContainsKernel(type_.c_str())) {
     if (pt_kernel_signature_.get() == nullptr || pt_kernel_.get() == nullptr) {
-      ChoosePtKernel(exe_ctx);
+      ChoosePtenKernel(exe_ctx);
     }
     run_pt_kernel_ = pt_kernel_->IsValid();
   }
@@ -1192,7 +1192,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
     platform::RecordEvent record_event("compute",
                                        platform::EventRole::kInnerOp);
     if (run_pt_kernel_) {
-      auto op_kernel_ctx = BuildPtKernelContext(*runtime_ctx, *dev_ctx);
+      auto op_kernel_ctx = BuildPtenKernelContext(*runtime_ctx, *dev_ctx);
       (*pt_kernel_)(&op_kernel_ctx);
     } else {
       (*kernel_func_)(
@@ -1282,26 +1282,26 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType(
   return expected_kernel_key;
 }
 
-void OperatorWithKernel::ChoosePtKernel(const ExecutionContext& ctx) const {
+void OperatorWithKernel::ChoosePtenKernel(const ExecutionContext& ctx) const {
   pt_kernel_signature_.reset(
-      new KernelSignature(this->GetExpectedPtKernelArgs(ctx)));
+      new KernelSignature(this->GetExpectedPtenKernelArgs(ctx)));
 
   VLOG(1) << KernelSignatureToString(*pt_kernel_signature_.get());
 
   kernel_type_.reset(new OpKernelType(InnerGetExpectedKernelType(ctx)));
 
   auto pt_kernel_name = pten::KernelName(pt_kernel_signature_->first);
-  auto pt_kernel_key = TransOpKernelTypeToPtKernelKey(*kernel_type_.get());
+  auto pt_kernel_key = TransOpKernelTypeToPtenKernelKey(*kernel_type_.get());
   pt_kernel_.reset(
       new pten::Kernel(pten::KernelFactory::Instance().SelectKernel(
           pt_kernel_name, pt_kernel_key)));
 
   if (pt_kernel_->IsValid()) {
-    VLOG(1) << "Static mode ChoosePtKernel - kernel name: " << pt_kernel_name
+    VLOG(1) << "Static mode ChoosePtenKernel - kernel name: " << pt_kernel_name
             << " | kernel key: " << pt_kernel_key
             << " | kernel: " << *pt_kernel_;
   } else {
-    VLOG(1) << "Static mode ChoosePtKernel - kernel `" << pt_kernel_name
+    VLOG(1) << "Static mode ChoosePtenKernel - kernel `" << pt_kernel_name
             << "` not found.";
   }
 }
@@ -1774,7 +1774,7 @@ OpKernelType OperatorWithKernel::GetKernelTypeForVar(
                       tensor.layout());
 }
 
-KernelSignature OperatorWithKernel::GetExpectedPtKernelArgs(
+KernelSignature OperatorWithKernel::GetExpectedPtenKernelArgs(
     const ExecutionContext& ctx) const {
   if (KernelSignatureMap::Instance().Has(Type())) {
     return *(KernelSignatureMap::Instance().GetNullable(Type()));
@@ -1786,7 +1786,7 @@ KernelSignature OperatorWithKernel::GetExpectedPtKernelArgs(
   }
 }
 
-pten::KernelContext OperatorWithKernel::BuildPtKernelContext(
+pten::KernelContext OperatorWithKernel::BuildPtenKernelContext(
     const RuntimeContext& ctx, const platform::DeviceContext& dev_ctx) const {
   VLOG(1) << RuntimeContextDebugString(ctx);
 
@@ -1834,7 +1834,7 @@ pten::KernelContext OperatorWithKernel::BuildPtKernelContext(
     std::vector<std::shared_ptr<pten::TensorBase>> tmp_inputs;
 
     for (auto var : ins_vector) {
-      auto pt_in = framework::InputVariableToPtTensor(*var, in_def);
+      auto pt_in = framework::InputVariableToPtenTensor(*var, in_def);
       tmp_inputs.emplace_back(pt_in);
     }
     op_kernel_ctx.EmplaceBackInputs(tmp_inputs);
@@ -1846,7 +1846,7 @@ pten::KernelContext OperatorWithKernel::BuildPtKernelContext(
 
     std::vector<std::shared_ptr<pten::TensorBase>> tmp_outputs;
     for (auto var : outs_vector) {
-      auto pt_out = framework::OutputVariableToPtTensor(var, out_def);
+      auto pt_out = framework::OutputVariableToPtenTensor(var, out_def);
       tmp_outputs.emplace_back(pt_out);
     }
     op_kernel_ctx.EmplaceBackOutputs(tmp_outputs);
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 29c60877b8116..224974001c469 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -537,9 +537,9 @@ class OperatorWithKernel : public OperatorBase {
     * output arguments registered in the original OpMaker do not match in some
     * cases, so we use map to record the arguments required by the kernel.
     * When selecting Kernel during Op execution, select the arguments of the
-    * original Op according to the GetExpectedPtKernelArgs returned arguments.
+    * original Op according to the GetExpectedPtenKernelArgs returned arguments.
     */
-  virtual KernelSignature GetExpectedPtKernelArgs(
+  virtual KernelSignature GetExpectedPtenKernelArgs(
       const ExecutionContext& ctx) const;
 
  private:
@@ -583,9 +583,9 @@ class OperatorWithKernel : public OperatorBase {
                                    const std::string& name) const;
 
   /* member functions for adapting to pten lib */
-  void ChoosePtKernel(const ExecutionContext& ctx) const;
+  void ChoosePtenKernel(const ExecutionContext& ctx) const;
 
-  pten::KernelContext BuildPtKernelContext(
+  pten::KernelContext BuildPtenKernelContext(
       const RuntimeContext& ctx, const platform::DeviceContext& dev_ctx) const;
 
  protected:
diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/pten_utils.cc
index fbe9a4759bbf1..e0e43db139065 100644
--- a/paddle/fluid/framework/pten_utils.cc
+++ b/paddle/fluid/framework/pten_utils.cc
@@ -66,8 +66,8 @@ std::shared_ptr<pten::DenseTensor> MakeTensorImpl<pten::DenseTensor>(
     const LoDTensor& tensor, const platform::Place& place,
     proto::VarType::Type type) {
   return MakeTensorImpl<pten::DenseTensor, LoDTensor>(
-      tensor, pten::TransToPtBackend(place), pten::TransToPtDataType(type),
-      pten::TransToPtDataLayout(tensor.layout()));
+      tensor, pten::TransToPtenBackend(place), pten::TransToPtenDataType(type),
+      pten::TransToPtenDataLayout(tensor.layout()));
 }
 
 template <>
@@ -75,8 +75,8 @@ std::shared_ptr<pten::DenseTensor> MakeTensorImpl<pten::DenseTensor>(
     const Tensor& tensor, const platform::Place& place,
     proto::VarType::Type type) {
   return MakeTensorImpl<pten::DenseTensor, Tensor>(
-      tensor, pten::TransToPtBackend(place), pten::TransToPtDataType(type),
-      pten::TransToPtDataLayout(tensor.layout()));
+      tensor, pten::TransToPtenBackend(place), pten::TransToPtenDataType(type),
+      pten::TransToPtenDataLayout(tensor.layout()));
 }
 
 template <>
@@ -93,7 +93,7 @@ void ShareTensorImpl<pten::DenseTensor>(pten::DenseTensor* tensor_impl,
                            pten::TransToProtoVarType(tensor_impl->data_type()));
 }
 
-std::shared_ptr<pten::TensorBase> InputVariableToPtTensor(
+std::shared_ptr<pten::TensorBase> InputVariableToPtenTensor(
     const framework::Variable& variable, const pten::TensorArgDef& arg_def) {
   auto expected_place = pten::TransToFluidPlace(arg_def.backend);
 
@@ -138,7 +138,7 @@ std::shared_ptr<pten::TensorBase> InputVariableToPtTensor(
   return nullptr;
 }
 
-std::shared_ptr<pten::TensorBase> OutputVariableToPtTensor(
+std::shared_ptr<pten::TensorBase> OutputVariableToPtenTensor(
     framework::Variable* variable, const pten::TensorArgDef& arg_def) {
   // mutable_data before run kernel, to avoid share output form
   // KernelContext to original tensor
@@ -170,7 +170,8 @@ std::shared_ptr<pten::TensorBase> OutputVariableToPtTensor(
   return nullptr;
 }
 
-OpKernelType TransPtKernelKeyToOpKernelType(const pten::KernelKey& kernel_key) {
+OpKernelType TransPtenKernelKeyToOpKernelType(
+    const pten::KernelKey& kernel_key) {
   proto::VarType::Type data_type =
       pten::TransToProtoVarType(kernel_key.dtype());
   platform::Place place = pten::TransToFluidPlace(kernel_key.backend());
@@ -187,9 +188,9 @@ OpKernelType TransPtKernelKeyToOpKernelType(const pten::KernelKey& kernel_key) {
   return OpKernelType(data_type, place, data_layout, library_type);
 }
 
-pten::KernelKey TransOpKernelTypeToPtKernelKey(
+pten::KernelKey TransOpKernelTypeToPtenKernelKey(
     const OpKernelType& kernel_type) {
-  pten::Backend backend = pten::TransToPtBackend(kernel_type.place_);
+  pten::Backend backend = pten::TransToPtenBackend(kernel_type.place_);
   if (kernel_type.library_type_ == LibraryType::kMKLDNN) {
     backend = pten::Backend::MKLDNN;
   } else if (kernel_type.library_type_ == LibraryType::kCUDNN) {
@@ -198,9 +199,9 @@ pten::KernelKey TransOpKernelTypeToPtKernelKey(
     // do
   }
   paddle::experimental::DataLayout layout =
-      pten::TransToPtDataLayout(kernel_type.data_layout_);
+      pten::TransToPtenDataLayout(kernel_type.data_layout_);
   paddle::experimental::DataType dtype =
-      pten::TransToPtDataType(kernel_type.data_type_);
+      pten::TransToPtenDataType(kernel_type.data_type_);
   return pten::KernelKey(backend, layout, dtype);
 }
 
@@ -215,16 +216,17 @@ KernelArgsNameMakerByOpProto::GetInputArgsNames() {
     auto& in = op_proto_->inputs()[i];
     auto& in_name = in.name();
     if ((in.has_extra() && in.extra()) || (in.has_quant() && in.quant())) {
-      VLOG(1) << "Parse PtKernel input: skip extra & quant input - " << in_name;
+      VLOG(1) << "Parse PtenKernel input: skip extra & quant input - "
+              << in_name;
       continue;
     }
     // If contains dispensable input, we should override the
-    // GetExpectedPtKernelArgs method self
+    // GetExpectedPtenKernelArgs method self
     if (in.has_dispensable() && in.dispensable()) {
-      VLOG(1) << "Parse PtKernel input: skip dispensable input - " << in_name;
+      VLOG(1) << "Parse PtenKernel input: skip dispensable input - " << in_name;
       continue;
     }
-    VLOG(1) << "Parse PtKernel input: " << in_name;
+    VLOG(1) << "Parse PtenKernel input: " << in_name;
     input_names_.emplace_back(in_name);
   }
   return input_names_;
@@ -236,7 +238,7 @@ KernelArgsNameMakerByOpProto::GetOutputArgsNames() {
     auto& out = op_proto_->outputs()[i];
     auto& out_name = out.name();
     // TODO(chenweihang): outputs also need skip some cases
-    VLOG(1) << "Parse PtKernel output: " << out_name;
+    VLOG(1) << "Parse PtenKernel output: " << out_name;
     output_names_.emplace_back(out_name);
   }
   return output_names_;
@@ -250,16 +252,17 @@ KernelArgsNameMakerByOpProto::GetAttrsArgsNames() {
     if (attr_name == "use_mkldnn" || attr_name == "op_role" ||
         attr_name == "op_role_var" || attr_name == "op_namescope" ||
         attr_name == "op_callstack" || attr_name == "op_device") {
-      VLOG(1) << "Parse PtKernel attribute: skip needless attr - " << attr_name;
+      VLOG(1) << "Parse PtenKernel attribute: skip needless attr - "
+              << attr_name;
       continue;
     }
     if ((attr.has_extra() && attr.extra()) ||
         (attr.has_quant() && attr.quant())) {
-      VLOG(1) << "Parse PtKernel attribute: skip extra & quant attr - "
+      VLOG(1) << "Parse PtenKernel attribute: skip extra & quant attr - "
               << attr_name;
       continue;
     }
-    VLOG(1) << "Parse PtKernel attribute: " << attr_name;
+    VLOG(1) << "Parse PtenKernel attribute: " << attr_name;
     attr_names_.emplace_back(attr_name);
   }
 
diff --git a/paddle/fluid/framework/pten_utils.h b/paddle/fluid/framework/pten_utils.h
index 14dbe933195be..263101657ceb9 100644
--- a/paddle/fluid/framework/pten_utils.h
+++ b/paddle/fluid/framework/pten_utils.h
@@ -33,37 +33,39 @@ namespace framework {
 
 /* tensor translate */
 
-template <typename PtTensorImplT, typename VariableT>
-std::shared_ptr<PtTensorImplT> MakeTensorImpl(
+template <typename PtenTensorImplT, typename VariableT>
+std::shared_ptr<PtenTensorImplT> MakeTensorImpl(
     const VariableT& tensor, pten::Backend backend,
     paddle::experimental::DataType dtype,
     paddle::experimental::DataLayout layout);
 
-template <typename PtTensorImplT>
-std::shared_ptr<PtTensorImplT> MakeTensorImpl(const LoDTensor& tensor,
-                                              const platform::Place& place,
-                                              proto::VarType::Type type);
+template <typename PtenTensorImplT>
+std::shared_ptr<PtenTensorImplT> MakeTensorImpl(const LoDTensor& tensor,
+                                                const platform::Place& place,
+                                                proto::VarType::Type type);
 
-template <typename PtTensorImplT>
-std::shared_ptr<PtTensorImplT> MakeTensorImpl(const Tensor& tensor,
-                                              const platform::Place& place,
-                                              proto::VarType::Type type);
+template <typename PtenTensorImplT>
+std::shared_ptr<PtenTensorImplT> MakeTensorImpl(const Tensor& tensor,
+                                                const platform::Place& place,
+                                                proto::VarType::Type type);
 
-template <typename PtTensorImplT>
-void ShareTensorImpl(PtTensorImplT* tensor_impl, LoDTensor* out);
+template <typename PtenTensorImplT>
+void ShareTensorImpl(PtenTensorImplT* tensor_impl, LoDTensor* out);
 
-template <typename PtTensorImplT>
-void ShareTensorImpl(PtTensorImplT* tensor_impl, Tensor* out);
+template <typename PtenTensorImplT>
+void ShareTensorImpl(PtenTensorImplT* tensor_impl, Tensor* out);
 
-std::shared_ptr<pten::TensorBase> InputVariableToPtTensor(
+std::shared_ptr<pten::TensorBase> InputVariableToPtenTensor(
     const framework::Variable& variable, const pten::TensorArgDef& arg_def);
-std::shared_ptr<pten::TensorBase> OutputVariableToPtTensor(
+std::shared_ptr<pten::TensorBase> OutputVariableToPtenTensor(
     framework::Variable* variable, const pten::TensorArgDef& arg_def);
 
 /* Kernel Key translate */
 
-OpKernelType TransPtKernelKeyToOpKernelType(const pten::KernelKey& kernel_key);
-pten::KernelKey TransOpKernelTypeToPtKernelKey(const OpKernelType& kernel_type);
+OpKernelType TransPtenKernelKeyToOpKernelType(
+    const pten::KernelKey& kernel_key);
+pten::KernelKey TransOpKernelTypeToPtenKernelKey(
+    const OpKernelType& kernel_type);
 
 /* Kernel Args parse */
 
diff --git a/paddle/fluid/framework/pten_utils_test.cc b/paddle/fluid/framework/pten_utils_test.cc
index 3ba2da3df0580..b3f0e516a4781 100644
--- a/paddle/fluid/framework/pten_utils_test.cc
+++ b/paddle/fluid/framework/pten_utils_test.cc
@@ -41,7 +41,7 @@ TEST(TcmptUtils, MakeTensor) {
   ASSERT_EQ(dense_x->data_type(), pten::DataType::FLOAT32);
 }
 
-TEST(TcmptUtils, VarToPtTensor) {
+TEST(TcmptUtils, VarToPtenTensor) {
   // 1. create Variable
   Variable v;
   auto selected_rows = v.GetMutable<SelectedRows>();
@@ -57,7 +57,7 @@ TEST(TcmptUtils, VarToPtTensor) {
   auto tensor_def = pten::TensorArgDef(expect_backend, pten::DataLayout::NCHW,
                                        pten::DataType::INT32);
   // 2. test API
-  auto tensor_x = InputVariableToPtTensor(v, tensor_def);
+  auto tensor_x = InputVariableToPtenTensor(v, tensor_def);
   // 3. check result
   ASSERT_EQ(tensor_x->backend(), expect_backend);
   ASSERT_EQ(tensor_x->data_type(), pten::DataType::INT32);
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 97d893babae18..749f4ec76a75c 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -153,12 +153,12 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
 
   if (FLAGS_run_pt_kernel &&
       pten::KernelFactory::Instance().ContainsKernel(op.Type().c_str())) {
-    auto pt_kernel_signature = op.GetExpectedPtKernelArgs(dygraph_exe_ctx);
+    auto pt_kernel_signature = op.GetExpectedPtenKernelArgs(dygraph_exe_ctx);
 
     VLOG(1) << framework::KernelSignatureToString(pt_kernel_signature);
 
     auto pt_kernel_name = pten::KernelName(pt_kernel_signature.first);
-    auto pt_kernel_key = TransOpKernelTypeToPtKernelKey(expected_kernel_key);
+    auto pt_kernel_key = TransOpKernelTypeToPtenKernelKey(expected_kernel_key);
     auto pt_kernel = pten::KernelFactory::Instance().SelectKernel(
         pt_kernel_name, pt_kernel_key);
 
@@ -171,7 +171,7 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
       return PreparedOp(op, ctx, expected_kernel_key, pt_kernel_signature,
                         pt_kernel, dev_ctx);
     } else {
-      VLOG(1) << "Dynamic mode ChoosePtKernel - kernel `" << pt_kernel_name
+      VLOG(1) << "Dynamic mode ChoosePtenKernel - kernel `" << pt_kernel_name
               << "` not found.";
     }
   }
@@ -243,7 +243,7 @@ PreparedOp PreparedOp::Prepare(const NameVarMap<VariableWrapper>& ins,
 }
 
 template <typename VarType>
-static pten::KernelContext BuildDygraphPtKernelContext(
+static pten::KernelContext BuildDygraphPtenKernelContext(
     const framework::KernelSignature& pt_kernel_signature,
     const pten::Kernel& pt_kernel, const NameVarMap<VarType>& ins,
     const NameVarMap<VarType>& outs, const framework::AttributeMap& attrs,
@@ -292,7 +292,7 @@ static pten::KernelContext BuildDygraphPtKernelContext(
     for (auto var : ins_vector) {
       const auto& variable = var->Var();
 
-      auto pt_in = framework::InputVariableToPtTensor(variable, in_def);
+      auto pt_in = framework::InputVariableToPtenTensor(variable, in_def);
       tmp_inputs.emplace_back(pt_in);
     }
     op_kernel_ctx.EmplaceBackInputs(tmp_inputs);
@@ -306,7 +306,7 @@ static pten::KernelContext BuildDygraphPtKernelContext(
     for (auto var : outs_vector) {
       auto* variable = var->MutableVar();
 
-      auto pt_out = framework::OutputVariableToPtTensor(variable, out_def);
+      auto pt_out = framework::OutputVariableToPtenTensor(variable, out_def);
       tmp_outputs.emplace_back(pt_out);
     }
     op_kernel_ctx.EmplaceBackOutputs(tmp_outputs);
@@ -401,7 +401,7 @@ static void PreparedOpRunPtImpl(
   static_cast<const framework::OperatorWithKernel&>(op).InferShape(
       &infer_shape_ctx);
 
-  auto op_kernel_ctx = BuildDygraphPtKernelContext<VarType>(
+  auto op_kernel_ctx = BuildDygraphPtenKernelContext<VarType>(
       pt_kernel_signature, pt_kernel, ins, outs, attrs, default_attrs,
       *dev_ctx);
 
diff --git a/paddle/fluid/operators/fill_any_like_op.cc b/paddle/fluid/operators/fill_any_like_op.cc
index b46a1c3c89b6a..494341694b72e 100644
--- a/paddle/fluid/operators/fill_any_like_op.cc
+++ b/paddle/fluid/operators/fill_any_like_op.cc
@@ -48,7 +48,7 @@ class FillAnyLikeOp : public framework::OperatorWithKernel {
                                    tensor.layout());
   }
 
-  framework::KernelSignature GetExpectedPtKernelArgs(
+  framework::KernelSignature GetExpectedPtenKernelArgs(
       const framework::ExecutionContext &ctx) const override {
     return std::make_pair(
         "fill_any_like",
diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc
index 329a649a5a34d..e2ae1ef8eca31 100644
--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -71,7 +71,7 @@ class ScaleOp : public framework::OperatorWithKernel {
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 
-  framework::KernelSignature GetExpectedPtKernelArgs(
+  framework::KernelSignature GetExpectedPtenKernelArgs(
       const framework::ExecutionContext &ctx) const override {
     if (ctx.HasInput("ScaleTensor")) {
       return std::make_pair(
diff --git a/paddle/pten/core/convert_utils.cc b/paddle/pten/core/convert_utils.cc
index 74d87101d7175..32f2497dd18a5 100644
--- a/paddle/pten/core/convert_utils.cc
+++ b/paddle/pten/core/convert_utils.cc
@@ -20,7 +20,7 @@ limitations under the License. */
 namespace pten {
 
 // TODO(chenweihang): Add other place trans cases later
-Backend TransToPtBackend(const paddle::platform::Place& place) {
+Backend TransToPtenBackend(const paddle::platform::Place& place) {
   if (paddle::platform::is_cpu_place(place)) {
     return Backend::CPU;
   } else if (paddle::platform::is_gpu_place(place)) {
@@ -30,7 +30,7 @@ Backend TransToPtBackend(const paddle::platform::Place& place) {
   }
 }
 
-paddle::experimental::DataType TransToPtDataType(
+paddle::experimental::DataType TransToPtenDataType(
     const paddle::framework::proto::VarType::Type& dtype) {
   // Set the order of case branches according to the frequency with
   // the data type is used
@@ -64,7 +64,7 @@ paddle::experimental::DataType TransToPtDataType(
   }
 }
 
-DataLayout TransToPtDataLayout(const paddle::framework::DataLayout& layout) {
+DataLayout TransToPtenDataLayout(const paddle::framework::DataLayout& layout) {
   switch (layout) {
     case paddle::framework::DataLayout::kNHWC:
       return DataLayout::NHWC;
diff --git a/paddle/pten/core/convert_utils.h b/paddle/pten/core/convert_utils.h
index e97d2a8c73210..aa79cb240dd04 100644
--- a/paddle/pten/core/convert_utils.h
+++ b/paddle/pten/core/convert_utils.h
@@ -30,10 +30,10 @@ namespace pten {
 using DataType = paddle::experimental::DataType;
 using DataLayout = paddle::experimental::DataLayout;
 
-Backend TransToPtBackend(const paddle::platform::Place& place);
-DataType TransToPtDataType(
+Backend TransToPtenBackend(const paddle::platform::Place& place);
+DataType TransToPtenDataType(
     const paddle::framework::proto::VarType::Type& dtype);
-DataLayout TransToPtDataLayout(const paddle::framework::DataLayout& layout);
+DataLayout TransToPtenDataLayout(const paddle::framework::DataLayout& layout);
 
 paddle::platform::Place TransToFluidPlace(const Backend& backend);
 paddle::framework::proto::VarType::Type TransToProtoVarType(
diff --git a/paddle/pten/kernels/cuda/math.cu b/paddle/pten/kernels/cuda/math.cu
index b9230dbf47a1f..1306cdc8017e6 100644
--- a/paddle/pten/kernels/cuda/math.cu
+++ b/paddle/pten/kernels/cuda/math.cu
@@ -77,7 +77,7 @@ void Mean(const CUDAContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
   pten::DenseTensor tmp(
       TensorMeta(paddle::framework::make_ddim(
                      {static_cast<int64_t>(temp_storage_bytes)}),
-                 pten::TransToPtBackend(dev_ctx.GetPlace()),
+                 pten::TransToPtenBackend(dev_ctx.GetPlace()),
                  x.data_type(),
                  x.layout()),
       TensorStatus());

From bbe59bc6748b3170012d5b7548a7f66676e1b841 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Thu, 21 Oct 2021 02:35:51 +0000
Subject: [PATCH 101/125] resolve conflit with xiaowei

---
 paddle/pten/hapi/lib/utils/tensor_utils.h             | 8 ++++----
 paddle/pten/hapi/lib/utils/tests/test_tensor_utils.cc | 9 +++++----
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/paddle/pten/hapi/lib/utils/tensor_utils.h b/paddle/pten/hapi/lib/utils/tensor_utils.h
index 9c726260139e3..c9d2f8ca32963 100644
--- a/paddle/pten/hapi/lib/utils/tensor_utils.h
+++ b/paddle/pten/hapi/lib/utils/tensor_utils.h
@@ -39,9 +39,9 @@ void SetLoD(DstLoD* dst, const SrcLoD& src) {
 
 std::shared_ptr<DenseTensor> MakeSharedDenseTensor(
     const paddle::framework::Tensor& src) {
-  DenseTensorMeta meta{pten::TransToPtDataType(src.type()),
+  DenseTensorMeta meta{pten::TransToPtenDataType(src.type()),
                        src.dims(),
-                       pten::TransToPtDataLayout(src.layout())};
+                       pten::TransToPtenDataLayout(src.layout())};
   auto shared_storage = pten::make_intrusive<SharedStorage>(src.Holder());
   return std::make_shared<DenseTensor>(std::move(shared_storage),
                                        std::move(meta));
@@ -49,9 +49,9 @@ std::shared_ptr<DenseTensor> MakeSharedDenseTensor(
 
 std::shared_ptr<DenseTensor> MakeSharedDenseTensor(
     const paddle::framework::LoDTensor& src) {
-  DenseTensorMeta meta{pten::TransToPtDataType(src.type()),
+  DenseTensorMeta meta{pten::TransToPtenDataType(src.type()),
                        src.dims(),
-                       pten::TransToPtDataLayout(src.layout())};
+                       pten::TransToPtenDataLayout(src.layout())};
   SetLoD(&meta.lod, src.lod());
   auto shared_storage = pten::make_intrusive<SharedStorage>(src.Holder());
   return std::make_shared<DenseTensor>(std::move(shared_storage),
diff --git a/paddle/pten/hapi/lib/utils/tests/test_tensor_utils.cc b/paddle/pten/hapi/lib/utils/tests/test_tensor_utils.cc
index 64ef1972d8d5a..f45537508d29a 100644
--- a/paddle/pten/hapi/lib/utils/tests/test_tensor_utils.cc
+++ b/paddle/pten/hapi/lib/utils/tests/test_tensor_utils.cc
@@ -47,9 +47,10 @@ TEST(tensor_utils, dense_tensor_to_lod_tensor) {
   CHECK(dense_tensor.lod().size() == lod_tensor.lod().size());
   CHECK(dense_tensor.lod()[0] ==
         static_cast<std::vector<size_t>>((lod_tensor.lod()[0])));
-  CHECK(dense_tensor.data_type() == pten::TransToPtDataType(lod_tensor.type()));
+  CHECK(dense_tensor.data_type() ==
+        pten::TransToPtenDataType(lod_tensor.type()));
   CHECK(dense_tensor.layout() ==
-        pten::TransToPtDataLayout(lod_tensor.layout()));
+        pten::TransToPtenDataLayout(lod_tensor.layout()));
   CHECK(platform::is_cpu_place(lod_tensor.place()));
 
   CHECK(lod_tensor.data<float>()[0] == 1.0f);
@@ -82,8 +83,8 @@ TEST(tensor_utils, dense_tensor_to_tensor) {
   framework::Tensor tensor;
   MovesStorage(&dense_tensor, &tensor);
 
-  CHECK(dense_tensor.data_type() == pten::TransToPtDataType(tensor.type()));
-  CHECK(dense_tensor.layout() == pten::TransToPtDataLayout(tensor.layout()));
+  CHECK(dense_tensor.data_type() == pten::TransToPtenDataType(tensor.type()));
+  CHECK(dense_tensor.layout() == pten::TransToPtenDataLayout(tensor.layout()));
   CHECK(platform::is_cpu_place(tensor.place()));
 
   CHECK(tensor.data<float>()[0] == 1.0f);

From 76a588edb1dda57548df6f577e0f6120999c1ce6 Mon Sep 17 00:00:00 2001
From: chentianyu03 <ctychentianyu@gmail.com>
Date: Thu, 21 Oct 2021 10:38:56 +0800
Subject: [PATCH 102/125] Op2functor opt1 (#27)

* replace to small vector and change to const &

* add std::move

Co-authored-by: Chen Weihang <chenweihang@baidu.com>
---
 paddle/fluid/framework/operator.cc           | 11 ++++----
 paddle/fluid/imperative/prepared_operator.cc | 11 ++++----
 paddle/pten/core/kernel_context.h            | 29 ++++++++++----------
 3 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index fed4541ee9f2c..5957158cf7f73 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1284,11 +1284,12 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType(
 
 void OperatorWithKernel::ChoosePtenKernel(const ExecutionContext& ctx) const {
   pt_kernel_signature_.reset(
-      new KernelSignature(this->GetExpectedPtenKernelArgs(ctx)));
+      new KernelSignature(std::move(this->GetExpectedPtenKernelArgs(ctx))));
 
   VLOG(1) << KernelSignatureToString(*pt_kernel_signature_.get());
 
-  kernel_type_.reset(new OpKernelType(InnerGetExpectedKernelType(ctx)));
+  kernel_type_.reset(
+      new OpKernelType(std::move(InnerGetExpectedKernelType(ctx))));
 
   auto pt_kernel_name = pten::KernelName(pt_kernel_signature_->first);
   auto pt_kernel_key = TransOpKernelTypeToPtenKernelKey(*kernel_type_.get());
@@ -1780,7 +1781,7 @@ KernelSignature OperatorWithKernel::GetExpectedPtenKernelArgs(
     return *(KernelSignatureMap::Instance().GetNullable(Type()));
   } else {
     KernelArgsNameMakerByOpProto maker(Info().proto_);
-    auto signature = maker.GetKernelSignature();
+    auto signature = std::move(maker.GetKernelSignature());
     KernelSignatureMap::Instance().Insert(Type(), signature);
     return signature;
   }
@@ -1831,8 +1832,8 @@ pten::KernelContext OperatorWithKernel::BuildPtenKernelContext(
             << in_def.layout;
 
     auto ins_vector = ctx.inputs.at(input_names[i]);
-    std::vector<std::shared_ptr<pten::TensorBase>> tmp_inputs;
 
+    paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_inputs;
     for (auto var : ins_vector) {
       auto pt_in = framework::InputVariableToPtenTensor(*var, in_def);
       tmp_inputs.emplace_back(pt_in);
@@ -1844,7 +1845,7 @@ pten::KernelContext OperatorWithKernel::BuildPtenKernelContext(
     auto out_def = output_defs.at(i);
     auto outs_vector = ctx.outputs.at(output_names[i]);
 
-    std::vector<std::shared_ptr<pten::TensorBase>> tmp_outputs;
+    paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_outputs;
     for (auto var : outs_vector) {
       auto pt_out = framework::OutputVariableToPtenTensor(var, out_def);
       tmp_outputs.emplace_back(pt_out);
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 749f4ec76a75c..bbc636f58cced 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -18,6 +18,7 @@
 #include "paddle/fluid/framework/details/nan_inf_utils.h"
 #include "paddle/fluid/framework/pten_utils.h"
 #include "paddle/fluid/imperative/infer_shape_context.h"
+#include "paddle/utils/small_vector.h"
 #ifdef PADDLE_WITH_XPU
 #include "paddle/fluid/platform/xpu/xpu_op_list.h"
 #endif
@@ -262,9 +263,9 @@ static pten::KernelContext BuildDygraphPtenKernelContext(
   auto& attr_names = std::get<1>(pt_kernel_signature.second);
   auto& output_names = std::get<2>(pt_kernel_signature.second);
 
-  auto input_defs = pt_kernel.args_def().input_defs();
-  auto output_defs = pt_kernel.args_def().output_defs();
-  auto attr_defs = pt_kernel.args_def().attribute_defs();
+  auto& input_defs = pt_kernel.args_def().input_defs();
+  auto& output_defs = pt_kernel.args_def().output_defs();
+  auto& attr_defs = pt_kernel.args_def().attribute_defs();
 
   PADDLE_ENFORCE_EQ(input_names.size(), input_defs.size(),
                     platform::errors::InvalidArgument(
@@ -288,7 +289,7 @@ static pten::KernelContext BuildDygraphPtenKernelContext(
     auto& in_def = input_defs.at(i);
     auto& ins_vector = ins.at(input_names[i]);
 
-    std::vector<std::shared_ptr<pten::TensorBase>> tmp_inputs;
+    paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_inputs;
     for (auto var : ins_vector) {
       const auto& variable = var->Var();
 
@@ -302,7 +303,7 @@ static pten::KernelContext BuildDygraphPtenKernelContext(
     auto& out_def = output_defs.at(i);
     auto& outs_vector = outs.at(output_names[i]);
 
-    std::vector<std::shared_ptr<pten::TensorBase>> tmp_outputs;
+    paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_outputs;
     for (auto var : outs_vector) {
       auto* variable = var->MutableVar();
 
diff --git a/paddle/pten/core/kernel_context.h b/paddle/pten/core/kernel_context.h
index c17248831c10e..78c567986bd62 100644
--- a/paddle/pten/core/kernel_context.h
+++ b/paddle/pten/core/kernel_context.h
@@ -18,6 +18,7 @@
 
 #include "paddle/pten/core/tensor_base.h"
 #include "paddle/utils/any.h"
+#include "paddle/utils/small_vector.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
@@ -40,9 +41,9 @@ class KernelContext {
  public:
   explicit KernelContext(const DeviceContext& dev_ctx) : dev_ctx_(dev_ctx) {}
   KernelContext(const DeviceContext& dev_ctx,
-                const std::vector<std::shared_ptr<TensorBase>>& inputs,
-                const std::vector<std::shared_ptr<TensorBase>>& outputs,
-                const std::vector<paddle::any>& attrs)
+                const paddle::SmallVector<std::shared_ptr<TensorBase>>& inputs,
+                const paddle::SmallVector<std::shared_ptr<TensorBase>>& outputs,
+                const paddle::SmallVector<paddle::any>& attrs)
       : dev_ctx_(dev_ctx), inputs_(inputs), outputs_(outputs), attrs_(attrs) {}
 
   template <typename CtxType>
@@ -57,7 +58,8 @@ class KernelContext {
     input_range_.emplace_back(std::pair<int, int>(index, index + 1));
   }
 
-  void EmplaceBackInputs(std::vector<std::shared_ptr<TensorBase>> inputs) {
+  void EmplaceBackInputs(
+      const paddle::SmallVector<std::shared_ptr<TensorBase>>& inputs) {
     for (auto in : inputs) {
       inputs_.emplace_back(in);
     }
@@ -74,7 +76,8 @@ class KernelContext {
     output_range_.emplace_back(std::pair<int, int>(index, index + 1));
   }
 
-  void EmplaceBackOutputs(std::vector<std::shared_ptr<TensorBase>> outputs) {
+  void EmplaceBackOutputs(
+      const paddle::SmallVector<std::shared_ptr<TensorBase>>& outputs) {
     for (auto out : outputs) {
       outputs_.emplace_back(out);
     }
@@ -113,22 +116,20 @@ class KernelContext {
   // DeviceContext base class
   const DeviceContext& dev_ctx_;
 
-  // TODO(chenweihang): replaced by small_vector
   // TODO(chenweihang): Tensor -> Tensor*, Tensor should by managed `scope`
   // Note: can't use API Tensor here, the inference don't use this API Tensor
-  std::vector<std::shared_ptr<TensorBase>> inputs_{};
-  std::vector<std::shared_ptr<TensorBase>> outputs_{};
-  std::vector<paddle::any> attrs_{};
+  paddle::SmallVector<std::shared_ptr<TensorBase>> inputs_{};
+  paddle::SmallVector<std::shared_ptr<TensorBase>> outputs_{};
+  paddle::SmallVector<paddle::any> attrs_{};
 
   // Only contains input like list[Tensor] need `range`
-  // TODO(chenweihang): replaced by small_vector
-  std::vector<std::pair<int, int>> input_range_{{}};
-  std::vector<std::pair<int, int>> output_range_{{}};
+  paddle::SmallVector<std::pair<int, int>> input_range_{{}};
+  paddle::SmallVector<std::pair<int, int>> output_range_{{}};
 
   // Only static graph need `name`
   // TODO(chenweihang): replaced by paddle::string_view
-  std::vector<std::string> input_names_{{}};
-  std::vector<std::string> output_names_{{}};
+  paddle::SmallVector<std::string> input_names_{{}};
+  paddle::SmallVector<std::string> output_names_{{}};
 };
 
 }  // namespace pten

From fb224abe7ada6d1815ed19f70a8ebea07b8d3220 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Thu, 21 Oct 2021 14:16:36 +0000
Subject: [PATCH 103/125] polish kernel factory and kernel registry

---
 paddle/fluid/framework/operator.cc           | 25 +------
 paddle/fluid/imperative/prepared_operator.cc |  2 +-
 paddle/fluid/pybind/op_function_generator.cc |  2 +-
 paddle/pten/core/kernel_factory.cc           | 18 +++--
 paddle/pten/core/kernel_factory.h            | 77 +++++++++-----------
 paddle/pten/core/kernel_registry.h           |  1 +
 6 files changed, 54 insertions(+), 71 deletions(-)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 5957158cf7f73..2775d0bcf036b 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1080,20 +1080,6 @@ void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
   this->InferShape(&infer_shape_ctx);
 }
 
-static std::string RuntimeContextDebugString(const RuntimeContext& ctx) {
-  std::stringstream ss;
-  ss << "RuntimeContext(Inputs: ";
-  for (auto& var_pair : ctx.inputs) {
-    ss << var_pair.first << ", ";
-  }
-  ss << "Outputs: ";
-  for (auto& var_pair : ctx.outputs) {
-    ss << var_pair.first << ", ";
-  }
-  ss << ")";
-  return ss.str();
-}
-
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                  const platform::Place& place) const {
   // To reduce the elapsed time of HasAttr, we use bool variable to record the
@@ -1144,7 +1130,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   // and RCOM backend, the XPU, NPU and MKLDNN will be supported in the second
   // phase
   if (FLAGS_run_pt_kernel &&
-      pten::KernelFactory::Instance().ContainsKernel(type_.c_str())) {
+      pten::KernelFactory::Instance().HasCompatiblePtenKernel(type_)) {
     if (pt_kernel_signature_.get() == nullptr || pt_kernel_.get() == nullptr) {
       ChoosePtenKernel(exe_ctx);
     }
@@ -1651,10 +1637,9 @@ void OperatorWithKernel::ParseInputDataType(
       if (t != nullptr) {
         PADDLE_ENFORCE_EQ(
             t->IsInitialized(), true,
-            platform::errors::InvalidArgument(
-                "The Tensor in the %s Op's Input Variable %s(%s) is "
-                "not initialized.",
-                Type(), name, Inputs().at(name).at(i)));
+            platform::errors::InvalidArgument("The %s Op's Input Variable `%s` "
+                                              "contains uninitialized Tensor.",
+                                              Type(), name));
         proto::VarType::Type tmp = t->type();
         PADDLE_ENFORCE(tmp == *data_type || *data_type == default_data_type,
                        platform::errors::InvalidArgument(
@@ -1789,8 +1774,6 @@ KernelSignature OperatorWithKernel::GetExpectedPtenKernelArgs(
 
 pten::KernelContext OperatorWithKernel::BuildPtenKernelContext(
     const RuntimeContext& ctx, const platform::DeviceContext& dev_ctx) const {
-  VLOG(1) << RuntimeContextDebugString(ctx);
-
   // TODO(chenweihang): now only work for very simple case,
   // many cases need to be deal with later:
   // 1. the input and output are not tensor
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index bbc636f58cced..04f5a74788e88 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -153,7 +153,7 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
   VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
 
   if (FLAGS_run_pt_kernel &&
-      pten::KernelFactory::Instance().ContainsKernel(op.Type().c_str())) {
+      pten::KernelFactory::Instance().HasCompatiblePtenKernel(op.Type())) {
     auto pt_kernel_signature = op.GetExpectedPtenKernelArgs(dygraph_exe_ctx);
 
     VLOG(1) << framework::KernelSignatureToString(pt_kernel_signature);
diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc
index 388fa558f32f6..1569447dfebf7 100644
--- a/paddle/fluid/pybind/op_function_generator.cc
+++ b/paddle/fluid/pybind/op_function_generator.cc
@@ -557,7 +557,7 @@ GenerateOpFunctions() {
     // since only OperatorWithKernel can run in dygraph mode.
     // if the pten lib contains op kernel, we still generate ops method
     if (!all_kernels.count(op_type) &&
-        !pten::KernelFactory::Instance().ContainsKernel(op_type.c_str())) {
+        !pten::KernelFactory::Instance().HasCompatiblePtenKernel(op_type)) {
       continue;
     }
 
diff --git a/paddle/pten/core/kernel_factory.cc b/paddle/pten/core/kernel_factory.cc
index 7433a25dcbd66..729f137c08798 100644
--- a/paddle/pten/core/kernel_factory.cc
+++ b/paddle/pten/core/kernel_factory.cc
@@ -19,16 +19,24 @@
 
 namespace pten {
 
+uint32_t KernelKey::Hash::operator()(const KernelKey& key) const {
+  uint32_t hash_value = 0;
+  // |----31-20------|---19-12---|---11-8----|---7-0---|
+  // | For extension | DataType | DataLayout | Backend |
+  hash_value |= static_cast<uint8_t>(key.backend());
+  hash_value |=
+      (static_cast<uint8_t>(key.layout()) << KernelKey::kBackendBitLength);
+  hash_value |=
+      (static_cast<uint16_t>(key.dtype())
+       << (KernelKey::kBackendBitLength + KernelKey::kDataTypeBitLength));
+  return hash_value;
+}
+
 KernelFactory& KernelFactory::Instance() {
   static KernelFactory g_op_kernel_factory;
   return g_op_kernel_factory;
 }
 
-bool KernelFactory::ContainsKernel(const char* kernel_name) const {
-  auto iter = kernels_.find(KernelName(kernel_name, ""));
-  return (iter != kernels_.end());
-}
-
 Kernel KernelFactory::SelectKernel(const KernelName& kernel_name,
                                    const KernelKey& kernel_key) const {
   auto iter = kernels_.find(kernel_name);
diff --git a/paddle/pten/core/kernel_factory.h b/paddle/pten/core/kernel_factory.h
index 9e47d82d0fb08..4ec80521b44a6 100644
--- a/paddle/pten/core/kernel_factory.h
+++ b/paddle/pten/core/kernel_factory.h
@@ -17,6 +17,7 @@
 #include <ostream>
 #include <string>
 #include <unordered_map>
+#include <unordered_set>
 #include <utility>
 
 #include "paddle/pten/common/backend.h"
@@ -37,10 +38,10 @@ using DataLayout = paddle::experimental::DataLayout;
 /**
  * [ Naming considerations ]
  *
- * The tensor Compute library contains many kernels, and the computation
+ * The tensor operation library contains many kernels, and the computation
  * in each specific scenario is represented by an kernel.
  *
- * We directly named it `Kernel` instead of `Kernel`, the tensor Compute
+ * We directly named it `Kernel` instead of `Kernel`, the tensor operation
  * library here and fluid are independent, avoiding developers from
  * misunderstanding the relationship between the two concepts.
  */
@@ -52,10 +53,7 @@ using KernelFn = void (*)(KernelContext* ctx);
 class KernelName final {
  public:
   KernelName(std::string name, std::string overload_name)
-      : name_(std::move(name)), overload_name_(std::move(overload_name)) {
-    hash_value_ = std::hash<std::string>()(name_) ^
-                  (std::hash<std::string>()(overload_name_) << 1);
-  }
+      : name_(std::move(name)), overload_name_(std::move(overload_name)) {}
 
   KernelName(const std::string& kernel_name) {
     ParseNameAndOverloadNameFromString(kernel_name);
@@ -68,24 +66,26 @@ class KernelName final {
 
   const std::string& name() const { return name_; }
   const std::string& overload_name() const { return overload_name_; }
-  size_t hash_value() const { return hash_value_; }
 
   struct Hash {
     size_t operator()(const KernelName& kernel_name) const {
-      return kernel_name.hash_value();
+      return std::hash<std::string>()(kernel_name.name()) ^
+             (std::hash<std::string>()(kernel_name.overload_name()) << 1);
     }
   };
 
+  size_t hash_value() const { return Hash()(*this); }
+
   bool operator<(const KernelName& kernel_name) const {
-    return hash_value_ < kernel_name.hash_value();
+    return hash_value() < kernel_name.hash_value();
   }
 
   bool operator==(const KernelName& kernel_name) const {
-    return hash_value_ == kernel_name.hash_value();
+    return hash_value() == kernel_name.hash_value();
   }
 
   bool operator!=(const KernelName& kernel_name) const {
-    return hash_value_ != kernel_name.hash_value();
+    return hash_value() != kernel_name.hash_value();
   }
 
  private:
@@ -98,17 +98,11 @@ class KernelName final {
       name_ = kernel_name.substr(0, pos);
       overload_name_ = kernel_name.substr(pos + 1, kernel_name.size());
     }
-    hash_value_ = std::hash<std::string>()(name_) ^
-                  (std::hash<std::string>()(overload_name_) << 1);
   }
 
-  // The members cannot be modified except by constructing,
-  // because the hash value need to be re calculated
-  // TODO(chenweihang): use string_view later?
+  // TODO(chenweihang): use string_view to improve performance later
   std::string name_;
   std::string overload_name_;
-  // Avoid calculating Hash value at runtime
-  size_t hash_value_;
 };
 
 class KernelKey {
@@ -116,39 +110,33 @@ class KernelKey {
   KernelKey() = default;
 
   KernelKey(Backend backend, DataLayout layout, DataType dtype)
-      : backend_(backend), layout_(layout), dtype_(dtype) {
-    // |----31-20------|---19-12---|---11-8----|---7-0---|
-    // | For extension | DataType | DataLayout | Backend |
-
-    hash_value_ = 0;
-    hash_value_ |= static_cast<uint8_t>(backend_);
-    hash_value_ |= (static_cast<uint8_t>(layout_) << kBackendBitLength);
-    hash_value_ |= (static_cast<uint16_t>(dtype_)
-                    << (kBackendBitLength + kDataTypeBitLength));
-  }
+      : backend_(backend), layout_(layout), dtype_(dtype) {}
 
   Backend backend() const { return backend_; }
   DataLayout layout() const { return layout_; }
   DataType dtype() const { return dtype_; }
 
-  uint32_t hash_value() const { return hash_value_; }
+  struct Hash {
+    // Note: Now the number of bits we need does not exceed 32 bits, so there is
+    // no need to use 64 bits. If needed in the future, it can be expanded,
+    // but now we don’t over-design.
+    uint32_t operator()(const KernelKey& key) const;
+  };
+
+  uint32_t hash_value() const { return Hash()(*this); }
 
   bool operator<(const KernelKey& key) const {
-    return hash_value_ < key.hash_value();
+    return hash_value() < key.hash_value();
   }
 
   bool operator==(const KernelKey& key) const {
-    return hash_value_ == key.hash_value();
+    return hash_value() == key.hash_value();
   }
 
   bool operator!=(const KernelKey& key) const {
-    return hash_value_ != key.hash_value();
+    return hash_value() != key.hash_value();
   }
 
-  struct Hash {
-    uint32_t operator()(const KernelKey& key) const { return key.hash_value(); }
-  };
-
  private:
   // In total should be smaller than 32.
   constexpr static int kBackendBitLength = 8;
@@ -158,12 +146,6 @@ class KernelKey {
   Backend backend_{Backend::UNDEFINED};
   DataLayout layout_{DataLayout::UNDEFINED};
   DataType dtype_{DataType::UNDEFINED};
-
-  // Avoid calculating Hash value at runtime.
-  // Note: Now the number of bits we need does not exceed 32 bits, so there is
-  // no need to use 64 bits. If needed in the future, it can be expanded,
-  // but now we don’t over-design.
-  uint32_t hash_value_;
 };
 
 // TODO(chenweihang): how deal with vector<Param>?
@@ -282,7 +264,13 @@ class KernelFactory {
 
   KernelMap& kernels() { return kernels_; }
 
-  bool ContainsKernel(const char* name) const;
+  void InsertCompatibleOpType(const std::string& op_type) {
+    compatible_op_types_.insert(op_type);
+  }
+
+  bool HasCompatiblePtenKernel(const std::string& op_type) const {
+    return compatible_op_types_.count(op_type) > 0;
+  }
 
   const Kernel& SelectKernelOrThrowError(const KernelName& kernel_name,
                                          const KernelKey& kernel_key) const;
@@ -299,6 +287,9 @@ class KernelFactory {
   KernelFactory() = default;
 
   KernelMap kernels_;
+  // Used to be compatible with the original execution system and
+  // quickly confirm whether the new kernel can be called
+  std::unordered_set<std::string> compatible_op_types_;
 };
 
 /** operator << overload **/
diff --git a/paddle/pten/core/kernel_registry.h b/paddle/pten/core/kernel_registry.h
index b080a0d3202fb..b77f641b9f51b 100644
--- a/paddle/pten/core/kernel_registry.h
+++ b/paddle/pten/core/kernel_registry.h
@@ -149,6 +149,7 @@ struct KernelRegistrar {
     args_parse_fn(kernel_key, kernel.mutable_args_def());
     args_def_fn(&kernel);
 
+    KernelFactory::Instance().InsertCompatibleOpType(kernel_name.name());
     KernelFactory::Instance().kernels()[kernel_name][kernel_key] = kernel;
   }
 };

From 252fb79f93d90a96ee569d6e9e963e2f7abf1415 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Fri, 22 Oct 2021 02:43:24 +0000
Subject: [PATCH 104/125] fix operator test error msg mismatch

---
 paddle/fluid/framework/operator_test.cc | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/framework/operator_test.cc b/paddle/fluid/framework/operator_test.cc
index 368913700167e..df7e3c4f6dde3 100644
--- a/paddle/fluid/framework/operator_test.cc
+++ b/paddle/fluid/framework/operator_test.cc
@@ -439,9 +439,8 @@ TEST(IndicateVarDataTypeTest, lodtensor) {
     std::string ex_msg = err.what();
     EXPECT_TRUE(
         ex_msg.find(
-            "The Tensor in the indicate_lod_tensor_data_type_test Op's "
-            "Input Variable LoDTensor(lodtensor_1) is not initialized") !=
-        std::string::npos);
+            "The indicate_lod_tensor_data_type_test Op's Input Variable "
+            "`LoDTensor` contains uninitialized Tensor.") != std::string::npos);
   }
   ASSERT_TRUE(caught);
 }
@@ -466,9 +465,9 @@ TEST(IndicateVarDataTypeTest, selectedrows) {
     caught = true;
     std::string ex_msg = err.what();
     EXPECT_TRUE(
-        ex_msg.find("The Tensor in the indicate_selected_rows_data_type_test "
-                    "Op's Input Variable SelectedRows(selected_rows_1) is not "
-                    "initialized") != std::string::npos);
+        ex_msg.find("The indicate_selected_rows_data_type_test Op's "
+                    "Input Variable `SelectedRows` contains uninitialized "
+                    "Tensor.") != std::string::npos);
   }
   ASSERT_TRUE(caught);
 }

From 19b1095347aafd3f5a756464ad6d7e90a77522f8 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Fri, 22 Oct 2021 04:03:25 +0000
Subject: [PATCH 105/125] remove tensor signature and backend set member

---
 .gitignore                                  |  1 -
 paddle/fluid/operators/mean_op.h            |  2 +-
 paddle/pten/common/backend.h                |  4 +-
 paddle/pten/hapi/include/backend_set.h      |  4 +-
 paddle/pten/hapi/include/tensor.h           | 44 +++++++-------------
 paddle/pten/hapi/include/tensor_signature.h | 45 ---------------------
 paddle/pten/hapi/lib/creation.cc            |  1 -
 paddle/pten/hapi/lib/kernel_dispatch.h      | 16 +++++++-
 paddle/pten/hapi/lib/linalg.cc              |  1 -
 paddle/pten/hapi/lib/manipulation.cc        |  1 -
 paddle/pten/hapi/lib/math.cc                |  1 -
 11 files changed, 35 insertions(+), 85 deletions(-)
 delete mode 100644 paddle/pten/hapi/include/tensor_signature.h

diff --git a/.gitignore b/.gitignore
index 8a7b73d46c032..749832c3930cf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,7 +4,6 @@ paddle/fluid/API_DEV.spec
 paddle/fluid/API_PR.spec
 paddle/fluid/op_use_default_grad_maker_DEV.spec
 paddle/fluid/op_use_default_grad_maker_PR.spec
-tools/__pycache__/static_mode_white_list.cpython-37.pyc
 
 *.DS_Store
 *.vs
diff --git a/paddle/fluid/operators/mean_op.h b/paddle/fluid/operators/mean_op.h
index 661ff41f10f85..9a8c2736589c9 100644
--- a/paddle/fluid/operators/mean_op.h
+++ b/paddle/fluid/operators/mean_op.h
@@ -49,7 +49,7 @@ using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
  * Currently, only the first two cases are adapted.
  *
  * The principle here is that the implementation in the kernel must reuse the
- * corresponding functions in the Tensor compute library and cannot maintain
+ * corresponding functions in the Tensor Operation library and cannot maintain
  * two copies of the code.
  */
 template <typename DeviceContext, typename T>
diff --git a/paddle/pten/common/backend.h b/paddle/pten/common/backend.h
index 6dc505fa2c5ca..9808b45b45c7c 100644
--- a/paddle/pten/common/backend.h
+++ b/paddle/pten/common/backend.h
@@ -28,8 +28,8 @@ namespace experimental {
  * but in order to make the boundary of the kernel clearer and the function
  * more specific, we need to distinguish the calculation method.
  *
- * Such as the kernel for CUDA device, it can be a native CUDA kernel,
- * or a kernel implemented by CUDNN library.
+ * Such as the kernel for CPU device, it can be a native CPU kernel,
+ * or a kernel implemented by MKLDNN library.
  *
  * Note(chenweihang): HIP is not needed now, we can added it if needed
  * in the future
diff --git a/paddle/pten/hapi/include/backend_set.h b/paddle/pten/hapi/include/backend_set.h
index a47cb76489375..00f59b45a188f 100644
--- a/paddle/pten/hapi/include/backend_set.h
+++ b/paddle/pten/hapi/include/backend_set.h
@@ -26,8 +26,8 @@ namespace experimental {
  * We use the backend to form a bit set to assist the runtime kernel selection,
  * and the higher backend bit has a higher priority.
  *
- * A Tensor may belong to multiple backends at the same time, such CUDNN and
- * CUDA. Only one backend value cannot
+ * A Tensor may belong to multiple backends at the same time, such CPU and
+ * MKLDNN. Only one backend value cannot
  */
 class BackendSet final {
  public:
diff --git a/paddle/pten/hapi/include/tensor.h b/paddle/pten/hapi/include/tensor.h
index f915a06087017..393332eefa119 100644
--- a/paddle/pten/hapi/include/tensor.h
+++ b/paddle/pten/hapi/include/tensor.h
@@ -19,18 +19,17 @@ limitations under the License. */
 #include <utility>
 
 #include "paddle/pten/core/tensor_base.h"
-#include "paddle/pten/hapi/include/tensor_signature.h"
 
 /**
  * [ Why still include the fluid headers? ]
  *
  * We hope to organize the basic implementation of Tensor and the logic related
  * to Tensor computation into an independent library, which we call
- * [Tensor Compute Library, pten], so we extract or rewrite the original
+ * [Tensor Operation Library, pten], so we extract or rewrite the original
  * Kernels.
  *
  * In the future, the training library, inference library and custom operators
- * will link to this Tensor Compute library.
+ * will link to this Tensor Operation library.
  *
  * However, if we directly split the link relation, we need to make too many
  * changes, which will affect the stability of the framework, so here we still
@@ -47,15 +46,15 @@ namespace experimental {
 
 class Tensor;
 
-class AutogradMetaInterface {
+class AbstractAutogradMeta {
  public:
-  // No AutogradMetaInterface should be created
-  virtual ~AutogradMetaInterface() {}
+  // No AbstractAutogradMeta should be created
+  virtual ~AbstractAutogradMeta() {}
 };
 
 /**
  * Tensor is the API description of the basic data structure in the
- * [ Paddle "Tensor CoMPuTe (pten)" Library ].
+ * [ "Paddle Tensor Operation (pten)" Library ].
  *
  * It is not limited to a simple n-dimensional array.
  * It contains a smart pointer to `TensorImpl`. The data description contained
@@ -97,7 +96,6 @@ class Tensor final {
     if (impl_.get() == nullptr) {
       throw std::runtime_error("TensorImpl with nullptr is not supported");
     }
-    signature_.reset(new TensorSignature(impl_->backend()));
   }
 
   /* Part 2: Dimension, DataType and DataLayout methods */
@@ -140,16 +138,8 @@ class Tensor final {
   /**
    * Backend judgment APIs, shield the concept of Backend.
    */
-  BackendSet backend_set() const { return signature_->backend_set; }
-  void set_backend_set(const BackendSet& backend_set) {
-    if (signature_ == nullptr) {
-      signature_.reset(new TensorSignature());
-    }
-    signature_->backend_set = backend_set;
-  }
-
-  bool is_cpu() const { return signature_->backend_set.Has(Backend::CPU); }
-  bool is_cuda() const { return signature_->backend_set.Has(Backend::CUDA); }
+  bool is_cpu() const { return paddle::platform::is_cpu_place(place()); }
+  bool is_cuda() const { return paddle::platform::is_gpu_place(place()); }
 
   /**
    * Backend convert APIs.
@@ -211,11 +201,11 @@ class Tensor final {
   }
 
   /* Part 7: Autograd methods */
-  AutogradMetaInterface* get_autograd_meta() const {
+  AbstractAutogradMeta* get_autograd_meta() const {
     return autograd_meta_.get();
   }
 
-  void set_autograd_meta(std::shared_ptr<AutogradMetaInterface> autograd_meta) {
+  void set_autograd_meta(std::shared_ptr<AbstractAutogradMeta> autograd_meta) {
     autograd_meta_ = std::move(autograd_meta);
   }
 
@@ -244,7 +234,7 @@ class Tensor final {
   std::shared_ptr<pten::TensorBase> impl_;
 
   /**
-   * [ Why need abstract AutogradMetaInterface here? ]
+   * [ Why need abstract AbstractAutogradMeta here? ]
    *
    * Dynamic graphs need to hold backward information
    *
@@ -254,17 +244,13 @@ class Tensor final {
    *    information, not Tensor data description-related information.
    * 2. Kernel calculation does not require AutogradMeta.
    */
-  std::shared_ptr<AutogradMetaInterface> autograd_meta_{nullptr};
+  std::shared_ptr<AbstractAutogradMeta> autograd_meta_{nullptr};
 
   /**
-   * TensorSignature is used to store auxiliary description information
-   * needed by Tensor.
-   *
-   * The currently stored information includes:
-   * 1. name: used for Debug analysis in the development of new dygraph.
-   * 2. backend_set: used by the API to determine the kernel backend.
+   * Tensor name: used for adapt original execution mechanism and debug analysis
+   * in the development of new dygraph.
    */
-  std::shared_ptr<TensorSignature> signature_{nullptr};
+  std::string name_;
 };
 
 }  // namespace experimental
diff --git a/paddle/pten/hapi/include/tensor_signature.h b/paddle/pten/hapi/include/tensor_signature.h
deleted file mode 100644
index ca20f9da75a84..0000000000000
--- a/paddle/pten/hapi/include/tensor_signature.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <string>
-
-#include "paddle/pten/hapi/include/backend_set.h"
-
-namespace paddle {
-namespace experimental {
-
-struct TensorSignature final {
-  std::string name{""};
-  BackendSet backend_set{Backend::CPU};
-
-  TensorSignature() = default;
-
-  // open default methods if needed
-  TensorSignature& operator=(const TensorSignature&) = delete;
-  TensorSignature& operator=(TensorSignature&&) = delete;
-  TensorSignature(const TensorSignature&) = delete;
-  TensorSignature(TensorSignature&&) = delete;
-
-  explicit TensorSignature(const std::string& t_name) : name(t_name) {}
-  explicit TensorSignature(const Backend& t_backend) : backend_set(t_backend) {}
-  explicit TensorSignature(const BackendSet& t_backend_set)
-      : backend_set(t_backend_set) {}
-  TensorSignature(const std::string& t_name, const BackendSet& t_backend_set)
-      : name(t_name), backend_set(t_backend_set) {}
-};
-
-}  // namespace experimental
-}  // namespace paddle
diff --git a/paddle/pten/hapi/lib/creation.cc b/paddle/pten/hapi/lib/creation.cc
index 5e32ffa59637d..046a76e13295b 100644
--- a/paddle/pten/hapi/lib/creation.cc
+++ b/paddle/pten/hapi/lib/creation.cc
@@ -56,7 +56,6 @@ Tensor full_like(const Tensor& x,
       std::make_shared<pten::DenseTensor>(out_meta, pten::TensorStatus());
   kernel_context.EmplaceBackOutput(dense_out);
   out.set_impl(dense_out);
-  out.set_backend_set(x.backend_set());
 
   // 6. Call kernel
   kernel(&kernel_context);
diff --git a/paddle/pten/hapi/lib/kernel_dispatch.h b/paddle/pten/hapi/lib/kernel_dispatch.h
index 95410ee942012..d7190076bf3f6 100644
--- a/paddle/pten/hapi/lib/kernel_dispatch.h
+++ b/paddle/pten/hapi/lib/kernel_dispatch.h
@@ -24,6 +24,7 @@ limitations under the License. */
 #include "paddle/pten/hapi/include/tensor.h"
 
 // TODO(chenweihang): split KernelName, Key, Kernel, Factory into diff files
+#include "paddle/pten/core/convert_utils.h"
 #include "paddle/pten/core/kernel_factory.h"
 
 // See Note [ Why still include the fluid headers? ]
@@ -39,6 +40,19 @@ using CUDAContext = paddle::platform::CUDADeviceContext;
 #endif
 
 namespace detail {
+BackendSet GetTensorBackendSet(const Tensor& t) {
+  BackendSet backend_set(pten::TransToPtenBackend(t.place()));
+  switch (t.layout()) {
+    case DataLayout::MKLDNN:
+      backend_set = backend_set | BackendSet(Backend::MKLDNN);
+      break;
+    default:
+      // do nothing
+      break;
+  }
+  return backend_set;
+}
+
 std::size_t CountLeadingZeros(uint64_t val) {
   if (val == 0) {
     return 64;
@@ -102,7 +116,7 @@ struct KernelKeyParser : ArgsIterator<KernelKeyParser> {
   // TODO(chenweihang): deal with multiple diff input Tensors
   // TODO(chenweihang): add global device guard method to set backend
   void operator()(const Tensor& x) {
-    key_set.backend_set = key_set.backend_set | x.backend_set();
+    key_set.backend_set = key_set.backend_set | detail::GetTensorBackendSet(x);
     // TODO(chenweihang): selecte multi layout and dtype
     key_set.layout = x.layout();
     key_set.dtype = x.type();
diff --git a/paddle/pten/hapi/lib/linalg.cc b/paddle/pten/hapi/lib/linalg.cc
index f973696da49aa..1269702f28f91 100644
--- a/paddle/pten/hapi/lib/linalg.cc
+++ b/paddle/pten/hapi/lib/linalg.cc
@@ -56,7 +56,6 @@ Tensor dot(const Tensor& x, const Tensor& y) {
       std::make_shared<pten::DenseTensor>(out_meta, pten::TensorStatus());
   kernel_context.EmplaceBackOutput(dense_out);
   out.set_impl(dense_out);
-  out.set_backend_set(x.backend_set());
 
   // 6. Call kernel
   kernel(&kernel_context);
diff --git a/paddle/pten/hapi/lib/manipulation.cc b/paddle/pten/hapi/lib/manipulation.cc
index c7c7f99f91afd..4b9b66b9df0bd 100644
--- a/paddle/pten/hapi/lib/manipulation.cc
+++ b/paddle/pten/hapi/lib/manipulation.cc
@@ -50,7 +50,6 @@ Tensor flatten(const Tensor& x, int start_axis, int stop_axis) {
       std::make_shared<pten::DenseTensor>(out_meta, pten::TensorStatus());
   kernel_context.EmplaceBackOutput(dense_out);
   out.set_impl(dense_out);
-  out.set_backend_set(x.backend_set());
 
   // 6. Call kernel
   kernel(&kernel_context);
diff --git a/paddle/pten/hapi/lib/math.cc b/paddle/pten/hapi/lib/math.cc
index 178eb5ac1c07d..851a9bc155cdd 100644
--- a/paddle/pten/hapi/lib/math.cc
+++ b/paddle/pten/hapi/lib/math.cc
@@ -50,7 +50,6 @@ Tensor mean(const Tensor& x) {
       std::make_shared<pten::DenseTensor>(out_meta, pten::TensorStatus());
   kernel_context.EmplaceBackOutput(dense_out);
   out.set_impl(dense_out);
-  out.set_backend_set(x.backend_set());
 
   // 6. Call kernel
   kernel(&kernel_context);

From 24ef6c5698aedb8b2c8ccf85770024a4e4a69511 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Fri, 22 Oct 2021 07:32:43 +0000
Subject: [PATCH 106/125] move scalar and polish enforce

---
 paddle/fluid/framework/operator.cc           |  1 +
 paddle/fluid/imperative/prepared_operator.cc |  1 +
 paddle/pten/api/include/core.h               |  1 -
 paddle/pten/common/backend.h                 |  5 ++++-
 paddle/pten/common/data_type.h               |  7 +++----
 paddle/pten/common/layout.h                  | 12 +++++++-----
 paddle/pten/{core => common}/scalar.h        | 17 ++++++++++++++---
 paddle/pten/core/kernel_utils.h              |  4 ++--
 paddle/pten/hapi/include/backend_set.h       | 11 +++++------
 paddle/pten/hapi/include/creation.h          |  4 ++--
 paddle/pten/hapi/include/tensor.h            |  7 ++++---
 paddle/pten/hapi/lib/creation.cc             |  6 +++---
 paddle/pten/kernels/cpu/creation.h           |  2 +-
 paddle/pten/kernels/cuda/creation.h          |  2 +-
 paddle/pten/kernels/cuda/math.cu             |  8 +++++---
 15 files changed, 53 insertions(+), 35 deletions(-)
 rename paddle/pten/{core => common}/scalar.h (82%)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 2775d0bcf036b..7c63f7c76c921 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -30,6 +30,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/var_type.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/profiler.h"
+#include "paddle/pten/common/scalar.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 04f5a74788e88..2ffb47273f650 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -18,6 +18,7 @@
 #include "paddle/fluid/framework/details/nan_inf_utils.h"
 #include "paddle/fluid/framework/pten_utils.h"
 #include "paddle/fluid/imperative/infer_shape_context.h"
+#include "paddle/pten/common/scalar.h"
 #include "paddle/utils/small_vector.h"
 #ifdef PADDLE_WITH_XPU
 #include "paddle/fluid/platform/xpu/xpu_op_list.h"
diff --git a/paddle/pten/api/include/core.h b/paddle/pten/api/include/core.h
index 3cb852970069d..9a042753d1f73 100644
--- a/paddle/pten/api/include/core.h
+++ b/paddle/pten/api/include/core.h
@@ -19,5 +19,4 @@ limitations under the License. */
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_context.h"
 #include "paddle/pten/core/kernel_factory.h"
-#include "paddle/pten/core/scalar.h"
 #include "paddle/pten/core/tensor_meta.h"
diff --git a/paddle/pten/common/backend.h b/paddle/pten/common/backend.h
index 9808b45b45c7c..e0bf746050a67 100644
--- a/paddle/pten/common/backend.h
+++ b/paddle/pten/common/backend.h
@@ -16,6 +16,8 @@ limitations under the License. */
 
 #include <ostream>
 
+#include "paddle/fluid/platform/enforce.h"
+
 namespace paddle {
 namespace experimental {
 
@@ -78,7 +80,8 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) {
       os << "CUDNN";
       break;
     default:
-      throw std::runtime_error("Invalid Backend type.");
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Invalid enum backend type `%d`.", static_cast<int>(backend)));
   }
   return os;
 }
diff --git a/paddle/pten/common/data_type.h b/paddle/pten/common/data_type.h
index f9c6d032f71ed..2475e4086e731 100644
--- a/paddle/pten/common/data_type.h
+++ b/paddle/pten/common/data_type.h
@@ -18,7 +18,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/complex.h"
 #include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/errors.h"
 #include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
@@ -164,13 +163,13 @@ inline std::ostream& operator<<(std::ostream& os, DataType dtype) {
       os << "complex128";
       break;
     default:
-      // TODO(chenweihang): change to enforce later
-      throw std::runtime_error("Invalid DataType type.");
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Invalid enum data type `%d`.", static_cast<int>(dtype)));
   }
   return os;
 }
 
-inline DataType& operator++(DataType& dtype, int) {
+inline DataType& operator++(DataType dtype, int) {
   dtype =
       DataType(static_cast<std::underlying_type<DataType>::type>(dtype) + 1);
   return dtype;
diff --git a/paddle/pten/common/layout.h b/paddle/pten/common/layout.h
index bcf468824f233..99288bead4ced 100644
--- a/paddle/pten/common/layout.h
+++ b/paddle/pten/common/layout.h
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/fluid/platform/enforce.h"
+
 namespace paddle {
 namespace experimental {
 
@@ -26,8 +28,8 @@ enum class DataLayout {
   NUM_DATA_LAYOUTS,
 };
 
-inline std::ostream& operator<<(std::ostream& os, DataLayout dtype) {
-  switch (dtype) {
+inline std::ostream& operator<<(std::ostream& os, DataLayout layout) {
+  switch (layout) {
     case DataLayout::UNDEFINED:
       os << "Undefined";
       break;
@@ -44,13 +46,13 @@ inline std::ostream& operator<<(std::ostream& os, DataLayout dtype) {
       os << "MKLDNN";
       break;
     default:
-      // TODO(chenweihang): change to enforce later
-      throw std::runtime_error("Invalid DataLayout type.");
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Invalid enum data layout type `%d`.", static_cast<int>(layout)));
   }
   return os;
 }
 
-inline DataLayout& operator++(DataLayout& layout, int) {
+inline DataLayout& operator++(DataLayout layout, int) {
   layout = DataLayout(
       static_cast<std::underlying_type<DataLayout>::type>(layout) + 1);
   return layout;
diff --git a/paddle/pten/core/scalar.h b/paddle/pten/common/scalar.h
similarity index 82%
rename from paddle/pten/core/scalar.h
rename to paddle/pten/common/scalar.h
index f8cdd43cc5e4c..c55b700979ac4 100644
--- a/paddle/pten/core/scalar.h
+++ b/paddle/pten/common/scalar.h
@@ -14,7 +14,12 @@ limitations under the License. */
 
 #pragma once
 
-namespace pten {
+#include <cstdint>
+
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace experimental {
 
 class Scalar {
  public:
@@ -43,7 +48,8 @@ class Scalar {
       case Tag::HAS_B:
         return static_cast<T>(data_.b);
       default:
-        throw std::runtime_error("Invalid Scalar type.");
+        PADDLE_THROW(platform::errors::InvalidArgument(
+            "Invalid enum scalar type tag `%d`.", static_cast<int>(tag)));
     }
   }
 
@@ -60,4 +66,9 @@ class Scalar {
   } data_;
 };
 
-}  // namespace pten
+}  // namespace experimental
+}  // namespace paddle
+
+namespace pten {
+using Scalar = paddle::experimental::Scalar;
+}
diff --git a/paddle/pten/core/kernel_utils.h b/paddle/pten/core/kernel_utils.h
index 3f8458aed6dfc..c45a81206323e 100644
--- a/paddle/pten/core/kernel_utils.h
+++ b/paddle/pten/core/kernel_utils.h
@@ -14,10 +14,10 @@
 
 #pragma once
 
+#include "paddle/pten/common/scalar.h"
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_context.h"
 #include "paddle/pten/core/kernel_def.h"
-#include "paddle/pten/core/scalar.h"
 
 // See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/device_context.h"
@@ -163,7 +163,7 @@ struct KernelImpl<Return (*)(Args...), kernel_fn> {
   PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int);
   PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int64_t);
   PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(paddle::platform::float16);
-  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const pten::Scalar&);
+  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const Scalar&);
 
   /* Output Helpers */
 
diff --git a/paddle/pten/hapi/include/backend_set.h b/paddle/pten/hapi/include/backend_set.h
index 00f59b45a188f..e01c195e95530 100644
--- a/paddle/pten/hapi/include/backend_set.h
+++ b/paddle/pten/hapi/include/backend_set.h
@@ -16,9 +16,8 @@ limitations under the License. */
 
 #include <ostream>
 
-// TODO(chenweihang): move this file into hapi/include when compile
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/pten/common/backend.h"
-
 namespace paddle {
 namespace experimental {
 
@@ -39,10 +38,10 @@ class BackendSet final {
   uint64_t bitset() const { return bitset_; }
 
   bool inline Has(Backend b) const {
-    // TODO(chenweihang): replace by internal assert method later
-    if (b == Backend::UNDEFINED) {
-      throw std::runtime_error("Backend argument can't be UNDEFINED.");
-    }
+    PADDLE_ENFORCE_NE(b,
+                      Backend::UNDEFINED,
+                      platform::errors::InvalidArgument(
+                          "Backend argument can't be UNDEFINED."));
     return static_cast<bool>(bitset_ & BackendSet(b).bitset());
   }
   bool IsEmpty() const { return bitset_ == 0; }
diff --git a/paddle/pten/hapi/include/creation.h b/paddle/pten/hapi/include/creation.h
index f1c4c06b42622..6f978be995273 100644
--- a/paddle/pten/hapi/include/creation.h
+++ b/paddle/pten/hapi/include/creation.h
@@ -15,14 +15,14 @@
 #pragma once
 
 #include "paddle/pten/common/data_type.h"
-#include "paddle/pten/core/scalar.h"
+#include "paddle/pten/common/scalar.h"
 #include "paddle/pten/hapi/include/tensor.h"
 
 namespace paddle {
 namespace experimental {
 
 Tensor full_like(const Tensor& x,
-                 const pten::Scalar& value,
+                 const Scalar& value,
                  DataType dtype = DataType::UNDEFINED);
 
 Tensor ones_like(const Tensor& x, DataType dtype = DataType::UNDEFINED);
diff --git a/paddle/pten/hapi/include/tensor.h b/paddle/pten/hapi/include/tensor.h
index 393332eefa119..66ea7853541bd 100644
--- a/paddle/pten/hapi/include/tensor.h
+++ b/paddle/pten/hapi/include/tensor.h
@@ -39,6 +39,7 @@ limitations under the License. */
  * or the corresponding components will be re-implemented.
  */
 #include "paddle/fluid/framework/ddim.h"
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 
 namespace paddle {
@@ -93,9 +94,9 @@ class Tensor final {
    */
   explicit Tensor(std::shared_ptr<pten::TensorBase> tensor_impl)
       : impl_(std::move(tensor_impl)) {
-    if (impl_.get() == nullptr) {
-      throw std::runtime_error("TensorImpl with nullptr is not supported");
-    }
+    PADDLE_ENFORCE_NOT_NULL(impl_,
+                            platform::errors::InvalidArgument(
+                                "TensorImpl with nullptr is not supported"));
   }
 
   /* Part 2: Dimension, DataType and DataLayout methods */
diff --git a/paddle/pten/hapi/lib/creation.cc b/paddle/pten/hapi/lib/creation.cc
index 046a76e13295b..5048b983b122f 100644
--- a/paddle/pten/hapi/lib/creation.cc
+++ b/paddle/pten/hapi/lib/creation.cc
@@ -26,7 +26,7 @@ namespace paddle {
 namespace experimental {
 
 Tensor full_like(const Tensor& x,
-                 const pten::Scalar& value,
+                 const Scalar& value,
                  paddle::experimental::DataType dtype) {
   // 1. Get kernel signature and kernel
   auto kernel_key_set = ParseKernelKeyByInputArgs(x);
@@ -63,11 +63,11 @@ Tensor full_like(const Tensor& x,
   return out;
 }
 
-Tensor ones_like(const Tensor& x, paddle::experimental::DataType dtype) {
+Tensor ones_like(const Tensor& x, DataType dtype) {
   return full_like(x, 1, dtype);
 }
 
-Tensor zeros_like(const Tensor& x, paddle::experimental::DataType dtype) {
+Tensor zeros_like(const Tensor& x, DataType dtype) {
   return full_like(x, 0, dtype);
 }
 
diff --git a/paddle/pten/kernels/cpu/creation.h b/paddle/pten/kernels/cpu/creation.h
index 7674e6bb05157..9991df315556d 100644
--- a/paddle/pten/kernels/cpu/creation.h
+++ b/paddle/pten/kernels/cpu/creation.h
@@ -14,8 +14,8 @@
 
 #pragma once
 
+#include "paddle/pten/common/scalar.h"
 #include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/core/scalar.h"
 
 #include "paddle/fluid/platform/device_context.h"
 
diff --git a/paddle/pten/kernels/cuda/creation.h b/paddle/pten/kernels/cuda/creation.h
index 21772f1f98d07..84a868e917ba1 100644
--- a/paddle/pten/kernels/cuda/creation.h
+++ b/paddle/pten/kernels/cuda/creation.h
@@ -17,8 +17,8 @@
 // CUDA and HIP use same api
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 
+#include "paddle/pten/common/scalar.h"
 #include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/core/scalar.h"
 
 #include "paddle/fluid/platform/device_context.h"
 
diff --git a/paddle/pten/kernels/cuda/math.cu b/paddle/pten/kernels/cuda/math.cu
index 1306cdc8017e6..4ebe58629545e 100644
--- a/paddle/pten/kernels/cuda/math.cu
+++ b/paddle/pten/kernels/cuda/math.cu
@@ -26,6 +26,7 @@ limitations under the License. */
 namespace cub = hipcub;
 #endif
 
+#include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/pten/core/convert_utils.h"
 #include "paddle/pten/core/kernel_registry.h"
@@ -104,9 +105,10 @@ void ScaleHost(const CUDAContext& dev_ctx,
                float bias,
                bool bias_after_scale,
                DenseTensor* out) {
-  if (paddle::platform::is_gpu_place(scale.place())) {
-    throw std::runtime_error("scale host place error.");
-  }
+  PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(scale.place()),
+                    false,
+                    paddle::platform::errors::InvalidArgument(
+                        "Scale argument isn't a host tensor."));
   eigen::Scale<CUDAContext, T>(dev_ctx,
                                x,
                                static_cast<float>(*scale.data<T>()),

From 1685b670559c995051b16cfd71c35ea1ebb77b92 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Fri, 22 Oct 2021 08:28:03 +0000
Subject: [PATCH 107/125] revert dtype layout change to fix error

---
 paddle/pten/common/data_type.h | 2 +-
 paddle/pten/common/layout.h    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/pten/common/data_type.h b/paddle/pten/common/data_type.h
index 2475e4086e731..af0548cbda581 100644
--- a/paddle/pten/common/data_type.h
+++ b/paddle/pten/common/data_type.h
@@ -169,7 +169,7 @@ inline std::ostream& operator<<(std::ostream& os, DataType dtype) {
   return os;
 }
 
-inline DataType& operator++(DataType dtype, int) {
+inline DataType& operator++(DataType& dtype, int) {
   dtype =
       DataType(static_cast<std::underlying_type<DataType>::type>(dtype) + 1);
   return dtype;
diff --git a/paddle/pten/common/layout.h b/paddle/pten/common/layout.h
index 99288bead4ced..8e14f98625051 100644
--- a/paddle/pten/common/layout.h
+++ b/paddle/pten/common/layout.h
@@ -52,7 +52,7 @@ inline std::ostream& operator<<(std::ostream& os, DataLayout layout) {
   return os;
 }
 
-inline DataLayout& operator++(DataLayout layout, int) {
+inline DataLayout& operator++(DataLayout& layout, int) {
   layout = DataLayout(
       static_cast<std::underlying_type<DataLayout>::type>(layout) + 1);
   return layout;

From 7b7e98838f665013018d52242365b7dfc04da0ac Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Fri, 22 Oct 2021 09:31:42 +0000
Subject: [PATCH 108/125] fix enum operator override error

---
 paddle/pten/common/data_type.h     |  6 ------
 paddle/pten/common/layout.h        |  6 ------
 paddle/pten/core/kernel_registry.h | 14 ++++++++------
 paddle/pten/hapi/include/linalg.h  |  5 -----
 4 files changed, 8 insertions(+), 23 deletions(-)

diff --git a/paddle/pten/common/data_type.h b/paddle/pten/common/data_type.h
index af0548cbda581..f5383da31cf93 100644
--- a/paddle/pten/common/data_type.h
+++ b/paddle/pten/common/data_type.h
@@ -169,12 +169,6 @@ inline std::ostream& operator<<(std::ostream& os, DataType dtype) {
   return os;
 }
 
-inline DataType& operator++(DataType& dtype, int) {
-  dtype =
-      DataType(static_cast<std::underlying_type<DataType>::type>(dtype) + 1);
-  return dtype;
-}
-
 }  // namespace experimental
 }  // namespace paddle
 
diff --git a/paddle/pten/common/layout.h b/paddle/pten/common/layout.h
index 8e14f98625051..0da10dff4335b 100644
--- a/paddle/pten/common/layout.h
+++ b/paddle/pten/common/layout.h
@@ -52,12 +52,6 @@ inline std::ostream& operator<<(std::ostream& os, DataLayout layout) {
   return os;
 }
 
-inline DataLayout& operator++(DataLayout& layout, int) {
-  layout = DataLayout(
-      static_cast<std::underlying_type<DataLayout>::type>(layout) + 1);
-  return layout;
-}
-
 }  // namespace experimental
 }  // namespace paddle
 
diff --git a/paddle/pten/core/kernel_registry.h b/paddle/pten/core/kernel_registry.h
index b77f641b9f51b..adfe0d98b68f7 100644
--- a/paddle/pten/core/kernel_registry.h
+++ b/paddle/pten/core/kernel_registry.h
@@ -107,22 +107,24 @@ struct KernelRegistrar {
                   KernelArgsDefFn args_def_fn,
                   KernelFn kernel_fn) {
     if (layout == DataLayout::ANY) {
-      for (DataLayout layout_iter = DataLayout::NHWC;
-           layout_iter != DataLayout::NUM_DATA_LAYOUTS;
+      for (size_t layout_iter = static_cast<size_t>(DataLayout::NHWC);
+           layout_iter != static_cast<size_t>(DataLayout::NUM_DATA_LAYOUTS);
            layout_iter++) {
-        for (DataType dtype = DataType::BOOL; dtype != DataType::NUM_DATA_TYPES;
+        for (size_t dtype = static_cast<size_t>(DataType::BOOL);
+             dtype != static_cast<size_t>(DataType::NUM_DATA_TYPES);
              dtype++) {
           ConstructKernel(kernel_name_cstr,
                           backend,
-                          layout_iter,
-                          dtype,
+                          static_cast<DataLayout>(layout_iter),
+                          static_cast<DataType>(dtype),
                           args_parse_fn,
                           args_def_fn,
                           kernel_fn);
         }
       }
     } else {
-      for (DataType dtype = DataType::BOOL; dtype != DataType::NUM_DATA_TYPES;
+      for (size_t dtype = static_cast<size_t>(DataType::BOOL);
+           dtype != static_cast<size_t>(DataType::NUM_DATA_TYPES);
            dtype++) {
         ConstructKernel(kernel_name_cstr,
                         backend,
diff --git a/paddle/pten/hapi/include/linalg.h b/paddle/pten/hapi/include/linalg.h
index 6e78b50af11c3..fd628ea19334e 100644
--- a/paddle/pten/hapi/include/linalg.h
+++ b/paddle/pten/hapi/include/linalg.h
@@ -21,10 +21,5 @@ namespace experimental {
 
 Tensor dot(const Tensor& x, const Tensor& y);
 
-Tensor matmul(const Tensor& x,
-              const Tensor& y,
-              bool transpose_x,
-              bool transpose_y);
-
 }  // namespace experimental
 }  // namespace paddle

From 52fead064982bf6ad9aab5b53ac38c065cd919bc Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Fri, 22 Oct 2021 14:24:12 +0000
Subject: [PATCH 109/125] add several base unittests

---
 paddle/pten/hapi/include/math.h          |  2 +
 paddle/pten/tests/CMakeLists.txt         |  3 ++
 paddle/pten/tests/backend_test.cc        | 32 +++++++++++
 paddle/pten/tests/data_layout_test.cc    | 44 +++++++++++++++
 paddle/pten/tests/data_type_test.cc      | 68 ++++++++++++++++++++++++
 paddle/pten/tests/dense_tensor_test.cc   | 12 -----
 paddle/pten/tests/dtype_test.cc          | 13 -----
 paddle/pten/tests/kernel_factory_test.cc | 28 +++++++++-
 paddle/pten/tests/layout_test.cc         | 13 -----
 paddle/pten/tests/test_dot_api.cc        |  1 +
 paddle/pten/tests/test_fill_api.cc       |  1 +
 paddle/pten/tests/test_flatten_api.cc    |  1 +
 paddle/pten/tests/test_mean_api.cc       |  1 +
 13 files changed, 179 insertions(+), 40 deletions(-)
 create mode 100644 paddle/pten/tests/data_layout_test.cc
 create mode 100644 paddle/pten/tests/data_type_test.cc
 delete mode 100644 paddle/pten/tests/dtype_test.cc
 delete mode 100644 paddle/pten/tests/layout_test.cc

diff --git a/paddle/pten/hapi/include/math.h b/paddle/pten/hapi/include/math.h
index 0b3dbab70e86f..db4010c1c14e3 100644
--- a/paddle/pten/hapi/include/math.h
+++ b/paddle/pten/hapi/include/math.h
@@ -19,6 +19,8 @@ limitations under the License. */
 namespace paddle {
 namespace experimental {
 
+// TODO(chenweihang): add scale API
+// TODO(chenweihang): move mean API into stat.h/cc
 Tensor mean(const Tensor& x);
 
 }  // namespace experimental
diff --git a/paddle/pten/tests/CMakeLists.txt b/paddle/pten/tests/CMakeLists.txt
index 5cc7a3f4cc77e..d30ac2578d00b 100644
--- a/paddle/pten/tests/CMakeLists.txt
+++ b/paddle/pten/tests/CMakeLists.txt
@@ -1,3 +1,6 @@
+cc_test(pten_backend_test SRCS backend_test.cc DEPS gtest)
+cc_test(pten_data_layout_test SRCS data_layout_test.cc DEPS gtest)
+cc_test(pten_data_type_test SRCS data_type_test.cc DEPS gtest)
 cc_test(dense_tensor_test SRCS dense_tensor_test.cc DEPS dense_tensor)
 cc_test(kernel_factory_test SRCS kernel_factory_test.cc DEPS kernel_factory)
 cc_test(test_mean_api SRCS test_mean_api.cc DEPS math_api)
diff --git a/paddle/pten/tests/backend_test.cc b/paddle/pten/tests/backend_test.cc
index c1f756f11ad72..2bae2cd417165 100644
--- a/paddle/pten/tests/backend_test.cc
+++ b/paddle/pten/tests/backend_test.cc
@@ -15,3 +15,35 @@ limitations under the License. */
 #include "paddle/pten/common/backend.h"
 
 #include <gtest/gtest.h>
+#include <iostream>
+
+TEST(Backend, OStream) {
+  std::ostringstream oss;
+  oss << pten::Backend::UNDEFINED;
+  EXPECT_EQ(oss.str(), "Undefined");
+  oss.str("");
+  oss << pten::Backend::CPU;
+  EXPECT_EQ(oss.str(), "CPU");
+  oss.str("");
+  oss << pten::Backend::CUDA;
+  EXPECT_EQ(oss.str(), "CUDA");
+  oss.str("");
+  oss << pten::Backend::XPU;
+  EXPECT_EQ(oss.str(), "XPU");
+  oss.str("");
+  oss << pten::Backend::NPU;
+  EXPECT_EQ(oss.str(), "NPU");
+  oss.str("");
+  oss << pten::Backend::MKLDNN;
+  EXPECT_EQ(oss.str(), "MKLDNN");
+  oss.str("");
+  oss << pten::Backend::CUDNN;
+  EXPECT_EQ(oss.str(), "CUDNN");
+  oss.str("");
+  try {
+    oss << pten::Backend::NUM_BACKENDS;
+  } catch (paddle::platform::EnforceNotMet &exception) {
+    std::string ex_msg = exception.what();
+    EXPECT_TRUE(ex_msg.find("Invalid enum backend type") != std::string::npos);
+  }
+}
diff --git a/paddle/pten/tests/data_layout_test.cc b/paddle/pten/tests/data_layout_test.cc
new file mode 100644
index 0000000000000..efa19670f25be
--- /dev/null
+++ b/paddle/pten/tests/data_layout_test.cc
@@ -0,0 +1,44 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <iostream>
+#include <sstream>
+#include "paddle/pten/common/layout.h"
+
+TEST(DataLayout, OStream) {
+  std::ostringstream oss;
+  oss << pten::DataLayout::UNDEFINED;
+  EXPECT_EQ(oss.str(), "Undefined");
+  oss.str("");
+  oss << pten::DataLayout::ANY;
+  EXPECT_EQ(oss.str(), "Any");
+  oss.str("");
+  oss << pten::DataLayout::NHWC;
+  EXPECT_EQ(oss.str(), "NHWC");
+  oss.str("");
+  oss << pten::DataLayout::NCHW;
+  EXPECT_EQ(oss.str(), "NCHW");
+  oss.str("");
+  oss << pten::DataLayout::MKLDNN;
+  EXPECT_EQ(oss.str(), "MKLDNN");
+  oss.str("");
+  try {
+    oss << pten::DataLayout::NUM_DATA_LAYOUTS;
+  } catch (paddle::platform::EnforceNotMet &exception) {
+    std::string ex_msg = exception.what();
+    EXPECT_TRUE(ex_msg.find("Invalid enum data layout type") !=
+                std::string::npos);
+  }
+}
diff --git a/paddle/pten/tests/data_type_test.cc b/paddle/pten/tests/data_type_test.cc
new file mode 100644
index 0000000000000..bcdef84040523
--- /dev/null
+++ b/paddle/pten/tests/data_type_test.cc
@@ -0,0 +1,68 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pten/common/data_type.h"
+
+#include <gtest/gtest.h>
+#include <iostream>
+#include <sstream>
+
+TEST(DataType, OStream) {
+  std::ostringstream oss;
+  oss << pten::DataType::UNDEFINED;
+  EXPECT_EQ(oss.str(), "Undefined");
+  oss.str("");
+  oss << pten::DataType::BOOL;
+  EXPECT_EQ(oss.str(), "bool");
+  oss.str("");
+  oss << pten::DataType::INT8;
+  EXPECT_EQ(oss.str(), "int8");
+  oss.str("");
+  oss << pten::DataType::UINT8;
+  EXPECT_EQ(oss.str(), "uint8");
+  oss.str("");
+  oss << pten::DataType::INT16;
+  EXPECT_EQ(oss.str(), "int16");
+  oss.str("");
+  oss << pten::DataType::INT32;
+  EXPECT_EQ(oss.str(), "int32");
+  oss.str("");
+  oss << pten::DataType::INT64;
+  EXPECT_EQ(oss.str(), "int64");
+  oss.str("");
+  oss << pten::DataType::BFLOAT16;
+  EXPECT_EQ(oss.str(), "bfloat16");
+  oss.str("");
+  oss << pten::DataType::FLOAT16;
+  EXPECT_EQ(oss.str(), "float16");
+  oss.str("");
+  oss << pten::DataType::FLOAT32;
+  EXPECT_EQ(oss.str(), "float32");
+  oss.str("");
+  oss << pten::DataType::FLOAT64;
+  EXPECT_EQ(oss.str(), "float64");
+  oss.str("");
+  oss << pten::DataType::COMPLEX64;
+  EXPECT_EQ(oss.str(), "complex64");
+  oss.str("");
+  oss << pten::DataType::COMPLEX128;
+  EXPECT_EQ(oss.str(), "complex128");
+  oss.str("");
+  try {
+    oss << pten::DataType::NUM_DATA_TYPES;
+  } catch (paddle::platform::EnforceNotMet &exception) {
+    std::string ex_msg = exception.what();
+    EXPECT_TRUE(ex_msg.find("Invalid enum data type") != std::string::npos);
+  }
+}
diff --git a/paddle/pten/tests/dense_tensor_test.cc b/paddle/pten/tests/dense_tensor_test.cc
index bae660ac1c120..722eab17ec412 100644
--- a/paddle/pten/tests/dense_tensor_test.cc
+++ b/paddle/pten/tests/dense_tensor_test.cc
@@ -31,15 +31,3 @@ TEST(DenseTensor, Constructor) {
   ASSERT_EQ(tensor.data_type(), pten::DataType::FLOAT32);
   ASSERT_EQ(tensor.layout(), pten::DataLayout::NCHW);
 }
-
-TEST(DenseTensor, Dims) {
-  // impl later
-}
-
-TEST(DenseTensor, Place) {
-  // impl later
-}
-
-TEST(DenseTensor, Data) {
-  // impl later
-}
diff --git a/paddle/pten/tests/dtype_test.cc b/paddle/pten/tests/dtype_test.cc
deleted file mode 100644
index b2b09faaa9d44..0000000000000
--- a/paddle/pten/tests/dtype_test.cc
+++ /dev/null
@@ -1,13 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
diff --git a/paddle/pten/tests/kernel_factory_test.cc b/paddle/pten/tests/kernel_factory_test.cc
index e52bb99ca16fa..c1c17171b5898 100644
--- a/paddle/pten/tests/kernel_factory_test.cc
+++ b/paddle/pten/tests/kernel_factory_test.cc
@@ -12,12 +12,36 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <iostream>
+#include <sstream>
+
 #include "paddle/pten/core/kernel_factory.h"
 
 #include "gtest/gtest.h"
 
-TEST(KernelFactory, KernelKey) {
+// TODO(chenweihang): add more unittests later
+
+TEST(KernelName, ConstructAndOStream) {
+  std::ostringstream oss;
+  oss << pten::KernelName("scale", "host");
+  EXPECT_EQ(oss.str(), "scale.host");
+  pten::KernelName kernel_name1("scale.host");
+  EXPECT_EQ(kernel_name1.name(), "scale");
+  EXPECT_EQ(kernel_name1.overload_name(), "host");
+  pten::KernelName kernel_name2("scale.host");
+  EXPECT_EQ(kernel_name2.name(), "scale");
+  EXPECT_EQ(kernel_name2.overload_name(), "host");
+}
+
+TEST(KernelKey, ConstructAndOStream) {
   pten::KernelKey key(
       pten::Backend::CPU, pten::DataLayout::NCHW, pten::DataType::FLOAT32);
-  std::cout << key;
+  EXPECT_EQ(key.backend(), pten::Backend::CPU);
+  EXPECT_EQ(key.layout(), pten::DataLayout::NCHW);
+  EXPECT_EQ(key.dtype(), pten::DataType::FLOAT32);
+  std::ostringstream oss;
+  oss << key;
+  std::cout << oss.str();
+  // EXPECT_EQ(oss.str(), "scale.host");
+  oss.flush();
 }
diff --git a/paddle/pten/tests/layout_test.cc b/paddle/pten/tests/layout_test.cc
deleted file mode 100644
index b2b09faaa9d44..0000000000000
--- a/paddle/pten/tests/layout_test.cc
+++ /dev/null
@@ -1,13 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
diff --git a/paddle/pten/tests/test_dot_api.cc b/paddle/pten/tests/test_dot_api.cc
index e567f090bd51d..affa18469ec21 100644
--- a/paddle/pten/tests/test_dot_api.cc
+++ b/paddle/pten/tests/test_dot_api.cc
@@ -29,6 +29,7 @@ PT_DECLARE_MODULE(LinalgCUDA);
 namespace framework = paddle::framework;
 using DDim = paddle::framework::DDim;
 
+// TODO(chenweihang): Remove this test after the API is used in the dygraph
 TEST(API, dot) {
   // 1. create tensor
   auto dense_x = std::make_shared<pten::DenseTensor>(
diff --git a/paddle/pten/tests/test_fill_api.cc b/paddle/pten/tests/test_fill_api.cc
index ec69c01b88258..afb36f95e8a1e 100644
--- a/paddle/pten/tests/test_fill_api.cc
+++ b/paddle/pten/tests/test_fill_api.cc
@@ -29,6 +29,7 @@ PT_DECLARE_MODULE(CreationCUDA);
 namespace framework = paddle::framework;
 using DDim = paddle::framework::DDim;
 
+// TODO(chenweihang): Remove this test after the API is used in the dygraph
 TEST(API, full_like) {
   // 1. create tensor
   auto dense_x = std::make_shared<pten::DenseTensor>(
diff --git a/paddle/pten/tests/test_flatten_api.cc b/paddle/pten/tests/test_flatten_api.cc
index 12a5e3266ec19..7f68cd75bc8d2 100644
--- a/paddle/pten/tests/test_flatten_api.cc
+++ b/paddle/pten/tests/test_flatten_api.cc
@@ -29,6 +29,7 @@ PT_DECLARE_MODULE(ManipulationCUDA);
 namespace framework = paddle::framework;
 using DDim = paddle::framework::DDim;
 
+// TODO(chenweihang): Remove this test after the API is used in the dygraph
 TEST(API, flatten) {
   // 1. create tensor
   auto dense_x = std::make_shared<pten::DenseTensor>(
diff --git a/paddle/pten/tests/test_mean_api.cc b/paddle/pten/tests/test_mean_api.cc
index eb41058316415..9c0472916e01d 100644
--- a/paddle/pten/tests/test_mean_api.cc
+++ b/paddle/pten/tests/test_mean_api.cc
@@ -29,6 +29,7 @@ PT_DECLARE_MODULE(MathCUDA);
 namespace framework = paddle::framework;
 using DDim = paddle::framework::DDim;
 
+// TODO(chenweihang): Remove this test after the API is used in the dygraph
 TEST(API, mean) {
   // 1. create tensor
   auto dense_x = std::make_shared<pten::DenseTensor>(

From 2ff27213c31516aa49460630b07997d688a124c7 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Sat, 23 Oct 2021 09:26:29 +0000
Subject: [PATCH 110/125] add pten utils tests

---
 paddle/fluid/framework/pten_utils.cc      |  4 --
 paddle/fluid/framework/pten_utils_test.cc | 82 ++++++++++++++++++-----
 2 files changed, 65 insertions(+), 21 deletions(-)

diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/pten_utils.cc
index e0e43db139065..9dac142557ed4 100644
--- a/paddle/fluid/framework/pten_utils.cc
+++ b/paddle/fluid/framework/pten_utils.cc
@@ -37,8 +37,6 @@ std::shared_ptr<pten::DenseTensor> MakeTensorImpl<pten::DenseTensor, LoDTensor>(
 
   if (holder != nullptr) {
     tensor_impl->ShareAllocation(tensor.Holder());
-  } else {
-    VLOG(1) << "Old LoDTensor holder is nullptr.";
   }
   return tensor_impl;
 }
@@ -55,8 +53,6 @@ std::shared_ptr<pten::DenseTensor> MakeTensorImpl<pten::DenseTensor, Tensor>(
 
   if (holder != nullptr) {
     tensor_impl->ShareAllocation(tensor.Holder());
-  } else {
-    VLOG(1) << "Old Tensor holder is nullptr.";
   }
   return tensor_impl;
 }
diff --git a/paddle/fluid/framework/pten_utils_test.cc b/paddle/fluid/framework/pten_utils_test.cc
index b3f0e516a4781..33c55a8086b4e 100644
--- a/paddle/fluid/framework/pten_utils_test.cc
+++ b/paddle/fluid/framework/pten_utils_test.cc
@@ -18,20 +18,18 @@ limitations under the License. */
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/variable.h"
 
-namespace paddle {
-namespace framework {
-
-TEST(TcmptUtils, MakeTensor) {
+TEST(PtenUtils, FluidTensorToPtenTensor) {
   // 1. create tensor
-  LoDTensor x;
-  Tensor x2;
+  paddle::framework::LoDTensor x;
+  paddle::framework::Tensor x2;
   x.Resize({2});
-  x.mutable_data<float>(platform::CPUPlace());
+  x.mutable_data<float>(paddle::platform::CPUPlace());
   x.data<float>()[0] = 0.2;
   x.data<float>()[1] = 0.5;
 
   // 2. test API
-  auto dense_x = MakeTensorImpl<pten::DenseTensor>(x, x.place(), x.type());
+  auto dense_x = paddle::framework::MakeTensorImpl<pten::DenseTensor>(
+      x, x.place(), x.type());
 
   // 3. check result
   std::vector<float> expect_value = {0.2, 0.5};
@@ -41,13 +39,13 @@ TEST(TcmptUtils, MakeTensor) {
   ASSERT_EQ(dense_x->data_type(), pten::DataType::FLOAT32);
 }
 
-TEST(TcmptUtils, VarToPtenTensor) {
+TEST(PtenUtils, VarToPtenTensor) {
   // 1. create Variable
-  Variable v;
-  auto selected_rows = v.GetMutable<SelectedRows>();
-  Tensor* value = selected_rows->mutable_value();
-  auto* data =
-      value->mutable_data<int>(make_ddim({1, 1}), paddle::platform::CPUPlace());
+  paddle::framework::Variable v;
+  auto selected_rows = v.GetMutable<paddle::framework::SelectedRows>();
+  paddle::framework::Tensor* value = selected_rows->mutable_value();
+  auto* data = value->mutable_data<int>(paddle::framework::make_ddim({1, 1}),
+                                        paddle::platform::CPUPlace());
   data[0] = 123;
   pten::Backend expect_backend = pten::Backend::CPU;
 
@@ -57,11 +55,61 @@ TEST(TcmptUtils, VarToPtenTensor) {
   auto tensor_def = pten::TensorArgDef(expect_backend, pten::DataLayout::NCHW,
                                        pten::DataType::INT32);
   // 2. test API
-  auto tensor_x = InputVariableToPtenTensor(v, tensor_def);
+  auto tensor_x = paddle::framework::InputVariableToPtenTensor(v, tensor_def);
   // 3. check result
   ASSERT_EQ(tensor_x->backend(), expect_backend);
   ASSERT_EQ(tensor_x->data_type(), pten::DataType::INT32);
 }
 
-}  // namespace framework
-}  // namespace paddle
+TEST(PtenUtils, PtenTensorToFluidTensor) {
+  pten::DenseTensor dense_tensor(
+      pten::TensorMeta(paddle::framework::make_ddim({1, 1}), pten::Backend::CPU,
+                       pten::DataType::FLOAT32, pten::DataLayout::ANY),
+      pten::TensorStatus());
+  auto* data_ptr = dense_tensor.mutable_data<float>();
+  data_ptr[0] = 0.5;
+  // share allocation into fluid Tensor
+  paddle::framework::Tensor tensor;
+  paddle::framework::LoDTensor lod_tensor;
+  paddle::framework::ShareTensorImpl(&dense_tensor, &tensor);
+  paddle::framework::ShareTensorImpl(&dense_tensor, &lod_tensor);
+  // compare
+  ASSERT_EQ(tensor.data<float>()[0], 0.5);
+  ASSERT_EQ(lod_tensor.data<float>()[0], 0.5);
+}
+
+TEST(PtenUtils, TransPtenKernelKeyToOpKernelType) {
+  pten::KernelKey kernel_key(pten::Backend::CPU, pten::DataLayout::NCHW,
+                             pten::DataType::FLOAT32);
+  auto op_kernel_type =
+      paddle::framework::TransPtenKernelKeyToOpKernelType(kernel_key);
+  ASSERT_EQ(op_kernel_type.data_type_, paddle::framework::proto::VarType::FP32);
+  ASSERT_EQ(op_kernel_type.data_layout_, paddle::framework::DataLayout::kNCHW);
+  ASSERT_TRUE(paddle::platform::is_cpu_place(op_kernel_type.place_));
+  ASSERT_EQ(op_kernel_type.library_type_,
+            paddle::framework::LibraryType::kPlain);
+
+#ifdef PADDLE_WITH_MKLDNN
+  pten::KernelKey kernel_key_mkldnn(
+      pten::Backend::MKLDNN, pten::DataLayout::NCHW, pten::DataType::FLOAT32);
+  op_kernel_type =
+      paddle::framework::TransPtenKernelKeyToOpKernelType(kernel_key_mkldnn);
+  ASSERT_EQ(op_kernel_type.data_type_, paddle::framework::proto::VarType::FP32);
+  ASSERT_EQ(op_kernel_type.data_layout_, paddle::framework::DataLayout::kNCHW);
+  ASSERT_TRUE(paddle::platform::is_cpu_place(op_kernel_type.place_));
+  ASSERT_EQ(op_kernel_type.library_type_,
+            paddle::framework::LibraryType::kMKLDNN);
+#endif
+
+#ifdef PADDLE_WITH_CUDA
+  pten::KernelKey kernel_key_cudnn(pten::Backend::CUDNN, pten::DataLayout::NCHW,
+                                   pten::DataType::FLOAT32);
+  op_kernel_type =
+      paddle::framework::TransPtenKernelKeyToOpKernelType(kernel_key_cudnn);
+  ASSERT_EQ(op_kernel_type.data_type_, paddle::framework::proto::VarType::FP32);
+  ASSERT_EQ(op_kernel_type.data_layout_, paddle::framework::DataLayout::kNCHW);
+  ASSERT_TRUE(paddle::platform::is_gpu_place(op_kernel_type.place_));
+  ASSERT_EQ(op_kernel_type.library_type_,
+            paddle::framework::LibraryType::kCUDNN);
+#endif
+}

From b5c77e51e6376603c1d09fb6c310ca72f6549ba5 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Sun, 24 Oct 2021 13:11:37 +0000
Subject: [PATCH 111/125] polish some details

---
 paddle/fluid/framework/operator.cc           | 10 +++++-----
 paddle/fluid/framework/operator.h            |  2 +-
 paddle/fluid/imperative/prepared_operator.cc | 10 +++++-----
 paddle/fluid/imperative/prepared_operator.h  |  2 +-
 paddle/fluid/platform/flags.cc               |  6 +++---
 paddle/pten/CMakeLists.txt                   |  2 --
 paddle/pten/kernels/CMakeLists.txt           |  2 ++
 paddle/pten/kernels/cpu/linalg.cc            |  8 --------
 paddle/pten/kernels/functions/CMakeLists.txt |  1 +
 paddle/pten/module/CMakeLists.txt            |  0
 10 files changed, 18 insertions(+), 25 deletions(-)
 create mode 100644 paddle/pten/kernels/functions/CMakeLists.txt
 delete mode 100644 paddle/pten/module/CMakeLists.txt

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 7c63f7c76c921..092949d87d25c 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -51,7 +51,7 @@ DECLARE_bool(check_nan_inf);
 DECLARE_bool(enable_unused_var_check);
 PADDLE_DEFINE_EXPORTED_int32(inner_op_parallelism, 0,
                              "number of threads for inner op");
-DECLARE_bool(run_pt_kernel);
+DECLARE_bool(run_pten_kernel);
 
 namespace paddle {
 namespace framework {
@@ -1130,14 +1130,14 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   // TODO(chenweihang): in the first phase of project, we only support CPU, CUDA
   // and RCOM backend, the XPU, NPU and MKLDNN will be supported in the second
   // phase
-  if (FLAGS_run_pt_kernel &&
+  if (FLAGS_run_pten_kernel &&
       pten::KernelFactory::Instance().HasCompatiblePtenKernel(type_)) {
     if (pt_kernel_signature_.get() == nullptr || pt_kernel_.get() == nullptr) {
       ChoosePtenKernel(exe_ctx);
     }
-    run_pt_kernel_ = pt_kernel_->IsValid();
+    run_pten_kernel_ = pt_kernel_->IsValid();
   }
-  if (!run_pt_kernel_) {
+  if (!run_pten_kernel_) {
     if (kernel_type_.get() == nullptr || kernel_func_.get() == nullptr) {
       ChooseKernel(exe_ctx);
     }
@@ -1178,7 +1178,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   {
     platform::RecordEvent record_event("compute",
                                        platform::EventRole::kInnerOp);
-    if (run_pt_kernel_) {
+    if (run_pten_kernel_) {
       auto op_kernel_ctx = BuildPtenKernelContext(*runtime_ctx, *dev_ctx);
       (*pt_kernel_)(&op_kernel_ctx);
     } else {
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 224974001c469..104c5a231375f 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -601,7 +601,7 @@ class OperatorWithKernel : public OperatorBase {
   // NOTE(chenweihang): Similar op members are used to adapt to
   // new pten kernel, if there is a better design in the future,
   // we may polish the implementation here
-  mutable bool run_pt_kernel_ = false;
+  mutable bool run_pten_kernel_ = false;
   mutable std::unique_ptr<KernelSignature> pt_kernel_signature_;
   mutable std::unique_ptr<pten::Kernel> pt_kernel_;
 };
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 2ffb47273f650..004cc3a0c5aa1 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -24,7 +24,7 @@
 #include "paddle/fluid/platform/xpu/xpu_op_list.h"
 #endif
 DECLARE_bool(check_nan_inf);
-DECLARE_bool(run_pt_kernel);
+DECLARE_bool(run_pten_kernel);
 
 namespace paddle {
 namespace imperative {
@@ -118,7 +118,7 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op,
       kernel_type_(kernel_type),
       func_(nullptr),
       dev_ctx_(dev_ctx),
-      run_pt_kernel_(true),
+      run_pten_kernel_(true),
       pt_kernel_signature_(kernel_signature),
       pt_kernel_(pt_kernel) {}
 
@@ -153,7 +153,7 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
   auto expected_kernel_key = op.GetExpectedKernelType(dygraph_exe_ctx);
   VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
 
-  if (FLAGS_run_pt_kernel &&
+  if (FLAGS_run_pten_kernel &&
       pten::KernelFactory::Instance().HasCompatiblePtenKernel(op.Type())) {
     auto pt_kernel_signature = op.GetExpectedPtenKernelArgs(dygraph_exe_ctx);
 
@@ -417,7 +417,7 @@ void PreparedOp::Run(const NameVarMap<VarBase>& ins,
                      const NameVarMap<VarBase>& outs,
                      const framework::AttributeMap& attrs,
                      const framework::AttributeMap& default_attrs) {
-  if (run_pt_kernel_) {
+  if (run_pten_kernel_) {
     PreparedOpRunPtImpl<VarBase>(op_, pt_kernel_signature_, pt_kernel_,
                                  dev_ctx_, ins, outs, attrs, default_attrs);
   } else {
@@ -430,7 +430,7 @@ void PreparedOp::Run(const NameVarMap<VariableWrapper>& ins,
                      const NameVarMap<VariableWrapper>& outs,
                      const framework::AttributeMap& attrs,
                      const framework::AttributeMap& default_attrs) {
-  if (run_pt_kernel_) {
+  if (run_pten_kernel_) {
     PreparedOpRunPtImpl<VariableWrapper>(op_, pt_kernel_signature_, pt_kernel_,
                                          dev_ctx_, ins, outs, attrs,
                                          default_attrs);
diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h
index 42bd581b9f24a..a2ff0aeec1a90 100644
--- a/paddle/fluid/imperative/prepared_operator.h
+++ b/paddle/fluid/imperative/prepared_operator.h
@@ -190,7 +190,7 @@ class PreparedOp {
   // NOTE(chenweihang): Similar op members are used to adapt to
   // new pten kernel, if there is a better design in the future,
   // we may polish the implementation here
-  bool run_pt_kernel_{false};
+  bool run_pten_kernel_{false};
   framework::KernelSignature pt_kernel_signature_;
   pten::Kernel pt_kernel_;
 };
diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
index c3d63f6eb2745..070d88076a824 100644
--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -683,16 +683,16 @@ PADDLE_DEFINE_EXPORTED_bool(
 
 /**
  * Pt kernel related FLAG
- * Name: FLAGS_run_pt_kernel
+ * Name: FLAGS_run_pten_kernel
  * Since Version: 2.2.0
  * Value Range: bool, default=false
- * Example: FLAGS_run_pt_kernel=true would use the pt kernel to compute in the
+ * Example: FLAGS_run_pten_kernel=true would use the pt kernel to compute in the
  * Op.
  * Note:
  */
 // TODO(chentianyu03): change default value to false before merge into develop
 // branch
-PADDLE_DEFINE_EXPORTED_bool(run_pt_kernel, true,
+PADDLE_DEFINE_EXPORTED_bool(run_pten_kernel, true,
                             "It controls whether to use pt kernel");
 
 /**
diff --git a/paddle/pten/CMakeLists.txt b/paddle/pten/CMakeLists.txt
index 4fc1c7f18e54f..c1fe2d552af13 100644
--- a/paddle/pten/CMakeLists.txt
+++ b/paddle/pten/CMakeLists.txt
@@ -8,7 +8,5 @@ add_subdirectory(core)
 add_subdirectory(kernels)
 # pten infershape
 add_subdirectory(infershape)
-# TODO(xingfeng): pten inner module API designed by a high-performance team
-add_subdirectory(module)
 # pten tests
 add_subdirectory(tests)
diff --git a/paddle/pten/kernels/CMakeLists.txt b/paddle/pten/kernels/CMakeLists.txt
index 09f7a1b102436..486fd73c00f33 100644
--- a/paddle/pten/kernels/CMakeLists.txt
+++ b/paddle/pten/kernels/CMakeLists.txt
@@ -1,3 +1,5 @@
+# pten basic functions called by kernels
+add_subdirectory(functions)
 # pten kernels for diff device
 add_subdirectory(cpu)
 if(WITH_GPU OR WITH_ROCM)
diff --git a/paddle/pten/kernels/cpu/linalg.cc b/paddle/pten/kernels/cpu/linalg.cc
index 96f8ba98e2949..df401370c881f 100644
--- a/paddle/pten/kernels/cpu/linalg.cc
+++ b/paddle/pten/kernels/cpu/linalg.cc
@@ -45,14 +45,6 @@ void Dot(const CPUContext& dev_ctx,
   }
 }
 
-template <typename T>
-void matmul(const CPUContext& dev_ctx,
-            const DenseTensor& x,
-            const DenseTensor& y,
-            bool transpose_x,
-            bool transpose_y,
-            DenseTensor* out) {}
-
 }  // namespace pten
 
 PT_REGISTER_MODULE(LinalgCPU);
diff --git a/paddle/pten/kernels/functions/CMakeLists.txt b/paddle/pten/kernels/functions/CMakeLists.txt
new file mode 100644
index 0000000000000..a3b2bf314b4c0
--- /dev/null
+++ b/paddle/pten/kernels/functions/CMakeLists.txt
@@ -0,0 +1 @@
+add_subdirectory(eigen)
diff --git a/paddle/pten/module/CMakeLists.txt b/paddle/pten/module/CMakeLists.txt
deleted file mode 100644
index e69de29bb2d1d..0000000000000

From 5240ac0c5aa9c5118584301f0a6d992c3d319170 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Tue, 26 Oct 2021 08:46:16 +0800
Subject: [PATCH 112/125] Dev/op2func refactor 3 (#30)

* add a candidate dense tensor class, test=develop

* remove TensorBase::backend(), test=develop

* remove some ops, test=develop

* cherry-pick the pr of tensor meta, test=develop

* moves the dense tensor and some ops, test=develop

* update the linalg operator, test=develop

* update other operators, test=develop

* fix errors, test=develop

* fix bugs, test=develop

* try to resolve the problem of windows ci, test=develop

* updates codes, test=develop

* fix the tensor_utils.cc, test=develop

* modify the dense tensor, test=develop

* fix the data type, test=develop

Co-authored-by: shixiaowei02 <39303645+Shixiaowei02@users.noreply.github.com>
---
 paddle/fluid/framework/CMakeLists.txt         |   8 +-
 paddle/fluid/framework/operator.cc            |  12 +-
 paddle/fluid/framework/pten_utils.cc          | 142 ----------
 paddle/fluid/framework/pten_utils.h           |  30 +-
 paddle/fluid/framework/pten_utils_test.cc     |  60 ----
 paddle/fluid/imperative/prepared_operator.cc  |  14 +-
 paddle/fluid/operators/CMakeLists.txt         |   5 +-
 paddle/fluid/operators/dot_op.h               |  11 +-
 paddle/fluid/operators/fill_any_like_op.h     |   6 +-
 paddle/fluid/operators/mean_op.cu             |   1 +
 paddle/fluid/operators/mean_op.h              |   7 +-
 paddle/fluid/operators/scale_op.h             |   8 +-
 paddle/fluid/operators/sign_op.h              |   6 +-
 paddle/pten/common/data_type.h                |  16 +-
 paddle/pten/core/CMakeLists.txt               |  10 +-
 paddle/pten/core/candidate/CMakeLists.txt     |   1 -
 paddle/pten/core/candidate/dense_tensor.cc    | 145 ----------
 paddle/pten/core/candidate/dense_tensor.h     | 188 -------------
 paddle/pten/core/dense_tensor.cc              | 190 +++++++------
 paddle/pten/core/dense_tensor.h               | 256 ++++++++++--------
 paddle/pten/core/tensor_base.h                |   2 -
 paddle/pten/core/tensor_meta.h                | 152 ++++-------
 paddle/pten/hapi/CMakeLists.txt               |   2 +-
 paddle/pten/hapi/lib/creation.cc              |   9 +-
 paddle/pten/hapi/lib/linalg.cc                |   6 +-
 paddle/pten/hapi/lib/manipulation.cc          |   6 +-
 paddle/pten/hapi/lib/math.cc                  |   7 +-
 paddle/pten/hapi/lib/utils/CMakeLists.txt     |   3 +-
 paddle/pten/hapi/lib/utils/tensor_utils.cc    | 110 +++++++-
 paddle/pten/hapi/lib/utils/tensor_utils.h     |  58 +---
 .../hapi/lib/utils/tests/test_tensor_utils.cc |  29 +-
 paddle/pten/infershape/binary.cc              |   6 +-
 paddle/pten/infershape/binary.h               |  14 +-
 paddle/pten/infershape/unary.cc               |  18 +-
 paddle/pten/infershape/unary.h                |  21 +-
 paddle/pten/kernels/cpu/CMakeLists.txt        |   2 +-
 paddle/pten/kernels/cpu/manipulation.cc       |   6 +-
 paddle/pten/kernels/cpu/utils.cc              |   3 +-
 paddle/pten/kernels/cuda/CMakeLists.txt       |   4 +-
 paddle/pten/kernels/cuda/manipulation.cu      |   6 +-
 paddle/pten/kernels/cuda/math.cu              |  24 +-
 paddle/pten/kernels/cuda/utils.cu             |   3 +-
 paddle/pten/kernels/functions/eigen/dot.h     |   1 -
 paddle/pten/kernels/functions/eigen/mean.h    |   2 -
 paddle/pten/tests/CMakeLists.txt              |  10 +-
 paddle/pten/tests/dense_tensor_test.cc        |  13 -
 paddle/pten/tests/test_copy_api.cc            |  21 +-
 paddle/pten/tests/test_dot_api.cc             |  21 +-
 paddle/pten/tests/test_fill_api.cc            |  39 +--
 paddle/pten/tests/test_flatten_api.cc         |  12 +-
 paddle/pten/tests/test_mean_api.cc            |  12 +-
 51 files changed, 632 insertions(+), 1106 deletions(-)
 delete mode 100644 paddle/pten/core/candidate/CMakeLists.txt
 delete mode 100644 paddle/pten/core/candidate/dense_tensor.cc
 delete mode 100644 paddle/pten/core/candidate/dense_tensor.h

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 231105628dd7c..889925c6fdd39 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -195,10 +195,12 @@ cc_library(unused_var_check SRCS unused_var_check.cc DEPS glog no_need_buffer_va
 
 IF(WITH_XPU)
 cc_library(operator SRCS operator.cc DEPS xpu_op_list op_info device_context tensor scope glog trainer_desc_proto data_feed_proto
-    shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils pten pten_utils)
+    shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils
+    pten pten_utils kernel_factory)
 ELSE()
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog trainer_desc_proto data_feed_proto
-    shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils pten pten_utils)
+    shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils
+    pten pten_utils kernel_factory)
 ENDIF()
 
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
@@ -392,7 +394,7 @@ cc_library(save_load_util SRCS save_load_util.cc DEPS tensor scope layer)
 cc_test(save_load_util_test SRCS save_load_util_test.cc DEPS save_load_util tensor scope layer)
 cc_library(generator SRCS generator.cc DEPS enforce place)
 
-cc_library(pten_utils SRCS pten_utils.cc DEPS lod_tensor selected_rows place pten var_type_traits)
+cc_library(pten_utils SRCS pten_utils.cc DEPS lod_tensor selected_rows place pten var_type_traits pten_hapi_utils)
 
 # Get the current working branch
 execute_process(
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 7c63f7c76c921..f8ec13f1d8b98 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1819,10 +1819,10 @@ pten::KernelContext OperatorWithKernel::BuildPtenKernelContext(
 
     paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_inputs;
     for (auto var : ins_vector) {
-      auto pt_in = framework::InputVariableToPtenTensor(*var, in_def);
-      tmp_inputs.emplace_back(pt_in);
+      tmp_inputs.emplace_back(
+          experimental::MakePtenTensorBaseFromVar(*var, in_def));
     }
-    op_kernel_ctx.EmplaceBackInputs(tmp_inputs);
+    op_kernel_ctx.EmplaceBackInputs(std::move(tmp_inputs));
   }
 
   for (size_t i = 0; i < output_names.size(); ++i) {
@@ -1831,10 +1831,10 @@ pten::KernelContext OperatorWithKernel::BuildPtenKernelContext(
 
     paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_outputs;
     for (auto var : outs_vector) {
-      auto pt_out = framework::OutputVariableToPtenTensor(var, out_def);
-      tmp_outputs.emplace_back(pt_out);
+      tmp_outputs.emplace_back(
+          experimental::MakePtenTensorBaseFromVar(var, out_def));
     }
-    op_kernel_ctx.EmplaceBackOutputs(tmp_outputs);
+    op_kernel_ctx.EmplaceBackOutputs(std::move(tmp_outputs));
   }
 
   for (size_t i = 0; i < attr_names.size(); ++i) {
diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/pten_utils.cc
index 9dac142557ed4..96408afc100e9 100644
--- a/paddle/fluid/framework/pten_utils.cc
+++ b/paddle/fluid/framework/pten_utils.cc
@@ -24,148 +24,6 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-// TODO(chenweihang, shixiaowei): adapt SelectedRows
-template <>
-std::shared_ptr<pten::DenseTensor> MakeTensorImpl<pten::DenseTensor, LoDTensor>(
-    const LoDTensor& tensor, pten::Backend backend,
-    paddle::experimental::DataType dtype,
-    paddle::experimental::DataLayout layout) {
-  auto holder = tensor.Holder();
-  auto tensor_impl = std::make_shared<pten::DenseTensor>(
-      pten::TensorMeta(tensor.dims(), backend, dtype, layout, tensor.offset()),
-      pten::TensorStatus());
-
-  if (holder != nullptr) {
-    tensor_impl->ShareAllocation(tensor.Holder());
-  }
-  return tensor_impl;
-}
-
-template <>
-std::shared_ptr<pten::DenseTensor> MakeTensorImpl<pten::DenseTensor, Tensor>(
-    const Tensor& tensor, pten::Backend backend,
-    paddle::experimental::DataType dtype,
-    paddle::experimental::DataLayout layout) {
-  auto holder = tensor.Holder();
-  auto tensor_impl = std::make_shared<pten::DenseTensor>(
-      pten::TensorMeta(tensor.dims(), backend, dtype, layout, tensor.offset()),
-      pten::TensorStatus());
-
-  if (holder != nullptr) {
-    tensor_impl->ShareAllocation(tensor.Holder());
-  }
-  return tensor_impl;
-}
-
-template <>
-std::shared_ptr<pten::DenseTensor> MakeTensorImpl<pten::DenseTensor>(
-    const LoDTensor& tensor, const platform::Place& place,
-    proto::VarType::Type type) {
-  return MakeTensorImpl<pten::DenseTensor, LoDTensor>(
-      tensor, pten::TransToPtenBackend(place), pten::TransToPtenDataType(type),
-      pten::TransToPtenDataLayout(tensor.layout()));
-}
-
-template <>
-std::shared_ptr<pten::DenseTensor> MakeTensorImpl<pten::DenseTensor>(
-    const Tensor& tensor, const platform::Place& place,
-    proto::VarType::Type type) {
-  return MakeTensorImpl<pten::DenseTensor, Tensor>(
-      tensor, pten::TransToPtenBackend(place), pten::TransToPtenDataType(type),
-      pten::TransToPtenDataLayout(tensor.layout()));
-}
-
-template <>
-void ShareTensorImpl<pten::DenseTensor>(pten::DenseTensor* tensor_impl,
-                                        LoDTensor* out) {
-  out->ResetHolderWithType(tensor_impl->allocation(),
-                           pten::TransToProtoVarType(tensor_impl->data_type()));
-}
-
-template <>
-void ShareTensorImpl<pten::DenseTensor>(pten::DenseTensor* tensor_impl,
-                                        Tensor* out) {
-  out->ResetHolderWithType(tensor_impl->allocation(),
-                           pten::TransToProtoVarType(tensor_impl->data_type()));
-}
-
-std::shared_ptr<pten::TensorBase> InputVariableToPtenTensor(
-    const framework::Variable& variable, const pten::TensorArgDef& arg_def) {
-  auto expected_place = pten::TransToFluidPlace(arg_def.backend);
-
-  if (variable.template IsType<framework::LoDTensor>()) {
-    const auto& tensor = variable.template Get<framework::LoDTensor>();
-    if (!platform::is_same_place(tensor.place(), expected_place)) {
-      framework::LoDTensor tmp_tensor;
-      framework::TensorCopySync(tensor, expected_place, &tmp_tensor);
-      auto pt_in =
-          framework::MakeTensorImpl<pten::DenseTensor, framework::LoDTensor>(
-              tmp_tensor, arg_def.backend, arg_def.dtype, arg_def.layout);
-      return pt_in;
-    } else {
-      auto pt_in =
-          framework::MakeTensorImpl<pten::DenseTensor, framework::LoDTensor>(
-              tensor, arg_def.backend, arg_def.dtype, arg_def.layout);
-      return pt_in;
-    }
-  } else if (variable.template IsType<framework::SelectedRows>()) {
-    // TODO(chenweihang): now we don't deal with row and height
-    // by xiaowei's advice
-    const auto& tensor = variable.template Get<framework::SelectedRows>();
-    if (!platform::is_same_place(tensor.value().place(), expected_place)) {
-      framework::Tensor tmp_tensor;
-      TensorCopySync(tensor.value(), expected_place, &tmp_tensor);
-      // TODO(chenweihang): adapt SelectedRows by xiaowei's design
-      auto pt_in =
-          framework::MakeTensorImpl<pten::DenseTensor, framework::Tensor>(
-              tmp_tensor, arg_def.backend, arg_def.dtype, arg_def.layout);
-      return pt_in;
-    } else {
-      auto pt_in =
-          framework::MakeTensorImpl<pten::DenseTensor, framework::Tensor>(
-              tensor.value(), arg_def.backend, arg_def.dtype, arg_def.layout);
-      return pt_in;
-    }
-  } else {
-    PADDLE_THROW(platform::errors::Unimplemented(
-        "Unsupported shared input `%s` type now when call pt kernel.",
-        framework::ToTypeName(variable.Type())));
-  }
-  return nullptr;
-}
-
-std::shared_ptr<pten::TensorBase> OutputVariableToPtenTensor(
-    framework::Variable* variable, const pten::TensorArgDef& arg_def) {
-  // mutable_data before run kernel, to avoid share output form
-  // KernelContext to original tensor
-  if (variable->template IsType<framework::LoDTensor>()) {
-    auto* tensor = variable->template GetMutable<framework::LoDTensor>();
-    tensor->mutable_data(pten::TransToFluidPlace(arg_def.backend),
-                         pten::TransToProtoVarType(arg_def.dtype));
-    auto pt_out =
-        framework::MakeTensorImpl<pten::DenseTensor, framework::LoDTensor>(
-            *tensor, arg_def.backend, arg_def.dtype, arg_def.layout);
-    return pt_out;
-  } else if (variable->template IsType<framework::SelectedRows>()) {
-    auto* tensor = variable->template GetMutable<framework::SelectedRows>();
-    tensor->mutable_value()->mutable_data(
-        pten::TransToFluidPlace(arg_def.backend),
-        pten::TransToProtoVarType(arg_def.dtype));
-    // TODO(chenweihang): adapt SelectedRows by xiaowei's design,
-    // here the row and height will lost in output!
-    auto pt_out =
-        framework::MakeTensorImpl<pten::DenseTensor, framework::Tensor>(
-            tensor->value(), arg_def.backend, arg_def.dtype, arg_def.layout);
-    return pt_out;
-  } else {
-    PADDLE_THROW(platform::errors::Unimplemented(
-        "Unsupported shared output `%s` type now when call pt kernel.",
-        framework::ToTypeName(variable->Type())));
-  }
-
-  return nullptr;
-}
-
 OpKernelType TransPtenKernelKeyToOpKernelType(
     const pten::KernelKey& kernel_key) {
   proto::VarType::Type data_type =
diff --git a/paddle/fluid/framework/pten_utils.h b/paddle/fluid/framework/pten_utils.h
index 263101657ceb9..8c1c25b3b67cd 100644
--- a/paddle/fluid/framework/pten_utils.h
+++ b/paddle/fluid/framework/pten_utils.h
@@ -25,41 +25,13 @@ limitations under the License. */
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/pten/api/include/core.h"
+#include "paddle/pten/hapi/lib/utils/tensor_utils.h"
 #include "paddle/utils/flat_hash_map.h"
 #include "paddle/utils/small_vector.h"
 
 namespace paddle {
 namespace framework {
 
-/* tensor translate */
-
-template <typename PtenTensorImplT, typename VariableT>
-std::shared_ptr<PtenTensorImplT> MakeTensorImpl(
-    const VariableT& tensor, pten::Backend backend,
-    paddle::experimental::DataType dtype,
-    paddle::experimental::DataLayout layout);
-
-template <typename PtenTensorImplT>
-std::shared_ptr<PtenTensorImplT> MakeTensorImpl(const LoDTensor& tensor,
-                                                const platform::Place& place,
-                                                proto::VarType::Type type);
-
-template <typename PtenTensorImplT>
-std::shared_ptr<PtenTensorImplT> MakeTensorImpl(const Tensor& tensor,
-                                                const platform::Place& place,
-                                                proto::VarType::Type type);
-
-template <typename PtenTensorImplT>
-void ShareTensorImpl(PtenTensorImplT* tensor_impl, LoDTensor* out);
-
-template <typename PtenTensorImplT>
-void ShareTensorImpl(PtenTensorImplT* tensor_impl, Tensor* out);
-
-std::shared_ptr<pten::TensorBase> InputVariableToPtenTensor(
-    const framework::Variable& variable, const pten::TensorArgDef& arg_def);
-std::shared_ptr<pten::TensorBase> OutputVariableToPtenTensor(
-    framework::Variable* variable, const pten::TensorArgDef& arg_def);
-
 /* Kernel Key translate */
 
 OpKernelType TransPtenKernelKeyToOpKernelType(
diff --git a/paddle/fluid/framework/pten_utils_test.cc b/paddle/fluid/framework/pten_utils_test.cc
index 33c55a8086b4e..ab2d60a34303a 100644
--- a/paddle/fluid/framework/pten_utils_test.cc
+++ b/paddle/fluid/framework/pten_utils_test.cc
@@ -18,66 +18,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/variable.h"
 
-TEST(PtenUtils, FluidTensorToPtenTensor) {
-  // 1. create tensor
-  paddle::framework::LoDTensor x;
-  paddle::framework::Tensor x2;
-  x.Resize({2});
-  x.mutable_data<float>(paddle::platform::CPUPlace());
-  x.data<float>()[0] = 0.2;
-  x.data<float>()[1] = 0.5;
-
-  // 2. test API
-  auto dense_x = paddle::framework::MakeTensorImpl<pten::DenseTensor>(
-      x, x.place(), x.type());
-
-  // 3. check result
-  std::vector<float> expect_value = {0.2, 0.5};
-  ASSERT_EQ(dense_x->data<float>()[0], expect_value[0]);
-  ASSERT_EQ(dense_x->data<float>()[1], expect_value[1]);
-  ASSERT_EQ(dense_x->backend(), pten::Backend::CPU);
-  ASSERT_EQ(dense_x->data_type(), pten::DataType::FLOAT32);
-}
-
-TEST(PtenUtils, VarToPtenTensor) {
-  // 1. create Variable
-  paddle::framework::Variable v;
-  auto selected_rows = v.GetMutable<paddle::framework::SelectedRows>();
-  paddle::framework::Tensor* value = selected_rows->mutable_value();
-  auto* data = value->mutable_data<int>(paddle::framework::make_ddim({1, 1}),
-                                        paddle::platform::CPUPlace());
-  data[0] = 123;
-  pten::Backend expect_backend = pten::Backend::CPU;
-
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  expect_backend = pten::Backend::CUDA;
-#endif
-  auto tensor_def = pten::TensorArgDef(expect_backend, pten::DataLayout::NCHW,
-                                       pten::DataType::INT32);
-  // 2. test API
-  auto tensor_x = paddle::framework::InputVariableToPtenTensor(v, tensor_def);
-  // 3. check result
-  ASSERT_EQ(tensor_x->backend(), expect_backend);
-  ASSERT_EQ(tensor_x->data_type(), pten::DataType::INT32);
-}
-
-TEST(PtenUtils, PtenTensorToFluidTensor) {
-  pten::DenseTensor dense_tensor(
-      pten::TensorMeta(paddle::framework::make_ddim({1, 1}), pten::Backend::CPU,
-                       pten::DataType::FLOAT32, pten::DataLayout::ANY),
-      pten::TensorStatus());
-  auto* data_ptr = dense_tensor.mutable_data<float>();
-  data_ptr[0] = 0.5;
-  // share allocation into fluid Tensor
-  paddle::framework::Tensor tensor;
-  paddle::framework::LoDTensor lod_tensor;
-  paddle::framework::ShareTensorImpl(&dense_tensor, &tensor);
-  paddle::framework::ShareTensorImpl(&dense_tensor, &lod_tensor);
-  // compare
-  ASSERT_EQ(tensor.data<float>()[0], 0.5);
-  ASSERT_EQ(lod_tensor.data<float>()[0], 0.5);
-}
-
 TEST(PtenUtils, TransPtenKernelKeyToOpKernelType) {
   pten::KernelKey kernel_key(pten::Backend::CPU, pten::DataLayout::NCHW,
                              pten::DataType::FLOAT32);
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 2ffb47273f650..f2251e34fb029 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -293,11 +293,10 @@ static pten::KernelContext BuildDygraphPtenKernelContext(
     paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_inputs;
     for (auto var : ins_vector) {
       const auto& variable = var->Var();
-
-      auto pt_in = framework::InputVariableToPtenTensor(variable, in_def);
-      tmp_inputs.emplace_back(pt_in);
+      tmp_inputs.emplace_back(
+          experimental::MakePtenTensorBaseFromVar(variable, in_def));
     }
-    op_kernel_ctx.EmplaceBackInputs(tmp_inputs);
+    op_kernel_ctx.EmplaceBackInputs(std::move(tmp_inputs));
   }
 
   for (size_t i = 0; i < output_names.size(); ++i) {
@@ -307,11 +306,10 @@ static pten::KernelContext BuildDygraphPtenKernelContext(
     paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_outputs;
     for (auto var : outs_vector) {
       auto* variable = var->MutableVar();
-
-      auto pt_out = framework::OutputVariableToPtenTensor(variable, out_def);
-      tmp_outputs.emplace_back(pt_out);
+      tmp_outputs.emplace_back(
+          experimental::MakePtenTensorBaseFromVar(variable, out_def));
     }
-    op_kernel_ctx.EmplaceBackOutputs(tmp_outputs);
+    op_kernel_ctx.EmplaceBackOutputs(std::move(tmp_outputs));
   }
 
   for (size_t i = 0; i < attr_names.size(); ++i) {
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index adbd9bf277b11..bafc650c433db 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -80,8 +80,9 @@ if(WITH_UNITY_BUILD)
 endif()
 
 set(OP_HEADER_DEPS ${OP_HEADER_DEPS} pten)
-set(OP_HEADER_DEPS ${OP_HEADER_DEPS} pten_utils)
-register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op 
+#set(OP_HEADER_DEPS ${OP_HEADER_DEPS} pten_utils)
+register_operators(EXCLUDES
+py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op 
         recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
 
 op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS})
diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h
index 641b0d653d5b0..6a025fdd9ccc6 100644
--- a/paddle/fluid/operators/dot_op.h
+++ b/paddle/fluid/operators/dot_op.h
@@ -16,13 +16,13 @@
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/pten_utils.h"
 #include "paddle/fluid/operators/math/complex_functors.h"
 #include "paddle/fluid/platform/for_range.h"
 
 // only can include the headers in paddle/pten/api dirs
 #include "paddle/pten/api/include/core.h"
 #include "paddle/pten/api/include/linalg.h"
+#include "paddle/pten/hapi/lib/utils/tensor_utils.h"
 
 namespace paddle {
 namespace operators {
@@ -244,12 +244,9 @@ class DotKernel : public framework::OpKernel<T> {
     auto& dev_ctx = ctx.device_context<DeviceContext>();
     out->mutable_data<T>(x->place());
 
-    auto pt_x =
-        framework::MakeTensorImpl<pten::DenseTensor>(*x, x->place(), x->type());
-    auto pt_y =
-        framework::MakeTensorImpl<pten::DenseTensor>(*y, y->place(), y->type());
-    auto pt_out = framework::MakeTensorImpl<pten::DenseTensor>(*out, x->place(),
-                                                               x->type());
+    auto pt_x = paddle::experimental::MakePtenDenseTensor(*x);
+    auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
+    auto pt_out = paddle::experimental::MakePtenDenseTensor(*out);
 
     // call new kernel
     pten::Dot<T>(dev_ctx, *pt_x.get(), *pt_y.get(), pt_out.get());
diff --git a/paddle/fluid/operators/fill_any_like_op.h b/paddle/fluid/operators/fill_any_like_op.h
index 73170c6e2e277..fc649f42c51a1 100644
--- a/paddle/fluid/operators/fill_any_like_op.h
+++ b/paddle/fluid/operators/fill_any_like_op.h
@@ -62,10 +62,8 @@ class FillAnyLikeKernel : public framework::OpKernel<T> {
         std::isnan(value), false,
         platform::errors::InvalidArgument("The filled value is NaN."));
 
-    auto pt_x = framework::MakeTensorImpl<pten::DenseTensor>(*in, in->place(),
-                                                             in->type());
-    auto pt_out = framework::MakeTensorImpl<pten::DenseTensor>(
-        *out, out->place(), out->type());
+    auto pt_x = paddle::experimental::MakePtenDenseTensor(*in);
+    auto pt_out = paddle::experimental::MakePtenDenseTensor(*out);
 
     const auto& dev_ctx = context.template device_context<DeviceContext>();
     // call new kernel
diff --git a/paddle/fluid/operators/mean_op.cu b/paddle/fluid/operators/mean_op.cu
index ffb667ba974b8..26c844392d4d7 100644
--- a/paddle/fluid/operators/mean_op.cu
+++ b/paddle/fluid/operators/mean_op.cu
@@ -62,6 +62,7 @@ class MeanCUDAGradKernel : public framework::OpKernel<T> {
 
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
+
 REGISTER_OP_CUDA_KERNEL(
     mean, ops::MeanKernel<paddle::platform::CUDADeviceContext, float>,
     ops::MeanKernel<paddle::platform::CUDADeviceContext, double>,
diff --git a/paddle/fluid/operators/mean_op.h b/paddle/fluid/operators/mean_op.h
index 9a8c2736589c9..9d9954a8412a3 100644
--- a/paddle/fluid/operators/mean_op.h
+++ b/paddle/fluid/operators/mean_op.h
@@ -20,6 +20,7 @@ limitations under the License. */
 // only can include the headers in paddle/top/api dirs
 #include "paddle/pten/api/include/core.h"
 #include "paddle/pten/api/include/math.h"
+#include "paddle/pten/hapi/lib/utils/tensor_utils.h"
 
 namespace paddle {
 namespace operators {
@@ -61,10 +62,8 @@ class MeanKernel : public framework::OpKernel<T> {
     auto& dev_ctx = context.device_context<DeviceContext>();
     out->mutable_data<T>(x->place());
 
-    auto pt_x =
-        framework::MakeTensorImpl<pten::DenseTensor>(*x, x->place(), x->type());
-    auto pt_out = framework::MakeTensorImpl<pten::DenseTensor>(*out, x->place(),
-                                                               x->type());
+    auto pt_x = paddle::experimental::MakePtenDenseTensor(*x);
+    auto pt_out = paddle::experimental::MakePtenDenseTensor(*out);
 
     // call new kernel
     VLOG(1) << "chenweihang: call original mean kernel compute.";
diff --git a/paddle/fluid/operators/scale_op.h b/paddle/fluid/operators/scale_op.h
index 9a043361678b2..0d7113a6f4de9 100644
--- a/paddle/fluid/operators/scale_op.h
+++ b/paddle/fluid/operators/scale_op.h
@@ -20,6 +20,7 @@ limitations under the License. */
 // only can include the headers in paddle/top/api dirs
 #include "paddle/pten/api/include/core.h"
 #include "paddle/pten/api/include/math.h"
+#include "paddle/pten/hapi/lib/utils/tensor_utils.h"
 
 namespace paddle {
 namespace operators {
@@ -60,16 +61,13 @@ class ScaleKernel : public framework::OpKernel<T> {
       out_slr->set_rows(in_slr.rows());
       out_slr->set_height(in_slr.height());
     }
-
     auto* out =
         framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var);
     out->mutable_data<T>(in->place());
     auto& dev_ctx = ctx.device_context<DeviceContext>();
 
-    auto pt_x = framework::MakeTensorImpl<pten::DenseTensor>(*in, in->place(),
-                                                             in->type());
-    auto pt_out = framework::MakeTensorImpl<pten::DenseTensor>(
-        *out, in->place(), in->type());
+    auto pt_x = paddle::experimental::MakePtenDenseTensor(*in);
+    auto pt_out = paddle::experimental::MakePtenDenseTensor(*out);
 
     // call new kernel
     pten::Scale<T>(dev_ctx, *pt_x.get(), scale, bias, bias_after_scale,
diff --git a/paddle/fluid/operators/sign_op.h b/paddle/fluid/operators/sign_op.h
index f3083f4937875..0e3036115e3c1 100644
--- a/paddle/fluid/operators/sign_op.h
+++ b/paddle/fluid/operators/sign_op.h
@@ -36,10 +36,8 @@ class SignKernel : public framework::OpKernel<T> {
     auto& dev_ctx = context.device_context<DeviceContext>();
     out->mutable_data<T>(x->place());
 
-    auto pt_x =
-        framework::MakeTensorImpl<pten::DenseTensor>(*x, x->place(), x->type());
-    auto pt_out = framework::MakeTensorImpl<pten::DenseTensor>(*out, x->place(),
-                                                               x->type());
+    auto pt_x = paddle::experimental::MakePtenDenseTensor(*x);
+    auto pt_out = paddle::experimental::MakePtenDenseTensor(*out);
 
     // call new kernel
     pten::Sign<T>(dev_ctx, *pt_x.get(), pt_out.get());
diff --git a/paddle/pten/common/data_type.h b/paddle/pten/common/data_type.h
index f5383da31cf93..27ca28b273485 100644
--- a/paddle/pten/common/data_type.h
+++ b/paddle/pten/common/data_type.h
@@ -54,6 +54,7 @@ inline size_t SizeOf(DataType data_type) {
     case DataType::UINT8:
     case DataType::INT8:
       return 1;
+    case DataType::BFLOAT16:
     case DataType::FLOAT16:
     case DataType::INT16:
     case DataType::UINT16:
@@ -65,11 +66,11 @@ inline size_t SizeOf(DataType data_type) {
     case DataType::FLOAT64:
     case DataType::INT64:
     case DataType::UINT64:
-      return 8;
-    case DataType::UNDEFINED:
-    case DataType::BFLOAT16:
     case DataType::COMPLEX64:
+      return 8;
     case DataType::COMPLEX128:
+      return 16;
+    case DataType::UNDEFINED:
     case DataType::NUM_DATA_TYPES:
       PADDLE_THROW(platform::errors::Unimplemented(
           "Data type %d is not supported by tensor.",
@@ -138,12 +139,21 @@ inline std::ostream& operator<<(std::ostream& os, DataType dtype) {
     case DataType::INT16:
       os << "int16";
       break;
+    case DataType::UINT16:
+      os << "uint16";
+      break;
     case DataType::INT32:
       os << "int32";
       break;
+    case DataType::UINT32:
+      os << "uint32";
+      break;
     case DataType::INT64:
       os << "int64";
       break;
+    case DataType::UINT64:
+      os << "uint64";
+      break;
     case DataType::BFLOAT16:
       os << "bfloat16";
       break;
diff --git a/paddle/pten/core/CMakeLists.txt b/paddle/pten/core/CMakeLists.txt
index ca562332bb79f..a7ccf31467438 100644
--- a/paddle/pten/core/CMakeLists.txt
+++ b/paddle/pten/core/CMakeLists.txt
@@ -1,5 +1,3 @@
-add_subdirectory(candidate)
-
 IF(WITH_MKLDNN)
     set(MKLDNN_CTX_DEPS mkldnn)
 ELSE()
@@ -7,15 +5,15 @@ ELSE()
 ENDIF()
 
 if(WITH_GPU)
-    cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place gpu_info)
+  cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place gpu_info)
 elseif(WITH_ROCM)
-    cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place gpu_info)
+  cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place gpu_info)
 else()
-    cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place)
+  cc_library(convert_utils SRCS convert_utils.cc DEPS data_type place)
 endif()
-cc_library(dense_tensor SRCS dense_tensor.cc DEPS enforce data_type ddim allocator place convert_utils ${MKLDNN_CTX_DEPS})
 
 cc_library(kernel_factory SRCS kernel_factory.cc DEPS enforce)
 cc_library(kernel_context SRCS kernel_context.cc DEPS enforce device_context)
 
 cc_library(tensor_base SRCS tensor_base.cc allocator.cc storage.cc DEPS enforce)
+cc_library(dense_tensor SRCS dense_tensor.cc DEPS tensor_base)
diff --git a/paddle/pten/core/candidate/CMakeLists.txt b/paddle/pten/core/candidate/CMakeLists.txt
deleted file mode 100644
index dd670abdba1c1..0000000000000
--- a/paddle/pten/core/candidate/CMakeLists.txt
+++ /dev/null
@@ -1 +0,0 @@
-cc_library(pten_dense_tensor SRCS dense_tensor.cc DEPS tensor_base)
diff --git a/paddle/pten/core/candidate/dense_tensor.cc b/paddle/pten/core/candidate/dense_tensor.cc
deleted file mode 100644
index 325edd1ba077f..0000000000000
--- a/paddle/pten/core/candidate/dense_tensor.cc
+++ /dev/null
@@ -1,145 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/pten/core/candidate/dense_tensor.h"
-
-namespace pten {
-namespace candidate {
-
-DenseTensorMeta::DenseTensorMeta(DataType type, const DDim& dims)
-    : dims(dims), type(type) {}
-DenseTensorMeta::DenseTensorMeta(DataType type,
-                                 const DDim& dims,
-                                 DataLayout layout)
-    : dims(dims), type(type), layout(layout) {}
-DenseTensorMeta::DenseTensorMeta(DataType type,
-                                 const DDim& dims,
-                                 DataLayout layout,
-                                 const std::vector<std::vector<size_t>>& lod)
-    : dims(dims), type(type), layout(layout), lod(lod) {}
-
-bool DenseTensorMeta::valid() const noexcept {
-  bool valid{true};
-  valid = valid && (type != DataType::UNDEFINED);
-  valid = valid && (layout != DataLayout::UNDEFINED);
-  valid = valid && (is_scalar || product(dims));
-  return valid;
-}
-
-DenseTensor::DenseTensor(const std::shared_ptr<Allocator>& a,
-                         const DenseTensorMeta& meta)
-    : meta_(meta),
-      storage_(
-          make_intrusive<TensorStorage>(a, SizeOf(data_type()) * numel())) {}
-
-DenseTensor::DenseTensor(const std::shared_ptr<Allocator>& a,
-                         DenseTensorMeta&& meta)
-    : meta_(std::move(meta)),
-      storage_(
-          make_intrusive<TensorStorage>(a, SizeOf(data_type()) * numel())) {}
-
-DenseTensor::DenseTensor(intrusive_ptr<Storage> storage,
-                         const DenseTensorMeta& meta)
-    : meta_(meta), storage_(std::move(storage)) {}
-
-DenseTensor::DenseTensor(intrusive_ptr<Storage> storage, DenseTensorMeta&& meta)
-    : meta_(std::move(meta)), storage_(std::move(storage)) {}
-
-int64_t DenseTensor::numel() const {
-  if (meta_.is_scalar) {
-    return 1;
-  }
-  return product(meta_.dims);
-}
-
-bool DenseTensor::SharesStorageWith(const DenseTensor& b) const {
-  return storage_.get() == b.storage_.get() && storage_.get() != nullptr;
-}
-
-template <typename T>
-T* DenseTensor::mutable_data(size_t request_bytes) {
-  PADDLE_ENFORCE(
-      valid(),
-      paddle::platform::errors::PreconditionNotMet(
-          "The meta data must be valid when call the mutable data function."));
-  PADDLE_ENFORCE_NOT_NULL(
-      storage_,
-      paddle::platform::errors::PreconditionNotMet(
-          "The storage must be valid when call the mutable data function."));
-  PADDLE_ENFORCE(
-      (data_type() == paddle::experimental::CppTypeToDataType<T>::Type()),
-      paddle::platform::errors::PreconditionNotMet(
-          "The type of data we are trying to retrieve does not match the "
-          "type of data currently contained in the container."));
-  size_t bytes = numel() * SizeOf(data_type());
-  if (request_bytes) {
-    PADDLE_ENFORCE_GE(request_bytes,
-                      bytes,
-                      paddle::platform::errors::InvalidArgument(
-                          "The reserved size %d should be enough to meet the "
-                          "volume required by metadata %d.",
-                          request_bytes,
-                          bytes));
-    bytes = request_bytes;
-  }
-  if (storage_->size() < bytes) {
-    storage_->Realloc(bytes);
-  }
-  return static_cast<T*>(storage_->data());
-}
-
-template <typename T>
-const T* DenseTensor::data() const {
-  PADDLE_ENFORCE_NOT_NULL(
-      storage_,
-      paddle::platform::errors::PreconditionNotMet(
-          "The storage must be valid when call the mutable data function."));
-  PADDLE_ENFORCE(
-      (data_type() == paddle::experimental::CppTypeToDataType<T>::Type()),
-      paddle::platform::errors::PreconditionNotMet(
-          "The type of data we are trying to retrieve does not match the "
-          "type of data currently contained in the container."));
-  return static_cast<const T*>(storage_->data());
-}
-
-void DenseTensor::check_memory_size() const {
-  size_t bytes = numel() * SizeOf(data_type());
-  PADDLE_ENFORCE_GE(memory_size(),
-                    bytes,
-                    paddle::platform::errors::InvalidArgument(
-                        "The memory size %d should be enough to meet the "
-                        "volume required by metadata %d.",
-                        memory_size(),
-                        bytes));
-}
-
-#define DATA_MEMBER_FUNC_INSTANTIATION(dtype)                      \
-  template dtype* DenseTensor::mutable_data(size_t request_bytes); \
-  template const dtype* DenseTensor::data() const;
-
-DATA_MEMBER_FUNC_INSTANTIATION(int8_t);
-DATA_MEMBER_FUNC_INSTANTIATION(uint8_t);
-DATA_MEMBER_FUNC_INSTANTIATION(int16_t);
-DATA_MEMBER_FUNC_INSTANTIATION(uint16_t);
-DATA_MEMBER_FUNC_INSTANTIATION(int32_t);
-DATA_MEMBER_FUNC_INSTANTIATION(uint32_t);
-DATA_MEMBER_FUNC_INSTANTIATION(int64_t);
-DATA_MEMBER_FUNC_INSTANTIATION(uint64_t);
-DATA_MEMBER_FUNC_INSTANTIATION(float);
-DATA_MEMBER_FUNC_INSTANTIATION(double);
-
-#undef DATA_MEMBER_FUNC_INSTANTIATION
-
-}  // namespace candidate
-}  // namespace pten
diff --git a/paddle/pten/core/candidate/dense_tensor.h b/paddle/pten/core/candidate/dense_tensor.h
deleted file mode 100644
index 21a093439529f..0000000000000
--- a/paddle/pten/core/candidate/dense_tensor.h
+++ /dev/null
@@ -1,188 +0,0 @@
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/platform/place.h"
-#include "paddle/pten/common/data_type.h"
-#include "paddle/pten/core/allocator.h"
-#include "paddle/pten/core/storage.h"
-#include "paddle/pten/core/tensor_base.h"
-
-namespace pten {
-namespace candidate {
-
-using DDim = paddle::framework::DDim;
-
-/// \brief The meta data of dense tensor. Take the structure type
-/// and use all default operations.
-///
-struct DenseTensorMeta {
-  using DataType = paddle::experimental::DataType;
-  using DataLayout = paddle::experimental::DataLayout;
-
-  DenseTensorMeta() = default;
-  DenseTensorMeta(DataType type, const DDim& dims);
-  DenseTensorMeta(DataType type, const DDim& dims, DataLayout layout);
-  DenseTensorMeta(DataType type,
-                  const DDim& dims,
-                  DataLayout layout,
-                  const std::vector<std::vector<size_t>>& lod);
-
-  /// \brief Test whether the metadata is valid. Does not throw exceptions.
-  /// \return Whether the metadata is valid.
-  bool valid() const noexcept;
-
-  /// During the entire life cycle of a DenseTensor, the following attributes
-  /// marked with `const` are expected to remain unchanged.
-  const bool is_scalar{false};
-  DDim dims;
-  const DataType type{DataType::FLOAT32};
-  const DataLayout layout{DataLayout::NCHW};
-  std::vector<std::vector<size_t>> lod;
-};
-
-/// \brief The Dense tensor store values in a contiguous sequential block
-/// of memory where all values are represented. Tensors or multi-dimensional
-/// arrays are used in math operators.
-/// During the entire life cycle of a DenseTensor, its device type and key
-/// metadata are set unchanged.
-class DenseTensor : public TensorBase,
-                    public TypeInfoTraits<TensorBase, DenseTensor> {
- public:
-  /// \brief Construct a dense tensor and allocate space.
-  /// \param a The allocator used to allocate space.
-  /// \param meta The meta data of dense tensor.
-  DenseTensor(const std::shared_ptr<Allocator>& a, const DenseTensorMeta& meta);
-
-  /// \brief Construct a dense tensor and allocate space.
-  /// \param a The allocator used to allocate space.
-  /// \param meta The meta data of dense tensor.
-  DenseTensor(const std::shared_ptr<Allocator>& a, DenseTensorMeta&& meta);
-
-  /// \brief Use existing storage space to create dense tensor. This interface
-  /// can be used to deliberately create an uninitialized dense tensor.
-  /// \param storage The existing storage.
-  /// \param meta The meta data of dense tensor.
-  DenseTensor(intrusive_ptr<Storage> storage, const DenseTensorMeta& meta);
-
-  /// \brief Use existing storage space to create dense tensor. This interface
-  /// can be used to deliberately create an uninitialized dense tensor.
-  /// \param storage The existing storage.
-  /// \param meta The meta data of dense tensor.
-  DenseTensor(intrusive_ptr<Storage> storage, DenseTensorMeta&& meta);
-
-  /// \brief Because dense tensor is a kind of container, we give a default
-  /// constructor to use for stl container. But the dense tensor created with
-  /// the default constructor is not practical.
-  DenseTensor() = default;
-
-  /// \brief Because dense tensor is a resource handle, we provide a default
-  /// move constructor to support move semantics.
-  DenseTensor(DenseTensor&& other) = default;
-
-  /// \brief We do not recommend deep copy of dense tensor because of its
-  /// efficiency and complexity across devices. The operation is disabled here.
-  DenseTensor(const DenseTensor& other) = delete;
-
-  /// \brief Destroy the tensor object and release exclusive resources.
-  virtual ~DenseTensor() = default;
-
- public:
-  /// \brief Returns the name of the class for type traits.
-  /// \return The name of the class.
-  static const char* name() { return "DenseTensor"; }
-
-  /// \brief Returns the number of elements contained in tensor.
-  /// \return The number of elements contained in tensor.
-  int64_t numel() const;
-
-  /// \brief Returns the dims of the tensor.
-  /// \return The dims of the tensor.
-  const DDim& dims() const noexcept { return meta_.dims; }
-
-  /// \brief Returns the lod of the tensor.
-  /// \return The lod of the tensor.
-  const std::vector<std::vector<size_t>>& lod() const noexcept {
-    return meta_.lod;
-  }
-
-  /// \brief Returns the data type of the tensor.
-  /// \return The data type of the tensor.
-  DataType data_type() const noexcept { return meta_.type; }
-
-  /// \brief Returns the data layout of the tensor.
-  /// \return The data layout of the tensor.
-  DataLayout layout() const noexcept { return meta_.layout; }
-
-  /// \brief Returns the data place of the tensor.
-  /// \return The data place of the tensor.
-  const Place& place() const { return storage_->place(); }
-
-  /// \brief Test whether the metadata is valid.
-  /// \return Whether the metadata is valid.
-  bool valid() const noexcept { return meta_.valid(); }
-
-  /// \brief Test whether the storage is allocated.
-  /// return Whether the storage is allocated.
-  bool initialized() const { return storage_->data(); }
-
-  /// \brief Check if storage is shared with other objects.
-  /// \return Whether the storage is shared with other objects.
-  bool SharesStorageWith(const DenseTensor& b) const;
-
-  /// \brief Change the dims information in the metadata, and the corresponding
-  /// memory allocation will occur when the `mutable_data` is called.
-  /// \param dims The new dims of the dense tensor.
-  void Resize(const DDim& dims) noexcept { meta_.dims = dims; }
-
-  /// \brief Returns the actual storage size occupied by tensor, may be larger
-  /// than its shape dims.
-  /// \return The actual storage size occupied by tensor.
-  size_t memory_size() const { return storage_->size(); }
-
-  /// \brief Check that the storage area is large enough to hold the data of the
-  /// metadata size, and throw an exception if the conditions are not met.
-  void check_memory_size() const;
-
-  /// \brief Release the storage area for other purposes. Because of the
-  /// destruction of encapsulation, we do not support two dense tensors directly
-  /// sharing the same intrusive pointer.
-  /// \return The rvalue of instrusize pointer releated to the released storage.
-  intrusive_ptr<Storage> release() { return std::move(storage_); }
-
-  /// \brief Get the mutable data pointer value of type T.
-  /// Memory allocation may occur when calling this interface:
-  /// 1. When the storage size is not enough to meet the current shape of the
-  /// data.
-  /// 2. When more request_bytes parameters are used to reserve the data
-  /// storage.
-  /// param request_bytes The bytes to reserve the data storage.
-  /// \return The mutable data pointer value of type T.
-  template <typename T>
-  T* mutable_data(size_t request_bytes = 0);
-
-  /// \brief Get the const data pointer value of type T.
-  /// \return The const data pointer value of type T.
-  template <typename T>
-  const T* data() const;
-
- private:
-  DenseTensorMeta meta_;
-  intrusive_ptr<Storage> storage_;
-};
-
-}  // namespace candidate
-}  // namespace pten
diff --git a/paddle/pten/core/dense_tensor.cc b/paddle/pten/core/dense_tensor.cc
index 0a11c8e7d1912..647ddea0b4e1b 100644
--- a/paddle/pten/core/dense_tensor.cc
+++ b/paddle/pten/core/dense_tensor.cc
@@ -13,114 +13,126 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/core/convert_utils.h"
-
-// See Note [ Why still include the fluid headers? ]
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/gpu_info.h"
-#include "paddle/fluid/platform/place.h"
 
 namespace pten {
 
-using CPUPlace = paddle::platform::CPUPlace;
-using CUDAPlace = paddle::platform::CUDAPlace;
-using CUDAPinnedPlace = paddle::platform::CUDAPinnedPlace;
-using XPUPlace = paddle::platform::XPUPlace;
-using NPUPlace = paddle::platform::NPUPlace;
-using NPUPinnedPlace = paddle::platform::NPUPinnedPlace;
+DenseTensor::DenseTensor(const std::shared_ptr<Allocator>& a,
+                         const DenseTensorMeta& meta)
+    : meta_(meta),
+      storage_(
+          make_intrusive<TensorStorage>(a, SizeOf(data_type()) * numel())) {}
 
-const paddle::platform::Place& DenseTensor::place() const {
-  PADDLE_ENFORCE_NOT_NULL(
-      allocation_,
-      paddle::platform::errors::PreconditionNotMet(
-          "Tensor not initialized yet when Tensor::place() is called."));
-  return allocation_->place();
-}
+DenseTensor::DenseTensor(const std::shared_ptr<Allocator>& a,
+                         DenseTensorMeta&& meta)
+    : meta_(std::move(meta)),
+      storage_(
+          make_intrusive<TensorStorage>(a, SizeOf(data_type()) * numel())) {}
+
+DenseTensor::DenseTensor(intrusive_ptr<Storage> storage,
+                         const DenseTensorMeta& meta)
+    : meta_(meta), storage_(std::move(storage)) {}
 
-//----------------------------------------------------------------
-// Inner methods
+DenseTensor::DenseTensor(intrusive_ptr<Storage> storage, DenseTensorMeta&& meta)
+    : meta_(std::move(meta)), storage_(std::move(storage)) {}
 
-void DenseTensor::ShareAllocation(
-    const std::shared_ptr<paddle::memory::allocation::Allocation>& allocation) {
-  // This operation can be very slow!
-  // std::shared_ptr reference count is atomic. increasing or decreasing
-  // the reference count requires atomic increment or decrement.
-  // This is hundred times slower than non-atomic increment/decrement
-  allocation_ = allocation;
+int64_t DenseTensor::numel() const {
+  if (meta_.is_scalar) {
+    return 1;
+  }
+  return product(meta_.dims);
+}
+
+bool DenseTensor::IsSharedWith(const DenseTensor& b) const {
+  return storage_.get() == b.storage_.get() && storage_.get() != nullptr;
 }
 
-// TODO(chenweihang): Add other place branchs
-paddle::platform::Place DenseTensor::GetPlaceByBackend() const {
-  switch (meta_.backend) {
-    case Backend::CPU:
-      return CPUPlace();
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    case Backend::CUDA:
-      return CUDAPlace(paddle::platform::GetCurrentDeviceId());
-#endif
-    default:
-      PADDLE_THROW(paddle::platform::errors::Unimplemented(
-          "Unsupported Tensor backend."));
+void* DenseTensor::mutable_data(size_t request_bytes) {
+  PADDLE_ENFORCE(
+      valid(),
+      paddle::platform::errors::PreconditionNotMet(
+          "The meta data must be valid when call the mutable data function."));
+  PADDLE_ENFORCE_NOT_NULL(
+      storage_,
+      paddle::platform::errors::PreconditionNotMet(
+          "The storage must be valid when call the mutable data function."));
+  size_t bytes = numel() * SizeOf(data_type());
+  if (request_bytes) {
+    PADDLE_ENFORCE_GE(request_bytes,
+                      bytes,
+                      paddle::platform::errors::InvalidArgument(
+                          "The reserved size %d should be enough to meet the "
+                          "volume required by metadata %d.",
+                          request_bytes,
+                          bytes));
+    bytes = request_bytes;
+  }
+  if (storage_->size() < bytes) {
+    storage_->Realloc(bytes);
   }
+  return storage_->data();
 }
 
-size_t DenseTensor::MemorySize() const {
-  return allocation_ == nullptr ? 0UL : allocation_->size() - meta_.offset;
+template <typename T>
+T* DenseTensor::mutable_data() {
+  PADDLE_ENFORCE(
+      (data_type() == paddle::experimental::CppTypeToDataType<T>::Type()),
+      paddle::platform::errors::PreconditionNotMet(
+          "The type of data (%d) we are trying to retrieve does not match the "
+          "type of data currently contained in the container (%d).",
+          static_cast<int>(paddle::experimental::CppTypeToDataType<T>::Type()),
+          static_cast<int>(data_type())));
+  return static_cast<T*>(mutable_data());
 }
 
-void DenseTensor::CheckMemorySize() const {
-  PADDLE_ENFORCE_NOT_NULL(allocation_,
-                          paddle::platform::errors::PreconditionNotMet(
-                              "Tensor holds no memory. "
-                              "Call Tensor::mutable_data firstly."));
-  size_t size_of_type =
-      paddle::framework::SizeOfType(TransToProtoVarType(meta_.type));
-  PADDLE_ENFORCE_LE(
-      numel() * size_of_type,
-      MemorySize(),
+template <typename T>
+const T* DenseTensor::data() const {
+  PADDLE_ENFORCE(
+      (data_type() == paddle::experimental::CppTypeToDataType<T>::Type()),
       paddle::platform::errors::PreconditionNotMet(
-          "Tensor's dimension is out of bound."
-          "Tensor's dimension must be equal or less than the size of its "
-          "memory."
-          "But received  Tensor's dimension is d%, memory's size is %d.",
-          numel() * size_of_type,
-          MemorySize()));
+          "The type of data we are trying to retrieve does not match the "
+          "type of data currently contained in the container."));
+  return static_cast<const T*>(data());
 }
 
 const void* DenseTensor::data() const {
-  CheckMemorySize();
-  return reinterpret_cast<const void*>(
-      reinterpret_cast<uintptr_t>(allocation_->ptr()) + meta_.offset);
+  PADDLE_ENFORCE_NOT_NULL(
+      storage_,
+      paddle::platform::errors::PreconditionNotMet(
+          "The storage must be valid when call the mutable data function."));
+  return storage_->data();
 }
 
-void* DenseTensor::mutable_data() {
-  PADDLE_ENFORCE_GE(
-      numel(),
-      0,
-      paddle::platform::errors::PreconditionNotMet(
-          "The Tensor's element number must be equal or greater than zero. "
-          "The Tensor's shape is [",
-          dims(),
-          "] now"));
-  size_t size =
-      numel() * paddle::framework::SizeOfType(TransToProtoVarType(meta_.type));
-  auto place = GetPlaceByBackend();
-  if (allocation_ == nullptr) {
-    allocation_.reset();
-    allocation_ = paddle::memory::AllocShared(place, size);
-  } else {
-    if (!(allocation_->place() == place) ||
-        allocation_->size() < size + meta_.offset) {
-      allocation_.reset();
-      allocation_ = paddle::memory::AllocShared(place, size);
-    } else {
-      // do nothing
-    }
-  }
-  return reinterpret_cast<void*>(
-      reinterpret_cast<uintptr_t>(allocation_->ptr()) + meta_.offset);
+void DenseTensor::check_memory_size() const {
+  size_t bytes = numel() * SizeOf(data_type());
+  PADDLE_ENFORCE_GE(memory_size(),
+                    bytes,
+                    paddle::platform::errors::InvalidArgument(
+                        "The memory size %d should be enough to meet the "
+                        "volume required by metadata %d.",
+                        memory_size(),
+                        bytes));
 }
 
+#define DATA_MEMBER_FUNC_INSTANTIATION(dtype)  \
+  template dtype* DenseTensor::mutable_data(); \
+  template const dtype* DenseTensor::data() const;
+
+DATA_MEMBER_FUNC_INSTANTIATION(bool);
+DATA_MEMBER_FUNC_INSTANTIATION(int8_t);
+DATA_MEMBER_FUNC_INSTANTIATION(uint8_t);
+DATA_MEMBER_FUNC_INSTANTIATION(int16_t);
+DATA_MEMBER_FUNC_INSTANTIATION(uint16_t);
+DATA_MEMBER_FUNC_INSTANTIATION(int32_t);
+DATA_MEMBER_FUNC_INSTANTIATION(uint32_t);
+DATA_MEMBER_FUNC_INSTANTIATION(int64_t);
+DATA_MEMBER_FUNC_INSTANTIATION(uint64_t);
+DATA_MEMBER_FUNC_INSTANTIATION(::paddle::platform::bfloat16);
+DATA_MEMBER_FUNC_INSTANTIATION(::paddle::platform::float16);
+DATA_MEMBER_FUNC_INSTANTIATION(float);
+DATA_MEMBER_FUNC_INSTANTIATION(double);
+DATA_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex64);
+DATA_MEMBER_FUNC_INSTANTIATION(::paddle::experimental::complex128);
+
+#undef DATA_MEMBER_FUNC_INSTANTIATION
+
 }  // namespace pten
diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h
index e913440a7e663..46932ecac2ad0 100644
--- a/paddle/pten/core/dense_tensor.h
+++ b/paddle/pten/core/dense_tensor.h
@@ -14,137 +14,159 @@ limitations under the License. */
 
 #pragma once
 
-#include <memory>
-
+#include "paddle/pten/core/allocator.h"
+#include "paddle/pten/core/storage.h"
 #include "paddle/pten/core/tensor_base.h"
 #include "paddle/pten/core/tensor_meta.h"
-#include "paddle/pten/core/tensor_status.h"
-
-namespace paddle {
-namespace memory {
-namespace allocation {
-class Allocation;
-}
-}
-}
 
 namespace pten {
 
-using DataType = paddle::experimental::DataType;
-
-/**
- * The implementation of general Tensor (For CPU, CUDA, HIP, etc.), similar
- * to the Tensor in fluid, contains a pointer to Allocation and a series of
- * descriptive metadata and status required by Tensor.
- *
- * DenseTensor is still a base class, it may have inherited classes.
- *
- * The memory layout of these inherited classes is consistent with the
- * basic DenseTensor, except that a small number of members are added to
- * further specialize the description of the tensor.
- *
- * If the memory layout is different, it cannot be described based on the
- * general Allocation, and it needs to be directly inherited from
- * TensorBase.
- */
-class DenseTensor : public TensorBase {
+/// \brief The Dense tensor store values in a contiguous sequential block
+/// of memory where all values are represented. Tensors or multi-dimensional
+/// arrays are used in math operators.
+/// During the entire life cycle of a DenseTensor, its device type and key
+/// metadata are set unchanged.
+class DenseTensor : public TensorBase,
+                    public TypeInfoTraits<TensorBase, DenseTensor> {
  public:
-  // Not allowed to initialize a tensor without descriptive metadata
-  DenseTensor() = delete;
-
-  // DenseTensor(const DenseTensor&) = delete;
-  // DenseTensor& operator=(const DenseTensor&) = delete;
-  DenseTensor(DenseTensor&&) = delete;
-  DenseTensor& operator=(DenseTensor&&) = delete;
-
-  /**
-   * If we still malloc memory by mutable_data,
-   * the DenseTensor doesn't need complicated constructor.
-   *
-   * Note: Tensor objects lacking meta information are not allowed to exist.
-   */
-  DenseTensor(const TensorMeta& meta, const TensorStatus& status)
-      : meta_(meta), status_(status) {}
-
-  DenseTensor(TensorMeta&& meta, TensorStatus&& status)
-      : meta_(std::move(meta)), status_(std::move(status)) {}
-
-  int64_t numel() const override { return meta_.numel; }
-
-  const paddle::framework::DDim& dims() const override { return meta_.dims; }
-
-  DataType data_type() const override { return meta_.type; }
+  /// \brief Construct a dense tensor and allocate space.
+  /// \param a The allocator used to allocate space.
+  /// \param meta The meta data of dense tensor.
+  DenseTensor(const std::shared_ptr<Allocator>& a, const DenseTensorMeta& meta);
+
+  /// \brief Construct a dense tensor and allocate space.
+  /// \param a The allocator used to allocate space.
+  /// \param meta The meta data of dense tensor.
+  DenseTensor(const std::shared_ptr<Allocator>& a, DenseTensorMeta&& meta);
+
+  /// \brief Use existing storage space to create dense tensor. This interface
+  /// can be used to deliberately create an uninitialized dense tensor.
+  /// \param storage The existing storage.
+  /// \param meta The meta data of dense tensor.
+  DenseTensor(intrusive_ptr<Storage> storage, const DenseTensorMeta& meta);
+
+  /// \brief Use existing storage space to create dense tensor. This interface
+  /// can be used to deliberately create an uninitialized dense tensor.
+  /// \param storage The existing storage.
+  /// \param meta The meta data of dense tensor.
+  DenseTensor(intrusive_ptr<Storage> storage, DenseTensorMeta&& meta);
+
+  /// \brief Because dense tensor is a kind of container, we give a default
+  /// constructor to use for stl container. But the dense tensor created with
+  /// the default constructor is not practical.
+  DenseTensor() = default;
+
+  /// \brief Because dense tensor is a resource handle, we provide a default
+  /// move constructor to support move semantics.
+  DenseTensor(DenseTensor&& other) = default;
+
+  /// \brief We do not recommend deep copy of dense tensor because of its
+  /// efficiency and complexity across devices. The operation is disabled here.
+  DenseTensor(const DenseTensor& other) = delete;
+
+  /// \brief Destroy the tensor object and release exclusive resources.
+  virtual ~DenseTensor() = default;
 
-  DataLayout layout() const override { return meta_.layout; }
-
-  const paddle::platform::Place& place() const override;
-
-  Backend backend() const override { return meta_.backend; }
-
-  bool valid() const override { return allocation_ != nullptr; }
-
-  bool initialized() const override { return allocation_ != nullptr; }
-
-  /* member methods */
-
-  const std::shared_ptr<paddle::memory::allocation::Allocation>& allocation()
-      const {
-    return allocation_;
+ public:
+  /// \brief Returns the name of the class for type traits.
+  /// \return The name of the class.
+  static const char* name() { return "DenseTensor"; }
+
+  /// \brief Returns the number of elements contained in tensor.
+  /// \return The number of elements contained in tensor.
+  int64_t numel() const;
+
+  /// \brief Returns the dims of the tensor.
+  /// \return The dims of the tensor.
+  const DDim& dims() const noexcept { return meta_.dims; }
+
+  /// \brief Returns the lod of the tensor.
+  /// \return The lod of the tensor.
+  const std::vector<std::vector<size_t>>& lod() const noexcept {
+    return meta_.lod;
   }
 
-  const TensorMeta& meta() const { return meta_; }
-
-  TensorMeta* mutable_meta() { return &meta_; }
-
-  /* Data Access Methods */
-
-  const void* data() const;
-
-  void* mutable_data();
-
+  /// \brief Set the lod of the tensor.
+  void set_lod(const std::vector<std::vector<size_t>>& lod) { meta_.lod = lod; }
+
+  /// \brief Returns the data type of the tensor.
+  /// \return The data type of the tensor.
+  DataType data_type() const noexcept { return meta_.type; }
+
+  /// \brief Returns the data layout of the tensor.
+  /// \return The data layout of the tensor.
+  DataLayout layout() const noexcept { return meta_.layout; }
+
+  /// \brief Returns the data place of the tensor.
+  /// \return The data place of the tensor.
+  const Place& place() const { return storage_->place(); }
+
+  /// \brief Returns the meta information of the tensor.
+  /// \return The meta information of the tensor.
+  const DenseTensorMeta& meta() const noexcept { return meta_; }
+
+  /// \brief Test whether the metadata is valid.
+  /// \return Whether the metadata is valid.
+  bool valid() const noexcept { return meta_.valid(); }
+
+  /// \brief Test whether the storage is allocated.
+  /// return Whether the storage is allocated.
+  bool initialized() const { return storage_->data(); }
+
+  /// \brief Check if storage is shared with other objects.
+  /// \return Whether the storage is shared with other objects.
+  bool IsSharedWith(const DenseTensor& b) const;
+
+  /// \brief Change the dims information in the metadata, and the corresponding
+  /// memory allocation will occur when the `mutable_data` is called.
+  /// \param dims The new dims of the dense tensor.
+  void Resize(const DDim& dims) noexcept { meta_.dims = dims; }
+
+  /// \brief Returns the actual storage size occupied by tensor, may be larger
+  /// than its shape dims.
+  /// \return The actual storage size occupied by tensor.
+  size_t memory_size() const { return storage_->size(); }
+
+  /// \brief Check that the storage area is large enough to hold the data of the
+  /// metadata size, and throw an exception if the conditions are not met.
+  void check_memory_size() const;
+
+  /// \brief Release the storage area for other purposes. Because of the
+  /// destruction of encapsulation, we do not support two dense tensors directly
+  /// sharing the same intrusive pointer.
+  /// \return The rvalue of instrusize pointer releated to the released storage.
+  intrusive_ptr<Storage> release() { return std::move(storage_); }
+
+  /// \brief Get the mutable data pointer value of type T.
+  /// Memory allocation may occur when calling this interface:
+  /// 1. When the storage size is not enough to meet the current shape of the
+  /// data.
+  /// \return The mutable data pointer value of type T.
   template <typename T>
-  const T* data() const {
-    static_assert(std::is_pod<T>::value || std::is_same<T, void>::value,
-                  "T must be POD when call Tensor.data<T>().");
-    return reinterpret_cast<const T*>(data());
-  }
-
-  // NOTE: mutable_data does not hold arguments. Before calling mutable_data,
-  // please make sure that Tensor has maintained
-  // the correct meta and status.
-  //
-  // TODO(chenweihang): We need to be able to specify the allocator when
-  // mutable_data, or directly remove the mutable_data method.
-  // DenseTensor cannot actively apply for memory. Its memory application is
-  // handled by the DeviceContext->AllocateTensorData interface.
-  // I prefer the latter
+  T* mutable_data();
+
+  /// \brief Get the mutable data pointer value of raw type.
+  /// Memory allocation may occur when calling this interface:
+  /// 1. When the storage size is not enough to meet the current shape of the
+  /// data.
+  /// 2. When more request_bytes parameters are used to reserve the data
+  /// storage.
+  /// param request_bytes The bytes to reserve the data storage.
+  /// \return The mutable data pointer value of type T.
+  void* mutable_data(size_t request_bytes = 0);
+
+  /// \brief Get the const data pointer value of type T.
+  /// \return The const data pointer value of type T.
   template <typename T>
-  T* mutable_data() {
-    static_assert(std::is_pod<T>::value,
-                  "T must be POD when call Tensor.mutable_data<T>().");
-    return reinterpret_cast<T*>(mutable_data());
-  }
-
-  // For non-API and non-member interfaces, we still follow the C++ code style?
-
-  void Resize(const DDim& dims) { meta_.dims = dims; }
-
-  void ShareAllocation(const std::shared_ptr<
-                       paddle::memory::allocation::Allocation>& allocation);
+  const T* data() const;
 
-  paddle::platform::Place GetPlaceByBackend() const;
-
-  size_t MemorySize() const;
-
-  void CheckMemorySize() const;
+  /// \brief Get the const data pointer value of raw type.
+  /// \return The const data pointer value of raw type.
+  const void* data() const;
 
  private:
-  // The actual Tensor storage holder
-  std::shared_ptr<paddle::memory::allocation::Allocation> allocation_;
-  // The Tensor meta data
-  TensorMeta meta_;
-  // The Tensor status data
-  TensorStatus status_;
+  DenseTensorMeta meta_;
+  intrusive_ptr<Storage> storage_;
 };
 
 }  // namespace pten
diff --git a/paddle/pten/core/tensor_base.h b/paddle/pten/core/tensor_base.h
index 74cc082646fe2..79fd742aea10b 100644
--- a/paddle/pten/core/tensor_base.h
+++ b/paddle/pten/core/tensor_base.h
@@ -61,8 +61,6 @@ class TensorBase {
   /// return Whether the storage is allocated.
   virtual bool initialized() const = 0;
 
-  virtual paddle::experimental::Backend backend() const { return {}; }
-
   /// \brief Return the type information of the derived class to support
   /// safely downcast in non-rtti environment.
   /// return The type information of the derived class.
diff --git a/paddle/pten/core/tensor_meta.h b/paddle/pten/core/tensor_meta.h
index 8783ee584faf6..b4452a644f152 100644
--- a/paddle/pten/core/tensor_meta.h
+++ b/paddle/pten/core/tensor_meta.h
@@ -28,114 +28,58 @@ limitations under the License. */
 
 namespace pten {
 
-// template <typename T>
-// using Vector = paddle::framework::Vector<T>;
-
-/*
- * LoD is short for Level of Details.
- *
- * - in a level, each element indicates relative offset of the lower level
- * - the first element should be 0 and that indicates that this sequence start
- * from 0
- * - each sequence's begin and end(no-inclusive) is level[id, id+1]
- *
- * For example:
- *    3-level LoD stores
- *
- *    0 2 3
- *    0 2 4 7
- *    0 2 5 7 10 12 15 20
- */
-// using LoD = std::vector<paddle::framework::Vector<size_t>>;
-using LoD = std::vector<std::vector<size_t>>;
 using DDim = paddle::framework::DDim;
-/**
- * The Meta data member of DenseTensor.
- *
- * Here the `meta` represents information describing the basic features and
- * data features of Tensor, and does not include the status information of
- * Tensor
- *
- * Note: TensorMeta is a struct, the members are named like
- * ordinary nonmember variables, such as `type` instead of `type_`.
- * And we direct access its members, in addition to constructor, destructor
- * and functions for setting data members, can not provide other functions.
- */
-struct TensorMeta {
-  TensorMeta() = delete;
-  TensorMeta& operator=(const TensorMeta&) = delete;
-  TensorMeta& operator=(TensorMeta&&) = delete;
-
-  TensorMeta(const TensorMeta&) = default;
-  // TensorMeta(TensorMeta&&) = default;
-
-  TensorMeta(TensorMeta&& meta)
-      : dims(meta.dims),
-        backend(meta.backend),
-        type(meta.type),
-        layout(meta.layout),
-        numel(meta.numel),
-        offset(meta.offset),
-        lod(meta.lod) {}
-
-  // Compatible Contructor
-  TensorMeta(const DDim& dims,
-             Backend backend,
-             DataType type,
-             DataLayout layout,
-             size_t offset = 0UL,
-             const LoD& lod = {})
-      : dims(dims),
-        backend(backend),
-        type(type),
-        layout(layout),
-        offset(offset),
-        lod(lod) {
-    int64_t init_numel = paddle::framework::product(dims);
-    if (init_numel >= 0) {
-      numel = init_numel;
-    }
-  }
-
-  virtual ~TensorMeta() = default;
+using LoD = std::vector<std::vector<size_t>>;
 
+/// \brief The meta data of dense tensor. Take the structure type
+/// and use all default operations.
+///
+struct DenseTensorMeta {
+  using DataType = paddle::experimental::DataType;
+  using DataLayout = paddle::experimental::DataLayout;
+
+  DenseTensorMeta() = default;
+  DenseTensorMeta(DataType type, const DDim& dims);
+  DenseTensorMeta(DataType type, const DDim& dims, DataLayout layout);
+  DenseTensorMeta(DataType type,
+                  const DDim& dims,
+                  DataLayout layout,
+                  const std::vector<std::vector<size_t>>& lod);
+
+  /// \brief Test whether the metadata is valid. Does not throw exceptions.
+  /// \return Whether the metadata is valid.
+  bool valid() const noexcept;
+
+  /// During the entire life cycle of a DenseTensor, the following attributes
+  /// marked with `const` are expected to remain unchanged.
+  const bool is_scalar{false};
   DDim dims;
-
-  Backend backend{Backend::CPU};
-  DataType type{DataType::FLOAT32};
-  DataLayout layout{DataLayout::NCHW};
-
-  /**
-   * [ Why not calculate numel based on dims? ]
-   *
-   * Tensor may be 0-dimensional, but 0-dimensional Tensor may have values.
-   * For example:
-   *
-   *   import paddle
-   *
-   *   a = paddle.to_tensor([1, 2, 3])
-   *   print(a[0].shape) # expected: []
-   *   print(a[0].numel()) # expected: 1
-   *
-   * Now Paddle can not get expected result above, because the old Tensor's
-   * numel is calculated based on dims.
-   */
-  int64_t numel{1};
-
-  size_t offset{0};
-
-  /**
-   * [ Why basic TensorMeta hold LoD? ]
-   *
-   * LoDTensor is still the main Tensor concept in Paddle.
-   * Although only a small number of ops need to use LoD information,
-   * LoD may need to be passed between Op's input and output, which is
-   * difficult to remove in a short time.
-   *
-   * But we don't want to add a Tensor type because of LoD, which makes
-   * the concept complicated, so LoD is a member held by Tensor by default.
-   */
+  const DataType type{DataType::FLOAT32};
+  const DataLayout layout{DataLayout::NCHW};
   LoD lod;
 };
 
+inline DenseTensorMeta::DenseTensorMeta(DataType type, const DDim& dims)
+    : dims(dims), type(type) {}
+
+inline DenseTensorMeta::DenseTensorMeta(DataType type,
+                                        const DDim& dims,
+                                        DataLayout layout)
+    : dims(dims), type(type), layout(layout) {}
+
+inline DenseTensorMeta::DenseTensorMeta(
+    DataType type,
+    const DDim& dims,
+    DataLayout layout,
+    const std::vector<std::vector<size_t>>& lod)
+    : dims(dims), type(type), layout(layout), lod(lod) {}
+
+inline bool DenseTensorMeta::valid() const noexcept {
+  bool valid{true};
+  valid = valid && (type != DataType::UNDEFINED);
+  valid = valid && (layout != DataLayout::UNDEFINED);
+  valid = valid && (is_scalar || product(dims));
+  return valid;
+}
+
 }  // namespace pten
diff --git a/paddle/pten/hapi/CMakeLists.txt b/paddle/pten/hapi/CMakeLists.txt
index 8a33de85bddd3..4b427b3b4a383 100644
--- a/paddle/pten/hapi/CMakeLists.txt
+++ b/paddle/pten/hapi/CMakeLists.txt
@@ -1,3 +1,3 @@
 add_subdirectory(lib)
 
-cc_library(pten_hapi SRCS all.cc DEPS math_api linalg_api creation_api)
+cc_library(pten_hapi SRCS all.cc DEPS linalg_api math_api creation_api)
diff --git a/paddle/pten/hapi/lib/creation.cc b/paddle/pten/hapi/lib/creation.cc
index 5048b983b122f..cda8d24b5e6ad 100644
--- a/paddle/pten/hapi/lib/creation.cc
+++ b/paddle/pten/hapi/lib/creation.cc
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/pten/api/include/core.h"
 #include "paddle/pten/api/include/infershape.h"
 #include "paddle/pten/hapi/lib/kernel_dispatch.h"
+#include "paddle/pten/hapi/lib/utils/allocator.h"
 
 namespace paddle {
 namespace experimental {
@@ -50,10 +51,12 @@ Tensor full_like(const Tensor& x,
   Tensor out;
   // InferDataType
   if (dtype != pten::DataType::UNDEFINED) {
-    out_meta.type = dtype;
+    const_cast<pten::DenseTensorMeta::DataType&>(out_meta.type) = dtype;
   }
-  auto dense_out =
-      std::make_shared<pten::DenseTensor>(out_meta, pten::TensorStatus());
+  const auto allocator =
+      std::make_shared<paddle::experimental::DefaultAllocator>(
+          pten::TransToFluidPlace(kernel_key.backend()));
+  auto dense_out = std::make_shared<pten::DenseTensor>(allocator, out_meta);
   kernel_context.EmplaceBackOutput(dense_out);
   out.set_impl(dense_out);
 
diff --git a/paddle/pten/hapi/lib/linalg.cc b/paddle/pten/hapi/lib/linalg.cc
index 1269702f28f91..54829feb43a24 100644
--- a/paddle/pten/hapi/lib/linalg.cc
+++ b/paddle/pten/hapi/lib/linalg.cc
@@ -24,6 +24,7 @@ limitations under the License. */
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_context.h"
 #include "paddle/pten/hapi/lib/kernel_dispatch.h"
+#include "paddle/pten/hapi/lib/utils/allocator.h"
 #include "paddle/pten/infershape/binary.h"
 
 namespace paddle {
@@ -52,8 +53,9 @@ Tensor dot(const Tensor& x, const Tensor& y) {
 
   // 5. Prepare outputs
   Tensor out;
-  auto dense_out =
-      std::make_shared<pten::DenseTensor>(out_meta, pten::TensorStatus());
+  const auto allocator = std::make_shared<DefaultAllocator>(
+      pten::TransToFluidPlace(kernel_key.backend()));
+  auto dense_out = std::make_shared<pten::DenseTensor>(allocator, out_meta);
   kernel_context.EmplaceBackOutput(dense_out);
   out.set_impl(dense_out);
 
diff --git a/paddle/pten/hapi/lib/manipulation.cc b/paddle/pten/hapi/lib/manipulation.cc
index 4b9b66b9df0bd..fa60bac6d1aed 100644
--- a/paddle/pten/hapi/lib/manipulation.cc
+++ b/paddle/pten/hapi/lib/manipulation.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "glog/logging.h"
 #include "paddle/pten/api/include/core.h"
 #include "paddle/pten/hapi/lib/kernel_dispatch.h"
+#include "paddle/pten/hapi/lib/utils/allocator.h"
 #include "paddle/pten/infershape/unary.h"
 
 namespace paddle {
@@ -46,8 +47,9 @@ Tensor flatten(const Tensor& x, int start_axis, int stop_axis) {
 
   // 5. Prepare outputs
   Tensor out;
-  auto dense_out =
-      std::make_shared<pten::DenseTensor>(out_meta, pten::TensorStatus());
+  const auto allocator = std::make_shared<DefaultAllocator>(
+      pten::TransToFluidPlace(kernel_key.backend()));
+  auto dense_out = std::make_shared<pten::DenseTensor>(allocator, out_meta);
   kernel_context.EmplaceBackOutput(dense_out);
   out.set_impl(dense_out);
 
diff --git a/paddle/pten/hapi/lib/math.cc b/paddle/pten/hapi/lib/math.cc
index 851a9bc155cdd..5e4e96d333030 100644
--- a/paddle/pten/hapi/lib/math.cc
+++ b/paddle/pten/hapi/lib/math.cc
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/pten/api/include/core.h"
 #include "paddle/pten/api/include/infershape.h"
 #include "paddle/pten/hapi/lib/kernel_dispatch.h"
+#include "paddle/pten/hapi/lib/utils/allocator.h"
 #include "paddle/pten/infershape/unary.h"
 
 namespace paddle {
@@ -46,8 +47,10 @@ Tensor mean(const Tensor& x) {
 
   // 5. Prepare outputs
   Tensor out;
-  auto dense_out =
-      std::make_shared<pten::DenseTensor>(out_meta, pten::TensorStatus());
+  const auto allocator =
+      std::make_shared<paddle::experimental::DefaultAllocator>(
+          pten::TransToFluidPlace(kernel_key.backend()));
+  auto dense_out = std::make_shared<pten::DenseTensor>(allocator, out_meta);
   kernel_context.EmplaceBackOutput(dense_out);
   out.set_impl(dense_out);
 
diff --git a/paddle/pten/hapi/lib/utils/CMakeLists.txt b/paddle/pten/hapi/lib/utils/CMakeLists.txt
index 4ab33a10dcdc4..c89ef812846ad 100644
--- a/paddle/pten/hapi/lib/utils/CMakeLists.txt
+++ b/paddle/pten/hapi/lib/utils/CMakeLists.txt
@@ -1,3 +1,4 @@
 add_subdirectory(tests)
 
-cc_library(pten_hapi_utils SRCS allocator.cc storage tensor_utils DEPS tensor_base pten_dense_tensor pten_utils)
+cc_library(pten_hapi_utils SRCS allocator.cc storage.cc tensor_utils.cc DEPS tensor_base convert_utils
+dense_tensor lod_tensor selected_rows place var_type_traits)
diff --git a/paddle/pten/hapi/lib/utils/tensor_utils.cc b/paddle/pten/hapi/lib/utils/tensor_utils.cc
index be7feebe8c206..2fb39852702c2 100644
--- a/paddle/pten/hapi/lib/utils/tensor_utils.cc
+++ b/paddle/pten/hapi/lib/utils/tensor_utils.cc
@@ -15,5 +15,113 @@ limitations under the License. */
 #include "paddle/pten/hapi/lib/utils/tensor_utils.h"
 
 namespace paddle {
-namespace experimental {}  // namespace experimental
+namespace experimental {
+
+template <typename DstLoD, typename SrcLoD>
+void SetLoD(DstLoD* dst, const SrcLoD& src) {
+  dst->reserve(src.size());
+  dst->clear();
+  for (auto&& v : src) {
+    dst->emplace_back(v);
+  }
+}
+
+std::unique_ptr<pten::DenseTensor> MakePtenDenseTensor(
+    const paddle::framework::Tensor& src) {
+  pten::DenseTensorMeta meta{pten::TransToPtenDataType(src.type()),
+                             src.dims(),
+                             pten::TransToPtenDataLayout(src.layout())};
+  auto shared_storage = pten::make_intrusive<SharedStorage>(src.Holder());
+  return std::make_unique<pten::DenseTensor>(std::move(shared_storage),
+                                             std::move(meta));
+}
+
+std::unique_ptr<pten::DenseTensor> MakePtenDenseTensor(
+    const paddle::framework::LoDTensor& src) {
+  pten::DenseTensorMeta meta{pten::TransToPtenDataType(src.type()),
+                             src.dims(),
+                             pten::TransToPtenDataLayout(src.layout())};
+  SetLoD(&meta.lod, src.lod());
+  auto shared_storage = pten::make_intrusive<SharedStorage>(src.Holder());
+  return std::make_unique<pten::DenseTensor>(std::move(shared_storage),
+                                             std::move(meta));
+}
+
+std::unique_ptr<pten::TensorBase> MakePtenTensorBaseFromVar(
+    const framework::Variable& variable, const pten::TensorArgDef& arg_def) {
+  auto expected_place = pten::TransToFluidPlace(arg_def.backend);
+
+  if (variable.IsType<framework::LoDTensor>()) {
+    const auto& tensor = variable.Get<framework::LoDTensor>();
+    if (!platform::is_same_place(tensor.place(), expected_place)) {
+      framework::LoDTensor tmp_tensor;
+      framework::TensorCopySync(tensor, expected_place, &tmp_tensor);
+      return MakePtenDenseTensor(tmp_tensor);
+    } else {
+      return MakePtenDenseTensor(tensor);
+    }
+  } else if (variable.IsType<framework::SelectedRows>()) {
+    // TODO(chenweihang): now we don't deal with row and height
+    // by xiaowei's advice
+    const auto& tensor = variable.Get<framework::SelectedRows>();
+    if (!platform::is_same_place(tensor.value().place(), expected_place)) {
+      framework::Tensor tmp_tensor;
+      TensorCopySync(tensor.value(), expected_place, &tmp_tensor);
+      // TODO(chenweihang): adapt SelectedRows by xiaowei's design
+      return MakePtenDenseTensor(tmp_tensor);
+    } else {
+      return MakePtenDenseTensor(tensor.value());
+    }
+  } else {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Unsupported shared input `%s` type now when call pt kernel.",
+        framework::ToTypeName(variable.Type())));
+  }
+  return {};
+}
+
+std::unique_ptr<pten::TensorBase> MakePtenTensorBaseFromVar(
+    framework::Variable* variable, const pten::TensorArgDef& arg_def) {
+  // mutable_data before run kernel, to avoid share output form
+  // KernelContext to original tensor
+  if (variable->template IsType<framework::LoDTensor>()) {
+    auto* tensor = variable->template GetMutable<framework::LoDTensor>();
+    tensor->mutable_data(pten::TransToFluidPlace(arg_def.backend),
+                         pten::TransToProtoVarType(arg_def.dtype));
+    return MakePtenDenseTensor(*tensor);
+  } else if (variable->template IsType<framework::SelectedRows>()) {
+    auto* tensor = variable->template GetMutable<framework::SelectedRows>();
+    tensor->mutable_value()->mutable_data(
+        pten::TransToFluidPlace(arg_def.backend),
+        pten::TransToProtoVarType(arg_def.dtype));
+    // TODO(chenweihang): adapt SelectedRows by xiaowei's design,
+    // here the row and height will lost in output!
+    return MakePtenDenseTensor(tensor->value());
+  } else {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Unsupported shared output `%s` type now when call pt kernel.",
+        framework::ToTypeName(variable->Type())));
+  }
+  return {};
+}
+
+void MovesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst) {
+  CHECK(src);
+  CHECK(dst);
+  dst->Resize(src->dims());
+  auto storage = src->release();
+  CHECK(storage->OwnsMemory());
+  std::shared_ptr<paddle::memory::allocation::Allocation> holder(
+      new TensorStorage(std::move(storage)));
+  dst->ResetHolderWithType(holder, pten::TransToProtoVarType(src->data_type()));
+}
+
+void MovesStorage(pten::DenseTensor* src, paddle::framework::LoDTensor* dst) {
+  CHECK(src);
+  CHECK(dst);
+  SetLoD(dst->mutable_lod(), src->lod());
+  MovesStorage(src, static_cast<paddle::framework::Tensor*>(dst));
+}
+
+}  // namespace experimental
 }  // namespace paddle
diff --git a/paddle/pten/hapi/lib/utils/tensor_utils.h b/paddle/pten/hapi/lib/utils/tensor_utils.h
index c9d2f8ca32963..a2b2688362a4c 100644
--- a/paddle/pten/hapi/lib/utils/tensor_utils.h
+++ b/paddle/pten/hapi/lib/utils/tensor_utils.h
@@ -17,64 +17,32 @@ limitations under the License. */
 #include <memory>
 
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/variable.h"
 
-#include "paddle/pten/core/candidate/dense_tensor.h"
 #include "paddle/pten/core/convert_utils.h"
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_factory.h"
 #include "paddle/pten/hapi/lib/utils/allocator.h"
 #include "paddle/pten/hapi/lib/utils/storage.h"
 
 namespace paddle {
 namespace experimental {
 
-using namespace pten::candidate;  // NOLINT
+std::unique_ptr<pten::DenseTensor> MakePtenDenseTensor(
+    const paddle::framework::Tensor& src);
 
-template <typename DstLoD, typename SrcLoD>
-void SetLoD(DstLoD* dst, const SrcLoD& src) {
-  dst->reserve(src.size());
-  dst->clear();
-  for (auto&& v : src) {
-    dst->emplace_back(v);
-  }
-}
+std::unique_ptr<pten::DenseTensor> MakePtenDenseTensor(
+    const paddle::framework::LoDTensor& src);
 
-std::shared_ptr<DenseTensor> MakeSharedDenseTensor(
-    const paddle::framework::Tensor& src) {
-  DenseTensorMeta meta{pten::TransToPtenDataType(src.type()),
-                       src.dims(),
-                       pten::TransToPtenDataLayout(src.layout())};
-  auto shared_storage = pten::make_intrusive<SharedStorage>(src.Holder());
-  return std::make_shared<DenseTensor>(std::move(shared_storage),
-                                       std::move(meta));
-}
+std::unique_ptr<pten::TensorBase> MakePtenTensorBaseFromVar(
+    const framework::Variable& variable, const pten::TensorArgDef& arg_def);
 
-std::shared_ptr<DenseTensor> MakeSharedDenseTensor(
-    const paddle::framework::LoDTensor& src) {
-  DenseTensorMeta meta{pten::TransToPtenDataType(src.type()),
-                       src.dims(),
-                       pten::TransToPtenDataLayout(src.layout())};
-  SetLoD(&meta.lod, src.lod());
-  auto shared_storage = pten::make_intrusive<SharedStorage>(src.Holder());
-  return std::make_shared<DenseTensor>(std::move(shared_storage),
-                                       std::move(meta));
-}
+std::unique_ptr<pten::TensorBase> MakePtenTensorBaseFromVar(
+    framework::Variable* variable, const pten::TensorArgDef& arg_def);
 
-void MovesStorage(DenseTensor* src, paddle::framework::Tensor* dst) {
-  CHECK(src);
-  CHECK(dst);
-  dst->Resize(src->dims());
-  auto storage = src->release();
-  CHECK(storage->OwnsMemory());
-  std::shared_ptr<paddle::memory::allocation::Allocation> holder(
-      new TensorStorage(std::move(storage)));
-  dst->ResetHolderWithType(holder, pten::TransToProtoVarType(src->data_type()));
-}
+void MovesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst);
 
-void MovesStorage(DenseTensor* src, paddle::framework::LoDTensor* dst) {
-  CHECK(src);
-  CHECK(dst);
-  SetLoD(dst->mutable_lod(), src->lod());
-  MovesStorage(src, static_cast<paddle::framework::Tensor*>(dst));
-}
+void MovesStorage(pten::DenseTensor* src, paddle::framework::LoDTensor* dst);
 
 }  // namespace experimental
 }  // namespace paddle
diff --git a/paddle/pten/hapi/lib/utils/tests/test_tensor_utils.cc b/paddle/pten/hapi/lib/utils/tests/test_tensor_utils.cc
index f45537508d29a..56184eec70f26 100644
--- a/paddle/pten/hapi/lib/utils/tests/test_tensor_utils.cc
+++ b/paddle/pten/hapi/lib/utils/tests/test_tensor_utils.cc
@@ -24,8 +24,8 @@ using DDim = paddle::framework::DDim;
 using DataType = paddle::experimental::DataType;
 using DataLayout = paddle::experimental::DataLayout;
 
-using DenseTensor = pten::candidate::DenseTensor;
-using DenseTensorMeta = pten::candidate::DenseTensorMeta;
+using DenseTensor = pten::DenseTensor;
+using DenseTensorMeta = pten::DenseTensorMeta;
 
 TEST(tensor_utils, dense_tensor_to_lod_tensor) {
   const DDim dims({2, 1});
@@ -56,7 +56,7 @@ TEST(tensor_utils, dense_tensor_to_lod_tensor) {
   CHECK(lod_tensor.data<float>()[0] == 1.0f);
   CHECK(lod_tensor.data<float>()[1] == 2.1f);
 
-  auto dense_tensor_1 = MakeSharedDenseTensor(lod_tensor);
+  auto dense_tensor_1 = MakePtenDenseTensor(lod_tensor);
   CHECK(dense_tensor_1->dims() == dims);
   CHECK(dense_tensor_1->data_type() == dtype);
   CHECK(dense_tensor_1->layout() == layout);
@@ -90,7 +90,7 @@ TEST(tensor_utils, dense_tensor_to_tensor) {
   CHECK(tensor.data<float>()[0] == 1.0f);
   CHECK(tensor.data<float>()[1] == 2.1f);
 
-  auto dense_tensor_1 = MakeSharedDenseTensor(tensor);
+  auto dense_tensor_1 = MakePtenDenseTensor(tensor);
   CHECK(dense_tensor_1->dims() == dims);
   CHECK(dense_tensor_1->data_type() == dtype);
   CHECK(dense_tensor_1->layout() == layout);
@@ -99,6 +99,27 @@ TEST(tensor_utils, dense_tensor_to_tensor) {
   CHECK(data_1[1] == 2.1f);
 }
 
+TEST(PtenUtils, VarToPtTensor) {
+  // 1. create Variable
+  paddle::framework::Variable v;
+  auto selected_rows = v.GetMutable<paddle::framework::SelectedRows>();
+  paddle::framework::Tensor* value = selected_rows->mutable_value();
+  auto* data = value->mutable_data<int>(paddle::framework::make_ddim({1, 1}),
+                                        paddle::platform::CPUPlace());
+  data[0] = 123;
+  pten::Backend expect_backend = pten::Backend::CPU;
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  expect_backend = pten::Backend::CUDA;
+#endif
+  auto tensor_def = pten::TensorArgDef(
+      expect_backend, pten::DataLayout::NCHW, pten::DataType::INT32);
+  // 2. test API
+  auto tensor_x = MakePtenTensorBaseFromVar(v, tensor_def);
+  // 3. check result
+  ASSERT_EQ(tensor_x->data_type(), pten::DataType::INT32);
+}
+
 }  // namespace tests
 }  // namespace experimental
 }  // namespace paddle
diff --git a/paddle/pten/infershape/binary.cc b/paddle/pten/infershape/binary.cc
index 7d224835cc05a..c2b88c74d847e 100644
--- a/paddle/pten/infershape/binary.cc
+++ b/paddle/pten/infershape/binary.cc
@@ -17,7 +17,8 @@ limitations under the License. */
 
 namespace pten {
 
-TensorMeta DotInferShape(const TensorMeta& x_meta, const TensorMeta& y_meta) {
+DenseTensorMeta DotInferShape(const DenseTensorMeta& x_meta,
+                              const DenseTensorMeta& y_meta) {
   auto x_dims = x_meta.dims;
   auto x_rank = static_cast<size_t>(x_dims.size());
   PADDLE_ENFORCE_EQ(true,
@@ -54,8 +55,7 @@ TensorMeta DotInferShape(const TensorMeta& x_meta, const TensorMeta& y_meta) {
                         y_dims.to_str()));
 
   x_dims[x_dims.size() - 1] = 1;
-  TensorMeta return_meta(
-      x_dims, x_meta.backend, x_meta.type, x_meta.layout, x_meta.offset);
+  DenseTensorMeta return_meta(x_meta.type, x_dims, x_meta.layout);
   return return_meta;
 }
 
diff --git a/paddle/pten/infershape/binary.h b/paddle/pten/infershape/binary.h
index 8e44b520e0a9f..613d2f66a6edd 100644
--- a/paddle/pten/infershape/binary.h
+++ b/paddle/pten/infershape/binary.h
@@ -21,15 +21,19 @@ namespace pten {
 
 // Common InferShape Functions for binary operators, The format like:
 //
-//   1. TensorMeta [OpName]InferShape(const TensorMeta& x_meta, ...) {}
-//   2. std::pair<TensorMeta, TensorMeta> [OpName]InferShape(const TensorMeta&
+//   1. DenseTensorMeta [OpName]InferShape(const DenseTensorMeta& x_meta, ...)
+//   {}
+//   2. std::pair<DenseTensorMeta, DenseTensorMeta> [OpName]InferShape(const
+//   DenseTensorMeta&
 //   x_meta, ...) {}
-//   3. std::tuple<TensorMeta, TensorMeta, TensorMeta> [OpName]InferShape(const
-//   TensorMeta& x_meta, ...)
+//   3. std::tuple<DenseTensorMeta, DenseTensorMeta, DenseTensorMeta>
+//   [OpName]InferShape(const
+//   DenseTensorMeta& x_meta, ...)
 //  NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good.
 //  Because functions in this file
 //  not only can infer shape, but alse need infer lod or other useful data.
 
-TensorMeta DotInferShape(const TensorMeta& x_meta, const TensorMeta& y_meta);
+DenseTensorMeta DotInferShape(const DenseTensorMeta& x_meta,
+                              const DenseTensorMeta& y_meta);
 
 }  // namespace pten
diff --git a/paddle/pten/infershape/unary.cc b/paddle/pten/infershape/unary.cc
index 57e74345b7d42..4e743261b5906 100644
--- a/paddle/pten/infershape/unary.cc
+++ b/paddle/pten/infershape/unary.cc
@@ -17,18 +17,19 @@ limitations under the License. */
 
 namespace pten {
 
-TensorMeta UnchangedInferShape(const TensorMeta& x_meta) { return x_meta; }
+DenseTensorMeta UnchangedInferShape(const DenseTensorMeta& x_meta) {
+  return x_meta;
+}
 
-TensorMeta ReductionInferShape(const TensorMeta& x_meta) {
+DenseTensorMeta ReductionInferShape(const DenseTensorMeta& x_meta) {
   const auto& out_dims = paddle::framework::make_ddim({1});
-  TensorMeta return_meta(
-      out_dims, x_meta.backend, x_meta.type, x_meta.layout, x_meta.offset);
+  DenseTensorMeta return_meta(x_meta.type, out_dims, x_meta.layout);
   return return_meta;
 }
 
-TensorMeta FlattenInferShape(const TensorMeta& x_meta,
-                             int start_axis,
-                             int stop_axis) {
+DenseTensorMeta FlattenInferShape(const DenseTensorMeta& x_meta,
+                                  int start_axis,
+                                  int stop_axis) {
   auto& x_dims = x_meta.dims;
   int in_dims_size = x_dims.size();
   if (start_axis < 0) {
@@ -62,8 +63,7 @@ TensorMeta FlattenInferShape(const TensorMeta& x_meta,
     out_shape.push_back(x_dims[i]);
   }
   const auto& out_dims = paddle::framework::make_ddim(out_shape);
-  TensorMeta return_meta(
-      out_dims, x_meta.backend, x_meta.type, x_meta.layout, x_meta.offset);
+  DenseTensorMeta return_meta(x_meta.type, out_dims, x_meta.layout);
 
   if (x_dims[0] == return_meta.dims[0]) {
     // Only pass LoD when the first dimension of output and Input(X)
diff --git a/paddle/pten/infershape/unary.h b/paddle/pten/infershape/unary.h
index 1d8fac05d0eaa..1db0b094eba3a 100644
--- a/paddle/pten/infershape/unary.h
+++ b/paddle/pten/infershape/unary.h
@@ -21,21 +21,24 @@ namespace pten {
 
 // Common InferShape Functions for unary operators, The format like:
 //
-//   1. TensorMeta [OpName]InferShape(const TensorMeta& x_meta, ...) {}
-//   2. std::pair<TensorMeta, TensorMeta> [OpName]InferShape(const TensorMeta&
+//   1. DenseTensorMeta [OpName]InferShape(const DenseTensorMeta& x_meta, ...)
+//   {}
+//   2. std::pair<DenseTensorMeta, DenseTensorMeta> [OpName]InferShape(const
+//   DenseTensorMeta&
 //   x_meta, ...) {}
-//   3. std::tuple<TensorMeta, TensorMeta, TensorMeta> [OpName]InferShape(const
-//   TensorMeta& x_meta, ...)
+//   3. std::tuple<DenseTensorMeta, DenseTensorMeta, DenseTensorMeta>
+//   [OpName]InferShape(const
+//   DenseTensorMeta& x_meta, ...)
 //  NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good.
 //  Because functions in this file
 //  not only can infer shape, but alse need infer lod or other useful data.
 
-TensorMeta UnchangedInferShape(const TensorMeta& x_meta);
+DenseTensorMeta UnchangedInferShape(const DenseTensorMeta& x_meta);
 
-TensorMeta ReductionInferShape(const TensorMeta& x_meta);
+DenseTensorMeta ReductionInferShape(const DenseTensorMeta& x_meta);
 
-TensorMeta FlattenInferShape(const TensorMeta& x_meta,
-                             int start_axis,
-                             int stop_axis);
+DenseTensorMeta FlattenInferShape(const DenseTensorMeta& x_meta,
+                                  int start_axis,
+                                  int stop_axis);
 
 }  // namespace pten
diff --git a/paddle/pten/kernels/cpu/CMakeLists.txt b/paddle/pten/kernels/cpu/CMakeLists.txt
index ad18a2f555265..2c4a424e48492 100644
--- a/paddle/pten/kernels/cpu/CMakeLists.txt
+++ b/paddle/pten/kernels/cpu/CMakeLists.txt
@@ -1,5 +1,5 @@
 cc_library(math_cpu SRCS math.cc DEPS dense_tensor kernel_context kernel_factory eigen_function)
 cc_library(linalg_cpu SRCS linalg.cc DEPS dense_tensor kernel_context kernel_factory)
 cc_library(creation_cpu SRCS creation.cc DEPS dense_tensor kernel_context kernel_factory eigen_function)
-cc_library(utils_cpu SRCS utils.cc DEPS dense_tensor kernel_context kernel_factory memory)
+cc_library(utils_cpu SRCS utils.cc DEPS dense_tensor kernel_context kernel_factory memory convert_utils)
 cc_library(manipulation_cpu SRCS manipulation.cc DEPS dense_tensor kernel_context kernel_factory utils_cpu unary)
diff --git a/paddle/pten/kernels/cpu/manipulation.cc b/paddle/pten/kernels/cpu/manipulation.cc
index ac7a8eaba8cf5..c436e14e0caab 100644
--- a/paddle/pten/kernels/cpu/manipulation.cc
+++ b/paddle/pten/kernels/cpu/manipulation.cc
@@ -26,7 +26,7 @@ void Flatten(const CPUContext& dev_ctx,
              DenseTensor* out) {
   auto out_meta = FlattenInferShape(x.meta(), start_axis, stop_axis);
   pten::Copy(dev_ctx, x, out);
-  out->mutable_meta()->lod = out_meta.lod;
+  out->set_lod(out_meta.lod);
   out->Resize(out_meta.dims);
 }
 
@@ -47,8 +47,8 @@ void FlattenWithXShape(const CPUContext& dev_ctx,
   for (int i = 0; i < in_dims.size(); ++i) {
     xshape_dims[i + 1] = in_dims[i];
   }
-  xshape->mutable_meta()->dims = paddle::framework::make_ddim(xshape_dims);
-  xshape->mutable_meta()->lod = x.meta().lod;
+  xshape->Resize(paddle::framework::make_ddim(xshape_dims));
+  xshape->set_lod(x.lod());
 }
 
 }  // namespace pten
diff --git a/paddle/pten/kernels/cpu/utils.cc b/paddle/pten/kernels/cpu/utils.cc
index b17b6512178d0..1f9d675deafa2 100644
--- a/paddle/pten/kernels/cpu/utils.cc
+++ b/paddle/pten/kernels/cpu/utils.cc
@@ -24,7 +24,6 @@ void Copy(const CPUContext& dev_ctx, const DenseTensor& src, DenseTensor* dst) {
   auto* dst_ptr = dst->mutable_data();
   const auto& src_place = src.place();
   const auto& dst_place = dst->place();
-  src.CheckMemorySize();
 
   if (src_ptr == dst_ptr && src_place == dst_place) {
     VLOG(3) << "Skip copy the same data async from " << src_place << " to "
@@ -36,7 +35,7 @@ void Copy(const CPUContext& dev_ctx, const DenseTensor& src, DenseTensor* dst) {
   VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
           << dst_place;
   dst->Resize(src.dims());
-  dst->mutable_meta()->layout = src.meta().layout;
+  CHECK(dst->layout() == src.layout());
   auto size = src.numel() * paddle::framework::SizeOfType(
                                 TransToProtoVarType(src.data_type()));
 
diff --git a/paddle/pten/kernels/cuda/CMakeLists.txt b/paddle/pten/kernels/cuda/CMakeLists.txt
index 54df37ecb5e26..9e86d9521c99a 100644
--- a/paddle/pten/kernels/cuda/CMakeLists.txt
+++ b/paddle/pten/kernels/cuda/CMakeLists.txt
@@ -2,12 +2,12 @@ if(WITH_GPU)
   nv_library(math_cuda SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory)
   nv_library(linalg_cuda SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
   nv_library(creation_cuda SRCS creation.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
-  nv_library(utils_cuda SRCS utils.cu DEPS dense_tensor kernel_context kernel_factory memory)
+  nv_library(utils_cuda SRCS utils.cu DEPS dense_tensor kernel_context kernel_factory memory convert_utils)
   nv_library(manipulation_cuda SRCS manipulation.cu DEPS dense_tensor kernel_context kernel_factory utils_cuda unary)
 elseif(WITH_ROCM)
   hip_library(math_cuda SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory)
   hip_library(linalg_cuda SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
   hip_library(creation_cuda SRCS creation.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
-  hip_library(utils_cuda SRCS utils.cu DEPS dense_tensor kernel_context kernel_factory memory)
+  hip_library(utils_cuda SRCS utils.cu DEPS dense_tensor kernel_context kernel_factory memory convert_utils)
   hip_library(manipulation_cuda SRCS manipulation.cu DEPS dense_tensor kernel_context kernel_factory utils_cuda unary)
 endif()
diff --git a/paddle/pten/kernels/cuda/manipulation.cu b/paddle/pten/kernels/cuda/manipulation.cu
index 13bc109faaba3..43614f859c58b 100644
--- a/paddle/pten/kernels/cuda/manipulation.cu
+++ b/paddle/pten/kernels/cuda/manipulation.cu
@@ -26,7 +26,7 @@ void Flatten(const CUDAContext& dev_ctx,
              DenseTensor* out) {
   auto out_meta = FlattenInferShape(x.meta(), start_axis, stop_axis);
   pten::Copy(dev_ctx, x, out);
-  out->mutable_meta()->lod = out_meta.lod;
+  out->set_lod(out_meta.lod);
   out->Resize(out_meta.dims);
 }
 
@@ -47,8 +47,8 @@ void FlattenWithXShape(const CUDAContext& dev_ctx,
   for (int i = 0; i < in_dims.size(); ++i) {
     xshape_dims[i + 1] = in_dims[i];
   }
-  xshape->mutable_meta()->dims = paddle::framework::make_ddim(xshape_dims);
-  xshape->mutable_meta()->lod = x.meta().lod;
+  xshape->Resize(paddle::framework::make_ddim(xshape_dims));
+  xshape->set_lod(x.lod());
 }
 
 }  // namespace pten
diff --git a/paddle/pten/kernels/cuda/math.cu b/paddle/pten/kernels/cuda/math.cu
index 4ebe58629545e..1f2a34ea505c2 100644
--- a/paddle/pten/kernels/cuda/math.cu
+++ b/paddle/pten/kernels/cuda/math.cu
@@ -30,6 +30,7 @@ namespace cub = hipcub;
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/pten/core/convert_utils.h"
 #include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/hapi/lib/utils/tensor_utils.h"
 
 namespace pten {
 
@@ -75,16 +76,21 @@ void Mean(const CUDAContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
       nullptr, temp_storage_bytes, trans_x, out_data, size_prob, stream);
   PADDLE_ENFORCE_CUDA_SUCCESS(err);
 
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      dev_ctx.GetPlace());
   pten::DenseTensor tmp(
-      TensorMeta(paddle::framework::make_ddim(
-                     {static_cast<int64_t>(temp_storage_bytes)}),
-                 pten::TransToPtenBackend(dev_ctx.GetPlace()),
-                 x.data_type(),
-                 x.layout()),
-      TensorStatus());
-  auto* temp_storage = tmp.mutable_data<uint8_t>();
-  err = cub::DeviceReduce::Sum(
-      temp_storage, temp_storage_bytes, trans_x, out_data, size_prob, stream);
+      alloc,
+      DenseTensorMeta(x.data_type(),
+                      paddle::framework::make_ddim(
+                          {static_cast<int64_t>(temp_storage_bytes)}),
+                      x.layout()));
+  void* temp_storage = tmp.mutable_data<T>();
+  err = cub::DeviceReduce::Sum(static_cast<uint8_t*>(temp_storage),
+                               temp_storage_bytes,
+                               trans_x,
+                               out_data,
+                               size_prob,
+                               stream);
   PADDLE_ENFORCE_CUDA_SUCCESS(err);
 }
 
diff --git a/paddle/pten/kernels/cuda/utils.cu b/paddle/pten/kernels/cuda/utils.cu
index 74e070880e106..e81e00a5873f7 100644
--- a/paddle/pten/kernels/cuda/utils.cu
+++ b/paddle/pten/kernels/cuda/utils.cu
@@ -27,7 +27,6 @@ void Copy(const CUDAContext& dev_ctx,
   auto* dst_ptr = dst->mutable_data();
   const auto& src_place = src.place();
   const auto& dst_place = dst->place();
-  src.CheckMemorySize();
 
   if (src_ptr == dst_ptr && src_place == dst_place) {
     VLOG(3) << "Skip copy the same data async from " << src_place << " to "
@@ -39,7 +38,7 @@ void Copy(const CUDAContext& dev_ctx,
   VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
           << dst_place;
   dst->Resize(src.dims());
-  dst->mutable_meta()->layout = src.meta().layout;
+  CHECK(dst->layout() == src.layout());
   auto size = src.numel() * paddle::framework::SizeOfType(
                                 TransToProtoVarType(src.data_type()));
 
diff --git a/paddle/pten/kernels/functions/eigen/dot.h b/paddle/pten/kernels/functions/eigen/dot.h
index 605517bad6a9a..300da4ae1f13b 100644
--- a/paddle/pten/kernels/functions/eigen/dot.h
+++ b/paddle/pten/kernels/functions/eigen/dot.h
@@ -28,7 +28,6 @@ void Dot(const DevCtx& dev_ctx,
          const DenseTensor& x,
          const DenseTensor& y,
          DenseTensor* out) {
-  out->mutable_data();
   if (1 == out->dims().size()) {
     auto eigen_out = pten::EigenScalar<T>::From(*out);
     auto eigen_x = pten::EigenVector<T>::Flatten(x);
diff --git a/paddle/pten/kernels/functions/eigen/mean.h b/paddle/pten/kernels/functions/eigen/mean.h
index 574a1957ae558..ee4bf1653f23a 100644
--- a/paddle/pten/kernels/functions/eigen/mean.h
+++ b/paddle/pten/kernels/functions/eigen/mean.h
@@ -25,8 +25,6 @@ namespace eigen {
 
 template <typename DevCtx, typename T>
 void Mean(const DevCtx& dev_ctx, const DenseTensor& x, DenseTensor* out) {
-  out->mutable_data<T>();
-
   // TODO(chenweihang): if we design new tensor, we should support
   // the low-level calc functor use new tensor as input,
   // which may be a big project!
diff --git a/paddle/pten/tests/CMakeLists.txt b/paddle/pten/tests/CMakeLists.txt
index d30ac2578d00b..21ce2f74df945 100644
--- a/paddle/pten/tests/CMakeLists.txt
+++ b/paddle/pten/tests/CMakeLists.txt
@@ -3,8 +3,8 @@ cc_test(pten_data_layout_test SRCS data_layout_test.cc DEPS gtest)
 cc_test(pten_data_type_test SRCS data_type_test.cc DEPS gtest)
 cc_test(dense_tensor_test SRCS dense_tensor_test.cc DEPS dense_tensor)
 cc_test(kernel_factory_test SRCS kernel_factory_test.cc DEPS kernel_factory)
-cc_test(test_mean_api SRCS test_mean_api.cc DEPS math_api)
-cc_test(test_dot_api SRCS test_dot_api.cc DEPS linalg_api)
-cc_test(test_fill_api SRCS test_fill_api.cc DEPS creation_api)
-cc_test(test_copy_api SRCS test_copy_api.cc DEPS utils_cpu)
-cc_test(test_flatten_api SRCS test_flatten_api.cc DEPS utils_cpu manipulation_api)
+cc_test(test_mean_api SRCS test_mean_api.cc DEPS math_api pten_hapi_utils)
+cc_test(test_dot_api SRCS test_dot_api.cc DEPS linalg_api pten_hapi_utils)
+cc_test(test_fill_api SRCS test_fill_api.cc DEPS creation_api pten_hapi_utils)
+cc_test(test_copy_api SRCS test_copy_api.cc DEPS utils_cpu pten_hapi_utils)
+cc_test(test_flatten_api SRCS test_flatten_api.cc DEPS utils_cpu manipulation_api pten_hapi_utils)
diff --git a/paddle/pten/tests/dense_tensor_test.cc b/paddle/pten/tests/dense_tensor_test.cc
index 722eab17ec412..e74917263dafb 100644
--- a/paddle/pten/tests/dense_tensor_test.cc
+++ b/paddle/pten/tests/dense_tensor_test.cc
@@ -18,16 +18,3 @@ limitations under the License. */
 
 namespace framework = paddle::framework;
 using DDim = paddle::framework::DDim;
-
-TEST(DenseTensor, Constructor) {
-  pten::DenseTensor tensor(pten::TensorMeta(framework::make_ddim({5, 10}),
-                                            pten::Backend::CPU,
-                                            pten::DataType::FLOAT32,
-                                            pten::DataLayout::NCHW,
-                                            0UL),
-                           pten::TensorStatus());
-  ASSERT_EQ(tensor.dims().size(), 2);
-  ASSERT_EQ(tensor.backend(), pten::Backend::CPU);
-  ASSERT_EQ(tensor.data_type(), pten::DataType::FLOAT32);
-  ASSERT_EQ(tensor.layout(), pten::DataLayout::NCHW);
-}
diff --git a/paddle/pten/tests/test_copy_api.cc b/paddle/pten/tests/test_copy_api.cc
index 39533c73a2564..fcebe9a310dea 100644
--- a/paddle/pten/tests/test_copy_api.cc
+++ b/paddle/pten/tests/test_copy_api.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/pten/kernels/cpu/utils.h"
 
 #include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/hapi/lib/utils/allocator.h"
 
 PT_DECLARE_MODULE(UtilsCPU);
 
@@ -30,20 +31,20 @@ using DDim = paddle::framework::DDim;
 // 'paddle/api',
 TEST(API, copy) {
   // 1. create tensor
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
   auto dense_src = std::make_shared<pten::DenseTensor>(
-      pten::TensorMeta(framework::make_ddim({2, 3}),
-                       pten::Backend::CPU,
-                       pten::DataType::FLOAT32,
-                       pten::DataLayout::NCHW),
-      pten::TensorStatus());
+      alloc,
+      pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                            framework::make_ddim({2, 3}),
+                            pten::DataLayout::NCHW));
   auto* dense_x_data = dense_src->mutable_data<float>();
 
   auto dense_dst = std::make_shared<pten::DenseTensor>(
-      pten::TensorMeta(framework::make_ddim({2, 3}),
-                       pten::Backend::CPU,
-                       pten::DataType::FLOAT32,
-                       pten::DataLayout::NCHW),
-      pten::TensorStatus());
+      alloc,
+      pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                            framework::make_ddim({2, 3}),
+                            pten::DataLayout::NCHW));
 
   for (size_t i = 0; i < 2; ++i) {
     for (size_t j = 0; j < 3; ++j) {
diff --git a/paddle/pten/tests/test_dot_api.cc b/paddle/pten/tests/test_dot_api.cc
index affa18469ec21..69e785904fe3c 100644
--- a/paddle/pten/tests/test_dot_api.cc
+++ b/paddle/pten/tests/test_dot_api.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/hapi/lib/utils/allocator.h"
 
 PT_DECLARE_MODULE(LinalgCPU);
 
@@ -32,20 +33,20 @@ using DDim = paddle::framework::DDim;
 // TODO(chenweihang): Remove this test after the API is used in the dygraph
 TEST(API, dot) {
   // 1. create tensor
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
   auto dense_x = std::make_shared<pten::DenseTensor>(
-      pten::TensorMeta(framework::make_ddim({3, 10}),
-                       pten::Backend::CPU,
-                       pten::DataType::FLOAT32,
-                       pten::DataLayout::NCHW),
-      pten::TensorStatus());
+      alloc,
+      pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                            framework::make_ddim({3, 10}),
+                            pten::DataLayout::NCHW));
   auto* dense_x_data = dense_x->mutable_data<float>();
 
   auto dense_y = std::make_shared<pten::DenseTensor>(
-      pten::TensorMeta(framework::make_ddim({3, 10}),
-                       pten::Backend::CPU,
-                       pten::DataType::FLOAT32,
-                       pten::DataLayout::NCHW),
-      pten::TensorStatus());
+      alloc,
+      pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                            framework::make_ddim({3, 10}),
+                            pten::DataLayout::NCHW));
   auto* dense_y_data = dense_y->mutable_data<float>();
 
   float sum[3] = {0.0, 0.0, 0.0};
diff --git a/paddle/pten/tests/test_fill_api.cc b/paddle/pten/tests/test_fill_api.cc
index afb36f95e8a1e..c19d14efaa976 100644
--- a/paddle/pten/tests/test_fill_api.cc
+++ b/paddle/pten/tests/test_fill_api.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/hapi/lib/utils/allocator.h"
 
 PT_DECLARE_MODULE(CreationCPU);
 
@@ -32,12 +33,14 @@ using DDim = paddle::framework::DDim;
 // TODO(chenweihang): Remove this test after the API is used in the dygraph
 TEST(API, full_like) {
   // 1. create tensor
+  // 1. create tensor
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
   auto dense_x = std::make_shared<pten::DenseTensor>(
-      pten::TensorMeta(framework::make_ddim({3, 2}),
-                       pten::Backend::CPU,
-                       pten::DataType::FLOAT32,
-                       pten::DataLayout::NCHW),
-      pten::TensorStatus());
+      alloc,
+      pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                            framework::make_ddim({3, 2}),
+                            pten::DataLayout::NCHW));
   auto* dense_x_data = dense_x->mutable_data<float>();
   dense_x_data[0] = 0;
 
@@ -66,12 +69,13 @@ TEST(API, full_like) {
 
 TEST(API, zeros_like) {
   // 1. create tensor
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
   auto dense_x = std::make_shared<pten::DenseTensor>(
-      pten::TensorMeta(framework::make_ddim({3, 2}),
-                       pten::Backend::CPU,
-                       pten::DataType::FLOAT32,
-                       pten::DataLayout::NCHW),
-      pten::TensorStatus());
+      alloc,
+      pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                            framework::make_ddim({3, 2}),
+                            pten::DataLayout::NCHW));
   auto* dense_x_data = dense_x->mutable_data<float>();
   dense_x_data[0] = 1;
 
@@ -98,13 +102,14 @@ TEST(API, zeros_like) {
 
 TEST(API, ones_like) {
   // 1. create tensor
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
   auto dense_x = std::make_shared<pten::DenseTensor>(
-      pten::TensorMeta(framework::make_ddim({3, 2}),
-                       pten::Backend::CPU,
-                       pten::DataType::FLOAT32,
-                       pten::DataLayout::NCHW),
-      pten::TensorStatus());
-  auto* dense_x_data = dense_x->mutable_data<float>();
+      alloc,
+      pten::DenseTensorMeta(pten::DataType::INT32,
+                            framework::make_ddim({3, 2}),
+                            pten::DataLayout::NCHW));
+  auto* dense_x_data = dense_x->mutable_data<int32_t>();
   dense_x_data[0] = 0;
 
   paddle::experimental::Tensor x(dense_x);
@@ -122,7 +127,7 @@ TEST(API, ones_like) {
   ASSERT_EQ(out.initialized(), true);
 
   auto dense_out = std::dynamic_pointer_cast<pten::DenseTensor>(out.impl());
-  auto* actual_result = dense_out->data<float>();
+  auto* actual_result = dense_out->data<int32_t>();
   for (auto i = 0; i < 6; i++) {
     ASSERT_EQ(actual_result[i], 1);
   }
diff --git a/paddle/pten/tests/test_flatten_api.cc b/paddle/pten/tests/test_flatten_api.cc
index 7f68cd75bc8d2..48d2205c2ff48 100644
--- a/paddle/pten/tests/test_flatten_api.cc
+++ b/paddle/pten/tests/test_flatten_api.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/hapi/lib/utils/allocator.h"
 
 PT_DECLARE_MODULE(ManipulationCPU);
 
@@ -32,12 +33,13 @@ using DDim = paddle::framework::DDim;
 // TODO(chenweihang): Remove this test after the API is used in the dygraph
 TEST(API, flatten) {
   // 1. create tensor
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
   auto dense_x = std::make_shared<pten::DenseTensor>(
-      pten::TensorMeta(framework::make_ddim({3, 2, 2, 3}),
-                       pten::Backend::CPU,
-                       pten::DataType::FLOAT32,
-                       pten::DataLayout::NCHW),
-      pten::TensorStatus());
+      alloc,
+      pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                            framework::make_ddim({3, 2, 2, 3}),
+                            pten::DataLayout::NCHW));
   auto* dense_x_data = dense_x->mutable_data<float>();
 
   for (int i = 0; i < dense_x->numel(); i++) {
diff --git a/paddle/pten/tests/test_mean_api.cc b/paddle/pten/tests/test_mean_api.cc
index 9c0472916e01d..ee8388671b7eb 100644
--- a/paddle/pten/tests/test_mean_api.cc
+++ b/paddle/pten/tests/test_mean_api.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/hapi/lib/utils/allocator.h"
 
 PT_DECLARE_MODULE(MathCPU);
 
@@ -32,12 +33,13 @@ using DDim = paddle::framework::DDim;
 // TODO(chenweihang): Remove this test after the API is used in the dygraph
 TEST(API, mean) {
   // 1. create tensor
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
   auto dense_x = std::make_shared<pten::DenseTensor>(
-      pten::TensorMeta(framework::make_ddim({3, 4}),
-                       pten::Backend::CPU,
-                       pten::DataType::FLOAT32,
-                       pten::DataLayout::NCHW),
-      pten::TensorStatus());
+      alloc,
+      pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                            framework::make_ddim({3, 4}),
+                            pten::DataLayout::NCHW));
   auto* dense_x_data = dense_x->mutable_data<float>();
 
   float sum = 0.0;

From 558a848d13236a5de4cc40f69df6be39a78d9320 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Tue, 26 Oct 2021 09:33:58 +0000
Subject: [PATCH 113/125] polish some details

---
 paddle/fluid/operators/CMakeLists.txt                         | 1 -
 paddle/pten/core/tensor_meta.h                                | 2 +-
 paddle/pten/kernels/cuda/math.cu                              | 2 --
 paddle/pten/kernels/functions/eigen/sign.h                    | 4 ----
 .../paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py | 2 +-
 5 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index bafc650c433db..3a856dd82eb61 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -80,7 +80,6 @@ if(WITH_UNITY_BUILD)
 endif()
 
 set(OP_HEADER_DEPS ${OP_HEADER_DEPS} pten)
-#set(OP_HEADER_DEPS ${OP_HEADER_DEPS} pten_utils)
 register_operators(EXCLUDES
 py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op 
         recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
diff --git a/paddle/pten/core/tensor_meta.h b/paddle/pten/core/tensor_meta.h
index b4452a644f152..b94552fd8016c 100644
--- a/paddle/pten/core/tensor_meta.h
+++ b/paddle/pten/core/tensor_meta.h
@@ -78,7 +78,7 @@ inline bool DenseTensorMeta::valid() const noexcept {
   bool valid{true};
   valid = valid && (type != DataType::UNDEFINED);
   valid = valid && (layout != DataLayout::UNDEFINED);
-  valid = valid && (is_scalar || product(dims));
+  valid = valid && (is_scalar || product(dims) >= 0);
   return valid;
 }
 
diff --git a/paddle/pten/kernels/cuda/math.cu b/paddle/pten/kernels/cuda/math.cu
index 1f2a34ea505c2..0ead1f8048bfd 100644
--- a/paddle/pten/kernels/cuda/math.cu
+++ b/paddle/pten/kernels/cuda/math.cu
@@ -60,8 +60,6 @@ void Sign(const CUDAContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
 
 template <typename T>
 void Mean(const CUDAContext& dev_ctx, const DenseTensor& x, DenseTensor* out) {
-  VLOG(1) << "chenweihang: call new pt mean kernel.";
-  // eigen::Mean<CUDAContext, T>(dev_ctx, x, out);
   auto size_prob = x.numel();
   const T* x_data = x.data<T>();
   T* out_data = out->mutable_data<T>();
diff --git a/paddle/pten/kernels/functions/eigen/sign.h b/paddle/pten/kernels/functions/eigen/sign.h
index 13c8d3f3cfe8c..5cd620815bf26 100644
--- a/paddle/pten/kernels/functions/eigen/sign.h
+++ b/paddle/pten/kernels/functions/eigen/sign.h
@@ -25,11 +25,7 @@ namespace eigen {
 
 template <typename DevCtx, typename T>
 void Sign(const DevCtx& dev_ctx, const DenseTensor& x, DenseTensor* out) {
-  VLOG(1) << "enter module::Sign";
-  // out->mutable_data<T>(x.place());
   out->mutable_data<T>();
-
-  VLOG(1) << "module::Sign, calc by eigen.";
   // TODO(chenweihang): if we design new tensor, we should support
   // the low-level calc functor use new tensor as input,
   // which may be a big project!
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
index e3a2566133742..2548ed35bb719 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
@@ -83,7 +83,7 @@ def _decorate_compare_fused_all_reduce(self, model, use_device):
             use_device,
             init_feed_dict=init_data,
             optimizer=self.optimizer,
-            fuse_all_optimizer_ops=True)
+            fuse_all_optimizer_ops=False)
 
     def test_simple_fc_with_fuse_all_reduce(self):
         self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CUDA)

From 8f100da939ce3613a4e2fd944a8e789caf2e83d9 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Tue, 26 Oct 2021 14:21:42 +0000
Subject: [PATCH 114/125] polish kernel signature details

---
 paddle/fluid/framework/operator.cc            | 10 ++++------
 paddle/fluid/framework/pten_utils.cc          |  5 -----
 paddle/fluid/framework/pten_utils.h           | 20 ++++++++++++++++++-
 paddle/fluid/platform/flags.cc                |  6 ++----
 .../unittests/test_fuse_all_reduce_pass.py    |  2 +-
 5 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index b8f311ff0d173..5f91020c69981 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1763,14 +1763,12 @@ OpKernelType OperatorWithKernel::GetKernelTypeForVar(
 
 KernelSignature OperatorWithKernel::GetExpectedPtenKernelArgs(
     const ExecutionContext& ctx) const {
-  if (KernelSignatureMap::Instance().Has(Type())) {
-    return *(KernelSignatureMap::Instance().GetNullable(Type()));
-  } else {
+  if (!KernelSignatureMap::Instance().Has(Type())) {
     KernelArgsNameMakerByOpProto maker(Info().proto_);
-    auto signature = std::move(maker.GetKernelSignature());
-    KernelSignatureMap::Instance().Insert(Type(), signature);
-    return signature;
+    KernelSignatureMap::Instance().Emplace(
+        Type(), std::move(maker.GetKernelSignature()));
   }
+  return KernelSignatureMap::Instance().Get(Type());
 }
 
 pten::KernelContext OperatorWithKernel::BuildPtenKernelContext(
diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/pten_utils.cc
index 96408afc100e9..13cf383af5546 100644
--- a/paddle/fluid/framework/pten_utils.cc
+++ b/paddle/fluid/framework/pten_utils.cc
@@ -59,11 +59,6 @@ pten::KernelKey TransOpKernelTypeToPtenKernelKey(
   return pten::KernelKey(backend, layout, dtype);
 }
 
-KernelSignatureMap& KernelSignatureMap::Instance() {
-  static KernelSignatureMap g_kernel_signature_map;
-  return g_kernel_signature_map;
-}
-
 const paddle::SmallVector<std::string>&
 KernelArgsNameMakerByOpProto::GetInputArgsNames() {
   for (int i = 0; i < op_proto_->inputs_size(); ++i) {
diff --git a/paddle/fluid/framework/pten_utils.h b/paddle/fluid/framework/pten_utils.h
index 8c1c25b3b67cd..d1a21f93410d6 100644
--- a/paddle/fluid/framework/pten_utils.h
+++ b/paddle/fluid/framework/pten_utils.h
@@ -44,7 +44,10 @@ pten::KernelKey TransOpKernelTypeToPtenKernelKey(
 // TODO(chenweihang): we can generate this map by proto info in compile time
 class KernelSignatureMap {
  public:
-  static KernelSignatureMap& Instance();
+  static KernelSignatureMap& Instance() {
+    static KernelSignatureMap g_kernel_signature_map;
+    return g_kernel_signature_map;
+  }
 
   bool Has(const std::string& op_type) const {
     return map_.find(op_type) != map_.end();
@@ -56,6 +59,12 @@ class KernelSignatureMap {
     }
   }
 
+  void Emplace(const std::string& op_type, KernelSignature&& signature) {
+    if (!Has(op_type)) {
+      map_.emplace(op_type, signature);
+    }
+  }
+
   const KernelSignature* GetNullable(const std::string& op_type) const {
     auto it = map_.find(op_type);
     if (it == map_.end()) {
@@ -65,6 +74,15 @@ class KernelSignatureMap {
     }
   }
 
+  const KernelSignature& Get(const std::string& op_type) const {
+    auto it = map_.find(op_type);
+    PADDLE_ENFORCE_NE(
+        it, map_.end(),
+        platform::errors::NotFound(
+            "Operator `%s`'s kernel signature is not registered.", op_type));
+    return it->second;
+  }
+
  private:
   KernelSignatureMap() = default;
   paddle::flat_hash_map<std::string, KernelSignature> map_;
diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
index 2311e2f1ce997..f6c8ac2dc420f 100644
--- a/paddle/fluid/platform/flags.cc
+++ b/paddle/fluid/platform/flags.cc
@@ -684,16 +684,14 @@ PADDLE_DEFINE_EXPORTED_bool(
 /**
  * Pt kernel related FLAG
  * Name: FLAGS_run_pten_kernel
- * Since Version: 2.2.0
+ * Since Version: 2.3.0
  * Value Range: bool, default=false
  * Example: FLAGS_run_pten_kernel=true would use the pt kernel to compute in the
  * Op.
  * Note:
  */
-// TODO(chentianyu03): change default value to false before merge into develop
-// branch
 PADDLE_DEFINE_EXPORTED_bool(run_pten_kernel, true,
-                            "It controls whether to use pt kernel");
+                            "It controls whether to use pten kernel");
 
 /**
  * Distributed related FLAG
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
index 2548ed35bb719..e3a2566133742 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
@@ -83,7 +83,7 @@ def _decorate_compare_fused_all_reduce(self, model, use_device):
             use_device,
             init_feed_dict=init_data,
             optimizer=self.optimizer,
-            fuse_all_optimizer_ops=False)
+            fuse_all_optimizer_ops=True)
 
     def test_simple_fc_with_fuse_all_reduce(self):
         self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CUDA)

From be9df70dae2a31a383a33e809bef187cabfdc968 Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Wed, 27 Oct 2021 10:07:50 +0800
Subject: [PATCH 115/125] fix a bug about offsets of the tensor, test=develop
 (#31)

Co-authored-by: shixiaowei02 <39303645+Shixiaowei02@users.noreply.github.com>
---
 paddle/pten/hapi/lib/utils/storage.h       | 8 ++++++--
 paddle/pten/hapi/lib/utils/tensor_utils.cc | 6 ++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/paddle/pten/hapi/lib/utils/storage.h b/paddle/pten/hapi/lib/utils/storage.h
index 996e98416336b..0a88c893f4dcf 100644
--- a/paddle/pten/hapi/lib/utils/storage.h
+++ b/paddle/pten/hapi/lib/utils/storage.h
@@ -47,10 +47,14 @@ class ExternalStorage : public pten::Storage {
 class SharedStorage : public pten::Storage {
  public:
   explicit SharedStorage(
-      const std::shared_ptr<paddle::memory::Allocation>& allocation)
+      const std::shared_ptr<paddle::memory::Allocation>& allocation,
+      size_t offset)
       : allocation_(allocation) {
     CHECK(allocation);
-    data_ = pten::Allocation(allocation->ptr(), allocation->place());
+    data_ = pten::Allocation(
+        reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(allocation->ptr()) +
+                                offset),
+        allocation->place());
     size_ = allocation->size();
   }
 
diff --git a/paddle/pten/hapi/lib/utils/tensor_utils.cc b/paddle/pten/hapi/lib/utils/tensor_utils.cc
index 2fb39852702c2..a55c50db761a6 100644
--- a/paddle/pten/hapi/lib/utils/tensor_utils.cc
+++ b/paddle/pten/hapi/lib/utils/tensor_utils.cc
@@ -31,7 +31,8 @@ std::unique_ptr<pten::DenseTensor> MakePtenDenseTensor(
   pten::DenseTensorMeta meta{pten::TransToPtenDataType(src.type()),
                              src.dims(),
                              pten::TransToPtenDataLayout(src.layout())};
-  auto shared_storage = pten::make_intrusive<SharedStorage>(src.Holder());
+  auto shared_storage =
+      pten::make_intrusive<SharedStorage>(src.Holder(), src.offset());
   return std::make_unique<pten::DenseTensor>(std::move(shared_storage),
                                              std::move(meta));
 }
@@ -42,7 +43,8 @@ std::unique_ptr<pten::DenseTensor> MakePtenDenseTensor(
                              src.dims(),
                              pten::TransToPtenDataLayout(src.layout())};
   SetLoD(&meta.lod, src.lod());
-  auto shared_storage = pten::make_intrusive<SharedStorage>(src.Holder());
+  auto shared_storage =
+      pten::make_intrusive<SharedStorage>(src.Holder(), src.offset());
   return std::make_unique<pten::DenseTensor>(std::move(shared_storage),
                                              std::move(meta));
 }

From a83e9c76347130f0099723e5033abc786899fdbe Mon Sep 17 00:00:00 2001
From: Chen Weihang <chenweihang@baidu.com>
Date: Wed, 27 Oct 2021 10:01:42 +0000
Subject: [PATCH 116/125] polish some details

---
 paddle/fluid/framework/operator.cc           | 12 +++----
 paddle/fluid/framework/operator.h            |  1 +
 paddle/fluid/framework/pten_utils.cc         | 15 ++++-----
 paddle/fluid/framework/pten_utils.h          | 33 +++++++++++---------
 paddle/fluid/framework/type_defs.h           |  3 --
 paddle/fluid/imperative/prepared_operator.cc | 11 +++----
 paddle/fluid/imperative/prepared_operator.h  |  1 +
 paddle/fluid/operators/fill_any_like_op.cc   |  7 ++---
 paddle/fluid/operators/mean_op.h             |  1 -
 paddle/fluid/operators/scale_op.cc           | 15 +++------
 paddle/pten/core/kernel_context.h            | 26 ++++++++-------
 paddle/pten/kernels/functions/eigen/common.h |  2 +-
 12 files changed, 58 insertions(+), 69 deletions(-)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 5f91020c69981..33763672e7690 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -23,7 +23,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/details/nan_inf_utils.h"
 #include "paddle/fluid/framework/op_call_stack.h"
-#include "paddle/fluid/framework/pten_utils.h"
 #include "paddle/fluid/framework/shape_inference.h"
 #include "paddle/fluid/framework/transfer_scope_cache.h"
 #include "paddle/fluid/framework/unused_var_check.h"
@@ -1278,7 +1277,7 @@ void OperatorWithKernel::ChoosePtenKernel(const ExecutionContext& ctx) const {
   kernel_type_.reset(
       new OpKernelType(std::move(InnerGetExpectedKernelType(ctx))));
 
-  auto pt_kernel_name = pten::KernelName(pt_kernel_signature_->first);
+  auto pt_kernel_name = pten::KernelName(pt_kernel_signature_->name);
   auto pt_kernel_key = TransOpKernelTypeToPtenKernelKey(*kernel_type_.get());
   pt_kernel_.reset(
       new pten::Kernel(pten::KernelFactory::Instance().SelectKernel(
@@ -1764,6 +1763,7 @@ OpKernelType OperatorWithKernel::GetKernelTypeForVar(
 KernelSignature OperatorWithKernel::GetExpectedPtenKernelArgs(
     const ExecutionContext& ctx) const {
   if (!KernelSignatureMap::Instance().Has(Type())) {
+    // TODO(chenweihang): we can generate this map by proto info in compile time
     KernelArgsNameMakerByOpProto maker(Info().proto_);
     KernelSignatureMap::Instance().Emplace(
         Type(), std::move(maker.GetKernelSignature()));
@@ -1782,9 +1782,9 @@ pten::KernelContext OperatorWithKernel::BuildPtenKernelContext(
   // 5. kernel input is not DenseTensor
   pten::KernelContext op_kernel_ctx(dev_ctx);
 
-  auto& input_names = std::get<0>(pt_kernel_signature_->second);
-  auto& attr_names = std::get<1>(pt_kernel_signature_->second);
-  auto& output_names = std::get<2>(pt_kernel_signature_->second);
+  auto& input_names = std::get<0>(pt_kernel_signature_->args);
+  auto& attr_names = std::get<1>(pt_kernel_signature_->args);
+  auto& output_names = std::get<2>(pt_kernel_signature_->args);
 
   auto input_defs = pt_kernel_->args_def().input_defs();
   auto attr_defs = pt_kernel_->args_def().attribute_defs();
@@ -1843,7 +1843,7 @@ pten::KernelContext OperatorWithKernel::BuildPtenKernelContext(
       // attribtue type by attr_defs
       if (std::type_index(attr.type()) == std::type_index(typeid(float))) {
         op_kernel_ctx.EmplaceBackAttr(
-            pten::Scalar(BOOST_GET_CONST(float, attr)));
+            std::move(pten::Scalar(BOOST_GET_CONST(float, attr))));
       } else {
         PADDLE_THROW(platform::errors::Unimplemented(
             "unsupported cast op attribute `%s` to Scalar when construct "
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 104c5a231375f..170dd910b2b47 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -30,6 +30,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_kernel_type.h"
+#include "paddle/fluid/framework/pten_utils.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/tensor.h"
diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/pten_utils.cc
index 13cf383af5546..8bd9b87a47847 100644
--- a/paddle/fluid/framework/pten_utils.cc
+++ b/paddle/fluid/framework/pten_utils.cc
@@ -119,20 +119,17 @@ KernelArgsNameMakerByOpProto::GetAttrsArgsNames() {
 }
 
 KernelSignature KernelArgsNameMakerByOpProto::GetKernelSignature() {
-  return std::make_pair(
-      op_proto_->type(),
-      std::make_tuple(GetInputArgsNames(), GetAttrsArgsNames(),
-                      GetOutputArgsNames()));
+  return KernelSignature(op_proto_->type(), GetInputArgsNames(),
+                         GetAttrsArgsNames(), GetOutputArgsNames());
 }
 
 std::string KernelSignatureToString(const KernelSignature& signature) {
   std::stringstream os;
-  os << "Kernel Signature - name: " << signature.first << "; inputs: "
-     << string::join_strings(std::get<0>(signature.second), ", ")
+  os << "Kernel Signature - name: " << signature.name
+     << "; inputs: " << string::join_strings(std::get<0>(signature.args), ", ")
      << "; attributes: "
-     << string::join_strings(std::get<1>(signature.second), ", ")
-     << "; outputs: "
-     << string::join_strings(std::get<2>(signature.second), ", ");
+     << string::join_strings(std::get<1>(signature.args), ", ") << "; outputs: "
+     << string::join_strings(std::get<2>(signature.args), ", ");
   return os.str();
 }
 
diff --git a/paddle/fluid/framework/pten_utils.h b/paddle/fluid/framework/pten_utils.h
index d1a21f93410d6..30000ab62d9f7 100644
--- a/paddle/fluid/framework/pten_utils.h
+++ b/paddle/fluid/framework/pten_utils.h
@@ -41,6 +41,24 @@ pten::KernelKey TransOpKernelTypeToPtenKernelKey(
 
 /* Kernel Args parse */
 
+struct KernelSignature {
+  std::string name;
+  KernelArgsTuple args;
+
+  KernelSignature() = default;
+  KernelSignature(std::string&& kernel_name,
+                  paddle::SmallVector<std::string>&& inputs,
+                  paddle::SmallVector<std::string>&& attrs,
+                  paddle::SmallVector<std::string>&& outputs)
+      : name(std::move(kernel_name)),
+        args(std::make_tuple(inputs, attrs, outputs)) {}
+  KernelSignature(const std::string& kernel_name,
+                  const paddle::SmallVector<std::string>& inputs,
+                  const paddle::SmallVector<std::string>& attrs,
+                  const paddle::SmallVector<std::string>& outputs)
+      : name(kernel_name), args(std::make_tuple(inputs, attrs, outputs)) {}
+};
+
 // TODO(chenweihang): we can generate this map by proto info in compile time
 class KernelSignatureMap {
  public:
@@ -53,27 +71,12 @@ class KernelSignatureMap {
     return map_.find(op_type) != map_.end();
   }
 
-  void Insert(const std::string& op_type, const KernelSignature& signature) {
-    if (!Has(op_type)) {
-      map_.insert({op_type, signature});
-    }
-  }
-
   void Emplace(const std::string& op_type, KernelSignature&& signature) {
     if (!Has(op_type)) {
       map_.emplace(op_type, signature);
     }
   }
 
-  const KernelSignature* GetNullable(const std::string& op_type) const {
-    auto it = map_.find(op_type);
-    if (it == map_.end()) {
-      return nullptr;
-    } else {
-      return &it->second;
-    }
-  }
-
   const KernelSignature& Get(const std::string& op_type) const {
     auto it = map_.find(op_type);
     PADDLE_ENFORCE_NE(
diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h
index d0d1b915f2317..7f7785b374ead 100644
--- a/paddle/fluid/framework/type_defs.h
+++ b/paddle/fluid/framework/type_defs.h
@@ -88,9 +88,6 @@ using InferInplaceOpFN = std::function<InplacePair(bool /*use_cuda*/)>;
 using KernelArgsTuple = std::tuple<paddle::SmallVector<std::string>,
                                    paddle::SmallVector<std::string>,
                                    paddle::SmallVector<std::string>>;
-// TODD(yuanrisheng): impl implicit overload signature, use KernelArgsTuple
-// directly
-using KernelSignature = std::pair<std::string, KernelArgsTuple>;
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 329c5ea52bb2f..b2d55babc7e1c 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -16,7 +16,6 @@
 
 #include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/details/nan_inf_utils.h"
-#include "paddle/fluid/framework/pten_utils.h"
 #include "paddle/fluid/imperative/infer_shape_context.h"
 #include "paddle/pten/common/scalar.h"
 #include "paddle/utils/small_vector.h"
@@ -160,7 +159,7 @@ PreparedOp PrepareImpl(const NameVarMap<VarType>& ins,
 
     VLOG(1) << framework::KernelSignatureToString(pt_kernel_signature);
 
-    auto pt_kernel_name = pten::KernelName(pt_kernel_signature.first);
+    auto pt_kernel_name = pten::KernelName(pt_kernel_signature.name);
     auto pt_kernel_key = TransOpKernelTypeToPtenKernelKey(expected_kernel_key);
     auto pt_kernel = pten::KernelFactory::Instance().SelectKernel(
         pt_kernel_name, pt_kernel_key);
@@ -261,9 +260,9 @@ static pten::KernelContext BuildDygraphPtenKernelContext(
   // 5. kernel input is not DenseTensor
   pten::KernelContext op_kernel_ctx(dev_ctx);
 
-  auto& input_names = std::get<0>(pt_kernel_signature.second);
-  auto& attr_names = std::get<1>(pt_kernel_signature.second);
-  auto& output_names = std::get<2>(pt_kernel_signature.second);
+  auto& input_names = std::get<0>(pt_kernel_signature.args);
+  auto& attr_names = std::get<1>(pt_kernel_signature.args);
+  auto& output_names = std::get<2>(pt_kernel_signature.args);
 
   auto& input_defs = pt_kernel.args_def().input_defs();
   auto& output_defs = pt_kernel.args_def().output_defs();
@@ -321,7 +320,7 @@ static pten::KernelContext BuildDygraphPtenKernelContext(
       // attribtue type by attr_defs
       if (std::type_index(attr.type()) == std::type_index(typeid(float))) {
         op_kernel_ctx.EmplaceBackAttr(
-            pten::Scalar(BOOST_GET_CONST(float, attr)));
+            std::move(pten::Scalar(BOOST_GET_CONST(float, attr))));
       } else {
         PADDLE_THROW(platform::errors::Unimplemented(
             "unsupported cast op attribute `%s` to Scalar when construct "
diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h
index a2ff0aeec1a90..fab67e87c7948 100644
--- a/paddle/fluid/imperative/prepared_operator.h
+++ b/paddle/fluid/imperative/prepared_operator.h
@@ -21,6 +21,7 @@
 #include "paddle/fluid/framework/data_transform.h"
 #include "paddle/fluid/framework/op_kernel_type.h"
 #include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/pten_utils.h"
 #include "paddle/fluid/framework/type_defs.h"
 #include "paddle/fluid/imperative/execution_context.h"
 #include "paddle/fluid/imperative/layer.h"
diff --git a/paddle/fluid/operators/fill_any_like_op.cc b/paddle/fluid/operators/fill_any_like_op.cc
index 494341694b72e..3174fada77802 100644
--- a/paddle/fluid/operators/fill_any_like_op.cc
+++ b/paddle/fluid/operators/fill_any_like_op.cc
@@ -50,11 +50,8 @@ class FillAnyLikeOp : public framework::OperatorWithKernel {
 
   framework::KernelSignature GetExpectedPtenKernelArgs(
       const framework::ExecutionContext &ctx) const override {
-    return std::make_pair(
-        "fill_any_like",
-        std::make_tuple(paddle::SmallVector<std::string>({"X"}),
-                        paddle::SmallVector<std::string>({"value"}),
-                        paddle::SmallVector<std::string>({"Out"})));
+    return framework::KernelSignature("fill_any_like", {"X"}, {"value"},
+                                      {"Out"});
   }
 };
 
diff --git a/paddle/fluid/operators/mean_op.h b/paddle/fluid/operators/mean_op.h
index 9d9954a8412a3..f909b96c9193c 100644
--- a/paddle/fluid/operators/mean_op.h
+++ b/paddle/fluid/operators/mean_op.h
@@ -66,7 +66,6 @@ class MeanKernel : public framework::OpKernel<T> {
     auto pt_out = paddle::experimental::MakePtenDenseTensor(*out);
 
     // call new kernel
-    VLOG(1) << "chenweihang: call original mean kernel compute.";
     pten::Mean<T>(dev_ctx, *pt_x.get(), pt_out.get());
   }
 };
diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc
index e2ae1ef8eca31..038fcfcfee490 100644
--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -74,18 +74,11 @@ class ScaleOp : public framework::OperatorWithKernel {
   framework::KernelSignature GetExpectedPtenKernelArgs(
       const framework::ExecutionContext &ctx) const override {
     if (ctx.HasInput("ScaleTensor")) {
-      return std::make_pair(
-          "scale.host",
-          std::make_tuple(
-              paddle::SmallVector<std::string>({"X", "ScaleTensor"}),
-              paddle::SmallVector<std::string>({"bias", "bias_after_scale"}),
-              paddle::SmallVector<std::string>({"Out"})));
+      return framework::KernelSignature("scale.host", {"X", "ScaleTensor"},
+                                        {"bias", "bias_after_scale"}, {"Out"});
     } else {
-      return std::make_pair(
-          "scale", std::make_tuple(paddle::SmallVector<std::string>({"X"}),
-                                   paddle::SmallVector<std::string>(
-                                       {"scale", "bias", "bias_after_scale"}),
-                                   paddle::SmallVector<std::string>({"Out"})));
+      return framework::KernelSignature(
+          "scale", {"X"}, {"scale", "bias", "bias_after_scale"}, {"Out"});
     }
   }
 };
diff --git a/paddle/pten/core/kernel_context.h b/paddle/pten/core/kernel_context.h
index 78c567986bd62..b6459d9b70695 100644
--- a/paddle/pten/core/kernel_context.h
+++ b/paddle/pten/core/kernel_context.h
@@ -52,14 +52,14 @@ class KernelContext {
   }
 
   void EmplaceBackInput(std::shared_ptr<TensorBase> input) {
-    inputs_.emplace_back(input);
+    inputs_.emplace_back(std::move(input));
     // Record the start and end index of the input
     int index = inputs_.size();
     input_range_.emplace_back(std::pair<int, int>(index, index + 1));
   }
 
   void EmplaceBackInputs(
-      const paddle::SmallVector<std::shared_ptr<TensorBase>>& inputs) {
+      paddle::SmallVector<std::shared_ptr<TensorBase>> inputs) {
     for (auto in : inputs) {
       inputs_.emplace_back(in);
     }
@@ -70,14 +70,14 @@ class KernelContext {
   }
 
   void EmplaceBackOutput(std::shared_ptr<TensorBase> output) {
-    outputs_.emplace_back(output);
+    outputs_.emplace_back(std::move(output));
     // Record the start and end index of the input
     int index = outputs_.size();
     output_range_.emplace_back(std::pair<int, int>(index, index + 1));
   }
 
   void EmplaceBackOutputs(
-      const paddle::SmallVector<std::shared_ptr<TensorBase>>& outputs) {
+      paddle::SmallVector<std::shared_ptr<TensorBase>> outputs) {
     for (auto out : outputs) {
       outputs_.emplace_back(out);
     }
@@ -87,7 +87,9 @@ class KernelContext {
         std::pair<int, int>(index, index + outputs.size()));
   }
 
-  void EmplaceBackAttr(paddle::any attr) { attrs_.emplace_back(attr); }
+  void EmplaceBackAttr(paddle::any attr) {
+    attrs_.emplace_back(std::move(attr));
+  }
 
   template <typename TensorType>
   const TensorType& InputAt(size_t idx) const {
@@ -118,18 +120,18 @@ class KernelContext {
 
   // TODO(chenweihang): Tensor -> Tensor*, Tensor should by managed `scope`
   // Note: can't use API Tensor here, the inference don't use this API Tensor
-  paddle::SmallVector<std::shared_ptr<TensorBase>> inputs_{};
-  paddle::SmallVector<std::shared_ptr<TensorBase>> outputs_{};
-  paddle::SmallVector<paddle::any> attrs_{};
+  paddle::SmallVector<std::shared_ptr<TensorBase>> inputs_;
+  paddle::SmallVector<std::shared_ptr<TensorBase>> outputs_;
+  paddle::SmallVector<paddle::any> attrs_;
 
   // Only contains input like list[Tensor] need `range`
-  paddle::SmallVector<std::pair<int, int>> input_range_{{}};
-  paddle::SmallVector<std::pair<int, int>> output_range_{{}};
+  paddle::SmallVector<std::pair<int, int>> input_range_;
+  paddle::SmallVector<std::pair<int, int>> output_range_;
 
   // Only static graph need `name`
   // TODO(chenweihang): replaced by paddle::string_view
-  paddle::SmallVector<std::string> input_names_{{}};
-  paddle::SmallVector<std::string> output_names_{{}};
+  paddle::SmallVector<std::string> input_names_;
+  paddle::SmallVector<std::string> output_names_;
 };
 
 }  // namespace pten
diff --git a/paddle/pten/kernels/functions/eigen/common.h b/paddle/pten/kernels/functions/eigen/common.h
index f3a6f5fb51ff2..5ac083f710213 100644
--- a/paddle/pten/kernels/functions/eigen/common.h
+++ b/paddle/pten/kernels/functions/eigen/common.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.

From 9584c40720ed32e5ff8319b9137f0ad46c4761e0 Mon Sep 17 00:00:00 2001
From: zyfncg <zhangyunfei07@baidu.com>
Date: Tue, 2 Nov 2021 03:21:14 +0000
Subject: [PATCH 117/125] add fill_constant kernel in pten

---
 paddle/fluid/framework/operator.cc           |  9 +++++
 paddle/fluid/imperative/prepared_operator.cc |  8 +++++
 paddle/fluid/operators/fill_constant_op.cc   | 13 +++++++
 paddle/pten/api/CMakeLists.txt               |  2 +-
 paddle/pten/api/include/infershape.h         |  1 +
 paddle/pten/common/scalar.h                  | 12 +++++++
 paddle/pten/core/kernel_utils.h              |  1 +
 paddle/pten/hapi/include/creation.h          |  6 ++++
 paddle/pten/hapi/lib/creation.cc             | 35 +++++++++++++++++++
 paddle/pten/infershape/0_nary.cc             | 27 +++++++++++++++
 paddle/pten/infershape/0_nary.h              | 34 ++++++++++++++++++
 paddle/pten/infershape/CMakeLists.txt        |  1 +
 paddle/pten/kernels/cpu/creation.cc          | 36 ++++++++++++++++++++
 paddle/pten/kernels/cpu/creation.h           |  5 +++
 paddle/pten/kernels/cuda/creation.cu         | 25 ++++++++++++++
 paddle/pten/kernels/cuda/creation.h          |  5 +++
 paddle/pten/tests/test_fill_api.cc           | 26 ++++++++++++++
 17 files changed, 245 insertions(+), 1 deletion(-)
 create mode 100644 paddle/pten/infershape/0_nary.cc
 create mode 100644 paddle/pten/infershape/0_nary.h

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 33763672e7690..335ab68ec101a 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -966,6 +966,7 @@ class RuntimeInferShapeContext : public InferShapeContext {
       var->GetMutable<LoDTensor>()->Resize(dim);
     } else if (var->IsType<SelectedRows>()) {
       var->GetMutable<SelectedRows>()->set_height(dim[0]);
+      var->GetMutable<framework::SelectedRows>()->mutable_value()->Resize(dim);
     } else {
       PADDLE_THROW(platform::errors::Unimplemented(
           "Variable type error, expect LoDTensor or SelectedRows, but received "
@@ -1844,6 +1845,14 @@ pten::KernelContext OperatorWithKernel::BuildPtenKernelContext(
       if (std::type_index(attr.type()) == std::type_index(typeid(float))) {
         op_kernel_ctx.EmplaceBackAttr(
             std::move(pten::Scalar(BOOST_GET_CONST(float, attr))));
+      } else if (std::type_index(attr.type()) ==
+                 std::type_index(typeid(double))) {
+        op_kernel_ctx.EmplaceBackAttr(
+            std::move(pten::Scalar(BOOST_GET_CONST(double, attr))));
+      } else if (std::type_index(attr.type()) ==
+                 std::type_index(typeid(std::string))) {
+        op_kernel_ctx.EmplaceBackAttr(
+            std::move(pten::Scalar(BOOST_GET_CONST(std::string, attr))));
       } else {
         PADDLE_THROW(platform::errors::Unimplemented(
             "unsupported cast op attribute `%s` to Scalar when construct "
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index b2d55babc7e1c..19c56cc33b6f1 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -321,6 +321,14 @@ static pten::KernelContext BuildDygraphPtenKernelContext(
       if (std::type_index(attr.type()) == std::type_index(typeid(float))) {
         op_kernel_ctx.EmplaceBackAttr(
             std::move(pten::Scalar(BOOST_GET_CONST(float, attr))));
+      } else if (std::type_index(attr.type()) ==
+                 std::type_index(typeid(double))) {
+        op_kernel_ctx.EmplaceBackAttr(
+            std::move(pten::Scalar(BOOST_GET_CONST(double, attr))));
+      } else if (std::type_index(attr.type()) ==
+                 std::type_index(typeid(std::string))) {
+        op_kernel_ctx.EmplaceBackAttr(
+            std::move(pten::Scalar(BOOST_GET_CONST(std::string, attr))));
       } else {
         PADDLE_THROW(platform::errors::Unimplemented(
             "unsupported cast op attribute `%s` to Scalar when construct "
diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc
index 44dcc343a4b4a..b36bbd4b79a73 100644
--- a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
@@ -68,6 +68,19 @@ class FillConstantOp : public framework::OperatorWithKernel {
         framework::proto::VarType::Type(ctx.Attr<int>("dtype")),
         ctx.GetPlace());
   }
+
+  framework::KernelSignature GetExpectedPtenKernelArgs(
+      const framework::ExecutionContext& ctx) const override {
+    if (!ctx.HasInput("ShapeTensor") &&
+        ctx.MultiInput<framework::Tensor>("ShapeTensorList").empty() &&
+        !ctx.HasInput("ValueTensor")) {
+      const auto& str_value = ctx.Attr<std::string>("str_value");
+      std::string value = str_value.empty() ? "value" : "str_value";
+      return framework::KernelSignature("fill_constant.Scalar", {}, {value},
+                                        {"Out"});
+    }
+    return framework::KernelSignature("fill_constant.Unregistered", {}, {}, {});
+  }
 };
 
 class FillConstantOpVarTypeInference : public framework::VarTypeInference {
diff --git a/paddle/pten/api/CMakeLists.txt b/paddle/pten/api/CMakeLists.txt
index 1c107519324e2..509fbce2a3997 100644
--- a/paddle/pten/api/CMakeLists.txt
+++ b/paddle/pten/api/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(PTEN_DEPS convert_utils dense_tensor kernel_factory kernel_context)
 set(PTEN_DEPS ${PTEN_DEPS} math_cpu linalg_cpu creation_cpu manipulation_cpu)
-set(PTEN_DEPS ${PTEN_DEPS} unary binary)
+set(PTEN_DEPS ${PTEN_DEPS} 0_nary unary binary)
 if(WITH_GPU OR WITH_ROCM)
   set(PTEN_DEPS ${PTEN_DEPS} math_cuda linalg_cuda creation_cuda manipulation_cuda)
 endif()
diff --git a/paddle/pten/api/include/infershape.h b/paddle/pten/api/include/infershape.h
index 8c1bd43aaa24e..763d0c72dff53 100644
--- a/paddle/pten/api/include/infershape.h
+++ b/paddle/pten/api/include/infershape.h
@@ -15,5 +15,6 @@ limitations under the License. */
 #pragma once
 
 // See Note: [ How do we organize the kernel directory ]
+#include "paddle/pten/infershape/0_nary.h"
 #include "paddle/pten/infershape/binary.h"
 #include "paddle/pten/infershape/unary.h"
diff --git a/paddle/pten/common/scalar.h b/paddle/pten/common/scalar.h
index c55b700979ac4..ef648ba70f336 100644
--- a/paddle/pten/common/scalar.h
+++ b/paddle/pten/common/scalar.h
@@ -34,6 +34,18 @@ class Scalar {
 
   Scalar(bool val) : tag(Tag::HAS_B) { data_.b = val; }  // NOLINT
 
+  Scalar(const std::string& str_value) : tag(Tag::HAS_D) {  // NOLINT
+    if (str_value == "inf") {
+      data_.d = std::numeric_limits<double>::infinity();
+    } else if (str_value == "-inf") {
+      data_.d = -std::numeric_limits<double>::infinity();
+    } else if (str_value == "nan") {
+      data_.d = std::numeric_limits<double>::quiet_NaN();
+    } else {
+      data_.d = std::stod(str_value);
+    }
+  }
+
   template <typename T>
   inline T to() const {
     switch (tag) {
diff --git a/paddle/pten/core/kernel_utils.h b/paddle/pten/core/kernel_utils.h
index c45a81206323e..c67494279471a 100644
--- a/paddle/pten/core/kernel_utils.h
+++ b/paddle/pten/core/kernel_utils.h
@@ -164,6 +164,7 @@ struct KernelImpl<Return (*)(Args...), kernel_fn> {
   PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(int64_t);
   PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(paddle::platform::float16);
   PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const Scalar&);
+  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector<int64_t>&);
 
   /* Output Helpers */
 
diff --git a/paddle/pten/hapi/include/creation.h b/paddle/pten/hapi/include/creation.h
index 6f978be995273..b6bfb870ae78c 100644
--- a/paddle/pten/hapi/include/creation.h
+++ b/paddle/pten/hapi/include/creation.h
@@ -21,6 +21,12 @@
 namespace paddle {
 namespace experimental {
 
+Tensor full(const std::vector<int64_t>& shape,
+            const Scalar& value,
+            DataType dtype = DataType::FLOAT32,
+            Backend backend = Backend::CPU,
+            DataLayout layout = DataLayout::NCHW);
+
 Tensor full_like(const Tensor& x,
                  const Scalar& value,
                  DataType dtype = DataType::UNDEFINED);
diff --git a/paddle/pten/hapi/lib/creation.cc b/paddle/pten/hapi/lib/creation.cc
index cda8d24b5e6ad..16338606a360d 100644
--- a/paddle/pten/hapi/lib/creation.cc
+++ b/paddle/pten/hapi/lib/creation.cc
@@ -26,6 +26,41 @@ limitations under the License. */
 namespace paddle {
 namespace experimental {
 
+Tensor full(const std::vector<int64_t>& shape,
+            const Scalar& value,
+            DataType dtype,
+            Backend backend,
+            DataLayout layout) {
+  // 1. Get kernel signature and kernel
+  pten::KernelKey kernel_key{backend, layout, dtype};
+  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
+      "fill_constant", kernel_key);
+
+  // 2. Get Device Context
+  auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
+  auto kernel_context = pten::KernelContext(*dev_ctx);
+
+  // 3. Auto data transform
+  kernel_context.EmplaceBackAttr(value);
+
+  // 4. InferShape
+  auto out_meta = pten::FullInferShape(shape, dtype, layout);
+
+  // 5. Prepare outputs
+  const auto allocator =
+      std::make_shared<paddle::experimental::DefaultAllocator>(
+          pten::TransToFluidPlace(kernel_key.backend()));
+  auto dense_out = std::make_shared<pten::DenseTensor>(allocator, out_meta);
+  kernel_context.EmplaceBackOutput(dense_out);
+  Tensor out;
+  out.set_impl(dense_out);
+
+  // 6. Call kernel
+  kernel(&kernel_context);
+
+  return out;
+}
+
 Tensor full_like(const Tensor& x,
                  const Scalar& value,
                  paddle::experimental::DataType dtype) {
diff --git a/paddle/pten/infershape/0_nary.cc b/paddle/pten/infershape/0_nary.cc
new file mode 100644
index 0000000000000..d86bffb438ab9
--- /dev/null
+++ b/paddle/pten/infershape/0_nary.cc
@@ -0,0 +1,27 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/pten/infershape/0_nary.h"
+
+namespace pten {
+
+DenseTensorMeta FullInferShape(const std::vector<int64_t>& shape,
+                               DataType dtype,
+                               DataLayout layout) {
+  const auto& out_dims = paddle::framework::make_ddim(shape);
+  return {dtype, out_dims, layout};
+}
+
+}  // namespace pten
diff --git a/paddle/pten/infershape/0_nary.h b/paddle/pten/infershape/0_nary.h
new file mode 100644
index 0000000000000..8900e0ed71c9f
--- /dev/null
+++ b/paddle/pten/infershape/0_nary.h
@@ -0,0 +1,34 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/pten/core/tensor_meta.h"
+
+namespace pten {
+
+// Common InferShape Functions for 0-nary operators(no input tensor), The format
+// like:
+//
+//   1. DenseTensorMeta [OpName]InferShape( ...)
+//  NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good.
+//  Because functions in this file
+//  not only can infer shape, but alse need infer lod or other useful data.
+
+DenseTensorMeta FullInferShape(const std::vector<int64_t>& shape,
+                               DataType dtype,
+                               DataLayout layout);
+
+}  // namespace pten
diff --git a/paddle/pten/infershape/CMakeLists.txt b/paddle/pten/infershape/CMakeLists.txt
index 0b3771df3574a..a474a31c519a8 100644
--- a/paddle/pten/infershape/CMakeLists.txt
+++ b/paddle/pten/infershape/CMakeLists.txt
@@ -1,2 +1,3 @@
+cc_library(0_nary SRCS 0_nary.cc DEPS convert_utils)
 cc_library(unary SRCS unary.cc DEPS convert_utils)
 cc_library(binary SRCS binary.cc DEPS convert_utils)
diff --git a/paddle/pten/kernels/cpu/creation.cc b/paddle/pten/kernels/cpu/creation.cc
index c3986c985bd0a..b1838f075dabd 100644
--- a/paddle/pten/kernels/cpu/creation.cc
+++ b/paddle/pten/kernels/cpu/creation.cc
@@ -27,6 +27,13 @@ void FillAnyLike(const CPUContext& dev_ctx,
   eigen::fill<CPUContext, T>(dev_ctx, out, val.to<float>());
 }
 
+template <typename T>
+void FillConstant(const CPUContext& dev_ctx,
+                  const Scalar& val,
+                  DenseTensor* out) {
+  eigen::fill<CPUContext, T>(dev_ctx, out, val.to<T>());
+}
+
 }  // namespace pten
 
 PT_REGISTER_MODULE(CreationCPU);
@@ -41,3 +48,32 @@ PT_REGISTER_KERNEL("fill_any_like",
                    int64_t,
                    bool,
                    paddle::platform::float16) {}
+
+PT_REGISTER_KERNEL("fill_constant.Scalar",
+                   CPU,
+                   ANY,
+                   pten::FillConstant,
+                   float,
+                   double,
+                   uint8_t,
+                   int16_t,
+                   int,
+                   int64_t,
+                   bool,
+                   paddle::platform::float16) {}
+
+// PT_REGISTER_KERNEL("fill_constant",
+//                    CPU,
+//                    NCHW,
+//                    pten::FillConstant,
+//                    float,
+//                    double,
+//                    uint8_t,
+//                    int16_t,
+//                    int,
+//                    int64_t,
+//                    bool,
+//                    paddle::platform::float16,
+//                    paddle::platform::bfloat16,
+//                    paddle::platform::complex<float>,
+//                    paddle::platform::complex<double>) {}
diff --git a/paddle/pten/kernels/cpu/creation.h b/paddle/pten/kernels/cpu/creation.h
index 9991df315556d..6d7732033aed9 100644
--- a/paddle/pten/kernels/cpu/creation.h
+++ b/paddle/pten/kernels/cpu/creation.h
@@ -29,4 +29,9 @@ void FillAnyLike(const CPUContext& dev_ctx,
                  const Scalar& val,
                  DenseTensor* out);
 
+template <typename T>
+void FillConstant(const CPUContext& dev_ctx,
+                  const Scalar& val,
+                  DenseTensor* out);
+
 }  // namespace pten
diff --git a/paddle/pten/kernels/cuda/creation.cu b/paddle/pten/kernels/cuda/creation.cu
index 40e965e5aaca1..5a6b00275bcf1 100644
--- a/paddle/pten/kernels/cuda/creation.cu
+++ b/paddle/pten/kernels/cuda/creation.cu
@@ -27,6 +27,13 @@ void FillAnyLike(const CUDAContext& dev_ctx,
   eigen::fill<CUDAContext, T>(dev_ctx, out, val.to<float>());
 }
 
+template <typename T>
+void FillConstant(const CUDAContext& dev_ctx,
+                  const Scalar& val,
+                  DenseTensor* out) {
+  eigen::fill<CUDAContext, T>(dev_ctx, out, val.to<T>());
+}
+
 }  // namespace pten
 
 PT_REGISTER_MODULE(CreationCUDA);
@@ -41,3 +48,21 @@ PT_REGISTER_KERNEL("fill_any_like",
                    int64_t,
                    bool,
                    paddle::platform::float16) {}
+
+/*
+PT_REGISTER_KERNEL("fill_constant.Scalar",
+                   CUDA,
+                   ANY,
+                   pt::FillConstant,
+                   float,
+                   double,
+                   uint8_t,
+                   int16_t,
+                   int,
+                   int64_t,
+                   bool,
+                   paddle::platform::float16,
+                   paddle::platform::bfloat16,
+                   paddle::platform::complex<float>,
+                   paddle::platform::complex<double>) {}
+*/
diff --git a/paddle/pten/kernels/cuda/creation.h b/paddle/pten/kernels/cuda/creation.h
index 84a868e917ba1..025cd6ba51b5d 100644
--- a/paddle/pten/kernels/cuda/creation.h
+++ b/paddle/pten/kernels/cuda/creation.h
@@ -32,6 +32,11 @@ void FillAnyLike(const CUDAContext& dev_ctx,
                  const Scalar& val,
                  DenseTensor* out);
 
+template <typename T>
+void FillConstant(const CUDAContext& dev_ctx,
+                  const Scalar& val,
+                  DenseTensor* out);
+
 }  // namespace pten
 
 #endif
diff --git a/paddle/pten/tests/test_fill_api.cc b/paddle/pten/tests/test_fill_api.cc
index c19d14efaa976..57b5194b3218d 100644
--- a/paddle/pten/tests/test_fill_api.cc
+++ b/paddle/pten/tests/test_fill_api.cc
@@ -132,3 +132,29 @@ TEST(API, ones_like) {
     ASSERT_EQ(actual_result[i], 1);
   }
 }
+
+TEST(API, full) {
+  // 1. create tensor
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+
+  float val = 1.0;
+
+  // 2. test API
+  auto out = paddle::experimental::full({3, 2}, val, pten::DataType::FLOAT32);
+
+  // 3. check result
+  ASSERT_EQ(out.shape().size(), 2);
+  ASSERT_EQ(out.shape()[0], 3);
+  ASSERT_EQ(out.numel(), 6);
+  ASSERT_EQ(out.is_cpu(), true);
+  ASSERT_EQ(out.type(), pten::DataType::FLOAT32);
+  ASSERT_EQ(out.layout(), pten::DataLayout::NCHW);
+  ASSERT_EQ(out.initialized(), true);
+
+  auto dense_out = std::dynamic_pointer_cast<pten::DenseTensor>(out.impl());
+  auto* actual_result = dense_out->data<float>();
+  for (auto i = 0; i < 6; i++) {
+    ASSERT_NEAR(actual_result[i], val, 1e-6f);
+  }
+}

From 7058f2236a0919698c55faf1b85860806340913e Mon Sep 17 00:00:00 2001
From: zyfncg <zhangyunfei07@baidu.com>
Date: Tue, 2 Nov 2021 07:56:01 +0000
Subject: [PATCH 118/125] fix bug of full api (c++)

---
 paddle/pten/hapi/lib/creation.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/pten/hapi/lib/creation.cc b/paddle/pten/hapi/lib/creation.cc
index 16338606a360d..8ef7899dae710 100644
--- a/paddle/pten/hapi/lib/creation.cc
+++ b/paddle/pten/hapi/lib/creation.cc
@@ -34,7 +34,7 @@ Tensor full(const std::vector<int64_t>& shape,
   // 1. Get kernel signature and kernel
   pten::KernelKey kernel_key{backend, layout, dtype};
   auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
-      "fill_constant", kernel_key);
+      "fill_constant.Scalar", kernel_key);
 
   // 2. Get Device Context
   auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());

From 2163b8fc751e3f348b472c56e28c2cd89e35df2c Mon Sep 17 00:00:00 2001
From: zyfncg <zhangyunfei07@baidu.com>
Date: Tue, 2 Nov 2021 12:39:43 +0000
Subject: [PATCH 119/125] remove the support for SelectRows in new
 fill_constant kernel

---
 paddle/fluid/framework/operator.cc         | 1 -
 paddle/fluid/operators/fill_constant_op.cc | 3 ++-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index c32962f7cbf14..5c3f547f4761d 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -966,7 +966,6 @@ class RuntimeInferShapeContext : public InferShapeContext {
       var->GetMutable<LoDTensor>()->Resize(dim);
     } else if (var->IsType<SelectedRows>()) {
       var->GetMutable<SelectedRows>()->set_height(dim[0]);
-      var->GetMutable<framework::SelectedRows>()->mutable_value()->Resize(dim);
     } else {
       PADDLE_THROW(platform::errors::Unimplemented(
           "Variable type error, expect LoDTensor or SelectedRows, but received "
diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc
index b36bbd4b79a73..7192d3edecb39 100644
--- a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
@@ -73,7 +73,8 @@ class FillConstantOp : public framework::OperatorWithKernel {
       const framework::ExecutionContext& ctx) const override {
     if (!ctx.HasInput("ShapeTensor") &&
         ctx.MultiInput<framework::Tensor>("ShapeTensorList").empty() &&
-        !ctx.HasInput("ValueTensor")) {
+        !ctx.HasInput("ValueTensor") &&
+        !ctx.OutputVar("Out")->IsType<framework::SelectedRows>()) {
       const auto& str_value = ctx.Attr<std::string>("str_value");
       std::string value = str_value.empty() ? "value" : "str_value";
       return framework::KernelSignature("fill_constant.Scalar", {}, {value},

From f0c9c0c2f322cf9ef09d4aeada5b702b37e98077 Mon Sep 17 00:00:00 2001
From: zyfncg <zhangyunfei07@baidu.com>
Date: Wed, 3 Nov 2021 13:29:35 +0000
Subject: [PATCH 120/125] fix bug of setting fill_any_like kernel key

---
 paddle/pten/hapi/lib/creation.cc     | 5 ++++-
 paddle/pten/kernels/cpu/creation.cc  | 1 -
 paddle/pten/kernels/cuda/creation.cu | 1 -
 paddle/pten/tests/test_fill_api.cc   | 8 ++++----
 4 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/paddle/pten/hapi/lib/creation.cc b/paddle/pten/hapi/lib/creation.cc
index 8ef7899dae710..3e0d51ea51f1c 100644
--- a/paddle/pten/hapi/lib/creation.cc
+++ b/paddle/pten/hapi/lib/creation.cc
@@ -68,7 +68,10 @@ Tensor full_like(const Tensor& x,
   auto kernel_key_set = ParseKernelKeyByInputArgs(x);
   auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
   auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
-      "fill_any_like", kernel_key);
+      "fill_any_like",
+      {kernel_key.backend(),
+       kernel_key.layout(),
+       dtype == DataType::UNDEFINED ? kernel_key.dtype() : dtype});
 
   // 2. Get Device Context
   auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
diff --git a/paddle/pten/kernels/cpu/creation.cc b/paddle/pten/kernels/cpu/creation.cc
index 3ca8e2c301c73..bf2bb7ba51353 100644
--- a/paddle/pten/kernels/cpu/creation.cc
+++ b/paddle/pten/kernels/cpu/creation.cc
@@ -48,7 +48,6 @@ void FillAnyLike(const CPUContext& dev_ctx,
           static_cast<CommonType>(std::numeric_limits<T>::lowest()),
           static_cast<CommonType>(std::numeric_limits<T>::max()),
           static_cast<float>(value)));
-
   eigen::fill<CPUContext, T>(dev_ctx, out, value);
 }
 
diff --git a/paddle/pten/kernels/cuda/creation.cu b/paddle/pten/kernels/cuda/creation.cu
index 23326d754f6f9..35b03a6afcce3 100644
--- a/paddle/pten/kernels/cuda/creation.cu
+++ b/paddle/pten/kernels/cuda/creation.cu
@@ -86,6 +86,5 @@ PT_REGISTER_KERNEL("fill_constant.Scalar",
                    int64_t,
                    bool,
                    paddle::platform::float16,
-                   paddle::platform::bfloat16,
                    paddle::platform::complex<float>,
                    paddle::platform::complex<double>) {}
diff --git a/paddle/pten/tests/test_fill_api.cc b/paddle/pten/tests/test_fill_api.cc
index a6268d821d109..3490174a0f401 100644
--- a/paddle/pten/tests/test_fill_api.cc
+++ b/paddle/pten/tests/test_fill_api.cc
@@ -83,21 +83,21 @@ TEST(API, zeros_like) {
   paddle::experimental::Tensor x(dense_x);
 
   // 2. test API
-  auto out = paddle::experimental::zeros_like(x, pten::DataType::FLOAT32);
+  auto out = paddle::experimental::zeros_like(x, pten::DataType::INT32);
 
   // 3. check result
   ASSERT_EQ(out.shape().size(), 2);
   ASSERT_EQ(out.shape()[0], 3);
   ASSERT_EQ(out.numel(), 6);
   ASSERT_EQ(out.is_cpu(), true);
-  ASSERT_EQ(out.type(), pten::DataType::FLOAT32);
+  ASSERT_EQ(out.type(), pten::DataType::INT32);
   ASSERT_EQ(out.layout(), pten::DataLayout::NCHW);
   ASSERT_EQ(out.initialized(), true);
 
   auto dense_out = std::dynamic_pointer_cast<pten::DenseTensor>(out.impl());
-  auto* actual_result = dense_out->data<float>();
+  auto* actual_result = dense_out->data<int32_t>();
   for (auto i = 0; i < 6; i++) {
-    ASSERT_NEAR(actual_result[i], 0, 1e-6f);
+    ASSERT_EQ(actual_result[i], 0);
   }
 }
 

From fdd0ff8121442e96001005d6eff3abf9aa55f2db Mon Sep 17 00:00:00 2001
From: zyfncg <zhangyunfei07@baidu.com>
Date: Wed, 3 Nov 2021 13:49:23 +0000
Subject: [PATCH 121/125] merge code confilct

---
 paddle/pten/api/include/creation.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/paddle/pten/api/include/creation.h b/paddle/pten/api/include/creation.h
index 755038adb1f71..b7e7bf55c6bc5 100644
--- a/paddle/pten/api/include/creation.h
+++ b/paddle/pten/api/include/creation.h
@@ -21,6 +21,12 @@
 namespace paddle {
 namespace experimental {
 
+Tensor full(const std::vector<int64_t>& shape,
+            const Scalar& value,
+            DataType dtype = DataType::FLOAT32,
+            Backend backend = Backend::CPU,
+            DataLayout layout = DataLayout::NCHW);
+
 Tensor full_like(const Tensor& x,
                  const Scalar& value,
                  DataType dtype = DataType::UNDEFINED);

From b2d74cbc06f53fba7cfac69731fecfcd5fe67d96 Mon Sep 17 00:00:00 2001
From: zyfncg <zhangyunfei07@baidu.com>
Date: Fri, 5 Nov 2021 07:44:59 +0000
Subject: [PATCH 122/125] modify fill_constant GetExpectedKernelType

---
 paddle/fluid/operators/fill_constant_op.cc | 32 +++++++++++++++++++++-
 paddle/pten/CMakeLists.txt                 |  2 +-
 2 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc
index 7192d3edecb39..4a320e1c86275 100644
--- a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
@@ -64,9 +64,39 @@ class FillConstantOp : public framework::OperatorWithKernel {
 
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return framework::OpKernelType(
+    framework::OpKernelType kt = framework::OpKernelType(
         framework::proto::VarType::Type(ctx.Attr<int>("dtype")),
         ctx.GetPlace());
+    // TODO(zyfncg) The force_cpu and place_type are conflicted, it's a issue
+    // lefted before, and we may merge them in the future.
+    // In order to invoke new fill_constant kernel, the place of OpKernelType
+    // will be setted by force_cpu and place_type here.
+    if (ctx.Attr<bool>("force_cpu")) {
+      kt.place_ = platform::CPUPlace();
+    }
+    auto place_type = ctx.Attr<int>("place_type");
+    if (place_type != -1) {
+      switch (place_type) {
+        case 0:
+          kt.place_ = platform::CPUPlace();
+          break;
+        case 1:
+          kt.place_ = platform::CUDAPlace();
+          break;
+        case 2:
+          kt.place_ = platform::CUDAPinnedPlace();
+          break;
+        case 3:
+          kt.place_ = platform::XPUPlace();
+          break;
+        default:
+          PADDLE_THROW(platform::errors::Unimplemented(
+              "Could NOT determine the place of variable, place_type = %d .",
+              place_type));
+      }
+    }
+
+    return kt;
   }
 
   framework::KernelSignature GetExpectedPtenKernelArgs(
diff --git a/paddle/pten/CMakeLists.txt b/paddle/pten/CMakeLists.txt
index 0444fa593c0ac..57698d86625d7 100644
--- a/paddle/pten/CMakeLists.txt
+++ b/paddle/pten/CMakeLists.txt
@@ -13,7 +13,7 @@ add_subdirectory(tests)
 # make an unity target for compile deps
 set(PTEN_DEPS convert_utils dense_tensor kernel_factory kernel_context)
 set(PTEN_DEPS ${PTEN_DEPS} math_cpu linalg_cpu creation_cpu manipulation_cpu)
-set(PTEN_DEPS ${PTEN_DEPS} unary binary)
+set(PTEN_DEPS ${PTEN_DEPS} 0_nary unary binary)
 if(WITH_GPU OR WITH_ROCM)
   set(PTEN_DEPS ${PTEN_DEPS} math_cuda linalg_cuda creation_cuda manipulation_cuda)
 endif()

From b657296e49436987d7e4dc6a845a4b96387d4875 Mon Sep 17 00:00:00 2001
From: zyfncg <zhangyunfei07@baidu.com>
Date: Fri, 5 Nov 2021 09:48:34 +0000
Subject: [PATCH 123/125] fix fill_constant KernelType bug

---
 paddle/fluid/operators/fill_constant_op.cc | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc
index 4a320e1c86275..91019a82cc36a 100644
--- a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
@@ -81,10 +81,8 @@ class FillConstantOp : public framework::OperatorWithKernel {
           kt.place_ = platform::CPUPlace();
           break;
         case 1:
-          kt.place_ = platform::CUDAPlace();
-          break;
         case 2:
-          kt.place_ = platform::CUDAPinnedPlace();
+          kt.place_ = platform::CUDAPlace();
           break;
         case 3:
           kt.place_ = platform::XPUPlace();

From 8bd9c104b6ab27d9ccce3fb86f46a777587c4689 Mon Sep 17 00:00:00 2001
From: zyfncg <zhangyunfei07@baidu.com>
Date: Fri, 5 Nov 2021 12:16:26 +0000
Subject: [PATCH 124/125] polish code of build pten KernelContext

---
 paddle/fluid/framework/operator.cc           | 4 ----
 paddle/fluid/imperative/prepared_operator.cc | 4 ----
 2 files changed, 8 deletions(-)

diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index f66f22138b92c..2fc2deb087e89 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1838,10 +1838,6 @@ pten::KernelContext OperatorWithKernel::BuildPtenKernelContext(
       if (std::type_index(attr.type()) == std::type_index(typeid(float))) {
         op_kernel_ctx.EmplaceBackAttr(
             std::move(pten::Scalar(BOOST_GET_CONST(float, attr))));
-      } else if (std::type_index(attr.type()) ==
-                 std::type_index(typeid(double))) {
-        op_kernel_ctx.EmplaceBackAttr(
-            std::move(pten::Scalar(BOOST_GET_CONST(double, attr))));
       } else if (std::type_index(attr.type()) ==
                  std::type_index(typeid(std::string))) {
         op_kernel_ctx.EmplaceBackAttr(
diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 19c56cc33b6f1..7c0aaed25ab14 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -321,10 +321,6 @@ static pten::KernelContext BuildDygraphPtenKernelContext(
       if (std::type_index(attr.type()) == std::type_index(typeid(float))) {
         op_kernel_ctx.EmplaceBackAttr(
             std::move(pten::Scalar(BOOST_GET_CONST(float, attr))));
-      } else if (std::type_index(attr.type()) ==
-                 std::type_index(typeid(double))) {
-        op_kernel_ctx.EmplaceBackAttr(
-            std::move(pten::Scalar(BOOST_GET_CONST(double, attr))));
       } else if (std::type_index(attr.type()) ==
                  std::type_index(typeid(std::string))) {
         op_kernel_ctx.EmplaceBackAttr(

From 5d8a3f6ad1648e51a739822c7fef6d77158b5500 Mon Sep 17 00:00:00 2001
From: zyfncg <zhangyunfei07@baidu.com>
Date: Mon, 8 Nov 2021 08:00:29 +0000
Subject: [PATCH 125/125] refactor code of fill_constant in pten

---
 paddle/fluid/operators/fill_constant_op.cc    | 4 ++--
 paddle/pten/CMakeLists.txt                    | 2 +-
 paddle/pten/api/lib/creation.cc               | 2 +-
 paddle/pten/include/infershape.h              | 2 +-
 paddle/pten/infershape/CMakeLists.txt         | 2 +-
 paddle/pten/infershape/{0_nary.cc => nary.cc} | 2 +-
 paddle/pten/infershape/{0_nary.h => nary.h}   | 0
 paddle/pten/kernels/cpu/creation.cc           | 2 +-
 paddle/pten/kernels/cuda/creation.cu          | 2 +-
 9 files changed, 9 insertions(+), 9 deletions(-)
 rename paddle/pten/infershape/{0_nary.cc => nary.cc} (95%)
 rename paddle/pten/infershape/{0_nary.h => nary.h} (100%)

diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc
index 91019a82cc36a..aea149fbedc45 100644
--- a/paddle/fluid/operators/fill_constant_op.cc
+++ b/paddle/fluid/operators/fill_constant_op.cc
@@ -105,10 +105,10 @@ class FillConstantOp : public framework::OperatorWithKernel {
         !ctx.OutputVar("Out")->IsType<framework::SelectedRows>()) {
       const auto& str_value = ctx.Attr<std::string>("str_value");
       std::string value = str_value.empty() ? "value" : "str_value";
-      return framework::KernelSignature("fill_constant.Scalar", {}, {value},
+      return framework::KernelSignature("fill_constant.scalar", {}, {value},
                                         {"Out"});
     }
-    return framework::KernelSignature("fill_constant.Unregistered", {}, {}, {});
+    return framework::KernelSignature("fill_constant.unregistered", {}, {}, {});
   }
 };
 
diff --git a/paddle/pten/CMakeLists.txt b/paddle/pten/CMakeLists.txt
index 01ba31b2a0aaa..0b3bb2557039c 100644
--- a/paddle/pten/CMakeLists.txt
+++ b/paddle/pten/CMakeLists.txt
@@ -13,7 +13,7 @@ add_subdirectory(tests)
 # make an unity target for compile deps
 set(PTEN_DEPS convert_utils dense_tensor kernel_factory kernel_context)
 set(PTEN_DEPS ${PTEN_DEPS} math_cpu linalg_cpu creation_cpu manipulation_cpu)
-set(PTEN_DEPS ${PTEN_DEPS} 0_nary unary binary)
+set(PTEN_DEPS ${PTEN_DEPS} nary unary binary)
 if(WITH_GPU OR WITH_ROCM)
   set(PTEN_DEPS ${PTEN_DEPS} math_cuda linalg_cuda creation_cuda manipulation_cuda)
 endif()
diff --git a/paddle/pten/api/lib/creation.cc b/paddle/pten/api/lib/creation.cc
index a489901b74663..047b19010a26c 100644
--- a/paddle/pten/api/lib/creation.cc
+++ b/paddle/pten/api/lib/creation.cc
@@ -34,7 +34,7 @@ Tensor full(const std::vector<int64_t>& shape,
   // 1. Get kernel signature and kernel
   pten::KernelKey kernel_key{backend, layout, dtype};
   auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
-      "fill_constant.Scalar", kernel_key);
+      "fill_constant.scalar", kernel_key);
 
   // 2. Get Device Context
   auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
diff --git a/paddle/pten/include/infershape.h b/paddle/pten/include/infershape.h
index 763d0c72dff53..d8dd2837a72d9 100644
--- a/paddle/pten/include/infershape.h
+++ b/paddle/pten/include/infershape.h
@@ -15,6 +15,6 @@ limitations under the License. */
 #pragma once
 
 // See Note: [ How do we organize the kernel directory ]
-#include "paddle/pten/infershape/0_nary.h"
 #include "paddle/pten/infershape/binary.h"
+#include "paddle/pten/infershape/nary.h"
 #include "paddle/pten/infershape/unary.h"
diff --git a/paddle/pten/infershape/CMakeLists.txt b/paddle/pten/infershape/CMakeLists.txt
index a474a31c519a8..b32ec0a51c736 100644
--- a/paddle/pten/infershape/CMakeLists.txt
+++ b/paddle/pten/infershape/CMakeLists.txt
@@ -1,3 +1,3 @@
-cc_library(0_nary SRCS 0_nary.cc DEPS convert_utils)
+cc_library(nary SRCS nary.cc DEPS convert_utils)
 cc_library(unary SRCS unary.cc DEPS convert_utils)
 cc_library(binary SRCS binary.cc DEPS convert_utils)
diff --git a/paddle/pten/infershape/0_nary.cc b/paddle/pten/infershape/nary.cc
similarity index 95%
rename from paddle/pten/infershape/0_nary.cc
rename to paddle/pten/infershape/nary.cc
index d86bffb438ab9..b8745dd9b83af 100644
--- a/paddle/pten/infershape/0_nary.cc
+++ b/paddle/pten/infershape/nary.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 // See Note [ Why still include the fluid headers? ]
-#include "paddle/pten/infershape/0_nary.h"
+#include "paddle/pten/infershape/nary.h"
 
 namespace pten {
 
diff --git a/paddle/pten/infershape/0_nary.h b/paddle/pten/infershape/nary.h
similarity index 100%
rename from paddle/pten/infershape/0_nary.h
rename to paddle/pten/infershape/nary.h
diff --git a/paddle/pten/kernels/cpu/creation.cc b/paddle/pten/kernels/cpu/creation.cc
index bf2bb7ba51353..2ab2537a84437 100644
--- a/paddle/pten/kernels/cpu/creation.cc
+++ b/paddle/pten/kernels/cpu/creation.cc
@@ -73,7 +73,7 @@ PT_REGISTER_KERNEL("fill_any_like",
                    bool,
                    paddle::platform::float16) {}
 
-PT_REGISTER_KERNEL("fill_constant.Scalar",
+PT_REGISTER_KERNEL("fill_constant.scalar",
                    CPU,
                    ANY,
                    pten::FillConstant,
diff --git a/paddle/pten/kernels/cuda/creation.cu b/paddle/pten/kernels/cuda/creation.cu
index 35b03a6afcce3..b96b5ebea9b70 100644
--- a/paddle/pten/kernels/cuda/creation.cu
+++ b/paddle/pten/kernels/cuda/creation.cu
@@ -74,7 +74,7 @@ PT_REGISTER_KERNEL("fill_any_like",
                    bool,
                    paddle::platform::float16) {}
 
-PT_REGISTER_KERNEL("fill_constant.Scalar",
+PT_REGISTER_KERNEL("fill_constant.scalar",
                    CUDA,
                    ANY,
                    pten::FillConstant,