diff --git a/CMakeLists.txt b/CMakeLists.txt index 50070c7fc05133..4f6ed9de30efe4 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -119,8 +119,6 @@ if(WIN32) endforeach(flag_var) endif() - math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3") - # windows build turn off warnings, use parallel compiling. foreach(flag_var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}") - # NOTE(zhouwei25): GPU compile have too high memory utilization when parallel compiling - if(NOT WITH_GPU) + + # NOTE(zhouwei25): GPU compilation has too high memory utilization when compiling in parallel, + # For Visual Studio generators, /MP should be added. + # For other generators such as Ninja, there is no need to add /MP. + if("${CMAKE_GENERATOR}" STREQUAL "Visual Studio" AND NOT WITH_GPU) + math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3") set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}") endif() endforeach(flag_var) @@ -312,6 +314,17 @@ else() endif() endif() +if(WITH_DISTRIBUTE) + if(LINUX) + set(WITH_GLOO ON CACHE STRING "Enable GLOO when compiling WITH_DISTRIBUTE=ON." FORCE) + endif() + if(WITH_ASCEND_CL) + # disable WITH_PSCORE for NPU before including third_party + MESSAGE(WARNING "Disable WITH_PSCORE when compiling with NPU. Force WITH_PSCORE=OFF.") + set(WITH_PSCORE OFF CACHE BOOL "Disable WITH_PSCORE when compiling with NPU" FORCE) + endif() +endif() + include(third_party) # download, build, install third_party, Contains about 20+ dependencies include(flags) # set paddle compile flags @@ -322,12 +335,6 @@ if(WITH_PROFILER) add_definitions(-DWITH_GPERFTOOLS) endif() -if(WITH_DISTRIBUTE) - if(LINUX) - set(WITH_GLOO ON CACHE STRING "Enable GLOO when compiling WITH_DISTRIBUTE=ON." 
FORCE) - endif() -endif() - include(ccache) # set ccache for compilation include(util) # set unittest and link libs include(version) # set PADDLE_VERSION diff --git a/cmake/ccache.cmake b/cmake/ccache.cmake index 25798758473af5..5520720f7a6c71 100644 --- a/cmake/ccache.cmake +++ b/cmake/ccache.cmake @@ -18,7 +18,7 @@ elseif("${CMAKE_GENERATOR}" STREQUAL "Ninja") if(SCCACHE_PATH) execute_process(COMMAND sccache -V OUTPUT_VARIABLE sccache_version) - message(STATUS "${sccache_version} is founded, use [${SCCACHE_PATH}] to speed up compile on Windows.") + message(STATUS "sccache is found, use [${SCCACHE_PATH}] to speed up compilation on Windows.") set(CMAKE_C_COMPILER_LAUNCHER ${SCCACHE_PATH}) set(CMAKE_CXX_COMPILER_LAUNCHER ${SCCACHE_PATH}) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 0ed62ac93a7278..485fddff4df424 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -197,14 +197,14 @@ cc_test(operator_exception_test SRCS operator_exception_test.cc DEPS operator op cc_library(version SRCS version.cc) cc_test(version_test SRCS version_test.cc DEPS version) -cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) +cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS attribute shape_inference op_info operator glog version) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) cc_library(op_call_stack SRCS op_call_stack.cc DEPS op_proto_maker enforce) cc_test(op_call_stack_test SRCS op_call_stack_test.cc DEPS op_call_stack) -cc_library(program_processing SRCS program_processing.cc DEPS framework_proto) +cc_library(program_processing SRCS program_processing.cc DEPS boost proto_desc) cc_test(program_processing_test SRCS program_processing_test.cc DEPS proto_desc program_processing) if(WITH_GPU) diff --git a/paddle/fluid/framework/ir/fuse_pass_base.cc b/paddle/fluid/framework/ir/fuse_pass_base.cc index 9dfc8bf6037a74..4f89750daee16f 100644 --- a/paddle/fluid/framework/ir/fuse_pass_base.cc +++ b/paddle/fluid/framework/ir/fuse_pass_base.cc @@ -53,6 +53,8 @@ void FusePassBase::AddStatis(int count_of_fused) const { auto& info = graph_->Get>(kFuseStatisAttr); info[repr_] = count_of_fused; + if (count_of_fused > 0) + LOG(INFO) << "--- detected " << count_of_fused << " subgraphs"; } FuseOptions FusePassBase::FindFuseOption(const Node& node1, diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 7717bcfc3e9624..9d06a4de9548de 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -88,7 +88,7 @@ void GraphPatternDetector::operator()(Graph *graph, ValidateByNodeRole(&subgraphs); if (subgraphs.empty()) return; - LOG(INFO) << "--- detected " << subgraphs.size() << " subgraphs"; + int id = 0; for (auto &g : subgraphs) { VLOG(3) << "optimizing #" << id++ << " subgraph"; diff --git a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc index 9542d3d3d43f31..613768284735c1 100644 --- a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc +++ b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc @@ -191,11 +191,6 @@ void MapMatmul2MulPass::ApplyImpl(ir::Graph* graph) const { int found_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - if 
(!IsCompat(subgraph, g)) { - LOG(WARNING) << "Pass in op compat failed."; - return; - } - VLOG(4) << "map matmul to mul"; GET_IR_NODE_FROM_SUBGRAPH(matmul_in_x, matmul_in_x, matmul_pattern); GET_IR_NODE_FROM_SUBGRAPH(matmul_in_y, matmul_in_y, matmul_pattern); @@ -221,6 +216,10 @@ void MapMatmul2MulPass::ApplyImpl(ir::Graph* graph) const { next_ops[0]->Name() == "elementwise_add"; if (flag) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "Pass in op compat failed."; + return; + } OpDesc desc; desc.SetType("mul"); desc.SetInput("X", {matmul_in_x->Name()}); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc index 74bbe24eb82f5d..a7514038d400b6 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc @@ -70,7 +70,7 @@ ConvBiasFusePass::ConvBiasFusePass() { .IsTensor() .End() .AddAttr("axis") - .IsIntIn({-1, 0}) + .IsIntIn({1, 3}) .End(); } diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc index 80a9ef7eda724a..e41c35ba33fdc9 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc @@ -52,7 +52,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetInput("Bias", {}); } else if (type == "elementwise_add") { op->SetAttr("use_mkldnn", true); - op->SetAttr("axis", -1); + op->SetAttr("axis", 1); op->SetInput("X", {inputs[0]}); op->SetInput("Y", {inputs[1]}); op->SetOutput("Out", outputs); diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index dd3a33130a3e6e..d32ec581ce94b4 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1255,6 +1255,7 @@ USE_TRT_CONVERTER(nearest_interp); USE_TRT_CONVERTER(reshape); USE_TRT_CONVERTER(reduce_sum); USE_TRT_CONVERTER(gather_nd); +USE_TRT_CONVERTER(reduce_mean); #endif namespace paddle_infer { diff --git a/paddle/fluid/inference/tensorrt/convert/reduce_op.cc b/paddle/fluid/inference/tensorrt/convert/reduce_op.cc index 66d2680fe9969c..f3c4059b8e6456 100644 --- a/paddle/fluid/inference/tensorrt/convert/reduce_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/reduce_op.cc @@ -35,12 +35,18 @@ namespace paddle { namespace inference { namespace tensorrt { -class ReduceSumOpConverter : public OpConverter { +class ReduceOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { - VLOG(4) << "convert a paddle reduce_sum op to tensorrt reduce layer"; + VLOG(4) << "convert a paddle " << op_type << " op to tensorrt reduce layer"; framework::OpDesc op_desc(op, nullptr); + nvinfer1::ReduceOperation reduce_type; + if (op_type == "reduce_sum") { + reduce_type = nvinfer1::ReduceOperation::kSUM; + } else if (op_type == "reduce_mean") { + reduce_type = nvinfer1::ReduceOperation::kAVG; + } auto* x = engine_->GetITensor(op_desc.Input("X").front()); nvinfer1::Dims input_shape = x->getDimensions(); @@ -51,15 +57,13 @@ class ReduceSumOpConverter : public OpConverter { BOOST_GET_CONST(std::vector, op_desc.GetAttr("dim")); bool reduce_all = BOOST_GET_CONST(bool, op_desc.GetAttr("reduce_all")); - // Now we only support dynamic_shape mode. 
nvinfer1::IReduceLayer* layer = nullptr; if (reduce_all) { uint32_t reduce_dim = 0; for (int i = 0; i < input_dims; ++i) { reduce_dim |= 1 << i; } - layer = TRT_ENGINE_ADD_LAYER(engine_, Reduce, *x, - nvinfer1::ReduceOperation::kSUM, reduce_dim, + layer = TRT_ENGINE_ADD_LAYER(engine_, Reduce, *x, reduce_type, reduce_dim, keep_dim); } else { auto CvtToBitMask = [&](const std::vector& dims) -> uint32_t { @@ -68,19 +72,32 @@ class ReduceSumOpConverter : public OpConverter { if (x < 0) { res |= 1 << (x + input_dims); } else { + if (!engine_->with_dynamic_shape()) x = x - 1; res |= 1 << x; } } return res; }; - layer = TRT_ENGINE_ADD_LAYER(engine_, Reduce, *x, - nvinfer1::ReduceOperation::kSUM, + layer = TRT_ENGINE_ADD_LAYER(engine_, Reduce, *x, reduce_type, CvtToBitMask(dim), keep_dim); } auto output_name = op_desc.Output("Out")[0]; - RreplenishLayerAndOutput(layer, "reduce_sum", {output_name}, test_mode); + RreplenishLayerAndOutput(layer, op_type, {output_name}, test_mode); } + + protected: + std::string op_type; +}; + +class ReduceSumOpConverter : public ReduceOpConverter { + public: + ReduceSumOpConverter() { op_type = "reduce_sum"; } +}; + +class ReduceMeanOpConverter : public ReduceOpConverter { + public: + ReduceMeanOpConverter() { op_type = "reduce_mean"; } }; } // namespace tensorrt @@ -88,3 +105,4 @@ class ReduceSumOpConverter : public OpConverter { } // namespace paddle REGISTER_TRT_OP_CONVERTER(reduce_sum, ReduceSumOpConverter); +REGISTER_TRT_OP_CONVERTER(reduce_mean, ReduceMeanOpConverter); diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h index e3c7d8b10333c3..f0d585e1b4090a 100644 --- a/paddle/fluid/inference/tensorrt/helper.h +++ b/paddle/fluid/inference/tensorrt/helper.h @@ -39,6 +39,12 @@ namespace tensorrt { NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + \ NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD +#if IS_TRT_VERSION_GE(8000) +#define TRT_NOEXCEPT noexcept +#else +#define TRT_NOEXCEPT +#endif + namespace dy = paddle::platform::dynload; // TensorRT data type to size @@ -72,7 +78,8 @@ static int GetInferLibVersion() { // A logger for create TensorRT infer builder. 
class NaiveLogger : public nvinfer1::ILogger { public: - void log(nvinfer1::ILogger::Severity severity, const char* msg) override { + void log(nvinfer1::ILogger::Severity severity, + const char* msg) TRT_NOEXCEPT override { switch (severity) { case Severity::kVERBOSE: VLOG(3) << msg; @@ -105,7 +112,7 @@ class NaiveProfiler : public nvinfer1::IProfiler { typedef std::pair Record; std::vector mProfile; - virtual void reportLayerTime(const char* layerName, float ms) { + virtual void reportLayerTime(const char* layerName, float ms) TRT_NOEXCEPT { auto record = std::find_if(mProfile.begin(), mProfile.end(), [&](const Record& r) { return r.first == layerName; }); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index f98b0c9ede76e2..6c6006065435f4 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -130,6 +130,7 @@ struct SimpleOpTypeSetTeller : public Teller { "nearest_interp", "anchor_generator", "reduce_sum", + "reduce_mean", }; }; @@ -709,18 +710,24 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (!with_dynamic_shape && shape[0] == -1) return false; } - if (op_type == "reduce_sum") { - if (!with_dynamic_shape) { - VLOG(3) << "the reduce_sum does not support static shape yet"; - return false; - } - + if (op_type == "reduce_sum" || op_type == "reduce_mean") { if (!(desc.HasAttr("keep_dim") && desc.HasAttr("dim") && desc.HasAttr("reduce_all"))) { - VLOG(3) << "the reduce_sum does not have attr (keep_dim or dim or " + VLOG(3) << "the " << op_type + << " does not have attr (keep_dim or dim or " "reduce_all)"; return false; } + + // The batch size dimension cannot be reduced if it's not dynamic shape. + if (!with_dynamic_shape) { + if (desc.HasAttr("reduce_all")) return false; + std::vector dim = + BOOST_GET_CONST(std::vector, desc.GetAttr("dim")); + for (auto x : dim) { + if (!x) return false; + } + } } if ((*teller)(op_type, desc, use_no_calib_int8)) return true; diff --git a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu index 8cf9178b6f139b..e5584f26580679 100644 --- a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu @@ -110,16 +110,18 @@ AnchorGeneratorPlugin::AnchorGeneratorPlugin(const void* data, size_t length) { PrepareParamsOnDevice(); } -const char* AnchorGeneratorPlugin::getPluginType() const { +const char* AnchorGeneratorPlugin::getPluginType() const TRT_NOEXCEPT { return "anchor_generator_plugin"; } -const char* AnchorGeneratorPlugin::getPluginVersion() const { return "1"; } +const char* AnchorGeneratorPlugin::getPluginVersion() const TRT_NOEXCEPT { + return "1"; +} -int AnchorGeneratorPlugin::getNbOutputs() const { return 2; } +int AnchorGeneratorPlugin::getNbOutputs() const TRT_NOEXCEPT { return 2; } nvinfer1::Dims AnchorGeneratorPlugin::getOutputDimensions( - int index, const nvinfer1::Dims* inputs, int nb_input_dims) { + int index, const nvinfer1::Dims* inputs, int nb_input_dims) TRT_NOEXCEPT { nvinfer1::Dims dims{}; dims.nbDims = 4; dims.d[0] = height_; @@ -130,20 +132,21 @@ nvinfer1::Dims AnchorGeneratorPlugin::getOutputDimensions( } bool AnchorGeneratorPlugin::supportsFormat( - nvinfer1::DataType type, nvinfer1::TensorFormat format) const { + nvinfer1::DataType type, nvinfer1::TensorFormat format) const TRT_NOEXCEPT { // static shape plugin 
can't support different type between input/out // it may cause addition overhead in half mode return (type == data_type_ && format == nvinfer1::TensorFormat::kLINEAR); } -size_t AnchorGeneratorPlugin::getWorkspaceSize(int max_batch_size) const { +size_t AnchorGeneratorPlugin::getWorkspaceSize(int max_batch_size) const + TRT_NOEXCEPT { return 0; } template int AnchorGeneratorPlugin::enqueue_impl(int batch_size, const void* const* inputs, - void** outputs, void* workspace, + void* const* outputs, void* workspace, cudaStream_t stream) { const int block = 512; const int gen_anchor_grid = (box_num_ + block - 1) / block; @@ -169,15 +172,15 @@ int AnchorGeneratorPlugin::enqueue(int batch_size, const void* const* inputs, #else void* const* outputs, void* workspace, #endif - cudaStream_t stream) { + cudaStream_t stream) TRT_NOEXCEPT { return enqueue_impl(batch_size, inputs, outputs, workspace, stream); } -int AnchorGeneratorPlugin::initialize() { return 0; } +int AnchorGeneratorPlugin::initialize() TRT_NOEXCEPT { return 0; } -void AnchorGeneratorPlugin::terminate() {} +void AnchorGeneratorPlugin::terminate() TRT_NOEXCEPT {} -size_t AnchorGeneratorPlugin::getSerializationSize() const { +size_t AnchorGeneratorPlugin::getSerializationSize() const TRT_NOEXCEPT { size_t serialize_size = 0; serialize_size += SerializedSize(data_type_); serialize_size += SerializedSize(anchor_sizes_); @@ -192,7 +195,7 @@ size_t AnchorGeneratorPlugin::getSerializationSize() const { return serialize_size; } -void AnchorGeneratorPlugin::serialize(void* buffer) const { +void AnchorGeneratorPlugin::serialize(void* buffer) const TRT_NOEXCEPT { SerializeValue(&buffer, data_type_); SerializeValue(&buffer, anchor_sizes_); SerializeValue(&buffer, aspect_ratios_); @@ -205,28 +208,31 @@ void AnchorGeneratorPlugin::serialize(void* buffer) const { SerializeValue(&buffer, box_num_); } -void AnchorGeneratorPlugin::destroy() {} +void AnchorGeneratorPlugin::destroy() TRT_NOEXCEPT {} -void AnchorGeneratorPlugin::setPluginNamespace(const char* lib_namespace) { +void AnchorGeneratorPlugin::setPluginNamespace(const char* lib_namespace) + TRT_NOEXCEPT { namespace_ = std::string(lib_namespace); } -const char* AnchorGeneratorPlugin::getPluginNamespace() const { +const char* AnchorGeneratorPlugin::getPluginNamespace() const TRT_NOEXCEPT { return namespace_.c_str(); } nvinfer1::DataType AnchorGeneratorPlugin::getOutputDataType( - int index, const nvinfer1::DataType* input_type, int nb_inputs) const { + int index, const nvinfer1::DataType* input_type, + int nb_inputs) const TRT_NOEXCEPT { return input_type[0]; } bool AnchorGeneratorPlugin::isOutputBroadcastAcrossBatch( - int output_index, const bool* input_is_broadcast, int nb_inputs) const { + int output_index, const bool* input_is_broadcast, + int nb_inputs) const TRT_NOEXCEPT { return true; } -bool AnchorGeneratorPlugin::canBroadcastInputAcrossBatch( - int input_index) const { +bool AnchorGeneratorPlugin::canBroadcastInputAcrossBatch(int input_index) const + TRT_NOEXCEPT { return false; } @@ -236,9 +242,9 @@ void AnchorGeneratorPlugin::configurePlugin( const nvinfer1::DataType* input_types, const nvinfer1::DataType* output_types, const bool* input_is_broadcast, const bool* output_is_broadcast, nvinfer1::PluginFormat float_format, - int max_batct_size) {} + int max_batct_size) TRT_NOEXCEPT {} -nvinfer1::IPluginV2Ext* AnchorGeneratorPlugin::clone() const { +nvinfer1::IPluginV2Ext* AnchorGeneratorPlugin::clone() const TRT_NOEXCEPT { auto plugin = new AnchorGeneratorPlugin( data_type_, anchor_sizes_, 
aspect_ratios_, stride_, variances_, offset_, height_, width_, num_anchors_, box_num_); @@ -246,30 +252,32 @@ nvinfer1::IPluginV2Ext* AnchorGeneratorPlugin::clone() const { return plugin; } -void AnchorGeneratorPluginCreator::setPluginNamespace( - const char* lib_namespace) { +void AnchorGeneratorPluginCreator::setPluginNamespace(const char* lib_namespace) + TRT_NOEXCEPT { namespace_ = std::string(lib_namespace); } -const char* AnchorGeneratorPluginCreator::getPluginNamespace() const { +const char* AnchorGeneratorPluginCreator::getPluginNamespace() const + TRT_NOEXCEPT { return namespace_.c_str(); } -const char* AnchorGeneratorPluginCreator::getPluginName() const { +const char* AnchorGeneratorPluginCreator::getPluginName() const TRT_NOEXCEPT { return "anchor_generator_plugin"; } -const char* AnchorGeneratorPluginCreator::getPluginVersion() const { +const char* AnchorGeneratorPluginCreator::getPluginVersion() const + TRT_NOEXCEPT { return "1"; } const nvinfer1::PluginFieldCollection* -AnchorGeneratorPluginCreator::getFieldNames() { +AnchorGeneratorPluginCreator::getFieldNames() TRT_NOEXCEPT { return &field_collection_; } nvinfer1::IPluginV2Ext* AnchorGeneratorPluginCreator::createPlugin( - const char* name, const nvinfer1::PluginFieldCollection* fc) { + const char* name, const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT { const nvinfer1::PluginField* fields = fc->fields; int type_id = -1; std::vector anchor_sizes, aspect_ratios, stride, variances; @@ -315,7 +323,8 @@ nvinfer1::IPluginV2Ext* AnchorGeneratorPluginCreator::createPlugin( } nvinfer1::IPluginV2Ext* AnchorGeneratorPluginCreator::deserializePlugin( - const char* name, const void* serial_data, size_t serial_length) { + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT { auto plugin = new AnchorGeneratorPlugin(serial_data, serial_length); plugin->setPluginNamespace(namespace_.c_str()); return plugin; @@ -374,7 +383,8 @@ AnchorGeneratorPluginDynamic::AnchorGeneratorPluginDynamic(void const* data, PrepareParamsOnDevice(); } -nvinfer1::IPluginV2DynamicExt* AnchorGeneratorPluginDynamic::clone() const { +nvinfer1::IPluginV2DynamicExt* AnchorGeneratorPluginDynamic::clone() const + TRT_NOEXCEPT { auto plugin = new AnchorGeneratorPluginDynamic( data_type_, anchor_sizes_, aspect_ratios_, stride_, variances_, offset_, num_anchors_); @@ -384,7 +394,7 @@ nvinfer1::IPluginV2DynamicExt* AnchorGeneratorPluginDynamic::clone() const { nvinfer1::DimsExprs AnchorGeneratorPluginDynamic::getOutputDimensions( int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, - nvinfer1::IExprBuilder& exprBuilder) { + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT { nvinfer1::DimsExprs ret{}; ret.nbDims = 4; ret.d[0] = inputs[0].d[2]; // feature height @@ -396,7 +406,7 @@ nvinfer1::DimsExprs AnchorGeneratorPluginDynamic::getOutputDimensions( bool AnchorGeneratorPluginDynamic::supportsFormatCombination( int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, - int nbOutputs) { + int nbOutputs) TRT_NOEXCEPT { // input can be any, doesn't matter // anchor generator doesn't read input raw data, only need the shape info auto type = inOut[pos].type; @@ -412,11 +422,12 @@ bool AnchorGeneratorPluginDynamic::supportsFormatCombination( void AnchorGeneratorPluginDynamic::configurePlugin( const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) {} + const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) TRT_NOEXCEPT {} size_t 
AnchorGeneratorPluginDynamic::getWorkspaceSize( const nvinfer1::PluginTensorDesc* inputs, int nbInputs, - const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const { + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT { return 0; } @@ -449,7 +460,7 @@ int AnchorGeneratorPluginDynamic::enqueue_impl( int AnchorGeneratorPluginDynamic::enqueue( const nvinfer1::PluginTensorDesc* inputDesc, const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, - void* const* outputs, void* workspace, cudaStream_t stream) { + void* const* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT { assert(outputDesc[0].type == nvinfer1::DataType::kFLOAT); assert(outputDesc[1].type == nvinfer1::DataType::kFLOAT); return enqueue_impl(inputDesc, outputDesc, inputs, outputs, workspace, @@ -457,21 +468,24 @@ int AnchorGeneratorPluginDynamic::enqueue( } nvinfer1::DataType AnchorGeneratorPluginDynamic::getOutputDataType( - int index, const nvinfer1::DataType* inputTypes, int nbInputs) const { + int index, const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT { return inputTypes[0]; } -const char* AnchorGeneratorPluginDynamic::getPluginType() const { +const char* AnchorGeneratorPluginDynamic::getPluginType() const TRT_NOEXCEPT { return "anchor_generator_plugin_dynamic"; } -int AnchorGeneratorPluginDynamic::getNbOutputs() const { return 2; } +int AnchorGeneratorPluginDynamic::getNbOutputs() const TRT_NOEXCEPT { + return 2; +} -int AnchorGeneratorPluginDynamic::initialize() { return 0; } +int AnchorGeneratorPluginDynamic::initialize() TRT_NOEXCEPT { return 0; } -void AnchorGeneratorPluginDynamic::terminate() {} +void AnchorGeneratorPluginDynamic::terminate() TRT_NOEXCEPT {} -size_t AnchorGeneratorPluginDynamic::getSerializationSize() const { +size_t AnchorGeneratorPluginDynamic::getSerializationSize() const TRT_NOEXCEPT { size_t serialize_size = 0; serialize_size += SerializedSize(data_type_); serialize_size += SerializedSize(anchor_sizes_); @@ -483,7 +497,7 @@ size_t AnchorGeneratorPluginDynamic::getSerializationSize() const { return serialize_size; } -void AnchorGeneratorPluginDynamic::serialize(void* buffer) const { +void AnchorGeneratorPluginDynamic::serialize(void* buffer) const TRT_NOEXCEPT { SerializeValue(&buffer, data_type_); SerializeValue(&buffer, anchor_sizes_); SerializeValue(&buffer, aspect_ratios_); @@ -493,32 +507,35 @@ void AnchorGeneratorPluginDynamic::serialize(void* buffer) const { SerializeValue(&buffer, num_anchors_); } -void AnchorGeneratorPluginDynamic::destroy() {} +void AnchorGeneratorPluginDynamic::destroy() TRT_NOEXCEPT {} void AnchorGeneratorPluginDynamicCreator::setPluginNamespace( - const char* lib_namespace) { + const char* lib_namespace) TRT_NOEXCEPT { namespace_ = std::string(lib_namespace); } -const char* AnchorGeneratorPluginDynamicCreator::getPluginNamespace() const { +const char* AnchorGeneratorPluginDynamicCreator::getPluginNamespace() const + TRT_NOEXCEPT { return namespace_.c_str(); } -const char* AnchorGeneratorPluginDynamicCreator::getPluginName() const { +const char* AnchorGeneratorPluginDynamicCreator::getPluginName() const + TRT_NOEXCEPT { return "anchor_generator_plugin_dynamic"; } -const char* AnchorGeneratorPluginDynamicCreator::getPluginVersion() const { +const char* AnchorGeneratorPluginDynamicCreator::getPluginVersion() const + TRT_NOEXCEPT { return "1"; } const nvinfer1::PluginFieldCollection* -AnchorGeneratorPluginDynamicCreator::getFieldNames() { 
+AnchorGeneratorPluginDynamicCreator::getFieldNames() TRT_NOEXCEPT { return &field_collection_; } nvinfer1::IPluginV2Ext* AnchorGeneratorPluginDynamicCreator::createPlugin( - const char* name, const nvinfer1::PluginFieldCollection* fc) { + const char* name, const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT { const nvinfer1::PluginField* fields = fc->fields; int type_id = -1; std::vector anchor_sizes, aspect_ratios, stride, variances; @@ -555,7 +572,8 @@ nvinfer1::IPluginV2Ext* AnchorGeneratorPluginDynamicCreator::createPlugin( } nvinfer1::IPluginV2Ext* AnchorGeneratorPluginDynamicCreator::deserializePlugin( - const char* name, const void* serial_data, size_t serial_length) { + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT { auto plugin = new AnchorGeneratorPluginDynamic(serial_data, serial_length); plugin->setPluginNamespace(namespace_.c_str()); return plugin; diff --git a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h index 458326d0679ca9..3d265dfb5933e9 100644 --- a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h @@ -34,34 +34,35 @@ class AnchorGeneratorPlugin : public nvinfer1::IPluginV2Ext { const int width, const int num_anchors, const int box_num); AnchorGeneratorPlugin(const void* data, size_t length); ~AnchorGeneratorPlugin() override; - const char* getPluginType() const override; - const char* getPluginVersion() const override; - int getNbOutputs() const override; + const char* getPluginType() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, - int nb_input_dims) override; - bool supportsFormat(nvinfer1::DataType type, - nvinfer1::TensorFormat format) const override; - size_t getWorkspaceSize(int max_batch_size) const override; + int nb_input_dims) TRT_NOEXCEPT override; + bool supportsFormat(nvinfer1::DataType type, nvinfer1::TensorFormat format) + const TRT_NOEXCEPT override; + size_t getWorkspaceSize(int max_batch_size) const TRT_NOEXCEPT override; #if IS_TRT_VERSION_LT(8000) int enqueue(int batch_size, const void* const* inputs, void** outputs, #else int enqueue(int batch_size, const void* const* inputs, void* const* outputs, #endif - void* workspace, cudaStream_t stream) override; - int initialize() override; - void terminate() override; - size_t getSerializationSize() const override; - void serialize(void* buffer) const override; - void destroy() override; - void setPluginNamespace(const char* lib_namespace) override; - const char* getPluginNamespace() const override; - nvinfer1::DataType getOutputDataType(int index, - const nvinfer1::DataType* input_type, - int nb_inputs) const override; + void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; + int initialize() TRT_NOEXCEPT override; + void terminate() TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; + void destroy() TRT_NOEXCEPT override; + void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override; + const char* getPluginNamespace() const TRT_NOEXCEPT override; + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* input_type, + int nb_inputs) const TRT_NOEXCEPT override; bool 
isOutputBroadcastAcrossBatch(int output_index, const bool* input_is_broadcast, - int nb_inputs) const override; - bool canBroadcastInputAcrossBatch(int input_index) const override; + int nb_inputs) const TRT_NOEXCEPT override; + bool canBroadcastInputAcrossBatch(int input_index) const + TRT_NOEXCEPT override; void configurePlugin(const nvinfer1::Dims* input_dims, int nb_inputs, const nvinfer1::Dims* output_dims, int nb_outputs, const nvinfer1::DataType* input_types, @@ -69,13 +70,13 @@ class AnchorGeneratorPlugin : public nvinfer1::IPluginV2Ext { const bool* input_is_broadcast, const bool* output_is_broadcast, nvinfer1::PluginFormat float_format, - int max_batct_size) override; - nvinfer1::IPluginV2Ext* clone() const override; + int max_batct_size) TRT_NOEXCEPT override; + nvinfer1::IPluginV2Ext* clone() const TRT_NOEXCEPT override; private: template - int enqueue_impl(int batch_size, const void* const* inputs, void** outputs, - void* workspace, cudaStream_t stream); + int enqueue_impl(int batch_size, const void* const* inputs, + void* const* outputs, void* workspace, cudaStream_t stream); nvinfer1::DataType data_type_; std::vector anchor_sizes_; std::vector aspect_ratios_; @@ -97,16 +98,17 @@ class AnchorGeneratorPluginCreator : public nvinfer1::IPluginCreator { public: AnchorGeneratorPluginCreator() = default; ~AnchorGeneratorPluginCreator() override = default; - void setPluginNamespace(const char* lib_namespace) override; - const char* getPluginNamespace() const override; - const char* getPluginName() const override; - const char* getPluginVersion() const override; - const nvinfer1::PluginFieldCollection* getFieldNames() override; + void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override; + const char* getPluginNamespace() const TRT_NOEXCEPT override; + const char* getPluginName() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; + const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override; nvinfer1::IPluginV2Ext* createPlugin( - const char* name, const nvinfer1::PluginFieldCollection* fc) override; - nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, - const void* serial_data, - size_t serial_length) override; + const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override; + nvinfer1::IPluginV2Ext* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override; private: std::string namespace_; @@ -127,35 +129,36 @@ class AnchorGeneratorPluginDynamic : public DynamicPluginTensorRT { const int num_anchors); AnchorGeneratorPluginDynamic(void const* data, size_t length); ~AnchorGeneratorPluginDynamic(); - nvinfer1::IPluginV2DynamicExt* clone() const override; + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; nvinfer1::DimsExprs getOutputDimensions( int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, - nvinfer1::IExprBuilder& exprBuilder) override; + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* inOut, - int nbInputs, int nbOutputs) override; + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, - int nbOutputs) override; + int nbOutputs) TRT_NOEXCEPT override; size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, - int 
nbOutputs) const override; + int nbOutputs) const TRT_NOEXCEPT override; int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) override; - nvinfer1::DataType getOutputDataType(int index, - const nvinfer1::DataType* inputTypes, - int nbInputs) const override; - const char* getPluginType() const override; - int getNbOutputs() const override; - int initialize() override; - void terminate() override; - size_t getSerializationSize() const override; - void serialize(void* buffer) const override; - void destroy() override; + cudaStream_t stream) TRT_NOEXCEPT override; + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT override; + const char* getPluginType() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; + int initialize() TRT_NOEXCEPT override; + void terminate() TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; + void destroy() TRT_NOEXCEPT override; private: template @@ -181,16 +184,17 @@ class AnchorGeneratorPluginDynamicCreator : public nvinfer1::IPluginCreator { public: AnchorGeneratorPluginDynamicCreator() = default; ~AnchorGeneratorPluginDynamicCreator() override = default; - void setPluginNamespace(const char* lib_namespace) override; - const char* getPluginNamespace() const override; - const char* getPluginName() const override; - const char* getPluginVersion() const override; - const nvinfer1::PluginFieldCollection* getFieldNames() override; + void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override; + const char* getPluginNamespace() const TRT_NOEXCEPT override; + const char* getPluginName() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; + const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override; nvinfer1::IPluginV2Ext* createPlugin( - const char* name, const nvinfer1::PluginFieldCollection* fc) override; - nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, - const void* serial_data, - size_t serial_length) override; + const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override; + nvinfer1::IPluginV2Ext* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override; private: std::string namespace_; diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu index 3338aae370e514..69e0075729b0dc 100644 --- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu @@ -48,7 +48,7 @@ __global__ void elementwise_kernel(const size_t total, const T *x_data, } nvinfer1::Dims ElementWisePlugin::getOutputDimensions( - int index, const nvinfer1::Dims *input_dims, int num_inputs) { + int index, const nvinfer1::Dims *input_dims, int num_inputs) TRT_NOEXCEPT { PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( "There is only one output in TRT elementwise " "op plugin, but got output index: %d.", @@ -64,7 +64,7 @@ nvinfer1::Dims ElementWisePlugin::getOutputDimensions( return input_dims[0]; } -int ElementWisePlugin::initialize() { +int ElementWisePlugin::initialize() TRT_NOEXCEPT { PADDLE_ENFORCE_GT(dims_y_.nbDims, 0, 
platform::errors::InvalidArgument( "The dimension of input Y of TRT elementwise op plugin " @@ -120,7 +120,7 @@ int ElementWisePlugin::enqueue(int batch_size, const void *const *inputs, #else void *const *outputs, void *workspace, #endif - cudaStream_t stream) { + cudaStream_t stream) TRT_NOEXCEPT { const float *x = reinterpret_cast(inputs[0]); const float *y = reinterpret_cast(inputs[1]); float *out = reinterpret_cast(outputs[0]); @@ -147,26 +147,26 @@ int ElementWisePlugin::enqueue(int batch_size, const void *const *inputs, // Dynamic Plugin below. #if IS_TRT_VERSION_GE(6000) -int ElementwisePluginDynamic::initialize() { return 0; } +int ElementwisePluginDynamic::initialize() TRT_NOEXCEPT { return 0; } -size_t ElementwisePluginDynamic::getSerializationSize() const { +size_t ElementwisePluginDynamic::getSerializationSize() const TRT_NOEXCEPT { return SerializedSize(type_.c_str()) + SerializedSize(axis_); } -void ElementwisePluginDynamic::serialize(void *buffer) const { +void ElementwisePluginDynamic::serialize(void *buffer) const TRT_NOEXCEPT { SerializeValue(&buffer, type_.c_str()); SerializeValue(&buffer, axis_); } nvinfer1::DimsExprs ElementwisePluginDynamic::getOutputDimensions( int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs, - nvinfer1::IExprBuilder &expr_builder) { + nvinfer1::IExprBuilder &expr_builder) TRT_NOEXCEPT { return inputs[0]; } bool ElementwisePluginDynamic::supportsFormatCombination( int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs, - int nb_outputs) { + int nb_outputs) TRT_NOEXCEPT { PADDLE_ENFORCE_NOT_NULL( in_out, platform::errors::InvalidArgument( "The input of swish plugin shoule not be nullptr.")); @@ -189,7 +189,8 @@ bool ElementwisePluginDynamic::supportsFormatCombination( } nvinfer1::DataType ElementwisePluginDynamic::getOutputDataType( - int index, const nvinfer1::DataType *input_types, int nb_inputs) const { + int index, const nvinfer1::DataType *input_types, + int nb_inputs) const TRT_NOEXCEPT { PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( "The Elementwise Plugin only has one input, so the " @@ -201,7 +202,7 @@ nvinfer1::DataType ElementwisePluginDynamic::getOutputDataType( int ElementwisePluginDynamic::enqueue( const nvinfer1::PluginTensorDesc *input_desc, const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs, - void *const *outputs, void *workspace, cudaStream_t stream) { + void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT { auto x_dims = input_desc[0].dims; auto y_dims = input_desc[1].dims; int axis = (axis_ == -1) ? 
x_dims.nbDims - y_dims.nbDims : axis_; diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h index 5dd3142c758398..aa1ab5389a5720 100644 --- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h @@ -48,33 +48,35 @@ class ElementWisePlugin : public PluginTensorRT { DeserializeValue(&serial_data, &serial_length, &post_size_); } - ElementWisePlugin* clone() const override { + ElementWisePlugin* clone() const TRT_NOEXCEPT override { return new ElementWisePlugin(type_, dims_x_, dims_y_, axis_); } - const char* getPluginType() const override { return "elementwise_plugin"; } + const char* getPluginType() const TRT_NOEXCEPT override { + return "elementwise_plugin"; + } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* input_dims, - int num_inputs) override; + int num_inputs) TRT_NOEXCEPT override; - int initialize() override; + int initialize() TRT_NOEXCEPT override; #if IS_TRT_VERSION_LT(8000) int enqueue(int batch_size, const void* const* inputs, void** outputs, #else int enqueue(int batch_size, const void* const* inputs, void* const* outputs, #endif - void* workspace, cudaStream_t stream); + void* workspace, cudaStream_t stream) TRT_NOEXCEPT; - size_t getSerializationSize() const override { + size_t getSerializationSize() const TRT_NOEXCEPT override { return getBaseSerializationSize() + SerializedSize(type_.c_str()) + SerializedSize(dims_x_) + SerializedSize(dims_y_) + SerializedSize(axis_) + SerializedSize(prev_size_) + SerializedSize(midd_size_) + SerializedSize(post_size_); } - void serialize(void* buffer) const override { + void serialize(void* buffer) const TRT_NOEXCEPT override { serializeBase(buffer); SerializeValue(&buffer, type_.c_str()); SerializeValue(&buffer, dims_x_); @@ -97,13 +99,15 @@ class ElementWisePlugin : public PluginTensorRT { class ElementWisePluginCreator : public TensorRTPluginCreator { public: - const char* getPluginName() const override { return "elementwise_plugin"; } + const char* getPluginName() const TRT_NOEXCEPT override { + return "elementwise_plugin"; + } - const char* getPluginVersion() const override { return "1"; } + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } - nvinfer1::IPluginV2* deserializePlugin(const char* name, - const void* serial_data, - size_t serial_length) override { + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { return new ElementWisePlugin(serial_data, serial_length); } }; @@ -120,48 +124,49 @@ class ElementwisePluginDynamic : public DynamicPluginTensorRT { type_ = std::string(elementwise_type); DeserializeValue(&serialData, &serialLength, &axis_); } - nvinfer1::IPluginV2DynamicExt* clone() const override { + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override { return new ElementwisePluginDynamic(type_, axis_); } - const char* getPluginType() const override { + const char* getPluginType() const TRT_NOEXCEPT override { return "elementwise_plugin_dynamic"; } - int getNbOutputs() const override { return 1; } - int initialize() override; + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } + int initialize() TRT_NOEXCEPT override; - size_t getSerializationSize() const override; - void serialize(void* buffer) const override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const 
TRT_NOEXCEPT override; nvinfer1::DimsExprs getOutputDimensions( int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, - nvinfer1::IExprBuilder& expr_builder) override; + nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* inOut, - int nbInputs, int nbOutputs) override; + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, - int nbOutputs) override {} + int nbOutputs) TRT_NOEXCEPT override {} size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, - int nbOutputs) const override { + int nbOutputs) const TRT_NOEXCEPT override { return 0; } int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) override; - nvinfer1::DataType getOutputDataType(int index, - const nvinfer1::DataType* inputTypes, - int nbInputs) const override; + cudaStream_t stream) TRT_NOEXCEPT override; + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT override; - void destroy() override { delete this; } + void destroy() TRT_NOEXCEPT override { delete this; } private: std::string type_; @@ -171,33 +176,34 @@ class ElementwisePluginDynamic : public DynamicPluginTensorRT { class ElementwisePluginDynamicCreator : public nvinfer1::IPluginCreator { public: ElementwisePluginDynamicCreator() {} - const char* getPluginName() const override { + const char* getPluginName() const TRT_NOEXCEPT override { return "elementwise_plugin_dynamic"; } - const char* getPluginVersion() const override { return "1"; } + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } - const nvinfer1::PluginFieldCollection* getFieldNames() override { + const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override { return &field_collection_; } - nvinfer1::IPluginV2* createPlugin( - const char* name, const nvinfer1::PluginFieldCollection* fc) override { + nvinfer1::IPluginV2* createPlugin(const char* name, + const nvinfer1::PluginFieldCollection* fc) + TRT_NOEXCEPT override { return nullptr; } - nvinfer1::IPluginV2* deserializePlugin(const char* name, - const void* serial_data, - size_t serial_length) override { + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { auto plugin = new ElementwisePluginDynamic(serial_data, serial_length); return plugin; } - void setPluginNamespace(const char* lib_namespace) override { + void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override { plugin_namespace_ = lib_namespace; } - const char* getPluginNamespace() const override { + const char* getPluginNamespace() const TRT_NOEXCEPT override { return plugin_namespace_.c_str(); } diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu index 79fc3d66bbe4dd..a8e9a94955f702 100644 --- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu @@ -128,7 +128,7 @@ template int EmbEltwiseLayernormPluginDynamicImpl::enqueue( const nvinfer1::PluginTensorDesc *input_desc, 
const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs, - void *const *outputs, void *workspace, cudaStream_t stream) { + void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT { auto id_dims = input_desc[0].dims; int batch = id_dims.d[0]; int seq_len = id_dims.d[1]; @@ -181,17 +181,19 @@ template class EmbEltwiseLayernormPluginDynamicImpl; template class EmbEltwiseLayernormPluginDynamicImpl; #endif -int EmbEltwiseLayernormPluginDynamic::initialize() { +int EmbEltwiseLayernormPluginDynamic::initialize() TRT_NOEXCEPT { impl_->initialize(); return 0; } -void EmbEltwiseLayernormPluginDynamic::terminate() { impl_->terminate(); } +void EmbEltwiseLayernormPluginDynamic::terminate() TRT_NOEXCEPT { + impl_->terminate(); +} nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic::getOutputDimensions( int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs, - nvinfer1::IExprBuilder &expr_builder) { // NOLINT + nvinfer1::IExprBuilder &expr_builder) TRT_NOEXCEPT { // NOLINT PADDLE_ENFORCE_EQ(output_index, 0, platform::errors::InvalidArgument( "There is only one output of the EmbEltwiseLayernorm, " @@ -208,7 +210,7 @@ nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic::getOutputDimensions( bool EmbEltwiseLayernormPluginDynamic::supportsFormatCombination( int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs, - int nb_outputs) { + int nb_outputs) TRT_NOEXCEPT { PADDLE_ENFORCE_NOT_NULL( in_out, platform::errors::InvalidArgument( "The input of swish plugin shoule not be nullptr.")); @@ -256,7 +258,8 @@ bool EmbEltwiseLayernormPluginDynamic::supportsFormatCombination( } nvinfer1::DataType EmbEltwiseLayernormPluginDynamic::getOutputDataType( - int index, const nvinfer1::DataType *input_types, int nb_inputs) const { + int index, const nvinfer1::DataType *input_types, + int nb_inputs) const TRT_NOEXCEPT { PADDLE_ENFORCE_EQ( index, 0, platform::errors::InvalidArgument( "The EmbEltwiseLayernorm Plugin only has one input, so the " @@ -271,7 +274,7 @@ nvinfer1::DataType EmbEltwiseLayernormPluginDynamic::getOutputDataType( int EmbEltwiseLayernormPluginDynamic::enqueue( const nvinfer1::PluginTensorDesc *input_desc, const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs, - void *const *outputs, void *workspace, cudaStream_t stream) { + void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT { impl_->enqueue(input_desc, output_desc, inputs, outputs, workspace, stream); return cudaGetLastError() != cudaSuccess; } diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h index 7de84a8fc49bcc..f44391310cc219 100644 --- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h @@ -68,7 +68,7 @@ class EmbEltwiseLayernormPluginDynamicImpl int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream); + cudaStream_t stream) TRT_NOEXCEPT; void shareGPUData(const EmbEltwiseLayernormPluginDynamicImplBase* anthor); private: @@ -189,7 +189,7 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { } } - nvinfer1::IPluginV2DynamicExt* clone() const override { + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override { auto ptr = new EmbEltwiseLayernormPluginDynamic( embs_, bias_, scale_, emb_sizes_, 
bias_size_, scale_size_, hidden_size_, eps_, with_fp16_); @@ -197,14 +197,14 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { return ptr; } - const char* getPluginType() const override { + const char* getPluginType() const TRT_NOEXCEPT override { return "fused_embedding_eltwise_layernorm_plugin"; } - int getNbOutputs() const override { return 1; } - int initialize() override; - void terminate() override; + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } + int initialize() TRT_NOEXCEPT override; + void terminate() TRT_NOEXCEPT override; - size_t getSerializationSize() const override { + size_t getSerializationSize() const TRT_NOEXCEPT override { int sum_num = 0; sum_num += SerializedSize(emb_sizes_); @@ -223,7 +223,7 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { return sum_num; } - void serialize(void* buffer) const override { + void serialize(void* buffer) const TRT_NOEXCEPT override { SerializeValue(&buffer, emb_sizes_); for (size_t i = 0; i < emb_sizes_.size(); i++) { auto size = emb_sizes_[i]; @@ -248,33 +248,34 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { nvinfer1::DimsExprs getOutputDimensions( int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, - nvinfer1::IExprBuilder& expr_builder) override; + nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* in_out, - int nb_inputs, int nb_outputs) override; + int nb_inputs, + int nb_outputs) TRT_NOEXCEPT override; void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nb_inputs, const nvinfer1::DynamicPluginTensorDesc* out, - int nb_outputs) override {} + int nb_outputs) TRT_NOEXCEPT override {} size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nb_inputs, const nvinfer1::PluginTensorDesc* outputs, - int nb_outputs) const override { + int nb_outputs) const TRT_NOEXCEPT override { return 0; } int enqueue(const nvinfer1::PluginTensorDesc* input_desc, const nvinfer1::PluginTensorDesc* output_desc, const void* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) override; - nvinfer1::DataType getOutputDataType(int index, - const nvinfer1::DataType* input_types, - int nb_inputs) const override; + cudaStream_t stream) TRT_NOEXCEPT override; + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* input_types, + int nb_inputs) const TRT_NOEXCEPT override; - void destroy() override { + void destroy() TRT_NOEXCEPT override { if (own_host_buff_) { for (auto ptr : embs_) { delete[] ptr; @@ -310,32 +311,33 @@ class EmbEltwiseLayernormPluginDynamicCreator : public nvinfer1::IPluginCreator { public: EmbEltwiseLayernormPluginDynamicCreator() {} - const char* getPluginName() const override { + const char* getPluginName() const TRT_NOEXCEPT override { return "fused_embedding_eltwise_layernorm_plugin"; } - const char* getPluginVersion() const override { return "1"; } + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } - const nvinfer1::PluginFieldCollection* getFieldNames() override { + const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override { return &field_collection_; } - nvinfer1::IPluginV2* createPlugin( - const char* name, const nvinfer1::PluginFieldCollection* fc) override { + nvinfer1::IPluginV2* createPlugin(const char* name, + const nvinfer1::PluginFieldCollection* fc) + TRT_NOEXCEPT override { return nullptr; } - nvinfer1::IPluginV2* 
deserializePlugin(const char* name, - const void* serial_data, - size_t serial_length) override { + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { return new EmbEltwiseLayernormPluginDynamic(serial_data, serial_length); } - void setPluginNamespace(const char* lib_namespace) override { + void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override { plugin_namespace_ = lib_namespace; } - const char* getPluginNamespace() const override { + const char* getPluginNamespace() const TRT_NOEXCEPT override { return plugin_namespace_.c_str(); } diff --git a/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.cu index 933ca333cdbb93..4371cc69f33341 100644 --- a/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.cu @@ -58,19 +58,19 @@ __global__ void GatherNdCUDAKernel(const T* input, const int32_t* input_dims, } } -int GatherNdPluginDynamic::initialize() { return 0; } +int GatherNdPluginDynamic::initialize() TRT_NOEXCEPT { return 0; } -size_t GatherNdPluginDynamic::getSerializationSize() const { +size_t GatherNdPluginDynamic::getSerializationSize() const TRT_NOEXCEPT { return SerializedSize(with_fp16_); } -void GatherNdPluginDynamic::serialize(void* buffer) const { +void GatherNdPluginDynamic::serialize(void* buffer) const TRT_NOEXCEPT { SerializeValue(&buffer, with_fp16_); } nvinfer1::DimsExprs GatherNdPluginDynamic::getOutputDimensions( int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, - nvinfer1::IExprBuilder& expr_builder) { + nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT { PADDLE_ENFORCE_EQ( nb_inputs, 2, platform::errors::InvalidArgument( @@ -100,7 +100,7 @@ nvinfer1::DimsExprs GatherNdPluginDynamic::getOutputDimensions( bool GatherNdPluginDynamic::supportsFormatCombination( int pos, const nvinfer1::PluginTensorDesc* in_out, int nb_inputs, - int nb_outputs) { + int nb_outputs) TRT_NOEXCEPT { PADDLE_ENFORCE_NOT_NULL( in_out, platform::errors::InvalidArgument( "The input of gather_nd plugin should not be nullptr.")); @@ -134,14 +134,15 @@ bool GatherNdPluginDynamic::supportsFormatCombination( } nvinfer1::DataType GatherNdPluginDynamic::getOutputDataType( - int index, const nvinfer1::DataType* input_types, int nb_inputs) const { + int index, const nvinfer1::DataType* input_types, + int nb_inputs) const TRT_NOEXCEPT { return input_types[0]; } int GatherNdPluginDynamic::enqueue( const nvinfer1::PluginTensorDesc* input_desc, const nvinfer1::PluginTensorDesc* output_desc, const void* const* inputs, - void* const* outputs, void* workspace, cudaStream_t stream) { + void* const* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT { auto input_dims = input_desc[0].dims; auto index_dims = input_desc[1].dims; auto input_dims_size = input_dims.nbDims; diff --git a/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h index 0a242238c81fb3..841fb2f6fe399f 100644 --- a/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/gather_nd_op_plugin.h @@ -35,46 +35,49 @@ class GatherNdPluginDynamic : public DynamicPluginTensorRT { DeserializeValue(&serial_data, &serial_length, &with_fp16_); } - nvinfer1::IPluginV2DynamicExt* clone() const override { + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override { return 
new GatherNdPluginDynamic(with_fp16_); } - const char* getPluginType() const override { return "gather_nd_plugin"; } - int getNbOutputs() const override { return 1; } - int initialize() override; + const char* getPluginType() const TRT_NOEXCEPT override { + return "gather_nd_plugin"; + } + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } + int initialize() TRT_NOEXCEPT override; - size_t getSerializationSize() const override; - void serialize(void* buffer) const override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; nvinfer1::DimsExprs getOutputDimensions( int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, - nvinfer1::IExprBuilder& exprBuilder) override; + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* inOut, - int nbInputs, int nbOutputs) override; + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, - int nbOutputs) override {} + int nbOutputs) TRT_NOEXCEPT override {} size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, - int nbOutputs) const override { + int nbOutputs) const TRT_NOEXCEPT override { return 0; } int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) override; - nvinfer1::DataType getOutputDataType(int index, - const nvinfer1::DataType* inputTypes, - int nbInputs) const override; + cudaStream_t stream) TRT_NOEXCEPT override; + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT override; - void destroy() override { + void destroy() TRT_NOEXCEPT override { if (input_dims_data_) { cudaFree(input_dims_data_); } @@ -88,31 +91,34 @@ class GatherNdPluginDynamic : public DynamicPluginTensorRT { class GatherNdPluginDynamicCreator : public nvinfer1::IPluginCreator { public: GatherNdPluginDynamicCreator() {} - const char* getPluginName() const override { return "gather_nd_plugin"; } + const char* getPluginName() const TRT_NOEXCEPT override { + return "gather_nd_plugin"; + } - const char* getPluginVersion() const override { return "1"; } + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } - const nvinfer1::PluginFieldCollection* getFieldNames() override { + const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override { return &field_collection_; } - nvinfer1::IPluginV2* createPlugin( - const char* name, const nvinfer1::PluginFieldCollection* fc) override { + nvinfer1::IPluginV2* createPlugin(const char* name, + const nvinfer1::PluginFieldCollection* fc) + TRT_NOEXCEPT override { return nullptr; } - nvinfer1::IPluginV2* deserializePlugin(const char* name, - const void* serial_data, - size_t serial_length) override { + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { auto plugin = new GatherNdPluginDynamic(serial_data, serial_length); return plugin; } - void setPluginNamespace(const char* lib_namespace) override { + void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override { plugin_namespace_ = lib_namespace; } - const char* getPluginNamespace() const override 
{ + const char* getPluginNamespace() const TRT_NOEXCEPT override { return plugin_namespace_.c_str(); } diff --git a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu index 43557c341ef42e..08b259e0f952e1 100644 --- a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu @@ -30,8 +30,8 @@ static const float kAT = 0.5; static const float kBT = 0.7978845608028654; // sqrt(2.0/M_PI) static const float kCT = 0.035677408136300125; // 0.044715 * sqrt(2.0/M_PI) -bool GeluPlugin::supportsFormat(nvinfer1::DataType type, - nvinfer1::PluginFormat format) const { +bool GeluPlugin::supportsFormat( + nvinfer1::DataType type, nvinfer1::PluginFormat format) const TRT_NOEXCEPT { if (with_fp16_) { return ((type == nvinfer1::DataType::kFLOAT || type == nvinfer1::DataType::kHALF) && @@ -44,7 +44,7 @@ bool GeluPlugin::supportsFormat(nvinfer1::DataType type, nvinfer1::Dims GeluPlugin::getOutputDimensions(int index, const nvinfer1::Dims* in_dims, - int nb_inputs) { + int nb_inputs) TRT_NOEXCEPT { assert(nb_inputs == 1); assert(index < this->getNbOutputs()); nvinfer1::Dims const& input_dims = in_dims[0]; @@ -96,7 +96,8 @@ int GeluPlugin::enqueue(int batch_size, const void* const* inputs, #if IS_TRT_VERSION_LT(8000) void** outputs, void*, cudaStream_t stream) { #else - void* const* outputs, void*, cudaStream_t stream) { + void* const* outputs, void*, + cudaStream_t stream) TRT_NOEXCEPT { #endif const auto& input_dims = this->getInputDims(0); int num = batch_size; @@ -132,13 +133,13 @@ int GeluPlugin::enqueue(int batch_size, const void* const* inputs, nvinfer1::DimsExprs GeluPluginDynamic::getOutputDimensions( int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, - nvinfer1::IExprBuilder& expr_builder) { + nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT { return inputs[0]; } bool GeluPluginDynamic::supportsFormatCombination( int pos, const nvinfer1::PluginTensorDesc* in_out, int nb_inputs, - int nb_outputs) { + int nb_outputs) TRT_NOEXCEPT { PADDLE_ENFORCE_NOT_NULL( in_out, platform::errors::InvalidArgument( "The input of swish plugin shoule not be nullptr.")); @@ -167,7 +168,8 @@ bool GeluPluginDynamic::supportsFormatCombination( } nvinfer1::DataType GeluPluginDynamic::getOutputDataType( - int index, const nvinfer1::DataType* input_types, int nb_inputs) const { + int index, const nvinfer1::DataType* input_types, + int nb_inputs) const TRT_NOEXCEPT { PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( "The Gelu Plugin only has one input, so the " "index value should be 0, but get %d.", @@ -178,7 +180,8 @@ nvinfer1::DataType GeluPluginDynamic::getOutputDataType( int GeluPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, const nvinfer1::PluginTensorDesc* output_desc, const void* const* inputs, void* const* outputs, - void* workspace, cudaStream_t stream) { + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT { auto input_dims = input_desc[0].dims; size_t num = ProductDim(input_dims); const int block_size = 256; diff --git a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h index 6fdd9791a61bdb..7efdd2798b2640 100644 --- a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h @@ -35,40 +35,48 @@ class GeluPlugin : public PluginTensorRT { } ~GeluPlugin() {} - GeluPlugin* clone() const override { return new 
GeluPlugin(with_fp16_); } + GeluPlugin* clone() const TRT_NOEXCEPT override { + return new GeluPlugin(with_fp16_); + } - const char* getPluginType() const override { return "gelu_plugin"; } - int getNbOutputs() const override { return 1; } - int initialize() override { return 0; } - bool supportsFormat(nvinfer1::DataType type, - nvinfer1::PluginFormat format) const override; + const char* getPluginType() const TRT_NOEXCEPT override { + return "gelu_plugin"; + } + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } + int initialize() TRT_NOEXCEPT override { return 0; } + bool supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) + const TRT_NOEXCEPT override; nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, - int nb_input_dims) override; + int nb_input_dims) TRT_NOEXCEPT override; #if IS_TRT_VERSION_LT(8000) int enqueue(int batch_size, const void* const* inputs, void** outputs, #else int enqueue(int batch_size, const void* const* inputs, void* const* outputs, #endif - void* workspace, cudaStream_t stream) override; + void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; - size_t getSerializationSize() const override { + size_t getSerializationSize() const TRT_NOEXCEPT override { return getBaseSerializationSize(); } // TRT will call this func to serialize the configuration of TRT // It should not be called by users. - void serialize(void* buffer) const override { serializeBase(buffer); } + void serialize(void* buffer) const TRT_NOEXCEPT override { + serializeBase(buffer); + } }; class GeluPluginCreator : public TensorRTPluginCreator { public: - const char* getPluginName() const override { return "gelu_plugin"; } + const char* getPluginName() const TRT_NOEXCEPT override { + return "gelu_plugin"; + } - const char* getPluginVersion() const override { return "1"; } + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } - nvinfer1::IPluginV2* deserializePlugin(const char* name, - const void* serial_data, - size_t serial_length) override { + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { return new GeluPlugin(serial_data, serial_length); } }; @@ -83,61 +91,66 @@ class GeluPluginDynamic : public DynamicPluginTensorRT { } ~GeluPluginDynamic() {} - nvinfer1::IPluginV2DynamicExt* clone() const override { + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override { return new GeluPluginDynamic(with_fp16_); } - const char* getPluginType() const override { return "gelu_plugin_dynamic"; } - int getNbOutputs() const override { return 1; } - int initialize() override { return 0; } + const char* getPluginType() const TRT_NOEXCEPT override { + return "gelu_plugin_dynamic"; + } + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } + int initialize() TRT_NOEXCEPT override { return 0; } - size_t getSerializationSize() const override { + size_t getSerializationSize() const TRT_NOEXCEPT override { return SerializedSize(with_fp16_); } - void serialize(void* buffer) const override { + void serialize(void* buffer) const TRT_NOEXCEPT override { SerializeValue(&buffer, with_fp16_); } nvinfer1::DimsExprs getOutputDimensions( int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, - nvinfer1::IExprBuilder& expr_builder) override; + nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* in_out, - int nb_inputs, int nb_outputs) override; + int 
nb_inputs, + int nb_outputs) TRT_NOEXCEPT override; void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nb_inputs, const nvinfer1::DynamicPluginTensorDesc* out, - int nb_outputs) override {} + int nb_outputs) TRT_NOEXCEPT override {} size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nb_inputs, const nvinfer1::PluginTensorDesc* outputs, - int nb_outputs) const override { + int nb_outputs) const TRT_NOEXCEPT override { return 0; } int enqueue(const nvinfer1::PluginTensorDesc* input_desc, const nvinfer1::PluginTensorDesc* output_desc, const void* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) override; - nvinfer1::DataType getOutputDataType(int index, - const nvinfer1::DataType* input_types, - int nb_inputs) const override; + cudaStream_t stream) TRT_NOEXCEPT override; + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* input_types, + int nb_inputs) const TRT_NOEXCEPT override; - void destroy() override { delete this; } + void destroy() TRT_NOEXCEPT override { delete this; } }; class GeluPluginDynamicCreator : public TensorRTPluginCreator { public: - const char* getPluginName() const override { return "gelu_plugin_dynamic"; } + const char* getPluginName() const TRT_NOEXCEPT override { + return "gelu_plugin_dynamic"; + } - const char* getPluginVersion() const override { return "1"; } + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } - nvinfer1::IPluginV2* deserializePlugin(const char* name, - const void* serial_data, - size_t serial_length) override { + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { auto plugin = new GeluPluginDynamic(serial_data, serial_length); return plugin; } diff --git a/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.cu index dab7ddac1957a1..28060bd2facbee 100644 --- a/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.cu @@ -22,7 +22,7 @@ namespace tensorrt { namespace plugin { nvinfer1::Dims HardSwishPlugin::getOutputDimensions( - int index, const nvinfer1::Dims* in_dims, int nb_inputs) { + int index, const nvinfer1::Dims* in_dims, int nb_inputs) TRT_NOEXCEPT { assert(nb_inputs == 1); assert(index < this->getNbOutputs()); nvinfer1::Dims const& input_dims = in_dims[0]; @@ -54,7 +54,8 @@ int HardSwishPlugin::enqueue(int batch_size, const void* const* inputs, #if IS_TRT_VERSION_LT(8000) void** outputs, void*, cudaStream_t stream) { #else - void* const* outputs, void*, cudaStream_t stream) { + void* const* outputs, void*, + cudaStream_t stream) TRT_NOEXCEPT { #endif const auto& input_dims = this->getInputDims(0); int num = batch_size; diff --git a/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h index 42c47959988a50..5dfa00ef1c204e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h @@ -40,30 +40,32 @@ class HardSwishPlugin : public PluginTensorRT { } ~HardSwishPlugin() {} - HardSwishPlugin* clone() const override { + HardSwishPlugin* clone() const TRT_NOEXCEPT override { return new HardSwishPlugin(threshold_, scale_, offset_); } - const char* getPluginType() const override { return "hard_swish_plugin"; } - int getNbOutputs() const override { return 1; 
} - int initialize() override { return 0; } + const char* getPluginType() const TRT_NOEXCEPT override { + return "hard_swish_plugin"; + } + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } + int initialize() TRT_NOEXCEPT override { return 0; } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, - int nbInputDims) override; + int nbInputDims) TRT_NOEXCEPT override; #if IS_TRT_VERSION_LT(8000) int enqueue(int batchSize, const void* const* inputs, void** outputs, #else int enqueue(int batchSize, const void* const* inputs, void* const* outputs, #endif - void* workspace, cudaStream_t stream) override; + void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; - size_t getSerializationSize() const override { + size_t getSerializationSize() const TRT_NOEXCEPT override { return getBaseSerializationSize() + SerializedSize(threshold_) + SerializedSize(scale_) + SerializedSize(offset_); } // TRT will call this func to serialize the configuration of TRT // It should not be called by users. - void serialize(void* buffer) const override { + void serialize(void* buffer) const TRT_NOEXCEPT override { serializeBase(buffer); SerializeValue(&buffer, threshold_); SerializeValue(&buffer, scale_); @@ -78,13 +80,15 @@ class HardSwishPlugin : public PluginTensorRT { class HardSwishPluginCreator : public TensorRTPluginCreator { public: - const char* getPluginName() const override { return "hard_swish_plugin"; } + const char* getPluginName() const TRT_NOEXCEPT override { + return "hard_swish_plugin"; + } - const char* getPluginVersion() const override { return "1"; } + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } - nvinfer1::IPluginV2* deserializePlugin(const char* name, - const void* serial_data, - size_t serial_length) override { + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { return new HardSwishPlugin(serial_data, serial_length); } }; diff --git a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu index 13aa6df643e82a..b7c4fb7c99acfd 100644 --- a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu @@ -39,10 +39,10 @@ cudnnStatus_t convert_trt2cudnn_dtype(nvinfer1::DataType trt_dtype, return CUDNN_STATUS_SUCCESS; } -int InstanceNormPlugin::initialize() { return 0; } +int InstanceNormPlugin::initialize() TRT_NOEXCEPT { return 0; } nvinfer1::Dims InstanceNormPlugin::getOutputDimensions( - int index, const nvinfer1::Dims *inputDims, int nbInputs) { + int index, const nvinfer1::Dims *inputDims, int nbInputs) TRT_NOEXCEPT { assert(nbInputs == 1); assert(index < this->getNbOutputs()); nvinfer1::Dims const &input_dims = inputDims[0]; @@ -50,8 +50,8 @@ nvinfer1::Dims InstanceNormPlugin::getOutputDimensions( return output_dims; } -bool InstanceNormPlugin::supportsFormat(nvinfer1::DataType type, - nvinfer1::PluginFormat format) const { +bool InstanceNormPlugin::supportsFormat( + nvinfer1::DataType type, nvinfer1::PluginFormat format) const TRT_NOEXCEPT { return ((type == nvinfer1::DataType::kFLOAT || type == nvinfer1::DataType::kHALF) && (format == nvinfer1::PluginFormat::kLINEAR)); @@ -63,7 +63,7 @@ int InstanceNormPlugin::enqueue(int batch_size, const void *const *inputs, #else void *const *outputs, void *workspace, #endif - cudaStream_t stream) { + cudaStream_t stream) TRT_NOEXCEPT { const auto 
&input_dims = this->getInputDims(0); PADDLE_ENFORCE_EQ(input_dims.nbDims, 3, diff --git a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.h index f9dab09beebd3a..8b1507256757fc 100644 --- a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.h @@ -39,7 +39,7 @@ class InstanceNormPlugin : public PluginTensorRT { cudnnTensorDescriptor_t x_desc_, y_desc_, b_desc_; public: - size_t getSerializationSize() const override { + size_t getSerializationSize() const TRT_NOEXCEPT override { return getBaseSerializationSize() + SerializedSize(eps_) + SerializedSize(scale_) + SerializedSize(bias_); } @@ -47,7 +47,7 @@ class InstanceNormPlugin : public PluginTensorRT { // TRT will call this func when we need to serialize the configuration of // tensorrt. // It should not be called by users. - void serialize(void *buffer) const override { + void serialize(void *buffer) const TRT_NOEXCEPT override { serializeBase(buffer); SerializeValue(&buffer, eps_); SerializeValue(&buffer, scale_); @@ -89,37 +89,41 @@ class InstanceNormPlugin : public PluginTensorRT { platform::dynload::cudnnDestroyTensorDescriptor(b_desc_); } - int initialize() override; + int initialize() TRT_NOEXCEPT override; - InstanceNormPlugin *clone() const override { + InstanceNormPlugin *clone() const TRT_NOEXCEPT override { return new InstanceNormPlugin(eps_, scale_, bias_); } - const char *getPluginType() const override { return "instance_norm_plugin"; } - int getNbOutputs() const override { return 1; } + const char *getPluginType() const TRT_NOEXCEPT override { + return "instance_norm_plugin"; + } + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs, - int nbInputDims) override; + int nbInputDims) TRT_NOEXCEPT override; #if IS_TRT_VERSION_LT(8000) int enqueue(int batchSize, const void *const *inputs, void **outputs, #else int enqueue(int batchSize, const void *const *inputs, void *const *outputs, #endif - void *workspace, cudaStream_t stream) override; + void *workspace, cudaStream_t stream) TRT_NOEXCEPT override; - bool supportsFormat(nvinfer1::DataType type, - nvinfer1::PluginFormat format) const override; + bool supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) + const TRT_NOEXCEPT override; }; class InstanceNormPluginCreator : public TensorRTPluginCreator { public: - const char *getPluginName() const override { return "instance_norm_plugin"; } + const char *getPluginName() const TRT_NOEXCEPT override { + return "instance_norm_plugin"; + } - const char *getPluginVersion() const override { return "1"; } + const char *getPluginVersion() const TRT_NOEXCEPT override { return "1"; } - nvinfer1::IPluginV2 *deserializePlugin(const char *name, - const void *serial_data, - size_t serial_length) override { + nvinfer1::IPluginV2 *deserializePlugin( + const char *name, const void *serial_data, + size_t serial_length) TRT_NOEXCEPT override { return new InstanceNormPlugin(serial_data, serial_length); } }; diff --git a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu index 2688380726f78e..325aed89f29f01 100644 --- a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu @@ -24,10 +24,10 @@ namespace inference { namespace tensorrt { 
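TRT_NOEXCEPT, added throughout this patch, is a version-compatibility shim: TensorRT 8 marked the IPluginV2 interface methods noexcept, so overrides must carry the matching exception specification on TRT 8+, while older releases still expect the unqualified signatures. A minimal sketch of such a macro, assuming only the version macros from NvInferVersion.h (the patch's real definition lives in its plugin base header and may differ):

    // Illustrative sketch only.
    #include <NvInferVersion.h>  // NV_TENSORRT_MAJOR / MINOR / PATCH

    // Collapse the three-part TensorRT version into one comparable number.
    #define SKETCH_TRT_VERSION \
      (NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + NV_TENSORRT_PATCH)

    #if SKETCH_TRT_VERSION >= 8000
    #define TRT_NOEXCEPT noexcept  // TRT 8+: interface methods are noexcept
    #else
    #define TRT_NOEXCEPT           // older TRT: keep signatures unchanged
    #endif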
 namespace plugin {
-int LayerNormPlugin::initialize() { return 0; }
+int LayerNormPlugin::initialize() TRT_NOEXCEPT { return 0; }
 nvinfer1::Dims LayerNormPlugin::getOutputDimensions(
-    int index, const nvinfer1::Dims *inputDims, int nbInputs) {
+    int index, const nvinfer1::Dims *inputDims, int nbInputs) TRT_NOEXCEPT {
   assert(nbInputs == 1);
   assert(index < this->getNbOutputs());
   nvinfer1::Dims const &input_dims = inputDims[0];
@@ -41,10 +41,10 @@ int LayerNormPlugin::enqueue(int batch_size, const void *const *inputs,
 #else
                              void *const *outputs, void *workspace,
 #endif
-                             cudaStream_t stream) {
+                             cudaStream_t stream) TRT_NOEXCEPT {
   const auto &input_dims = this->getInputDims(0);
   const float *input = reinterpret_cast<const float *>(inputs[0]);
-  float *output = reinterpret_cast<float **>(outputs)[0];
+  float *output = reinterpret_cast<float *const *>(outputs)[0];
   int begin_norm_axis = begin_norm_axis_;
   float eps = eps_;
@@ -91,13 +91,13 @@ int LayerNormPlugin::enqueue(int batch_size, const void *const *inputs,
 nvinfer1::DimsExprs LayerNormPluginDynamic::getOutputDimensions(
     int output_index, const nvinfer1::DimsExprs *inputDims, int nb_inputs,
-    nvinfer1::IExprBuilder &expr_builder) {
+    nvinfer1::IExprBuilder &expr_builder) TRT_NOEXCEPT {
   return inputDims[0];
 }
 bool LayerNormPluginDynamic::supportsFormatCombination(
     int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs,
-    int nb_outputs) {
+    int nb_outputs) TRT_NOEXCEPT {
   PADDLE_ENFORCE_NOT_NULL(
       in_out, platform::errors::InvalidArgument(
                   "The input of layernorm plugin shoule not be nullptr."));
@@ -118,7 +118,8 @@ bool LayerNormPluginDynamic::supportsFormatCombination(
 }
 nvinfer1::DataType LayerNormPluginDynamic::getOutputDataType(
-    int index, const nvinfer1::DataType *input_types, int nb_inputs) const {
+    int index, const nvinfer1::DataType *input_types,
+    int nb_inputs) const TRT_NOEXCEPT {
   PADDLE_ENFORCE_EQ(index, 0,
                     platform::errors::InvalidArgument(
                         "The LayerNormPlugin only has one input, so the "
@@ -130,7 +131,7 @@ nvinfer1::DataType LayerNormPluginDynamic::getOutputDataType(
 int LayerNormPluginDynamic::enqueue(
     const nvinfer1::PluginTensorDesc *input_desc,
     const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs,
-    void *const *outputs, void *workspace, cudaStream_t stream) {
+    void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT {
   const auto &input_dims = input_desc[0].dims;
   int begin_norm_axis = begin_norm_axis_;
   float eps = eps_;
diff --git a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h
index caa3c21db63fab..9e8ce302833731 100644
--- a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h
@@ -40,7 +40,7 @@ class LayerNormPlugin : public PluginTensorRT {
   std::vector<int64_t> variance_shape_;
  public:
-  size_t getSerializationSize() const override {
+  size_t getSerializationSize() const TRT_NOEXCEPT override {
     return getBaseSerializationSize() + SerializedSize(bias_) +
            SerializedSize(scale_) + SerializedSize(begin_norm_axis_) +
            SerializedSize(eps_) + SerializedSize(mean_shape_) +
@@ -50,7 +50,7 @@ class LayerNormPlugin : public PluginTensorRT {
   // TRT will call this func when we need to serialize the configuration of
   // tensorrt.
   // It should not be called by users.
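The three serialization hooks touched in the next hunk form a single contract: getSerializationSize() reports the byte count, serialize() writes exactly that many bytes, and the deserializing constructor reads the same fields back in the same order. A stripped-down sketch of that contract, with plain std::memcpy standing in for the SerializedSize/SerializeValue/DeserializeValue helpers used here; the type and its fields are invented for illustration:

    #include <cstddef>
    #include <cstring>

    struct ToyPluginState {  // hypothetical stand-in for a plugin's fields
      int begin_norm_axis = 1;
      float eps = 1e-5f;

      size_t getSerializationSize() const noexcept {
        return sizeof(begin_norm_axis) + sizeof(eps);
      }
      void serialize(void* buffer) const noexcept {
        char* p = static_cast<char*>(buffer);
        std::memcpy(p, &begin_norm_axis, sizeof(begin_norm_axis));
        p += sizeof(begin_norm_axis);
        std::memcpy(p, &eps, sizeof(eps));  // same order as the reader below
      }
      ToyPluginState(const void* data, size_t length) {  // "deserialize" ctor
        const char* p = static_cast<const char*>(data);
        std::memcpy(&begin_norm_axis, p, sizeof(begin_norm_axis));
        p += sizeof(begin_norm_axis);
        std::memcpy(&eps, p, sizeof(eps));
        (void)length;  // a real plugin should validate the length
      }
      ToyPluginState() = default;
    };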
- void serialize(void* buffer) const override { + void serialize(void* buffer) const TRT_NOEXCEPT override { serializeBase(buffer); SerializeValue(&buffer, bias_); SerializeValue(&buffer, scale_); @@ -86,35 +86,39 @@ class LayerNormPlugin : public PluginTensorRT { DeserializeValue(&serialData, &serialLength, &variance_shape_); } ~LayerNormPlugin() {} - int initialize() override; + int initialize() TRT_NOEXCEPT override; - LayerNormPlugin* clone() const override { + LayerNormPlugin* clone() const TRT_NOEXCEPT override { return new LayerNormPlugin(bias_.data(), bias_.size(), scale_.data(), scale_.size(), begin_norm_axis_, eps_, mean_shape_, variance_shape_); } - const char* getPluginType() const override { return "layernorm_plugin"; } - int getNbOutputs() const override { return 1; } + const char* getPluginType() const TRT_NOEXCEPT override { + return "layernorm_plugin"; + } + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, - int nbInputDims) override; + int nbInputDims) TRT_NOEXCEPT override; #if IS_TRT_VERSION_LT(8000) int enqueue(int batchSize, const void* const* inputs, void** outputs, #else int enqueue(int batchSize, const void* const* inputs, void* const* outputs, #endif - void* workspace, cudaStream_t stream) override; + void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; }; class LayerNormPluginCreator : public TensorRTPluginCreator { public: - const char* getPluginName() const override { return "layernorm_plugin"; } + const char* getPluginName() const TRT_NOEXCEPT override { + return "layernorm_plugin"; + } - const char* getPluginVersion() const override { return "1"; } + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } - nvinfer1::IPluginV2* deserializePlugin(const char* name, - const void* serial_data, - size_t serial_length) override { + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { return new LayerNormPlugin(serial_data, serial_length); } }; @@ -145,25 +149,25 @@ class LayerNormPluginDynamic : public DynamicPluginTensorRT { DeserializeValue(&serialData, &serialLength, &mean_shape_); DeserializeValue(&serialData, &serialLength, &variance_shape_); } - nvinfer1::IPluginV2DynamicExt* clone() const override { + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override { return new LayerNormPluginDynamic(bias_.data(), bias_.size(), scale_.data(), scale_.size(), begin_norm_axis_, eps_, mean_shape_, variance_shape_); } - const char* getPluginType() const override { + const char* getPluginType() const TRT_NOEXCEPT override { return "layernorm_plugin_dynamic"; } - int getNbOutputs() const override { return 1; } - int initialize() override { return 0; } + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } + int initialize() TRT_NOEXCEPT override { return 0; } - size_t getSerializationSize() const override { + size_t getSerializationSize() const TRT_NOEXCEPT override { return SerializedSize(bias_) + SerializedSize(scale_) + SerializedSize(begin_norm_axis_) + SerializedSize(eps_) + SerializedSize(mean_shape_) + SerializedSize(variance_shape_); } - void serialize(void* buffer) const override { + void serialize(void* buffer) const TRT_NOEXCEPT override { SerializeValue(&buffer, bias_); SerializeValue(&buffer, scale_); SerializeValue(&buffer, begin_norm_axis_); @@ -174,33 +178,34 @@ class LayerNormPluginDynamic : public DynamicPluginTensorRT { nvinfer1::DimsExprs 
getOutputDimensions( int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, - nvinfer1::IExprBuilder& expr_builder) override; + nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* inOut, - int nbInputs, int nbOutputs) override; + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, - int nbOutputs) override {} + int nbOutputs) TRT_NOEXCEPT override {} size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, - int nbOutputs) const override { + int nbOutputs) const TRT_NOEXCEPT override { return 0; } int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) override; - nvinfer1::DataType getOutputDataType(int index, - const nvinfer1::DataType* inputTypes, - int nbInputs) const override; + cudaStream_t stream) TRT_NOEXCEPT override; + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT override; - void destroy() override { delete this; } + void destroy() TRT_NOEXCEPT override { delete this; } private: std::vector bias_; @@ -217,15 +222,15 @@ class LayerNormPluginDynamic : public DynamicPluginTensorRT { class LayerNormPluginDynamicCreator : public TensorRTPluginCreator { public: - const char* getPluginName() const override { + const char* getPluginName() const TRT_NOEXCEPT override { return "layernorm_plugin_dynamic"; } - const char* getPluginVersion() const override { return "1"; } + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } - nvinfer1::IPluginV2* deserializePlugin(const char* name, - const void* serial_data, - size_t serial_length) override { + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { return new LayerNormPluginDynamic(serial_data, serial_length); } }; diff --git a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu index 7e1d18227e2325..21c8812f3789e3 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu @@ -22,7 +22,7 @@ namespace plugin { nvinfer1::Dims PoolPlugin::getOutputDimensions(int index, const nvinfer1::Dims *inputDims, - int nbInputs) { + int nbInputs) TRT_NOEXCEPT { assert(nbInputs == 1); assert(index == 0); assert(inputDims[0].nbDims == 3); @@ -37,15 +37,16 @@ nvinfer1::Dims PoolPlugin::getOutputDimensions(int index, int PoolPlugin::enqueue(int batchSize, const void *const *inputs, #if IS_TRT_VERSION_LT(8000) - void **outputs, void *workspace, cudaStream_t stream) { + void **outputs, void *workspace, + cudaStream_t stream) TRT_NOEXCEPT { #else void *const *outputs, void *workspace, - cudaStream_t stream) { + cudaStream_t stream) TRT_NOEXCEPT { #endif auto const &input_dims = this->getInputDims(0); int input_size = 0; float const *idata = reinterpret_cast(inputs[0]); - float **odatas = reinterpret_cast(outputs); + float *const *odatas = reinterpret_cast(outputs); std::vector input_shape = input_shape_; std::vector output_shape = output_shape_; @@ -87,14 +88,14 @@ PoolPluginDynamic::PoolPluginDynamic(void const *serialData, 
DeserializeValue(&serialData, &serialLength, &is_global_); } -size_t PoolPluginDynamic::getSerializationSize() const { +size_t PoolPluginDynamic::getSerializationSize() const TRT_NOEXCEPT { return SerializedSize(ceil_mode_) + SerializedSize(pool_type_.c_str()) + SerializedSize(adaptive_) + SerializedSize(ksize_) + SerializedSize(strides_) + SerializedSize(paddings_) + SerializedSize(is_global_); } -void PoolPluginDynamic::serialize(void *buffer) const { +void PoolPluginDynamic::serialize(void *buffer) const TRT_NOEXCEPT { SerializeValue(&buffer, ceil_mode_); SerializeValue(&buffer, pool_type_.c_str()); SerializeValue(&buffer, adaptive_); @@ -106,7 +107,7 @@ void PoolPluginDynamic::serialize(void *buffer) const { nvinfer1::DimsExprs PoolPluginDynamic::getOutputDimensions( int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs, - nvinfer1::IExprBuilder &expr_builder) { + nvinfer1::IExprBuilder &expr_builder) TRT_NOEXCEPT { PADDLE_ENFORCE_EQ(nb_inputs, 1, platform::errors::InvalidArgument( "The Split plugin should be only one input.")); @@ -181,7 +182,7 @@ nvinfer1::DimsExprs PoolPluginDynamic::getOutputDimensions( bool PoolPluginDynamic::supportsFormatCombination( int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs, - int nb_outputs) { + int nb_outputs) TRT_NOEXCEPT { PADDLE_ENFORCE_NOT_NULL( in_out, platform::errors::InvalidArgument( "The input of swish plugin shoule not be nullptr.")); @@ -198,7 +199,8 @@ bool PoolPluginDynamic::supportsFormatCombination( } nvinfer1::DataType PoolPluginDynamic::getOutputDataType( - int index, const nvinfer1::DataType *input_types, int nb_inputs) const { + int index, const nvinfer1::DataType *input_types, + int nb_inputs) const TRT_NOEXCEPT { PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( "The Pool Plugin only has one input, so the " "index value should be 0, but get %d.", @@ -212,7 +214,8 @@ nvinfer1::DataType PoolPluginDynamic::getOutputDataType( int PoolPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs, void *const *outputs, - void *workspace, cudaStream_t stream) { + void *workspace, + cudaStream_t stream) TRT_NOEXCEPT { auto input_dims = input_desc[0].dims; int n = input_dims.d[0]; int c = input_dims.d[1]; diff --git a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h index 7c12796805c5d1..6ced066a35952f 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.h @@ -57,7 +57,7 @@ static std::vector CalcOutputSize(const std::vector& input_shape, class PoolPlugin : public PluginTensorRT { public: - size_t getSerializationSize() const override { + size_t getSerializationSize() const TRT_NOEXCEPT override { return getBaseSerializationSize() + SerializedSize(ceil_mode_) + SerializedSize(pool_type_) + SerializedSize(adaptive_) + SerializedSize(ksize_) + SerializedSize(strides_) + @@ -67,7 +67,7 @@ class PoolPlugin : public PluginTensorRT { // TRT will call this func when we need to serialize the configuration of // tensorrt. 
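The pool plugins above derive each output extent from kernel size, padding, stride and ceil_mode (see CalcOutputSize and PoolPluginDynamic::getOutputDimensions). A small self-contained illustration of that floor/ceil rule; the helper name and the sample numbers are made up:

    #include <cstdio>

    // Hypothetical helper mirroring the usual pooling output-size formula.
    static int PoolOutputSize(int input, int ksize, int pad, int stride,
                              bool ceil_mode) {
      if (ceil_mode) {
        return (input - ksize + 2 * pad + stride - 1) / stride + 1;
      }
      return (input - ksize + 2 * pad) / stride + 1;
    }

    int main() {
      // An 8-wide dimension, 3-wide window, no padding, stride 2:
      std::printf("floor mode: %d, ceil mode: %d\n",
                  PoolOutputSize(8, 3, 0, 2, false),  // -> 3
                  PoolOutputSize(8, 3, 0, 2, true));  // -> 4
      return 0;
    }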
- void serialize(void* buffer) const override { + void serialize(void* buffer) const TRT_NOEXCEPT override { serializeBase(buffer); SerializeValue(&buffer, ceil_mode_); SerializeValue(&buffer, pool_type_); @@ -116,22 +116,24 @@ class PoolPlugin : public PluginTensorRT { DeserializeValue(&serialData, &serialLength, &output_shape_); } - PoolPlugin* clone() const override { + PoolPlugin* clone() const TRT_NOEXCEPT override { return new PoolPlugin(ceil_mode_, pool_type_, adaptive_, ksize_, strides_, paddings_, input_shape_); } - const char* getPluginType() const override { return "pool_plugin"; } - int getNbOutputs() const override { return 1; } + const char* getPluginType() const TRT_NOEXCEPT override { + return "pool_plugin"; + } + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, - int nbInputDims) override; - int initialize() override { return 0; } + int nbInputDims) TRT_NOEXCEPT override; + int initialize() TRT_NOEXCEPT override { return 0; } #if IS_TRT_VERSION_LT(8000) int enqueue(int batchSize, const void* const* inputs, void** outputs, #else int enqueue(int batchSize, const void* const* inputs, void* const* outputs, #endif - void* workspace, cudaStream_t stream) override; + void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; private: bool ceil_mode_; @@ -146,13 +148,15 @@ class PoolPlugin : public PluginTensorRT { class PoolPluginCreator : public TensorRTPluginCreator { public: - const char* getPluginName() const override { return "pool_plugin"; } + const char* getPluginName() const TRT_NOEXCEPT override { + return "pool_plugin"; + } - const char* getPluginVersion() const override { return "1"; } + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } - nvinfer1::IPluginV2* deserializePlugin(const char* name, - const void* serial_data, - size_t serial_length) override { + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { return new PoolPlugin(serial_data, serial_length); } }; @@ -176,47 +180,50 @@ class PoolPluginDynamic : public DynamicPluginTensorRT { PoolPluginDynamic(void const* serialData, size_t serialLength); ~PoolPluginDynamic() {} - nvinfer1::IPluginV2DynamicExt* clone() const override { + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override { return new PoolPluginDynamic(ceil_mode_, pool_type_, adaptive_, ksize_, strides_, paddings_, is_global_); } - const char* getPluginType() const override { return "pool_plugin_dynamic"; } - int getNbOutputs() const override { return 1; } - int initialize() override { return 0; } + const char* getPluginType() const TRT_NOEXCEPT override { + return "pool_plugin_dynamic"; + } + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } + int initialize() TRT_NOEXCEPT override { return 0; } - size_t getSerializationSize() const override; - void serialize(void* buffer) const override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; nvinfer1::DimsExprs getOutputDimensions( int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, - nvinfer1::IExprBuilder& expr_builder) override; + nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* inOut, - int nbInputs, int nbOutputs) override; + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; void configurePlugin(const 
nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, - int nbOutputs) override {} + int nbOutputs) TRT_NOEXCEPT override {} size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, - int nbOutputs) const override { + int nbOutputs) const TRT_NOEXCEPT override { return 0; } int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) override; - nvinfer1::DataType getOutputDataType(int index, - const nvinfer1::DataType* inputTypes, - int nbInputs) const override; + cudaStream_t stream) TRT_NOEXCEPT override; + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT override; - void destroy() override { delete this; } + void destroy() TRT_NOEXCEPT override { delete this; } private: bool ceil_mode_; @@ -230,13 +237,15 @@ class PoolPluginDynamic : public DynamicPluginTensorRT { class PoolPluginDynamicCreator : public TensorRTPluginCreator { public: - const char* getPluginName() const override { return "pool_plugin_dynamic"; } + const char* getPluginName() const TRT_NOEXCEPT override { + return "pool_plugin_dynamic"; + } - const char* getPluginVersion() const override { return "1"; } + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } - nvinfer1::IPluginV2* deserializePlugin(const char* name, - const void* serial_data, - size_t serial_length) override { + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { return new PoolPluginDynamic(serial_data, serial_length); } }; diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu index 1882084a8f5169..5533fb0af3fc4f 100644 --- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu @@ -26,14 +26,14 @@ namespace inference { namespace tensorrt { namespace plugin { -int PReluPlugin::initialize() { +int PReluPlugin::initialize() TRT_NOEXCEPT { cudaMalloc(&p_gpu_weight_, sizeof(float) * weight_.size()); cudaMemcpy(p_gpu_weight_, weight_.data(), weight_.size() * sizeof(float), cudaMemcpyHostToDevice); return 0; } -void PReluPlugin::terminate() { +void PReluPlugin::terminate() TRT_NOEXCEPT { if (p_gpu_weight_) { cudaFree(p_gpu_weight_); p_gpu_weight_ = nullptr; @@ -42,7 +42,7 @@ void PReluPlugin::terminate() { nvinfer1::Dims PReluPlugin::getOutputDimensions(int index, const nvinfer1::Dims *inputDims, - int nbInputs) { + int nbInputs) TRT_NOEXCEPT { assert(nbInputs == 1); assert(index < this->getNbOutputs()); nvinfer1::Dims const &input_dims = inputDims[0]; @@ -55,14 +55,14 @@ int PReluPlugin::enqueue(int batch_size, const void *const *inputs, void **outputs, void *workspace, cudaStream_t stream) { #else void *const *outputs, void *workspace, - cudaStream_t stream) { + cudaStream_t stream) TRT_NOEXCEPT { #endif // input dims is CHW. 
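The PRelu hunks above copy the host-side weight_ vector to the GPU in initialize() and free it in terminate(), so enqueue() only dereferences a cached device pointer. A minimal sketch of that lifecycle using the standard CUDA runtime API; the class and its names are invented for illustration, and error handling is reduced to a return code:

    #include <cuda_runtime.h>

    #include <utility>
    #include <vector>

    // Hypothetical holder showing the initialize()/terminate() split.
    class ToyDeviceWeights {
     public:
      explicit ToyDeviceWeights(std::vector<float> w) : host_(std::move(w)) {}

      int initialize() noexcept {  // copy host weights to the GPU once
        if (cudaMalloc(&device_, host_.size() * sizeof(float)) != cudaSuccess) {
          return -1;
        }
        cudaMemcpy(device_, host_.data(), host_.size() * sizeof(float),
                   cudaMemcpyHostToDevice);
        return 0;
      }

      void terminate() noexcept {  // release the device copy, guard double free
        if (device_) {
          cudaFree(device_);
          device_ = nullptr;
        }
      }

      const float* device_ptr() const noexcept { return device_; }

     private:
      std::vector<float> host_;
      float* device_ = nullptr;
    };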
const auto &input_dims = this->getInputDims(0); const float *input = reinterpret_cast(inputs[0]); // const float *alpha = reinterpret_cast(alpha_.get().values); const float *alpha = p_gpu_weight_; - float *output = reinterpret_cast(outputs)[0]; + float *const output = reinterpret_cast(outputs)[0]; int numel = 1; for (int i = 0; i < input_dims.nbDims; i++) { numel *= input_dims.d[i]; @@ -86,13 +86,13 @@ int PReluPlugin::enqueue(int batch_size, const void *const *inputs, #if IS_TRT_VERSION_GE(6000) -void PReluPluginDynamic::terminate() { +void PReluPluginDynamic::terminate() TRT_NOEXCEPT { if (p_gpu_weight_) { cudaFree(p_gpu_weight_); } } -int PReluPluginDynamic::initialize() { +int PReluPluginDynamic::initialize() TRT_NOEXCEPT { cudaMalloc(&p_gpu_weight_, sizeof(float) * weight_.size()); cudaMemcpy(p_gpu_weight_, weight_.data(), weight_.size() * sizeof(float), cudaMemcpyHostToDevice); @@ -107,24 +107,24 @@ PReluPluginDynamic::PReluPluginDynamic(void const *serialData, mode_ = std::string(prelu_mode); } -size_t PReluPluginDynamic::getSerializationSize() const { +size_t PReluPluginDynamic::getSerializationSize() const TRT_NOEXCEPT { return SerializedSize(mode_.c_str()) + SerializedSize(weight_); } -void PReluPluginDynamic::serialize(void *buffer) const { +void PReluPluginDynamic::serialize(void *buffer) const TRT_NOEXCEPT { SerializeValue(&buffer, weight_); SerializeValue(&buffer, mode_.c_str()); } nvinfer1::DimsExprs PReluPluginDynamic::getOutputDimensions( int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs, - nvinfer1::IExprBuilder &expr_builder) { + nvinfer1::IExprBuilder &expr_builder) TRT_NOEXCEPT { return inputs[0]; } bool PReluPluginDynamic::supportsFormatCombination( int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs, - int nb_outputs) { + int nb_outputs) TRT_NOEXCEPT { PADDLE_ENFORCE_NOT_NULL( in_out, platform::errors::InvalidArgument( "The input of swish plugin shoule not be nullptr.")); @@ -141,7 +141,8 @@ bool PReluPluginDynamic::supportsFormatCombination( } nvinfer1::DataType PReluPluginDynamic::getOutputDataType( - int index, const nvinfer1::DataType *input_types, int nb_inputs) const { + int index, const nvinfer1::DataType *input_types, + int nb_inputs) const TRT_NOEXCEPT { PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( "The PRelu Plugin only has one input, so the " "index value should be 0, but get %d.", @@ -155,7 +156,8 @@ nvinfer1::DataType PReluPluginDynamic::getOutputDataType( int PReluPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs, void *const *outputs, - void *workspace, cudaStream_t stream) { + void *workspace, + cudaStream_t stream) TRT_NOEXCEPT { auto input_dims = input_desc[0].dims; const float *alpha = p_gpu_weight_; const float *input = static_cast(inputs[0]); diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h index e3f05bdbe85a1b..c61b07e22d6eea 100644 --- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h @@ -34,7 +34,7 @@ class PReluPlugin : public PluginTensorRT { std::string mode_; public: - size_t getSerializationSize() const override { + size_t getSerializationSize() const TRT_NOEXCEPT override { return getBaseSerializationSize() + SerializedSize(mode_.c_str()) + SerializedSize(weight_); } @@ -42,7 +42,7 @@ class PReluPlugin : public PluginTensorRT { // TRT will call this 
func when we need to serialize the configuration of // tensorrt. // It should not be called by users. - void serialize(void* buffer) const override { + void serialize(void* buffer) const TRT_NOEXCEPT override { serializeBase(buffer); SerializeValue(&buffer, weight_); SerializeValue(&buffer, mode_.c_str()); @@ -65,36 +65,40 @@ class PReluPlugin : public PluginTensorRT { mode_ = std::string(prelu_mode); } ~PReluPlugin() {} - int initialize() override; - void terminate() override; + int initialize() TRT_NOEXCEPT override; + void terminate() TRT_NOEXCEPT override; - PReluPlugin* clone() const override { + PReluPlugin* clone() const TRT_NOEXCEPT override { auto* ptr = new PReluPlugin(weight_.data(), weight_.size(), mode_); ptr->p_gpu_weight_ = p_gpu_weight_; return ptr; } - const char* getPluginType() const override { return "prelu_plugin"; } - int getNbOutputs() const override { return 1; } + const char* getPluginType() const TRT_NOEXCEPT override { + return "prelu_plugin"; + } + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, - int nbInputDims) override; + int nbInputDims) TRT_NOEXCEPT override; #if IS_TRT_VERSION_LT(8000) int enqueue(int batchSize, const void* const* inputs, void** outputs, #else int enqueue(int batchSize, const void* const* inputs, void* const* outputs, #endif - void* workspace, cudaStream_t stream) override; + void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; }; class PReluPluginCreator : public TensorRTPluginCreator { public: - const char* getPluginName() const override { return "prelu_plugin"; } + const char* getPluginName() const TRT_NOEXCEPT override { + return "prelu_plugin"; + } - const char* getPluginVersion() const override { return "1"; } + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } - nvinfer1::IPluginV2* deserializePlugin(const char* name, - const void* serial_data, - size_t serial_length) override { + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { return new PReluPlugin(serial_data, serial_length); } }; @@ -112,49 +116,52 @@ class PReluPluginDynamic : public DynamicPluginTensorRT { PReluPluginDynamic(void const* serialData, size_t serialLength); ~PReluPluginDynamic() {} - nvinfer1::IPluginV2DynamicExt* clone() const override { + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override { auto ptr = new PReluPluginDynamic(weight_.data(), weight_.size(), mode_); ptr->p_gpu_weight_ = p_gpu_weight_; return ptr; } - const char* getPluginType() const override { return "prelu_plugin_dynamic"; } - int getNbOutputs() const override { return 1; } - int initialize() override; - void terminate() override; + const char* getPluginType() const TRT_NOEXCEPT override { + return "prelu_plugin_dynamic"; + } + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } + int initialize() TRT_NOEXCEPT override; + void terminate() TRT_NOEXCEPT override; - size_t getSerializationSize() const override; - void serialize(void* buffer) const override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; nvinfer1::DimsExprs getOutputDimensions( int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, - nvinfer1::IExprBuilder& expr_builder) override; + nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* 
inOut, - int nbInputs, int nbOutputs) override; + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, - int nbOutputs) override {} + int nbOutputs) TRT_NOEXCEPT override {} size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, - int nbOutputs) const override { + int nbOutputs) const TRT_NOEXCEPT override { return 0; } int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) override; - nvinfer1::DataType getOutputDataType(int index, - const nvinfer1::DataType* inputTypes, - int nbInputs) const override; + cudaStream_t stream) TRT_NOEXCEPT override; + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT override; - void destroy() override { delete this; } + void destroy() TRT_NOEXCEPT override { delete this; } private: std::vector weight_; @@ -165,13 +172,15 @@ class PReluPluginDynamic : public DynamicPluginTensorRT { class PReluPluginDynamicCreator : public TensorRTPluginCreator { public: - const char* getPluginName() const override { return "prelu_plugin_dynamic"; } + const char* getPluginName() const TRT_NOEXCEPT override { + return "prelu_plugin_dynamic"; + } - const char* getPluginVersion() const override { return "1"; } + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } - nvinfer1::IPluginV2* deserializePlugin(const char* name, - const void* serial_data, - size_t serial_length) override { + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { return new PReluPluginDynamic(serial_data, serial_length); } }; diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu index 0d9e5417263f3b..0d978939c4bf35 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu @@ -147,11 +147,11 @@ inline void TransposeQKV(const int batch, const int seq_len, } } -int QkvToContextPluginDynamic::initialize() { return 0; } +int QkvToContextPluginDynamic::initialize() TRT_NOEXCEPT { return 0; } nvinfer1::DimsExprs QkvToContextPluginDynamic::getOutputDimensions( int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs, - nvinfer1::IExprBuilder &expr_builder) { + nvinfer1::IExprBuilder &expr_builder) TRT_NOEXCEPT { // input[0], (B, S, 3 * N * H, 1, 1) // input[1], (B, head_num, seq_len, seq_len) // output, (B, seq_len, hidden) @@ -177,7 +177,7 @@ nvinfer1::DimsExprs QkvToContextPluginDynamic::getOutputDimensions( bool QkvToContextPluginDynamic::supportsFormatCombination( int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs, - int nb_outputs) { + int nb_outputs) TRT_NOEXCEPT { PADDLE_ENFORCE_NOT_NULL( in_out, platform::errors::InvalidArgument( "The input of swish plugin shoule not be nullptr.")); @@ -215,7 +215,8 @@ bool QkvToContextPluginDynamic::supportsFormatCombination( } nvinfer1::DataType QkvToContextPluginDynamic::getOutputDataType( - int index, const nvinfer1::DataType *input_types, int nb_inputs) const { + int index, const nvinfer1::DataType *input_types, + int nb_inputs) const TRT_NOEXCEPT { PADDLE_ENFORCE_EQ( 
index, 0, platform::errors::InvalidArgument( "The EmbEltwiseLayernorm Plugin only has one input, so the " @@ -235,7 +236,7 @@ __global__ void apply_scale(T *data, T scale, int n) { int QkvToContextPluginDynamic::enqueue( const nvinfer1::PluginTensorDesc *input_desc, const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs, - void *const *outputs, void *workspace, cudaStream_t stream) { + void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT { auto input_dims = input_desc[0].dims; int input_num = ProductDim(input_dims); // input[0], (B, S, 3 * N * H, 1, 1) diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h index 7147d9855755be..501c17b2858d6f 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h @@ -59,21 +59,23 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT { DeserializeValue(&serial_data, &serial_length, &scale_); DeserializeValue(&serial_data, &serial_length, &with_fp16_); } - nvinfer1::IPluginV2DynamicExt* clone() const override { + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override { return new QkvToContextPluginDynamic(hidden_, head_number_, head_size_, scale_, with_fp16_); } - const char* getPluginType() const override { return "qkv_to_context_plugin"; } - int getNbOutputs() const override { return 1; } - int initialize() override; + const char* getPluginType() const TRT_NOEXCEPT override { + return "qkv_to_context_plugin"; + } + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } + int initialize() TRT_NOEXCEPT override; - size_t getSerializationSize() const override { + size_t getSerializationSize() const TRT_NOEXCEPT override { return SerializedSize(hidden_) + SerializedSize(head_number_) + SerializedSize(head_size_) + SerializedSize(scale_) + SerializedSize(with_fp16_); } - void serialize(void* buffer) const override { + void serialize(void* buffer) const TRT_NOEXCEPT override { SerializeValue(&buffer, hidden_); SerializeValue(&buffer, head_number_); SerializeValue(&buffer, head_size_); @@ -83,33 +85,34 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT { nvinfer1::DimsExprs getOutputDimensions( int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, - nvinfer1::IExprBuilder& expr_builder) override; + nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* in_out, - int nb_inputs, int nb_outputs) override; + int nb_inputs, + int nb_outputs) TRT_NOEXCEPT override; void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nb_inputs, const nvinfer1::DynamicPluginTensorDesc* out, - int nb_outputs) override {} + int nb_outputs) TRT_NOEXCEPT override {} size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nb_inputs, const nvinfer1::PluginTensorDesc* outputs, - int nb_outputs) const override { + int nb_outputs) const TRT_NOEXCEPT override { return 0; } int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) override; - nvinfer1::DataType getOutputDataType(int index, - const nvinfer1::DataType* input_types, - int nb_inputs) const override; + cudaStream_t stream) TRT_NOEXCEPT override; + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* 
input_types, + int nb_inputs) const TRT_NOEXCEPT override; - void destroy() override { delete this; } + void destroy() TRT_NOEXCEPT override { delete this; } private: int hidden_; @@ -121,31 +124,34 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT { class QkvToContextPluginDynamicCreator : public nvinfer1::IPluginCreator { public: QkvToContextPluginDynamicCreator() {} - const char* getPluginName() const override { return "qkv_to_context_plugin"; } + const char* getPluginName() const TRT_NOEXCEPT override { + return "qkv_to_context_plugin"; + } - const char* getPluginVersion() const override { return "1"; } + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } - const nvinfer1::PluginFieldCollection* getFieldNames() override { + const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override { return &field_collection_; } - nvinfer1::IPluginV2* createPlugin( - const char* name, const nvinfer1::PluginFieldCollection* fc) override { + nvinfer1::IPluginV2* createPlugin(const char* name, + const nvinfer1::PluginFieldCollection* fc) + TRT_NOEXCEPT override { return nullptr; } - nvinfer1::IPluginV2* deserializePlugin(const char* name, - const void* serial_data, - size_t serial_length) override { + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { auto plugin = new QkvToContextPluginDynamic(serial_data, serial_length); return plugin; } - void setPluginNamespace(const char* lib_namespace) override { + void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override { plugin_namespace_ = lib_namespace; } - const char* getPluginNamespace() const override { + const char* getPluginNamespace() const TRT_NOEXCEPT override { return plugin_namespace_.c_str(); } diff --git a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu index 5ec6e5af86daf1..06540b36260828 100644 --- a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu @@ -200,7 +200,8 @@ RoiAlignPluginDynamic::RoiAlignPluginDynamic(void const* data, size_t length) { smem_per_block_ = smem_per_block; } -nvinfer1::IPluginV2DynamicExt* RoiAlignPluginDynamic::clone() const { +nvinfer1::IPluginV2DynamicExt* RoiAlignPluginDynamic::clone() const + TRT_NOEXCEPT { auto* plugin = new RoiAlignPluginDynamic(data_type_, pooled_height_, pooled_width_, spatial_scale_, sampling_ratio_); @@ -210,7 +211,7 @@ nvinfer1::IPluginV2DynamicExt* RoiAlignPluginDynamic::clone() const { nvinfer1::DimsExprs RoiAlignPluginDynamic::getOutputDimensions( int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, - nvinfer1::IExprBuilder& exprBuilder) { + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT { nvinfer1::DimsExprs ret{}; ret.nbDims = 4; ret.d[0] = inputs[1].d[0]; // roi @@ -222,7 +223,7 @@ nvinfer1::DimsExprs RoiAlignPluginDynamic::getOutputDimensions( bool RoiAlignPluginDynamic::supportsFormatCombination( int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, - int nbOutputs) { + int nbOutputs) TRT_NOEXCEPT { if (inOut[pos].format != nvinfer1::TensorFormat::kLINEAR) { return false; } @@ -234,11 +235,12 @@ bool RoiAlignPluginDynamic::supportsFormatCombination( void RoiAlignPluginDynamic::configurePlugin( const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) {} + const 
nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) TRT_NOEXCEPT {} size_t RoiAlignPluginDynamic::getWorkspaceSize( const nvinfer1::PluginTensorDesc* inputs, int nbInputs, - const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const { + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT { return 0; } @@ -287,7 +289,7 @@ int RoiAlignPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) { + cudaStream_t stream) TRT_NOEXCEPT { PADDLE_ENFORCE_EQ(outputDesc[0].type, data_type_, platform::errors::InvalidArgument( "TRT RoiAlignPluginDynamic expects outputDesc[0].type " @@ -302,21 +304,22 @@ int RoiAlignPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, } nvinfer1::DataType RoiAlignPluginDynamic::getOutputDataType( - int index, const nvinfer1::DataType* inputTypes, int nbInputs) const { + int index, const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT { return inputTypes[0]; } -const char* RoiAlignPluginDynamic::getPluginType() const { +const char* RoiAlignPluginDynamic::getPluginType() const TRT_NOEXCEPT { return "roi_align_plugin_dynamic"; } -int RoiAlignPluginDynamic::getNbOutputs() const { return 1; } +int RoiAlignPluginDynamic::getNbOutputs() const TRT_NOEXCEPT { return 1; } -int RoiAlignPluginDynamic::initialize() { return 0; } +int RoiAlignPluginDynamic::initialize() TRT_NOEXCEPT { return 0; } -void RoiAlignPluginDynamic::terminate() {} +void RoiAlignPluginDynamic::terminate() TRT_NOEXCEPT {} -size_t RoiAlignPluginDynamic::getSerializationSize() const { +size_t RoiAlignPluginDynamic::getSerializationSize() const TRT_NOEXCEPT { size_t serialize_size = 0; serialize_size += SerializedSize(data_type_); serialize_size += SerializedSize(pooled_height_); @@ -326,7 +329,7 @@ size_t RoiAlignPluginDynamic::getSerializationSize() const { return serialize_size; } -void RoiAlignPluginDynamic::serialize(void* buffer) const { +void RoiAlignPluginDynamic::serialize(void* buffer) const TRT_NOEXCEPT { SerializeValue(&buffer, data_type_); SerializeValue(&buffer, pooled_height_); SerializeValue(&buffer, pooled_width_); @@ -334,40 +337,43 @@ void RoiAlignPluginDynamic::serialize(void* buffer) const { SerializeValue(&buffer, sampling_ratio_); } -void RoiAlignPluginDynamic::destroy() {} +void RoiAlignPluginDynamic::destroy() TRT_NOEXCEPT {} RoiAlignPluginDynamicCreator::RoiAlignPluginDynamicCreator() {} -void RoiAlignPluginDynamicCreator::setPluginNamespace( - const char* lib_namespace) { +void RoiAlignPluginDynamicCreator::setPluginNamespace(const char* lib_namespace) + TRT_NOEXCEPT { namespace_ = std::string(lib_namespace); } -const char* RoiAlignPluginDynamicCreator::getPluginNamespace() const { +const char* RoiAlignPluginDynamicCreator::getPluginNamespace() const + TRT_NOEXCEPT { return namespace_.c_str(); } -const char* RoiAlignPluginDynamicCreator::getPluginName() const { +const char* RoiAlignPluginDynamicCreator::getPluginName() const TRT_NOEXCEPT { return "roi_align_plugin_dynamic"; } -const char* RoiAlignPluginDynamicCreator::getPluginVersion() const { +const char* RoiAlignPluginDynamicCreator::getPluginVersion() const + TRT_NOEXCEPT { return "1"; } const nvinfer1::PluginFieldCollection* -RoiAlignPluginDynamicCreator::getFieldNames() { +RoiAlignPluginDynamicCreator::getFieldNames() TRT_NOEXCEPT { return &field_collection_; } nvinfer1::IPluginV2Ext* 
RoiAlignPluginDynamicCreator::createPlugin( - const char* name, const nvinfer1::PluginFieldCollection* fc) { + const char* name, const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT { const nvinfer1::PluginField* fields = fc->fields; return nullptr; } nvinfer1::IPluginV2Ext* RoiAlignPluginDynamicCreator::deserializePlugin( - const char* name, const void* serial_data, size_t serial_length) { + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT { auto plugin = new RoiAlignPluginDynamic(serial_data, serial_length); plugin->setPluginNamespace(namespace_.c_str()); return plugin; diff --git a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h index bba7d0d5a99664..44d2b630698357 100644 --- a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h @@ -34,37 +34,38 @@ class RoiAlignPluginDynamic : public DynamicPluginTensorRT { int sampling_ratio); RoiAlignPluginDynamic(void const* data, size_t length); ~RoiAlignPluginDynamic() = default; - nvinfer1::IPluginV2DynamicExt* clone() const override; + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; nvinfer1::DimsExprs getOutputDimensions( int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, - nvinfer1::IExprBuilder& exprBuilder) override; + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* inOut, - int nbInputs, int nbOutputs) override; + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, - int nbOutputs) override; + int nbOutputs) TRT_NOEXCEPT override; size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, - int nbOutputs) const override; + int nbOutputs) const TRT_NOEXCEPT override; int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) override; + cudaStream_t stream) TRT_NOEXCEPT override; - nvinfer1::DataType getOutputDataType(int index, - const nvinfer1::DataType* inputTypes, - int nbInputs) const override; + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT override; - const char* getPluginType() const override; - int getNbOutputs() const override; - int initialize() override; - void terminate() override; - size_t getSerializationSize() const override; - void serialize(void* buffer) const override; - void destroy() override; + const char* getPluginType() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; + int initialize() TRT_NOEXCEPT override; + void terminate() TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; + void destroy() TRT_NOEXCEPT override; private: template @@ -87,17 +88,18 @@ class RoiAlignPluginDynamicCreator : public nvinfer1::IPluginCreator { RoiAlignPluginDynamicCreator(); ~RoiAlignPluginDynamicCreator() override = default; - void setPluginNamespace(const char* lib_namespace) override; - const char* getPluginNamespace() const override; - const char* getPluginName() const override; - const char* getPluginVersion() const 
override; - const nvinfer1::PluginFieldCollection* getFieldNames() override; + void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override; + const char* getPluginNamespace() const TRT_NOEXCEPT override; + const char* getPluginName() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; + const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override; nvinfer1::IPluginV2Ext* createPlugin( - const char* name, const nvinfer1::PluginFieldCollection* fc) override; - nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, - const void* serial_data, - size_t serial_length) override; + const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override; + nvinfer1::IPluginV2Ext* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override; private: std::string namespace_; diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu index 346b4c680830e9..fb14749f3d1dba 100644 --- a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu @@ -29,7 +29,7 @@ namespace plugin { // Dynamic Plugin below. #if IS_TRT_VERSION_GE(6000) -int SkipLayerNormPluginDynamic::initialize() { +int SkipLayerNormPluginDynamic::initialize() TRT_NOEXCEPT { cudaMalloc(&bias_gpu_, sizeof(float) * bias_size_); cudaMemcpy(bias_gpu_, bias_.data(), bias_size_ * sizeof(float), cudaMemcpyHostToDevice); @@ -39,7 +39,7 @@ int SkipLayerNormPluginDynamic::initialize() { return 0; } -void SkipLayerNormPluginDynamic::terminate() { +void SkipLayerNormPluginDynamic::terminate() TRT_NOEXCEPT { if (bias_gpu_) { cudaFree(bias_gpu_); bias_gpu_ = nullptr; @@ -52,13 +52,13 @@ void SkipLayerNormPluginDynamic::terminate() { nvinfer1::DimsExprs SkipLayerNormPluginDynamic::getOutputDimensions( int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs, - nvinfer1::IExprBuilder &expr_builder) { + nvinfer1::IExprBuilder &expr_builder) TRT_NOEXCEPT { return inputs[0]; } bool SkipLayerNormPluginDynamic::supportsFormatCombination( int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs, - int nb_outputs) { + int nb_outputs) TRT_NOEXCEPT { PADDLE_ENFORCE_NOT_NULL( in_out, platform::errors::InvalidArgument( "The input of swish plugin shoule not be nullptr.")); @@ -96,7 +96,8 @@ bool SkipLayerNormPluginDynamic::supportsFormatCombination( } nvinfer1::DataType SkipLayerNormPluginDynamic::getOutputDataType( - int index, const nvinfer1::DataType *input_types, int nb_inputs) const { + int index, const nvinfer1::DataType *input_types, + int nb_inputs) const TRT_NOEXCEPT { PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( "The SkipLayerNorm Plugin only has one input, so the " @@ -112,7 +113,7 @@ nvinfer1::DataType SkipLayerNormPluginDynamic::getOutputDataType( int SkipLayerNormPluginDynamic::enqueue( const nvinfer1::PluginTensorDesc *input_desc, const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs, - void *const *outputs, void *workspace, cudaStream_t stream) { + void *const *outputs, void *workspace, cudaStream_t stream) TRT_NOEXCEPT { auto input_dims = input_desc[0].dims; size_t num = ProductDim(input_dims); int hidden = input_dims.d[2]; diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h index 
ac621784550f2f..c66b285a9fbc56 100644 --- a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h @@ -39,6 +39,7 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT { std::copy(bias, bias + bias_size, bias_.data()); std::copy(scale, scale + scale_size, scale_.data()); } + SkipLayerNormPluginDynamic(void const* serial_data, size_t serial_length) { DeserializeValue(&serial_data, &serial_length, &bias_); DeserializeValue(&serial_data, &serial_length, &scale_); @@ -48,7 +49,7 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT { DeserializeValue(&serial_data, &serial_length, &with_fp16_); } - nvinfer1::IPluginV2DynamicExt* clone() const override { + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override { auto ptr = new SkipLayerNormPluginDynamic( bias_.data(), scale_.data(), bias_size_, scale_size_, eps_, with_fp16_); ptr->bias_gpu_ = bias_gpu_; @@ -56,17 +57,19 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT { return ptr; } - const char* getPluginType() const override { return "skip_layernorm_plugin"; } - int getNbOutputs() const override { return 1; } - int initialize() override; + const char* getPluginType() const TRT_NOEXCEPT override { + return "skip_layernorm_plugin"; + } + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } + int initialize() TRT_NOEXCEPT override; - size_t getSerializationSize() const override { + size_t getSerializationSize() const TRT_NOEXCEPT override { size_t ser_size = SerializedSize(bias_) + SerializedSize(scale_) + SerializedSize(bias_size_) + SerializedSize(scale_size_) + SerializedSize(eps_) + SerializedSize(with_fp16_); return ser_size; } - void serialize(void* buffer) const override { + void serialize(void* buffer) const TRT_NOEXCEPT override { SerializeValue(&buffer, bias_); SerializeValue(&buffer, scale_); SerializeValue(&buffer, bias_size_); @@ -77,34 +80,35 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT { nvinfer1::DimsExprs getOutputDimensions( int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, - nvinfer1::IExprBuilder& expr_builder) override; + nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* in_out, - int nb_inputs, int nb_outputs) override; + int nb_inputs, + int nb_outputs) TRT_NOEXCEPT override; void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nb_inputs, const nvinfer1::DynamicPluginTensorDesc* out, - int nb_outputs) override {} + int nb_outputs) TRT_NOEXCEPT override {} size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nb_inputs, const nvinfer1::PluginTensorDesc* outputs, - int nb_outputs) const override { + int nb_outputs) const TRT_NOEXCEPT override { return 0; } int enqueue(const nvinfer1::PluginTensorDesc* input_desc, const nvinfer1::PluginTensorDesc* output_desc, const void* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) override; - nvinfer1::DataType getOutputDataType(int index, - const nvinfer1::DataType* input_types, - int nb_inputs) const override; + cudaStream_t stream) TRT_NOEXCEPT override; + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* input_types, + int nb_inputs) const TRT_NOEXCEPT override; - void destroy() override { delete this; } - void terminate() override; + void destroy() TRT_NOEXCEPT override { delete this; } + void terminate() TRT_NOEXCEPT override; 
private: std::vector bias_; @@ -122,31 +126,34 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT { class SkipLayerNormPluginDynamicCreator : public nvinfer1::IPluginCreator { public: SkipLayerNormPluginDynamicCreator() {} - const char* getPluginName() const override { return "skip_layernorm_plugin"; } + const char* getPluginName() const TRT_NOEXCEPT override { + return "skip_layernorm_plugin"; + } - const char* getPluginVersion() const override { return "1"; } + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } - const nvinfer1::PluginFieldCollection* getFieldNames() override { + const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override { return &field_collection_; } - nvinfer1::IPluginV2* createPlugin( - const char* name, const nvinfer1::PluginFieldCollection* fc) override { + nvinfer1::IPluginV2* createPlugin(const char* name, + const nvinfer1::PluginFieldCollection* fc) + TRT_NOEXCEPT override { return nullptr; } - nvinfer1::IPluginV2* deserializePlugin(const char* name, - const void* serial_data, - size_t serial_length) override { + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { auto plugin = new SkipLayerNormPluginDynamic(serial_data, serial_length); return plugin; } - void setPluginNamespace(const char* lib_namespace) override { + void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override { plugin_namespace_ = lib_namespace; } - const char* getPluginNamespace() const override { + const char* getPluginNamespace() const TRT_NOEXCEPT override { return plugin_namespace_.c_str(); } diff --git a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu index 70ff0e7cb069d7..6d367712eabc5a 100644 --- a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu @@ -75,12 +75,12 @@ SlicePlugin::~SlicePlugin() { cudaFree(offset_temp_data_); } -SlicePlugin *SlicePlugin::clone() const { +SlicePlugin *SlicePlugin::clone() const TRT_NOEXCEPT { return new SlicePlugin(starts_, ends_, axes_, with_fp16_); } -bool SlicePlugin::supportsFormat(nvinfer1::DataType type, - nvinfer1::PluginFormat format) const { +bool SlicePlugin::supportsFormat( + nvinfer1::DataType type, nvinfer1::PluginFormat format) const TRT_NOEXCEPT { if (with_fp16_) { return ((type == nvinfer1::DataType::kFLOAT || type == nvinfer1::DataType::kHALF) && @@ -91,9 +91,8 @@ bool SlicePlugin::supportsFormat(nvinfer1::DataType type, } } -nvinfer1::Dims SlicePlugin::getOutputDimensions(int index, - const nvinfer1::Dims *inputs, - int nb_input_dims) { +nvinfer1::Dims SlicePlugin::getOutputDimensions( + int index, const nvinfer1::Dims *inputs, int nb_input_dims) TRT_NOEXCEPT { auto in_dims = inputs[0]; nvinfer1::Dims out_dims = in_dims; for (size_t i = 0; i < axes_.size(); i++) { @@ -109,7 +108,7 @@ int SlicePlugin::enqueue(int batch_size, const void *const *inputs, void **outputs, void *workspace, cudaStream_t stream) { #else void *const *outputs, void *workspace, - cudaStream_t stream) { + cudaStream_t stream) TRT_NOEXCEPT { #endif auto input_dims = getInputDims(0); @@ -187,13 +186,13 @@ int SlicePlugin::enqueue(int batch_size, const void *const *inputs, return cudaGetLastError() != cudaSuccess; } -size_t SlicePlugin::getSerializationSize() const { +size_t SlicePlugin::getSerializationSize() const TRT_NOEXCEPT { return getBaseSerializationSize() + 
SerializedSize(getPluginType()) + SerializedSize(starts_) + SerializedSize(ends_) + SerializedSize(axes_); } -void SlicePlugin::serialize(void *buffer) const { +void SlicePlugin::serialize(void *buffer) const TRT_NOEXCEPT { SerializeValue(&buffer, getPluginType()); serializeBase(buffer); SerializeValue(&buffer, starts_); @@ -222,23 +221,23 @@ SlicePluginDynamic::SlicePluginDynamic(void const *serialData, cudaStreamCreate(©_stream_); } -void SlicePluginDynamic::destroy() { +void SlicePluginDynamic::destroy() TRT_NOEXCEPT { cudaStreamDestroy(copy_stream_); cudaEventDestroy(copy_event_); cudaFree(offset_temp_data_); delete this; } -int SlicePluginDynamic::initialize() { return 0; } +int SlicePluginDynamic::initialize() TRT_NOEXCEPT { return 0; } -size_t SlicePluginDynamic::getSerializationSize() const { +size_t SlicePluginDynamic::getSerializationSize() const TRT_NOEXCEPT { size_t size = SerializedSize(starts_) + SerializedSize(ends_) + SerializedSize(axes_) + SerializedSize(with_fp16_); return size; } -void SlicePluginDynamic::serialize(void *buffer) const { +void SlicePluginDynamic::serialize(void *buffer) const TRT_NOEXCEPT { SerializeValue(&buffer, starts_); SerializeValue(&buffer, ends_); SerializeValue(&buffer, axes_); @@ -247,7 +246,7 @@ void SlicePluginDynamic::serialize(void *buffer) const { nvinfer1::DimsExprs SlicePluginDynamic::getOutputDimensions( int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs, - nvinfer1::IExprBuilder &expr_builder) { + nvinfer1::IExprBuilder &expr_builder) TRT_NOEXCEPT { auto in_dims = inputs[0]; nvinfer1::DimsExprs ret = in_dims; // start, ends should greater 0 @@ -261,7 +260,7 @@ nvinfer1::DimsExprs SlicePluginDynamic::getOutputDimensions( bool SlicePluginDynamic::supportsFormatCombination( int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs, - int nb_outputs) { + int nb_outputs) TRT_NOEXCEPT { PADDLE_ENFORCE_NOT_NULL( in_out, platform::errors::InvalidArgument( "The input of swish plugin shoule not be nullptr.")); @@ -289,7 +288,8 @@ bool SlicePluginDynamic::supportsFormatCombination( } nvinfer1::DataType SlicePluginDynamic::getOutputDataType( - int index, const nvinfer1::DataType *input_types, int nb_inputs) const { + int index, const nvinfer1::DataType *input_types, + int nb_inputs) const TRT_NOEXCEPT { PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( "The Slice Plugin only has one input, so the " "index value should be 0, but get %d.", @@ -304,7 +304,8 @@ nvinfer1::DataType SlicePluginDynamic::getOutputDataType( int SlicePluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs, void *const *outputs, - void *workspace, cudaStream_t stream) { + void *workspace, + cudaStream_t stream) TRT_NOEXCEPT { auto input_dims = input_desc[0].dims; auto out_dims = output_desc[0].dims; auto num_dims = input_dims.nbDims; diff --git a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h index b656918f8fbab4..29f8f7c0999c47 100644 --- a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h @@ -35,27 +35,29 @@ class SlicePlugin : public PluginTensorRT { // It should not be called by users. 
SlicePlugin(void const* serial_data, size_t serial_length); ~SlicePlugin(); - SlicePlugin* clone() const override; + SlicePlugin* clone() const TRT_NOEXCEPT override; - const char* getPluginType() const override { return "slice_plugin"; } - int getNbOutputs() const override { return 1; } - int initialize() override { return 0; } - bool supportsFormat(nvinfer1::DataType type, - nvinfer1::PluginFormat format) const override; + const char* getPluginType() const TRT_NOEXCEPT override { + return "slice_plugin"; + } + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } + int initialize() TRT_NOEXCEPT override { return 0; } + bool supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) + const TRT_NOEXCEPT override; nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, - int nb_input_dims) override; + int nb_input_dims) TRT_NOEXCEPT override; #if IS_TRT_VERSION_LT(8000) int enqueue(int batch_size, const void* const* inputs, void** outputs, #else int enqueue(int batch_size, const void* const* inputs, void* const* outputs, #endif - void* workspace, cudaStream_t stream) override; + void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; - size_t getSerializationSize() const override; + size_t getSerializationSize() const TRT_NOEXCEPT override; // TRT will call this func to serialize the configuration of TRT // It should not be called by users. - void serialize(void* buffer) const override; + void serialize(void* buffer) const TRT_NOEXCEPT override; private: std::vector starts_; @@ -68,13 +70,15 @@ class SlicePlugin : public PluginTensorRT { class SlicePluginCreator : public TensorRTPluginCreator { public: - const char* getPluginName() const override { return "slice_plugin"; } + const char* getPluginName() const TRT_NOEXCEPT override { + return "slice_plugin"; + } - const char* getPluginVersion() const override { return "1"; } + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } - nvinfer1::IPluginV2* deserializePlugin(const char* name, - const void* serial_data, - size_t serial_length) override { + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { return new SlicePlugin(serial_data, serial_length); } }; @@ -86,48 +90,51 @@ class SlicePluginDynamic : public DynamicPluginTensorRT { explicit SlicePluginDynamic(std::vector starts, std::vector ends, std::vector axes, bool with_fp16); - nvinfer1::IPluginV2DynamicExt* clone() const override { + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override { return new SlicePluginDynamic(starts_, ends_, axes_, with_fp16_); } SlicePluginDynamic(void const* serialData, size_t serialLength); - const char* getPluginType() const override { return "slice_plugin_dynamic"; } - int getNbOutputs() const override { return 1; } - int initialize() override; + const char* getPluginType() const TRT_NOEXCEPT override { + return "slice_plugin_dynamic"; + } + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } + int initialize() TRT_NOEXCEPT override; - size_t getSerializationSize() const override; - void serialize(void* buffer) const override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; nvinfer1::DimsExprs getOutputDimensions( int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, - nvinfer1::IExprBuilder& expr_builder) override; + nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT override; bool 
supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* inOut, - int nbInputs, int nbOutputs) override; + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, - int nbOutputs) override {} + int nbOutputs) TRT_NOEXCEPT override {} size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, - int nbOutputs) const override { + int nbOutputs) const TRT_NOEXCEPT override { return 0; } int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) override; - nvinfer1::DataType getOutputDataType(int index, - const nvinfer1::DataType* inputTypes, - int nbInputs) const override; + cudaStream_t stream) TRT_NOEXCEPT override; + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT override; - void destroy() override; + void destroy() TRT_NOEXCEPT override; private: std::vector starts_; @@ -140,13 +147,15 @@ class SlicePluginDynamic : public DynamicPluginTensorRT { class SlicePluginDynamicCreator : public TensorRTPluginCreator { public: - const char* getPluginName() const override { return "slice_plugin_dynamic"; } + const char* getPluginName() const TRT_NOEXCEPT override { + return "slice_plugin_dynamic"; + } - const char* getPluginVersion() const override { return "1"; } + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } - nvinfer1::IPluginV2* deserializePlugin(const char* name, - const void* serialData, - size_t serialLength) override { + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serialData, + size_t serialLength) TRT_NOEXCEPT override { return new SlicePluginDynamic(serialData, serialLength); } }; diff --git a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu index 3bef9672e5058a..49c03b761ceb3e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu @@ -30,28 +30,29 @@ SpecialSlicePluginDynamic::SpecialSlicePluginDynamic(void const* serial_data, SpecialSlicePluginDynamic::~SpecialSlicePluginDynamic() {} -nvinfer1::IPluginV2DynamicExt* SpecialSlicePluginDynamic::clone() const { +nvinfer1::IPluginV2DynamicExt* SpecialSlicePluginDynamic::clone() const + TRT_NOEXCEPT { return new SpecialSlicePluginDynamic(); } -const char* SpecialSlicePluginDynamic::getPluginType() const { +const char* SpecialSlicePluginDynamic::getPluginType() const TRT_NOEXCEPT { return "special_slice_plugin"; } -int SpecialSlicePluginDynamic::getNbOutputs() const { return 1; } +int SpecialSlicePluginDynamic::getNbOutputs() const TRT_NOEXCEPT { return 1; } -int SpecialSlicePluginDynamic::initialize() { return 0; } +int SpecialSlicePluginDynamic::initialize() TRT_NOEXCEPT { return 0; } -size_t SpecialSlicePluginDynamic::getSerializationSize() const { +size_t SpecialSlicePluginDynamic::getSerializationSize() const TRT_NOEXCEPT { size_t serialize_size = 0; return serialize_size; } -void SpecialSlicePluginDynamic::serialize(void* buffer) const {} +void SpecialSlicePluginDynamic::serialize(void* buffer) const TRT_NOEXCEPT {} nvinfer1::DimsExprs SpecialSlicePluginDynamic::getOutputDimensions( int output_index, const 
nvinfer1::DimsExprs* inputs, int nb_inputs, - nvinfer1::IExprBuilder& expr_builder) { + nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT { nvinfer1::DimsExprs output(inputs[0]); output.nbDims++; for (int i = output.nbDims - 1; i > 1; i--) { @@ -69,21 +70,22 @@ nvinfer1::DimsExprs SpecialSlicePluginDynamic::getOutputDimensions( void SpecialSlicePluginDynamic::configurePlugin( const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) {} + const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) TRT_NOEXCEPT {} size_t SpecialSlicePluginDynamic::getWorkspaceSize( const nvinfer1::PluginTensorDesc* inputs, int nbInputs, - const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const { + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT { return 0; } -void SpecialSlicePluginDynamic::destroy() { delete this; } +void SpecialSlicePluginDynamic::destroy() TRT_NOEXCEPT { delete this; } -void SpecialSlicePluginDynamic::terminate() {} +void SpecialSlicePluginDynamic::terminate() TRT_NOEXCEPT {} bool SpecialSlicePluginDynamic::supportsFormatCombination( int pos, const nvinfer1::PluginTensorDesc* desc, int nb_inputs, - int nb_outputs) { + int nb_outputs) TRT_NOEXCEPT { if (pos == 0) // slice tensor return (desc[pos].type == nvinfer1::DataType::kHALF && desc[pos].format == @@ -101,7 +103,8 @@ bool SpecialSlicePluginDynamic::supportsFormatCombination( } nvinfer1::DataType SpecialSlicePluginDynamic::getOutputDataType( - int index, const nvinfer1::DataType* input_types, int nb_inputs) const { + int index, const nvinfer1::DataType* input_types, + int nb_inputs) const TRT_NOEXCEPT { PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( "The index should be equal to 0")); return input_types[0]; @@ -120,7 +123,7 @@ __global__ void SpecialSliceKernel(const T* slice_input, int SpecialSlicePluginDynamic::enqueue( const nvinfer1::PluginTensorDesc* input_desc, const nvinfer1::PluginTensorDesc* output_desc, const void* const* inputs, - void* const* outputs, void* workspace, cudaStream_t stream) { + void* const* outputs, void* workspace, cudaStream_t stream) TRT_NOEXCEPT { auto input_dims = input_desc[0].dims; // (sum(S), 768, 1, 1) auto out_dims = output_desc[0].dims; // (batch, 768, 1, 1) @@ -142,36 +145,40 @@ int SpecialSlicePluginDynamic::enqueue( SpecialSlicePluginDynamicCreator::SpecialSlicePluginDynamicCreator() {} -const char* SpecialSlicePluginDynamicCreator::getPluginName() const { +const char* SpecialSlicePluginDynamicCreator::getPluginName() const + TRT_NOEXCEPT { return "special_slice_plugin"; } -const char* SpecialSlicePluginDynamicCreator::getPluginVersion() const { +const char* SpecialSlicePluginDynamicCreator::getPluginVersion() const + TRT_NOEXCEPT { return "1"; } const nvinfer1::PluginFieldCollection* -SpecialSlicePluginDynamicCreator::getFieldNames() { +SpecialSlicePluginDynamicCreator::getFieldNames() TRT_NOEXCEPT { return &field_collection_; } nvinfer1::IPluginV2* SpecialSlicePluginDynamicCreator::createPlugin( - const char* name, const nvinfer1::PluginFieldCollection* fc) { + const char* name, const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT { return new SpecialSlicePluginDynamic(); } nvinfer1::IPluginV2* SpecialSlicePluginDynamicCreator::deserializePlugin( - const char* name, const void* serial_data, size_t serial_length) { + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT { auto plugin = new SpecialSlicePluginDynamic(serial_data, serial_length); 
return plugin; } void SpecialSlicePluginDynamicCreator::setPluginNamespace( - const char* lib_namespace) { + const char* lib_namespace) TRT_NOEXCEPT { plugin_namespace_ = lib_namespace; } -const char* SpecialSlicePluginDynamicCreator::getPluginNamespace() const { +const char* SpecialSlicePluginDynamicCreator::getPluginNamespace() const + TRT_NOEXCEPT { return plugin_namespace_.c_str(); } diff --git a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.h b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.h index 438d9e9465c52a..c3521e4ed63713 100644 --- a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.h @@ -31,37 +31,38 @@ class SpecialSlicePluginDynamic : public DynamicPluginTensorRT { SpecialSlicePluginDynamic(); SpecialSlicePluginDynamic(void const* serial_data, size_t serial_length); ~SpecialSlicePluginDynamic(); - nvinfer1::IPluginV2DynamicExt* clone() const override; + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; nvinfer1::DimsExprs getOutputDimensions( int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, - nvinfer1::IExprBuilder& exprBuilder) override; + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* inOut, - int nbInputs, int nbOutputs) override; + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, - int nbOutputs) override; + int nbOutputs) TRT_NOEXCEPT override; size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, - int nbOutputs) const override; + int nbOutputs) const TRT_NOEXCEPT override; int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) override; + cudaStream_t stream) TRT_NOEXCEPT override; - nvinfer1::DataType getOutputDataType(int index, - const nvinfer1::DataType* inputTypes, - int nbInputs) const override; + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT override; - const char* getPluginType() const override; - int getNbOutputs() const override; - int initialize() override; - void terminate() override; - size_t getSerializationSize() const override; - void serialize(void* buffer) const override; - void destroy() override; + const char* getPluginType() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; + int initialize() TRT_NOEXCEPT override; + void terminate() TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; + void destroy() TRT_NOEXCEPT override; private: int axis_; @@ -71,16 +72,17 @@ class SpecialSlicePluginDynamic : public DynamicPluginTensorRT { class SpecialSlicePluginDynamicCreator : public nvinfer1::IPluginCreator { public: SpecialSlicePluginDynamicCreator(); - const char* getPluginName() const override; - const char* getPluginVersion() const override; - const nvinfer1::PluginFieldCollection* getFieldNames() override; - nvinfer1::IPluginV2* createPlugin( - const char* name, const nvinfer1::PluginFieldCollection* fc) override; - nvinfer1::IPluginV2* deserializePlugin(const char* name, - const void* serial_data, - 
size_t serial_length) override; - void setPluginNamespace(const char* lib_namespace) override; - const char* getPluginNamespace() const override; + const char* getPluginName() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; + const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override; + nvinfer1::IPluginV2* createPlugin(const char* name, + const nvinfer1::PluginFieldCollection* fc) + TRT_NOEXCEPT override; + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override; + void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override; + const char* getPluginNamespace() const TRT_NOEXCEPT override; private: std::string plugin_namespace_; diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index 37afff9105d80a..091680ff672d0e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -38,7 +38,7 @@ __device__ int upper_bound(T const* vals, int n, T const& key) { } nvinfer1::Dims SplitPlugin::getOutputDimensions( - int index, const nvinfer1::Dims* input_dims, int num_inputs) { + int index, const nvinfer1::Dims* input_dims, int num_inputs) TRT_NOEXCEPT { PADDLE_ENFORCE_EQ(num_inputs, 1, platform::errors::InvalidArgument( "Invalid number of inputs of split TRT plugin. " @@ -66,7 +66,7 @@ void SplitPlugin::shareData(const SplitPlugin* another) { d_output_ptrs_.resize(another->d_output_ptrs_.size(), nullptr); } -int SplitPlugin::initialize() { +int SplitPlugin::initialize() TRT_NOEXCEPT { PADDLE_ENFORCE_LE(axis_, nvinfer1::Dims::MAX_DIMS, platform::errors::InvalidArgument( "Axis dimension exceeds max dimension in TensorRT. " @@ -98,7 +98,7 @@ int SplitPlugin::initialize() { } // nothing to release according to initialize -void SplitPlugin::terminate() {} +void SplitPlugin::terminate() TRT_NOEXCEPT {} // The following part of the code refers to onnx-tensorrt // https://github.com/onnx/onnx-tensorrt/blob/master/Split.cu @@ -129,7 +129,7 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) { #else void* const* outputs, void* workspace, - cudaStream_t stream) { + cudaStream_t stream) TRT_NOEXCEPT { #endif const int* d_segment_offsets_ptr = thrust::raw_pointer_cast(&d_segment_offsets_[0]); @@ -155,14 +155,14 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs, // Dynamic Plugin below. 
#if IS_TRT_VERSION_GE(6000) -int SplitPluginDynamic::initialize() { return 0; } +int SplitPluginDynamic::initialize() TRT_NOEXCEPT { return 0; } -size_t SplitPluginDynamic::getSerializationSize() const { +size_t SplitPluginDynamic::getSerializationSize() const TRT_NOEXCEPT { return SerializedSize(axis_) + SerializedSize(output_length_) + SerializedSize(with_fp16_); } -void SplitPluginDynamic::serialize(void* buffer) const { +void SplitPluginDynamic::serialize(void* buffer) const TRT_NOEXCEPT { SerializeValue(&buffer, axis_); SerializeValue(&buffer, output_length_); SerializeValue(&buffer, with_fp16_); @@ -170,7 +170,7 @@ void SplitPluginDynamic::serialize(void* buffer) const { nvinfer1::DimsExprs SplitPluginDynamic::getOutputDimensions( int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, - nvinfer1::IExprBuilder& expr_builder) { + nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT { PADDLE_ENFORCE_EQ(nb_inputs, 1, platform::errors::InvalidArgument( "The Split plugin should be only one input.")); @@ -188,7 +188,7 @@ nvinfer1::DimsExprs SplitPluginDynamic::getOutputDimensions( bool SplitPluginDynamic::supportsFormatCombination( int pos, const nvinfer1::PluginTensorDesc* in_out, int nb_inputs, - int nb_outputs) { + int nb_outputs) TRT_NOEXCEPT { PADDLE_ENFORCE_NOT_NULL( in_out, platform::errors::InvalidArgument( "The input of split plugin should not be nullptr.")); @@ -217,14 +217,16 @@ bool SplitPluginDynamic::supportsFormatCombination( } nvinfer1::DataType SplitPluginDynamic::getOutputDataType( - int index, const nvinfer1::DataType* input_types, int nb_inputs) const { + int index, const nvinfer1::DataType* input_types, + int nb_inputs) const TRT_NOEXCEPT { return input_types[0]; } int SplitPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, const nvinfer1::PluginTensorDesc* output_desc, const void* const* inputs, void* const* outputs, - void* workspace, cudaStream_t stream) { + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT { auto input_dims = input_desc[0].dims; int outer_rows = 1; int inner_cols = 1; diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h index a791395f4a3d38..7a41fe1d1eef23 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h @@ -39,43 +39,47 @@ class SplitPlugin : public PluginTensorRTV2Ext { DeserializeValue(&serial_data, &serial_length, &output_length_); } - nvinfer1::IPluginV2Ext* clone() const override { + nvinfer1::IPluginV2Ext* clone() const TRT_NOEXCEPT override { SplitPlugin* ptr = new SplitPlugin(axis_, output_length_, with_fp16_); ptr->setPluginNamespace(this->getPluginNamespace()); ptr->shareData(this); return ptr; } - nvinfer1::DataType getOutputDataType(int index, - const nvinfer1::DataType* input_types, - int nb_inputs) const override { + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* input_types, + int nb_inputs) const TRT_NOEXCEPT override { return input_types[0]; } - const char* getPluginType() const override { return "split_plugin_v2ext"; } - int getNbOutputs() const override { return output_length_.size(); } + const char* getPluginType() const TRT_NOEXCEPT override { + return "split_plugin_v2ext"; + } + int getNbOutputs() const TRT_NOEXCEPT override { + return output_length_.size(); + } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* input_dims, - int num_inputs) override; + int num_inputs) TRT_NOEXCEPT override; 
- int initialize() override; - void terminate() override; + int initialize() TRT_NOEXCEPT override; + void terminate() TRT_NOEXCEPT override; #if IS_TRT_VERSION_LT(8000) int enqueue(int batch_size, const void* const* inputs, void** outputs, #else int enqueue(int batch_size, const void* const* inputs, void* const* outputs, #endif - void* workspace, cudaStream_t stream) override; + void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; - void destroy() override { delete this; } + void destroy() TRT_NOEXCEPT override { delete this; } protected: - size_t getSerializationSize() const override { + size_t getSerializationSize() const TRT_NOEXCEPT override { return SerializedSize(axis_) + SerializedSize(output_length_) + getBaseSerializationSize(); } - void serialize(void* buffer) const override { + void serialize(void* buffer) const TRT_NOEXCEPT override { serializeBase(buffer); SerializeValue(&buffer, axis_); SerializeValue(&buffer, output_length_); @@ -98,32 +102,35 @@ class SplitPlugin : public PluginTensorRTV2Ext { class SplitPluginCreator : public nvinfer1::IPluginCreator { public: SplitPluginCreator() {} - const char* getPluginName() const override { return "split_plugin_v2ext"; } + const char* getPluginName() const TRT_NOEXCEPT override { + return "split_plugin_v2ext"; + } - const char* getPluginVersion() const override { return "1"; } + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } - const nvinfer1::PluginFieldCollection* getFieldNames() override { + const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override { return &field_collection_; } - nvinfer1::IPluginV2* createPlugin( - const char* name, const nvinfer1::PluginFieldCollection* fc) override { + nvinfer1::IPluginV2* createPlugin(const char* name, + const nvinfer1::PluginFieldCollection* fc) + TRT_NOEXCEPT override { // not implemented return nullptr; } - nvinfer1::IPluginV2* deserializePlugin(const char* name, - const void* serial_data, - size_t serial_length) override { + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { auto plugin = new SplitPlugin(serial_data, serial_length); return plugin; } - void setPluginNamespace(const char* lib_namespace) override { + void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override { plugin_namespace_ = lib_namespace; } - const char* getPluginNamespace() const override { + const char* getPluginNamespace() const TRT_NOEXCEPT override { return plugin_namespace_.c_str(); } @@ -151,46 +158,51 @@ class SplitPluginDynamic : public DynamicPluginTensorRT { DeserializeValue(&serial_data, &serial_length, &with_fp16_); } - nvinfer1::IPluginV2DynamicExt* clone() const override { + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override { return new SplitPluginDynamic(axis_, output_length_, with_fp16_); } - const char* getPluginType() const override { return "split_plugin"; } - int getNbOutputs() const override { return output_length_.size(); } - int initialize() override; + const char* getPluginType() const TRT_NOEXCEPT override { + return "split_plugin"; + } + int getNbOutputs() const TRT_NOEXCEPT override { + return output_length_.size(); + } + int initialize() TRT_NOEXCEPT override; - size_t getSerializationSize() const override; - void serialize(void* buffer) const override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; nvinfer1::DimsExprs getOutputDimensions( int 
outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, - nvinfer1::IExprBuilder& exprBuilder) override; + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* inOut, - int nbInputs, int nbOutputs) override; + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, - int nbOutputs) override {} + int nbOutputs) TRT_NOEXCEPT override {} size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, - int nbOutputs) const override { + int nbOutputs) const TRT_NOEXCEPT override { return 0; } int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) override; - nvinfer1::DataType getOutputDataType(int index, - const nvinfer1::DataType* inputTypes, - int nbInputs) const override; + cudaStream_t stream) TRT_NOEXCEPT override; + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT override; - void destroy() override { delete this; } + void destroy() TRT_NOEXCEPT override { delete this; } private: int axis_; @@ -200,31 +212,34 @@ class SplitPluginDynamic : public DynamicPluginTensorRT { class SplitPluginDynamicCreator : public nvinfer1::IPluginCreator { public: SplitPluginDynamicCreator() {} - const char* getPluginName() const override { return "split_plugin"; } + const char* getPluginName() const TRT_NOEXCEPT override { + return "split_plugin"; + } - const char* getPluginVersion() const override { return "1"; } + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } - const nvinfer1::PluginFieldCollection* getFieldNames() override { + const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override { return &field_collection_; } - nvinfer1::IPluginV2* createPlugin( - const char* name, const nvinfer1::PluginFieldCollection* fc) override { + nvinfer1::IPluginV2* createPlugin(const char* name, + const nvinfer1::PluginFieldCollection* fc) + TRT_NOEXCEPT override { return nullptr; } - nvinfer1::IPluginV2* deserializePlugin(const char* name, - const void* serial_data, - size_t serial_length) override { + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { auto plugin = new SplitPluginDynamic(serial_data, serial_length); return plugin; } - void setPluginNamespace(const char* lib_namespace) override { + void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override { plugin_namespace_ = lib_namespace; } - const char* getPluginNamespace() const override { + const char* getPluginNamespace() const TRT_NOEXCEPT override { return plugin_namespace_.c_str(); } diff --git a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu index 21e80339b50062..c3b4a6ff4af1cb 100644 --- a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.cu @@ -37,17 +37,19 @@ StackPluginDynamic::StackPluginDynamic(void const* serial_data, StackPluginDynamic::~StackPluginDynamic() {} -nvinfer1::IPluginV2DynamicExt* StackPluginDynamic::clone() const { +nvinfer1::IPluginV2DynamicExt* StackPluginDynamic::clone() const 
TRT_NOEXCEPT { return new StackPluginDynamic(axis_, num_stack_, with_fp16_); } -const char* StackPluginDynamic::getPluginType() const { return "stack_plugin"; } +const char* StackPluginDynamic::getPluginType() const TRT_NOEXCEPT { + return "stack_plugin"; +} -int StackPluginDynamic::getNbOutputs() const { return 1; } +int StackPluginDynamic::getNbOutputs() const TRT_NOEXCEPT { return 1; } -int StackPluginDynamic::initialize() { return 0; } +int StackPluginDynamic::initialize() TRT_NOEXCEPT { return 0; } -size_t StackPluginDynamic::getSerializationSize() const { +size_t StackPluginDynamic::getSerializationSize() const TRT_NOEXCEPT { size_t serialize_size = 0; serialize_size += SerializedSize(axis_); serialize_size += SerializedSize(num_stack_); @@ -55,7 +57,7 @@ size_t StackPluginDynamic::getSerializationSize() const { return serialize_size; } -void StackPluginDynamic::serialize(void* buffer) const { +void StackPluginDynamic::serialize(void* buffer) const TRT_NOEXCEPT { SerializeValue(&buffer, axis_); SerializeValue(&buffer, num_stack_); SerializeValue(&buffer, with_fp16_); @@ -63,7 +65,7 @@ void StackPluginDynamic::serialize(void* buffer) const { nvinfer1::DimsExprs StackPluginDynamic::getOutputDimensions( int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, - nvinfer1::IExprBuilder& expr_builder) { + nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT { nvinfer1::DimsExprs output(inputs[0]); output.nbDims = inputs[0].nbDims + 1; @@ -76,21 +78,22 @@ nvinfer1::DimsExprs StackPluginDynamic::getOutputDimensions( void StackPluginDynamic::configurePlugin( const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, - const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) {} + const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) TRT_NOEXCEPT {} size_t StackPluginDynamic::getWorkspaceSize( const nvinfer1::PluginTensorDesc* inputs, int nbInputs, - const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const { + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT { return num_stack_ * sizeof(uintptr_t); } -void StackPluginDynamic::destroy() { delete this; } +void StackPluginDynamic::destroy() TRT_NOEXCEPT { delete this; } -void StackPluginDynamic::terminate() {} +void StackPluginDynamic::terminate() TRT_NOEXCEPT {} bool StackPluginDynamic::supportsFormatCombination( int pos, const nvinfer1::PluginTensorDesc* in_out, int nb_inputs, - int nb_outputs) { + int nb_outputs) TRT_NOEXCEPT { PADDLE_ENFORCE_NOT_NULL( in_out, platform::errors::InvalidArgument( "The input of stack plugin should not be nullptr.")); @@ -118,7 +121,8 @@ bool StackPluginDynamic::supportsFormatCombination( } nvinfer1::DataType StackPluginDynamic::getOutputDataType( - int index, const nvinfer1::DataType* input_types, int nb_inputs) const { + int index, const nvinfer1::DataType* input_types, + int nb_inputs) const TRT_NOEXCEPT { PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( "The index should be equal to 0")); return input_types[0]; @@ -139,7 +143,8 @@ __global__ void StackKernel(const T* const* input, T* output, int num_stack, int StackPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, const nvinfer1::PluginTensorDesc* output_desc, const void* const* inputs, void* const* outputs, - void* workspace, cudaStream_t stream) { + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT { auto input_dims = input_desc[0].dims; // (batch, seq, seq) auto out_dims = output_desc[0].dims; // (batch, num_head, seq, seq) auto out_num_dims = out_dims.nbDims; @@ 
-195,19 +200,21 @@ int StackPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, StackPluginDynamicCreator::StackPluginDynamicCreator() {} -const char* StackPluginDynamicCreator::getPluginName() const { +const char* StackPluginDynamicCreator::getPluginName() const TRT_NOEXCEPT { return "stack_plugin"; } -const char* StackPluginDynamicCreator::getPluginVersion() const { return "1"; } +const char* StackPluginDynamicCreator::getPluginVersion() const TRT_NOEXCEPT { + return "1"; +} const nvinfer1::PluginFieldCollection* -StackPluginDynamicCreator::getFieldNames() { +StackPluginDynamicCreator::getFieldNames() TRT_NOEXCEPT { return &field_collection_; } nvinfer1::IPluginV2* StackPluginDynamicCreator::createPlugin( - const char* name, const nvinfer1::PluginFieldCollection* fc) { + const char* name, const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT { int axis = -1; int num_stack = -1; bool with_fp16 = false; @@ -230,16 +237,18 @@ nvinfer1::IPluginV2* StackPluginDynamicCreator::createPlugin( } nvinfer1::IPluginV2* StackPluginDynamicCreator::deserializePlugin( - const char* name, const void* serial_data, size_t serial_length) { + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT { auto plugin = new StackPluginDynamic(serial_data, serial_length); return plugin; } -void StackPluginDynamicCreator::setPluginNamespace(const char* lib_namespace) { +void StackPluginDynamicCreator::setPluginNamespace(const char* lib_namespace) + TRT_NOEXCEPT { plugin_namespace_ = lib_namespace; } -const char* StackPluginDynamicCreator::getPluginNamespace() const { +const char* StackPluginDynamicCreator::getPluginNamespace() const TRT_NOEXCEPT { return plugin_namespace_.c_str(); } diff --git a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h index cd8adaf7549572..965c53e2698778 100644 --- a/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/stack_op_plugin.h @@ -31,37 +31,36 @@ class StackPluginDynamic : public DynamicPluginTensorRT { explicit StackPluginDynamic(int axis, int num_stack, bool with_fp16); StackPluginDynamic(void const* serial_data, size_t serial_length); ~StackPluginDynamic(); - nvinfer1::IPluginV2DynamicExt* clone() const override; + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; nvinfer1::DimsExprs getOutputDimensions( int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, - nvinfer1::IExprBuilder& exprBuilder) override; + nvinfer1::IExprBuilder& exprBuilder) TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* inOut, - int nbInputs, int nbOutputs) override; + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, - int nbOutputs) override; + int nbOutputs) TRT_NOEXCEPT override; size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, - int nbOutputs) const override; + int nbOutputs) const TRT_NOEXCEPT override; int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) override; - - nvinfer1::DataType getOutputDataType(int index, - const nvinfer1::DataType* inputTypes, - int nbInputs) const override; - - const char* getPluginType() const override; 
- int getNbOutputs() const override; - int initialize() override; - void terminate() override; - size_t getSerializationSize() const override; - void serialize(void* buffer) const override; - void destroy() override; + cudaStream_t stream) TRT_NOEXCEPT override; + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT override; + const char* getPluginType() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; + int initialize() TRT_NOEXCEPT override; + void terminate() TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; + void destroy() TRT_NOEXCEPT override; private: int axis_; @@ -71,16 +70,17 @@ class StackPluginDynamic : public DynamicPluginTensorRT { class StackPluginDynamicCreator : public nvinfer1::IPluginCreator { public: StackPluginDynamicCreator(); - const char* getPluginName() const override; - const char* getPluginVersion() const override; - const nvinfer1::PluginFieldCollection* getFieldNames() override; - nvinfer1::IPluginV2* createPlugin( - const char* name, const nvinfer1::PluginFieldCollection* fc) override; - nvinfer1::IPluginV2* deserializePlugin(const char* name, - const void* serial_data, - size_t serial_length) override; - void setPluginNamespace(const char* lib_namespace) override; - const char* getPluginNamespace() const override; + const char* getPluginName() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; + const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override; + nvinfer1::IPluginV2* createPlugin(const char* name, + const nvinfer1::PluginFieldCollection* fc) + TRT_NOEXCEPT override; + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override; + void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override; + const char* getPluginNamespace() const TRT_NOEXCEPT override; private: std::string plugin_namespace_; diff --git a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu index da9d21acd5d63f..9720719fd0bca0 100644 --- a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu @@ -23,11 +23,11 @@ namespace inference { namespace tensorrt { namespace plugin { -int SwishPlugin::initialize() { return 0; } +int SwishPlugin::initialize() TRT_NOEXCEPT { return 0; } nvinfer1::Dims SwishPlugin::getOutputDimensions(int index, const nvinfer1::Dims *inputDims, - int nbInputs) { + int nbInputs) TRT_NOEXCEPT { assert(nbInputs == 1); assert(index < this->getNbOutputs()); nvinfer1::Dims const &input_dims = inputDims[0]; @@ -83,12 +83,12 @@ int SwishPlugin::enqueue(int batch_size, const void *const *inputs, void **outputs, void *workspace, cudaStream_t stream) { #else void *const *outputs, void *workspace, - cudaStream_t stream) { + cudaStream_t stream) TRT_NOEXCEPT { #endif // input dims is CHW. const auto &input_dims = this->getInputDims(0); const float *input = reinterpret_cast(inputs[0]); - float *output = reinterpret_cast(outputs)[0]; + float *output = reinterpret_cast(outputs)[0]; int num = batch_size; for (int i = 0; i < input_dims.nbDims; i++) { num *= input_dims.d[i]; @@ -103,29 +103,29 @@ int SwishPlugin::enqueue(int batch_size, const void *const *inputs, // Dynamic Plugin below. 
#if IS_TRT_VERSION_GE(6000) -int SwishPluginDynamic::initialize() { +int SwishPluginDynamic::initialize() TRT_NOEXCEPT { getPluginNamespace(); return 0; } -size_t SwishPluginDynamic::getSerializationSize() const { +size_t SwishPluginDynamic::getSerializationSize() const TRT_NOEXCEPT { return SerializedSize(beta_) + SerializedSize(with_fp16_); } -void SwishPluginDynamic::serialize(void *buffer) const { +void SwishPluginDynamic::serialize(void *buffer) const TRT_NOEXCEPT { SerializeValue(&buffer, beta_); SerializeValue(&buffer, with_fp16_); } nvinfer1::DimsExprs SwishPluginDynamic::getOutputDimensions( int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs, - nvinfer1::IExprBuilder &expr_builder) { + nvinfer1::IExprBuilder &expr_builder) TRT_NOEXCEPT { return inputs[0]; } bool SwishPluginDynamic::supportsFormatCombination( int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs, - int nb_outputs) { + int nb_outputs) TRT_NOEXCEPT { PADDLE_ENFORCE_NOT_NULL( in_out, platform::errors::InvalidArgument( "The input of swish plugin shoule not be nullptr.")); @@ -154,7 +154,8 @@ bool SwishPluginDynamic::supportsFormatCombination( } nvinfer1::DataType SwishPluginDynamic::getOutputDataType( - int index, const nvinfer1::DataType *input_types, int nb_inputs) const { + int index, const nvinfer1::DataType *input_types, + int nb_inputs) const TRT_NOEXCEPT { PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( "The Swish Plugin only has one input, so the " "index value should be 0, but get %d.", @@ -165,7 +166,8 @@ nvinfer1::DataType SwishPluginDynamic::getOutputDataType( int SwishPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs, void *const *outputs, - void *workspace, cudaStream_t stream) { + void *workspace, + cudaStream_t stream) TRT_NOEXCEPT { auto input_dims = input_desc[0].dims; size_t num = ProductDim(input_dims); int threads = 1024; diff --git a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h index 8940fdce3b0b56..c4bdc5f921509c 100644 --- a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h @@ -31,11 +31,11 @@ class SwishPlugin : public PluginTensorRT { float beta_; public: - size_t getSerializationSize() const override { + size_t getSerializationSize() const TRT_NOEXCEPT override { return getBaseSerializationSize() + SerializedSize(beta_); } - void serialize(void* buffer) const override { + void serialize(void* buffer) const TRT_NOEXCEPT override { serializeBase(buffer); SerializeValue(&buffer, beta_); } @@ -53,33 +53,37 @@ class SwishPlugin : public PluginTensorRT { ~SwishPlugin() {} - int initialize() override; + int initialize() TRT_NOEXCEPT override; - SwishPlugin* clone() const override { + SwishPlugin* clone() const TRT_NOEXCEPT override { return new SwishPlugin(beta_, with_fp16_); } - const char* getPluginType() const override { return "swish_plugin"; } - int getNbOutputs() const override { return 1; } + const char* getPluginType() const TRT_NOEXCEPT override { + return "swish_plugin"; + } + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, - int nbInputDims) override; + int nbInputDims) TRT_NOEXCEPT override; #if IS_TRT_VERSION_LT(8000) int enqueue(int batchSize, const void* const* inputs, void** outputs, #else int enqueue(int batchSize, const 
void* const* inputs, void* const* outputs, #endif - void* workspace, cudaStream_t stream) override; + void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; }; class SwishPluginCreator : public TensorRTPluginCreator { public: - const char* getPluginName() const override { return "swish_plugin"; } + const char* getPluginName() const TRT_NOEXCEPT override { + return "swish_plugin"; + } - const char* getPluginVersion() const override { return "1"; } + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } - nvinfer1::IPluginV2* deserializePlugin(const char* name, - const void* serial_data, - size_t serial_length) override { + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { return new SwishPlugin(serial_data, serial_length); } }; @@ -96,46 +100,49 @@ class SwishPluginDynamic : public DynamicPluginTensorRT { DeserializeValue(&serialData, &serialLength, &beta_); DeserializeValue(&serialData, &serialLength, &with_fp16_); } - nvinfer1::IPluginV2DynamicExt* clone() const override { + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override { return new SwishPluginDynamic(beta_, with_fp16_); } - const char* getPluginType() const override { return "swish_plugin_dynamic"; } - int getNbOutputs() const override { return 1; } - int initialize() override; + const char* getPluginType() const TRT_NOEXCEPT override { + return "swish_plugin_dynamic"; + } + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } + int initialize() TRT_NOEXCEPT override; - size_t getSerializationSize() const override; - void serialize(void* buffer) const override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; nvinfer1::DimsExprs getOutputDimensions( int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, - nvinfer1::IExprBuilder& expr_builder) override; + nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, const nvinfer1::PluginTensorDesc* inOut, - int nbInputs, int nbOutputs) override; + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, const nvinfer1::DynamicPluginTensorDesc* out, - int nbOutputs) override {} + int nbOutputs) TRT_NOEXCEPT override {} size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nbInputs, const nvinfer1::PluginTensorDesc* outputs, - int nbOutputs) const override { + int nbOutputs) const TRT_NOEXCEPT override { return 0; } int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, void* const* outputs, void* workspace, - cudaStream_t stream) override; - nvinfer1::DataType getOutputDataType(int index, - const nvinfer1::DataType* inputTypes, - int nbInputs) const override; + cudaStream_t stream) TRT_NOEXCEPT override; + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT override; - void destroy() override { delete this; } + void destroy() TRT_NOEXCEPT override { delete this; } private: float beta_; @@ -143,13 +150,15 @@ class SwishPluginDynamic : public DynamicPluginTensorRT { class SwishPluginDynamicCreator : public TensorRTPluginCreator { public: - const char* getPluginName() const override { return "swish_plugin_dynamic"; } + const char* getPluginName() const TRT_NOEXCEPT override { + return 
"swish_plugin_dynamic"; + } - const char* getPluginVersion() const override { return "1"; } + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } - nvinfer1::IPluginV2* deserializePlugin(const char* name, - const void* serial_data, - size_t serial_length) override { + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { return new SwishPluginDynamic(serial_data, serial_length); } }; diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc index 5be0ed4a13b230..da5aa54ee4eb59 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc @@ -60,8 +60,8 @@ size_t PluginTensorRT::getBaseSerializationSize() const { return SeriaSize(input_dims_, data_type_, data_format_, with_fp16_); } -bool PluginTensorRT::supportsFormat(nvinfer1::DataType type, - nvinfer1::PluginFormat format) const { +bool PluginTensorRT::supportsFormat( + nvinfer1::DataType type, nvinfer1::PluginFormat format) const TRT_NOEXCEPT { return ((type == nvinfer1::DataType::kFLOAT) && (format == nvinfer1::PluginFormat::kLINEAR)); } @@ -69,7 +69,7 @@ bool PluginTensorRT::supportsFormat(nvinfer1::DataType type, void PluginTensorRT::configureWithFormat( const nvinfer1::Dims* input_dims, int num_inputs, const nvinfer1::Dims* output_dims, int num_outputs, nvinfer1::DataType type, - nvinfer1::PluginFormat format, int max_batch_size) { + nvinfer1::PluginFormat format, int max_batch_size) TRT_NOEXCEPT { data_type_ = type; data_format_ = format; input_dims_.assign(input_dims, input_dims + num_inputs); @@ -95,26 +95,28 @@ void PluginTensorRTV2Ext::configurePlugin( const nvinfer1::DataType* input_types, const nvinfer1::DataType* output_types, const bool* input_is_broadcast, const bool* output_is_broadcast, nvinfer1::PluginFormat float_format, - int32_t max_batch_size) { + int32_t max_batch_size) TRT_NOEXCEPT { input_dims_.assign(input_dims, input_dims + nb_inputs); data_format_ = float_format; data_type_ = input_types[0]; } -const nvinfer1::PluginFieldCollection* TensorRTPluginCreator::getFieldNames() { +const nvinfer1::PluginFieldCollection* TensorRTPluginCreator::getFieldNames() + TRT_NOEXCEPT { return &field_collection_; } nvinfer1::IPluginV2* TensorRTPluginCreator::createPlugin( - const char* name, const nvinfer1::PluginFieldCollection* fc) { + const char* name, const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT { return nullptr; } -void TensorRTPluginCreator::setPluginNamespace(const char* lib_namespace) { +void TensorRTPluginCreator::setPluginNamespace(const char* lib_namespace) + TRT_NOEXCEPT { plugin_namespace_ = lib_namespace; } -const char* TensorRTPluginCreator::getPluginNamespace() const { +const char* TensorRTPluginCreator::getPluginNamespace() const TRT_NOEXCEPT { return plugin_namespace_.c_str(); } diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h index 599294392799dc..6b2925a068bbd2 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h @@ -64,35 +64,35 @@ class PluginTensorRT : public nvinfer1::IPluginV2 { nvinfer1::PluginFormat getDataFormat() const { return data_format_; } // IPluginV2 - virtual const char* getPluginType() const = 0; + virtual const char* getPluginType() const TRT_NOEXCEPT = 0; - virtual const char* getPluginVersion() const { return "1"; } + 
virtual const char* getPluginVersion() const TRT_NOEXCEPT { return "1"; } - int getNbOutputs() const { return 1; } + int getNbOutputs() const TRT_NOEXCEPT { return 1; } virtual nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* input_dims, - int num_inputs) = 0; + int num_inputs) TRT_NOEXCEPT = 0; // Check format support. The default is FLOAT32 and kLINEAR. - bool supportsFormat(nvinfer1::DataType type, - nvinfer1::PluginFormat format) const override; + bool supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) + const TRT_NOEXCEPT override; // Configure the layer void configureWithFormat(const nvinfer1::Dims* input_dims, int num_inputs, const nvinfer1::Dims* output_dims, int num_outputs, nvinfer1::DataType type, nvinfer1::PluginFormat format, - int max_batch_size) override; + int max_batch_size) TRT_NOEXCEPT override; // Initialize the layer for execution. - int initialize() override { return 0; } + int initialize() TRT_NOEXCEPT override { return 0; } // Shutdown the layer. This is called when the engine is destroyed - void terminate() override {} + void terminate() TRT_NOEXCEPT override {} // Find the workspace size required by the layer - size_t getWorkspaceSize(int) const override { return 0; } + size_t getWorkspaceSize(int) const TRT_NOEXCEPT override { return 0; } // Execute the layer #if IS_TRT_VERSION_LT(8000) @@ -101,25 +101,27 @@ class PluginTensorRT : public nvinfer1::IPluginV2 { virtual int enqueue(int batch_size, const void* const* inputs, void* const* outputs, #endif - void* workspace, cudaStream_t stream) = 0; + void* workspace, cudaStream_t stream) TRT_NOEXCEPT = 0; // Find the size of the serialization buffer required - virtual size_t getSerializationSize() const = 0; + virtual size_t getSerializationSize() const TRT_NOEXCEPT = 0; // Serialize the layer config to buffer. // TensorRT will call this func to serialize the configuration of TensorRT // engine. It should not be called by users. 
- virtual void serialize(void* buffer) const = 0; + virtual void serialize(void* buffer) const TRT_NOEXCEPT = 0; - void destroy() override { delete this; } + void destroy() TRT_NOEXCEPT override { delete this; } - virtual nvinfer1::IPluginV2* clone() const = 0; + virtual nvinfer1::IPluginV2* clone() const TRT_NOEXCEPT = 0; - void setPluginNamespace(const char* plugin_namespace) override { + void setPluginNamespace(const char* plugin_namespace) TRT_NOEXCEPT override { namespace_ = plugin_namespace; } - const char* getPluginNamespace() const override { return namespace_.c_str(); } + const char* getPluginNamespace() const TRT_NOEXCEPT override { + return namespace_.c_str(); + } protected: // Deserialize input_dims, max_batch_size, data_type, data_format @@ -155,15 +157,16 @@ class PluginTensorRTV2Ext : public nvinfer1::IPluginV2Ext { // The Func in IPluginV2Ext virtual nvinfer1::DataType getOutputDataType( int index, const nvinfer1::DataType* input_types, - int nb_inputs) const = 0; + int nb_inputs) const TRT_NOEXCEPT = 0; - virtual bool isOutputBroadcastAcrossBatch(int32_t output_index, - const bool* input_is_broadcasted, - int32_t nb_inputs) const { + virtual bool isOutputBroadcastAcrossBatch( + int32_t output_index, const bool* input_is_broadcasted, + int32_t nb_inputs) const TRT_NOEXCEPT { return false; } - virtual bool canBroadcastInputAcrossBatch(int32_t input_index) const { + virtual bool canBroadcastInputAcrossBatch(int32_t input_index) const + TRT_NOEXCEPT { return false; } @@ -174,37 +177,37 @@ class PluginTensorRTV2Ext : public nvinfer1::IPluginV2Ext { const bool* input_is_broadcast, const bool* output_is_broadcast, nvinfer1::PluginFormat float_format, - int32_t max_batch_size) override; + int32_t max_batch_size) TRT_NOEXCEPT override; - virtual IPluginV2Ext* clone() const = 0; + virtual IPluginV2Ext* clone() const TRT_NOEXCEPT = 0; void attachToContext(cudnnContext*, cublasContext*, - nvinfer1::IGpuAllocator*) override {} + nvinfer1::IGpuAllocator*) TRT_NOEXCEPT override {} - void detachFromContext() override {} + void detachFromContext() TRT_NOEXCEPT override {} // The Func in IPluginV2 - virtual const char* getPluginType() const = 0; - const char* getPluginVersion() const override { return "1"; } - virtual int32_t getNbOutputs() const { return 1; } + virtual const char* getPluginType() const TRT_NOEXCEPT = 0; + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } + virtual int32_t getNbOutputs() const TRT_NOEXCEPT { return 1; } virtual nvinfer1::Dims getOutputDimensions(int32_t index, const nvinfer1::Dims* inputs, - int32_t nb_input) = 0; + int32_t nb_input) TRT_NOEXCEPT = 0; // Check format support. The default is FLOAT32 and NCHW. - bool supportsFormat(nvinfer1::DataType type, - nvinfer1::PluginFormat format) const override { + bool supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) + const TRT_NOEXCEPT override { return ((type == nvinfer1::DataType::kFLOAT) && (format == nvinfer1::PluginFormat::kLINEAR)); } // Initialize the layer for execution. // This is called when the engine is created. - int initialize() override { return 0; } + int initialize() TRT_NOEXCEPT override { return 0; } // Shutdown the layer. 
This is called when the engine is destroyed - void terminate() override {} + void terminate() TRT_NOEXCEPT override {} // Find the workspace size required by the layer - size_t getWorkspaceSize(int) const override { return 0; } + size_t getWorkspaceSize(int) const TRT_NOEXCEPT override { return 0; } // Execute the layer #if IS_TRT_VERSION_LT(8000) @@ -213,23 +216,23 @@ class PluginTensorRTV2Ext : public nvinfer1::IPluginV2Ext { virtual int enqueue(int batch_size, const void* const* inputs, void* const* outputs, #endif - void* workspace, cudaStream_t stream) = 0; + void* workspace, cudaStream_t stream) TRT_NOEXCEPT = 0; // Find the size of the serialization buffer required - virtual size_t getSerializationSize() const = 0; + virtual size_t getSerializationSize() const TRT_NOEXCEPT = 0; // Serialize the layer config to buffer. // TensorRT will call this func to serialize the configuration of TensorRT // engine. It should not be called by users. - virtual void serialize(void* buffer) const = 0; + virtual void serialize(void* buffer) const TRT_NOEXCEPT = 0; - virtual void destroy() = 0; + virtual void destroy() TRT_NOEXCEPT = 0; - void setPluginNamespace(const char* plugin_namespace) override { + void setPluginNamespace(const char* plugin_namespace) TRT_NOEXCEPT override { name_space_ = plugin_namespace; } - const char* getPluginNamespace() const override { + const char* getPluginNamespace() const TRT_NOEXCEPT override { return name_space_.c_str(); } @@ -256,52 +259,52 @@ class DynamicPluginTensorRT : public nvinfer1::IPluginV2DynamicExt { DynamicPluginTensorRT(const void* serialized_data, size_t length) {} // The Func in IPluginExt or IpluginExtV2 - virtual const char* getPluginVersion() const { return "1"; } - virtual const char* getPluginType() const = 0; - int getNbOutputs() const { return 1; } - int initialize() override { return 0; } - void terminate() override{}; + virtual const char* getPluginVersion() const TRT_NOEXCEPT { return "1"; } + virtual const char* getPluginType() const TRT_NOEXCEPT = 0; + int getNbOutputs() const TRT_NOEXCEPT { return 1; } + int initialize() TRT_NOEXCEPT override { return 0; } + void terminate() TRT_NOEXCEPT override{}; - virtual size_t getSerializationSize() const = 0; - virtual void serialize(void* buffer) const = 0; + virtual size_t getSerializationSize() const TRT_NOEXCEPT = 0; + virtual void serialize(void* buffer) const TRT_NOEXCEPT = 0; // The Func in IPluginV2 - nvinfer1::IPluginV2DynamicExt* clone() const = 0; + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT = 0; virtual nvinfer1::DimsExprs getOutputDimensions( int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, - nvinfer1::IExprBuilder& expr_builder) = 0; // NOLINT + nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT = 0; // NOLINT virtual bool supportsFormatCombination( int pos, const nvinfer1::PluginTensorDesc* in_out, int nb_inputs, - int nb_outputs) = 0; + int nb_outputs) TRT_NOEXCEPT = 0; virtual void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, int nb_inputs, const nvinfer1::DynamicPluginTensorDesc* out, - int nb_outputs) = 0; + int nb_outputs) TRT_NOEXCEPT = 0; size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, int nb_inputs, const nvinfer1::PluginTensorDesc* outputs, - int nb_outputs) const override { + int nb_outputs) const TRT_NOEXCEPT override { return 0; } virtual int enqueue(const nvinfer1::PluginTensorDesc* input_desc, const nvinfer1::PluginTensorDesc* output_desc, const void* const* inputs, void* const* outputs, - void* 
workspace, cudaStream_t stream) = 0; + void* workspace, cudaStream_t stream) TRT_NOEXCEPT = 0; virtual nvinfer1::DataType getOutputDataType( int index, const nvinfer1::DataType* input_types, - int nb_inputs) const = 0; - void setPluginNamespace(const char* plugin_namespace) override { + int nb_inputs) const TRT_NOEXCEPT = 0; + void setPluginNamespace(const char* plugin_namespace) TRT_NOEXCEPT override { name_space_ = plugin_namespace; } - const char* getPluginNamespace() const override { + const char* getPluginNamespace() const TRT_NOEXCEPT override { return name_space_.c_str(); } - virtual void destroy() = 0; + virtual void destroy() TRT_NOEXCEPT = 0; protected: void deserializeBase(void const*& serial_data, // NOLINT @@ -320,22 +323,23 @@ class TensorRTPluginCreator : public nvinfer1::IPluginCreator { public: TensorRTPluginCreator() = default; - virtual const char* getPluginName() const = 0; + virtual const char* getPluginName() const TRT_NOEXCEPT = 0; - virtual const char* getPluginVersion() const = 0; + virtual const char* getPluginVersion() const TRT_NOEXCEPT = 0; - const nvinfer1::PluginFieldCollection* getFieldNames() override; + const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override; - nvinfer1::IPluginV2* createPlugin( - const char* name, const nvinfer1::PluginFieldCollection* fc) override; + nvinfer1::IPluginV2* createPlugin(const char* name, + const nvinfer1::PluginFieldCollection* fc) + TRT_NOEXCEPT override; - virtual nvinfer1::IPluginV2* deserializePlugin(const char* name, - const void* serial_data, - size_t serial_length) = 0; + virtual nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT = 0; - void setPluginNamespace(const char* lib_namespace) override; + void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override; - const char* getPluginNamespace() const override; + const char* getPluginNamespace() const TRT_NOEXCEPT override; private: std::string plugin_namespace_; diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu index fe292dba4673f6..ee1709f57e2598 100644 --- a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu @@ -70,15 +70,16 @@ YoloBoxPlugin::~YoloBoxPlugin() { } } -const char* YoloBoxPlugin::getPluginType() const { return "yolo_box_plugin"; } +const char* YoloBoxPlugin::getPluginType() const TRT_NOEXCEPT { + return "yolo_box_plugin"; +} -const char* YoloBoxPlugin::getPluginVersion() const { return "1"; } +const char* YoloBoxPlugin::getPluginVersion() const TRT_NOEXCEPT { return "1"; } -int YoloBoxPlugin::getNbOutputs() const { return 2; } +int YoloBoxPlugin::getNbOutputs() const TRT_NOEXCEPT { return 2; } -nvinfer1::Dims YoloBoxPlugin::getOutputDimensions(int index, - const nvinfer1::Dims* inputs, - int nb_input_dims) { +nvinfer1::Dims YoloBoxPlugin::getOutputDimensions( + int index, const nvinfer1::Dims* inputs, int nb_input_dims) TRT_NOEXCEPT { const int anchor_num = anchors_.size() / 2; const int box_num = inputs[0].d[1] * inputs[0].d[2] * anchor_num; @@ -90,13 +91,15 @@ nvinfer1::Dims YoloBoxPlugin::getOutputDimensions(int index, return nvinfer1::Dims2(box_num, class_num_); } -bool YoloBoxPlugin::supportsFormat(nvinfer1::DataType type, - nvinfer1::TensorFormat format) const { +bool YoloBoxPlugin::supportsFormat( + nvinfer1::DataType type, nvinfer1::TensorFormat format) const TRT_NOEXCEPT { return 
((type == data_type_ || type == nvinfer1::DataType::kINT32) && format == nvinfer1::TensorFormat::kLINEAR); } -size_t YoloBoxPlugin::getWorkspaceSize(int max_batch_size) const { return 0; } +size_t YoloBoxPlugin::getWorkspaceSize(int max_batch_size) const TRT_NOEXCEPT { + return 0; +} template __device__ inline T sigmoid(T x) { @@ -219,7 +222,7 @@ __global__ void KeYoloBoxFw(const T* const input, const int* const imgsize, template int YoloBoxPlugin::enqueue_impl(int batch_size, const void* const* inputs, - void** outputs, void* workspace, + void* const* outputs, void* workspace, cudaStream_t stream) { const int n = batch_size; const int h = input_h_; @@ -247,7 +250,7 @@ int YoloBoxPlugin::enqueue(int batch_size, const void* const* inputs, #else void* const* outputs, void* workspace, #endif - cudaStream_t stream) { + cudaStream_t stream) TRT_NOEXCEPT { if (data_type_ == nvinfer1::DataType::kFLOAT) { return enqueue_impl(batch_size, inputs, outputs, workspace, stream); } else if (data_type_ == nvinfer1::DataType::kHALF) { @@ -256,11 +259,11 @@ int YoloBoxPlugin::enqueue(int batch_size, const void* const* inputs, assert("unsupported type."); } -int YoloBoxPlugin::initialize() { return 0; } +int YoloBoxPlugin::initialize() TRT_NOEXCEPT { return 0; } -void YoloBoxPlugin::terminate() {} +void YoloBoxPlugin::terminate() TRT_NOEXCEPT {} -size_t YoloBoxPlugin::getSerializationSize() const { +size_t YoloBoxPlugin::getSerializationSize() const TRT_NOEXCEPT { size_t serialize_size = 0; serialize_size += SerializedSize(data_type_); serialize_size += SerializedSize(anchors_); @@ -274,7 +277,7 @@ size_t YoloBoxPlugin::getSerializationSize() const { return serialize_size; } -void YoloBoxPlugin::serialize(void* buffer) const { +void YoloBoxPlugin::serialize(void* buffer) const TRT_NOEXCEPT { SerializeValue(&buffer, data_type_); SerializeValue(&buffer, anchors_); SerializeValue(&buffer, class_num_); @@ -286,28 +289,30 @@ void YoloBoxPlugin::serialize(void* buffer) const { SerializeValue(&buffer, input_w_); } -void YoloBoxPlugin::destroy() {} +void YoloBoxPlugin::destroy() TRT_NOEXCEPT {} -void YoloBoxPlugin::setPluginNamespace(const char* lib_namespace) { +void YoloBoxPlugin::setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT { namespace_ = std::string(lib_namespace); } -const char* YoloBoxPlugin::getPluginNamespace() const { +const char* YoloBoxPlugin::getPluginNamespace() const TRT_NOEXCEPT { return namespace_.c_str(); } nvinfer1::DataType YoloBoxPlugin::getOutputDataType( - int index, const nvinfer1::DataType* input_type, int nb_inputs) const { + int index, const nvinfer1::DataType* input_type, + int nb_inputs) const TRT_NOEXCEPT { return input_type[0]; } -bool YoloBoxPlugin::isOutputBroadcastAcrossBatch(int output_index, - const bool* input_is_broadcast, - int nb_inputs) const { +bool YoloBoxPlugin::isOutputBroadcastAcrossBatch( + int output_index, const bool* input_is_broadcast, + int nb_inputs) const TRT_NOEXCEPT { return false; } -bool YoloBoxPlugin::canBroadcastInputAcrossBatch(int input_index) const { +bool YoloBoxPlugin::canBroadcastInputAcrossBatch(int input_index) const + TRT_NOEXCEPT { return false; } @@ -317,9 +322,9 @@ void YoloBoxPlugin::configurePlugin( const nvinfer1::DataType* input_types, const nvinfer1::DataType* output_types, const bool* input_is_broadcast, const bool* output_is_broadcast, nvinfer1::PluginFormat float_format, - int max_batct_size) {} + int max_batct_size) TRT_NOEXCEPT {} -nvinfer1::IPluginV2Ext* YoloBoxPlugin::clone() const { +nvinfer1::IPluginV2Ext* 
YoloBoxPlugin::clone() const TRT_NOEXCEPT { return new YoloBoxPlugin(data_type_, anchors_, class_num_, conf_thresh_, downsample_ratio_, clip_bbox_, scale_x_y_, input_h_, input_w_); @@ -327,26 +332,30 @@ nvinfer1::IPluginV2Ext* YoloBoxPlugin::clone() const { YoloBoxPluginCreator::YoloBoxPluginCreator() {} -void YoloBoxPluginCreator::setPluginNamespace(const char* lib_namespace) { +void YoloBoxPluginCreator::setPluginNamespace(const char* lib_namespace) + TRT_NOEXCEPT { namespace_ = std::string(lib_namespace); } -const char* YoloBoxPluginCreator::getPluginNamespace() const { +const char* YoloBoxPluginCreator::getPluginNamespace() const TRT_NOEXCEPT { return namespace_.c_str(); } -const char* YoloBoxPluginCreator::getPluginName() const { +const char* YoloBoxPluginCreator::getPluginName() const TRT_NOEXCEPT { return "yolo_box_plugin"; } -const char* YoloBoxPluginCreator::getPluginVersion() const { return "1"; } +const char* YoloBoxPluginCreator::getPluginVersion() const TRT_NOEXCEPT { + return "1"; +} -const nvinfer1::PluginFieldCollection* YoloBoxPluginCreator::getFieldNames() { +const nvinfer1::PluginFieldCollection* YoloBoxPluginCreator::getFieldNames() + TRT_NOEXCEPT { return &field_collection_; } nvinfer1::IPluginV2Ext* YoloBoxPluginCreator::createPlugin( - const char* name, const nvinfer1::PluginFieldCollection* fc) { + const char* name, const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT { const nvinfer1::PluginField* fields = fc->fields; int type_id = -1; @@ -392,7 +401,8 @@ nvinfer1::IPluginV2Ext* YoloBoxPluginCreator::createPlugin( } nvinfer1::IPluginV2Ext* YoloBoxPluginCreator::deserializePlugin( - const char* name, const void* serial_data, size_t serial_length) { + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT { auto plugin = new YoloBoxPlugin(serial_data, serial_length); plugin->setPluginNamespace(namespace_.c_str()); return plugin; diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h index 4cd6a383336e23..c9e9f9a0567aee 100644 --- a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h @@ -35,38 +35,39 @@ class YoloBoxPlugin : public nvinfer1::IPluginV2Ext { YoloBoxPlugin(const void* data, size_t length); ~YoloBoxPlugin() override; - const char* getPluginType() const override; - const char* getPluginVersion() const override; - int getNbOutputs() const override; + const char* getPluginType() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, - int nb_input_dims) override; - bool supportsFormat(nvinfer1::DataType type, - nvinfer1::TensorFormat format) const override; - size_t getWorkspaceSize(int max_batch_size) const override; + int nb_input_dims) TRT_NOEXCEPT override; + bool supportsFormat(nvinfer1::DataType type, nvinfer1::TensorFormat format) + const TRT_NOEXCEPT override; + size_t getWorkspaceSize(int max_batch_size) const TRT_NOEXCEPT override; #if IS_TRT_VERSION_LT(8000) int enqueue(int batch_size, const void* const* inputs, void** outputs, #else int enqueue(int batch_size, const void* const* inputs, void* const* outputs, #endif - void* workspace, cudaStream_t stream) override; + void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; template - int enqueue_impl(int batch_size, const void* const* inputs, void** 
outputs, - void* workspace, cudaStream_t stream); - int initialize() override; - void terminate() override; - size_t getSerializationSize() const override; - void serialize(void* buffer) const override; - void destroy() override; - void setPluginNamespace(const char* lib_namespace) override; - const char* getPluginNamespace() const override; + int enqueue_impl(int batch_size, const void* const* inputs, + void* const* outputs, void* workspace, cudaStream_t stream); + int initialize() TRT_NOEXCEPT override; + void terminate() TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; + void destroy() TRT_NOEXCEPT override; + void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override; + const char* getPluginNamespace() const TRT_NOEXCEPT override; - nvinfer1::DataType getOutputDataType(int index, - const nvinfer1::DataType* input_type, - int nb_inputs) const override; + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* input_type, + int nb_inputs) const TRT_NOEXCEPT override; bool isOutputBroadcastAcrossBatch(int output_index, const bool* input_is_broadcast, - int nb_inputs) const override; - bool canBroadcastInputAcrossBatch(int input_index) const override; + int nb_inputs) const TRT_NOEXCEPT override; + bool canBroadcastInputAcrossBatch(int input_index) const + TRT_NOEXCEPT override; void configurePlugin(const nvinfer1::Dims* input_dims, int nb_inputs, const nvinfer1::Dims* output_dims, int nb_outputs, const nvinfer1::DataType* input_types, @@ -74,8 +75,8 @@ class YoloBoxPlugin : public nvinfer1::IPluginV2Ext { const bool* input_is_broadcast, const bool* output_is_broadcast, nvinfer1::PluginFormat float_format, - int max_batct_size) override; - nvinfer1::IPluginV2Ext* clone() const override; + int max_batct_size) TRT_NOEXCEPT override; + nvinfer1::IPluginV2Ext* clone() const TRT_NOEXCEPT override; private: nvinfer1::DataType data_type_; @@ -96,17 +97,18 @@ class YoloBoxPluginCreator : public nvinfer1::IPluginCreator { YoloBoxPluginCreator(); ~YoloBoxPluginCreator() override = default; - void setPluginNamespace(const char* lib_namespace) override; - const char* getPluginNamespace() const override; - const char* getPluginName() const override; - const char* getPluginVersion() const override; - const nvinfer1::PluginFieldCollection* getFieldNames() override; + void setPluginNamespace(const char* lib_namespace) TRT_NOEXCEPT override; + const char* getPluginNamespace() const TRT_NOEXCEPT override; + const char* getPluginName() const TRT_NOEXCEPT override; + const char* getPluginVersion() const TRT_NOEXCEPT override; + const nvinfer1::PluginFieldCollection* getFieldNames() TRT_NOEXCEPT override; nvinfer1::IPluginV2Ext* createPlugin( - const char* name, const nvinfer1::PluginFieldCollection* fc) override; - nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, - const void* serial_data, - size_t serial_length) override; + const char* name, + const nvinfer1::PluginFieldCollection* fc) TRT_NOEXCEPT override; + nvinfer1::IPluginV2Ext* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override; private: std::string namespace_; diff --git a/paddle/fluid/inference/tensorrt/test_tensorrt.cc b/paddle/fluid/inference/tensorrt/test_tensorrt.cc index 36a25e27d78f5b..2f5b75c1020041 100644 --- a/paddle/fluid/inference/tensorrt/test_tensorrt.cc +++ b/paddle/fluid/inference/tensorrt/test_tensorrt.cc @@ -16,13 +16,15 @@ limitations 
under the License. */ #include #include #include "NvInfer.h" +#include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/platform/dynload/tensorrt.h" namespace dy = paddle::platform::dynload; class Logger : public nvinfer1::ILogger { public: - void log(nvinfer1::ILogger::Severity severity, const char* msg) override { + void log(nvinfer1::ILogger::Severity severity, + const char* msg) TRT_NOEXCEPT override { switch (severity) { case Severity::kINFO: LOG(INFO) << msg; @@ -74,10 +76,11 @@ nvinfer1::IHostMemory* CreateNetwork() { Logger logger; // Create the engine. nvinfer1::IBuilder* builder = createInferBuilder(&logger); + auto config = builder->createBuilderConfig(); ScopedWeights weights(2.); ScopedWeights bias(3.); - nvinfer1::INetworkDefinition* network = builder->createNetwork(); + nvinfer1::INetworkDefinition* network = builder->createNetworkV2(0U); // Add the input auto input = network->addInput(kInputTensor, nvinfer1::DataType::kFLOAT, nvinfer1::Dims3{1, 1, 1}); @@ -91,8 +94,8 @@ nvinfer1::IHostMemory* CreateNetwork() { network->markOutput(*output); // Build the engine. builder->setMaxBatchSize(1); - builder->setMaxWorkspaceSize(1 << 10); - auto engine = builder->buildCudaEngine(*network); + config->setMaxWorkspaceSize(1 << 10); + auto engine = builder->buildEngineWithConfig(*network, *config); EXPECT_NE(engine, nullptr); // Serialize the engine to create a model, then close. nvinfer1::IHostMemory* model = engine->serialize(); diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc index 48343fca01efad..86666950bc36e6 100644 --- a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc +++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.cc @@ -22,7 +22,7 @@ namespace inference { namespace tensorrt { // set the batch size before constructing the thread to execute engine -int TRTInt8Calibrator::getBatchSize() const { return batch_size_; } +int TRTInt8Calibrator::getBatchSize() const TRT_NOEXCEPT { return batch_size_; } TRTInt8Calibrator::TRTInt8Calibrator( const std::unordered_map& buffers, int batch_size, @@ -95,7 +95,7 @@ bool TRTInt8Calibrator::setBatch( } bool TRTInt8Calibrator::getBatch(void** bindings, const char** names, - int num_bindings) { + int num_bindings) TRT_NOEXCEPT { VLOG(4) << "get batch: " << engine_name_; std::unique_lock lk(mut_); // The consumer has just finished processing a data. 
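The change that recurs throughout these TensorRT files is the TRT_NOEXCEPT suffix on every IPluginV2 / IPluginCreator / IInt8EntropyCalibrator2 override: TensorRT 8 declares these interface virtuals noexcept, so the overrides must carry the same exception specifier, while pre-8 headers must not see it. The macro comes from the newly included tensorrt/helper.h, which is not part of this excerpt; a minimal sketch of the assumed definition, reusing the version guard already present in these files, is:

// Sketch only: assumed definition of TRT_NOEXCEPT (the actual one lives in
// paddle/fluid/inference/tensorrt/helper.h, not shown in this diff).
// TensorRT 8 marks the plugin/calibrator virtuals noexcept, so overrides must
// match; earlier TensorRT versions must not see the specifier.
#if IS_TRT_VERSION_GE(8000)
#define TRT_NOEXCEPT noexcept
#else
#define TRT_NOEXCEPT
#endif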
@@ -131,14 +131,15 @@ void TRTInt8Calibrator::setDone() { cond_.notify_all(); } -const void* TRTInt8Calibrator::readCalibrationCache(size_t& length) { +const void* TRTInt8Calibrator::readCalibrationCache(size_t& length) + TRT_NOEXCEPT { if (calibration_table_.empty()) return nullptr; length = calibration_table_.size(); return calibration_table_.data(); } void TRTInt8Calibrator::writeCalibrationCache(const void* ptr, - std::size_t length) { + std::size_t length) TRT_NOEXCEPT { calibration_table_ = std::string((const char*)ptr, length); VLOG(4) << "Got calibration data for " << engine_name_ << " " << ptr << " length=" << length; diff --git a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h index 15ae67fa10f697..c84cb45b7ecbad 100644 --- a/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h +++ b/paddle/fluid/inference/tensorrt/trt_int8_calibrator.h @@ -43,17 +43,18 @@ struct TRTInt8Calibrator : public nvinfer1::IInt8EntropyCalibrator2 { explicit TRTInt8Calibrator(const std::string& calibration_data); ~TRTInt8Calibrator(); - int getBatchSize() const override; + int getBatchSize() const TRT_NOEXCEPT override; bool getBatch(void* bindings[], const char* names[], - int num_bindings) override; + int num_bindings) TRT_NOEXCEPT override; bool setBatch(const std::unordered_map& data); void setDone(); void waitAndSetDone(); - const void* readCalibrationCache(std::size_t& length) override; - void writeCalibrationCache(const void* ptr, std::size_t length) override; + const void* readCalibrationCache(std::size_t& length) TRT_NOEXCEPT override; + void writeCalibrationCache(const void* ptr, + std::size_t length) TRT_NOEXCEPT override; const std::string& getCalibrationTableAsString() { return calibration_table_; } diff --git a/paddle/fluid/operators/activation_op_npu.cc b/paddle/fluid/operators/activation_op_npu.cc index 241081bc0d4afd..bb520c270fa2cb 100644 --- a/paddle/fluid/operators/activation_op_npu.cc +++ b/paddle/fluid/operators/activation_op_npu.cc @@ -347,6 +347,56 @@ class SigmoidGradNPUKernel : public framework::OpKernel { } }; +template +class HardSigmoidNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + float slope = ctx.Attr("slope"); + float offset = ctx.Attr("offset"); + + out->mutable_data(ctx.GetPlace()); + + framework::NPUAttributeMap attr_input = {{"alpha", slope}, + {"beta", offset}}; + + auto stream = + ctx.template device_context() + .stream(); + + const auto& runner = NpuOpRunner("HardSigmoid", {*x}, {*out}, attr_input); + runner.Run(stream); + } +}; + +template +class HardSigmoidGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* out = ctx.Input("Out"); + + auto* dx = ctx.Output(framework::GradVarName("X")); + + float slope = ctx.Attr("slope"); + float offset = ctx.Attr("offset"); + + dx->mutable_data(ctx.GetPlace()); + + framework::NPUAttributeMap attr_input = {{"alpha", slope}, + {"beta", offset}}; + + auto stream = + ctx.template device_context() + .stream(); + + const auto& runner_dx = + NpuOpRunner("HardSigmoidGrad", {*dout, *out}, {*dx}, attr_input); + runner_dx.Run(stream); + } +}; + } // namespace operators } // namespace paddle @@ -421,3 +471,15 @@ REGISTER_OP_NPU_KERNEL( ops::SigmoidGradNPUKernel, ops::SigmoidGradNPUKernel); + 
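For reference, the HardSigmoid kernels added above do no arithmetic themselves: they hand the whole computation to the CANN "HardSigmoid" / "HardSigmoidGrad" operators, mapping the Paddle attributes slope and offset onto the NPU attributes alpha and beta. A minimal sketch of the element-wise semantics being delegated, assuming Paddle's usual hard_sigmoid definition (not part of the patch):

// Sketch: hard_sigmoid clamps a linear transform of x into [0, 1].
// alpha = slope, beta = offset in the NPUAttributeMap above.
inline float HardSigmoidRef(float x, float slope, float offset) {
  float y = slope * x + offset;
  return y < 0.f ? 0.f : (y > 1.f ? 1.f : y);
}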
+REGISTER_OP_NPU_KERNEL( + hard_sigmoid, + ops::HardSigmoidNPUKernel, + ops::HardSigmoidNPUKernel); + +REGISTER_OP_NPU_KERNEL( + hard_sigmoid_grad, + ops::HardSigmoidGradNPUKernel, + ops::HardSigmoidGradNPUKernel); diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu index 1ac110b3cafd6b..0beb2291060169 100644 --- a/paddle/fluid/operators/cast_op.cu +++ b/paddle/fluid/operators/cast_op.cu @@ -40,7 +40,8 @@ __global__ void VecCastCUDAKernel(const InT* in, const int64_t N, OutT* out) { int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; using LoadT = AlignedVector; using StoreT = AlignedVector; - for (int i = idx * VecSize; i < N; i += blockDim.x * gridDim.x * VecSize) { + for (int64_t i = idx * VecSize; i < N; + i += blockDim.x * gridDim.x * VecSize) { InT in_vec[VecSize]; LoadT* in_value = reinterpret_cast(&in_vec); *in_value = *reinterpret_cast(&in[i]); diff --git a/paddle/fluid/operators/controlflow/logical_op.cc b/paddle/fluid/operators/controlflow/logical_op.cc index fb8cde70f5324f..285b17d4995dbc 100644 --- a/paddle/fluid/operators/controlflow/logical_op.cc +++ b/paddle/fluid/operators/controlflow/logical_op.cc @@ -1,11 +1,8 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -26,15 +23,16 @@ class BinaryLogicalOpProtoMaker : public framework::OpProtoAndCheckerMaker { void Make() override { OpComment comment; AddInput("X", string::Sprintf("Left hand operand of %s operator. Must be " - "a Variable of type bool.", + "a Variable of type being one of bool, int8, " + "int16, int32, int64, float32, float64.", comment.type)); AddInput("Y", string::Sprintf("Right hand operand of %s operator. Must be " - "a Variable of type bool.", + "a Variable of type being one of bool, int8, " + "int16, int32, int64, float32, float64.", comment.type)); AddOutput("Out", string::Sprintf("n-dim bool Variable")); AddComment(string::Sprintf(R"DOC(%s Operator - -It operates element-wise on X and Y, and returns the Out. X, Y and Out are N-dim boolean LoDTensor or Tensor. +It operates element-wise on X and Y, and returns the Out. X, Y and Out are N-dim LoDTensor or Tensor. Each element of Out is calculated by %s )DOC", comment.type, comment.equation)); @@ -46,13 +44,14 @@ class UnaryLogicalOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { OpComment comment; - AddInput("X", string::Sprintf("Operand of %s operator. Must be " - "a LoDTensor or Tensor of type bool.", - comment.type)); + AddInput("X", + string::Sprintf("Operand of %s operator. Must be " + "a LoDTensor or Tensor of type being one of bool, " + "int8, int16, int32, int64, float32, float64.", + comment.type)); AddOutput("Out", string::Sprintf("n-dim bool LoDTensor or Tensor.")); AddComment(string::Sprintf(R"DOC(%s Operator - -It operates element-wise on X, and returns the Out. X and Out are N-dim boolean LoDTensor or Tensor. +It operates element-wise on X, and returns the Out. X and Out are N-dim LoDTensor or Tensor. 
Each element of Out is calculated by %s )DOC", comment.type, comment.equation)); diff --git a/paddle/fluid/operators/controlflow/logical_op.cu b/paddle/fluid/operators/controlflow/logical_op.cu index 6cbcd516e08264..301b4c4149fad3 100644 --- a/paddle/fluid/operators/controlflow/logical_op.cu +++ b/paddle/fluid/operators/controlflow/logical_op.cu @@ -1,11 +1,8 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -21,13 +18,13 @@ namespace plat = paddle::platform; namespace paddle { namespace operators { -#define LOGICAL_BINARY_FUNCTOR(func_name, op) \ - template \ - struct func_name { \ - using ELEMENT_TYPE = T; \ - HOSTDEVICE bool operator()(const T* args) const { \ - return args[0] op args[1]; \ - } \ +#define LOGICAL_BINARY_FUNCTOR(func_name, op) \ + template \ + struct func_name { \ + using ELEMENT_TYPE = T; \ + HOSTDEVICE bool operator()(const T* args) const { \ + return static_cast(args[0]) op static_cast(args[1]); \ + } \ }; LOGICAL_BINARY_FUNCTOR(CudaOrFunctor, ||) @@ -68,10 +65,16 @@ class BinaryLogicalOpKernel } // namespace operators } // namespace paddle -#define REGISTER_LOGICAL_CUDA_KERNEL(op_name, func) \ - REGISTER_OP_CUDA_KERNEL( \ - op_name, \ - ops::BinaryLogicalOpKernel>); +#define REGISTER_LOGICAL_CUDA_KERNEL(op_name, func) \ + REGISTER_OP_CUDA_KERNEL( \ + op_name, \ + ops::BinaryLogicalOpKernel>, \ + ops::BinaryLogicalOpKernel>, \ + ops::BinaryLogicalOpKernel>, \ + ops::BinaryLogicalOpKernel>, \ + ops::BinaryLogicalOpKernel>, \ + ops::BinaryLogicalOpKernel>, \ + ops::BinaryLogicalOpKernel>); REGISTER_LOGICAL_CUDA_KERNEL(logical_or, CudaOrFunctor) REGISTER_LOGICAL_CUDA_KERNEL(logical_and, CudaAndFunctor) diff --git a/paddle/fluid/operators/controlflow/logical_op.h b/paddle/fluid/operators/controlflow/logical_op.h index 2c39201a426a25..92fe0a10cb907c 100644 --- a/paddle/fluid/operators/controlflow/logical_op.h +++ b/paddle/fluid/operators/controlflow/logical_op.h @@ -1,11 +1,8 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
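Note that the template arguments of the casts inside LOGICAL_BINARY_FUNCTOR have been lost in this excerpt; since Out is always a bool tensor, they are presumably static_cast<bool>. A small self-contained sketch of the resulting behaviour for non-bool element types (illustration only, not patch text):

// With the casts restored as static_cast<bool>, logical_and on arithmetic
// element types behaves like (x != 0) && (y != 0), and the result is bool.
template <typename T>
struct AndFunctorSketch {
  bool operator()(const T* args) const {
    return static_cast<bool>(args[0]) && static_cast<bool>(args[1]);
  }
};
// Example: float in[2] = {0.0f, 2.5f};  AndFunctorSketch<float>()(in) == false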
@@ -82,12 +79,36 @@ class UnaryLogicalOpKernel } // namespace operators } // namespace paddle -#define REGISTER_BINARY_LOGICAL_KERNEL(op_type, dev, functor) \ - REGISTER_OP_##dev##_KERNEL( \ - op_type, ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>); +#define REGISTER_BINARY_LOGICAL_KERNEL(op_type, dev, functor) \ + REGISTER_OP_##dev##_KERNEL( \ + op_type, ::paddle::operators::BinaryLogicalOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::BinaryLogicalOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::BinaryLogicalOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::BinaryLogicalOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::BinaryLogicalOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::BinaryLogicalOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::BinaryLogicalOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>); -#define REGISTER_UNARY_LOGICAL_KERNEL(op_type, dev, functor) \ - REGISTER_OP_##dev##_KERNEL( \ - op_type, ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>); +#define REGISTER_UNARY_LOGICAL_KERNEL(op_type, dev, functor) \ + REGISTER_OP_##dev##_KERNEL( \ + op_type, ::paddle::operators::UnaryLogicalOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::UnaryLogicalOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::UnaryLogicalOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::UnaryLogicalOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::UnaryLogicalOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::UnaryLogicalOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::UnaryLogicalOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>); diff --git a/paddle/fluid/operators/controlflow/logical_op_npu.cc b/paddle/fluid/operators/controlflow/logical_op_npu.cc index b452bee747232d..babdb2257ee3ca 100644 --- a/paddle/fluid/operators/controlflow/logical_op_npu.cc +++ b/paddle/fluid/operators/controlflow/logical_op_npu.cc @@ -1,11 +1,8 @@ /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
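The angle-bracketed template arguments in the registration macros above are likewise missing from this excerpt. Judging by the seven repeated kernel entries and the dtype list spelled out in the op comments (bool, int8, int16, int32, int64, float32, float64), each entry presumably instantiates the functor with one of those element types; a hedged reconstruction of the binary macro (names and exact argument lists are assumptions) would look roughly like:

#define REGISTER_BINARY_LOGICAL_KERNEL(op_type, dev, functor)       \
  REGISTER_OP_##dev##_KERNEL(                                        \
      op_type,                                                       \
      ::paddle::operators::BinaryLogicalOpKernel<                    \
          ::paddle::platform::dev##DeviceContext, functor<bool>>,    \
      ::paddle::operators::BinaryLogicalOpKernel<                    \
          ::paddle::platform::dev##DeviceContext, functor<int8_t>>,  \
      ::paddle::operators::BinaryLogicalOpKernel<                    \
          ::paddle::platform::dev##DeviceContext, functor<int16_t>>, \
      ::paddle::operators::BinaryLogicalOpKernel<                    \
          ::paddle::platform::dev##DeviceContext, functor<int>>,     \
      ::paddle::operators::BinaryLogicalOpKernel<                    \
          ::paddle::platform::dev##DeviceContext, functor<int64_t>>, \
      ::paddle::operators::BinaryLogicalOpKernel<                    \
          ::paddle::platform::dev##DeviceContext, functor<float>>,   \
      ::paddle::operators::BinaryLogicalOpKernel<                    \
          ::paddle::platform::dev##DeviceContext, functor<double>>);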
@@ -82,11 +79,29 @@ class LogicalAndPUKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL(logical_not, - ops::LogicalNotNPUKernel); +REGISTER_OP_NPU_KERNEL( + logical_not, ops::LogicalNotNPUKernel, + ops::LogicalNotNPUKernel, + ops::LogicalNotNPUKernel, + ops::LogicalNotNPUKernel, + ops::LogicalNotNPUKernel, + ops::LogicalNotNPUKernel, + ops::LogicalNotNPUKernel); REGISTER_OP_NPU_KERNEL(logical_or, - ops::LogicalOrNPUKernel); + ops::LogicalOrNPUKernel, + ops::LogicalOrNPUKernel, + ops::LogicalOrNPUKernel, + ops::LogicalOrNPUKernel, + ops::LogicalOrNPUKernel, + ops::LogicalOrNPUKernel, + ops::LogicalOrNPUKernel); REGISTER_OP_NPU_KERNEL(logical_and, - ops::LogicalAndPUKernel); + ops::LogicalAndPUKernel, + ops::LogicalAndPUKernel, + ops::LogicalAndPUKernel, + ops::LogicalAndPUKernel, + ops::LogicalAndPUKernel, + ops::LogicalAndPUKernel, + ops::LogicalAndPUKernel); diff --git a/paddle/fluid/operators/controlflow/logical_op_xpu.h b/paddle/fluid/operators/controlflow/logical_op_xpu.h index 9d46ad8c0447ff..aef6ae27a31945 100644 --- a/paddle/fluid/operators/controlflow/logical_op_xpu.h +++ b/paddle/fluid/operators/controlflow/logical_op_xpu.h @@ -45,7 +45,7 @@ class BinaryLogicalOpXPUKernel : public framework::OpKernel { auto* x = context.Input("X"); auto* y = context.Input("Y"); auto* out = context.Output("Out"); - T* out_ptr = out->mutable_data(context.GetPlace()); + bool* out_ptr = out->mutable_data(context.GetPlace()); const T* x_ptr = x->data(); const T* y_ptr = y->data(); auto& dev_ctx = @@ -153,7 +153,7 @@ class UnaryLogicalOpXPUKernel : public framework::OpKernel { if (x->numel() == 0) { return; } - out->mutable_data(context.GetPlace()); + out->mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); int ret = xpu::logical_not(dev_ctx.x_context(), x->data(), diff --git a/paddle/fluid/operators/controlflow/logicaland_op_xpu.cc b/paddle/fluid/operators/controlflow/logicaland_op_xpu.cc index 08927e66f25064..6248b6e0b06378 100644 --- a/paddle/fluid/operators/controlflow/logicaland_op_xpu.cc +++ b/paddle/fluid/operators/controlflow/logicaland_op_xpu.cc @@ -17,5 +17,11 @@ limitations under the License. */ namespace ops = paddle::operators; REGISTER_OP_XPU_KERNEL( logical_and, - ops::BinaryLogicalOpXPUKernel); + ops::BinaryLogicalOpXPUKernel, + ops::BinaryLogicalOpXPUKernel, + ops::BinaryLogicalOpXPUKernel, + ops::BinaryLogicalOpXPUKernel, + ops::BinaryLogicalOpXPUKernel, + ops::BinaryLogicalOpXPUKernel, + ops::BinaryLogicalOpXPUKernel); #endif diff --git a/paddle/fluid/operators/controlflow/logicalnot_op_xpu.cc b/paddle/fluid/operators/controlflow/logicalnot_op_xpu.cc old mode 100755 new mode 100644 index a8cef52ace2c60..be857db8aa9669 --- a/paddle/fluid/operators/controlflow/logicalnot_op_xpu.cc +++ b/paddle/fluid/operators/controlflow/logicalnot_op_xpu.cc @@ -15,5 +15,11 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/operators/controlflow/logical_op_xpu.h" namespace ops = paddle::operators; -REGISTER_OP_XPU_KERNEL(logicalnot, ops::UnaryLogicalOpXPUKernel); +REGISTER_OP_XPU_KERNEL(logicalnot, ops::UnaryLogicalOpXPUKernel, + ops::UnaryLogicalOpXPUKernel, + ops::UnaryLogicalOpXPUKernel, + ops::UnaryLogicalOpXPUKernel, + ops::UnaryLogicalOpXPUKernel, + ops::UnaryLogicalOpXPUKernel, + ops::UnaryLogicalOpXPUKernel); #endif diff --git a/paddle/fluid/operators/controlflow/logicalor_op_xpu.cc b/paddle/fluid/operators/controlflow/logicalor_op_xpu.cc index e99c2f1a181040..126596841a29f8 100644 --- a/paddle/fluid/operators/controlflow/logicalor_op_xpu.cc +++ b/paddle/fluid/operators/controlflow/logicalor_op_xpu.cc @@ -18,5 +18,11 @@ limitations under the License. */ namespace ops = paddle::operators; REGISTER_OP_XPU_KERNEL( logical_or, - ops::BinaryLogicalOpXPUKernel); + ops::BinaryLogicalOpXPUKernel, + ops::BinaryLogicalOpXPUKernel, + ops::BinaryLogicalOpXPUKernel, + ops::BinaryLogicalOpXPUKernel, + ops::BinaryLogicalOpXPUKernel, + ops::BinaryLogicalOpXPUKernel, + ops::BinaryLogicalOpXPUKernel); #endif diff --git a/paddle/fluid/operators/cumsum_op_npu.cc b/paddle/fluid/operators/cumsum_op_npu.cc new file mode 100644 index 00000000000000..e8cf1a46db3cca --- /dev/null +++ b/paddle/fluid/operators/cumsum_op_npu.cc @@ -0,0 +1,73 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the Licnse. 
*/ + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/cum_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class CumSumNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + int axis = ctx.Attr("axis"); + bool exclusive = ctx.Attr("exclusive"); + bool reverse = ctx.Attr("reverse"); + + out->mutable_data(ctx.GetPlace()); + + framework::NPUAttributeMap attr_input = { + {"axis", axis}, {"exclusive", exclusive}, {"reverse", reverse}}; + + auto stream = + ctx.template device_context() + .stream(); + + bool flatten = ctx.Attr("flatten"); + if (flatten) { + PADDLE_ENFORCE_EQ( + axis, -1, + platform::errors::InvalidArgument( + "when flatten is true, attr axis must be default %d, but got %d", + -1, axis)); + + Tensor new_x(x->type()); + new_x.ShareDataWith(*x); + + new_x.Resize(framework::make_ddim({x->numel()})); + + const auto& runner = NpuOpRunner("CumsumD", {new_x}, {*out}, attr_input); + runner.Run(stream); + } else { + const auto& runner = NpuOpRunner("CumsumD", {*x}, {*out}, attr_input); + runner.Run(stream); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_NPU_KERNEL( + cumsum, ops::CumSumNPUKernel, + ops::CumSumNPUKernel, + ops::CumSumNPUKernel); diff --git a/paddle/fluid/operators/dequantize_abs_max_op.cc b/paddle/fluid/operators/dequantize_abs_max_op.cc index c8bca25b6b0f0e..aee468e05e1826 100644 --- a/paddle/fluid/operators/dequantize_abs_max_op.cc +++ b/paddle/fluid/operators/dequantize_abs_max_op.cc @@ -50,6 +50,7 @@ struct DequantizeFunctor { }; template struct DequantizeFunctor; +template struct DequantizeFunctor; class DequantizeMaxAbsOp : public framework::OperatorWithKernel { public: @@ -79,7 +80,7 @@ class DequantizeMaxAbsOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", - "(int8 Tensor) The input with int8 type is the " + "(Int Tensor) The input with int8/16 type is the " "low precision tensor."); AddInput("Scale", "(float) The scale in quantization stage."); AddOutput("Out", @@ -108,4 +109,5 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL(dequantize_abs_max, - ops::DequantizeMaxAbsKernel); + ops::DequantizeMaxAbsKernel, + ops::DequantizeMaxAbsKernel); diff --git a/paddle/fluid/operators/dequantize_abs_max_op.cu b/paddle/fluid/operators/dequantize_abs_max_op.cu index 6554d4545ad312..e96835a1ea51cd 100644 --- a/paddle/fluid/operators/dequantize_abs_max_op.cu +++ b/paddle/fluid/operators/dequantize_abs_max_op.cu @@ -45,6 +45,7 @@ struct DequantizeFunctor { }; template struct DequantizeFunctor; +template struct DequantizeFunctor; } // namespace operators } // namespace paddle @@ -52,4 +53,5 @@ template struct DequantizeFunctor; namespace ops = paddle::operators; using CUDA = paddle::platform::CUDADeviceContext; REGISTER_OP_CUDA_KERNEL(dequantize_abs_max, - ops::DequantizeMaxAbsKernel); + ops::DequantizeMaxAbsKernel, + ops::DequantizeMaxAbsKernel); diff --git a/paddle/fluid/operators/gather_tree_op.cu b/paddle/fluid/operators/gather_tree_op.cu index c53f1e81cef54e..829682764a674d 100644 --- a/paddle/fluid/operators/gather_tree_op.cu +++ b/paddle/fluid/operators/gather_tree_op.cu @@ -50,6 +50,14 @@ class 
GatherTreeOpCUDAKernel : public framework::OpKernel { const auto *parents_data = parents->data(); auto *out_data = out->mutable_data(ctx.GetPlace()); + PADDLE_ENFORCE_NOT_NULL( + ids_data, platform::errors::InvalidArgument( + "Input(Ids) of gather_tree should not be null.")); + + PADDLE_ENFORCE_NOT_NULL( + parents_data, platform::errors::InvalidArgument( + "Input(Parents) of gather_tree should not be null.")); + auto &ids_dims = ids->dims(); int64_t max_length = ids_dims[0]; int64_t batch_size = ids_dims[1]; diff --git a/paddle/fluid/operators/gather_tree_op.h b/paddle/fluid/operators/gather_tree_op.h index 742a7ffcaae4c8..e035a30e7954fe 100644 --- a/paddle/fluid/operators/gather_tree_op.h +++ b/paddle/fluid/operators/gather_tree_op.h @@ -38,6 +38,14 @@ class GatherTreeOpKernel : public framework::OpKernel { auto batch_size = ids_dims[1]; auto beam_size = ids_dims[2]; + PADDLE_ENFORCE_NOT_NULL( + ids_data, platform::errors::InvalidArgument( + "Input(Ids) of gather_tree should not be null.")); + + PADDLE_ENFORCE_NOT_NULL( + parents_data, platform::errors::InvalidArgument( + "Input(Parents) of gather_tree should not be null.")); + for (int batch = 0; batch < batch_size; batch++) { for (int beam = 0; beam < beam_size; beam++) { auto idx = (max_length - 1) * batch_size * beam_size + diff --git a/paddle/fluid/operators/index_sample_op_npu.cc b/paddle/fluid/operators/index_sample_op_npu.cc new file mode 100644 index 00000000000000..f5a4100c635856 --- /dev/null +++ b/paddle/fluid/operators/index_sample_op_npu.cc @@ -0,0 +1,130 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/index_sample_op.h" + +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; + +template +class IndexSampleNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = + ctx.template device_context(); + auto* input = ctx.Input("X"); + auto* index = ctx.Input("Index"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + Tensor transformed_index; + const auto& index_type = index->type(); + bool index_type_match = index_type == framework::proto::VarType::INT32 || + index_type == framework::proto::VarType::INT64; + PADDLE_ENFORCE_EQ(index_type_match, true, + platform::errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + paddle::framework::DataTypeToString(index_type), + paddle::framework::DataTypeToString( + framework::proto::VarType::INT32), + paddle::framework::DataTypeToString( + framework::proto::VarType::INT64))); + if (index_type == framework::proto::VarType::INT32) { + transformed_index.mutable_data(index->dims(), + dev_ctx.GetPlace()); + const auto& cast_runner = NpuOpRunner( + "Cast", {*index}, {transformed_index}, {{"dst_type", ACL_INT64}}); + cast_runner.Run(dev_ctx.stream()); + } else { + transformed_index.ShareDataWith(*index); + } + + const auto& runner = NpuOpRunner( + "GatherElements", {*input, transformed_index}, {*out}, {{"dim", 1}}); + runner.Run(dev_ctx.stream()); + } +}; + +template +void IndexSampleGradScatter(const paddle::platform::NPUDeviceContext& dev_ctx, + const Tensor* index, const Tensor* out_grad, + Tensor* x_grad) { + auto index_dims = index->dims(); + auto input_dims = x_grad->dims(); + auto batch_size = input_dims[0]; + auto index_length = index_dims[1]; + + std::vector scatter_index_vec; + std::vector index_vec; + framework::TensorToVector(*index, dev_ctx, &index_vec); + for (auto i = 0; i < batch_size; ++i) { + for (auto j = 0; j < index_length; j++) { + scatter_index_vec.push_back(i); + scatter_index_vec.push_back(index_vec[i * index_length + j]); + } + } + Tensor scatter_index; + framework::TensorFromVector(scatter_index_vec, dev_ctx, &scatter_index); + scatter_index.Resize({batch_size, index_length, 2}); + + NpuOpRunner runner; + runner.SetType("ScatterNd") + .AddInput(scatter_index) + .AddInput(*out_grad) + .AddInput(framework::vectorize(x_grad->dims())) + .AddOutput(*x_grad); + runner.Run(dev_ctx.stream()); +} + +template +class IndexSampleGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = + ctx.template device_context(); + auto* index = ctx.Input("Index"); + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto* x_grad = + ctx.Output(framework::GradVarName("X")); + x_grad->mutable_data(ctx.GetPlace()); + + const auto& index_type = index->type(); + if (index_type == framework::proto::VarType::INT32) { + IndexSampleGradScatter(dev_ctx, index, out_grad, x_grad); + } else { + IndexSampleGradScatter(dev_ctx, index, out_grad, x_grad); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(index_sample, ops::IndexSampleNPUKernel, + ops::IndexSampleNPUKernel, + ops::IndexSampleNPUKernel, + ops::IndexSampleNPUKernel); +REGISTER_OP_NPU_KERNEL(index_sample_grad, + 
ops::IndexSampleGradNPUKernel, + ops::IndexSampleGradNPUKernel, + ops::IndexSampleGradNPUKernel, + ops::IndexSampleGradNPUKernel); diff --git a/paddle/fluid/operators/index_select_op.h b/paddle/fluid/operators/index_select_op.h index 70714b7f3a0644..04775107033adc 100644 --- a/paddle/fluid/operators/index_select_op.h +++ b/paddle/fluid/operators/index_select_op.h @@ -15,6 +15,10 @@ #pragma once #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/jit/macro.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cpu_info.h" namespace paddle { namespace operators { @@ -38,7 +42,6 @@ void IndexSelectInner(const framework::ExecutionContext& context, auto input_width = slice_size * input_dim[dim]; auto output_width = slice_size * output_dim[dim]; - auto outer_nums = 1; for (auto i = 0; i < dim; i++) { outer_nums *= input_dim[i]; @@ -77,7 +80,6 @@ void IndexSelectInner(const framework::ExecutionContext& context, for (auto i = 0; i < outer_nums; i++) { auto input_start_offset = i * input_width; auto output_start_offset = i * output_width; - for (auto j = 0; j < index_size; j++) { IndexT index_value = index_vec[j]; for (auto k = 0; k < slice_size; k++) { @@ -98,7 +100,6 @@ class IndexSelectKernel : public framework::OpKernel { auto* inputs_var = context.InputVar("X"); auto* index_var = context.InputVar("Index"); auto* output_var = context.OutputVar("Out"); - auto& inputs = inputs_var->Get(); auto& index = index_var->Get(); auto* output = output_var->GetMutable(); @@ -107,8 +108,8 @@ class IndexSelectKernel : public framework::OpKernel { if (dim < 0) { dim += inputs.dims().size(); } - const auto& index_type = index.type(); + bool index_type_match = index_type == framework::proto::VarType::INT32 || index_type == framework::proto::VarType::INT64; PADDLE_ENFORCE_EQ(index_type_match, true, @@ -129,19 +130,41 @@ class IndexSelectKernel : public framework::OpKernel { } }; -template +template +struct IndexSelectAdd { + void operator()(const framework::ExecutionContext& ctx, int slice_size, + const T* src_pointer, const T* p_pointer, T* dist_pointer) { + for (int i = 0; i < slice_size; i++) { + dist_pointer[i] = src_pointer[i] + p_pointer[i]; + } + } +}; +template +struct IndexSelectAdd< + DeviceContext, T, + typename std::enable_if::value>::type> { + void operator()(const framework::ExecutionContext& ctx, int slice_size, + const T* src_pointer, const T* p_pointer, T* dist_pointer) { + auto blas = math::GetBlas(ctx); + blas.VADD(slice_size, src_pointer, p_pointer, dist_pointer); + } +}; + +template void IndexSelectGradInner(const framework::ExecutionContext& context, - const LoDTensor& out_grad, const LoDTensor& index, + const LoDTensor* out_grad, const LoDTensor* index, LoDTensor* x_grad, int dim) { - std::vector input_vec; - std::vector index_vec; - TensorToVector(out_grad, context.device_context(), &input_vec); - TensorToVector(index, context.device_context(), &index_vec); - - auto input_dim = out_grad.dims(); + const T* input_data = out_grad->data(); + const IndexT* index_data = index->data(); + const T* p_output = x_grad->mutable_data(context.GetPlace()); + T* out_data = x_grad->mutable_data(context.GetPlace()); + auto input_dim = out_grad->dims(); auto input_dim_size = input_dim.size(); auto output_dim = x_grad->dims(); - std::vector out_vec(x_grad->numel(), 0); + + auto& dev_ctx = context.template device_context(); + math::SetConstant set_constant; + set_constant(dev_ctx, x_grad, 
static_cast(0.0)); auto slice_size = 1; for (auto i = dim + 1; i < input_dim_size; i++) { @@ -156,7 +179,7 @@ void IndexSelectGradInner(const framework::ExecutionContext& context, outer_nums *= input_dim[i]; } - auto index_size = index.dims()[0]; + auto index_size = index->dims()[0]; VLOG(3) << "Index_Select_Grad_Debug; outer_nums: " << outer_nums << "; slice_size: " << slice_size << "; input_width: " << input_width << "; output_width: " << output_width @@ -167,15 +190,14 @@ void IndexSelectGradInner(const framework::ExecutionContext& context, auto output_start_offset = i * output_width; for (auto j = 0; j < index_size; j++) { - IndexT index_value = index_vec[j]; - for (auto k = 0; k < slice_size; k++) { - out_vec[output_start_offset + index_value * slice_size + k] += - input_vec[input_start_offset + j * slice_size + k]; - } + IndexT index_value = index_data[j]; + auto src = input_data + input_start_offset + j * slice_size; + auto p_out = p_output + output_start_offset + index_value * slice_size; + auto dst = out_data + output_start_offset + index_value * slice_size; + IndexSelectAdd index_select_add; + index_select_add(context, slice_size, src, p_out, dst); } } - x_grad->mutable_data(context.GetPlace()); - framework::TensorFromVector(out_vec, context.device_context(), x_grad); x_grad->Resize(output_dim); } @@ -183,19 +205,18 @@ template class IndexSelectGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* index_var = context.InputVar("Index"); - auto* x_grad_var = context.OutputVar(framework::GradVarName("X")); - auto* out_grad_var = context.InputVar(framework::GradVarName("Out")); + auto* x_grad = + context.Output(framework::GradVarName("X")); + auto* index = context.Input("Index"); + auto* out_grad = + context.Input(framework::GradVarName("Out")); - auto& index = index_var->Get(); - auto& out_grad = out_grad_var->Get(); - auto* x_grad = x_grad_var->GetMutable(); int dim = context.Attr("dim"); if (dim < 0) { - dim += out_grad.dims().size(); + dim += out_grad->dims().size(); } + const auto& index_type = index->type(); - const auto& index_type = index.type(); bool index_type_match = index_type == framework::proto::VarType::INT32 || index_type == framework::proto::VarType::INT64; PADDLE_ENFORCE_EQ(index_type_match, true, @@ -209,9 +230,11 @@ class IndexSelectGradKernel : public framework::OpKernel { framework::proto::VarType::INT64))); if (index_type == framework::proto::VarType::INT32) { - IndexSelectGradInner(context, out_grad, index, x_grad, dim); + IndexSelectGradInner(context, out_grad, index, + x_grad, dim); } else if (index_type == framework::proto::VarType::INT64) { - IndexSelectGradInner(context, out_grad, index, x_grad, dim); + IndexSelectGradInner(context, out_grad, index, + x_grad, dim); } } }; diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc index 9a0ce3900acf1c..2f3217e628dd0e 100644 --- a/paddle/fluid/operators/lookup_table_op.cc +++ b/paddle/fluid/operators/lookup_table_op.cc @@ -229,6 +229,7 @@ REGISTER_OPERATOR(lookup_table_grad, ops::LookupTableOpGrad, REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel, ops::LookupTableKernel, ops::LookupTableKernel, + ops::LookupTableKernel, ops::LookupTableKernel); REGISTER_OP_CPU_KERNEL(lookup_table_grad, ops::LookupTableGradKernel, ops::LookupTableGradKernel, diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 6985b916757173..3edea025b2a044 100644 
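The rewritten IndexSelectGradInner above replaces the temporary-vector accumulation with in-place per-slice adds, and IndexSelectAdd simply dispatches that add to blas.VADD for float/double. The computation itself is a scatter-add over the selected indices; a NumPy sketch (illustrative helper names, not the kernel):

import numpy as np

def index_select_ref(x, index, dim):
    return np.take(x, index, axis=dim)

def index_select_grad_ref(x_shape, index, out_grad, dim):
    # slices of out_grad picked by `index` accumulate back into x_grad;
    # duplicate indices must add up, which is what the per-slice adds do
    x_grad = np.zeros(x_shape, dtype=out_grad.dtype)
    for j, target in enumerate(index):
        src = [slice(None)] * out_grad.ndim
        src[dim] = j
        dst = [slice(None)] * x_grad.ndim
        dst[dim] = target
        x_grad[tuple(dst)] += out_grad[tuple(src)]
    return x_grad

x = np.random.rand(4, 5).astype(np.float32)
index = np.array([0, 2, 2])
y = index_select_ref(x, index, dim=0)                         # shape (3, 5)
gx = index_select_grad_ref(x.shape, index, np.ones_like(y), dim=0)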
--- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -227,7 +227,8 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL(lookup_table, ops::LookupTableCUDAKernel, ops::LookupTableCUDAKernel, ops::LookupTableCUDAKernel, - ops::LookupTableCUDAKernel); + ops::LookupTableCUDAKernel, + ops::LookupTableCUDAKernel); REGISTER_OP_CUDA_KERNEL(lookup_table_grad, ops::LookupTableGradCUDAKernel, ops::LookupTableGradCUDAKernel, diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index e385d72d1f43fd..74e26626bd5285 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -103,6 +103,7 @@ class LookupTableKernel : public framework::OpKernel { if (id_index != -1) { if (input_data_type == framework::proto::VarType::INT8 || + input_data_type == framework::proto::VarType::INT16 || input_data_type == framework::proto::VarType::BF16) { memcpy(output + i * row_width, table + id_index * row_width, row_width * sizeof(T)); @@ -130,6 +131,7 @@ class LookupTableKernel : public framework::OpKernel { id_index)); if (input_data_type == framework::proto::VarType::INT8 || + input_data_type == framework::proto::VarType::INT16 || input_data_type == framework::proto::VarType::BF16) { memcpy(output + i * row_width, table + id_index * row_width, row_width * sizeof(T)); diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index eab513e24bc809..55151c5483a38b 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -54,6 +54,15 @@ struct CBlas { } }; +template <> +struct CBlas { + template + static void VCOPY(ARGS... args) { + PADDLE_THROW(platform::errors::Unimplemented( + "Blas VCOPY do not supported on CPU, please check your code")); + } +}; + template <> struct CBlas { template diff --git a/paddle/fluid/operators/math/concat_and_split.cc b/paddle/fluid/operators/math/concat_and_split.cc index 7df78b321de996..6c1ee863737011 100644 --- a/paddle/fluid/operators/math/concat_and_split.cc +++ b/paddle/fluid/operators/math/concat_and_split.cc @@ -40,18 +40,18 @@ class ConcatFunctor { const std::vector& input, int axis, framework::Tensor* output) { // TODO(zcd): Add input data validity checking - int num = input.size(); + size_t num = input.size(); - int rows = 1; + int64_t rows = 1; auto dim_0 = input[0].dims(); for (int i = 0; i < axis; ++i) { rows *= dim_0[i]; } - int out_rows = rows, out_cols = 0; + int64_t out_rows = rows, out_cols = 0; std::vector input_cols(input.size()); - for (int i = 0; i < num; ++i) { - int t_cols = input[i].numel() / rows; + for (size_t i = 0; i < num; ++i) { + int64_t t_cols = input[i].numel() / rows; out_cols += t_cols; input_cols[i] = t_cols; } @@ -59,11 +59,11 @@ class ConcatFunctor { // computation auto output_data = output->data(); - int col_idx = 0; - for (int j = 0; j < num; ++j) { - int col_len = input_cols[j]; + int64_t col_idx = 0; + for (size_t j = 0; j < num; ++j) { + int64_t col_len = input_cols[j]; auto input_data = input[j].data(); - for (int k = 0; k < out_rows; ++k) { + for (int64_t k = 0; k < out_rows; ++k) { memory::Copy(cpu_place, output_data + k * out_cols + col_idx, cpu_place, input_data + k * col_len, sizeof(T) * col_len); } diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index 58f936788a363e..f9cce061383939 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ 
b/paddle/fluid/operators/math/concat_and_split.cu @@ -26,9 +26,9 @@ namespace operators { namespace math { template -__global__ void ConcatKernel(const T** inputs, const int* input_cols, - int col_size, const int output_rows, - const int output_cols, T* output) { +__global__ void ConcatKernel(const T** inputs, const int64_t* input_cols, + int col_size, const int64_t output_rows, + const int64_t output_cols, T* output) { int tid_x = blockIdx.x * blockDim.x + threadIdx.x; int curr_segment = 0; int curr_offset = input_cols[0]; @@ -70,8 +70,8 @@ __device__ void ConcatKernelDetail(const T** inputs_data, template __global__ void ConcatKernel(const T* input_addr0, const T* input_addr1, - const int fixed_in_col, const int out_rows, - const int out_cols, T* output_data) { + const int64_t fixed_in_col, const int64_t out_rows, + const int64_t out_cols, T* output_data) { const T* inputs_data[2]; inputs_data[0] = input_addr0; inputs_data[1] = input_addr1; @@ -81,8 +81,8 @@ __global__ void ConcatKernel(const T* input_addr0, const T* input_addr1, template __global__ void ConcatKernel(const T* input_addr0, const T* input_addr1, - const T* input_addr2, const int fixed_in_col, - const int out_rows, const int out_cols, + const T* input_addr2, const int64_t fixed_in_col, + const int64_t out_rows, const int64_t out_cols, T* output_data) { const T* inputs_data[3]; inputs_data[0] = input_addr0; @@ -95,8 +95,8 @@ __global__ void ConcatKernel(const T* input_addr0, const T* input_addr1, template __global__ void ConcatKernel(const T* input_addr0, const T* input_addr1, const T* input_addr2, const T* input_addr3, - const int fixed_in_col, const int out_rows, - const int out_cols, T* output_data) { + const int64_t fixed_in_col, const int64_t out_rows, + const int64_t out_cols, T* output_data) { const T* inputs_data[4]; inputs_data[0] = input_addr0; inputs_data[1] = input_addr1; @@ -108,8 +108,8 @@ __global__ void ConcatKernel(const T* input_addr0, const T* input_addr1, template __global__ void ConcatKernel(const T** inputs_data, const int in_num, - const int fixed_in_col, const int out_rows, - const int out_cols, T* output_data) { + const int64_t fixed_in_col, const int64_t out_rows, + const int64_t out_cols, T* output_data) { ConcatKernelDetail(inputs_data, fixed_in_col, out_rows, out_cols, output_data); } @@ -235,19 +235,19 @@ class ConcatFunctor { framework::Tensor* output) { // TODO(zcd): Add input data validity checking int in_num = input.size(); - int in_row = 1; + int64_t in_row = 1; auto dim_0 = input[0].dims(); for (int i = 0; i < axis; ++i) { in_row *= dim_0[i]; } - int in_col = input[0].numel() / in_row; - int out_row = in_row, out_col = 0; + int64_t in_col = input[0].numel() / in_row; + int64_t out_row = in_row, out_col = 0; int inputs_col_num = in_num + 1; std::vector inputs_data_vec(in_num); - std::vector inputs_col_vec(inputs_col_num); + std::vector inputs_col_vec(inputs_col_num); const T** inputs_data = inputs_data_vec.data(); - int* inputs_col = inputs_col_vec.data(); + int64_t* inputs_col = inputs_col_vec.data(); // There are some differences between hip runtime and NV runtime. 
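The int to int64_t/size_t widening in the concat functors is about offsets, not loop style: the row and column counters are products of tensor dimensions and overflow 32-bit arithmetic for large inputs. Illustrative numbers (not taken from the PR):

rows, cols = 4, 600_000_000            # e.g. one [4, 6e8] input to concat
elements = rows * cols                 # 2_400_000_000
int32_max = 2**31 - 1                  # 2_147_483_647
assert elements > int32_max            # offsets such as k * out_cols + col_idx would wrap in int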
// In NV, when the pageable memory data less than 64K is transferred from @@ -263,13 +263,13 @@ class ConcatFunctor { inputs_data = reinterpret_cast(data_alloc->ptr()); col_alloc = memory::Alloc(platform::CUDAPinnedPlace(), inputs_col_num * sizeof(int)); - inputs_col = reinterpret_cast(col_alloc->ptr()); + inputs_col = reinterpret_cast(col_alloc->ptr()); #endif inputs_col[0] = 0; bool has_same_shape = true; for (int i = 0; i < in_num; ++i) { - int t_cols = input[i].numel() / in_row; + int64_t t_cols = input[i].numel() / in_row; if (has_same_shape) { if (t_cols != in_col) has_same_shape = false; } @@ -312,17 +312,19 @@ class ConcatFunctor { } } else { auto tmp_dev_ins_col_data = - memory::Alloc(context, inputs_col_num * sizeof(int)); + memory::Alloc(context, inputs_col_num * sizeof(int64_t)); memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), tmp_dev_ins_col_data->ptr(), platform::CPUPlace(), - static_cast(inputs_col), inputs_col_num * sizeof(int), - context.stream()); - int* dev_ins_col_data = static_cast(tmp_dev_ins_col_data->ptr()); + static_cast(inputs_col), + inputs_col_num * sizeof(int64_t), context.stream()); + int64_t* dev_ins_col_data = + static_cast(tmp_dev_ins_col_data->ptr()); ConcatKernel<<>>( dev_ins_data, dev_ins_col_data, static_cast(inputs_col_num), out_row, out_col, output->data()); } + #ifdef PADDLE_WITH_HIP // Prevent the pinned memory value from being covered and release the memory // after the launch kernel of the stream is executed (reapply pinned memory diff --git a/paddle/fluid/operators/matmul_op_npu.cc b/paddle/fluid/operators/matmul_op_npu.cc new file mode 100644 index 00000000000000..d5606177a55926 --- /dev/null +++ b/paddle/fluid/operators/matmul_op_npu.cc @@ -0,0 +1,185 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class MatMulNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + bool transpose_x = ctx.Attr("transpose_X"); + bool transpose_y = ctx.Attr("transpose_Y"); + + if (x->dims().size() == 2) { + out->mutable_data(ctx.GetPlace()); + + const auto& runner = NpuOpRunner( + "MatMul", {*x, *y}, {*out}, + {{"transpose_x1", transpose_x}, {"transpose_x2", transpose_y}}); + + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + + } else if (x->dims().size() > 2) { + out->mutable_data(ctx.GetPlace()); + + const auto& runner = + NpuOpRunner("BatchMatMul", {*x, *y}, {*out}, + {{"adj_x1", transpose_x}, {"adj_x2", transpose_y}}); + + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } + } +}; + +template +class MatMulGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + bool transpose_y = ctx.Attr("transpose_Y"); + auto stream = + ctx.template device_context() + .stream(); + + if (x->dims().size() == 2) { + if (transpose_y) { + if (dx) { + dx->mutable_data(ctx.GetPlace()); + const auto& runner_dx = + NpuOpRunner("MatMul", {*dout, *y}, {*dx}, + {{"transpose_x1", false}, {"transpose_x2", false}}); + + runner_dx.Run(stream); + } + if (dy) { + dy->mutable_data(ctx.GetPlace()); + const auto& runner_dy = + NpuOpRunner("MatMul", {*dout, *x}, {*dy}, + {{"transpose_x1", true}, {"transpose_x2", false}}); + + runner_dy.Run(stream); + } + + } else { + if (dx) { + dx->mutable_data(ctx.GetPlace()); + const auto& runner_dx = + NpuOpRunner("MatMul", {*dout, *y}, {*dx}, + {{"transpose_x1", false}, {"transpose_x2", true}}); + + runner_dx.Run(stream); + } + if (dy) { + dy->mutable_data(ctx.GetPlace()); + const auto& runner_dy = + NpuOpRunner("MatMul", {*x, *dout}, {*dy}, + {{"transpose_x1", true}, {"transpose_x2", false}}); + + runner_dy.Run(stream); + } + } + } else if (x->dims().size() > 2) { + if (transpose_y) { + if (dx) { + dx->mutable_data(ctx.GetPlace()); + const auto& runner_dx = + NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx}, + {{"adj_x1", false}, {"adj_x2", false}}); + + runner_dx.Run(stream); + } + if (dy) { + dy->mutable_data(ctx.GetPlace()); + const auto& runner_dy = + NpuOpRunner("BatchMatMul", {*dout, *x}, {*dy}, + {{"adj_x1", true}, {"adj_x2", false}}); + + runner_dy.Run(stream); + } + } else { + if (dx) { + dx->mutable_data(ctx.GetPlace()); + const auto& runner_dx = + NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx}, + {{"adj_x1", false}, {"adj_x2", true}}); + + runner_dx.Run(stream); + } + if (dy) { + dy->mutable_data(ctx.GetPlace()); + if ((x->dims().size() == 3) && (dout->dims().size() == 3) && + (dy->dims().size() == 2)) { + framework::Tensor dout_tmp; + dout_tmp.ShareDataWith(*dout); + std::vector vec_dim = + framework::vectorize(dout_tmp.dims()); + std::vector vec_dim_v{vec_dim[0] * vec_dim[1], vec_dim[2]}; + dout_tmp.Resize(framework::make_ddim(vec_dim_v)); + + 
framework::Tensor x_tmp; + x_tmp.ShareDataWith(*x); + std::vector vec_dim_x = + framework::vectorize(x_tmp.dims()); + std::vector vec_dim_x_v{vec_dim_x[0] * vec_dim_x[1], + vec_dim_x[2]}; + x_tmp.Resize(framework::make_ddim(vec_dim_x_v)); + const auto& runner_dy = + NpuOpRunner("MatMul", {x_tmp, dout_tmp}, {*dy}, + {{"transpose_x1", true}, {"transpose_x2", false}}); + runner_dy.Run(stream); + } else { + const auto& runner_dy = + NpuOpRunner("BatchMatMul", {*x, *dout}, {*dy}, + {{"adj_x1", true}, {"adj_x2", false}}); + runner_dy.Run(stream); + } + } + } + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + matmul, ops::MatMulNPUKernel, + ops::MatMulNPUKernel); +REGISTER_OP_NPU_KERNEL( + matmul_grad, + ops::MatMulGradNPUKernel, + ops::MatMulGradNPUKernel); diff --git a/paddle/fluid/operators/matmul_v2_op_npu.cc b/paddle/fluid/operators/matmul_v2_op_npu.cc index 3d77c177500e38..b23b408e9c59a7 100644 --- a/paddle/fluid/operators/matmul_v2_op_npu.cc +++ b/paddle/fluid/operators/matmul_v2_op_npu.cc @@ -140,20 +140,22 @@ class MatMulV2GradNPUKernel : public framework::OpKernel { dy->mutable_data(ctx.GetPlace()); if ((x->dims().size() == 3) && (dout->dims().size() == 3) && (dy->dims().size() == 2)) { - framework::Tensor dout_; - dout_.ShareDataWith(*dout); - std::vector vec_dim = framework::vectorize(dout_.dims()); + framework::Tensor dout_tmp; + dout_tmp.ShareDataWith(*dout); + std::vector vec_dim = + framework::vectorize(dout_tmp.dims()); std::vector vec_dim_v{vec_dim[0] * vec_dim[1], vec_dim[2]}; - dout_.Resize(framework::make_ddim(vec_dim_v)); + dout_tmp.Resize(framework::make_ddim(vec_dim_v)); - framework::Tensor x_; - x_.ShareDataWith(*x); - std::vector vec_dim_x = framework::vectorize(x_.dims()); + framework::Tensor x_tmp; + x_tmp.ShareDataWith(*x); + std::vector vec_dim_x = + framework::vectorize(x_tmp.dims()); std::vector vec_dim_x_v{vec_dim_x[0] * vec_dim_x[1], vec_dim_x[2]}; - x_.Resize(framework::make_ddim(vec_dim_x_v)); + x_tmp.Resize(framework::make_ddim(vec_dim_x_v)); const auto& runner_dy = - NpuOpRunner("MatMul", {x_, dout_}, {*dy}, + NpuOpRunner("MatMul", {x_tmp, dout_tmp}, {*dy}, {{"transpose_x1", true}, {"transpose_x2", false}}); runner_dy.Run(stream); } else { diff --git a/paddle/fluid/operators/memcpy_op.h b/paddle/fluid/operators/memcpy_op.h index 63a41cc7237310..ecd266858024e0 100644 --- a/paddle/fluid/operators/memcpy_op.h +++ b/paddle/fluid/operators/memcpy_op.h @@ -51,17 +51,14 @@ class MemcpyFunctor { } else if (dst_place_type_ == 1) { framework::TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_, &out_tensor); - } + } else if (dst_place_type_ == 0) { + framework::TensorCopySync(lod_tensor, platform::CPUPlace(), &out_tensor); #ifdef PADDLE_WITH_ASCEND_CL - else if (dst_place_type_ == 0) { // NOLINT - framework::TensorCopy(lod_tensor, platform::CPUPlace(), dev_ctx_, - &out_tensor); } else if (dst_place_type_ == 4) { framework::TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_, &out_tensor); - } #endif - else { // NOLINT + } else { PADDLE_THROW(platform::errors::Unimplemented( "memcpy dst_place_type: %d is not supported yet.", dst_place_type_)); } diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index 177e539c4b6c29..3b92d2e2d88913 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -251,7 +251,9 @@ namespace ops = paddle::operators; 
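Both matmul_op_npu.cc above and the matching matmul_v2_op_npu.cc hunk handle the case where x and dout are 3-D but dy is 2-D by flattening the batch dimension and issuing one transposed MatMul instead of a BatchMatMul plus a reduction. A quick NumPy check of why the two are equivalent (shapes are made up for illustration):

import numpy as np

B, M, K, N = 4, 3, 5, 2
x = np.random.rand(B, M, K)
dout = np.random.rand(B, M, N)

# gradient of a shared 2-D y in out[b] = x[b] @ y is the sum over the batch
dy_batched = sum(x[b].T @ dout[b] for b in range(B))

# the kernel instead reshapes to 2-D and runs a single transposed MatMul
dy_flat = x.reshape(B * M, K).T @ dout.reshape(B * M, N)

assert np.allclose(dy_batched, dy_flat)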
ops::MKLDNNActivationKernel>); \ REGISTER_OP_KERNEL( \ act_type##_grad, MKLDNN, ::paddle::platform::CPUPlace, \ - ops::MKLDNNActivationGradKernel>); + ops::MKLDNNActivationGradKernel>, \ + ops::MKLDNNActivationGradKernel< \ + ops::grad_functor>); #define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro) \ __macro(relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \ @@ -259,7 +261,6 @@ namespace ops = paddle::operators; __macro(leaky_relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \ __macro(swish, SwishMKLDNNFunctor, SwishMKLDNNGradFunctor); \ __macro(hardswish, HardSwishMKLDNNFunctor, HardSwishMKLDNNGradFunctor); \ - __macro(sigmoid, SigmoidMKLDNNFunctor, SigmoidMKLDNNGradFunctor); \ __macro(tanh, TanhMKLDNNFunctor, TanhMKLDNNGradFunctor); \ __macro(sqrt, SqrtMKLDNNFunctor, SqrtMKLDNNGradFunctor); \ __macro(abs, AbsMKLDNNFunctor, AbsMKLDNNGradFunctor); @@ -267,3 +268,5 @@ namespace ops = paddle::operators; FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL); REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(gelu, GeluMKLDNNFunctor, GeluMKLDNNGradFunctor); +REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(sigmoid, SigmoidMKLDNNFunctor, + SigmoidMKLDNNGradFunctor); diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc index edc75bda4abdf7..130e10a1f8de30 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cc +++ b/paddle/fluid/operators/optimizers/adam_op.cc @@ -122,7 +122,8 @@ framework::OpKernelType AdamOp::GetExpectedKernelType( framework::OpKernelType AdamOp::GetKernelTypeForVar( const std::string &var_name, const framework::Tensor &tensor, const framework::OpKernelType &expected_kernel_type) const { - if (var_name == "Beta1Pow" || var_name == "Beta2Pow") { + if (var_name == "Beta1Pow" || var_name == "Beta2Pow" || + var_name == "SkipUpdate") { return expected_kernel_type; } else { return framework::OpKernelType(expected_kernel_type.data_type_, diff --git a/paddle/fluid/operators/optimizers/adam_op_npu.cc b/paddle/fluid/operators/optimizers/adam_op_npu.cc index 8b33dc64c4e4f0..d0de480c1a0ccc 100644 --- a/paddle/fluid/operators/optimizers/adam_op_npu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_npu.cc @@ -141,7 +141,7 @@ class AdamNPUKernel : public framework::OpKernel { if (ctx.HasInput("Beta2Tensor")) { beta2_tensor = ctx.Input("Beta2Tensor"); - PADDLE_ENFORCE_EQ(beta1_tensor->numel(), 1, + PADDLE_ENFORCE_EQ(beta2_tensor->numel(), 1, platform::errors::InvalidArgument( "Input(Beta2Tensor) size must be 1, but get %d", beta2_tensor->numel())); diff --git a/paddle/fluid/operators/sampling_id_op_npu.cc b/paddle/fluid/operators/sampling_id_op_npu.cc new file mode 100644 index 00000000000000..162403595b6a67 --- /dev/null +++ b/paddle/fluid/operators/sampling_id_op_npu.cc @@ -0,0 +1,19 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/sampling_id_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_NPU_KERNEL(sampling_id, paddle::operators::SamplingIdKernel, + paddle::operators::SamplingIdKernel); diff --git a/paddle/fluid/operators/slice_op.h b/paddle/fluid/operators/slice_op.h index 96b8ea11d6845e..658939a91f39a7 100644 --- a/paddle/fluid/operators/slice_op.h +++ b/paddle/fluid/operators/slice_op.h @@ -391,17 +391,7 @@ class SliceGradKernel : public framework::OpKernel { } } - if (need_pad_num == 0) { - // do not need padding, pass if data address same, else copy - if (d_input->mutable_data(context.GetPlace()) == d_out->data()) { - // inplace, do not any operator, pass - } else { - framework::TensorCopy( - *d_out, context.GetPlace(), - context.template device_context(), - d_input); - } - } else if (need_pad_num == 1) { + if (need_pad_num == 1) { // only need padding one dimension, we can reduce dimension. // only the padding dimension is available for us. // How to reduce dimension(5 to 3 for example): diff --git a/paddle/fluid/platform/gpu_launch_config.h b/paddle/fluid/platform/gpu_launch_config.h index 4da91b4e764a52..a82262419066fa 100644 --- a/paddle/fluid/platform/gpu_launch_config.h +++ b/paddle/fluid/platform/gpu_launch_config.h @@ -41,7 +41,7 @@ struct GpuLaunchConfig { }; inline GpuLaunchConfig GetGpuLaunchConfig1D( - const platform::CUDADeviceContext& context, int element_count, + const platform::CUDADeviceContext& context, int64_t element_count, #ifdef PADDLE_WITH_HIP // HIP will throw GPU memory access fault if threads > 256 int max_threads = 256) { diff --git a/paddle/fluid/platform/stream/CMakeLists.txt b/paddle/fluid/platform/stream/CMakeLists.txt index e1e3e49ce9cbc0..cf219caa9f5c9c 100644 --- a/paddle/fluid/platform/stream/CMakeLists.txt +++ b/paddle/fluid/platform/stream/CMakeLists.txt @@ -1,5 +1,11 @@ +IF(WITH_MKLDNN) + set(MKLDNN_CTX_DEPS mkldnn) +ELSE() + set(MKLDNN_CTX_DEPS) +ENDIF() + IF(WITH_GPU OR WITH_ROCM) -cc_library(cuda_stream SRCS cuda_stream.cc DEPS enforce boost) +cc_library(cuda_stream SRCS cuda_stream.cc DEPS enforce boost ${MKLDNN_CTX_DEPS}) ENDIF() IF(WITH_ASCEND_CL) diff --git a/paddle/fluid/pybind/cuda_streams_py.cc b/paddle/fluid/pybind/cuda_streams_py.cc index df63239cad6d69..21c6e0a4f28caa 100644 --- a/paddle/fluid/pybind/cuda_streams_py.cc +++ b/paddle/fluid/pybind/cuda_streams_py.cc @@ -164,8 +164,7 @@ void BindCudaStream(py::module *m_ptr) { [](paddle::platform::stream::CUDAStream &self, paddle::platform::CudaEvent *event) { if (event == nullptr) { - auto event_tmp = paddle::platform::CudaEvent(); - event = &event_tmp; + event = new paddle::platform::CudaEvent(); } event->Record(self); return event; diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 619301e3b45d31..7b99c7df188f35 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -432,19 +432,24 @@ static void ParseIndexingSlice(framework::LoDTensor *tensor, PyObject *_index, const auto &shape = tensor->dims(); const int rank = shape.size(); const int size = PyTuple_GET_SIZE(index); + + // specified_dims is the number of dimensions which indexed by Interger, + // Slices. 
+ int specified_dims = 0; + for (int dim = 0; dim < size; ++dim) { + PyObject *slice_item = PyTuple_GetItem(index, dim); + if (PyCheckInteger(slice_item) || PySlice_Check(slice_item)) { + specified_dims++; + } + } + PADDLE_ENFORCE_EQ( size <= rank, true, platform::errors::InvalidArgument( "too many indices (%d) for tensor of dimension %d", size, rank)); - for (int dim = 0; dim < size; ++dim) { - PyObject *slice_item = PyTuple_GetItem(index, dim); - PADDLE_ENFORCE_EQ(PyCheckInteger(slice_item) || PySlice_Check(slice_item), - true, - platform::errors::InvalidArgument( - "Currently, VarBase.__getitem__() only allows " - "indexing by Integers, Slices, and tuples of " - "these types, but received %s in %dth slice item", - std::string(Py_TYPE(slice_item)->tp_name), dim + 1)); + for (int i = 0, dim = 0; i < size; ++i) { + PyObject *slice_item = PyTuple_GetItem(index, i); + infer_flags->push_back(1); int dim_len = shape[dim]; if (PyCheckInteger(slice_item)) { @@ -467,7 +472,8 @@ static void ParseIndexingSlice(framework::LoDTensor *tensor, PyObject *_index, slice_ends->push_back(start + 1); slice_strides->push_back(1); decrease_axis->push_back(dim); - } else { + dim++; + } else if (PySlice_Check(slice_item)) { // slice item Py_ssize_t start, end, step; PySliceObject *p = reinterpret_cast(slice_item); @@ -475,12 +481,22 @@ static void ParseIndexingSlice(framework::LoDTensor *tensor, PyObject *_index, // :: or : or 0:dim_len:1 if (start == 0 && end == dim_len && step == 1) { + dim++; continue; } slice_axes->push_back(dim); slice_starts->push_back(start); slice_ends->push_back(end); slice_strides->push_back(step); + dim++; + } else if (slice_item == Py_Ellipsis) { + dim += rank - specified_dims; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Currently, VarBase.__getitem__() only allows " + "indexing by Integers, Slices, Ellipsis, and tuples of " + "these types, but received %s in %dth slice item", + std::string(Py_TYPE(slice_item)->tp_name), i + 1)); } } if (!PyTuple_Check(_index)) Py_DecRef(index); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 4286b9092c2395..d646e06d8a47a9 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -78,7 +78,6 @@ limitations under the License. 
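The reworked ParseIndexingSlice above first counts specified_dims, the number of dimensions explicitly indexed by an integer or a slice, so that an Ellipsis can stand in for the remaining rank - specified_dims dimensions. The same bookkeeping in plain Python (a hypothetical helper, not the pybind code):

def expand_ellipsis(index, rank):
    # ints and slices consume one dimension each; Ellipsis skips the rest
    specified_dims = sum(1 for item in index if isinstance(item, (int, slice)))
    mapping, dim = [], 0
    for item in index:
        if item is Ellipsis:
            dim += rank - specified_dims
        else:
            mapping.append((dim, item))
            dim += 1
    return mapping

# (0, ..., slice(1, 3)) on a rank-4 tensor: the integer hits dim 0,
# the ellipsis covers dims 1 and 2, the slice lands on dim 3
print(expand_ellipsis((0, Ellipsis, slice(1, 3)), rank=4))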
*/ #include "paddle/fluid/pybind/box_helper_py.h" #include "paddle/fluid/pybind/compatible.h" #include "paddle/fluid/pybind/const_value.h" -#include "paddle/fluid/pybind/cuda_streams_py.h" #include "paddle/fluid/pybind/data_set_py.h" #include "paddle/fluid/pybind/exception.h" #include "paddle/fluid/pybind/fleet_wrapper_py.h" @@ -240,6 +239,7 @@ OpSupportedInfos(const std::string &place, {"GPU", &platform::is_gpu_place}, {"CPU", &platform::is_cpu_place}, {"XPU", &platform::is_xpu_place}, + {"NPU", &platform::is_npu_place}, }; PADDLE_ENFORCE_NE( is_target_place.count(query_place), 0, diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index bebcfe64406d9e..62d30a50d6be4d 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -324,14 +324,17 @@ if %day_now% NEQ %day_before% ( echo %day_now% > %cache_dir%\day.txt type %cache_dir%\day.txt if %day_now% EQU 21 ( + del D:\sccache\sccache_log.txt rmdir %cache_dir%\third_party_GPU /s/q rmdir %cache_dir%\third_party /s/q ) if %day_now% EQU 11 ( + del D:\sccache\sccache_log.txt rmdir %cache_dir%\third_party_GPU /s/q rmdir %cache_dir%\third_party /s/q ) if %day_now% EQU 01 ( + del D:\sccache\sccache_log.txt rmdir %cache_dir%\third_party_GPU /s/q rmdir %cache_dir%\third_party /s/q ) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 6b19e154c721e7..fb6496e8d6c656 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -225,7 +225,11 @@ function cmake_base() { -DLITE_GIT_TAG=release/v2.8 -DWITH_UNITY_BUILD=${WITH_UNITY_BUILD:-OFF} -DWITH_XPU_BKCL=${WITH_XPU_BKCL:-OFF} + -DWITH_ARM=${WITH_ARM:-OFF} + -DWITH_ASCEND=${WITH_ASCEND:-OFF} + -DWITH_ASCEND_CL=${WITH_ASCEND_CL:-OFF} -DWITH_STRIP=${WITH_STRIP:-ON} + -DON_INFER=${ON_INFER:-OFF} ======================================== EOF # Disable UNITTEST_USE_VIRTUALENV in docker because @@ -262,7 +266,11 @@ EOF -DXPU_SDK_ROOT=${XPU_SDK_ROOT:-""} \ -DWITH_LITE=${WITH_LITE:-OFF} \ -DWITH_XPU_BKCL=${WITH_XPU_BKCL:-OFF} \ + -DWITH_ARM=${WITH_ARM:-OFF} \ + -DWITH_ASCEND=${WITH_ASCEND:-OFF} \ + -DWITH_ASCEND_CL=${WITH_ASCEND_CL:-OFF} \ -DWITH_STRIP=${WITH_STRIP:-ON} \ + -DON_INFER=${ON_INFER:-OFF} \ -DWITH_UNITY_BUILD=${WITH_UNITY_BUILD:-OFF};build_error=$? if [ "$build_error" != 0 ];then exit 7; @@ -343,7 +351,11 @@ function build_base() { # reset ccache zero stats for collect PR's actual hit rate ccache -z - make install -j ${parallel_number};build_error=$? + if [ "$WITH_ARM" == "ON" ];then + make TARGET=ARMV8 -j ${parallel_number};build_error=$? + else + make install -j ${parallel_number};build_error=$? + fi # ci will collect ccache hit rate collect_ccache_hits @@ -816,20 +828,25 @@ function check_approvals_of_unittest() { curBuildSize=$(du -m --max-depth=0 ${PADDLE_ROOT}/build/paddle_inference_install_dir/paddle/lib/libpaddle_inference.so |awk '{print $1}') apt-get install -y bc diffSize=$(printf "%.2f" `echo "$curBuildSize - $oriBuildSize" | bc`) + AllDiffSize=$(printf "%.2f" `echo "$diffSize * 4" | bc`) cat < 0, "num_iters must be greater than 0!" 
epochs = (num_iters // steps) + 1 steps = min(num_iters, steps) @@ -1744,8 +1743,8 @@ def fit(self, eval_logs = self._run_one_epoch(eval_loader, cbks, 'eval') cbks.on_end('eval', eval_logs) - if self.stop_training: - break + if self.stop_training: + break cbks.on_end('train', logs) self._test_dataloader = None @@ -1832,7 +1831,8 @@ def evaluate(self, eval_steps = self._len_data_loader(eval_loader) self.num_iters = num_iters - if num_iters is not None and isinstance(num_iters, int): + if num_iters is not None and isinstance(num_iters, int) and isinstance( + eval_steps, int): assert num_iters > 0, "num_iters must be greater than 0!" eval_steps = min(num_iters, eval_steps) self.num_iters = eval_steps @@ -2094,7 +2094,9 @@ def _run_one_epoch( callbacks.on_batch_end(mode, step, logs) if hasattr(self, 'num_iters') and self.num_iters is not None: self.num_iters -= 1 - if self.num_iters == 0: + if self.num_iters <= 0: + self.stop_training = True + del self.num_iters break self._reset_metrics() diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 5fe17e8c193e3e..8f094877e74b67 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -15,6 +15,11 @@ # TODO: import all neural network related api under this directory, # including layers, linear, conv, rnn etc. +from ..fluid.dygraph.layers import Layer # noqa: F401 +from ..fluid.dygraph.container import LayerList # noqa: F401 +from ..fluid.dygraph.container import ParameterList # noqa: F401 +from ..fluid.dygraph.container import Sequential # noqa: F401 + from .clip import ClipGradByGlobalNorm # noqa: F401 from .clip import ClipGradByNorm # noqa: F401 from .clip import ClipGradByValue # noqa: F401 @@ -130,10 +135,6 @@ # TODO: remove loss, keep it for too many used in unitests from .layer import loss # noqa: F401 -from ..fluid.dygraph.layers import Layer # noqa: F401 -from ..fluid.dygraph.container import LayerList # noqa: F401 -from ..fluid.dygraph.container import ParameterList # noqa: F401 -from ..fluid.dygraph.container import Sequential # noqa: F401 from . import utils # noqa: F401 from . 
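The num_iters handling added to Model.fit above treats the argument as a global step budget rather than a per-epoch one: enough epochs are scheduled to cover the budget, and a counter decremented per batch sets stop_training once it runs out. Roughly (an illustrative sketch, not the hapi code):

steps, num_iters = 100, 250              # steps per epoch, requested budget

epochs = num_iters // steps + 1          # 3 epochs are enough to cover 250 steps
steps = min(num_iters, steps)            # a single epoch never exceeds the budget

remaining = num_iters
for epoch in range(epochs):
    for step in range(steps):
        remaining -= 1                   # mirrors `self.num_iters -= 1` per batch
        if remaining <= 0:               # here the patch sets stop_training and breaks
            break
    if remaining <= 0:
        break
# batches actually run: 100 + 100 + 50 = 250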
import functional # noqa: F401 diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index 057797ff962b42..4bc137222d2efa 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -16,18 +16,21 @@ import paddle from ...fluid.framework import in_dygraph_mode, default_main_program from paddle.fluid.layer_helper import LayerHelper -from paddle.fluid.layers.tensor import Variable, fill_constant, zeros, concat +from paddle.fluid.layers.tensor import fill_constant +from ...tensor import concat +from ...tensor.creation import zeros +from paddle.static import Variable from ...fluid.layers import core from ...fluid import dygraph_utils # TODO: define the common functions to build a neural network from ...fluid.layers import unfold # noqa: F401 -from ...fluid.layers import squeeze -from ...fluid.layers import unsqueeze +from ...tensor.manipulation import squeeze +from ...tensor.manipulation import unsqueeze from ...tensor import clip from ...tensor import sum from ...tensor import sqrt from ...fluid.data_feeder import check_variable_and_dtype, check_dtype -from ...fluid.framework import Variable, in_dygraph_mode, _varbase_creator +from ...fluid.framework import in_dygraph_mode, _varbase_creator from ...fluid.framework import in_dygraph_mode from ...fluid import core, dygraph_utils @@ -927,9 +930,9 @@ def get_attrs(prog, dropout_prob, is_test, seed): keep_prob = 1 - p if training: if p == 1.: - return layers.scale(x, scale=0.) + return paddle.scale(x, scale=0.) - scale_input = layers.scale( + scale_input = paddle.scale( x, scale=1 / keep_prob) if mode == 'upscale_in_train' else x #get mask shape @@ -947,17 +950,17 @@ def get_attrs(prog, dropout_prob, is_test, seed): mask_shape[i] = input_shape[i] #get mask - random_tensor = layers.uniform_random( + random_tensor = paddle.uniform( mask_shape, dtype='float32', min=0., max=1.0) p = layers.fill_constant(shape=[1], dtype='float32', value=p) - keep_mask = layers.greater_equal(random_tensor, p) + keep_mask = paddle.greater_equal(random_tensor, p) - scale_input = layers.cast(scale_input, dtype) - keep_mask = layers.cast(keep_mask, dtype) + scale_input = paddle.cast(scale_input, dtype) + keep_mask = paddle.cast(keep_mask, dtype) ret = paddle.multiply(scale_input, keep_mask, name=name) return ret else: # test - ret = layers.scale( + ret = paddle.scale( x, scale=keep_prob) if mode == 'downscale_in_infer' else x return ret @@ -1113,7 +1116,7 @@ def alpha_dropout(x, p=0.5, training=True, name=None): if training: if p == 1: - return layers.scale(x, scale=0.) + return paddle.scale(x, scale=0.) 
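The dropout body above now builds its mask from public paddle ops (paddle.uniform, paddle.greater_equal, paddle.cast, paddle.multiply); the math is unchanged. For reference, the two modes reduce to the following NumPy sketch (illustrative only; p == 1 is special-cased to a zero output in the real code):

import numpy as np

def dropout_ref(x, p, training=True, mode="upscale_in_train", seed=0):
    keep_prob = 1.0 - p
    if training:
        rng = np.random.default_rng(seed)
        # keep positions where a uniform sample is >= p, as paddle.greater_equal does
        keep_mask = (rng.uniform(size=x.shape) >= p).astype(x.dtype)
        scaled = x / keep_prob if mode == "upscale_in_train" else x
        return scaled * keep_mask
    # at inference only downscale_in_infer rescales the input
    return x * keep_prob if mode == "downscale_in_infer" else x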
#get transformation params alpha = 1.6732632423543772848170429916717 scale = 1.0507009873554804934193349852946 @@ -1125,23 +1128,22 @@ def alpha_dropout(x, p=0.5, training=True, name=None): input_shape = x.shape #get mask - random_tensor = layers.uniform_random( + random_tensor = paddle.uniform( input_shape, dtype='float32', min=0., max=1.0) p = layers.fill_constant(shape=[1], dtype='float32', value=p) - keep_mask = layers.greater_equal(random_tensor, p) - keep_mask = layers.cast(keep_mask, dtype) - drop_mask = layers.elementwise_sub( + keep_mask = paddle.greater_equal(random_tensor, p) + keep_mask = paddle.cast(keep_mask, dtype) + drop_mask = paddle.subtract( layers.fill_constant( shape=input_shape, dtype=dtype, value=1.), keep_mask) #apply mask b = layers.fill_constant(shape=[1], dtype=dtype, value=b) - y = layers.elementwise_add( - paddle.multiply(x, keep_mask), - layers.scale( - drop_mask, scale=alpha_p)) - res = layers.elementwise_add(layers.scale(y, scale=a), b, name=name) + y = paddle.add(paddle.multiply(x, keep_mask), + paddle.scale( + drop_mask, scale=alpha_p)) + res = paddle.add(paddle.scale(y, scale=a), b, name=name) return res else: # test return x @@ -1277,42 +1279,42 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): if x_dim == 3: pad = concat([zeros((4, ), dtype="int32"), pad], axis=0) unsqueezed_dim = [3, 4] - x = unsqueeze(x, axes=unsqueezed_dim) + x = unsqueeze(x, axis=unsqueezed_dim) elif x_dim == 4: pad = concat([pad, zeros((2, ), dtype="int32")], axis=0) unsqueezed_dim = [2] - x = unsqueeze(x, axes=unsqueezed_dim) + x = unsqueeze(x, axis=unsqueezed_dim) elif data_format in ["NLC", "NHWC", "NDHWC"]: data_format = "NDHWC" if x_dim == 3: pad = concat([zeros((4, ), dtype="int32"), pad], axis=0) unsqueezed_dim = [2, 3] - x = unsqueeze(x, axes=unsqueezed_dim) + x = unsqueeze(x, axis=unsqueezed_dim) elif x_dim == 4: pad = concat([pad, zeros((2, ), dtype="int32")], axis=0) unsqueezed_dim = [1] - x = unsqueeze(x, axes=unsqueezed_dim) + x = unsqueeze(x, axis=unsqueezed_dim) else: if data_format in ["NCL", "NCHW", "NCDHW"]: data_format = "NCDHW" if x_dim == 3: pad = [0, 0, 0, 0] + pad unsqueezed_dim = [3, 4] - x = unsqueeze(x, axes=unsqueezed_dim) + x = unsqueeze(x, axis=unsqueezed_dim) elif x_dim == 4: pad = pad + [0, 0] unsqueezed_dim = [2] - x = unsqueeze(x, axes=unsqueezed_dim) + x = unsqueeze(x, axis=unsqueezed_dim) elif data_format in ["NLC", "NHWC", "NDHWC"]: data_format = "NDHWC" if x_dim == 3: pad = [0, 0, 0, 0] + pad unsqueezed_dim = [2, 3] - x = unsqueeze(x, axes=unsqueezed_dim) + x = unsqueeze(x, axis=unsqueezed_dim) elif x_dim == 4: pad = pad + [0, 0] unsqueezed_dim = [1] - x = unsqueeze(x, axes=unsqueezed_dim) + x = unsqueeze(x, axis=unsqueezed_dim) if in_dygraph_mode(): if isinstance(pad, Variable): @@ -1336,7 +1338,7 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): type='pad3d', inputs=inputs, outputs={"Out": out}, attrs=attrs) if len(unsqueezed_dim) != 0: - out = squeeze(out, axes=unsqueezed_dim) + out = squeeze(out, axis=unsqueezed_dim) return out diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index bdbfa5877a789a..319248dfda2fab 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -16,13 +16,17 @@ import numpy as np from ...device import get_cudnn_version -from ...fluid.framework import Variable, in_dygraph_mode +from ...fluid.framework import in_dygraph_mode +from ...static import Variable from ...fluid import core, 
dygraph_utils, get_flags -from ...fluid.layers import nn, utils +from ...fluid.layers.utils import convert_to_list, _is_symmetric_padding from ...fluid.data_feeder import check_variable_and_dtype -from ...fluid.param_attr import ParamAttr +from ...framework import ParamAttr from ...fluid.layer_helper import LayerHelper from paddle import _C_ops +from ...tensor.manipulation import unsqueeze, squeeze +from ...tensor.math import add +from ...fluid.layers import nn __all__ = [] @@ -69,24 +73,24 @@ def _update_padding_nd(padding, channel_last, num_dims): padding_algorithm = "EXPLICIT" padding = _exclude_padding_in_batch_and_channel(padding, channel_last) - if utils._is_symmetric_padding(padding, num_dims): + if _is_symmetric_padding(padding, num_dims): padding = padding[0::2] # for padding like [pad_before, pad_after, pad_before, pad_after, ...] elif len(padding) == 2 * num_dims and isinstance(padding[0], int): padding_algorithm = "EXPLICIT" - padding = utils.convert_to_list(padding, 2 * num_dims, 'padding') - if utils._is_symmetric_padding(padding, num_dims): + padding = convert_to_list(padding, 2 * num_dims, 'padding') + if _is_symmetric_padding(padding, num_dims): padding = padding[0::2] # for padding like [pad_d1, pad_d2, ...] elif len(padding) == num_dims and isinstance(padding[0], int): padding_algorithm = "EXPLICIT" - padding = utils.convert_to_list(padding, num_dims, 'padding') + padding = convert_to_list(padding, num_dims, 'padding') else: raise ValueError("In valid padding: {}".format(padding)) # for integer padding else: padding_algorithm = "EXPLICIT" - padding = utils.convert_to_list(padding, num_dims, 'padding') + padding = convert_to_list(padding, num_dims, 'padding') if not all([p >= 0 for p in padding]): raise ValueError( "Invalid padding, all value should be larger than or equal to 0, but received: {}". @@ -323,8 +327,8 @@ def conv1d(x, "The size of padding's dimension should be 1 or 2. But got padding={}". format(padding)) - stride = utils.convert_to_list(stride, 1, 'stride') + [1] - dilation = utils.convert_to_list(dilation, 1, 'dilation') + [1] + stride = convert_to_list(stride, 1, 'stride') + [1] + dilation = convert_to_list(dilation, 1, 'dilation') + [1] l_type = "conv2d" if (num_channels == groups and num_channels != 1 and @@ -333,8 +337,8 @@ def conv1d(x, use_cudnn = False squeeze_aixs = -2 if channel_last else -1 - x = nn.unsqueeze(input=x, axes=[squeeze_aixs]) - weight = nn.unsqueeze(input=weight, axes=[-1]) + x = unsqueeze(x, axis=[squeeze_aixs]) + weight = unsqueeze(weight, axis=[-1]) if in_dygraph_mode(): attrs = ('strides', stride, 'paddings', padding, 'dilations', dilation, 'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn', False, @@ -366,7 +370,7 @@ def conv1d(x, type=l_type, inputs=inputs, outputs=outputs, attrs=attrs) if bias is not None: out = nn.elementwise_add(out, bias, axis=channel_dim) - out = nn.squeeze(input=out, axes=[squeeze_aixs]) + out = squeeze(out, axis=[squeeze_aixs]) return out @@ -530,8 +534,8 @@ def conv2d(x, # update attrs padding, padding_algorithm = _update_padding_nd(padding, channel_last, 2) - stride = utils.convert_to_list(stride, 2, 'stride') - dilation = utils.convert_to_list(dilation, 2, 'dilation') + stride = convert_to_list(stride, 2, 'stride') + dilation = convert_to_list(dilation, 2, 'dilation') l_type = "conv2d" if (num_channels == groups and num_channels != 1 and @@ -730,8 +734,8 @@ def conv1d_transpose(x, "The size of padding's dimension should 1 or 2. But got padding={}". 
format(padding)) - stride = utils.convert_to_list(stride, 1, 'stride') + [1] - dilation = utils.convert_to_list(dilation, 1, 'dilation') + [1] + stride = convert_to_list(stride, 1, 'stride') + [1] + dilation = convert_to_list(dilation, 1, 'dilation') + [1] if output_size is None: output_size = [] @@ -740,8 +744,7 @@ def conv1d_transpose(x, raise ValueError('output_padding option is mutually exclusive with ' 'output_size') if isinstance(output_size, (list, tuple, int)): - output_size = utils.convert_to_list(output_size, 1, - 'output_size') + [1] + output_size = convert_to_list(output_size, 1, 'output_size') + [1] else: raise ValueError( "output_size should be int, or list, tuple of ints") @@ -749,8 +752,8 @@ def conv1d_transpose(x, if output_padding == 0: output_padding = [] else: - output_padding = utils.convert_to_list(output_padding, 1, - 'output_padding') + [0] + output_padding = convert_to_list(output_padding, 1, + 'output_padding') + [0] if len(output_padding) > 0 and output_padding[0] > stride[0]: raise ValueError( @@ -768,8 +771,8 @@ def conv1d_transpose(x, squeeze_axis = -2 if channel_last else -1 conv2d_data_format = "NHWC" if channel_last else "NCHW" - x = nn.unsqueeze(input=x, axes=[squeeze_axis]) - weight = nn.unsqueeze(input=weight, axes=[-1]) + x = unsqueeze(x, axis=[squeeze_axis]) + weight = unsqueeze(weight, axis=[-1]) if in_dygraph_mode(): attrs = ('output_padding', output_padding, 'output_size', output_size, @@ -803,7 +806,7 @@ def conv1d_transpose(x, if bias is not None: out = nn.elementwise_add(out, bias, axis=channel_dim) - out = nn.squeeze(input=out, axes=[squeeze_axis]) + out = squeeze(out, axis=[squeeze_axis]) return out @@ -979,8 +982,8 @@ def conv2d_transpose(x, # update attrs padding, padding_algorithm = _update_padding_nd(padding, channel_last, 2) - stride = utils.convert_to_list(stride, 2, 'stride') - dilation = utils.convert_to_list(dilation, 2, 'dilation') + stride = convert_to_list(stride, 2, 'stride') + dilation = convert_to_list(dilation, 2, 'dilation') if output_size is None: output_size = [] @@ -989,7 +992,7 @@ def conv2d_transpose(x, raise ValueError('output_padding option is mutually exclusive with ' 'output_size') if isinstance(output_size, (list, tuple, int)): - output_size = utils.convert_to_list(output_size, 2, 'output_size') + output_size = convert_to_list(output_size, 2, 'output_size') else: raise ValueError( "output_size should be int, or list, tuple of ints") @@ -997,8 +1000,7 @@ def conv2d_transpose(x, if output_padding == 0: output_padding = [] else: - output_padding = utils.convert_to_list(output_padding, 2, - 'output_padding') + output_padding = convert_to_list(output_padding, 2, 'output_padding') op_type = 'conv2d_transpose' num_filters = weight.shape[1] @@ -1187,8 +1189,8 @@ def conv3d(x, cudnn_version is not None) else False padding, padding_algorithm = _update_padding_nd(padding, channel_last, 3) - stride = utils.convert_to_list(stride, 3, 'stride') - dilation = utils.convert_to_list(dilation, 3, 'dilation') + stride = convert_to_list(stride, 3, 'stride') + dilation = convert_to_list(dilation, 3, 'dilation') op_type = "conv3d" return _conv_nd(x, weight, bias, stride, padding, padding_algorithm, @@ -1369,8 +1371,8 @@ def conv3d_transpose(x, groups)) padding, padding_algorithm = _update_padding_nd(padding, channel_last, 3) - stride = utils.convert_to_list(stride, 3, 'stride') - dilation = utils.convert_to_list(dilation, 3, 'dilation') + stride = convert_to_list(stride, 3, 'stride') + dilation = convert_to_list(dilation, 3, 'dilation') if 
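The conv helpers above now call convert_to_list and _is_symmetric_padding directly instead of going through utils; the shortcut they implement is that a [before_0, after_0, before_1, after_1, ...] padding list collapses to one value per dimension when every pair matches. A small sketch with made-up values (not the _update_padding_nd code):

def reduce_symmetric_padding(padding, num_dims):
    # keep padding[0::2] only when before == after for every dimension
    symmetric = all(padding[2 * i] == padding[2 * i + 1] for i in range(num_dims))
    return padding[0::2] if symmetric else padding

print(reduce_symmetric_padding([1, 1, 2, 2], num_dims=2))   # -> [1, 2]
print(reduce_symmetric_padding([1, 0, 2, 2], num_dims=2))   # -> unchanged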
output_size is None: output_size = [] else: @@ -1378,7 +1380,7 @@ def conv3d_transpose(x, raise ValueError('output_padding option is mutually exclusive with ' 'output_size') if isinstance(output_size, (list, tuple, int)): - output_size = utils.convert_to_list(output_size, 3, 'output_size') + output_size = convert_to_list(output_size, 3, 'output_size') else: raise ValueError( "output_size should be int, or list, tuple of ints") @@ -1386,8 +1388,7 @@ def conv3d_transpose(x, if output_padding == 0: output_padding = [] else: - output_padding = utils.convert_to_list(output_padding, 3, - 'output_padding') + output_padding = convert_to_list(output_padding, 3, 'output_padding') cudnn_version = get_cudnn_version() diff --git a/python/paddle/nn/functional/extension.py b/python/paddle/nn/functional/extension.py index 8a9597119ab8df..bccb7bc7334fb0 100644 --- a/python/paddle/nn/functional/extension.py +++ b/python/paddle/nn/functional/extension.py @@ -17,8 +17,9 @@ import numpy as np from ...fluid.data_feeder import check_dtype from ...fluid.layer_helper import LayerHelper -from ...fluid.framework import Variable, in_dygraph_mode -from ...fluid.layers.tensor import assign +from ...fluid.framework import in_dygraph_mode +from ...static import Variable +from ...tensor.creation import assign from ...fluid import core, dygraph_utils from ...fluid.layers.layer_function_generator import templatedoc from ...fluid.layers.sequence_lod import sequence_mask diff --git a/python/paddle/nn/functional/input.py b/python/paddle/nn/functional/input.py index 6fbb292e674861..d88ee530715b0c 100644 --- a/python/paddle/nn/functional/input.py +++ b/python/paddle/nn/functional/input.py @@ -14,7 +14,8 @@ from __future__ import print_function import warnings -from ...fluid.framework import Variable, in_dygraph_mode +from ...fluid.framework import in_dygraph_mode +from ...static import Variable from ...fluid.layer_helper import LayerHelper from ...fluid.layers import core from ...fluid.data_feeder import check_variable_and_dtype, check_dtype diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 56eabd2ec40567..cb7a50ade7ac8f 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -27,7 +27,7 @@ from ...fluid.layers import dice_loss # noqa: F401 from ...fluid.layers import log_loss # noqa: F401 from ...fluid.layers import npair_loss # noqa: F401 -from ...fluid.layers import reshape +from ...tensor.manipulation import reshape from ...fluid.layers import softmax_with_cross_entropy as fluid_softmax_with_cross_entropy from ...fluid.layers import square_error_cost # noqa: F401 @@ -36,7 +36,7 @@ from ...fluid.layer_helper import LayerHelper from ...fluid.framework import in_dygraph_mode from ...fluid.framework import _varbase_creator -from ...fluid.framework import Variable +from ...static import Variable from paddle.utils import deprecated from paddle import _C_ops @@ -291,9 +291,7 @@ def binary_cross_entropy_with_logits(logit, pos_weight, 'pos_weight', ['float32', 'float64'], 'binary_cross_entropy_with_logits') log_weight = paddle.add( - paddle.multiply( - label, paddle.fluid.layers.elementwise_sub(pos_weight, one)), - one) + paddle.multiply(label, paddle.subtract(pos_weight, one)), one) pos_weight_name = name if reduction == 'none' and weight is None else None out = paddle.multiply(out, log_weight, name=pos_weight_name) @@ -515,9 +513,9 @@ def smooth_l1_loss(input, label, reduction='mean', delta=1.0, name=None): if reduction == 'none': return out elif 
reduction == 'mean': - return fluid.layers.reduce_mean(out) + return paddle.mean(out) elif reduction == 'sum': - return fluid.layers.reduce_sum(out) + return paddle.sum(out) def margin_ranking_loss(input, @@ -592,7 +590,7 @@ def margin_ranking_loss(input, fluid.data_feeder.check_variable_and_dtype( label, 'label', ['float32', 'float64'], 'margin_rank_loss') - out = paddle.fluid.layers.elementwise_sub(other, input) + out = paddle.subtract(other, input) out = paddle.multiply(out, label) if margin != 0.0: @@ -898,11 +896,11 @@ def kl_div(input, label, reduction='mean', name=None): if fluid.data_feeder.convert_dtype( input.dtype) == 'float32' and fluid.data_feeder.convert_dtype( label.dtype) == 'float64': - input = fluid.layers.cast(input, 'float64') + input = paddle.cast(input, 'float64') elif fluid.data_feeder.convert_dtype( input.dtype) == 'float64' and fluid.data_feeder.convert_dtype( label.dtype) == 'float32': - label = fluid.layers.cast(label, 'float64') + label = paddle.cast(label, 'float64') if paddle.in_dynamic_mode(): out = _C_ops.kldiv_loss(input, label, 'reduction', reduction) @@ -988,16 +986,12 @@ def mse_loss(input, label, reduction='mean', name=None): label, 'label', ['float32', 'float64'], 'mse_loss') if reduction == 'none': - return paddle.fluid.layers.square( - paddle.fluid.layers.elementwise_sub(input, label), name=name) + return paddle.square(paddle.subtract(input, label), name=name) elif reduction == 'mean': return paddle.mean( - paddle.fluid.layers.square( - paddle.fluid.layers.elementwise_sub(input, label)), - name=name) + paddle.square(paddle.subtract(input, label)), name=name) else: - return paddle.sum(paddle.fluid.layers.square( - paddle.fluid.layers.elementwise_sub(input, label)), + return paddle.sum(paddle.square(paddle.subtract(input, label)), name=name) diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 2b4df534ac7474..863787c00e649d 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -19,8 +19,8 @@ from ...fluid.layer_helper import LayerHelper from ...fluid.framework import in_dygraph_mode, core from ...framework import create_parameter -from ...fluid.initializer import Constant -from ...fluid.param_attr import ParamAttr +from ..initializer import Constant +from ...framework import ParamAttr from ...fluid import core, dygraph_utils import numbers from paddle import _C_ops @@ -104,8 +104,7 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None): type='p_norm', inputs={'X': x}, outputs={'Out': out}, attrs=attrs) eps = out.block.create_var(dtype=out.dtype) paddle.fluid.layers.fill_constant([1], out.dtype, epsilon, out=eps) - return paddle.fluid.layers.elementwise_div( - x, paddle.maximum(out, eps), name=name) + return paddle.divide(x, paddle.maximum(out, eps), name=name) def batch_norm(x, diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index 5575a864cfb63a..d3ae44bf7cef33 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -15,7 +15,8 @@ # TODO: define pooling functions from ...fluid import core from ...fluid.framework import in_dygraph_mode -from ...fluid.layers import utils, LayerHelper, unsqueeze, squeeze +from ...fluid.layers import utils, LayerHelper +from ...tensor.manipulation import unsqueeze, squeeze from ...fluid.data_feeder import check_type, check_variable_and_dtype from paddle import _C_ops from paddle import _C_ops diff --git a/python/paddle/nn/functional/vision.py 
b/python/paddle/nn/functional/vision.py index 91e497a10ed5a3..bd3e27a25e12c4 100644 --- a/python/paddle/nn/functional/vision.py +++ b/python/paddle/nn/functional/vision.py @@ -13,7 +13,8 @@ # limitations under the License. from ...device import get_cudnn_version -from ...fluid.framework import core, in_dygraph_mode, Variable +from ...fluid.framework import core, in_dygraph_mode +from ...static import Variable from ...fluid.layer_helper import LayerHelper from ...fluid.data_feeder import check_variable_and_dtype from ...fluid import dygraph_utils diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index d5b37144cfffed..695e387bda84f0 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -14,18 +14,18 @@ # TODO: define activation functions of neural network -from ...fluid.dygraph import layers from ...fluid import core from ...fluid.framework import in_dygraph_mode -from ...fluid.param_attr import ParamAttr -from ...fluid.initializer import Constant +from ...framework import ParamAttr +from ..initializer import Constant from paddle.framework import get_default_dtype from .. import functional as F +from paddle.nn import Layer __all__ = [] -class ELU(layers.Layer): +class ELU(Layer): r""" ELU Activation. @@ -67,7 +67,7 @@ def extra_repr(self): return 'alpha={}{}'.format(self._alpha, name_str) -class GELU(layers.Layer): +class GELU(Layer): r""" GELU Activation. @@ -120,7 +120,7 @@ def extra_repr(self): return 'approximate={}{}'.format(self._approximate, name_str) -class Hardshrink(layers.Layer): +class Hardshrink(Layer): r""" Hardshrink Activation @@ -168,7 +168,7 @@ def extra_repr(self): return 'threshold={}{}'.format(self._threshold, name_str) -class Hardswish(layers.Layer): +class Hardswish(Layer): r""" Hardswish activation @@ -218,7 +218,7 @@ def extra_repr(self): return name_str -class Tanh(layers.Layer): +class Tanh(Layer): r""" Tanh Activation. @@ -259,7 +259,7 @@ def extra_repr(self): return name_str -class Hardtanh(layers.Layer): +class Hardtanh(Layer): r""" Hardtanh Activation @@ -305,7 +305,7 @@ def extra_repr(self): return 'min={}, max={}{}'.format(self._min, self._max, name_str) -class PReLU(layers.Layer): +class PReLU(Layer): """ PReLU Activation. @@ -377,7 +377,7 @@ def extra_repr(self): self._num_parameters, self._init, self._dtype, name_str) -class ReLU(layers.Layer): +class ReLU(Layer): """ ReLU Activation. @@ -415,7 +415,7 @@ def extra_repr(self): return name_str -class ReLU6(layers.Layer): +class ReLU6(Layer): """ ReLU6 Activation @@ -454,7 +454,7 @@ def extra_repr(self): return name_str -class SELU(layers.Layer): +class SELU(Layer): r""" SELU Activation @@ -505,7 +505,7 @@ def extra_repr(self): name_str) -class LeakyReLU(layers.Layer): +class LeakyReLU(Layer): r""" Leaky ReLU Activation. @@ -553,7 +553,7 @@ def extra_repr(self): return 'negative_slope={}{}'.format(self._negative_slope, name_str) -class Sigmoid(layers.Layer): +class Sigmoid(Layer): """ this interface is used to construct a callable object of the ``Sigmoid`` class. This layer calcluate the `sigmoid` of input x. @@ -593,7 +593,7 @@ def extra_repr(self): return name_str -class Hardsigmoid(layers.Layer): +class Hardsigmoid(Layer): r""" This interface is used to construct a callable object of the ``Hardsigmoid`` class. This layer calcluate the `hardsigmoid` of input x. 
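The functional hunks above swap private `fluid.layers` helpers (`elementwise_sub`, `reduce_mean`, `reduce_sum`, `square`, `cast`, `elementwise_div`) for their public Paddle 2.x counterparts. A minimal sketch of those replacement ops, exercised on made-up tensors rather than anything taken from the patch:

```python
# Illustrative only: the public 2.x ops that these hunks switch to.
import paddle

x = paddle.to_tensor([1.0, 2.0, 3.0])
y = paddle.to_tensor([0.5, 2.5, 2.0])

diff = paddle.subtract(x, y)         # was fluid.layers.elementwise_sub
sq = paddle.square(diff)             # was fluid.layers.square
loss_mean = paddle.mean(sq)          # was fluid.layers.reduce_mean
loss_sum = paddle.sum(sq)            # was fluid.layers.reduce_sum
y64 = paddle.cast(y, 'float64')      # was fluid.layers.cast
# was fluid.layers.elementwise_div; paddle.maximum guards against a zero denominator
safe_div = paddle.divide(x, paddle.maximum(y, paddle.to_tensor(1e-12)))
```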
@@ -644,7 +644,7 @@ def extra_repr(self): return name_str -class Softplus(layers.Layer): +class Softplus(Layer): r""" Softplus Activation @@ -689,7 +689,7 @@ def extra_repr(self): name_str) -class Softshrink(layers.Layer): +class Softshrink(Layer): r""" Softshrink Activation @@ -734,7 +734,7 @@ def extra_repr(self): return 'threshold={}{}'.format(self._threshold, name_str) -class Softsign(layers.Layer): +class Softsign(Layer): r""" Softsign Activation @@ -773,7 +773,7 @@ def extra_repr(self): return name_str -class Swish(layers.Layer): +class Swish(Layer): r""" Swish Activation. @@ -812,7 +812,7 @@ def extra_repr(self): return name_str -class Tanhshrink(layers.Layer): +class Tanhshrink(Layer): """ Tanhshrink Activation @@ -851,7 +851,7 @@ def extra_repr(self): return name_str -class ThresholdedReLU(layers.Layer): +class ThresholdedReLU(Layer): r""" Thresholded ReLU Activation @@ -895,7 +895,7 @@ def extra_repr(self): return 'threshold={}{}'.format(self._threshold, name_str) -class Silu(layers.Layer): +class Silu(Layer): """ Silu Activation. .. math:: @@ -933,7 +933,7 @@ def extra_repr(self): return name_str -class LogSigmoid(layers.Layer): +class LogSigmoid(Layer): r""" LogSigmoid Activation. @@ -972,7 +972,7 @@ def extra_repr(self): return name_str -class Softmax(layers.Layer): +class Softmax(Layer): r""" Softmax Activation. @@ -1099,7 +1099,7 @@ def extra_repr(self): return 'axis={}{}'.format(self._axis, name_str) -class LogSoftmax(layers.Layer): +class LogSoftmax(Layer): r""" This operator implements the log_softmax layer. The calculation process is as follows: @@ -1157,7 +1157,7 @@ def extra_repr(self): return 'axis={}{}'.format(self._axis, name_str) -class Maxout(layers.Layer): +class Maxout(Layer): r""" Maxout Activation. diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py index 1d7f7c6589986b..9aa8097befc98b 100644 --- a/python/paddle/nn/layer/common.py +++ b/python/paddle/nn/layer/common.py @@ -15,10 +15,10 @@ # TODO: define the common classes to build a neural network import paddle from ...fluid.dygraph import Flatten # noqa: F401 -from ...fluid.dygraph import layers from ...fluid.framework import in_dygraph_mode from .. import functional as F from ...fluid.framework import _dygraph_tracer +from paddle.nn import Layer __all__ = [] @@ -30,7 +30,7 @@ def _npairs(x, n): return x -class Linear(layers.Layer): +class Linear(Layer): r""" Fully-connected linear transformation layer. For each input :math:`X` , @@ -135,7 +135,7 @@ def extra_repr(self): self.weight.shape[0], self.weight.shape[1], self._dtype, name_str) -class Upsample(layers.Layer): +class Upsample(Layer): """ This op resizes a batch of images. @@ -385,7 +385,7 @@ def extra_repr(self): self.data_format, name_str) -class UpsamplingNearest2D(layers.Layer): +class UpsamplingNearest2D(Layer): """ This op upsamples a batch of images, using nearest neighbours' pixel values. The input must be a 4-D Tensor of the shape (num_batches, channels, in_h, in_w), @@ -470,7 +470,7 @@ def extra_repr(self): name_str) -class UpsamplingBilinear2D(layers.Layer): +class UpsamplingBilinear2D(Layer): """ This op upsamples a batch of images, using bilinear' pixel values. The input must be a 4-D Tensor of the shape (num_batches, channels, in_h, in_w), @@ -556,7 +556,7 @@ def extra_repr(self): name_str) -class Bilinear(layers.Layer): +class Bilinear(Layer): r""" This layer performs bilinear on two inputs. 
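The activation.py hunks change only the base class: every activation module now derives from `paddle.nn.Layer` instead of `paddle.fluid.dygraph.layers.Layer`. A hypothetical module written against the new base class, following the same `forward`/`extra_repr` pattern (the class name and math are illustrative, not part of the patch):

```python
import paddle
from paddle.nn import Layer  # new public base class used by these hunks

class ScaledReLU(Layer):
    def __init__(self, scale=1.0, name=None):
        super(ScaledReLU, self).__init__()
        self._scale = scale
        self._name = name

    def forward(self, x):
        return self._scale * paddle.nn.functional.relu(x)

    def extra_repr(self):
        name_str = ', name={}'.format(self._name) if self._name else ''
        return 'scale={}{}'.format(self._scale, name_str)

layer = ScaledReLU(scale=2.0)
out = layer(paddle.to_tensor([-1.0, 0.5]))  # -> [0.0, 1.0]
```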
@@ -651,7 +651,7 @@ def extra_repr(self): self._dtype, name_str) -class Dropout(layers.Layer): +class Dropout(Layer): """ Dropout is a regularization technique for reducing overfitting by preventing neuron co-adaption during training as described in the paper: @@ -725,7 +725,7 @@ def extra_repr(self): name_str) -class Dropout2D(layers.Layer): +class Dropout2D(Layer): """ Randomly zero out entire channels (in the batched input 4d tensor with the shape `NCHW` , a channel is a 2D feature map with the shape `HW`). Each channel will be zeroed out independently @@ -786,7 +786,7 @@ def extra_repr(self): name_str) -class Dropout3D(layers.Layer): +class Dropout3D(Layer): """ Randomly zero out entire channels (in the batched input 5d tensor with the shape `NCDHW` , a channel is a 3D feature map with the shape `DHW` ). Each channel will be zeroed out independently @@ -847,7 +847,7 @@ def extra_repr(self): name_str) -class AlphaDropout(layers.Layer): +class AlphaDropout(Layer): """ Alpha Dropout is a type of Dropout that maintains the self-normalizing property. For an input with zero mean and unit standard deviation, the output of Alpha Dropout maintains the original mean and @@ -900,7 +900,7 @@ def extra_repr(self): return 'p={}{}'.format(self.p, name_str) -class Pad1D(layers.Layer): +class Pad1D(Layer): """ This interface is used to construct a callable object of the ``Pad1D`` class. Pad tensor according to 'pad', 'mode' and 'value'. @@ -981,7 +981,7 @@ def extra_repr(self): self._pad, self._mode, self._value, self._data_format, name_str) -class Pad2D(layers.Layer): +class Pad2D(Layer): """ This interface is used to construct a callable object of the ``Pad2D`` class. Pad tensor according to 'pad', 'mode' and 'value'. @@ -1065,7 +1065,7 @@ def extra_repr(self): self._pad, self._mode, self._value, self._data_format, name_str) -class Pad3D(layers.Layer): +class Pad3D(Layer): """ This interface is used to construct a callable object of the ``Pad3D`` class. Pad tensor according to 'pad', 'mode' and 'value'. @@ -1149,7 +1149,7 @@ def extra_repr(self): self._pad, self._mode, self._value, self._data_format, name_str) -class CosineSimilarity(layers.Layer): +class CosineSimilarity(Layer): """ This interface is used to compute cosine similarity between x1 and x2 along axis. @@ -1206,7 +1206,7 @@ def extra_repr(self): return 'axis={_axis}, eps={_eps}'.format(**self.__dict__) -class Embedding(layers.Layer): +class Embedding(Layer): r""" **Embedding Layer** @@ -1367,7 +1367,7 @@ def extra_repr(self): return main_str.format(**self.__dict__) -class Unfold(layers.Layer): +class Unfold(Layer): """ This op returns a col buffer of sliding local blocks of input x, also known as im2col for batched 2D image tensors. For each block under the convolution filter, diff --git a/python/paddle/nn/layer/container.py b/python/paddle/nn/layer/container.py index 48697aa8f50909..aadaf1efce50fa 100644 --- a/python/paddle/nn/layer/container.py +++ b/python/paddle/nn/layer/container.py @@ -13,7 +13,7 @@ # limitations under the License. from collections import OrderedDict -from ...fluid.dygraph.layers import Layer +from .. 
import Layer from collections.abc import Iterable, Mapping __all__ = [] diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py index 76011aeff5b4fb..26fd544ecce112 100644 --- a/python/paddle/nn/layer/conv.py +++ b/python/paddle/nn/layer/conv.py @@ -19,8 +19,8 @@ from ...fluid import get_flags from ...fluid import core from ...device import get_cudnn_version -from ...fluid.dygraph import layers -from ...fluid.initializer import Normal +from .. import Layer +from ..initializer import Normal from .. import functional as F from ...fluid.layers import utils from ..functional.conv import _update_padding_nd @@ -31,7 +31,7 @@ def _get_default_param_initializer(num_channels, filter_size): filter_elem_num = num_channels * np.prod(filter_size) std = (2.0 / filter_elem_num)**0.5 - return Normal(0.0, std, 0) + return Normal(0.0, std) def _reverse_repeat_list(t, n): @@ -42,7 +42,7 @@ def _reverse_repeat_list(t, n): return list(x for x in reversed(t) for _ in range(n)) -class _ConvNd(layers.Layer): +class _ConvNd(Layer): def __init__(self, in_channels, out_channels, @@ -127,7 +127,7 @@ def _get_default_param_initializer(): return None filter_elem_num = np.prod(self._kernel_size) * self._in_channels std = (2.0 / filter_elem_num)**0.5 - return Normal(0.0, std, 0) + return Normal(0.0, std) self.weight = self.create_parameter( shape=filter_shape, diff --git a/python/paddle/nn/layer/distance.py b/python/paddle/nn/layer/distance.py index 27e904980d143d..0547bf75a4bf6c 100644 --- a/python/paddle/nn/layer/distance.py +++ b/python/paddle/nn/layer/distance.py @@ -15,7 +15,7 @@ import numpy as np import paddle -from ...fluid.dygraph import layers +from .. import Layer from ...fluid.framework import core, in_dygraph_mode from ...fluid.data_feeder import check_variable_and_dtype, check_type from ...fluid.layer_helper import LayerHelper @@ -24,7 +24,7 @@ __all__ = [] -class PairwiseDistance(layers.Layer): +class PairwiseDistance(Layer): r""" This operator computes the pairwise distance between two vectors. The distance is calculated by p-oreder norm: @@ -87,7 +87,7 @@ def forward(self, x, y): 'PairwiseDistance') check_variable_and_dtype(y, 'y', ['float32', 'float64'], 'PairwiseDistance') - sub = paddle.fluid.layers.elementwise_sub(x, y) + sub = paddle.subtract(x, y) helper = LayerHelper("PairwiseDistance", name=self.name) attrs = { diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index 8f43eb8866b4bb..31b552bed162c2 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -20,11 +20,12 @@ import paddle from .. import functional as F from paddle.fluid.framework import core, in_dygraph_mode, _varbase_creator +from .. import Layer __all__ = [] -class BCEWithLogitsLoss(fluid.dygraph.Layer): +class BCEWithLogitsLoss(Layer): r""" This operator combines the sigmoid layer and the :ref:`api_nn_loss_BCELoss` layer. Also, we can see it as the combine of ``sigmoid_cross_entropy_with_logits`` @@ -128,7 +129,7 @@ def forward(self, logit, label): return out -class CrossEntropyLoss(fluid.dygraph.Layer): +class CrossEntropyLoss(Layer): r""" By default, this operator implements the cross entropy loss function with softmax. This function combines the calculation of the softmax operation and the cross entropy loss function @@ -407,7 +408,7 @@ def forward(self, input, label): return ret -class HSigmoidLoss(fluid.dygraph.Layer): +class HSigmoidLoss(Layer): """ Hierarchical Sigmoid Layer. 
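Besides the base-class switch, conv.py drops the third positional argument of the default weight initializer: `paddle.nn.initializer.Normal` takes only mean and std, whereas the old `fluid.initializer.Normal` also accepted a seed. A rough usage sketch under that assumption, with an illustrative convolution shape that is not taken from the patch:

```python
import numpy as np
import paddle
from paddle.nn.initializer import Normal  # import path used by the updated conv.py

in_channels, kernel_size = 3, (3, 3)
filter_elem_num = in_channels * np.prod(kernel_size)
std = (2.0 / filter_elem_num) ** 0.5

conv = paddle.nn.Conv2D(
    in_channels=in_channels,
    out_channels=8,
    kernel_size=kernel_size,
    weight_attr=paddle.ParamAttr(initializer=Normal(0.0, std)))  # no seed argument
```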
@@ -529,7 +530,7 @@ def forward(self, input, label, path_table=None, path_code=None): return out -class MSELoss(fluid.dygraph.layers.Layer): +class MSELoss(Layer): r""" **Mean Square Error Loss** Computes the mean square error (squared L2 norm) of given input and label. @@ -596,8 +597,7 @@ def forward(self, input, label): fluid.data_feeder.check_variable_and_dtype( label, 'label', ['float32', 'float64'], 'MSELoss') - square_out = fluid.layers.square( - fluid.layers.elementwise_sub(input, label)) + square_out = paddle.square(paddle.subtract(input, label)) if self.reduction == 'none': return square_out @@ -608,7 +608,7 @@ def forward(self, input, label): return getattr(fluid.layers, reduce_op)(square_out) -class L1Loss(fluid.dygraph.Layer): +class L1Loss(Layer): r""" This interface is used to construct a callable object of the ``L1Loss`` class. The L1Loss layer calculates the L1 Loss of ``input`` and ``label`` as follows. @@ -687,7 +687,7 @@ def forward(self, input, label): input, label, self.reduction, name=self.name) -class BCELoss(fluid.dygraph.Layer): +class BCELoss(Layer): """ This interface is used to construct a callable object of the ``BCELoss`` class. The BCELoss layer measures the binary_cross_entropy loss between input predictions ``input`` @@ -777,7 +777,7 @@ def forward(self, input, label): return out -class NLLLoss(fluid.dygraph.Layer): +class NLLLoss(Layer): r""" :alias_main: paddle.nn.NLLLoss :alias: paddle.nn.NLLLoss,paddle.nn.layer.NLLLoss,paddle.nn.layer.loss.NLLLoss @@ -886,7 +886,7 @@ def forward(self, input, label): name=self._name) -class KLDivLoss(fluid.dygraph.Layer): +class KLDivLoss(Layer): r""" This interface calculates the Kullback-Leibler divergence loss between Input(X) and Input(Target). Notes that Input(X) is the @@ -959,7 +959,7 @@ def forward(self, input, label): return out -class MarginRankingLoss(fluid.dygraph.Layer): +class MarginRankingLoss(Layer): r""" This interface is used to construct a callable object of the ``MarginRankingLoss`` class. @@ -1031,7 +1031,7 @@ def forward(self, input, other, label): return out -class CTCLoss(fluid.dygraph.Layer): +class CTCLoss(Layer): """ An operator integrating the open source Warp-CTC library (https://github.com/baidu-research/warp-ctc) @@ -1127,7 +1127,7 @@ def forward(self, norm_by_times=norm_by_times) -class SmoothL1Loss(fluid.dygraph.Layer): +class SmoothL1Loss(Layer): r""" This operator calculates smooth_l1_loss. Creates a criterion that uses a squared term if the absolute element-wise error falls below 1 and an L1 term otherwise. diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index b93412a7b22ccd..9abbc494258948 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -30,15 +30,13 @@ import six from ...fluid.dygraph import BatchNorm # noqa: F401 - from ...fluid.dygraph import SpectralNorm # noqa: F401 -from ...fluid.dygraph import layers from ...framework import get_default_dtype, set_default_dtype from ...fluid.framework import in_dygraph_mode -from ...fluid.initializer import Constant -from ...fluid.param_attr import ParamAttr +from ..initializer import Constant +from ...framework import ParamAttr from ...fluid.data_feeder import check_variable_and_dtype, check_type from ...fluid import core, dygraph_utils @@ -47,14 +45,15 @@ import numpy as np import numbers import warnings -from ...fluid.dygraph.base import no_grad +from ...framework import no_grad from .. import functional as F from paddle import _C_ops +from .. 
import Layer __all__ = [] -class _InstanceNormBase(layers.Layer): +class _InstanceNormBase(Layer): """ This class is based class for InstanceNorm1D, 2d, 3d. @@ -317,7 +316,7 @@ def _check_input_dim(self, input): len(input.shape))) -class GroupNorm(layers.Layer): +class GroupNorm(Layer): """ This interface is used to construct a callable object of the ``GroupNorm`` class. For more details, refer to code examples. @@ -436,7 +435,7 @@ def extra_repr(self): self._num_groups, self._num_channels, self._epsilon) -class LayerNorm(layers.Layer): +class LayerNorm(Layer): r""" :alias_main: paddle.nn.LayerNorm :alias: paddle.nn.LayerNorm,paddle.nn.layer.LayerNorm,paddle.nn.layer.norm.LayerNorm @@ -544,7 +543,7 @@ def extra_repr(self): self._epsilon) -class _BatchNormBase(layers.Layer): +class _BatchNormBase(Layer): """ BatchNorm base . """ @@ -1181,7 +1180,7 @@ def convert_sync_batchnorm(cls, layer): return layer_output -class LocalResponseNorm(layers.Layer): +class LocalResponseNorm(Layer): """ Local Response Normalization performs a type of "lateral inhibition" by normalizing over local input regions. For more information, please refer to `ImageNet Classification with Deep Convolutional Neural Networks `_ diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py index 528572ee21b7cc..881f92568414dc 100755 --- a/python/paddle/nn/layer/pooling.py +++ b/python/paddle/nn/layer/pooling.py @@ -12,14 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -from ...fluid.dygraph import layers from ...fluid.layer_helper import LayerHelper from .. import functional as F +from .. import Layer __all__ = [] -class AvgPool1D(layers.Layer): +class AvgPool1D(Layer): r""" This operation applies a 1D average pooling over an input signal composed of several input planes, based on the input, output_size, return_mask parameters. @@ -109,7 +109,7 @@ def extra_repr(self): **self.__dict__) -class AvgPool2D(layers.Layer): +class AvgPool2D(Layer): r""" This operation applies 2D average pooling over input features based on the input, and kernel_size, stride, padding parameters. Input(X) and Output(Out) are @@ -220,7 +220,7 @@ def extra_repr(self): **self.__dict__) -class AvgPool3D(layers.Layer): +class AvgPool3D(Layer): """ This operation applies 3D max pooling over input features based on the input, and kernel_size, stride, padding parameters. Input(X) and Output(Out) are @@ -318,7 +318,7 @@ def extra_repr(self): **self.__dict__) -class MaxPool1D(layers.Layer): +class MaxPool1D(Layer): """ This operation applies 1D max pooling over input signal composed of several input planes based on the input, @@ -412,7 +412,7 @@ def extra_repr(self): **self.__dict__) -class MaxPool2D(layers.Layer): +class MaxPool2D(Layer): r""" This operation applies 2D max pooling over input feature based on the input, and kernel_size, stride, padding parameters. Input(X) and Output(Out) are @@ -522,7 +522,7 @@ def extra_repr(self): **self.__dict__) -class MaxPool3D(layers.Layer): +class MaxPool3D(Layer): """ This operation applies 3D max pooling over input features based on the input, and kernel_size, stride, padding parameters. 
Input(X) and Output(Out) are @@ -620,7 +620,7 @@ def extra_repr(self): **self.__dict__) -class AdaptiveAvgPool1D(layers.Layer): +class AdaptiveAvgPool1D(Layer): r""" This operation applies a 1D adaptive average pooling over an input signal composed @@ -693,7 +693,7 @@ def extra_repr(self): return 'output_size={}'.format(self.output_size) -class AdaptiveAvgPool2D(layers.Layer): +class AdaptiveAvgPool2D(Layer): r""" This operation applies 2D adaptive avg pooling on input tensor. The h and w dimensions @@ -779,7 +779,7 @@ def extra_repr(self): return 'output_size={}'.format(self._output_size) -class AdaptiveAvgPool3D(layers.Layer): +class AdaptiveAvgPool3D(Layer): r""" This operation applies 3D adaptive avg pooling on input tensor. The h and w dimensions @@ -872,7 +872,7 @@ def extra_repr(self): return 'output_size={}'.format(self._output_size) -class AdaptiveMaxPool1D(layers.Layer): +class AdaptiveMaxPool1D(Layer): """ This operation applies a 1D adaptive max pooling over an input signal composed @@ -956,7 +956,7 @@ def extra_repr(self): self.return_mask) -class AdaptiveMaxPool2D(layers.Layer): +class AdaptiveMaxPool2D(Layer): """ This operation applies 2D adaptive max pooling on input tensor. The h and w dimensions of the output tensor are determined by the parameter output_size. The difference between adaptive pooling and @@ -1037,7 +1037,7 @@ def extra_repr(self): self._return_mask) -class AdaptiveMaxPool3D(layers.Layer): +class AdaptiveMaxPool3D(Layer): """ This operation applies 3D adaptive max pooling on input tensor. The h and w dimensions of the output tensor are determined by the parameter output_size. The difference between adaptive pooling and pooling is adaptive one focus diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index 693ec0200b0d05..77168566d88c60 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -28,7 +28,7 @@ from paddle.device import get_device, get_cudnn_version from paddle.nn import functional as F from paddle.nn import initializer as I -from paddle.fluid.dygraph import Layer, LayerList +from paddle.nn import Layer, LayerList from paddle.fluid.layers import utils from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as from paddle.fluid.data_feeder import convert_dtype @@ -962,7 +962,7 @@ def flatten_parameters(self): # for static-graph, append coalesce_tensor into startup program with fluid.program_guard(fluid.default_startup_program(), fluid.default_startup_program()): - with framework.no_grad(): + with paddle.no_grad(): self._helper.append_op( type="coalesce_tensor", inputs={"Input": self._all_weights}, @@ -1040,11 +1040,11 @@ def forward(self, inputs, initial_states=None, sequence_length=None): ]) else: initial_states = [initial_states] if isinstance( - initial_states, - paddle.fluid.framework.Variable) else initial_states + initial_states, paddle.static.Variable) else initial_states - if self.could_use_cudnn and (not fluid.core.is_compiled_with_rocm() or - sequence_length is None): + if self.could_use_cudnn and ( + not paddle.device.is_compiled_with_rocm() or + sequence_length is None): # Add CPU kernel and dispatch in backend later return self._cudnn_impl(inputs, initial_states, sequence_length) diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index 5aba8ae85ad1b3..eacf5aac9daa9f 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -24,8 +24,8 @@ from .. import functional as F from ... 
import tensor from ...fluid import layers -from ...fluid.dygraph import Layer, LayerList -from ...fluid.param_attr import ParamAttr +from .. import Layer, LayerList +from ...framework import ParamAttr from ...fluid.data_feeder import convert_dtype __all__ = [] diff --git a/python/paddle/nn/layer/vision.py b/python/paddle/nn/layer/vision.py index e6d3af9a37b329..7f8b51ca10818e 100644 --- a/python/paddle/nn/layer/vision.py +++ b/python/paddle/nn/layer/vision.py @@ -14,13 +14,13 @@ # TODO: define specitial functions used in computer vision task -from ...fluid.dygraph import layers +from .. import Layer from .. import functional __all__ = [] -class PixelShuffle(layers.Layer): +class PixelShuffle(Layer): """ PixelShuffle Layer diff --git a/python/paddle/tests/dist_hapi_mnist_static.py b/python/paddle/tests/dist_hapi_mnist_static.py index eab34a6dafbc35..6120ae90e994d2 100644 --- a/python/paddle/tests/dist_hapi_mnist_static.py +++ b/python/paddle/tests/dist_hapi_mnist_static.py @@ -59,6 +59,7 @@ def compute_accuracy(pred, gt): 'CPU testing is not supported') class TestDistTraning(unittest.TestCase): def test_static_multiple_gpus(self): + paddle.enable_static() device = set_device('gpu') im_shape = (-1, 1, 28, 28) diff --git a/python/paddle/tests/test_dataset_cifar.py b/python/paddle/tests/test_dataset_cifar.py index 2e9efddf9712e3..abf79fb1e3974c 100644 --- a/python/paddle/tests/test_dataset_cifar.py +++ b/python/paddle/tests/test_dataset_cifar.py @@ -32,8 +32,6 @@ def test_main(self): self.assertTrue(data.shape[2] == 3) self.assertTrue(data.shape[1] == 32) self.assertTrue(data.shape[0] == 32) - self.assertTrue(len(label.shape) == 1) - self.assertTrue(label.shape[0] == 1) self.assertTrue(0 <= int(label) <= 9) @@ -51,8 +49,6 @@ def test_main(self): self.assertTrue(data.shape[2] == 3) self.assertTrue(data.shape[1] == 32) self.assertTrue(data.shape[0] == 32) - self.assertTrue(len(label.shape) == 1) - self.assertTrue(label.shape[0] == 1) self.assertTrue(0 <= int(label) <= 9) # test cv2 backend @@ -67,8 +63,6 @@ def test_main(self): self.assertTrue(data.shape[2] == 3) self.assertTrue(data.shape[1] == 32) self.assertTrue(data.shape[0] == 32) - self.assertTrue(len(label.shape) == 1) - self.assertTrue(label.shape[0] == 1) self.assertTrue(0 <= int(label) <= 99) with self.assertRaises(ValueError): @@ -89,8 +83,6 @@ def test_main(self): self.assertTrue(data.shape[2] == 3) self.assertTrue(data.shape[1] == 32) self.assertTrue(data.shape[0] == 32) - self.assertTrue(len(label.shape) == 1) - self.assertTrue(label.shape[0] == 1) self.assertTrue(0 <= int(label) <= 99) @@ -108,8 +100,6 @@ def test_main(self): self.assertTrue(data.shape[2] == 3) self.assertTrue(data.shape[1] == 32) self.assertTrue(data.shape[0] == 32) - self.assertTrue(len(label.shape) == 1) - self.assertTrue(label.shape[0] == 1) self.assertTrue(0 <= int(label) <= 99) # test cv2 backend @@ -124,8 +114,6 @@ def test_main(self): self.assertTrue(data.shape[2] == 3) self.assertTrue(data.shape[1] == 32) self.assertTrue(data.shape[0] == 32) - self.assertTrue(len(label.shape) == 1) - self.assertTrue(label.shape[0] == 1) self.assertTrue(0 <= int(label) <= 99) with self.assertRaises(ValueError): diff --git a/python/paddle/vision/datasets/cifar.py b/python/paddle/vision/datasets/cifar.py index 97ffb239fe7adf..74ae8ef11e3de0 100644 --- a/python/paddle/vision/datasets/cifar.py +++ b/python/paddle/vision/datasets/cifar.py @@ -148,8 +148,7 @@ def _load_data(self): six.b('labels'), batch.get(six.b('fine_labels'), None)) assert labels is not None for sample, label 
in six.moves.zip(data, labels): - self.data.append((sample, - np.array([label]).astype('int64'))) + self.data.append((sample, label)) def __getitem__(self, idx): image, label = self.data[idx] @@ -162,9 +161,9 @@ def __getitem__(self, idx): image = self.transform(image) if self.backend == 'pil': - return image, label.astype('int64') + return image, np.array(label).astype('int64') - return image.astype(self.dtype), label.astype('int64') + return image.astype(self.dtype), np.array(label).astype('int64') def __len__(self): return len(self.data) diff --git a/tools/dockerfile/Dockerfile.npu_aarch64 b/tools/dockerfile/Dockerfile.npu_aarch64 index e3cd162edc1547..e95ff951878c94 100644 --- a/tools/dockerfile/Dockerfile.npu_aarch64 +++ b/tools/dockerfile/Dockerfile.npu_aarch64 @@ -1,17 +1,20 @@ # A image for building paddle binaries -# Use cann 5.0.2.alpha003 and aarch64 for A300t-9000 -# When you modify it, please be aware of cann version +# Use cann 5.0.2.alpha005 and aarch64 for A300t-9000 +# Update CANN_VERSION if using other versions # -# Build: CANN 5.0.2.alpha003 +# Build: CANN 5.0.2.alpha005 +# Download pkgs from https://www.hiascend.com/software/cann/community +# and copy them to current dir first, then run build commands # cd Paddle/tools/dockerfile # docker build -f Dockerfile.npu_aarch64 \ -# -t paddlepaddle/paddle:latest-cann5.0.2-gcc82-aarch64-dev . +# --build-arg CANN_VERSION=5.0.2.alpha005 \ +# -t paddlepaddle/paddle:latest-dev-5.0.2.alpha005-gcc82-aarch64 . # # docker run -it --pids-limit 409600 \ # -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \ # -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ # -v /usr/local/dcmi:/usr/local/dcmi \ -# paddlepaddle/paddle:latest-cann5.0.2-gcc82-aarch64-dev /bin/bash +# paddlepaddle/paddle:latest-dev-5.0.2.alpha005-gcc82-aarch64 /bin/bash FROM ubuntu:18.04 MAINTAINER PaddlePaddle Authors @@ -126,17 +129,19 @@ COPY ascend_install.info /etc/ascend_install.info RUN mkdir -p /usr/local/Ascend/driver COPY version.info /usr/local/Ascend/driver/version.info -# Packages from https://www.hiascend.com/software/cann/community +# Download packages from https://www.hiascend.com/software/cann/community and copy them to current dir first WORKDIR /usr/local/Ascend +ARG CANN_VERSION=5.0.2.alpha005 # update envs for driver ENV LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64:$LD_LIBRARY_PATH ENV LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/common:$LD_LIBRARY_PATH ENV LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/driver:$LD_LIBRARY_PATH # Install Ascend toolkit -COPY Ascend-cann-toolkit_5.0.2.alpha003_linux-aarch64.run /usr/local/Ascend/ -RUN ./Ascend-cann-toolkit_5.0.2.alpha003_linux-aarch64.run --install --quiet -RUN rm -rf Ascend-cann-toolkit_5.0.2.alpha003_linux-aarch64.run +COPY Ascend-cann-toolkit_${CANN_VERSION}_linux-aarch64.run /usr/local/Ascend/ +RUN chmod +x Ascend-cann-toolkit_${CANN_VERSION}_linux-aarch64.run && \ + ./Ascend-cann-toolkit_${CANN_VERSION}_linux-aarch64.run --install --quiet && \ + rm -rf Ascend-cann-toolkit_${CANN_VERSION}_linux-aarch64.run # udpate envs for model transformation and operator develop ENV PATH=/usr/local/Ascend/ascend-toolkit/latest/atc/bin:$PATH ENV LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/atc/lib64:$LD_LIBRARY_PATH @@ -146,9 +151,10 @@ ENV PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/toolkit/python/site-packa ENV TOOLCHAIN_HOME=/usr/local/Ascend/ascend-toolkit/latest/toolkit # Install Ascend NNAE -COPY Ascend-cann-nnae_5.0.2.alpha003_linux-aarch64.run /usr/local/Ascend/ -RUN 
./Ascend-cann-nnae_5.0.2.alpha003_linux-aarch64.run --install --quiet -RUN rm -rf Ascend-cann-nnae_5.0.2.alpha003_linux-aarch64.run +COPY Ascend-cann-nnae_${CANN_VERSION}_linux-aarch64.run /usr/local/Ascend/ +RUN chmod +x Ascend-cann-nnae_${CANN_VERSION}_linux-aarch64.run && \ + ./Ascend-cann-nnae_${CANN_VERSION}_linux-aarch64.run --install --quiet && \ + rm -rf Ascend-cann-nnae_${CANN_VERSION}_linux-aarch64.run # update envs for third party AI framework develop ENV PATH=/usr/local/Ascend/nnae/latest/fwkacllib/bin:$PATH diff --git a/tools/dockerfile/Dockerfile.npu_x86_64 b/tools/dockerfile/Dockerfile.npu_x86_64 new file mode 100644 index 00000000000000..6689deedf4b3bf --- /dev/null +++ b/tools/dockerfile/Dockerfile.npu_x86_64 @@ -0,0 +1,175 @@ +# A image for building paddle binaries +# Use cann 5.0.2.alpha005 and x86_64 for A300t-9000 +# Update CANN_VERSION if using other versions +# +# Build: CANN 5.0.2.alpha005 +# Download pkgs from https://www.hiascend.com/software/cann/community +# and copy them to current dir first, then run build commands +# cd Paddle/tools/dockerfile +# docker build -f Dockerfile.npu_x86_64 \ +# --build-arg CANN_VERSION=5.0.2.alpha005 \ +# -t paddlepaddle/paddle:latest-dev-5.0.2.alpha005-gcc82-x86_64 . +# +# docker run -it --pids-limit 409600 \ +# -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \ +# -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ +# -v /usr/local/dcmi:/usr/local/dcmi \ +# paddlepaddle/paddle:latest-dev-5.0.2.alpha005-gcc82-x86_64 /bin/bash + +FROM ubuntu:18.04 +MAINTAINER PaddlePaddle Authors + +RUN apt-get update && apt-get install -y apt-utils +RUN ln -snf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends tzdata +RUN apt-get update && apt-get install -y software-properties-common && add-apt-repository ppa:deadsnakes/ppa && add-apt-repository ppa:ubuntu-toolchain-r/test +RUN apt-get update && apt-get install -y curl wget vim git unzip unrar tar xz-utils libssl-dev bzip2 gzip make libgcc-s1 sudo openssh-server \ + coreutils ntp language-pack-zh-hans python-qt4 libsm6 libxext6 libxrender-dev libgl1-mesa-glx libsqlite3-dev libopenblas-dev \ + bison graphviz libjpeg-dev zlib1g zlib1g-dev automake locales swig net-tools libtool module-init-tools numactl libnuma-dev \ + openssl libffi-dev pciutils libblas-dev gfortran libblas3 liblapack-dev liblapack3 default-jre screen tmux gdb lldb gcc g++ + +# GCC 8.2 +WORKDIR /opt +RUN wget -q https://paddle-ci.gz.bcebos.com/gcc-8.2.0.tar.xz && \ + tar -xvf gcc-8.2.0.tar.xz && cd gcc-8.2.0 && \ + unset LIBRARY_PATH CPATH C_INCLUDE_PATH PKG_CONFIG_PATH CPLUS_INCLUDE_PATH INCLUDE && \ + ./contrib/download_prerequisites && \ + cd .. && mkdir temp_gcc82 && cd temp_gcc82 && \ + ../gcc-8.2.0/configure --prefix=/opt/compiler/gcc-8.2 --enable-threads=posix --disable-checking --disable-multilib && \ + make -j8 && make install && \ + cd .. 
&& rm -rf temp_gcc82 && rm -rf gcc-8.2.0* && \ + cd /usr/lib/x86_64-linux-gnu && \ + mv libstdc++.so.6 libstdc++.so.6.bak && mv libstdc++.so.6.0.25 libstdc++.so.6.0.25.bak && \ + ln -s /opt/compiler/gcc-8.2/lib64/libgfortran.so.5 /usr/lib/x86_64-linux-gnu/libstdc++.so.5 && \ + ln -s /opt/compiler/gcc-8.2/lib64/libstdc++.so.6 /usr/lib/x86_64-linux-gnu/libstdc++.so.6 && \ + cp /opt/compiler/gcc-8.2/lib64/libstdc++.so.6.0.25 /usr/lib/x86_64-linux-gnu && \ + cd /usr/bin && mv gcc gcc.bak && mv g++ g++.bak && \ + ln -s /opt/compiler/gcc-8.2/bin/gcc /usr/bin/gcc && \ + ln -s /opt/compiler/gcc-8.2/bin/g++ /usr/bin/g++ +ENV PATH=/opt/compiler/gcc-8.2/bin:$PATH +ENV LD_LIBRARY_PATH=/opt/compiler/gcc-8.2/lib:/opt/compiler/gcc-8.2/lib64:$LD_LIBRARY_PATH + +# cmake 3.16 +WORKDIR /opt +RUN wget -q https://cmake.org/files/v3.16/cmake-3.16.0-Linux-x86_64.tar.gz && \ + tar -zxvf cmake-3.16.0-Linux-x86_64.tar.gz && rm cmake-3.16.0-Linux-x86_64.tar.gz && \ + mv cmake-3.16.0-Linux-x86_64 cmake-3.16 +ENV PATH=/opt/cmake-3.16/bin:${PATH} + +# conda 4.9.2 +WORKDIR /opt +ARG CONDA_FILE=Miniconda3-py37_4.9.2-Linux-x86_64.sh +RUN cd /opt && wget -q https://repo.anaconda.com/miniconda/${CONDA_FILE} && chmod +x ${CONDA_FILE} +RUN mkdir /opt/conda && ./${CONDA_FILE} -b -f -p "/opt/conda" && rm -rf ${CONDA_FILE} +ENV PATH=/opt/conda/bin:${PATH} +RUN conda init bash && conda install -n base jupyter jupyterlab + +# install pylint and pre-commit +RUN /opt/conda/bin/pip install pre-commit pylint pytest astroid isort coverage qtconsole +# install CANN 5.0.2 requirement +RUN /opt/conda/bin/pip install 'numpy<1.20,>=1.13.3' 'decorator>=4.4.0' 'sympy>=1.4' 'cffi>=1.12.3' 'protobuf>=3.11.3' +RUN /opt/conda/bin/pip install attrs pyyaml pathlib2 scipy requests psutil + +# install Paddle requirement +RUN wget https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt -O /root/requirements.txt +RUN /opt/conda/bin/pip install -r /root/requirements.txt && rm -rf /root/requirements.txt +RUN wget https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/unittest_py/requirements.txt -O /root/requirements.txt +RUN /opt/conda/bin/pip install -r /root/requirements.txt && rm -rf /root/requirements.txt + +# Install Go and glide +RUN wget -qO- https://paddle-ci.cdn.bcebos.com/go1.8.1.linux-amd64.tar.gz | \ + tar -xz -C /usr/local && \ + mkdir /root/gopath && \ + mkdir /root/gopath/bin && \ + mkdir /root/gopath/src +ENV GOROOT=/usr/local/go GOPATH=/root/gopath +# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. +ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin +# install glide +RUN curl -s -q https://glide.sh/get | sh + +# git credential to skip password typing +RUN git config --global credential.helper store + +# Fix locales to en_US.UTF-8 +RUN localedef -i en_US -f UTF-8 en_US.UTF-8 + +RUN apt-get install libprotobuf-dev -y + +# Older versions of patchelf limited the size of the files being processed, which was fixed in the PR below. +# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa +# So install a newer version here. +RUN wget -q https://paddle-ci.cdn.bcebos.com/patchelf_0.10-2_amd64.deb && \ + dpkg -i patchelf_0.10-2_amd64.deb && rm -rf patchelf_0.10-2_amd64.deb + +# Configure OpenSSH server. c.f.
https://docs.docker.com/engine/examples/running_ssh_service +RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config +CMD source ~/.bashrc + +# ccache 3.7.9 +RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \ + tar xf ccache-3.7.9.tar.gz && mkdir /usr/local/ccache-3.7.9 && cd ccache-3.7.9 && \ + ./configure -prefix=/usr/local/ccache-3.7.9 && \ + make -j8 && make install && cd .. && rm -rf ccache-3.7.9* && \ + ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache + +# clang-format 3.8.0 +RUN wget https://paddle-ci.cdn.bcebos.com/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-16.04.tar.xz && \ + tar xf clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-16.04.tar.xz && cd clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-16.04 && \ + cp -r * /usr/local && cd .. && rm -rf clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-16.04 && \ + rm -rf clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-16.04.tar.xz + +# HwHiAiUser +RUN groupadd HwHiAiUser && \ + useradd -g HwHiAiUser -m -d /home/HwHiAiUser HwHiAiUser + +# copy /etc/ascend_install.info to current dir first +COPY ascend_install.info /etc/ascend_install.info + +# copy /usr/local/Ascend/driver/version.info to current dir first +RUN mkdir -p /usr/local/Ascend/driver +COPY version.info /usr/local/Ascend/driver/version.info + +# Download packages from https://www.hiascend.com/software/cann/community and copy them to current dir first +WORKDIR /usr/local/Ascend +ARG CANN_VERSION=5.0.2.alpha005 +# update envs for driver +ENV LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64:$LD_LIBRARY_PATH +ENV LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/common:$LD_LIBRARY_PATH +ENV LD_LIBRARY_PATH=/usr/local/Ascend/driver/lib64/driver:$LD_LIBRARY_PATH + +# Install Ascend toolkit +COPY Ascend-cann-toolkit_${CANN_VERSION}_linux-x86_64.run /usr/local/Ascend/ +RUN chmod +x Ascend-cann-toolkit_${CANN_VERSION}_linux-x86_64.run && \ + ./Ascend-cann-toolkit_${CANN_VERSION}_linux-x86_64.run --install --quiet && \ + rm -rf Ascend-cann-toolkit_${CANN_VERSION}_linux-x86_64.run +# update envs for model transformation and operator develop +ENV PATH=/usr/local/Ascend/ascend-toolkit/latest/atc/bin:$PATH +ENV LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/atc/lib64:$LD_LIBRARY_PATH +ENV PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/pyACL/python/site-packages/acl:$PYTHONPATH +ENV PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/atc/python/site-packages:$PYTHONPATH +ENV PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/toolkit/python/site-packages:$PYTHONPATH +ENV TOOLCHAIN_HOME=/usr/local/Ascend/ascend-toolkit/latest/toolkit + +# Install Ascend NNAE +COPY Ascend-cann-nnae_${CANN_VERSION}_linux-x86_64.run /usr/local/Ascend/ +RUN chmod +x Ascend-cann-nnae_${CANN_VERSION}_linux-x86_64.run && \ + ./Ascend-cann-nnae_${CANN_VERSION}_linux-x86_64.run --install --quiet && \ + rm -rf Ascend-cann-nnae_${CANN_VERSION}_linux-x86_64.run +# update envs for third party AI framework develop +ENV PATH=/usr/local/Ascend/nnae/latest/fwkacllib/bin:$PATH +ENV PATH=/usr/local/Ascend/nnae/latest/fwkacllib/ccec_compiler/bin:$PATH +ENV LD_LIBRARY_PATH=/usr/local/Ascend/nnae/latest/fwkacllib/lib64:$LD_LIBRARY_PATH +ENV PYTHONPATH=/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages:$PYTHONPATH +ENV ASCEND_AICPU_PATH=/usr/local/Ascend/nnae/latest +ENV ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp + +# DEV image should enable error level log +# 0 debug; 1 info; 2
warning; 3 error; 4 null +ENV ASCEND_GLOBAL_LOG_LEVEL=3 +RUN rm -rf /usr/local/Ascend/driver + +# Clean +RUN apt-get clean -y + +EXPOSE 22 diff --git a/tools/dockerfile/build_scripts/install_trt.sh b/tools/dockerfile/build_scripts/install_trt.sh index 69552871211fd6..a461e2a4f24b33 100644 --- a/tools/dockerfile/build_scripts/install_trt.sh +++ b/tools/dockerfile/build_scripts/install_trt.sh @@ -16,6 +16,16 @@ VERSION=$(nvcc --version | grep release | grep -oEi "release ([0-9]+)\.([0-9])"| sed "s/release //") +CUDNN_MAJOR=$(cat /usr/include/cudnn.h | grep -v CUDNN_VERSION | grep CUDNN_MAJOR | cut -d' ' -f3) +CUDNN_MINOR=$(cat /usr/include/cudnn.h | grep -v CUDNN_VERSION | grep CUDNN_MINOR | cut -d' ' -f3) +CUDNN_PATCHLEVEL=$(cat /usr/include/cudnn.h | grep -v CUDNN_VERSION | grep CUDNN_PATCHLEVEL | cut -d' ' -f3) +if [[ -z "${CUDNN_MAJOR}" ]]; then + CUDNN_MAJOR=$(cat /usr/include/cudnn_version.h | grep -v CUDNN_VERSION | grep CUDNN_MAJOR | cut -d' ' -f3) + CUDNN_MINOR=$(cat /usr/include/cudnn_version.h | grep -v CUDNN_VERSION | grep CUDNN_MINOR | cut -d' ' -f3) + CUDNN_PATCHLEVEL=$(cat /usr/include/cudnn_version.h | grep -v CUDNN_VERSION | grep CUDNN_PATCHLEVEL | cut -d' ' -f3) +fi +CUDNN_VERSION="${CUDNN_MAJOR}.${CUDNN_MINOR}.${CUDNN_PATCHLEVEL}" + if [[ "$VERSION" == "10.1" ]];then wget -q https://paddle-ci.gz.bcebos.com/TRT/TensorRT6-cuda10.1-cudnn7.tar.gz --no-check-certificate tar -zxf TensorRT6-cuda10.1-cudnn7.tar.gz -C /usr/local @@ -36,7 +46,12 @@ elif [[ "$VERSION" == "11.0" ]];then tar -zxf TensorRT-7.1.3.4.Ubuntu-16.04.x86_64-gnu.cuda-11.0.cudnn8.0.tar.gz -C /usr/local cp -rf /usr/local/TensorRT-7.1.3.4/include/* /usr/include/ && cp -rf /usr/local/TensorRT-7.1.3.4/lib/* /usr/lib/ rm TensorRT-7.1.3.4.Ubuntu-16.04.x86_64-gnu.cuda-11.0.cudnn8.0.tar.gz -elif [[ "$VERSION" == "10.2" ]];then +elif [[ "$VERSION" == "10.2" && "$CUDNN_VERSION" == "7.6.5" ]];then + wget https://paddle-ci.gz.bcebos.com/TRT/TensorRT-6.0.1.8.CentOS-7.6.x86_64-gnu.cuda-10.2.cudnn7.6.tar.gz --no-check-certificate + tar -zxf TensorRT-6.0.1.8.CentOS-7.6.x86_64-gnu.cuda-10.2.cudnn7.6.tar.gz -C /usr/local + cp -rf /usr/local/TensorRT-6.0.1.8/include/* /usr/include/ && cp -rf /usr/local/TensorRT-6.0.1.8/lib/* /usr/lib/ + rm -f TensorRT-6.0.1.8.CentOS-7.6.x86_64-gnu.cuda-10.2.cudnn7.6.tar.gz +elif [[ "$VERSION" == "10.2" && "$CUDNN_VERSION" == "8.1.1" ]];then wget https://paddle-ci.gz.bcebos.com/TRT/TensorRT-7.2.3.4.CentOS-7.9.x86_64-gnu.cuda-10.2.cudnn8.1.tar.gz --no-check-certificate tar -zxf TensorRT-7.2.3.4.CentOS-7.9.x86_64-gnu.cuda-10.2.cudnn8.1.tar.gz -C /usr/local cp -rf /usr/local/TensorRT-7.2.3.4/include/* /usr/include/ && cp -rf /usr/local/TensorRT-7.2.3.4/lib/* /usr/lib/ diff --git a/tools/dockerfile/centos7_manylinux.sh b/tools/dockerfile/centos7_manylinux.sh index 9710ec02320951..d268341db9ae28 100755 --- a/tools/dockerfile/centos7_manylinux.sh +++ b/tools/dockerfile/centos7_manylinux.sh @@ -42,6 +42,11 @@ function make_cuda102cudnn7() { sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc82 \nRUN mv /usr/bin/cc /usr/bin/cc.bak \&\& ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/cc \nENV PATH=/usr/local/gcc-8.2/bin:\$PATH \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp } +function make_cuda102cudnn7gcc54() { + sed 's//10.2-cudnn7-devel-centos7/g' Dockerfile.centos >Dockerfile.tmp + sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc54 \nRUN mv /usr/bin/cc /usr/bin/cc.bak \&\& ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/cc \nENV 
PATH=/usr/local/gcc-8.2/bin:\$PATH \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp +} + function make_cuda102cudnn8() { sed 's//10.2-cudnn8-devel-centos7/g' Dockerfile.centos >Dockerfile.tmp sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc82 \nRUN mv /usr/bin/cc /usr/bin/cc.bak \&\& ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/cc \nENV PATH=/usr/local/gcc-8.2/bin:\$PATH \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp @@ -94,6 +99,9 @@ function main() { cuda102cudnn7) make_cuda102cudnn7 ;; + cuda102cudnn7gcc54) + make_cuda102cudnn7gcc54 + ;; cuda102cudnn8) make_cuda102cudnn8 ;; diff --git a/tools/get_quick_disable_lt.py b/tools/get_quick_disable_lt.py index 4805c909c1ba41..f56e8c24e8f752 100644 --- a/tools/get_quick_disable_lt.py +++ b/tools/get_quick_disable_lt.py @@ -15,6 +15,7 @@ import sys import ssl import requests +import paddle def download_file(): @@ -25,13 +26,13 @@ def download_file(): url = "https://sys-p0.bj.bcebos.com/prec/{}".format('disable_ut_win') else: url = "https://sys-p0.bj.bcebos.com/prec/{}".format('disable_ut') - try: - import paddle.fluid.core as core - if core.is_compiled_with_rocm(): - url = "https://sys-p0.bj.bcebos.com/prec/{}".format( - 'disable_ut_rocm_ci') - except: - pass + + if paddle.is_compiled_with_rocm(): + url = "https://sys-p0.bj.bcebos.com/prec/{}".format('disable_ut_rocm') + + if paddle.is_compiled_with_npu(): + url = "https://sys-p0.bj.bcebos.com/prec/{}".format('disable_ut_npu') + f = requests.get(url) data = f.text status_code = f.status_code diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index 5108d34f7bf779..fe0be21bfdf44e 100644 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -685,7 +685,6 @@ 'test_nn_functional_hot_op', 'test_op_name_conflict', 'test_imperative_gan', - 'test_simnet', 'test_amp_check_finite_and_scale_op', 'test_random_seed', 'test_histogram_op', @@ -819,7 +818,6 @@ 'test_prelu_op', 'test_fill_zeros_like_op', 'test_pool2d_op', - 'test_for_enumerate', 'test_gather_op', 'test_partial_concat_op', 'test_gaussian_random_op', @@ -883,7 +881,6 @@ 'test_empty_like_op', 'test_rank_loss_op', 'test_elementwise_mod_op', - 'test_reinforcement_learning', 'test_elementwise_max_op', 'test_retain_graph', 'test_edit_distance_op', @@ -1001,7 +998,6 @@ 'test_static_save_load', 'test_coalesce_tensor_op', 'test_fuse_bn_act_pass', - 'test_simnet_v2', 'test_shard_index_op', 'test_cuda_random_seed', 'test_dequantize_log_op', @@ -1023,7 +1019,6 @@ 'test_py_reader_pin_memory', 'test_train_recognize_digits', 'test_parallel_executor_feed_persistable_var', - 'test_mnist', 'test_update_loss_scaling_op', 'test_rnn_cell_api', 'test_imperative_load_static_param',
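In tools/get_quick_disable_lt.py, the old try/except around `paddle.fluid.core` is replaced by the public `paddle.is_compiled_with_rocm()` and `paddle.is_compiled_with_npu()` checks, and ROCm and NPU CI now fetch their own disabled-unit-test lists. A condensed sketch of that selection logic; the win32 check is paraphrased, only the bucket file names come from the patch:

```python
import sys
import paddle

def disable_ut_url():
    # default list, with a separate list for Windows CI (paraphrased condition)
    name = 'disable_ut_win' if sys.platform == 'win32' else 'disable_ut'
    if paddle.is_compiled_with_rocm():   # public check replaces the old
        name = 'disable_ut_rocm'         # try/except around fluid.core
    if paddle.is_compiled_with_npu():
        name = 'disable_ut_npu'
    return "https://sys-p0.bj.bcebos.com/prec/{}".format(name)
```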