Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/develop' into slice/static_get…
Browse files Browse the repository at this point in the history
…item
  • Loading branch information
hbwx24 committed Jul 26, 2021
2 parents 0a476ea + 6b20cb4 commit ae19cc0
Show file tree
Hide file tree
Showing 219 changed files with 4,327 additions and 2,224 deletions.
27 changes: 17 additions & 10 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -119,17 +119,19 @@ if(WIN32)
endforeach(flag_var)
endif()

math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3")

# windows build turn off warnings, use parallel compiling.
foreach(flag_var
CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}")
# NOTE(zhouwei25): GPU compile have too high memory utilization when parallel compiling
if(NOT WITH_GPU)

# NOTE(zhouwei25): GPU compilation has too high memory utilization when compiling in parallel.
# For Visual Studio generators, /MP should be added.
# For other generators like Ninja, there is no need to add /MP.
# BUG FIX: ${CMAKE_GENERATOR} is e.g. "Visual Studio 16 2019", so an exact
# STREQUAL against "Visual Studio" can never match and /MP was silently never
# added. Use MATCHES to detect any Visual Studio generator.
if("${CMAKE_GENERATOR}" MATCHES "Visual Studio" AND NOT WITH_GPU)
  # Use ~2/3 of the available cores to keep memory utilization manageable.
  math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3")
  set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}")
endif()
endforeach(flag_var)
Expand Down Expand Up @@ -312,6 +314,17 @@ else()
endif()
endif()

# Configure options that are implied/forbidden by WITH_DISTRIBUTE before
# third_party is included, so dependency downloads see the final values.
if(WITH_DISTRIBUTE)
  if(LINUX)
    # GLOO is only supported on Linux; force it on for distributed builds.
    # NOTE: WITH_GLOO is a boolean flag, so cache it as BOOL (was STRING).
    set(WITH_GLOO ON CACHE BOOL "Enable GLOO when compiling WITH_DISTRIBUTE=ON." FORCE)
  endif()
  if(WITH_ASCEND_CL)
    # Disable WITH_PSCORE for NPU before including third_party.
    # Use lowercase message() for consistency with the rest of the file.
    message(WARNING "Disable WITH_PSCORE when compiling with NPU. Force WITH_PSCORE=OFF.")
    set(WITH_PSCORE OFF CACHE BOOL "Disable WITH_PSCORE when compiling with NPU" FORCE)
  endif()
endif()

include(third_party) # download, build, install third_party, Contains about 20+ dependencies

include(flags) # set paddle compile flags
Expand All @@ -322,12 +335,6 @@ if(WITH_PROFILER)
add_definitions(-DWITH_GPERFTOOLS)
endif()

# When building with distributed support on Linux, force-enable GLOO
# (the CPU collective-communication backend). Note this runs after
# include(third_party) here, which is why the commit moves it earlier.
if(WITH_DISTRIBUTE)
if(LINUX)
set(WITH_GLOO ON CACHE STRING "Enable GLOO when compiling WITH_DISTRIBUTE=ON." FORCE)
endif()
endif()

include(ccache) # set ccache for compilation
include(util) # set unittest and link libs
include(version) # set PADDLE_VERSION
Expand Down
2 changes: 1 addition & 1 deletion cmake/ccache.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ elseif("${CMAKE_GENERATOR}" STREQUAL "Ninja")

if(SCCACHE_PATH)
execute_process(COMMAND sccache -V OUTPUT_VARIABLE sccache_version)
message(STATUS "${sccache_version} is founded, use [${SCCACHE_PATH}] to speed up compile on Windows.")
message(STATUS "sccache is founded, use [${SCCACHE_PATH}] to speed up compile on Windows.")

set(CMAKE_C_COMPILER_LAUNCHER ${SCCACHE_PATH})
set(CMAKE_CXX_COMPILER_LAUNCHER ${SCCACHE_PATH})
Expand Down
4 changes: 2 additions & 2 deletions paddle/fluid/framework/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -197,14 +197,14 @@ cc_test(operator_exception_test SRCS operator_exception_test.cc DEPS operator op
cc_library(version SRCS version.cc)
cc_test(version_test SRCS version_test.cc DEPS version)

cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS attribute shape_inference op_info operator glog version)

cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)

cc_library(op_call_stack SRCS op_call_stack.cc DEPS op_proto_maker enforce)
cc_test(op_call_stack_test SRCS op_call_stack_test.cc DEPS op_call_stack)

cc_library(program_processing SRCS program_processing.cc DEPS framework_proto)
cc_library(program_processing SRCS program_processing.cc DEPS boost proto_desc)
cc_test(program_processing_test SRCS program_processing_test.cc DEPS proto_desc program_processing)

if(WITH_GPU)
Expand Down
2 changes: 2 additions & 0 deletions paddle/fluid/framework/ir/fuse_pass_base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ void FusePassBase::AddStatis(int count_of_fused) const {
auto& info =
graph_->Get<std::unordered_map<std::string, int>>(kFuseStatisAttr);
info[repr_] = count_of_fused;
if (count_of_fused > 0)
LOG(INFO) << "--- detected " << count_of_fused << " subgraphs";
}

FuseOptions FusePassBase::FindFuseOption(const Node& node1,
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/framework/ir/graph_pattern_detector.cc
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ void GraphPatternDetector::operator()(Graph *graph,
ValidateByNodeRole(&subgraphs);

if (subgraphs.empty()) return;
LOG(INFO) << "--- detected " << subgraphs.size() << " subgraphs";

int id = 0;
for (auto &g : subgraphs) {
VLOG(3) << "optimizing #" << id++ << " subgraph";
Expand Down
9 changes: 4 additions & 5 deletions paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc
Original file line number Diff line number Diff line change
Expand Up @@ -191,11 +191,6 @@ void MapMatmul2MulPass::ApplyImpl(ir::Graph* graph) const {
int found_count = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
if (!IsCompat(subgraph, g)) {
LOG(WARNING) << "Pass in op compat failed.";
return;
}

VLOG(4) << "map matmul to mul";
GET_IR_NODE_FROM_SUBGRAPH(matmul_in_x, matmul_in_x, matmul_pattern);
GET_IR_NODE_FROM_SUBGRAPH(matmul_in_y, matmul_in_y, matmul_pattern);
Expand All @@ -221,6 +216,10 @@ void MapMatmul2MulPass::ApplyImpl(ir::Graph* graph) const {
next_ops[0]->Name() == "elementwise_add";

if (flag) {
if (!IsCompat(subgraph, g)) {
LOG(WARNING) << "Pass in op compat failed.";
return;
}
OpDesc desc;
desc.SetType("mul");
desc.SetInput("X", {matmul_in_x->Name()});
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ ConvBiasFusePass::ConvBiasFusePass() {
.IsTensor()
.End()
.AddAttr("axis")
.IsIntIn({-1, 0})
.IsIntIn({1, 3})
.End();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
op->SetInput("Bias", {});
} else if (type == "elementwise_add") {
op->SetAttr("use_mkldnn", true);
op->SetAttr("axis", -1);
op->SetAttr("axis", 1);
op->SetInput("X", {inputs[0]});
op->SetInput("Y", {inputs[1]});
op->SetOutput("Out", outputs);
Expand Down
1 change: 1 addition & 0 deletions paddle/fluid/inference/api/analysis_predictor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1255,6 +1255,7 @@ USE_TRT_CONVERTER(nearest_interp);
USE_TRT_CONVERTER(reshape);
USE_TRT_CONVERTER(reduce_sum);
USE_TRT_CONVERTER(gather_nd);
USE_TRT_CONVERTER(reduce_mean);
#endif

namespace paddle_infer {
Expand Down
34 changes: 26 additions & 8 deletions paddle/fluid/inference/tensorrt/convert/reduce_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,18 @@ namespace paddle {
namespace inference {
namespace tensorrt {

class ReduceSumOpConverter : public OpConverter {
class ReduceOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
VLOG(4) << "convert a paddle reduce_sum op to tensorrt reduce layer";
VLOG(4) << "convert a paddle " << op_type << " op to tensorrt reduce layer";
framework::OpDesc op_desc(op, nullptr);
nvinfer1::ReduceOperation reduce_type;
if (op_type == "reduce_sum") {
reduce_type = nvinfer1::ReduceOperation::kSUM;
} else if (op_type == "reduce_mean") {
reduce_type = nvinfer1::ReduceOperation::kAVG;
}

auto* x = engine_->GetITensor(op_desc.Input("X").front());
nvinfer1::Dims input_shape = x->getDimensions();
Expand All @@ -51,15 +57,13 @@ class ReduceSumOpConverter : public OpConverter {
BOOST_GET_CONST(std::vector<int32_t>, op_desc.GetAttr("dim"));
bool reduce_all = BOOST_GET_CONST(bool, op_desc.GetAttr("reduce_all"));

// Now we only support dynamic_shape mode.
nvinfer1::IReduceLayer* layer = nullptr;
if (reduce_all) {
uint32_t reduce_dim = 0;
for (int i = 0; i < input_dims; ++i) {
reduce_dim |= 1 << i;
}
layer = TRT_ENGINE_ADD_LAYER(engine_, Reduce, *x,
nvinfer1::ReduceOperation::kSUM, reduce_dim,
layer = TRT_ENGINE_ADD_LAYER(engine_, Reduce, *x, reduce_type, reduce_dim,
keep_dim);
} else {
auto CvtToBitMask = [&](const std::vector<int32_t>& dims) -> uint32_t {
Expand All @@ -68,23 +72,37 @@ class ReduceSumOpConverter : public OpConverter {
if (x < 0) {
res |= 1 << (x + input_dims);
} else {
if (!engine_->with_dynamic_shape()) x = x - 1;
res |= 1 << x;
}
}
return res;
};
layer = TRT_ENGINE_ADD_LAYER(engine_, Reduce, *x,
nvinfer1::ReduceOperation::kSUM,
layer = TRT_ENGINE_ADD_LAYER(engine_, Reduce, *x, reduce_type,
CvtToBitMask(dim), keep_dim);
}

auto output_name = op_desc.Output("Out")[0];
RreplenishLayerAndOutput(layer, "reduce_sum", {output_name}, test_mode);
RreplenishLayerAndOutput(layer, op_type, {output_name}, test_mode);
}

protected:
std::string op_type;
};

// Converter for the Paddle "reduce_sum" op; the base class maps this
// op_type to nvinfer1::ReduceOperation::kSUM when building the TRT layer.
class ReduceSumOpConverter : public ReduceOpConverter {
public:
ReduceSumOpConverter() { op_type = "reduce_sum"; }
};

// Converter for the Paddle "reduce_mean" op; the base class maps this
// op_type to nvinfer1::ReduceOperation::kAVG when building the TRT layer.
class ReduceMeanOpConverter : public ReduceOpConverter {
public:
ReduceMeanOpConverter() { op_type = "reduce_mean"; }
};

} // namespace tensorrt
} // namespace inference
} // namespace paddle

REGISTER_TRT_OP_CONVERTER(reduce_sum, ReduceSumOpConverter);
REGISTER_TRT_OP_CONVERTER(reduce_mean, ReduceMeanOpConverter);
11 changes: 9 additions & 2 deletions paddle/fluid/inference/tensorrt/helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,12 @@ namespace tensorrt {
NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + \
NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD

// On TensorRT >= 8.0 the ILogger/IProfiler virtual interfaces are declared
// noexcept, so overrides (e.g. NaiveLogger::log below) must match; on older
// versions the specifier must be absent. TRT_NOEXCEPT abstracts the two.
#if IS_TRT_VERSION_GE(8000)
#define TRT_NOEXCEPT noexcept
#else
#define TRT_NOEXCEPT
#endif

namespace dy = paddle::platform::dynload;

// TensorRT data type to size
Expand Down Expand Up @@ -72,7 +78,8 @@ static int GetInferLibVersion() {
// A logger for create TensorRT infer builder.
class NaiveLogger : public nvinfer1::ILogger {
public:
void log(nvinfer1::ILogger::Severity severity, const char* msg) override {
void log(nvinfer1::ILogger::Severity severity,
const char* msg) TRT_NOEXCEPT override {
switch (severity) {
case Severity::kVERBOSE:
VLOG(3) << msg;
Expand Down Expand Up @@ -105,7 +112,7 @@ class NaiveProfiler : public nvinfer1::IProfiler {
typedef std::pair<std::string, float> Record;
std::vector<Record> mProfile;

virtual void reportLayerTime(const char* layerName, float ms) {
virtual void reportLayerTime(const char* layerName, float ms) TRT_NOEXCEPT {
auto record =
std::find_if(mProfile.begin(), mProfile.end(),
[&](const Record& r) { return r.first == layerName; });
Expand Down
21 changes: 14 additions & 7 deletions paddle/fluid/inference/tensorrt/op_teller.cc
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,7 @@ struct SimpleOpTypeSetTeller : public Teller {
"nearest_interp",
"anchor_generator",
"reduce_sum",
"reduce_mean",
};
};

Expand Down Expand Up @@ -709,18 +710,24 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
if (!with_dynamic_shape && shape[0] == -1) return false;
}

if (op_type == "reduce_sum") {
if (!with_dynamic_shape) {
VLOG(3) << "the reduce_sum does not support static shape yet";
return false;
}

if (op_type == "reduce_sum" || op_type == "reduce_mean") {
if (!(desc.HasAttr("keep_dim") && desc.HasAttr("dim") &&
desc.HasAttr("reduce_all"))) {
VLOG(3) << "the reduce_sum does not have attr (keep_dim or dim or "
VLOG(3) << "the " << op_type
<< " does not have attr (keep_dim or dim or "
"reduce_all)";
return false;
}

// The batch size dimension cannot be reduced if it's not dynamic shape.
if (!with_dynamic_shape) {
if (desc.HasAttr("reduce_all")) return false;
std::vector<int32_t> dim =
BOOST_GET_CONST(std::vector<int32_t>, desc.GetAttr("dim"));
for (auto x : dim) {
if (!x) return false;
}
}
}

if ((*teller)(op_type, desc, use_no_calib_int8)) return true;
Expand Down
Loading

0 comments on commit ae19cc0

Please sign in to comment.