Commit 004fd3e

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into llama_ops

cyber-pioneer committed Oct 12, 2023
2 parents 9d19f03 + f04f6ee commit 004fd3e
Showing 158 changed files with 2,924 additions and 1,868 deletions.
2 changes: 1 addition & 1 deletion .clang-tidy
@@ -189,7 +189,7 @@ modernize-use-override,
modernize-use-transparent-functors,
-modernize-use-uncaught-exceptions,
performance-faster-string-find,
-performance-for-range-copy,
performance-for-range-copy,
-performance-implicit-conversion-in-loop,
-performance-inefficient-algorithm,
performance-inefficient-string-concatenation,
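Note: this hunk turns on the previously disabled `performance-for-range-copy` clang-tidy check (the leading `-` in the checks list means "disabled"). As a rough, hypothetical sketch of what the check flags — iterating a container by value copies every element, while binding by `const&` does not:

```cpp
#include <string>
#include <vector>

void PrintLengths(const std::vector<std::string>& names) {
  // Flagged by performance-for-range-copy: `name` is a fresh copy each iteration.
  for (std::string name : names) {
    (void)name.size();
  }
  // Preferred form: read-only access through a const reference, no copies.
  for (const std::string& name : names) {
    (void)name.size();
  }
}
```

Most of the `for (... const& ...)` changes later in this diff are fixes for exactly this diagnostic.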
8 changes: 8 additions & 0 deletions cmake/third_party.cmake
@@ -247,6 +247,14 @@ if(NOT DEFINED WITH_MKLDNN)
endif()
endif()

if(WIN32)
if(MSVC)
if(MSVC_VERSION LESS 1920)
set(WITH_MKLDNN OFF)
endif()
endif()
endif()

if(WIN32
OR APPLE
OR NOT WITH_GPU
4 changes: 2 additions & 2 deletions paddle/fluid/distributed/auto_parallel/dist_attr.cc
@@ -83,7 +83,7 @@ OperatorDistAttr& OperatorDistAttr::operator=(

void OperatorDistAttr::initialize(const OpDesc* op) {
if (op == nullptr) return;
for (std::string name : op->InputArgumentNames()) {
for (std::string const& name : op->InputArgumentNames()) {
VarDesc* input = op->Block()->FindVarRecursive(name);
VLOG(4) << "[OperatorDistAttr create input dist attr] " << name;
if (input == nullptr || op->Type() == "create_py_reader") {
@@ -92,7 +92,7 @@ void OperatorDistAttr::initialize(const OpDesc* op) {
input_dist_attrs_[name] = TensorDistAttr(get_tensor_shape(input));
}
}
for (std::string name : op->OutputArgumentNames()) {
for (std::string const& name : op->OutputArgumentNames()) {
VarDesc* output = op->Block()->FindVarRecursive(name);
VLOG(4) << "[OperatorDistAttr create output dist attr] " << name;
if (output == nullptr) {
7 changes: 7 additions & 0 deletions paddle/fluid/distributed/collective/reducer.cc
@@ -985,6 +985,7 @@ void EagerReducer::ProcessUnusedDenseVars() {
opts.reduce_op = ReduceOp::SUM;
std::vector<Tensor> reduce_tensors = {global_used_vars_};
std::vector<phi::DenseTensor> in_out;
in_out.reserve(reduce_tensors.size());
for (auto &t : reduce_tensors) {
in_out.push_back(*std::dynamic_pointer_cast<phi::DenseTensor>(t.impl()));
}
@@ -1081,6 +1082,7 @@ void EagerReducer::FusedAllReduceSchedule(EagerGroup *group,
// all_reduce
std::vector<Tensor> reduce_tensors = {group->dense_contents_};
std::vector<phi::DenseTensor> in_out;
in_out.reserve(reduce_tensors.size());
for (auto &t : reduce_tensors) {
in_out.push_back(*std::dynamic_pointer_cast<phi::DenseTensor>(t.impl()));
}
@@ -1166,6 +1168,7 @@ void EagerReducer::AllReduceSparse(EagerGroup *group,
opts.reduce_op = ReduceOp::SUM;
std::vector<Tensor> reduce_tensors = {rows_num_tensor};
std::vector<phi::DenseTensor> in_out;
in_out.reserve(reduce_tensors.size());
for (auto &t : reduce_tensors) {
in_out.push_back(*std::dynamic_pointer_cast<phi::DenseTensor>(t.impl()));
}
@@ -1214,6 +1217,8 @@ void EagerReducer::AllReduceSparse(EagerGroup *group,
std::vector<Tensor> dst_rows_tensors = {dst_rows_tensor};
std::vector<phi::DenseTensor> in;
std::vector<phi::DenseTensor> out;
in.reserve(src_rows_tensors.size());
out.reserve(dst_rows_tensors.size());
for (auto &t : src_rows_tensors) {
in.push_back(*std::dynamic_pointer_cast<phi::DenseTensor>(t.impl()));
}
@@ -1245,6 +1250,8 @@ void EagerReducer::AllReduceSparse(EagerGroup *group,
std::vector<Tensor> dst_value_tensors = {dst_value_tensor};
std::vector<phi::DenseTensor> src_dense;
std::vector<phi::DenseTensor> dst_dense;
src_dense.reserve(src_value_tensors.size());
dst_dense.reserve(dst_value_tensors.size());
for (auto &t : src_value_tensors) {
src_dense.push_back(
*std::dynamic_pointer_cast<phi::DenseTensor>(t.impl()));
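The `reserve()` calls added throughout reducer.cc follow the usual capacity pattern: when the number of elements to be pushed is known up front, reserving once avoids repeated vector reallocation (and the element moves that come with it). A minimal sketch with hypothetical names, not Paddle code:

```cpp
#include <vector>

std::vector<int> Squares(const std::vector<int>& values) {
  std::vector<int> out;
  out.reserve(values.size());  // single allocation instead of repeated growth
  for (int v : values) {
    out.push_back(v * v);
  }
  return out;
}
```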
3 changes: 2 additions & 1 deletion paddle/fluid/distributed/fleet_executor/carrier.cc
@@ -121,7 +121,7 @@ void Carrier::CopyParameters(
const framework::ProgramDesc& program,
const std::vector<std::string>& inference_root_scope_vars) {
std::map<std::string, int> inference_root_scope_var_map;
for (auto var_name : inference_root_scope_vars) {
for (auto const& var_name : inference_root_scope_vars) {
inference_root_scope_var_map.insert({var_name, 1});
}
for (size_t i = 0; i < program.Size(); ++i) {
@@ -392,6 +392,7 @@ void Carrier::CreateInterceptors(
}
}

cores.reserve(microbatch_scopes_.size());
for (framework::Scope* scope : microbatch_scopes_) {
cores.push_back(std::make_shared<InterpreterCore>(
place_, task_node->program()->Block(0), scope, execution_config));
@@ -85,7 +85,7 @@ InterceptorMessage ComputeInterceptor::PrepareVarsMsg() {
ready_msg.set_message_type(DATA_WITH_VARS);
ready_msg.set_scope_idx(cur_scope_id_);
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
for (auto iter : node_->vars_to_dtype()) {
for (auto const& iter : node_->vars_to_dtype()) {
VarList* vars = ready_msg.add_vars_list();
const auto& var_name = iter.first;
vars->set_name(var_name);
2 changes: 1 addition & 1 deletion paddle/fluid/distributed/fleet_executor/dist_model.cc
@@ -295,7 +295,7 @@ void DistModel::InsertCommOp(std::string tmp_var_name,
<< ". The ring id is: " << ring_id << ". The group has: " << nranks
<< " ranks. Current rank in the group is: " << rank
<< ". The endpoint is: " << endpoint << ". Peer endpoints are: ";
for (auto ep : peer_endpoints) {
for (const auto &ep : peer_endpoints) {
ss << ep << ", ";
}
VLOG(3) << ss.str();
4 changes: 2 additions & 2 deletions paddle/fluid/distributed/fleet_executor/fleet_executor.cc
@@ -82,7 +82,7 @@ void PreventVarsDelete(
for (const auto& pair : *unused_vars) {
const framework::OperatorBase* op = pair.first;
std::vector<std::string> cur_unused = pair.second;
for (auto name : vars_not_gc) {
for (auto const& name : vars_not_gc) {
auto iter = std::find(cur_unused.begin(), cur_unused.end(), name);
if (iter != cur_unused.end()) {
VLOG(3) << "Removing var: [" << name
@@ -165,7 +165,7 @@ void FleetExecutor::Init(
while_block_vars = GetUnusedVarsAfterWhile(
program_desc, task_node, inference_root_scope_vars);
VLOG(3) << "Vars will be gced after while op";
for (auto var : while_block_vars) {
for (auto const& var : while_block_vars) {
VLOG(3) << var;
}
task_node->SetWhileBlockVars(while_block_vars);
22 changes: 11 additions & 11 deletions paddle/fluid/eager/auto_code_generator/eager_generator.cc
@@ -410,7 +410,7 @@ static std::pair<std::string, std::string> GetAttrType(
ret = "std::vector<std::string>";
if (is_arg) ret += "&";
val += "{";
for (auto x : PADDLE_GET_CONST(std::vector<std::string>, attr)) {
for (auto const& x : PADDLE_GET_CONST(std::vector<std::string>, attr)) {
val += "\"" + x + "\"" + ",";
}
if (val.size() > 1) val.pop_back();
@@ -1238,7 +1238,7 @@ static std::string GenerateGradNodeCreationContent(
bool found_target_name = false;
for (const auto& iter : op_base_infos) {
const auto& grad_outs_slot_map = iter.GetGradOutsSlotnameMap();
for (auto iter : grad_outs_slot_map) {
for (auto const& iter : grad_outs_slot_map) {
if ((!found_target_name) && (input_name == iter.second)) {
const char* SET_GRAD_OUT_META_TEMPLATE =
" grad_node->SetGradOutMeta(%s, %d);\n";
@@ -1256,7 +1256,7 @@ static std::string GenerateGradNodeCreationContent(
bool found_target_name = false;
for (const auto& iter : op_base_infos) {
const auto& grad_outs_slot_map = iter.GetGradOutsSlotnameMap();
for (auto iter : grad_outs_slot_map) {
for (auto const& iter : grad_outs_slot_map) {
if ((!found_target_name) && (input_name == iter.second)) {
const char* SET_GRAD_OUT_META_TEMPLATE =
" grad_node->SetGradOutMeta(%s, %d);\n";
@@ -2142,7 +2142,7 @@ static std::string GenerateSingleOpBase(
// [Generation] Get Full Zero
std::string fill_zero_str = "";
if (ops_to_fill_zero_for_empty_grads.count(fwd_op_type)) {
for (auto iter : grad_ins) {
for (auto const& iter : grad_ins) {
const std::string& grad_input_name = iter.first;
if (grad_ins_grad_slotname_map.count(grad_input_name)) {
size_t fwd_output_position = fwd_outputs_name_pos_map.at(
@@ -2189,7 +2189,7 @@ static std::string GenerateSingleOpBase(
"backward_inplace_tensor" + std::to_string(*outs_size);
bool process_backward_inplace = false;
std::string ins_contents_str = "";
for (auto iter : grad_ins) {
for (auto const& iter : grad_ins) {
const std::string& grad_input_name = iter.first;

if (grad_ins_fwd_slotname_map.count(grad_input_name)) {
@@ -2293,7 +2293,7 @@ static std::string GenerateSingleOpBase(
paddle::string::Sprintf(BWD_INS_MAP_TEMPLATE, ins_name, ins_contents_str);
generated_grad_function_body += ins_map_str;

for (auto iter : grad_ins) {
for (auto const& iter : grad_ins) {
const std::string& grad_input_name = iter.first;

if (grad_ins_fwd_slotname_map.count(grad_input_name)) {
@@ -2335,7 +2335,7 @@ static std::string GenerateSingleOpBase(
VLOG(6) << "Generated Ins Map";
// [Generation] Get Outs Map
std::string outs_contents_str = "";
for (auto iter : grad_outs) {
for (auto const& iter : grad_outs) {
const std::string& grad_output_name = iter.first;

if (grad_outs_slotname_map.count(grad_output_name)) {
@@ -2440,7 +2440,7 @@ static std::string GenerateSingleOpBase(
generated_grad_function_body += outs_map_str;
generated_grad_function_body += outs_contents_str;
generated_grad_function_body += "\n";
for (auto iter : grad_outs) {
for (auto const& iter : grad_outs) {
const std::string& grad_output_name = iter.first;

if (grad_outs_slotname_map.count(grad_output_name)) {
@@ -2498,7 +2498,7 @@ static std::string GenerateSingleOpBase(
"%s[\"%s\"][0]);\n"
" };\n";
std::string backward_inplace_map_str = "";
for (auto iter : backward_inplace_map) {
for (auto const& iter : backward_inplace_map) {
std::string backward_inplace_input_name = iter.first;
std::string backward_inplace_output_name = iter.second;
backward_inplace_map_str += paddle::string::Sprintf(
@@ -2553,7 +2553,7 @@ static std::string GenerateSingleOpBase(
// [Generation] Get Return
std::string outputs_str = "";
size_t num_appended_outputs = 0;
for (auto iter : grad_outs) {
for (auto const& iter : grad_outs) {
const std::string& grad_out_name = iter.first;
const std::string& fwd_name = grad_outs_slotname_map.at(grad_out_name);

@@ -2594,7 +2594,7 @@ static std::string GenerateSingleOpBase(

/* Handle Special Case: "PullSparseOp", etc
For returns, append "GradOut" to the very end of return list. */
for (auto iter : grad_outs) {
for (auto const& iter : grad_outs) {
const std::string& grad_out_name = iter.first;
const std::string& fwd_name = grad_outs_slotname_map.at(grad_out_name);

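The `grad_ins` / `grad_outs` loops above iterate over maps whose mapped values are containers, so taking the element by value copies a whole `std::pair` (key plus value) on every pass. Binding with `const auto&` keeps the traversal read-only and copy-free. A small illustrative sketch with hypothetical data, not the generator's actual types:

```cpp
#include <map>
#include <string>
#include <vector>

size_t TotalSlots(const std::map<std::string, std::vector<std::string>>& grad_slots) {
  size_t total = 0;
  // const auto& avoids copying each {name, slot-list} pair per iteration.
  for (const auto& kv : grad_slots) {
    total += kv.second.size();
  }
  return total;
}
```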
2 changes: 1 addition & 1 deletion paddle/fluid/eager/backward.cc
@@ -85,7 +85,7 @@ void EnforceGradNodeHasInput(GradNodeBase* node) {
void DuplicateCheck(const std::vector<paddle::Tensor>& inputs, bool is_input) {
std::unordered_set<AutogradMeta*> visisted_ins;
std::string msg = is_input ? "inputs" : "outputs";
for (auto in : inputs) {
for (auto const& in : inputs) {
AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(in);
PADDLE_ENFORCE_EQ(
visisted_ins.count(auto_grad_meta),
8 changes: 4 additions & 4 deletions paddle/fluid/eager/custom_operator/custom_operator_node.cc
@@ -190,12 +190,12 @@ RunCustomOpNode::operator()(paddle::small_vector<std::vector<paddle::Tensor>,
}
}

for (auto it : fwd_outs) {
for (auto it : fwd_outs) { // NOLINT
VLOG(7) << "Insert fwd_outs to grad_inputs: " << it.first;
tmp_ins[it.first] = RunCustomOpNode::Recover(&(it.second));
}

for (auto it : fwd_ins) {
for (auto it : fwd_ins) { // NOLINT
// NOTE(HongyuJia): returned tensor maybe un-defined tensor when inputs
// optional<Tensor>
VLOG(7) << "Insert fwd_ins to grad_inputs: " << it.first;
@@ -406,12 +406,12 @@ RunCustomOpDoubleGradNode::operator()(
}
}

for (auto it : fwd_outs) {
for (auto it : fwd_outs) { // NOLINT
VLOG(7) << "Insert fwd_outs to grad_inputs: " << it.first;
tmp_ins[it.first] = RunCustomOpDoubleGradNode::Recover(&(it.second));
}

for (auto it : fwd_ins) {
for (auto it : fwd_ins) { // NOLINT
VLOG(7) << "Insert fwd_ins to grad_inputs: " << it.first;
tmp_ins[it.first] = RunCustomOpDoubleGradNode::Recover(&(it.second));
}
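Here the by-value loops are kept and annotated with `// NOLINT` instead. A trailing `// NOLINT` comment suppresses clang-tidy diagnostics on that line, the usual escape hatch when the copy is deliberate — for example when a helper needs to mutate its own copy of the mapped value. A hedged sketch of the mechanism with made-up names (`Consume` is not a Paddle function):

```cpp
#include <map>
#include <string>
#include <vector>

// Hypothetical helper that mutates the container handed to it.
void Consume(std::vector<int>* values) { values->push_back(0); }

void Run(const std::map<std::string, std::vector<int>>& inputs) {
  // The copy is intentional: Consume() modifies `it.second`, and the map's
  // stored values must stay untouched, so the range-copy warning is silenced.
  for (auto it : inputs) {  // NOLINT
    Consume(&it.second);
  }
}
```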
9 changes: 9 additions & 0 deletions paddle/fluid/eager/grad_node_info.cc
@@ -277,6 +277,15 @@ void GradNodeBase::SetGradOutMeta(const paddle::Tensor& fwd_in,
meta.SetTensorMeta(dense_tensor.meta());
meta.SetPlace(fwd_in.place());
// Set DistAttr
PADDLE_ENFORCE_EQ(dist_tensor->defined(),
true,
phi::errors::InvalidArgument(
"The forward input DistTensor is not defined."));
PADDLE_ENFORCE_NE(
dist_tensor->dist_attr().empty(),
true,
phi::errors::InvalidArgument(
"The forward input DistTensor's dist attr is empty."));
meta.SetDistAttr(dist_tensor->dist_attr());
SetIsRunAutoParallel(true);
} else {
6 changes: 6 additions & 0 deletions paddle/fluid/eager/to_static/run_program_op_node.h
@@ -191,6 +191,12 @@ static auto GetNameFromValue(const ::pir::Block *block,
.dyn_cast<pir::StrAttribute>()
.AsString();
value2name[op->operand(0).source()] = name;
} else if (!is_input && op->name() == "builtin.shadow_output") {
name = op->attributes()
.at("output_name")
.dyn_cast<pir::StrAttribute>()
.AsString();
value2name[op->operand(0).source()] = name;
} else if (is_input && op->name() == "builtin.get_parameter") {
name = op->attributes()
.at("parameter_name")
32 changes: 32 additions & 0 deletions paddle/fluid/eager/utils.cc
@@ -28,6 +28,38 @@
#include "paddle/fluid/framework/variable.h"

namespace egr {

void SetGradOutputDistAttrIter::visit_element(paddle::Tensor* element,
const GradSlotMeta& meta) {
if (element == nullptr) {
VLOG(4) << "The input element is nullptr when calling "
"SetGradOutputDistAttrIter.";
return;
}
// Here the element is empty or defined DistTensor
VLOG(4) << "The input element is set DistTensor impl when calling "
"SetGradOutputDistAttrIter.";
element->set_impl(std::make_shared<phi::distributed::DistTensor>(
phi::DDim(), meta.DistAttr()));
}

void SetGradOutputDistAttrIter::visit(paddle::Tensor* element) {
if (!out_meta_[out_indexes_[cur_pos_]].empty()) {
visit_element(element, out_meta_[out_indexes_[cur_pos_]][0]);
}
cur_pos_++;
}

void SetGradOutputDistAttrIter::visit(
const std::vector<paddle::Tensor*>& elements) {
if (!out_meta_[out_indexes_[cur_pos_]].empty()) {
for (size_t i = 0; i < elements.size(); ++i) {
visit_element(elements.at(i), out_meta_[out_indexes_[cur_pos_]][i]);
}
}
cur_pos_++;
}

/**
* Implementation of Eager Utils.
**/
38 changes: 3 additions & 35 deletions paddle/fluid/eager/utils.h
@@ -97,41 +97,9 @@ class SetGradOutputDistAttrIter : public IterHelper<paddle::Tensor*> {
: out_meta_(out_meta), out_indexes_{out_indexes} {}

private:
void visit_element(paddle::Tensor* element, const GradSlotMeta& meta) {
if (element == nullptr) {
return;
}
if (meta.DistAttr().empty()) {
return;
}
if (element->defined()) {
if (element->is_dist_tensor()) {
PADDLE_THROW(phi::errors::Unimplemented(
"Unsupport set defined dist tensor now."));
} else {
// Only deal with dist tensor here
return;
}
} else {
element->set_impl(std::make_shared<phi::distributed::DistTensor>(
phi::DDim(), meta.DistAttr()));
}
}
void visit(paddle::Tensor* element) override {
if (!out_meta_[out_indexes_[cur_pos_]].empty()) {
visit_element(element, out_meta_[out_indexes_[cur_pos_]][0]);
}
cur_pos_++;
}

void visit(const std::vector<paddle::Tensor*>& elements) override {
if (!out_meta_[out_indexes_[cur_pos_]].empty()) {
for (size_t i = 0; i < elements.size(); ++i) {
visit_element(elements.at(i), out_meta_[out_indexes_[cur_pos_]][i]);
}
}
cur_pos_++;
}
void visit_element(paddle::Tensor* element, const GradSlotMeta& meta);
void visit(paddle::Tensor* element) override;
void visit(const std::vector<paddle::Tensor*>& elements) override;

const paddle::small_vector<std::vector<GradSlotMeta>, kSlotSmallVectorSize>&
out_meta_;
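Paired with the utils.cc hunk above, this change moves (and simplifies) the `SetGradOutputDistAttrIter` visitor bodies out of the header: the class now only declares `visit_element` and the two `visit` overloads, and their definitions live in a single translation unit, so edits to the implementation no longer force every includer to recompile. A minimal sketch of the same header/source split, using a hypothetical `Widget` class rather than the Paddle one:

```cpp
// widget.h — declaration only; callers see the interface, not the body.
class Widget {
 public:
  void Process(int value);

 private:
  int total_ = 0;
};

// widget.cc — out-of-line definition; changing it does not touch widget.h.
#include "widget.h"

void Widget::Process(int value) { total_ += value; }
```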