[AutoParallel] Adapt static spmd rules for dynamic graph (#56367)
* move matmul spmd rules into phi

* add basic infer spmd utils

* add spmd factory

* fix compile error

* add unittest

* refine infer spmd test and utils

* debug infer spmd test

* adapt python test

* polish details

* change to vector attr arg

* revert needless change

* update matmul spmd rule test

* remove original rule

* polish details

* fix macro error

* add comment

* pass backward test

* fix compile error

* add cmake rule for spmd_rules_test

* add dist meta tensor

* update pybind impl

* add macro for rules
chenwhql authored Aug 31, 2023
1 parent dbc9e5a commit 54fcd9a
Showing 21 changed files with 1,022 additions and 189 deletions.
4 changes: 0 additions & 4 deletions paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h
@@ -19,7 +19,6 @@
#include "paddle/fluid/distributed/auto_parallel/spmd_rules/elementwise_spmd_rule.h"
#include "paddle/fluid/distributed/auto_parallel/spmd_rules/embedding_spmd_rule.h"
#include "paddle/fluid/distributed/auto_parallel/spmd_rules/layer_norm_spmd_rule.h"
#include "paddle/fluid/distributed/auto_parallel/spmd_rules/matmul_spmd_rule.h"
#include "paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.h"
#include "paddle/fluid/distributed/auto_parallel/spmd_rules/replicated_spmd_rule.h"
#include "paddle/fluid/distributed/auto_parallel/spmd_rules/reshape_spmd_rule.h"
@@ -32,9 +31,6 @@ namespace paddle {
namespace distributed {
namespace auto_parallel {

// matmul rule
REGISTER_SPMD_RULE(matmul, MatmulSPMDRule);

// reduction rules
REGISTER_SPMD_RULE(all, ReductionSPMDRule);
REGISTER_SPMD_RULE(amax, ReductionSPMDRule);
51 changes: 51 additions & 0 deletions paddle/fluid/pybind/auto_parallel_py.cc
@@ -15,13 +15,16 @@
#include <pybind11/operators.h>
#include <pybind11/stl.h>

#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/pybind/auto_parallel_py.h"
#include "paddle/fluid/pybind/pybind_variant_caster.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/distributed/auto_parallel/device_mesh.h"
#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
#include "paddle/phi/core/distributed/auto_parallel/dist_mapper.h"
#include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h"
#include "paddle/phi/core/distributed/auto_parallel/process_mesh.h"
#include "paddle/utils/optional.h"
#include "paddle/utils/pybind.h"
@@ -32,6 +35,10 @@
#include "paddle/phi/core/distributed/auto_parallel/r_to_s_reshard_function.h"
#include "paddle/phi/core/distributed/auto_parallel/s_to_r_reshard_function.h"

#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/phi/infermeta/spmd_rules/rules.h"
#endif

namespace py = pybind11;

namespace paddle {
@@ -42,6 +49,7 @@ using paddle::distributed::auto_parallel::kDefault;
using paddle::distributed::auto_parallel::OperatorDistAttr;
using paddle::distributed::auto_parallel::SPMDRuleBase;
using paddle::distributed::auto_parallel::SPMDRuleMap;
using paddle::framework::BlockDesc;
using paddle::framework::OpDesc;
using paddle::framework::VarDesc;
using phi::distributed::ProcessMesh;
@@ -343,6 +351,41 @@ void BindAutoParallel(py::module *m) {
&SPMDRuleBase::InferBackward));
// .def("infer_backward", &SPMDRuleBase::InferBackward) [revert in future]

py::class_<phi::distributed::SpmdRule>(*m, "SpmdRule")
.def("infer_forward",
[](const phi::distributed::SpmdRule &self,
const std::vector<DistTensorSpec> &input_specs,
const std::vector<phi::Attribute> &attrs) {
phi::distributed::InferSpmdContext ctx;
for (auto &spec : input_specs) {
ctx.EmplaceBackInput(phi::distributed::DistMetaTensor(
phi::make_ddim(spec.shape()), spec.dist_attr()));
}
for (auto &attr : attrs) {
ctx.EmplaceBackAttr(attr);
}
return self.InferForward(ctx);
})
.def("infer_backward",
[](const phi::distributed::SpmdRule &self,
const std::vector<DistTensorSpec> &input_specs,
const std::vector<DistTensorSpec> &output_specs,
const std::vector<phi::Attribute> &attrs) {
phi::distributed::InferSpmdContext ctx;
for (auto &spec : input_specs) {
ctx.EmplaceBackInput(phi::distributed::DistMetaTensor(
phi::make_ddim(spec.shape()), spec.dist_attr()));
}
for (auto &spec : output_specs) {
ctx.EmplaceBackInput(phi::distributed::DistMetaTensor(
phi::make_ddim(spec.shape()), spec.dist_attr()));
}
for (auto &attr : attrs) {
ctx.EmplaceBackAttr(attr);
}
return self.InferBackward(ctx);
});

py::class_<DistTensorSpec>(*m, "DistTensorSpec")
.def(py::init<>())
.def(py::init<const DistTensorSpec &>())
@@ -472,6 +515,14 @@ void BindAutoParallel(py::module *m) {
},
py::return_value_policy::reference);

m->def(
"get_phi_spmd_rule",
[](const std::string op_type) {
return phi::distributed::SpmdRuleFactory::Instance().GetSpmdRule(
op_type);
},
py::return_value_policy::reference);

// TODO(liuzhenhai): DistributedMapper is not used for now, but
// dist_mapper_test needs the symbols for DistributedMapper to be linked;
// remove it later
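
For context, a minimal C++ sketch of the path the new bindings wrap (not part of the commit): look up the phi-side rule via SpmdRuleFactory, build an InferSpmdContext from DistMetaTensor inputs and attributes, and run forward derivation. It assumes a build with PADDLE_WITH_DISTRIBUTE and that a rule is registered under the name "matmul"; the mesh layout, shapes, and attribute values are illustrative only.

// Illustrative usage sketch (not part of the commit).
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h"
#include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h"
#include "paddle/phi/core/distributed/auto_parallel/process_mesh.h"

void InferMatmulSpmdExample() {
  using namespace phi::distributed;

  // A 2x2 process mesh with axes "x" and "y".
  ProcessMesh mesh({2, 2}, {0, 1, 2, 3}, {"x", "y"});

  // x: [64, 32], dim 0 sharded along mesh axis 0; y: [32, 48], replicated.
  TensorDistAttr x_attr;
  x_attr.set_process_mesh(mesh);
  x_attr.set_dims_mapping({0, -1});
  TensorDistAttr y_attr;
  y_attr.set_process_mesh(mesh);
  y_attr.set_dims_mapping({-1, -1});

  // Build the context the same way the pybind lambdas above do.
  InferSpmdContext ctx;
  ctx.EmplaceBackInput(DistMetaTensor(phi::make_ddim({64, 32}), x_attr));
  ctx.EmplaceBackInput(DistMetaTensor(phi::make_ddim({32, 48}), y_attr));
  ctx.EmplaceBackAttr(false);  // trans_x
  ctx.EmplaceBackAttr(false);  // trans_y

  // Look up the phi-side rule and run forward derivation; the result holds
  // the inferred dist_attrs for inputs and outputs.
  const auto& rule = SpmdRuleFactory::Instance().GetSpmdRule("matmul");
  auto spmd_info = rule.InferForward(ctx);
  (void)spmd_info;
}
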
2 changes: 2 additions & 0 deletions paddle/phi/core/distributed/auto_parallel/CMakeLists.txt
@@ -9,6 +9,8 @@ collect_srcs(
dist_mapper.cc
reshard_utils.cc
dist_tensor.cc
dist_meta_tensor.cc
inferspmd_utils.cc
reshard_function.cc
reshard_split_functor.cc
reshard_concat_functor.cc
51 changes: 51 additions & 0 deletions paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.cc
@@ -0,0 +1,51 @@
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h"

#include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"

namespace phi {
namespace distributed {

phi::DDim DistMetaTensor::dims() const {
// member values in tensor_ have higher priority than those in DistMetaTensor
if (tensor_ != nullptr) {
PADDLE_ENFORCE_EQ(this->is_dist(),
true,
phi::errors::InvalidArgument(
"The current MetaTensor doesn't contain a "
"DistTensor when calling the `dims` method."));
return MetaTensor::dims();
} else {
return dims_;
}
}

const distributed::TensorDistAttr& DistMetaTensor::dist_attr() const {
// member values in tensor_ have higher priority than those in DistMetaTensor
if (tensor_ != nullptr) {
PADDLE_ENFORCE_EQ(this->is_dist(),
true,
phi::errors::InvalidArgument(
"The current MetaTensor doesn't contain a "
"DistTensor when calling the `dist_attr` method."));
return static_cast<phi::distributed::DistTensor*>(tensor_)->dist_attr();
} else {
return dist_attr_;
}
}

} // namespace distributed
} // namespace phi
68 changes: 68 additions & 0 deletions paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h
@@ -0,0 +1,68 @@
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
#include "paddle/phi/core/meta_tensor.h"

namespace phi {
namespace distributed {

class DistMetaTensor : public MetaTensor {
public:
// implicit construction is supported to make the class easier to use
DistMetaTensor(TensorBase* tensor) // NOLINT
: MetaTensor(tensor) {}
DistMetaTensor(const TensorBase& tensor) // NOLINT
: MetaTensor(tensor) {}
DistMetaTensor(const TensorBase* tensor) // NOLINT
: MetaTensor(tensor) {}
DistMetaTensor(TensorBase& tensor) // NOLINT
: MetaTensor(tensor) {}
// For static mode only
DistMetaTensor(const phi::DDim& dims, const TensorDistAttr& dist_attr)
: dims_(dims), dist_attr_(dist_attr) {}

DistMetaTensor(DistMetaTensor&&) = default;
DistMetaTensor& operator=(DistMetaTensor&&) = default;
DistMetaTensor(const DistMetaTensor&) = default;
DistMetaTensor& operator=(const DistMetaTensor&) = default;

virtual ~DistMetaTensor() = default;

DDim dims() const override;

const distributed::TensorDistAttr& dist_attr() const;

private:
/**
* Note: When the static graph's semi-auto parallel SPMD derivation rules are
* reused, DistMetaTensor inherits from MetaTensor to make packing the rules'
* input arguments convenient, and the members dims_ and dist_attr_ are added.
*
* The information in these two members also exists in the tensor_ held by
* the base MetaTensor, so there is some redundancy; callers must take care
* to keep the two sources consistent.
*
* These two members are read-only: their values cannot be changed after
* construction. To change them, set them directly on tensor_. */
phi::DDim dims_;
TensorDistAttr dist_attr_;
};

} // namespace distributed
} // namespace phi
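
To make the note above concrete, a short sketch of the static-mode path (not part of the commit): when a DistMetaTensor is built from an explicit shape and TensorDistAttr, no TensorBase is attached, so dims() and dist_attr() fall back to the stored members. Shapes and attribute values are illustrative.

// Illustrative sketch (not part of the commit): static-mode construction,
// where tensor_ is null and the redundant dims_/dist_attr_ members are read.
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h"

void StaticModeDistMetaTensorExample() {
  phi::distributed::TensorDistAttr attr;  // default dist attr, no sharding set
  phi::distributed::DistMetaTensor meta(phi::make_ddim({8, 16}), attr);

  // No wrapped DistTensor here, so both accessors return the stored members.
  phi::DDim dims = meta.dims();                                  // [8, 16]
  const phi::distributed::TensorDistAttr& a = meta.dist_attr();  // == attr
  (void)dims;
  (void)a;
}
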
97 changes: 97 additions & 0 deletions paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc
@@ -0,0 +1,97 @@
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h"

namespace phi {
namespace distributed {

void InferSpmdContext::EmplaceBackInput(DistMetaTensor input) {
inputs_.emplace_back(std::move(input));
}

void InferSpmdContext::EmplaceBackAttr(Attribute attr) {
attrs_.emplace_back(std::move(attr));
}

const DistMetaTensor& InferSpmdContext::InputAt(size_t idx) const {
return inputs_.at(idx);
}

template <typename AttrType>
AttrType InferSpmdContext::AttrAt(size_t idx) const {
try {
return paddle::get<AttrType>(attrs_.at(idx));
} catch (paddle::bad_variant_access const& e) {
PADDLE_THROW(phi::errors::InvalidArgument(
"Attribute cast error in InferSpmd Context, the input attr type is "
"`%s`, but the expected attribute type is `%s`.",
attrs_.at(idx).type().name(),
std::type_index(typeid(AttrType)).name()));
}
}

template <>
bool InferSpmdContext::AttrAt<bool>(size_t idx) const {
try {
auto attr = attrs_.at(idx);
if (attr.type() == typeid(int)) {
return static_cast<bool>(paddle::get<int>(attr));
} else {
return paddle::get<bool>(attr);
}
} catch (paddle::bad_variant_access const& e) {
PADDLE_THROW(phi::errors::InvalidArgument(
"Attribute cast error in InferSpmd Context, the input attr type is "
"`%s`, but the expected attribute type is `bool`.",
attrs_.at(idx).type().name()));
}
}

const Attribute& InferSpmdContext::AttrAt(size_t idx) const {
return attrs_.at(idx);
}

SpmdRuleFactory& SpmdRuleFactory::Instance() {
static SpmdRuleFactory g_spmd_rule_map;
return g_spmd_rule_map;
}

bool SpmdRuleFactory::ContainsSpmdRule(const std::string& kernel_name) const {
return spmd_rule_map_.count(kernel_name) > 0;
}

int SpmdRuleFactory::InsertSpmdRule(std::string kernel_name, SpmdRule rule) {
PADDLE_ENFORCE_NE(
ContainsSpmdRule(kernel_name),
true,
phi::errors::AlreadyExists(
"`%s` Kernel's Spmd rule has already been registered.", kernel_name));
spmd_rule_map_.insert({std::move(kernel_name), std::move(rule)});
return 0;
}

const SpmdRule& SpmdRuleFactory::GetSpmdRule(
const std::string& kernel_name) const {
auto it = spmd_rule_map_.find(kernel_name);
PADDLE_ENFORCE_NE(
it,
spmd_rule_map_.end(),
phi::errors::NotFound("`%s` Kernel's Spmd rule is not registered.",
kernel_name));
return it->second;
}

} // namespace distributed
} // namespace phi
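
The bool specialization of AttrAt above exists because flags forwarded from the Python side may arrive stored as int. A small sketch of that coercion follows (not part of the commit; it assumes AttrAt is callable from user code the same way the infer-spmd argument-mapping helpers call it).

// Illustrative sketch (not part of the commit): an attribute emplaced as an
// int can still be read back as a bool via the AttrAt<bool> specialization,
// instead of raising a bad_variant_access error.
#include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h"

bool ReadBoolFlagExample() {
  phi::distributed::InferSpmdContext ctx;
  ctx.EmplaceBackAttr(1);       // stored in the attribute variant as int
  return ctx.AttrAt<bool>(0);   // coerced to true by the specialization
}
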