[AutoParallel] Adapt static spmd rules for dynamic graph (#56367)
* move matmul spmd rules into phi

* add basic infer spmd utils

* add spmd factory

* fix compile error

* add unittest

* refine infer spmd test and utils

* debug infer spmd test

* adapt python test

* polish details

* change to vector attr arg

* revert needless change

* update matmul spmd rule test

* remove original rule

* polish details

* fix macro error

* add comment

* pass backward test

* fix compile error

* add cmake rule for spmd_rules_test

* add dist meta tensor

* update pybind impl

* add macro for rules
chenwhql authored Aug 31, 2023
1 parent dbc9e5a commit 54fcd9a
Showing 21 changed files with 1,022 additions and 189 deletions.
4 changes: 0 additions & 4 deletions paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h
@@ -19,7 +19,6 @@
#include "paddle/fluid/distributed/auto_parallel/spmd_rules/elementwise_spmd_rule.h"
#include "paddle/fluid/distributed/auto_parallel/spmd_rules/embedding_spmd_rule.h"
#include "paddle/fluid/distributed/auto_parallel/spmd_rules/layer_norm_spmd_rule.h"
#include "paddle/fluid/distributed/auto_parallel/spmd_rules/matmul_spmd_rule.h"
#include "paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.h"
#include "paddle/fluid/distributed/auto_parallel/spmd_rules/replicated_spmd_rule.h"
#include "paddle/fluid/distributed/auto_parallel/spmd_rules/reshape_spmd_rule.h"
@@ -32,9 +31,6 @@ namespace paddle {
namespace distributed {
namespace auto_parallel {

// matmul rule
REGISTER_SPMD_RULE(matmul, MatmulSPMDRule);

// reduction rules
REGISTER_SPMD_RULE(all, ReductionSPMDRule);
REGISTER_SPMD_RULE(amax, ReductionSPMDRule);
51 changes: 51 additions & 0 deletions paddle/fluid/pybind/auto_parallel_py.cc
@@ -15,13 +15,16 @@
#include <pybind11/operators.h>
#include <pybind11/stl.h>

#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/pybind/auto_parallel_py.h"
#include "paddle/fluid/pybind/pybind_variant_caster.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/distributed/auto_parallel/device_mesh.h"
#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
#include "paddle/phi/core/distributed/auto_parallel/dist_mapper.h"
#include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h"
#include "paddle/phi/core/distributed/auto_parallel/process_mesh.h"
#include "paddle/utils/optional.h"
#include "paddle/utils/pybind.h"
@@ -32,6 +35,10 @@
#include "paddle/phi/core/distributed/auto_parallel/r_to_s_reshard_function.h"
#include "paddle/phi/core/distributed/auto_parallel/s_to_r_reshard_function.h"

#ifdef PADDLE_WITH_DISTRIBUTE
#include "paddle/phi/infermeta/spmd_rules/rules.h"
#endif

namespace py = pybind11;

namespace paddle {
@@ -42,6 +49,7 @@ using paddle::distributed::auto_parallel::kDefault;
using paddle::distributed::auto_parallel::OperatorDistAttr;
using paddle::distributed::auto_parallel::SPMDRuleBase;
using paddle::distributed::auto_parallel::SPMDRuleMap;
using paddle::framework::BlockDesc;
using paddle::framework::OpDesc;
using paddle::framework::VarDesc;
using phi::distributed::ProcessMesh;
@@ -343,6 +351,41 @@ void BindAutoParallel(py::module *m) {
&SPMDRuleBase::InferBackward));
// .def("infer_backward", &SPMDRuleBase::InferBackward) [revert in future]

py::class_<phi::distributed::SpmdRule>(*m, "SpmdRule")
.def("infer_forward",
[](const phi::distributed::SpmdRule &self,
const std::vector<DistTensorSpec> &input_specs,
const std::vector<phi::Attribute> &attrs) {
phi::distributed::InferSpmdContext ctx;
for (auto &spec : input_specs) {
ctx.EmplaceBackInput(phi::distributed::DistMetaTensor(
phi::make_ddim(spec.shape()), spec.dist_attr()));
}
for (auto &attr : attrs) {
ctx.EmplaceBackAttr(attr);
}
return self.InferForward(ctx);
})
.def("infer_backward",
[](const phi::distributed::SpmdRule &self,
const std::vector<DistTensorSpec> &input_specs,
const std::vector<DistTensorSpec> &output_specs,
const std::vector<phi::Attribute> &attrs) {
phi::distributed::InferSpmdContext ctx;
for (auto &spec : input_specs) {
ctx.EmplaceBackInput(phi::distributed::DistMetaTensor(
phi::make_ddim(spec.shape()), spec.dist_attr()));
}
for (auto &spec : output_specs) {
ctx.EmplaceBackInput(phi::distributed::DistMetaTensor(
phi::make_ddim(spec.shape()), spec.dist_attr()));
}
for (auto &attr : attrs) {
ctx.EmplaceBackAttr(attr);
}
return self.InferBackward(ctx);
});

py::class_<DistTensorSpec>(*m, "DistTensorSpec")
.def(py::init<>())
.def(py::init<const DistTensorSpec &>())
@@ -472,6 +515,14 @@ void BindAutoParallel(py::module *m) {
},
py::return_value_policy::reference);

m->def(
"get_phi_spmd_rule",
[](const std::string op_type) {
return phi::distributed::SpmdRuleFactory::Instance().GetSpmdRule(
op_type);
},
py::return_value_policy::reference);

// TODO(liuzhenhai): DistributedMapper is not used for now, but
// dist_mapper_test needs the symbols for DistributedMapper to be linked;
// remove it later
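
For context, a minimal C++ sketch of the path the new bindings wrap (not part of the commit): look up the phi-side rule via SpmdRuleFactory, build an InferSpmdContext from DistMetaTensor inputs and attributes, and run forward derivation. It assumes a build with PADDLE_WITH_DISTRIBUTE and that a rule is registered under the name "matmul"; the mesh layout, shapes, and attribute values are illustrative only.

// Illustrative usage sketch (not part of the commit).
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h"
#include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h"
#include "paddle/phi/core/distributed/auto_parallel/process_mesh.h"

void InferMatmulSpmdExample() {
  using namespace phi::distributed;

  // A 2x2 process mesh with axes "x" and "y".
  ProcessMesh mesh({2, 2}, {0, 1, 2, 3}, {"x", "y"});

  // x: [64, 32], dim 0 sharded along mesh axis 0; y: [32, 48], replicated.
  TensorDistAttr x_attr;
  x_attr.set_process_mesh(mesh);
  x_attr.set_dims_mapping({0, -1});
  TensorDistAttr y_attr;
  y_attr.set_process_mesh(mesh);
  y_attr.set_dims_mapping({-1, -1});

  // Build the context the same way the pybind lambdas above do.
  InferSpmdContext ctx;
  ctx.EmplaceBackInput(DistMetaTensor(phi::make_ddim({64, 32}), x_attr));
  ctx.EmplaceBackInput(DistMetaTensor(phi::make_ddim({32, 48}), y_attr));
  ctx.EmplaceBackAttr(false);  // trans_x
  ctx.EmplaceBackAttr(false);  // trans_y

  // Look up the phi-side rule and run forward derivation; the result holds
  // the inferred dist_attrs for inputs and outputs.
  const auto& rule = SpmdRuleFactory::Instance().GetSpmdRule("matmul");
  auto spmd_info = rule.InferForward(ctx);
  (void)spmd_info;
}
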
2 changes: 2 additions & 0 deletions paddle/phi/core/distributed/auto_parallel/CMakeLists.txt
@@ -9,6 +9,8 @@ collect_srcs(
dist_mapper.cc
reshard_utils.cc
dist_tensor.cc
dist_meta_tensor.cc
inferspmd_utils.cc
reshard_function.cc
reshard_split_functor.cc
reshard_concat_functor.cc
51 changes: 51 additions & 0 deletions paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.cc
@@ -0,0 +1,51 @@
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h"

#include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"

namespace phi {
namespace distributed {

phi::DDim DistMetaTensor::dims() const {
// member values in tensor_ have higher priority than those in DistMetaTensor
if (tensor_ != nullptr) {
PADDLE_ENFORCE_EQ(this->is_dist(),
true,
phi::errors::InvalidArgument(
"The current MetaTensor doesn't contain a "
"DistTensor when calling the `dims` method."));
return MetaTensor::dims();
} else {
return dims_;
}
}

const distributed::TensorDistAttr& DistMetaTensor::dist_attr() const {
// member values in tensor_ have higher priority than those in DistMetaTensor
if (tensor_ != nullptr) {
PADDLE_ENFORCE_EQ(this->is_dist(),
true,
phi::errors::InvalidArgument(
"The current MetaTensor doesn't contain a "
"DistTensor when calling the `dist_attr` method."));
return static_cast<phi::distributed::DistTensor*>(tensor_)->dist_attr();
} else {
return dist_attr_;
}
}

} // namespace distributed
} // namespace phi
68 changes: 68 additions & 0 deletions paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h
@@ -0,0 +1,68 @@
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
#include "paddle/phi/core/meta_tensor.h"

namespace phi {
namespace distributed {

class DistMetaTensor : public MetaTensor {
public:
// implicit construction is supported to make the class easier to use
DistMetaTensor(TensorBase* tensor) // NOLINT
: MetaTensor(tensor) {}
DistMetaTensor(const TensorBase& tensor) // NOLINT
: MetaTensor(tensor) {}
DistMetaTensor(const TensorBase* tensor) // NOLINT
: MetaTensor(tensor) {}
DistMetaTensor(TensorBase& tensor) // NOLINT
: MetaTensor(tensor) {}
// For static mode only
DistMetaTensor(const phi::DDim& dims, const TensorDistAttr& dist_attr)
: dims_(dims), dist_attr_(dist_attr) {}

DistMetaTensor(DistMetaTensor&&) = default;
DistMetaTensor& operator=(DistMetaTensor&&) = default;
DistMetaTensor(const DistMetaTensor&) = default;
DistMetaTensor& operator=(const DistMetaTensor&) = default;

virtual ~DistMetaTensor() = default;

DDim dims() const override;

const distributed::TensorDistAttr& dist_attr() const;

private:
/**
* Note: When the static graph's semi-auto parallel SPMD derivation rules are
* reused, DistMetaTensor inherits from MetaTensor to make packing the rules'
* input arguments convenient, and the members dims_ and dist_attr_ are added.
*
* The information in these two members also exists in the tensor_ held by
* the base MetaTensor, so there is some redundancy; callers must take care
* to keep the two sources consistent.
*
* These two members are read-only: their values cannot be changed after
* construction. To change them, set them directly on tensor_. */
phi::DDim dims_;
TensorDistAttr dist_attr_;
};

} // namespace distributed
} // namespace phi
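
To make the note above concrete, a short sketch of the static-mode path (not part of the commit): when a DistMetaTensor is built from an explicit shape and TensorDistAttr, no TensorBase is attached, so dims() and dist_attr() fall back to the stored members. Shapes and attribute values are illustrative.

// Illustrative sketch (not part of the commit): static-mode construction,
// where tensor_ is null and the redundant dims_/dist_attr_ members are read.
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h"

void StaticModeDistMetaTensorExample() {
  phi::distributed::TensorDistAttr attr;  // default dist attr, no sharding set
  phi::distributed::DistMetaTensor meta(phi::make_ddim({8, 16}), attr);

  // No wrapped DistTensor here, so both accessors return the stored members.
  phi::DDim dims = meta.dims();                                  // [8, 16]
  const phi::distributed::TensorDistAttr& a = meta.dist_attr();  // == attr
  (void)dims;
  (void)a;
}
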
97 changes: 97 additions & 0 deletions paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc
@@ -0,0 +1,97 @@
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h"

namespace phi {
namespace distributed {

void InferSpmdContext::EmplaceBackInput(DistMetaTensor input) {
inputs_.emplace_back(std::move(input));
}

void InferSpmdContext::EmplaceBackAttr(Attribute attr) {
attrs_.emplace_back(std::move(attr));
}

const DistMetaTensor& InferSpmdContext::InputAt(size_t idx) const {
return inputs_.at(idx);
}

template <typename AttrType>
AttrType InferSpmdContext::AttrAt(size_t idx) const {
try {
return paddle::get<AttrType>(attrs_.at(idx));
} catch (paddle::bad_variant_access const& e) {
PADDLE_THROW(phi::errors::InvalidArgument(
"Attribute cast error in InferSpmd Context, the input attr type is "
"`%s`, but the expected attribute type is `%s`.",
attrs_.at(idx).type().name(),
std::type_index(typeid(AttrType)).name()));
}
}

template <>
bool InferSpmdContext::AttrAt<bool>(size_t idx) const {
try {
auto attr = attrs_.at(idx);
if (attr.type() == typeid(int)) {
return static_cast<bool>(paddle::get<int>(attr));
} else {
return paddle::get<bool>(attr);
}
} catch (paddle::bad_variant_access const& e) {
PADDLE_THROW(phi::errors::InvalidArgument(
"Attribute cast error in InferSpmd Context, the input attr type is "
"`%s`, but the expected attribute type is `bool`.",
attrs_.at(idx).type().name()));
}
}

const Attribute& InferSpmdContext::AttrAt(size_t idx) const {
return attrs_.at(idx);
}

SpmdRuleFactory& SpmdRuleFactory::Instance() {
static SpmdRuleFactory g_spmd_rule_map;
return g_spmd_rule_map;
}

bool SpmdRuleFactory::ContainsSpmdRule(const std::string& kernel_name) const {
return spmd_rule_map_.count(kernel_name) > 0;
}

int SpmdRuleFactory::InsertSpmdRule(std::string kernel_name, SpmdRule rule) {
PADDLE_ENFORCE_NE(
ContainsSpmdRule(kernel_name),
true,
phi::errors::AlreadyExists(
"`%s` Kernel's Spmd rule has already been registered.", kernel_name));
spmd_rule_map_.insert({std::move(kernel_name), std::move(rule)});
return 0;
}

const SpmdRule& SpmdRuleFactory::GetSpmdRule(
const std::string& kernel_name) const {
auto it = spmd_rule_map_.find(kernel_name);
PADDLE_ENFORCE_NE(
it,
spmd_rule_map_.end(),
phi::errors::NotFound("`%s` Kernel's Spmd rule is not registered.",
kernel_name));
return it->second;
}

} // namespace distributed
} // namespace phi
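
The bool specialization of AttrAt above exists because flags forwarded from the Python side may arrive stored as int. A small sketch of that coercion follows (not part of the commit; it assumes AttrAt is callable from user code the same way the infer-spmd argument-mapping helpers call it).

// Illustrative sketch (not part of the commit): an attribute emplaced as an
// int can still be read back as a bool via the AttrAt<bool> specialization,
// instead of raising a bad_variant_access error.
#include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h"

bool ReadBoolFlagExample() {
  phi::distributed::InferSpmdContext ctx;
  ctx.EmplaceBackAttr(1);       // stored in the attribute variant as int
  return ctx.AttrAt<bool>(0);   // coerced to true by the specialization
}
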