[AutoParallel] Adapt static spmd rules for dynamic graph #56367

Merged (31 commits, Aug 31, 2023)

Commits (31)
612bb5e
move matmul spmd rules into phi
chenwhql Aug 15, 2023
1974d2b
add basic infer spmd utils
chenwhql Aug 16, 2023
3878288
addspmd factory
chenwhql Aug 16, 2023
63b10fa
merge and resolve conflict with develop
chenwhql Aug 16, 2023
2d12ee2
fix compile error
chenwhql Aug 16, 2023
3d44d8d
add unittest
chenwhql Aug 21, 2023
7bb8a18
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
chenwhql Aug 21, 2023
2166624
refine infer spmd test and utils
chenwhql Aug 22, 2023
7480ecc
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
chenwhql Aug 22, 2023
51cd20e
debug infer spmd test
chenwhql Aug 22, 2023
babba63
adapt python test
chenwhql Aug 23, 2023
1b1b490
poish details
chenwhql Aug 24, 2023
be3249d
change to vector attr arg
chenwhql Aug 25, 2023
4051b5e
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
chenwhql Aug 25, 2023
e105c94
revert needless change
chenwhql Aug 25, 2023
12b5648
update matmul spmd rule test
chenwhql Aug 25, 2023
651f974
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
chenwhql Aug 25, 2023
d7fcfe5
remove original rule
chenwhql Aug 25, 2023
cf08fe5
polish details
chenwhql Aug 25, 2023
8da7b0c
fix marco error
chenwhql Aug 28, 2023
9226857
resolve conflict with develop
chenwhql Aug 28, 2023
006b67b
add comment
chenwhql Aug 28, 2023
25a1557
pass backward test
chenwhql Aug 28, 2023
2072094
resolve conflict with develop
chenwhql Aug 28, 2023
5d55545
fix compile error
chenwhql Aug 28, 2023
b262c03
resolve conflict with develop
chenwhql Aug 28, 2023
0603212
add cmake rule for spmd_rules_test
chenwhql Aug 29, 2023
5259b14
add dist meta tensor
chenwhql Aug 29, 2023
eafd092
update pybind impl
chenwhql Aug 29, 2023
e1cd297
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
chenwhql Aug 29, 2023
d44e43a
add marco for rules
chenwhql Aug 29, 2023
4 changes: 0 additions & 4 deletions paddle/fluid/distributed/auto_parallel/spmd_rules/rules.h
@@ -19,7 +19,6 @@
#include "paddle/fluid/distributed/auto_parallel/spmd_rules/elementwise_spmd_rule.h"
#include "paddle/fluid/distributed/auto_parallel/spmd_rules/embedding_spmd_rule.h"
#include "paddle/fluid/distributed/auto_parallel/spmd_rules/layer_norm_spmd_rule.h"
#include "paddle/fluid/distributed/auto_parallel/spmd_rules/matmul_spmd_rule.h"
#include "paddle/fluid/distributed/auto_parallel/spmd_rules/reduction_spmd_rule.h"
#include "paddle/fluid/distributed/auto_parallel/spmd_rules/replicated_spmd_rule.h"
#include "paddle/fluid/distributed/auto_parallel/spmd_rules/reshape_spmd_rule.h"
@@ -32,9 +31,6 @@ namespace paddle {
namespace distributed {
namespace auto_parallel {

// matmul rule
REGISTER_SPMD_RULE(matmul, MatmulSPMDRule);

// reduction rules
REGISTER_SPMD_RULE(all, ReductionSPMDRule);
REGISTER_SPMD_RULE(amax, ReductionSPMDRule);
48 changes: 48 additions & 0 deletions paddle/fluid/pybind/auto_parallel_py.cc
@@ -15,14 +15,18 @@
#include <pybind11/operators.h>
#include <pybind11/stl.h>

#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/pybind/auto_parallel_py.h"
#include "paddle/fluid/pybind/pybind_variant_caster.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/distributed/auto_parallel/device_mesh.h"
#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
#include "paddle/phi/core/distributed/auto_parallel/dist_mapper.h"
#include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h"
#include "paddle/phi/core/distributed/auto_parallel/process_mesh.h"
#include "paddle/phi/infermeta/spmd_rules/rules.h"
#include "paddle/utils/optional.h"
#include "paddle/utils/pybind.h"

@@ -42,6 +46,7 @@ using paddle::distributed::auto_parallel::kDefault;
using paddle::distributed::auto_parallel::OperatorDistAttr;
using paddle::distributed::auto_parallel::SPMDRuleBase;
using paddle::distributed::auto_parallel::SPMDRuleMap;
using paddle::framework::BlockDesc;
using paddle::framework::OpDesc;
using paddle::framework::VarDesc;
using phi::distributed::ProcessMesh;
@@ -343,6 +348,41 @@ void BindAutoParallel(py::module *m) {
&SPMDRuleBase::InferBackward));
// .def("infer_backward", &SPMDRuleBase::InferBackward) [revert in future]

py::class_<phi::distributed::SpmdRule>(*m, "SpmdRule")
.def("infer_forward",
[](const phi::distributed::SpmdRule &self,
Review comment (Contributor): DistTensorSpec seems to be redundant now. Would it be better to expose the InferSpmdContext and MetaTensor APIs to Python and have static mode build the input ctx directly?

Reply (Contributor Author): Yes, this can be decided according to the needs of the semi-static mode. This PR tries not to change the original test framework any more than necessary.

const std::vector<DistTensorSpec> &input_specs,
const std::vector<phi::Attribute> &attrs) {
phi::distributed::InferSpmdContext ctx;
for (auto &spec : input_specs) {
ctx.EmplaceBackInput(phi::distributed::DistMetaTensor(
phi::make_ddim(spec.shape()), spec.dist_attr()));
}
for (auto &attr : attrs) {
ctx.EmplaceBackAttr(attr);
}
return self.InferForward(ctx);
})
.def("infer_backward",
[](const phi::distributed::SpmdRule &self,
const std::vector<DistTensorSpec> &input_specs,
Review comment (Contributor): infer_backward needs the information of both the input tensors and the output tensors for inference; please refer to the new API:
https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/distributed/auto_parallel/spmd_rules/common.h#L62

Reply (Contributor Author): Done, changed the pybind infer_backward API to this format.

const std::vector<DistTensorSpec> &output_specs,
const std::vector<phi::Attribute> &attrs) {
phi::distributed::InferSpmdContext ctx;
for (auto &spec : input_specs) {
ctx.EmplaceBackInput(phi::distributed::DistMetaTensor(
phi::make_ddim(spec.shape()), spec.dist_attr()));
}
for (auto &spec : output_specs) {
ctx.EmplaceBackInput(phi::distributed::DistMetaTensor(
phi::make_ddim(spec.shape()), spec.dist_attr()));
}
for (auto &attr : attrs) {
ctx.EmplaceBackAttr(attr);
}
return self.InferBackward(ctx);
});

py::class_<DistTensorSpec>(*m, "DistTensorSpec")
.def(py::init<>())
.def(py::init<const DistTensorSpec &>())
@@ -472,6 +512,14 @@ void BindAutoParallel(py::module *m) {
},
py::return_value_policy::reference);

m->def(
"get_phi_spmd_rule",
[](const std::string op_type) {
return phi::distributed::SpmdRuleFactory::Instance().GetSpmdRule(
op_type);
},
py::return_value_policy::reference);

// TODO(liuzhenhai): DistributedMapper is not used for now, but
// dist_mapper_test need the symbols forch DistributedMapper to be linked,
// remove it latter
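As a side note for readers following this diff, the sketch below shows, under stated assumptions, how a C++ caller could drive one of the new phi SPMD rules in the same way the pybind lambdas above do: build an InferSpmdContext from static-mode DistMetaTensor inputs plus attributes, look the rule up in the factory, and call InferForward. The "matmul" rule name, the example shapes, and the two bool attributes (trans_x/trans_y) are illustrative assumptions, not something this diff guarantees.

// Hedged sketch (not part of this PR): driving a phi SPMD rule from C++,
// mirroring the pybind infer_forward lambda above.
#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h"
#include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h"

using phi::distributed::DistMetaTensor;
using phi::distributed::InferSpmdContext;
using phi::distributed::SpmdRuleFactory;

void InferMatmulSpmdSketch(const phi::distributed::TensorDistAttr& x_attr,
                           const phi::distributed::TensorDistAttr& y_attr) {
  InferSpmdContext ctx;
  // Static-mode inputs carry only shape + dist_attr, no allocated tensor.
  ctx.EmplaceBackInput(DistMetaTensor(phi::make_ddim({64, 32}), x_attr));
  ctx.EmplaceBackInput(DistMetaTensor(phi::make_ddim({32, 48}), y_attr));
  ctx.EmplaceBackAttr(false);  // trans_x (assumed attribute)
  ctx.EmplaceBackAttr(false);  // trans_y (assumed attribute)

  // Look the rule up by kernel name and run forward inference.
  const auto& rule = SpmdRuleFactory::Instance().GetSpmdRule("matmul");
  auto inferred = rule.InferForward(ctx);
  (void)inferred;
}

The infer_backward lambda builds the context the same way, with the output specs appended after the inputs.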
2 changes: 2 additions & 0 deletions paddle/phi/core/distributed/auto_parallel/CMakeLists.txt
@@ -9,6 +9,8 @@ collect_srcs(
dist_mapper.cc
reshard_utils.cc
dist_tensor.cc
dist_meta_tensor.cc
inferspmd_utils.cc
reshard_function.cc
reshard_split_functor.cc
reshard_concat_functor.cc
51 changes: 51 additions & 0 deletions paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.cc
@@ -0,0 +1,51 @@
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h"

#include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"

namespace phi {
namespace distributed {

phi::DDim DistMetaTensor::dims() const {
// member values in tensor_ have higher priority than those in DistMetaTensor
if (tensor_ != nullptr) {
PADDLE_ENFORCE_EQ(this->is_dist(),
true,
phi::errors::InvalidArgument(
"The current MetaTensor doesn't contain a "
"DistTensor when calling the `dims` method."));
return MetaTensor::dims();
} else {
return dims_;
}
}

const distributed::TensorDistAttr& DistMetaTensor::dist_attr() const {
// member values in tensor_ have higher priority than those in DistMetaTensor
if (tensor_ != nullptr) {
PADDLE_ENFORCE_EQ(this->is_dist(),
true,
phi::errors::InvalidArgument(
"The current MetaTensor doesn't contain a "
"DistTensor when calling the `dist_attr` method."));
return static_cast<phi::distributed::DistTensor*>(tensor_)->dist_attr();
} else {
return dist_attr_;
}
}

} // namespace distributed
} // namespace phi
68 changes: 68 additions & 0 deletions paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h
@@ -0,0 +1,68 @@
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
#include "paddle/phi/core/meta_tensor.h"

namespace phi {
namespace distributed {

class DistMetaTensor : public MetaTensor {
public:
// Supporting implicit construction makes this class easier to use
DistMetaTensor(TensorBase* tensor) // NOLINT
: MetaTensor(tensor) {}
DistMetaTensor(const TensorBase& tensor) // NOLINT
: MetaTensor(tensor) {}
DistMetaTensor(const TensorBase* tensor) // NOLINT
: MetaTensor(tensor) {}
DistMetaTensor(TensorBase& tensor) // NOLINT
: MetaTensor(tensor) {}
// For static mode only
DistMetaTensor(const phi::DDim& dims, const TensorDistAttr& dist_attr)
: dims_(dims), dist_attr_(dist_attr) {}

DistMetaTensor(DistMetaTensor&&) = default;
DistMetaTensor& operator=(DistMetaTensor&&) = default;
DistMetaTensor(const DistMetaTensor&) = default;
DistMetaTensor& operator=(const DistMetaTensor&) = default;

virtual ~DistMetaTensor() = default;

DDim dims() const override;

const distributed::TensorDistAttr& dist_attr() const;

private:
/**
* Note: For the semi-automatic parallel SPMD derivation rules used by the
* static graph, DistMetaTensor inherits from MetaTensor and adds the members
* dims_ and dist_attr_ so that the rules' input arguments can be packed
* conveniently.
*
* The information carried by these two members also exists in the tensor
* held by the base-class MetaTensor, so there is some redundancy; take care
* to keep them consistent when using this class.
*
* These two members are read-only and cannot be changed after construction.
* To change their values, set them directly on tensor_.
*/
phi::DDim dims_;
TensorDistAttr dist_attr_;
};

} // namespace distributed
} // namespace phi
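To make the note in the class comment concrete, here is a minimal sketch (with made-up shapes and dims mapping) of the two construction paths: the static-mode path fills dims_/dist_attr_ directly, while the dynamic-mode path wraps an existing DistTensor so that dims()/dist_attr() are forwarded to the tensor and take priority.

// Hedged sketch of the two DistMetaTensor construction paths; the shapes,
// dims mapping, and the dist_tensor argument are illustrative only.
#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h"
#include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"

using phi::distributed::DistMetaTensor;
using phi::distributed::TensorDistAttr;

void BuildDistMetaTensorSketch(phi::distributed::DistTensor* dist_tensor) {
  // Static mode: only shape + dist_attr are known, so the added members
  // dims_ and dist_attr_ are filled and later read by dims()/dist_attr().
  TensorDistAttr attr;
  attr.set_dims_mapping({0, -1});  // shard dim 0, replicate dim 1 (example)
  DistMetaTensor static_meta(phi::make_ddim({64, 32}), attr);

  // Dynamic mode: wrap an existing DistTensor; tensor_ is set, so
  // dims()/dist_attr() are forwarded to the tensor and take priority.
  DistMetaTensor dynamic_meta(dist_tensor);

  (void)static_meta;
  (void)dynamic_meta;
}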
97 changes: 97 additions & 0 deletions paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc
@@ -0,0 +1,97 @@
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h"

namespace phi {
namespace distributed {

void InferSpmdContext::EmplaceBackInput(DistMetaTensor input) {
inputs_.emplace_back(std::move(input));
}

void InferSpmdContext::EmplaceBackAttr(Attribute attr) {
attrs_.emplace_back(std::move(attr));
}

const DistMetaTensor& InferSpmdContext::InputAt(size_t idx) const {
return inputs_.at(idx);
}

template <typename AttrType>
AttrType InferSpmdContext::AttrAt(size_t idx) const {
try {
return paddle::get<AttrType>(attrs_.at(idx));
} catch (paddle::bad_variant_access const& e) {
PADDLE_THROW(phi::errors::InvalidArgument(
"Attribute cast error in InferSpmd Context, the input attr type is "
"`%s`, but the expected attribute type is `%s`.",
attrs_.at(idx).type().name(),
std::type_index(typeid(AttrType)).name()));
}
}

template <>
bool InferSpmdContext::AttrAt<bool>(size_t idx) const {
try {
auto attr = attrs_.at(idx);
if (attr.type() == typeid(int)) {
return static_cast<bool>(paddle::get<int>(attr));
} else {
return paddle::get<bool>(attr);
}
} catch (paddle::bad_variant_access const& e) {
PADDLE_THROW(phi::errors::InvalidArgument(
"Attribute cast error in InferSpmd Context, the input attr type is "
"`%s`, but the expected attribute type is `bool`.",
attrs_.at(idx).type().name()));
}
}

const Attribute& InferSpmdContext::AttrAt(size_t idx) const {
return attrs_.at(idx);
}

SpmdRuleFactory& SpmdRuleFactory::Instance() {
static SpmdRuleFactory g_spmd_rule_map;
return g_spmd_rule_map;
}

bool SpmdRuleFactory::ContainsSpmdRule(const std::string& kernel_name) const {
return spmd_rule_map_.count(kernel_name) > 0;
}

int SpmdRuleFactory::InsertSpmdRule(std::string kernel_name, SpmdRule rule) {
PADDLE_ENFORCE_NE(
ContainsSpmdRule(kernel_name),
true,
phi::errors::AlreadyExists(
"`%s` Kernel's Spmd rule has already been registered.", kernel_name));
spmd_rule_map_.insert({std::move(kernel_name), std::move(rule)});
return 0;
}

const SpmdRule& SpmdRuleFactory::GetSpmdRule(
const std::string& kernel_name) const {
auto it = spmd_rule_map_.find(kernel_name);
PADDLE_ENFORCE_NE(
it,
spmd_rule_map_.end(),
phi::errors::NotFound("`%s` Kernel's Spmd rule is not registered.",
kernel_name));
return it->second;
}

} // namespace distributed
} // namespace phi
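One detail worth calling out from the code above is the bool specialization of AttrAt: attributes handed over from Python may be stored as int even where the rule expects a bool, and the specialization coerces them instead of throwing. Below is a minimal sketch of that behavior and of the duplicate-registration guard; it assumes AttrAt is accessible at the call site.

// Hedged sketch of the attribute coercion and the registration guard shown
// above; assumes InferSpmdContext::AttrAt is usable from this call site.
#include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h"

void AttrCoercionSketch() {
  phi::distributed::InferSpmdContext ctx;
  ctx.EmplaceBackAttr(1);      // e.g. a flag that arrived from Python as int
  ctx.EmplaceBackAttr(false);  // a genuine bool attribute

  bool first = ctx.AttrAt<bool>(0);   // int 1 coerced to true
  bool second = ctx.AttrAt<bool>(1);  // returned as-is: false

  // InsertSpmdRule rejects duplicate kernel names with AlreadyExists, so a
  // caller can probe with ContainsSpmdRule before registering.
  bool has_matmul =
      phi::distributed::SpmdRuleFactory::Instance().ContainsSpmdRule("matmul");
  (void)first;
  (void)second;
  (void)has_matmul;
}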