[AutoParallel] Adapt static spmd rules for dynamic graph #56367
paddle/fluid/pybind/auto_parallel_py.cc

@@ -15,14 +15,18 @@
#include <pybind11/operators.h>
#include <pybind11/stl.h>

#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/var_desc.h"
#include "paddle/fluid/pybind/auto_parallel_py.h"
#include "paddle/fluid/pybind/pybind_variant_caster.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/distributed/auto_parallel/device_mesh.h"
#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
#include "paddle/phi/core/distributed/auto_parallel/dist_mapper.h"
#include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h"
#include "paddle/phi/core/distributed/auto_parallel/process_mesh.h"
#include "paddle/phi/infermeta/spmd_rules/rules.h"
#include "paddle/utils/optional.h"
#include "paddle/utils/pybind.h"

@@ -42,6 +46,7 @@ using paddle::distributed::auto_parallel::kDefault;
using paddle::distributed::auto_parallel::OperatorDistAttr;
using paddle::distributed::auto_parallel::SPMDRuleBase;
using paddle::distributed::auto_parallel::SPMDRuleMap;
using paddle::framework::BlockDesc;
using paddle::framework::OpDesc;
using paddle::framework::VarDesc;
using phi::distributed::ProcessMesh;

@@ -343,6 +348,41 @@ void BindAutoParallel(py::module *m) {
              &SPMDRuleBase::InferBackward));
  // .def("infer_backward", &SPMDRuleBase::InferBackward) [revert in future]

  py::class_<phi::distributed::SpmdRule>(*m, "SpmdRule")
      .def("infer_forward",
           [](const phi::distributed::SpmdRule &self,
              const std::vector<DistTensorSpec> &input_specs,
              const std::vector<phi::Attribute> &attrs) {
             phi::distributed::InferSpmdContext ctx;
             for (auto &spec : input_specs) {
               ctx.EmplaceBackInput(phi::distributed::DistMetaTensor(
                   phi::make_ddim(spec.shape()), spec.dist_attr()));
             }
             for (auto &attr : attrs) {
               ctx.EmplaceBackAttr(attr);
             }
             return self.InferForward(ctx);
           })
      .def("infer_backward",
           [](const phi::distributed::SpmdRule &self,
              const std::vector<DistTensorSpec> &input_specs,
              const std::vector<DistTensorSpec> &output_specs,
              const std::vector<phi::Attribute> &attrs) {
             phi::distributed::InferSpmdContext ctx;
             for (auto &spec : input_specs) {
               ctx.EmplaceBackInput(phi::distributed::DistMetaTensor(
                   phi::make_ddim(spec.shape()), spec.dist_attr()));
             }
             for (auto &spec : output_specs) {
               ctx.EmplaceBackInput(phi::distributed::DistMetaTensor(
                   phi::make_ddim(spec.shape()), spec.dist_attr()));
             }
             for (auto &attr : attrs) {
               ctx.EmplaceBackAttr(attr);
             }
             return self.InferBackward(ctx);
           });

[Review comment] infer_backward needs the info of both the input and output tensors for inference; please refer to the new API:
[Reply] Done, changed the pybind binding accordingly.

  py::class_<DistTensorSpec>(*m, "DistTensorSpec")
      .def(py::init<>())
      .def(py::init<const DistTensorSpec &>())

@@ -472,6 +512,14 @@ void BindAutoParallel(py::module *m) {
      },
      py::return_value_policy::reference);

  m->def(
      "get_phi_spmd_rule",
      [](const std::string op_type) {
        return phi::distributed::SpmdRuleFactory::Instance().GetSpmdRule(
            op_type);
      },
      py::return_value_policy::reference);

  // TODO(liuzhenhai): DistributedMapper is not used for now, but
  // dist_mapper_test needs the symbols for DistributedMapper to be linked;
  // remove it later.

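Taken together, the bindings above let Python fetch a phi SPMD rule by op name and run it over an InferSpmdContext. Below is a minimal hedged sketch of the same flow driven directly from C++; the "matmul" rule name, the shapes, the trans_x/trans_y attributes, and the InferMatmulSpmdExample wrapper are illustrative assumptions, not part of this diff.

// Hedged sketch (not part of this PR): the C++ equivalent of calling
// core.get_phi_spmd_rule("matmul").infer_forward(...) from Python.
// Assumes a "matmul" SPMD rule is registered and takes two bool
// attributes (trans_x, trans_y); names and shapes are illustrative.
#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h"
#include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h"

using phi::distributed::DistMetaTensor;
using phi::distributed::InferSpmdContext;
using phi::distributed::SpmdRuleFactory;

void InferMatmulSpmdExample(const phi::distributed::TensorDistAttr& x_attr,
                            const phi::distributed::TensorDistAttr& y_attr) {
  // Look up the rule by kernel name; raises NotFound if unregistered.
  const auto& rule = SpmdRuleFactory::Instance().GetSpmdRule("matmul");

  // Build the inference context the same way the pybind lambda does:
  // inputs first, then attributes, in the rule's expected order.
  InferSpmdContext ctx;
  ctx.EmplaceBackInput(DistMetaTensor(phi::make_ddim({64, 32}), x_attr));
  ctx.EmplaceBackInput(DistMetaTensor(phi::make_ddim({32, 48}), y_attr));
  ctx.EmplaceBackAttr(false);  // trans_x (illustrative)
  ctx.EmplaceBackAttr(false);  // trans_y (illustrative)

  // Runs the rule and yields the inferred input/output dist attributes.
  auto inferred = rule.InferForward(ctx);
  (void)inferred;
}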
paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.cc

@@ -0,0 +1,51 @@
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h" | ||
|
||
#include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h" | ||
|
||
namespace phi { | ||
namespace distributed { | ||
|
||
phi::DDim DistMetaTensor::dims() const {
  // member values in tensor_ have higher priority than those in DistMetaTensor
  if (tensor_ != nullptr) {
    PADDLE_ENFORCE_EQ(this->is_dist(),
                      true,
                      phi::errors::InvalidArgument(
                          "The current MetaTensor doesn't contain a "
                          "DistTensor when calling the `dims` method."));
    return MetaTensor::dims();
  } else {
    return dims_;
  }
}

const distributed::TensorDistAttr& DistMetaTensor::dist_attr() const {
  // member values in tensor_ have higher priority than those in DistMetaTensor
  if (tensor_ != nullptr) {
    PADDLE_ENFORCE_EQ(this->is_dist(),
                      true,
                      phi::errors::InvalidArgument(
                          "The current MetaTensor doesn't contain a "
                          "DistTensor when calling the `dist_attr` method."));
    return static_cast<phi::distributed::DistTensor*>(tensor_)->dist_attr();
  } else {
    return dist_attr_;
  }
}

}  // namespace distributed
}  // namespace phi
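Both getters rely on the same is_dist() guard: a wrapped tensor takes priority, but it must actually be a DistTensor. A small hedged sketch of that behavior; the GuardExample function and its parameters are hypothetical, assuming DistTensor and DenseTensor both derive from TensorBase as implied by the constructors in the header.

// Hedged sketch (not in this PR): how the is_dist() guard behaves.
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h"
#include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"

using phi::distributed::DistMetaTensor;

void GuardExample(const phi::DenseTensor& dense,
                  const phi::distributed::DistTensor& dist) {
  // Wrapping a DistTensor: tensor_ is set and is_dist() holds, so
  // dist_attr() forwards to the wrapped tensor's TensorDistAttr.
  DistMetaTensor ok(dist);
  const auto& attr = ok.dist_attr();
  (void)attr;

  // Wrapping a plain DenseTensor: tensor_ is set but is_dist() is
  // false, so the PADDLE_ENFORCE_EQ above raises InvalidArgument.
  DistMetaTensor bad(dense);
  // bad.dist_attr();  // would throw
}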
paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h

@@ -0,0 +1,68 @@
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
#include "paddle/phi/core/meta_tensor.h"

namespace phi {
namespace distributed {

class DistMetaTensor : public MetaTensor {
 public:
  // support implicit construction for ease of use
  DistMetaTensor(TensorBase* tensor)  // NOLINT
      : MetaTensor(tensor) {}
  DistMetaTensor(const TensorBase& tensor)  // NOLINT
      : MetaTensor(tensor) {}
  DistMetaTensor(const TensorBase* tensor)  // NOLINT
      : MetaTensor(tensor) {}
  DistMetaTensor(TensorBase& tensor)  // NOLINT
      : MetaTensor(tensor) {}
  // for static mode only
  DistMetaTensor(const phi::DDim& dims, const TensorDistAttr& dist_attr)
      : dims_(dims), dist_attr_(dist_attr) {}

  DistMetaTensor(DistMetaTensor&&) = default;
  DistMetaTensor& operator=(DistMetaTensor&&) = default;
  DistMetaTensor(const DistMetaTensor&) = default;
  DistMetaTensor& operator=(const DistMetaTensor&) = default;

  virtual ~DistMetaTensor() = default;

  DDim dims() const override;

  const distributed::TensorDistAttr& dist_attr() const;

 private:
  /**
   * Note: To reuse the static graph's semi-auto parallel SPMD derivation
   * rules, DistMetaTensor inherits from MetaTensor so the rules' input
   * arguments can be packaged conveniently; the members dims_ and
   * dist_attr_ are added for that purpose.
   *
   * The information in these two members also exists in the tensor_ held
   * by the base class, so it is redundant, and callers must take care to
   * keep the two consistent. Both members are read-only: their values
   * cannot be changed after construction. To change them, set them
   * directly on tensor_.
   */
  phi::DDim dims_;
  TensorDistAttr dist_attr_;
};

}  // namespace distributed
}  // namespace phi
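A minimal sketch of the two construction paths this class supports, under the redundancy note above; the ConstructionPathsExample name, the shapes, and the default-constructed TensorDistAttr are illustrative assumptions.

// Hedged sketch (not in this PR): the two DistMetaTensor construction paths.
#include "paddle/phi/core/distributed/auto_parallel/dist_meta_tensor.h"
#include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"

using phi::distributed::DistMetaTensor;
using phi::distributed::TensorDistAttr;

void ConstructionPathsExample(
    const phi::distributed::DistTensor& dist_tensor) {
  // Dynamic-graph path: wrap an existing DistTensor; dims() and
  // dist_attr() read from the wrapped tensor (tensor_ != nullptr).
  DistMetaTensor from_tensor(dist_tensor);
  auto dynamic_dims = from_tensor.dims();

  // Static-mode path: no underlying tensor; dims() and dist_attr()
  // fall back to the standalone dims_ / dist_attr_ members.
  TensorDistAttr replicated;  // default attributes, illustrative
  DistMetaTensor from_spec(phi::make_ddim({8, 16}), replicated);
  auto static_dims = from_spec.dims();

  (void)dynamic_dims;
  (void)static_dims;
}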
paddle/phi/core/distributed/auto_parallel/inferspmd_utils.cc

@@ -0,0 +1,97 @@
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/phi/core/distributed/auto_parallel/inferspmd_utils.h" | ||
|
||
namespace phi { | ||
namespace distributed { | ||
|
||
void InferSpmdContext::EmplaceBackInput(DistMetaTensor input) {
  inputs_.emplace_back(std::move(input));
}

void InferSpmdContext::EmplaceBackAttr(Attribute attr) {
  attrs_.emplace_back(std::move(attr));
}

const DistMetaTensor& InferSpmdContext::InputAt(size_t idx) const {
  return inputs_.at(idx);
}

template <typename AttrType>
AttrType InferSpmdContext::AttrAt(size_t idx) const {
  try {
    return paddle::get<AttrType>(attrs_.at(idx));
  } catch (paddle::bad_variant_access const& e) {
    PADDLE_THROW(phi::errors::InvalidArgument(
        "Attribute cast error in InferSpmd Context, the input attr type is "
        "`%s`, but the expected attribute type is `%s`.",
        attrs_.at(idx).type().name(),
        std::type_index(typeid(AttrType)).name()));
  }
}

template <>
bool InferSpmdContext::AttrAt<bool>(size_t idx) const {
  try {
    auto attr = attrs_.at(idx);
    if (attr.type() == typeid(int)) {
      return static_cast<bool>(paddle::get<int>(attr));
    } else {
      return paddle::get<bool>(attr);
    }
  } catch (paddle::bad_variant_access const& e) {
    PADDLE_THROW(phi::errors::InvalidArgument(
        "Attribute cast error in InferSpmd Context, the input attr type is "
        "`%s`, but the expected attribute type is `bool`.",
        attrs_.at(idx).type().name()));
  }
}

const Attribute& InferSpmdContext::AttrAt(size_t idx) const {
  return attrs_.at(idx);
}

SpmdRuleFactory& SpmdRuleFactory::Instance() {
  static SpmdRuleFactory g_spmd_rule_map;
  return g_spmd_rule_map;
}

bool SpmdRuleFactory::ContainsSpmdRule(const std::string& kernel_name) const {
  return spmd_rule_map_.count(kernel_name) > 0;
}

int SpmdRuleFactory::InsertSpmdRule(std::string kernel_name, SpmdRule rule) {
  PADDLE_ENFORCE_NE(
      ContainsSpmdRule(kernel_name),
      true,
      phi::errors::AlreadyExists(
          "`%s` Kernel's Spmd rule has already been registered.",
          kernel_name));
  spmd_rule_map_.insert({std::move(kernel_name), std::move(rule)});
  return 0;
}

const SpmdRule& SpmdRuleFactory::GetSpmdRule(
    const std::string& kernel_name) const {
  auto it = spmd_rule_map_.find(kernel_name);
  PADDLE_ENFORCE_NE(
      it,
      spmd_rule_map_.end(),
      phi::errors::NotFound("`%s` Kernel's Spmd rule is not registered.",
                            kernel_name));
  return it->second;
}

}  // namespace distributed
}  // namespace phi
[Review comment] DistTensorSpec seems to be redundant now. Would it be better to expose the InferSpmdContext and MetaTensor APIs to Python and have static mode build the input ctx directly?
[Reply] Yes, that can be decided according to the needs of semi-static mode. This PR tries to change the original test framework as little as possible.