
[Auto parallel] Make sure the id semantics of every var and op unique #38132

Merged · 4 commits merged on Dec 30, 2021
Changes from all commits
14 changes: 12 additions & 2 deletions paddle/fluid/framework/ir/node.h
@@ -122,6 +122,10 @@ class Node {
// Please don't use this API!
int id() const { return id_; }

// Only use this for auto parallel.
// A node does not have an original desc if the returned id is zero.
uint64_t OriginalDescId() const { return original_desc_id_; }

bool IsOp() const { return type_ == Type::kOperation; }
bool IsVar() const { return type_ == Type::kVariable; }
bool IsCtrlVar() const {
@@ -239,6 +243,10 @@ class Node {
int desc_order_;
int block_id_{-1};

// Store the original id of var desc or op desc.
// Only use this for auto parallel.
uint64_t original_desc_id_{0};
Contributor

@chenwhql chenwhql Dec 30, 2021


If this member and the related constructors are explicitly intended only for auto parallel, could they be implemented via inheritance instead? Putting members that serve a single feature into the base class is not great design. Put another way: will more members like this be added as auto parallel development matures, and how many?

This probably has to be judged by your actual situation. If it really must go into the global base class, I suggest encoding the feature in the name, e.g. auto_parallel_id_, to make clear that this member is intended for a specific scenario, so that people who have not read the comment are not confused and do not misuse it.

Contributor Author


Auto parallel currently works on the Python side and annotates the program in a non-intrusive way, so it reuses VarDesc and OpDesc directly; id and original_id should be the only things we need. If new ones have to be added, we should indeed consider inheritance or splitting the class. Also, once things stabilize in the future, these ids may no longer be needed; this is only an intermediate state.
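For reference, a hypothetical sketch of the non-intrusive annotation approach described above: distributed attributes are kept in a side table keyed by the unique desc id, so the program itself is never modified. The class below is illustrative only; the real context object in python/paddle/distributed/auto_parallel is more involved.

```python
# Hypothetical sketch only, not the actual auto_parallel implementation.
class SketchDistributedContext:
    """Stores distributed attributes in a side table keyed by desc id,
    leaving the Program untouched (non-intrusive annotation)."""

    def __init__(self):
        self._op_dist_attrs = {}

    def set_op_dist_attr_for_program(self, op, dist_attr):
        # Key by the unique OpDesc id instead of mutating the op itself.
        self._op_dist_attrs[op.desc.id()] = dist_attr

    def get_op_dist_attr_for_program(self, op):
        return self._op_dist_attrs.get(op.desc.id())
```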


private:
// ID can only set by a Graph.
void SetId(int id) { id_ = id; }
@@ -267,14 +275,16 @@
op_desc_(nullptr),
type_(Type::kVariable),
desc_order_(NO_DESC_ORDER),
block_id_(block_id) {}
block_id_(block_id),
original_desc_id_(var_desc->OriginalId()) {}

explicit Node(OpDesc* op_desc)
: name_(op_desc->Type()),
var_desc_(nullptr),
op_desc_(new OpDesc(*op_desc, op_desc->Block())),
type_(Type::kOperation),
desc_order_(NO_DESC_ORDER) {}
desc_order_(NO_DESC_ORDER),
original_desc_id_(op_desc->OriginalId()) {}

Node() = delete;

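As a usage note, here is a minimal sketch (not code from this PR) of how the new original_desc_id() binding could be used on the Python side to map IR graph nodes back to the descs they were created from; `graph` and `program` are assumed to be an IR graph and the Program it was built from.

```python
# Minimal sketch, assuming `graph` was constructed from `program`.
# It relies only on the original_desc_id()/original_id() bindings added
# in this PR; the helper itself is illustrative, not part of the PR.
def map_nodes_to_original_descs(graph, program):
    descs_by_original_id = {}
    for block in program.blocks:
        for op in block.ops:
            descs_by_original_id[op.desc.original_id()] = op.desc
        for var in block.vars.values():
            descs_by_original_id[var.desc.original_id()] = var.desc

    node_to_desc = {}
    for node in graph.nodes():
        original_id = node.original_desc_id()
        # Zero means the node was not created from a var/op desc.
        if original_id != 0 and original_id in descs_by_original_id:
            node_to_desc[node] = descs_by_original_id[original_id]
    return node_to_desc
```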
10 changes: 2 additions & 8 deletions paddle/fluid/framework/op_desc.cc
@@ -352,15 +352,9 @@ void OpDesc::CopyFrom(const OpDesc &op_desc) {
inputs_ = op_desc.inputs_;
outputs_ = op_desc.outputs_;
attrs_ = op_desc.attrs_;
// The record of original_id_ is only for auto parallel.
original_id_ = op_desc.original_id_;
need_update_ = true;
// When creating graph from program, the creation of op node will create a new
// OpDesc instead of
// referring to the original one. To find the original OpDesc of the op node,
// the id have to be
// copied to the new OpDesc. The var node has the same situation, but the
// default copy constructor
// can copy the id automatically.
id_ = op_desc.id_;
}

OpDesc::OpDesc(const proto::OpDesc &desc, BlockDesc *block)
27 changes: 17 additions & 10 deletions paddle/fluid/framework/op_desc.h
@@ -154,17 +154,10 @@ class OpDesc {

const BlockDesc *Block() const { return this->block_; }

// This thread-safe implementation seems to be redudent since the neural
// networks
// are usually constructed in a single thread
static uint64_t GenerateId() {
static std::atomic<std::uint64_t> id{0};
return ++id;
}

// Note: the identity only used as a key for referring to its
// distributed attribute now.
// The Id() and OriginalId() are only used for auto parallel.
uint64_t Id() const { return id_; }
uint64_t OriginalId() const { return original_id_; }
void SetOriginalId(uint64_t original_id) { original_id_ = original_id; }

private:
template <typename MapType>
@@ -177,6 +170,14 @@
return ret_val;
}

// This thread-safe implementation seems to be redundant since the neural
// networks are usually constructed in a single thread
static uint64_t GenerateId() {
static std::atomic<std::uint64_t> uid{0};
// Must start from one
return ++uid;
}

proto::OpDesc desc_;
BlockDesc *block_{nullptr}; // not_own
// input arg name => input variable names
@@ -189,7 +190,13 @@
// local changes should be synchronized, need_update_ should be set to true.
bool need_update_{false};

// Note: the id_ is unique (only for auto parallel).
uint64_t id_ = GenerateId();
// Note: the original_id_ is used for referring to the original OpDesc
// that the current OpDesc is built from (only for auto parallel).
// The default original_id_ is the same as the id_, which means the
// current OpDesc is not built from another one.
uint64_t original_id_ = id_;
};
} // namespace framework
} // namespace paddle
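To make the id / original_id semantics above concrete, a hypothetical Python snippet using the bindings exposed further below; the desc names are placeholders, not an actual Paddle API flow.

```python
# Hypothetical illustration of the id/original_id semantics.
# `src_desc` is an op desc created directly in a Program; `copied_desc`
# is a copy of it produced by some pass.
def link_copy_to_source(src_desc, copied_desc):
    # A desc that was not built from another one defaults to
    # original_id == id.
    assert src_desc.original_id() == src_desc.id()
    # Record the source desc, so auto parallel can look up the
    # distributed attributes that were annotated on the source.
    copied_desc.set_original_id(src_desc.original_id())
    assert copied_desc.original_id() == src_desc.id()
```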
32 changes: 23 additions & 9 deletions paddle/fluid/framework/var_desc.h
@@ -69,6 +69,12 @@ class VarDesc {

explicit VarDesc(const proto::VarDesc &desc) : desc_(desc) {}

// Explicitly implement the copy constructor for auto parallel
VarDesc(const VarDesc &other)
: desc_(other.desc_),
attrs_(other.attrs_),
original_id_(other.original_id_) {}

proto::VarDesc *Proto() { return &desc_; }

const proto::VarDesc *Proto() const { return &desc_; }
@@ -153,26 +159,34 @@

Attribute GetAttr(const std::string &name) const;

// This thread-safe implementation seems to be redudent since the neural
// networks are usually constructed in a single thread.
static uint64_t GenerateId() {
static std::atomic<std::uint64_t> uid{0};
return ++uid;
}

// Note: the identity only used as a key for referring to its
// distributed attribute now.
// The Id() and OriginalId() are only used for auto parallel.
uint64_t Id() const { return id_; }
uint64_t OriginalId() const { return original_id_; }
void SetOriginalId(uint64_t original_id) { original_id_ = original_id; }

private:
const proto::VarType::TensorDesc &tensor_desc() const;
std::vector<proto::VarType::TensorDesc> tensor_descs() const;
proto::VarType::TensorDesc *mutable_tensor_desc();
std::vector<proto::VarType::TensorDesc *> mutable_tensor_descs();

// This thread-safe implementation seems to be redundant since the neural
// networks are usually constructed in a single thread.
static uint64_t GenerateId() {
static std::atomic<std::uint64_t> uid{0};
return ++uid;
}

proto::VarDesc desc_;
AttributeMap attrs_;

// Note: the id_ is unique for all VarDesc (only for auto parallel).
uint64_t id_ = GenerateId();
// Note: the original_id_ is used for referring to the original VarDesc
// that the current VarDesc is built from (only for auto parallel).
// The default original_id_ is the same as the id_, which means the
// current VarDesc is not built from another one.
uint64_t original_id_ = id_;
};

bool operator==(const VarDesc &left, const VarDesc &right);
Expand Down
1 change: 1 addition & 0 deletions paddle/fluid/pybind/ir.cc
@@ -143,6 +143,7 @@ void BindNode(py::module *m) {
.def("var", &Node::Var, return_value_policy::reference)
.def("op", &Node::Op, return_value_policy::reference)
.def("id", &Node::id)
.def("original_desc_id", &Node::OriginalDescId)
.def("is_op", &Node::IsOp)
.def("is_var", &Node::IsVar)
.def("is_ctrl_var", &Node::IsCtrlVar)
4 changes: 4 additions & 0 deletions paddle/fluid/pybind/protobuf.cc
@@ -208,6 +208,8 @@ void BindVarDsec(pybind11::module *m) {
.def("_set_attr", &pd::VarDesc::SetAttr)
.def("remove_attr", &pd::VarDesc::RemoveAttr)
.def("id", &pd::VarDesc::Id)
.def("original_id", &pd::VarDesc::OriginalId)
.def("set_original_id", &pd::VarDesc::SetOriginalId)
.def("attr", &pd::VarDesc::GetAttr);

pybind11::enum_<pd::proto::VarType::Type> vartype(var_desc, "VarType", "");
@@ -305,6 +307,8 @@ void BindOpDesc(pybind11::module *m) {
.def("block", [](pd::OpDesc &self) { return self.Block(); },
pybind11::return_value_policy::reference)
.def("id", &pd::OpDesc::Id)
.def("original_id", &pd::OpDesc::OriginalId)
.def("set_original_id", &pd::OpDesc::SetOriginalId)
.def("inputs", &pd::OpDesc::Inputs)
.def("outputs", &pd::OpDesc::Outputs);
}
8 changes: 4 additions & 4 deletions python/paddle/distributed/auto_parallel/completion.py
@@ -698,13 +698,13 @@ def _get_op_by_id(ops, id):
continue

# complete the annotation of grad op (xxx_grad op or sum op)
# xxx_grad op will have a corresponding forward op in gradopidx2opidx
# xxx_grad op will have a corresponding forward op in grad_op_id_to_op_id
grad_op = ops[idx]
if grad_op.desc.id() in dist_op_context.gradopidx2opidx:
if grad_op.desc.id() in dist_op_context.grad_op_id_to_op_id:
# TODO support the case where one forward op corresponds to multiple xxx_grad ops
forward_op = _get_op_by_id(
ops[:first_backward_op_idx],
dist_op_context.gradopidx2opidx[grad_op.desc.id()])
dist_op_context.grad_op_id_to_op_id[grad_op.desc.id()])
assert forward_op is not None

# op dist attr
@@ -769,7 +769,7 @@ def _get_op_by_id(ops, id):
dist_context.set_op_dist_attr_for_program(grad_op,
grad_op_dist_attr)

# only sum op for merge mutiple version grad has no a corresponding mapping in gradopidx2opidx
# only the sum op for merging multiple versions of grad has no corresponding mapping in grad_op_id_to_op_id
else:
assert grad_op.type == "sum", "got unexpected op [{}]".format(
str(grad_op.type))
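For context, a minimal sketch of the _get_op_by_id helper referenced in this hunk, assuming it simply scans the forward ops for a matching desc id (the actual implementation lives earlier in completion.py):

```python
# Sketch only; the real helper is defined in completion.py.
def _get_op_by_id(ops, id):
    for op in ops:
        if op.desc.id() == id:
            return op
    return None
```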