Support eager grad interface with allow_unused and multi startup_op
veyron95 committed Mar 7, 2022
1 parent 9fc70fe commit ba8d79e
Showing 10 changed files with 114 additions and 26 deletions.
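
For context, the user-facing behavior this commit wires up is the one exercised by the new test at the bottom of this diff: the eager-mode grad interface now accepts allow_unused and tolerates inputs that do not appear in the backward graph. A condensed, stand-alone sketch of that scenario (a usage sketch, assuming a Paddle build where the eager guard is available; not part of the commit itself):

import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.framework import _test_eager_guard

with _test_eager_guard():
    x = paddle.to_tensor(np.random.random((3, 3)), dtype="float64", stop_gradient=False)
    y = paddle.to_tensor(np.random.random((3, 1)), dtype="float64", stop_gradient=False)
    z = paddle.to_tensor(np.random.random((3, 1)), dtype="float64", stop_gradient=False)
    out = paddle.matmul(x, y)  # z never feeds into out

    # With allow_unused=True, the unused input z yields an empty/None grad
    # instead of raising.
    dx, dz = fluid.dygraph.grad(out, [x, z], allow_unused=True)

    # Without allow_unused, asking for z's grad raises an error whose
    # message points the user at allow_unused.
    try:
        fluid.dygraph.grad(out, [x, z])
    except ValueError as e:
        print(e)
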
4 changes: 2 additions & 2 deletions paddle/fluid/eager/accumulation/accumulation_node.cc
@@ -39,8 +39,8 @@ static void CopyOrAddTensor(paddle::experimental::Tensor* tensor,
}

std::vector<std::vector<paddle::experimental::Tensor>> GradNodeAccumulation::
operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads) {
operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
const bool create_graph) {
VLOG(3) << "Running Eager Backward Node: GradNodeAccumulation";
PADDLE_ENFORCE(grads.size() == 1,
paddle::platform::errors::Fatal(
4 changes: 2 additions & 2 deletions paddle/fluid/eager/accumulation/accumulation_node.h
@@ -35,8 +35,8 @@ class GradNodeAccumulation : public GradNodeBase {

// Functor: perform backward computations
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads)
override;
const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
const bool create_graph = false) override;

std::string name() { return "GradNodeAccumulation"; }

@@ -145,8 +145,8 @@ void GradNodeScale::SetTensorWrappers_X(
void GradNodeScale::SetAttributes_scale(float scale) { scale_ = scale; }

std::vector<std::vector<paddle::experimental::Tensor>> GradNodeScale::
operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads) {
operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
const bool create_graph) {
// 1. Check Output Size
PADDLE_ENFORCE(
((grads.size() == 1) && (grads[0].size() == 1)),
@@ -39,8 +39,8 @@ class GradNodeScale : public GradNodeBase {

// Functor: perform backward computations
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads)
override;
const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
const bool create_graph = false) override;

void SetTensorWrappers_X(
const std::vector<paddle::experimental::Tensor>& tensors);
7 changes: 5 additions & 2 deletions paddle/fluid/eager/auto_code_generator/eager_generator.cc
@@ -2043,6 +2043,7 @@ static std::string GenerateGradNodeCCContents(
}

const char* BWD_RETURN_TEMPLATE =
" VLOG(1) << \"Run in GradNode, create_graph is: \" << create_graph; \n"
" std::vector<std::vector<paddle::experimental::Tensor>> hooked_grads = "
"GradNode%s::ApplyGradientHooks(grads);\n"
" std::vector<std::vector<paddle::experimental::Tensor>> outputs(%d);\n"
@@ -2056,7 +2057,8 @@
const char* GRAD_FUNCTION_TEMPLATE =
"std::vector<std::vector<paddle::experimental::Tensor>> "
"GradNode%s::operator()(const "
"std::vector<std::vector<paddle::experimental::Tensor>>& grads) {\n%s\n}";
"std::vector<std::vector<paddle::experimental::Tensor>>& grads, const "
"bool create_graph) {\n%s\n}";
std::string grad_function_str = paddle::string::Sprintf(
GRAD_FUNCTION_TEMPLATE, fwd_op_type, generated_grad_function_body);

@@ -2091,7 +2093,8 @@ static std::string GenerateGradNodeHeaderContents(
"\n"
" virtual std::vector<std::vector<paddle::experimental::Tensor>> "
"operator()(const "
"std::vector<std::vector<paddle::experimental::Tensor>>& grads) "
"std::vector<std::vector<paddle::experimental::Tensor>>& grads, const "
"bool create_graph = false) "
"override;\n"
"\n"
" std::string name() override { return \" GradNode%s \"; } \n "
@@ -533,7 +533,7 @@ class {} : public egr::GradNodeBase {{
~{}() override = default;
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads) override;
const std::vector<std::vector<paddle::experimental::Tensor>>& grads, const bool create_graph = false) override;
std::string name() override {{ return \" {} \"; }}
// SetTensorWrapperX, SetTensorWrapperY, ...
{}
@@ -609,7 +609,7 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map,

grad_node_name = GetGradNodeName(fwd_api_name)
FUNCTION_TEMPLATE = """
std::vector<std::vector<paddle::experimental::Tensor>> {}::operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads) {{
std::vector<std::vector<paddle::experimental::Tensor>> {}::operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads, const bool create_graph) {{
// Call grad_api function
auto grad_api_returns = paddle::experimental::{}({});
{}
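
To make the template change above concrete, here is a minimal sketch of how such a format string expands to the new operator() signature (the template name, node name, and body placeholder are hypothetical; the real generator substitutes many more pieces):

NODE_DEFINITION_TEMPLATE = """
std::vector<std::vector<paddle::experimental::Tensor>> {}::operator()(
    const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
    const bool create_graph) {{
{}
}}
"""

print(NODE_DEFINITION_TEMPLATE.format("GradNodeMatmul", "  // generated grad-api call goes here"))
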
75 changes: 64 additions & 11 deletions paddle/fluid/eager/backward.cc
@@ -34,6 +34,14 @@ std::unordered_map<GradNodeBase*, int> getInDegreeMap(
// pass
std::unordered_map<GradNodeBase*, int> node_in_degree_map;

// init potential startup node's indegree
std::queue<GradNodeBase*> queue_tmp = init_queue;
while (!queue_tmp.empty()) {
GradNodeBase* node = queue_tmp.front();
queue_tmp.pop();
node_in_degree_map[node] = 0;
}

// Copy nodes
std::queue<GradNodeBase*> queue = init_queue;
std::unordered_set<GradNodeBase*> visited;
@@ -164,6 +172,7 @@ void GetGraphInfoBetweenTargets(
}
}
}

UpdateGraphInfo(target_nodes, depending_nodes, potential_stop_nodes);
}

@@ -193,17 +202,33 @@ void GetTargetNodesInfo(const std::vector<paddle::experimental::Tensor>& inputs,

std::vector<paddle::experimental::Tensor> GetResults(
const std::vector<paddle::experimental::Tensor>& inputs,
std::unordered_map<GradNodeBase*, paddle::experimental::Tensor>&
result_map) {
const std::unordered_map<GradNodeBase*, paddle::experimental::Tensor>&
results_map,
bool allow_unused) {
VLOG(1) << "Run in GetResults";
if (inputs.empty()) return {};

std::vector<paddle::experimental::Tensor> results;
results.reserve(inputs.size());
for (auto input : inputs) {
auto results_map_ = results_map;
for (size_t i = 0; i < inputs.size(); ++i) {
auto& input = inputs[i];
AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(input);
auto target_node = auto_grad_meta->GetMutableGradNode().get();
results.emplace_back(result_map[target_node]);

if (results_map_.find(target_node) != results_map_.end()) {
// TODO(wuweilong): set StopGradient
// result_map[target_node].SetOverridedStopGradient(!create_graph_);
results.emplace_back(results_map_[target_node]);
} else {
PADDLE_ENFORCE_EQ(allow_unused, true,
paddle::platform::errors::InvalidArgument(
"The %d-th input does not appear in the backward "
"graph. Please check the input variable or set "
"allow_unused=True to get None result.",
i));
results.emplace_back();
}
}
return results;
}
@@ -220,6 +245,20 @@ std::vector<paddle::experimental::Tensor> RunBackward(
// *Inplace version check should perform at node-level
// *Cross-batch accumulation happens at forward pass

/* --- Preprocess --- */

// TODO(wuweilong): output tensor duplicate check
// TODO(wuweilong): build no_grad_vars_grads according no_grad_vars
// TODO(wuweilong): output tensor' gradient is not in no_grad_vars

// TODO(wuweilong): check input tensor has grad op and stop_gradient = False
// TODO(wuweilong): input tensor duplicate check
// TODO(wuweilong): input tensor' gradient is not in no_grad_vars

// TODO(wuweilong): Prune output_targets which is not the input of startup_ops
// TODO(wuweilong): input == output case
// TODO(wuweilong): output_targets.size() should equal output_grads.size()

/* --- Initialization --- */
// 1. Init queue with starting nodes
// 2. Prepare initial input buffers
@@ -288,14 +327,28 @@ std::vector<paddle::experimental::Tensor> RunBackward(
getInDegreeMap(queue);

std::unordered_map<GradNodeBase*, AutogradMeta*> target_nodes_inputmeta_map;
std::unordered_set<GradNodeBase*> target_nodes;
std::unordered_set<GradNodeBase*> target_nodes; // should be updated?
GetTargetNodesInfo(inputs, &target_nodes, &target_nodes_inputmeta_map);

std::unordered_map<GradNodeBase*, GradNodeBase*> depending_nodes;
std::unordered_set<GradNodeBase*> potential_stop_nodes;
GetGraphInfoBetweenTargets(queue, &target_nodes, &depending_nodes,
&potential_stop_nodes);

std::unordered_set<GradNodeBase*> startup_ops_;
// ready_queue store all startup nodes
std::queue<GradNodeBase*> ready_queue;

// startup op's indegree should be 0
for (auto& pair : node_in_degree_map) {
if (pair.second == 0) {
auto* op = pair.first;
startup_ops_.emplace(op);
ready_queue.emplace(op);
}
}
VLOG(1) << " startup_ops' size is :" << startup_ops_.size();

std::unordered_map<GradNodeBase*, paddle::experimental::Tensor> results_map;

/* --- Topological Visit --- */
@@ -306,9 +359,9 @@ std::vector<paddle::experimental::Tensor> RunBackward(
// |- Prepare for next node
// 3. Update queue
VLOG(6) << "Run Backward";
while (!queue.empty()) {
GradNodeBase* node = queue.front();
queue.pop();
while (!ready_queue.empty()) {
GradNodeBase* node = ready_queue.front();
ready_queue.pop();

// Run node: This is where Hook happens
PADDLE_ENFORCE(
@@ -334,7 +387,7 @@ std::vector<paddle::experimental::Tensor> RunBackward(

// Run Pre Backward Node and get outputs
std::vector<std::vector<paddle::experimental::Tensor>> grad_output_tensors =
(*node)(node_input_buffer->Buffers());
(*node)(node_input_buffer->Buffers(), create_graph);
// TODO(jiabin): Should we erase it or find a more efficient way.
node_input_buffers_dict.erase(node);

@@ -410,13 +463,13 @@ std::vector<paddle::experimental::Tensor> RunBackward(
}

if (node_in_degree_map[next_node] == 0 && !is_potential_stop_node) {
queue.emplace(std::move(next_node));
ready_queue.emplace(std::move(next_node));
}
}
}
}
if (!inputs.empty()) {
return GetResults(inputs, results_map);
return GetResults(inputs, results_map, allow_unused);
}

VLOG(1) << "Run backward in the end, return {}";
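
The backward.cc changes above boil down to two things: getInDegreeMap() now seeds every node in the init queue with indegree 0, and RunBackward() starts the topological visit from every zero-indegree node (the "startup ops") via ready_queue instead of only from the original queue, while GetResults() maps each requested input to its gradient or, under allow_unused, to an empty result. A small stand-alone Python model of the traversal part (hypothetical three-node graph, not Paddle code):

from collections import deque

# edges[n] lists the grad nodes that n feeds into (its next nodes).
edges = {"A": ["C"], "B": ["C"], "C": []}   # hypothetical grad graph
init_queue = ["A", "B"]                     # multiple startup nodes

# getInDegreeMap: seed init-queue nodes with 0, then count real in-edges.
in_degree = {n: 0 for n in init_queue}
visited, queue = set(), deque(init_queue)
while queue:
    node = queue.popleft()
    if node in visited:
        continue
    visited.add(node)
    for nxt in edges[node]:
        in_degree[nxt] = in_degree.get(nxt, 0) + 1
        queue.append(nxt)

# RunBackward: every zero-indegree node is a startup op and enters ready_queue.
ready_queue = deque(n for n, deg in in_degree.items() if deg == 0)
order = []
while ready_queue:
    node = ready_queue.popleft()
    order.append(node)
    for nxt in edges[node]:
        in_degree[nxt] -= 1
        if in_degree[nxt] == 0:
            ready_queue.append(nxt)

print(order)  # e.g. ['A', 'B', 'C']
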
3 changes: 2 additions & 1 deletion paddle/fluid/eager/grad_node_info.h
@@ -95,7 +95,8 @@ class GradNodeBase {
* is better choice to fit this format.
* **/
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads) = 0;
const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
const bool create_graph = false) = 0;

/**
* AddEdges is designed to set input tensors' backward Node as current
@@ -32,8 +32,8 @@ class GradTestNode : public egr::GradNodeBase {
GradTestNode() : GradNodeBase() { val_ = 1.0; }
std::string name() override { return "GradTestNode"; }
std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads)
override {
const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
const bool create_graph = false) override {
val_ = std::dynamic_pointer_cast<phi::DenseTensor>(grads[0][0].impl())
->data<float>()[0];
phi::DenseTensorMeta meta =
31 changes: 31 additions & 0 deletions python/paddle/fluid/tests/unittests/test_imperative_double_grad.py
@@ -19,6 +19,7 @@
import unittest
from unittest import TestCase
import numpy as np
import paddle.compat as cpt
from paddle.fluid.framework import _test_eager_guard


@@ -61,6 +62,36 @@ def test_simple_example_eager_grad(self):
with _test_eager_guard():
self.func_simple_example_eager_grad()
self.func_simple_example_eager_grad()

def func_simple_example_eager_grad_allow_unused(self):
np.random.seed(2021)
paddle.set_device('cpu')
np_x = np.random.random((3, 3))
np_y = np.random.random((3, 1))
np_z = np.random.random((3, 1))
x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False)
y = paddle.to_tensor(np_y, dtype="float64", stop_gradient=False)
z = paddle.to_tensor(np_z, dtype="float64", stop_gradient=False)
out_z = paddle.nn.functional.sigmoid(z)
out = paddle.matmul(x, y)

dx = fluid.dygraph.grad(out, [x, z], allow_unused=True)

dout = np.ones_like(np_y)
expected_dx = np.matmul(dout, np.transpose(np_y))

self.assertTrue(np.allclose(dx[0].numpy(), expected_dx[0]))

try:
dx = fluid.dygraph.grad(out, [x, z])
except ValueError as e:
error_msg = cpt.get_exception_message(e)
assert error_msg.find("allow_unused") > 0

def test_simple_example_eager_grad_allow_unused(self):
with _test_eager_guard():
self.func_simple_example_eager_grad_allow_unused()
self.func_simple_example_eager_grad_allow_unused()


class TestDygraphDoubleGrad(TestCase):
