Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AutoDiff] Automatically determine AdStack's size #2438

Merged
merged 20 commits into from
Jun 29, 2021
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions taichi/backends/cc/codegen_cc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -533,6 +533,9 @@ class CCTransformer : public IRVisitor {

void visit(AdStackAllocaStmt *stmt) override {
TI_ASSERT(stmt->width() == 1);
TI_ASSERT_INFO(
stmt->max_size > 0,
"Adaptive autodiff stack's size should have been determined.");
k-ye marked this conversation as resolved.
Show resolved Hide resolved

const auto &var_name = stmt->raw_name();
emit("Ti_u8 {}[{}];", var_name, stmt->size_in_bytes() + sizeof(uint32_t));
Expand Down
7 changes: 5 additions & 2 deletions taichi/backends/metal/codegen_metal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,10 +109,10 @@ class KernelCodegenImpl : public IRVisitor {
kernel_(kernel),
compiled_structs_(compiled_structs),
needs_root_buffer_(compiled_structs_->root_size > 0),
ctx_attribs_(*kernel_),
print_strtab_(print_strtab),
cgen_config_(config),
offloaded_(offloaded) {
offloaded_(offloaded),
ctx_attribs_(*kernel_) {
yuanming-hu marked this conversation as resolved.
Show resolved Hide resolved
ti_kernel_attribs_.name = taichi_kernel_name;
ti_kernel_attribs_.is_jit_evaluator = kernel->is_evaluator;
// allow_undefined_visitor = true;
Expand Down Expand Up @@ -737,6 +737,9 @@ class KernelCodegenImpl : public IRVisitor {

void visit(AdStackAllocaStmt *stmt) override {
TI_ASSERT(stmt->width() == 1);
TI_ASSERT_INFO(
stmt->max_size > 0,
"Adaptive autodiff stack's size should have been determined.");

const auto &var_name = stmt->raw_name();
emit("byte {}[{}];", var_name, stmt->size_in_bytes());
Expand Down
2 changes: 2 additions & 0 deletions taichi/codegen/codegen_llvm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1864,6 +1864,8 @@ void CodeGenLLVM::visit(InternalFuncStmt *stmt) {

void CodeGenLLVM::visit(AdStackAllocaStmt *stmt) {
TI_ASSERT(stmt->width() == 1);
TI_ASSERT_INFO(stmt->max_size > 0,
"Adaptive autodiff stack's size should have been determined.");
auto type = llvm::ArrayType::get(llvm::Type::getInt8Ty(*llvm_context),
stmt->size_in_bytes());
auto alloca = create_entry_block_alloca(type, sizeof(int64));
Expand Down
101 changes: 101 additions & 0 deletions taichi/ir/control_flow_graph.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "taichi/ir/control_flow_graph.h"

#include <queue>
#include <unordered_set>

#include "taichi/ir/analysis.h"
#include "taichi/ir/statements.h"
Expand Down Expand Up @@ -863,4 +864,104 @@ std::unordered_set<SNode *> ControlFlowGraph::gather_loaded_snodes() {
return snodes;
}

void ControlFlowGraph::determine_ad_stack_size(int max_ad_stack_size) {
const int num_nodes = size();
std::unordered_map<AdStackAllocaStmt *, int> max_size;
std::vector<std::unordered_map<AdStackAllocaStmt *, int>>
max_size_at_node_begin(num_nodes);
std::vector<std::unordered_map<AdStackAllocaStmt *, int>> max_increased_size(
num_nodes);
std::vector<std::unordered_map<AdStackAllocaStmt *, int>> increased_size(
num_nodes);
xumingkuan marked this conversation as resolved.
Show resolved Hide resolved
std::queue<int> to_visit;
std::vector<bool> in_queue(num_nodes);
std::unordered_map<CFGNode *, int> node_ids;
std::unordered_set<AdStackAllocaStmt *> oversized_stacks;

for (int i = 0; i < num_nodes; i++)
node_ids[nodes[i].get()] = i;

for (int i = 0; i < num_nodes; i++) {
for (int j = nodes[i]->begin_location; j < nodes[i]->end_location; j++) {
Stmt *stmt = nodes[i]->block->statements[j].get();
if (auto *stack_push = stmt->cast<AdStackPushStmt>()) {
auto *stack = stack_push->stack->as<AdStackAllocaStmt>();
if (stack->max_size == 0 /*adaptive*/) {
increased_size[i][stack]++;
if (increased_size[i][stack] > max_increased_size[i][stack]) {
max_increased_size[i][stack] = increased_size[i][stack];
}
}
} else if (auto *stack_pop = stmt->cast<AdStackPopStmt>()) {
auto *stack = stack_pop->stack->as<AdStackAllocaStmt>();
if (stack->max_size == 0 /*adaptive*/) {
increased_size[i][stack]--;
}
}
}
to_visit.push(i);
in_queue[i] = true;
}
xumingkuan marked this conversation as resolved.
Show resolved Hide resolved

while (!to_visit.empty()) {
xumingkuan marked this conversation as resolved.
Show resolved Hide resolved
int node_id = to_visit.front();
to_visit.pop();
in_queue[node_id] = false;
CFGNode *now = nodes[node_id].get();

for (auto &it : max_increased_size[node_id]) {
auto *stack = it.first;
// Inside this CFGNode
auto current_max_size =
max_size_at_node_begin[node_id][stack] + it.second;
xumingkuan marked this conversation as resolved.
Show resolved Hide resolved
if (current_max_size > max_ad_stack_size) {
current_max_size = max_ad_stack_size;
oversized_stacks.insert(stack);
}
if (current_max_size > max_size[stack]) {
max_size[stack] = current_max_size;
}
}
for (auto &it : increased_size[node_id]) {
auto *stack = it.first;
// At the end of this CFGNode
auto current_size = max_size_at_node_begin[node_id][stack] + it.second;
if (current_size > max_ad_stack_size) {
current_size = max_ad_stack_size; // avoid infinite loop
}
for (auto *next_node : now->next) {
int next_node_id = node_ids[next_node];
if (current_size > max_size_at_node_begin[next_node_id][stack]) {
max_size_at_node_begin[next_node_id][stack] = current_size;
if (!in_queue[next_node_id]) {
to_visit.push(next_node_id);
in_queue[next_node_id] = true;
}
}
}
}
}

if (!oversized_stacks.empty()) {
std::vector<std::string> oversized_stacks_name;
oversized_stacks_name.reserve(oversized_stacks.size());
for (auto &stack : oversized_stacks) {
oversized_stacks_name.push_back(stack->name());
}
TI_WARN(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How serious is it if the necessary AD stack size exceeds max_ad_stack_size? If this would directly lead to wrong results, shouldn't we report an error and stop immediately?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not that serious -- IMHO in most cases, it's the control-flow graph that is unable to determine the max capacity. For example:

s = stack()
for i in range(10):
  s.push(i)

The control-flow graph does not have the range 10, and thus cannot determine the capacity.

From another perspective, the current codebase uses a fixed capacity of 16, and it works fine.

On the other hand, I don't see any warnings in the current tests, so maybe even the above case (a loop with #pushes > #pops) doesn't exist, and the control-flow graph is able to determine all maximum capacities. Whether the above case exists depends on the usage of AD-stack in auto_diff.cpp.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, the code implementation looks like it is able to figure out the necessary stack size, which overflows max_ad_stack_size. But the interpretation is "unable to determine the max capacity". Is it possible to distinguish these two cases, or maybe I'm misunderstanding something ? 🤣

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh I was using the terms "size" and "capacity" interchangeably... It should be the necessary stack size, not the max capacity.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If auto_diff.cpp is bug-free, then #pushes should be always equal to #pops?

I still don't see why an undetermined stack would not lead to a bad result. IIUC, this is serious, but there are cases where the stack size just cannot be determined statically? (Like the range(10) example you gave.)

I don't see any warnings in the current tests

Were you referring to the Python tests, or the new C++ tests? If the former, I think that's because the output is not shown in pytest by default.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the explanation. Let's add this comment to the code?

Copy link
Contributor Author

@xumingkuan xumingkuan Jun 28, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now I think maybe it's better to use the Bellman-Ford algorithm -- when we are "able to figure out all stack sizes statically", we are guaranteed to figure them out even if a stack needs a large size (and the algorithm's running time is approximately the same); when we are unable to figure out at least one stack size, the algorithm will run slower but we will be sure that we are unable to figure out the stack size statically (instead of a warning about possible overflow) (So, in this case, I think the message should not be a TI_WARN -- maybe a TI_INFO or TI_DEBUG). WDYT?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now I think maybe it's better to use the Bellman-Ford algorithm

Sounds great!

So, in this case, I think the message should not be a TI_WARN -- maybe a TI_INFO or TI_DEBUG. WDYT?

IIUC, we cannot determine the stack size as soon as the kernel has a loop in it? If so, yeah, maybe it's better to make this TI_DEBUG to reduce the noise..

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC, we cannot determine the stack size as soon as the kernel has a loop in it? If so, yeah, maybe it's better to make this TI_DEBUG to reduce the noise..

Right...

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ljcc0930 will help review Bellman-Ford XD

"Unable to determine capacity for autodiff stacks: {}. "
xumingkuan marked this conversation as resolved.
Show resolved Hide resolved
"Use configured maximum capacity (CompileConfig::max_ad_stack_size) {} "
"instead.",
fmt::join(oversized_stacks_name, ", "), max_ad_stack_size);
}

for (auto &it : max_size) {
auto *stack = it.first;
TI_WARN_IF(it.second == 0,
"Unused autodiff stack {} should have been eliminated.",
stack->name());
stack->max_size = it.second;
}
}

TLANG_NAMESPACE_END
7 changes: 7 additions & 0 deletions taichi/ir/control_flow_graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,13 @@ class ControlFlowGraph {

// Gather the SNodes this offload reads.
std::unordered_set<SNode *> gather_loaded_snodes();

/**
* Determine all adaptive AD-stacks' capacity with the worklist algorithm.
* @param max_ad_stack_size The maximum allowed AD stack size. This parameter
* is set to prevent infinite loops of the algorithm.
*/
void determine_ad_stack_size(int max_ad_stack_size);
};

TLANG_NAMESPACE_END
27 changes: 27 additions & 0 deletions taichi/ir/ir_builder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -418,4 +418,31 @@ ExternalPtrStmt *IRBuilder::create_external_ptr(
return insert(Stmt::make_typed<ExternalPtrStmt>(ptr, indices));
}

// Builds an AdStackAllocaStmt from (dt, max_size), hands it to insert(), and
// returns the pointer insert() yields.
AdStackAllocaStmt *IRBuilder::create_ad_stack(const DataType &dt,
                                              std::size_t max_size) {
  auto alloca_stmt = Stmt::make_typed<AdStackAllocaStmt>(dt, max_size);
  return insert(std::move(alloca_stmt));
}

// Builds an AdStackPushStmt from (stack, val) and hands it to insert().
void IRBuilder::ad_stack_push(AdStackAllocaStmt *stack, Stmt *val) {
  auto push_stmt = Stmt::make_typed<AdStackPushStmt>(stack, val);
  insert(std::move(push_stmt));
}

// Builds an AdStackPopStmt for `stack` and hands it to insert().
void IRBuilder::ad_stack_pop(AdStackAllocaStmt *stack) {
  auto pop_stmt = Stmt::make_typed<AdStackPopStmt>(stack);
  insert(std::move(pop_stmt));
}

// Builds an AdStackLoadTopStmt for `stack`, hands it to insert(), and returns
// the pointer insert() yields.
AdStackLoadTopStmt *IRBuilder::ad_stack_load_top(AdStackAllocaStmt *stack) {
  auto load_stmt = Stmt::make_typed<AdStackLoadTopStmt>(stack);
  return insert(std::move(load_stmt));
}

// Builds an AdStackLoadTopAdjStmt for `stack`, hands it to insert(), and
// returns the pointer insert() yields.
AdStackLoadTopAdjStmt *IRBuilder::ad_stack_load_top_adjoint(
    AdStackAllocaStmt *stack) {
  auto load_adj_stmt = Stmt::make_typed<AdStackLoadTopAdjStmt>(stack);
  return insert(std::move(load_adj_stmt));
}

// Builds an AdStackAccAdjointStmt from (stack, val) and hands it to insert().
void IRBuilder::ad_stack_accumulate_adjoint(AdStackAllocaStmt *stack,
                                            Stmt *val) {
  auto acc_stmt = Stmt::make_typed<AdStackAccAdjointStmt>(stack, val);
  insert(std::move(acc_stmt));
}

TLANG_NAMESPACE_END
10 changes: 9 additions & 1 deletion taichi/ir/ir_builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ class IRBuilder {

// Print values and strings. Arguments can be Stmt* or std::string.
template <typename... Args>
PrintStmt *create_print(Args &&... args) {
PrintStmt *create_print(Args &&...args) {
return insert(Stmt::make_typed<PrintStmt>(std::forward<Args>(args)...));
}

Expand Down Expand Up @@ -248,6 +248,14 @@ class IRBuilder {
}
}

// Autodiff stack operations.
// Each helper builds the matching AdStack* statement from its arguments and
// passes it to insert(); helpers with a non-void return type give back the
// pointer to the newly created statement.

// Creates an AdStackAllocaStmt with element type `dt`. A `max_size` of 0 marks
// the stack as adaptive (its size is expected to be determined later).
AdStackAllocaStmt *create_ad_stack(const DataType &dt, std::size_t max_size);
// Creates an AdStackPushStmt for `stack` with value `val`.
void ad_stack_push(AdStackAllocaStmt *stack, Stmt *val);
// Creates an AdStackPopStmt for `stack`.
void ad_stack_pop(AdStackAllocaStmt *stack);
// Creates an AdStackLoadTopStmt for `stack`.
AdStackLoadTopStmt *ad_stack_load_top(AdStackAllocaStmt *stack);
// Creates an AdStackLoadTopAdjStmt for `stack`.
AdStackLoadTopAdjStmt *ad_stack_load_top_adjoint(AdStackAllocaStmt *stack);
// Creates an AdStackAccAdjointStmt for `stack` with value `val`.
void ad_stack_accumulate_adjoint(AdStackAllocaStmt *stack, Stmt *val);

private:
std::unique_ptr<Block> root_{nullptr};
InsertPoint insert_point_;
Expand Down
2 changes: 1 addition & 1 deletion taichi/ir/statements.h
Original file line number Diff line number Diff line change
Expand Up @@ -1222,7 +1222,7 @@ class InternalFuncStmt : public Stmt {
class AdStackAllocaStmt : public Stmt {
public:
DataType dt;
std::size_t max_size; // TODO: 0 = adaptive
std::size_t max_size{0}; // 0 = adaptive

AdStackAllocaStmt(const DataType &dt, std::size_t max_size)
: dt(dt), max_size(max_size) {
Expand Down
8 changes: 8 additions & 0 deletions taichi/ir/transforms.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,13 @@ bool lower_access(IRNode *root,
void auto_diff(IRNode *root,
const CompileConfig &config,
bool use_stack = false);
/**
* Determine all adaptive AD-stacks' capacity. This pass is idempotent, i.e.,
* there are no side effects if called more than once or called when not needed.
* @return Whether the IR is modified, i.e., whether any adaptive AD-stacks
* exist before this pass.
*/
bool determine_ad_stack_size(IRNode *root, const CompileConfig &config);
xumingkuan marked this conversation as resolved.
Show resolved Hide resolved
bool constant_fold(IRNode *root,
const CompileConfig &config,
const ConstantFoldPass::Args &args);
Expand Down Expand Up @@ -124,6 +131,7 @@ void offload_to_executable(IRNode *ir,
const CompileConfig &config,
Kernel *kernel,
bool verbose,
bool determine_ad_stack_size,
bool lower_global_access,
bool make_thread_local,
bool make_block_local);
Expand Down
1 change: 1 addition & 0 deletions taichi/program/async_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ void ExecutionQueue::enqueue(const TaskLaunchRecord &ker) {
auto ir = stmt;
offload_to_executable(
ir, config, kernel, /*verbose=*/false,
/*determine_ad_stack_size=*/true,
/*lower_global_access=*/true,
/*make_thread_local=*/true,
/*make_block_local=*/
Expand Down
2 changes: 0 additions & 2 deletions taichi/program/compile_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,6 @@ CompileConfig::CompileConfig() {
cpu_max_num_threads = std::thread::hardware_concurrency();
random_seed = 0;

ad_stack_size = 16;

// LLVM backend options:
print_struct_llvm_ir = false;
print_kernel_llvm_ir = false;
Expand Down
4 changes: 3 additions & 1 deletion taichi/program/compile_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,9 @@ struct CompileConfig {
int default_cpu_block_dim;
int default_gpu_block_dim;
int gpu_max_reg;
int ad_stack_size;
int ad_stack_size{0}; // 0 = adaptive
// The maximum size when automatically determining the stack size.
int max_ad_stack_size{32};

int saturating_grid_dim;
int max_block_dim;
Expand Down
12 changes: 10 additions & 2 deletions taichi/transforms/compile_to_offloads.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ void offload_to_executable(IRNode *ir,
const CompileConfig &config,
Kernel *kernel,
bool verbose,
bool determine_ad_stack_size,
bool lower_global_access,
bool make_thread_local,
bool make_block_local) {
Expand Down Expand Up @@ -224,6 +225,11 @@ void offload_to_executable(IRNode *ir,
irpass::full_simplify(ir, config, {lower_global_access, kernel->program});
print("Simplified IV");

if (determine_ad_stack_size) {
irpass::determine_ad_stack_size(ir, config);
print("Autodiff stack size determined");
}

if (is_extension_supported(config.arch, Extension::quant)) {
irpass::optimize_bit_struct_stores(ir, config, amgr.get());
print("Bit struct stores optimized");
Expand All @@ -250,8 +256,10 @@ void compile_to_executable(IRNode *ir,
compile_to_offloads(ir, config, kernel, verbose, vectorize, grad,
ad_use_stack, start_from_ast);

offload_to_executable(ir, config, kernel, verbose, lower_global_access,
make_thread_local, make_block_local);
offload_to_executable(ir, config, kernel, verbose,
/*determine_ad_stack_size=*/grad && ad_use_stack,
lower_global_access, make_thread_local,
make_block_local);
}

void compile_inline_function(IRNode *ir,
Expand Down
33 changes: 33 additions & 0 deletions taichi/transforms/determine_ad_stack_size.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#include "taichi/ir/analysis.h"
#include "taichi/ir/control_flow_graph.h"
#include "taichi/ir/ir.h"
#include "taichi/ir/statements.h"
#include "taichi/ir/transforms.h"

#include <queue>
#include <unordered_map>

namespace taichi {
namespace lang {

namespace irpass {

// Resolves the capacity of every adaptive AD-stack (max_size == 0) under
// `root`.
// Returns false without touching the IR when no adaptive AD-stack is found;
// otherwise builds the control-flow graph, simplifies it, lets it determine the
// stack sizes (bounded by config.max_ad_stack_size), and returns true.
bool determine_ad_stack_size(IRNode *root, const CompileConfig &config) {
  // Matches only AdStackAllocaStmt instances that are still marked adaptive.
  const auto is_adaptive_stack = [](Stmt *s) {
    auto *alloca_stmt = s->cast<AdStackAllocaStmt>();
    return alloca_stmt != nullptr && alloca_stmt->max_size == 0;
  };

  const auto adaptive_stacks =
      irpass::analysis::gather_statements(root, is_adaptive_stack);
  if (adaptive_stacks.empty()) {
    return false;  // no AD-stacks with adaptive size
  }

  auto graph = analysis::build_cfg(root);
  graph->simplify_graph();
  graph->determine_ad_stack_size(config.max_ad_stack_size);
  return true;
}

} // namespace irpass

} // namespace lang
} // namespace taichi
Loading