Skip to content

Commit

Permalink
[AutoDiff] Automatically determine AdStack's size (#2438)
Browse files Browse the repository at this point in the history
* [AutoDiff] Automatically determine AdStack's size

* Auto Format

* revert auto format

* revert auto format

* update comment

* fix format

* oops

* Update taichi/ir/statements.h

Co-authored-by: Ye Kuang <k-ye@users.noreply.github.com>

* Update taichi/ir/control_flow_graph.cpp

Co-authored-by: Yuanming Hu <yuanming-hu@users.noreply.github.com>

* Apply review

* Add a documentation

* Add a basic C++ test

* Add 3 more tests

* Add comments

* Update taichi/ir/control_flow_graph.cpp

Co-authored-by: Ye Kuang <k-ye@users.noreply.github.com>

* Apply review, use parameterized tests, fix typo and code format

* Apply review

* Use the Bellman-Ford algorithm

* Update taichi/ir/control_flow_graph.cpp

* Run Bellman-Ford on each stack separately, fix a bug, add a test

Co-authored-by: Taichi Gardener <taichigardener@gmail.com>
Co-authored-by: Ye Kuang <k-ye@users.noreply.github.com>
Co-authored-by: Yuanming Hu <yuanming-hu@users.noreply.github.com>
  • Loading branch information
4 people authored Jun 29, 2021
1 parent a7d9dc2 commit 7fe1cf2
Show file tree
Hide file tree
Showing 15 changed files with 455 additions and 6 deletions.
3 changes: 3 additions & 0 deletions taichi/backends/cc/codegen_cc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -533,6 +533,9 @@ class CCTransformer : public IRVisitor {

void visit(AdStackAllocaStmt *stmt) override {
TI_ASSERT(stmt->width() == 1);
TI_ASSERT_INFO(
stmt->max_size > 0,
"Adaptive autodiff stack's size should have been determined.");

const auto &var_name = stmt->raw_name();
emit("Ti_u8 {}[{}];", var_name, stmt->size_in_bytes() + sizeof(uint32_t));
Expand Down
3 changes: 3 additions & 0 deletions taichi/backends/metal/codegen_metal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -737,6 +737,9 @@ class KernelCodegenImpl : public IRVisitor {

void visit(AdStackAllocaStmt *stmt) override {
TI_ASSERT(stmt->width() == 1);
TI_ASSERT_INFO(
stmt->max_size > 0,
"Adaptive autodiff stack's size should have been determined.");

const auto &var_name = stmt->raw_name();
emit("byte {}[{}];", var_name, stmt->size_in_bytes());
Expand Down
2 changes: 2 additions & 0 deletions taichi/codegen/codegen_llvm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1870,6 +1870,8 @@ void CodeGenLLVM::visit(InternalFuncStmt *stmt) {

void CodeGenLLVM::visit(AdStackAllocaStmt *stmt) {
TI_ASSERT(stmt->width() == 1);
TI_ASSERT_INFO(stmt->max_size > 0,
"Adaptive autodiff stack's size should have been determined.");
auto type = llvm::ArrayType::get(llvm::Type::getInt8Ty(*llvm_context),
stmt->size_in_bytes());
auto alloca = create_entry_block_alloca(type, sizeof(int64));
Expand Down
161 changes: 161 additions & 0 deletions taichi/ir/control_flow_graph.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "taichi/ir/control_flow_graph.h"

#include <queue>
#include <unordered_set>

#include "taichi/ir/analysis.h"
#include "taichi/ir/statements.h"
Expand Down Expand Up @@ -879,4 +880,164 @@ std::unordered_set<SNode *> ControlFlowGraph::gather_loaded_snodes() {
return snodes;
}

void ControlFlowGraph::determine_ad_stack_size(int default_ad_stack_size) {
/**
* Determine all adaptive AD-stacks' necessary size using the Bellman-Ford
* algorithm. When there is a positive loop (#pushes > #pops in a loop)
* for an AD-stack, we cannot determine the size of the AD-stack, and
* |default_ad_stack_size| is used. The time complexity is
* O(num_statements + num_stacks * num_edges * num_nodes).
*/
const int num_nodes = size();

// max_increased_size[i][j] is the maximum number of (pushes - pops) of
// stack |i| among all prefixes of the CFGNode |j|.
std::unordered_map<AdStackAllocaStmt *, std::vector<int>> max_increased_size;

// increased_size[i][j] is the number of (pushes - pops) of stack |i| in
// the CFGNode |j|.
std::unordered_map<AdStackAllocaStmt *, std::vector<int>> increased_size;

std::unordered_map<CFGNode *, int> node_ids;
std::unordered_set<AdStackAllocaStmt *> all_stacks;
std::unordered_set<AdStackAllocaStmt *> indeterminable_stacks;

for (int i = 0; i < num_nodes; i++)
node_ids[nodes[i].get()] = i;

for (int i = 0; i < num_nodes; i++) {
for (int j = nodes[i]->begin_location; j < nodes[i]->end_location; j++) {
Stmt *stmt = nodes[i]->block->statements[j].get();
if (auto *stack = stmt->cast<AdStackAllocaStmt>()) {
all_stacks.insert(stack);
max_increased_size.insert(
std::make_pair(stack, std::vector<int>(num_nodes, 0)));
increased_size.insert(
std::make_pair(stack, std::vector<int>(num_nodes, 0)));
}
}
}

// For each basic block we compute the increase of stack size. This is a
// pre-processing step for the next maximum stack size determining algorithm.
for (int i = 0; i < num_nodes; i++) {
for (int j = nodes[i]->begin_location; j < nodes[i]->end_location; j++) {
Stmt *stmt = nodes[i]->block->statements[j].get();
if (auto *stack_push = stmt->cast<AdStackPushStmt>()) {
auto *stack = stack_push->stack->as<AdStackAllocaStmt>();
if (stack->max_size == 0 /*adaptive*/) {
increased_size[stack][i]++;
if (increased_size[stack][i] > max_increased_size[stack][i]) {
max_increased_size[stack][i] = increased_size[stack][i];
}
}
} else if (auto *stack_pop = stmt->cast<AdStackPopStmt>()) {
auto *stack = stack_pop->stack->as<AdStackAllocaStmt>();
if (stack->max_size == 0 /*adaptive*/) {
increased_size[stack][i]--;
}
}
}
}

// The maximum stack size determining algorithm -- run the Bellman-Ford
// algorithm on each AD-stack separately.
for (auto *stack : all_stacks) {
// The maximum size of |stack| among all control flows starting at the
// beginning of the IR.
int max_size = 0;

// max_size_at_node_begin[j] is the maximum size of |stack| among
// all control flows starting at the beginning of the IR and ending at the
// beginning of the CFGNode |j|. Initialize this array to -1 to make sure
// that the first iteration of the Bellman-Ford algorithm fully updates
// this array.
std::vector<int> max_size_at_node_begin(num_nodes, -1);

// The queue for the Bellman-Ford algorithm.
std::queue<int> to_visit;

// An optimization for the Bellman-Ford algorithm.
std::vector<bool> in_queue(num_nodes);

// An array for detecting positive loop in the Bellman-Ford algorithm.
std::vector<int> times_pushed_in_queue(num_nodes, 0);

max_size_at_node_begin[start_node] = 0;
to_visit.push(start_node);
in_queue[start_node] = true;
times_pushed_in_queue[start_node]++;

bool has_positive_loop = false;

// The Bellman-Ford algorithm.
while (!to_visit.empty()) {
int node_id = to_visit.front();
to_visit.pop();
in_queue[node_id] = false;
CFGNode *now = nodes[node_id].get();

// Inside this CFGNode -- update the answer |max_size|
const auto max_size_inside_this_node = max_increased_size[stack][node_id];
const auto current_max_size =
max_size_at_node_begin[node_id] + max_size_inside_this_node;
if (current_max_size > max_size) {
max_size = current_max_size;
}
// At the end of this CFGNode -- update the state
// |max_size_at_node_begin| of other CFGNodes
const auto increase_in_this_node = increased_size[stack][node_id];
const auto current_size =
max_size_at_node_begin[node_id] + increase_in_this_node;
for (auto *next_node : now->next) {
int next_node_id = node_ids[next_node];
if (current_size > max_size_at_node_begin[next_node_id]) {
max_size_at_node_begin[next_node_id] = current_size;
if (!in_queue[next_node_id]) {
if (times_pushed_in_queue[next_node_id] <= num_nodes) {
to_visit.push(next_node_id);
in_queue[next_node_id] = true;
times_pushed_in_queue[next_node_id]++;
} else {
// A positive loop is found because a node is going to be pushed
// into the queue the (num_nodes + 1)-th time.
has_positive_loop = true;
break;
}
}
}
}
if (has_positive_loop) {
break;
}
}

if (has_positive_loop) {
stack->max_size = default_ad_stack_size;
indeterminable_stacks.insert(stack);
} else {
// Since we use |max_size| == 0 for adaptive sizes, we do not want stacks
// with maximum capacity indeed equal to 0.
TI_WARN_IF(max_size == 0,
"Unused autodiff stack {} should have been eliminated.",
stack->name());
stack->max_size = max_size;
}
}

// Print a debug message if we have indeterminable AD-stacks' sizes.
if (!indeterminable_stacks.empty()) {
std::vector<std::string> indeterminable_stacks_name;
indeterminable_stacks_name.reserve(indeterminable_stacks.size());
for (auto &stack : indeterminable_stacks) {
indeterminable_stacks_name.push_back(stack->name());
}
TI_DEBUG(
"Unable to determine the necessary size for autodiff stacks [{}]. "
"Use "
"configured size (CompileConfig::default_ad_stack_size) {} instead.",
fmt::join(indeterminable_stacks_name, ", "), default_ad_stack_size);
}
}

TLANG_NAMESPACE_END
7 changes: 7 additions & 0 deletions taichi/ir/control_flow_graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,13 @@ class ControlFlowGraph {
* task.
*/
std::unordered_set<SNode *> gather_loaded_snodes();

/**
 * Determine all adaptive AD-stacks' necessary size and write it into the
 * corresponding AdStackAllocaStmt::max_size (0 marks an adaptive stack).
 * @param default_ad_stack_size The default AD-stack's size when we are
 * unable to determine some AD-stack's size, i.e., when some loop performs
 * more pushes than pops on that stack.
 */
void determine_ad_stack_size(int default_ad_stack_size);
};

TLANG_NAMESPACE_END
27 changes: 27 additions & 0 deletions taichi/ir/ir_builder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -418,4 +418,31 @@ ExternalPtrStmt *IRBuilder::create_external_ptr(
return insert(Stmt::make_typed<ExternalPtrStmt>(ptr, indices));
}

// Builds an AdStackAllocaStmt holding elements of type |dt| with capacity
// |max_size|, inserts it and hands back the inserted statement.
AdStackAllocaStmt *IRBuilder::create_ad_stack(const DataType &dt,
                                              std::size_t max_size) {
  AdStackAllocaStmt *new_stack =
      insert(Stmt::make_typed<AdStackAllocaStmt>(dt, max_size));
  return new_stack;
}

// Inserts an AdStackPushStmt built from |stack| and |val|.
void IRBuilder::ad_stack_push(AdStackAllocaStmt *stack, Stmt *val) {
insert(Stmt::make_typed<AdStackPushStmt>(stack, val));
}

// Inserts an AdStackPopStmt built from |stack|.
void IRBuilder::ad_stack_pop(AdStackAllocaStmt *stack) {
insert(Stmt::make_typed<AdStackPopStmt>(stack));
}

// Inserts an AdStackLoadTopStmt for |stack| and returns the inserted
// statement.
AdStackLoadTopStmt *IRBuilder::ad_stack_load_top(AdStackAllocaStmt *stack) {
  AdStackLoadTopStmt *top_value =
      insert(Stmt::make_typed<AdStackLoadTopStmt>(stack));
  return top_value;
}

// Inserts an AdStackLoadTopAdjStmt for |stack| and returns the inserted
// statement.
AdStackLoadTopAdjStmt *IRBuilder::ad_stack_load_top_adjoint(
    AdStackAllocaStmt *stack) {
  AdStackLoadTopAdjStmt *top_adjoint =
      insert(Stmt::make_typed<AdStackLoadTopAdjStmt>(stack));
  return top_adjoint;
}

// Inserts an AdStackAccAdjointStmt built from |stack| and |val|.
void IRBuilder::ad_stack_accumulate_adjoint(AdStackAllocaStmt *stack,
Stmt *val) {
insert(Stmt::make_typed<AdStackAccAdjointStmt>(stack, val));
}

TLANG_NAMESPACE_END
8 changes: 8 additions & 0 deletions taichi/ir/ir_builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,14 @@ class IRBuilder {
}
}

// Autodiff stack operations. Each helper inserts one AD-stack statement.

// Inserts an AdStackAllocaStmt of element type |dt|; a |max_size| of 0 marks
// the stack as adaptive (see AdStackAllocaStmt::max_size).
AdStackAllocaStmt *create_ad_stack(const DataType &dt, std::size_t max_size);
// Inserts an AdStackPushStmt built from |stack| and |val|.
void ad_stack_push(AdStackAllocaStmt *stack, Stmt *val);
// Inserts an AdStackPopStmt built from |stack|.
void ad_stack_pop(AdStackAllocaStmt *stack);
// Inserts an AdStackLoadTopStmt for |stack| and returns it.
AdStackLoadTopStmt *ad_stack_load_top(AdStackAllocaStmt *stack);
// Inserts an AdStackLoadTopAdjStmt for |stack| and returns it.
AdStackLoadTopAdjStmt *ad_stack_load_top_adjoint(AdStackAllocaStmt *stack);
// Inserts an AdStackAccAdjointStmt built from |stack| and |val|.
void ad_stack_accumulate_adjoint(AdStackAllocaStmt *stack, Stmt *val);

private:
std::unique_ptr<Block> root_{nullptr};
InsertPoint insert_point_;
Expand Down
2 changes: 1 addition & 1 deletion taichi/ir/statements.h
Original file line number Diff line number Diff line change
Expand Up @@ -1222,7 +1222,7 @@ class InternalFuncStmt : public Stmt {
class AdStackAllocaStmt : public Stmt {
public:
DataType dt;
std::size_t max_size; // TODO: 0 = adaptive
std::size_t max_size{0}; // 0 = adaptive

AdStackAllocaStmt(const DataType &dt, std::size_t max_size)
: dt(dt), max_size(max_size) {
Expand Down
8 changes: 8 additions & 0 deletions taichi/ir/transforms.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,13 @@ bool lower_access(IRNode *root,
void auto_diff(IRNode *root,
const CompileConfig &config,
bool use_stack = false);
/**
 * Determine all adaptive AD-stacks' size. This pass is idempotent, i.e.,
 * there are no side effects if called more than once or called when not needed.
 * @param root The IR to scan for adaptive AD-stacks (AdStackAllocaStmt with
 * max_size == 0).
 * @param config Supplies default_ad_stack_size, the size used when a stack's
 * size cannot be determined.
 * @return Whether the IR is modified, i.e., whether there exist adaptive
 * AD-stacks before this pass.
 */
bool determine_ad_stack_size(IRNode *root, const CompileConfig &config);
bool constant_fold(IRNode *root,
const CompileConfig &config,
const ConstantFoldPass::Args &args);
Expand Down Expand Up @@ -124,6 +131,7 @@ void offload_to_executable(IRNode *ir,
const CompileConfig &config,
Kernel *kernel,
bool verbose,
bool determine_ad_stack_size,
bool lower_global_access,
bool make_thread_local,
bool make_block_local);
Expand Down
1 change: 1 addition & 0 deletions taichi/program/async_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ void ExecutionQueue::enqueue(const TaskLaunchRecord &ker) {
auto ir = stmt;
offload_to_executable(
ir, config, kernel, /*verbose=*/false,
/*determine_ad_stack_size=*/true,
/*lower_global_access=*/true,
/*make_thread_local=*/true,
/*make_block_local=*/
Expand Down
2 changes: 0 additions & 2 deletions taichi/program/compile_config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,6 @@ CompileConfig::CompileConfig() {
cpu_max_num_threads = std::thread::hardware_concurrency();
random_seed = 0;

ad_stack_size = 16;

// LLVM backend options:
print_struct_llvm_ir = false;
print_kernel_llvm_ir = false;
Expand Down
5 changes: 4 additions & 1 deletion taichi/program/compile_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,10 @@ struct CompileConfig {
int default_cpu_block_dim;
int default_gpu_block_dim;
int gpu_max_reg;
int ad_stack_size;
int ad_stack_size{0}; // 0 = adaptive
// The default size when the Taichi compiler is unable to automatically
// determine the autodiff stack size.
int default_ad_stack_size{32};

int saturating_grid_dim;
int max_block_dim;
Expand Down
12 changes: 10 additions & 2 deletions taichi/transforms/compile_to_offloads.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ void offload_to_executable(IRNode *ir,
const CompileConfig &config,
Kernel *kernel,
bool verbose,
bool determine_ad_stack_size,
bool lower_global_access,
bool make_thread_local,
bool make_block_local) {
Expand Down Expand Up @@ -224,6 +225,11 @@ void offload_to_executable(IRNode *ir,
irpass::full_simplify(ir, config, {lower_global_access, kernel->program});
print("Simplified IV");

if (determine_ad_stack_size) {
irpass::determine_ad_stack_size(ir, config);
print("Autodiff stack size determined");
}

if (is_extension_supported(config.arch, Extension::quant)) {
irpass::optimize_bit_struct_stores(ir, config, amgr.get());
print("Bit struct stores optimized");
Expand All @@ -250,8 +256,10 @@ void compile_to_executable(IRNode *ir,
compile_to_offloads(ir, config, kernel, verbose, vectorize, grad,
ad_use_stack, start_from_ast);

offload_to_executable(ir, config, kernel, verbose, lower_global_access,
make_thread_local, make_block_local);
offload_to_executable(ir, config, kernel, verbose,
/*determine_ad_stack_size=*/grad && ad_use_stack,
lower_global_access, make_thread_local,
make_block_local);
}

void compile_inline_function(IRNode *ir,
Expand Down
33 changes: 33 additions & 0 deletions taichi/transforms/determine_ad_stack_size.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#include "taichi/ir/analysis.h"
#include "taichi/ir/control_flow_graph.h"
#include "taichi/ir/ir.h"
#include "taichi/ir/statements.h"
#include "taichi/ir/transforms.h"

#include <queue>
#include <unordered_map>

namespace taichi {
namespace lang {

namespace irpass {

// Sizes every adaptive AD-stack reachable from |root|. Returns false without
// building a control-flow graph when no AdStackAllocaStmt has max_size == 0;
// otherwise builds the CFG, simplifies it, lets it determine the sizes (with
// config.default_ad_stack_size as the fallback) and returns true.
bool determine_ad_stack_size(IRNode *root, const CompileConfig &config) {
  // An AD-stack is adaptive when its max_size is still 0.
  const auto is_adaptive_ad_stack = [](Stmt *s) {
    auto ad_stack = s->cast<AdStackAllocaStmt>();
    return ad_stack != nullptr && ad_stack->max_size == 0;
  };

  const auto adaptive_stacks =
      irpass::analysis::gather_statements(root, is_adaptive_ad_stack);
  if (adaptive_stacks.empty()) {
    return false;  // no AD-stacks with adaptive size
  }

  auto cfg = analysis::build_cfg(root);
  cfg->simplify_graph();
  cfg->determine_ad_stack_size(config.default_ad_stack_size);
  return true;
}

} // namespace irpass

} // namespace lang
} // namespace taichi
Loading

0 comments on commit 7fe1cf2

Please sign in to comment.