[AutoDiff] Automatically determine AdStack's size (#2438)

* [AutoDiff] Automatically determine AdStack's size * Auto Format * revert auto format * revert auto format * update comment * fix format * oops * Update taichi/ir/statements.h Co-authored-by: Ye Kuang <k-ye@users.noreply.github.com> * Update taichi/ir/control_flow_graph.cpp Co-authored-by: Yuanming Hu <yuanming-hu@users.noreply.github.com> * Apply review * Add a documentation * Add a basic C++ test * Add 3 more tests * Add comments * Update taichi/ir/control_flow_graph.cpp Co-authored-by: Ye Kuang <k-ye@users.noreply.github.com> * Apply review, use parameterized tests, fix typo and code format * Apply review * Use the Bellman-Ford algorithm * Update taichi/ir/control_flow_graph.cpp * Run Bellman-Ford on each stack separately, fix a bug, add a test Co-authored-by: Taichi Gardener <taichigardener@gmail.com> Co-authored-by: Ye Kuang <k-ye@users.noreply.github.com> Co-authored-by: Yuanming Hu <yuanming-hu@users.noreply.github.com>
taichi-dev · Jun 29, 2021 · 7fe1cf2 · 7fe1cf2
1 parent a7d9dc2
commit 7fe1cf2
Show file tree

Hide file tree

Showing 15 changed files with 455 additions and 6 deletions.
diff --git a/taichi/backends/cc/codegen_cc.cpp b/taichi/backends/cc/codegen_cc.cpp
@@ -533,6 +533,9 @@ class CCTransformer : public IRVisitor {
 
   void visit(AdStackAllocaStmt *stmt) override {
     TI_ASSERT(stmt->width() == 1);
+    TI_ASSERT_INFO(
+        stmt->max_size > 0,
+        "Adaptive autodiff stack's size should have been determined.");
 
     const auto &var_name = stmt->raw_name();
     emit("Ti_u8 {}[{}];", var_name, stmt->size_in_bytes() + sizeof(uint32_t));

diff --git a/taichi/backends/metal/codegen_metal.cpp b/taichi/backends/metal/codegen_metal.cpp
@@ -737,6 +737,9 @@ class KernelCodegenImpl : public IRVisitor {
 
   void visit(AdStackAllocaStmt *stmt) override {
     TI_ASSERT(stmt->width() == 1);
+    TI_ASSERT_INFO(
+        stmt->max_size > 0,
+        "Adaptive autodiff stack's size should have been determined.");
 
     const auto &var_name = stmt->raw_name();
     emit("byte {}[{}];", var_name, stmt->size_in_bytes());

diff --git a/taichi/codegen/codegen_llvm.cpp b/taichi/codegen/codegen_llvm.cpp
@@ -1870,6 +1870,8 @@ void CodeGenLLVM::visit(InternalFuncStmt *stmt) {
 
 void CodeGenLLVM::visit(AdStackAllocaStmt *stmt) {
   TI_ASSERT(stmt->width() == 1);
+  TI_ASSERT_INFO(stmt->max_size > 0,
+                 "Adaptive autodiff stack's size should have been determined.");
   auto type = llvm::ArrayType::get(llvm::Type::getInt8Ty(*llvm_context),
                                    stmt->size_in_bytes());
   auto alloca = create_entry_block_alloca(type, sizeof(int64));

diff --git a/taichi/ir/control_flow_graph.cpp b/taichi/ir/control_flow_graph.cpp
@@ -1,6 +1,7 @@
 #include "taichi/ir/control_flow_graph.h"
 
 #include <queue>
+#include <unordered_set>
 
 #include "taichi/ir/analysis.h"
 #include "taichi/ir/statements.h"
@@ -879,4 +880,164 @@ std::unordered_set<SNode *> ControlFlowGraph::gather_loaded_snodes() {
   return snodes;
 }
 
+void ControlFlowGraph::determine_ad_stack_size(int default_ad_stack_size) {
+  /**
+   * Determine all adaptive AD-stacks' necessary size using the Bellman-Ford
+   * algorithm. When there is a positive loop (#pushes > #pops in a loop)
+   * for an AD-stack, we cannot determine the size of the AD-stack, and
+   * |default_ad_stack_size| is used. The time complexity is
+   * O(num_statements + num_stacks * num_edges * num_nodes).
+   *
+   * Only adaptive stacks (|max_size| == 0) are processed. Stacks that already
+   * carry an explicit size are left untouched: their pushes/pops are not
+   * counted, so running the algorithm on them would yield 0 and overwrite
+   * the explicit size.
+   *
+   * @param default_ad_stack_size The size assigned to an adaptive AD-stack
+   * whose necessary size cannot be determined.
+   */
+  const int num_nodes = size();
+
+  // max_increased_size[i][j] is the maximum number of (pushes - pops) of
+  // stack |i| among all prefixes of the CFGNode |j|.
+  std::unordered_map<AdStackAllocaStmt *, std::vector<int>> max_increased_size;
+
+  // increased_size[i][j] is the number of (pushes - pops) of stack |i| in
+  // the CFGNode |j|.
+  std::unordered_map<AdStackAllocaStmt *, std::vector<int>> increased_size;
+
+  std::unordered_map<CFGNode *, int> node_ids;
+  // The adaptive AD-stacks whose sizes are to be determined.
+  std::unordered_set<AdStackAllocaStmt *> all_stacks;
+  std::unordered_set<AdStackAllocaStmt *> indeterminable_stacks;
+
+  for (int i = 0; i < num_nodes; i++)
+    node_ids[nodes[i].get()] = i;
+
+  // Collect the adaptive AD-stacks and allocate their per-node counters.
+  for (int i = 0; i < num_nodes; i++) {
+    for (int j = nodes[i]->begin_location; j < nodes[i]->end_location; j++) {
+      Stmt *stmt = nodes[i]->block->statements[j].get();
+      if (auto *stack = stmt->cast<AdStackAllocaStmt>()) {
+        if (stack->max_size != 0) {
+          // Fixed-size stack: keep the explicitly requested size.
+          continue;
+        }
+        all_stacks.insert(stack);
+        max_increased_size.emplace(stack, std::vector<int>(num_nodes, 0));
+        increased_size.emplace(stack, std::vector<int>(num_nodes, 0));
+      }
+    }
+  }
+
+  // For each basic block we compute the increase of stack size. This is a
+  // pre-processing step for the next maximum stack size determining algorithm.
+  for (int i = 0; i < num_nodes; i++) {
+    for (int j = nodes[i]->begin_location; j < nodes[i]->end_location; j++) {
+      Stmt *stmt = nodes[i]->block->statements[j].get();
+      if (auto *stack_push = stmt->cast<AdStackPushStmt>()) {
+        auto *stack = stack_push->stack->as<AdStackAllocaStmt>();
+        if (stack->max_size == 0 /*adaptive*/) {
+          increased_size[stack][i]++;
+          // Track the running maximum over all prefixes of this node.
+          if (increased_size[stack][i] > max_increased_size[stack][i]) {
+            max_increased_size[stack][i] = increased_size[stack][i];
+          }
+        }
+      } else if (auto *stack_pop = stmt->cast<AdStackPopStmt>()) {
+        auto *stack = stack_pop->stack->as<AdStackAllocaStmt>();
+        if (stack->max_size == 0 /*adaptive*/) {
+          increased_size[stack][i]--;
+        }
+      }
+    }
+  }
+
+  // The maximum stack size determining algorithm -- run the Bellman-Ford
+  // algorithm on each AD-stack separately.
+  for (auto *stack : all_stacks) {
+    // The maximum size of |stack| among all control flows starting at the
+    // beginning of the IR.
+    int max_size = 0;
+
+    // max_size_at_node_begin[j] is the maximum size of |stack| among
+    // all control flows starting at the beginning of the IR and ending at the
+    // beginning of the CFGNode |j|. Initialize this array to -1 to make sure
+    // that the first iteration of the Bellman-Ford algorithm fully updates
+    // this array.
+    std::vector<int> max_size_at_node_begin(num_nodes, -1);
+
+    // The queue for the Bellman-Ford algorithm.
+    std::queue<int> to_visit;
+
+    // An optimization for the Bellman-Ford algorithm: a node already waiting
+    // in the queue is not pushed again.
+    std::vector<bool> in_queue(num_nodes);
+
+    // An array for detecting positive loop in the Bellman-Ford algorithm.
+    std::vector<int> times_pushed_in_queue(num_nodes, 0);
+
+    max_size_at_node_begin[start_node] = 0;
+    to_visit.push(start_node);
+    in_queue[start_node] = true;
+    times_pushed_in_queue[start_node]++;
+
+    bool has_positive_loop = false;
+
+    // The Bellman-Ford algorithm.
+    while (!to_visit.empty()) {
+      int node_id = to_visit.front();
+      to_visit.pop();
+      in_queue[node_id] = false;
+      CFGNode *now = nodes[node_id].get();
+
+      // Inside this CFGNode -- update the answer |max_size|
+      const auto max_size_inside_this_node = max_increased_size[stack][node_id];
+      const auto current_max_size =
+          max_size_at_node_begin[node_id] + max_size_inside_this_node;
+      if (current_max_size > max_size) {
+        max_size = current_max_size;
+      }
+      // At the end of this CFGNode -- update the state
+      // |max_size_at_node_begin| of other CFGNodes
+      const auto increase_in_this_node = increased_size[stack][node_id];
+      const auto current_size =
+          max_size_at_node_begin[node_id] + increase_in_this_node;
+      for (auto *next_node : now->next) {
+        int next_node_id = node_ids[next_node];
+        if (current_size > max_size_at_node_begin[next_node_id]) {
+          max_size_at_node_begin[next_node_id] = current_size;
+          if (!in_queue[next_node_id]) {
+            if (times_pushed_in_queue[next_node_id] <= num_nodes) {
+              to_visit.push(next_node_id);
+              in_queue[next_node_id] = true;
+              times_pushed_in_queue[next_node_id]++;
+            } else {
+              // A positive loop is found because a node that has already
+              // been pushed into the queue (num_nodes + 1) times is about to
+              // be pushed again.
+              has_positive_loop = true;
+              break;
+            }
+          }
+        }
+      }
+      if (has_positive_loop) {
+        break;
+      }
+    }
+
+    if (has_positive_loop) {
+      stack->max_size = default_ad_stack_size;
+      indeterminable_stacks.insert(stack);
+    } else {
+      // Since we use |max_size| == 0 for adaptive sizes, we do not want stacks
+      // with maximum capacity indeed equal to 0.
+      TI_WARN_IF(max_size == 0,
+                 "Unused autodiff stack {} should have been eliminated.",
+                 stack->name());
+      stack->max_size = max_size;
+    }
+  }
+
+  // Print a debug message if we have indeterminable AD-stacks' sizes.
+  if (!indeterminable_stacks.empty()) {
+    std::vector<std::string> indeterminable_stacks_name;
+    indeterminable_stacks_name.reserve(indeterminable_stacks.size());
+    for (auto &stack : indeterminable_stacks) {
+      indeterminable_stacks_name.push_back(stack->name());
+    }
+    TI_DEBUG(
+        "Unable to determine the necessary size for autodiff stacks [{}]. "
+        "Use "
+        "configured size (CompileConfig::default_ad_stack_size) {} instead.",
+        fmt::join(indeterminable_stacks_name, ", "), default_ad_stack_size);
+  }
+}
+
 TLANG_NAMESPACE_END
diff --git a/taichi/ir/control_flow_graph.h b/taichi/ir/control_flow_graph.h
@@ -158,6 +158,13 @@ class ControlFlowGraph {
    * task.
    */
   std::unordered_set<SNode *> gather_loaded_snodes();
+
+  /**
+   * Determine all adaptive AD-stacks' necessary size.
+   * @param default_ad_stack_size The default AD-stack's size when we are
+   * unable to determine some AD-stack's size.
+   */
+  void determine_ad_stack_size(int default_ad_stack_size);
 };
 
 TLANG_NAMESPACE_END
diff --git a/taichi/ir/ir_builder.cpp b/taichi/ir/ir_builder.cpp
@@ -418,4 +418,31 @@ ExternalPtrStmt *IRBuilder::create_external_ptr(
   return insert(Stmt::make_typed<ExternalPtrStmt>(ptr, indices));
 }
 
+AdStackAllocaStmt *IRBuilder::create_ad_stack(const DataType &dt,
+                                              std::size_t max_size) {  // Builds an AdStackAllocaStmt from (dt, max_size) and passes it to insert().
+  return insert(Stmt::make_typed<AdStackAllocaStmt>(dt, max_size));  // Returns insert()'s result; AdStackAllocaStmt treats max_size == 0 as "adaptive".
+}
+
+void IRBuilder::ad_stack_push(AdStackAllocaStmt *stack, Stmt *val) {  // Emits a push of |val| onto the AD-stack |stack|.
+  insert(Stmt::make_typed<AdStackPushStmt>(stack, val));  // The new AdStackPushStmt is handed to insert(); its result is discarded.
+}
+
+void IRBuilder::ad_stack_pop(AdStackAllocaStmt *stack) {  // Emits a pop on the AD-stack |stack|.
+  insert(Stmt::make_typed<AdStackPopStmt>(stack));  // The new AdStackPopStmt is handed to insert(); its result is discarded.
+}
+
+AdStackLoadTopStmt *IRBuilder::ad_stack_load_top(AdStackAllocaStmt *stack) {  // Emits an AdStackLoadTopStmt for the AD-stack |stack|.
+  return insert(Stmt::make_typed<AdStackLoadTopStmt>(stack));  // Returns insert()'s result so callers can use the statement as an operand.
+}
+
+AdStackLoadTopAdjStmt *IRBuilder::ad_stack_load_top_adjoint(
+    AdStackAllocaStmt *stack) {  // Emits an AdStackLoadTopAdjStmt for the AD-stack |stack|.
+  return insert(Stmt::make_typed<AdStackLoadTopAdjStmt>(stack));  // Returns insert()'s result so callers can use the statement as an operand.
+}
+
+void IRBuilder::ad_stack_accumulate_adjoint(AdStackAllocaStmt *stack,
+                                            Stmt *val) {  // Emits an AdStackAccAdjointStmt built from (stack, val).
+  insert(Stmt::make_typed<AdStackAccAdjointStmt>(stack, val));  // The new statement is handed to insert(); its result is discarded.
+}
+
 TLANG_NAMESPACE_END
diff --git a/taichi/ir/ir_builder.h b/taichi/ir/ir_builder.h
@@ -248,6 +248,14 @@ class IRBuilder {
     }
   }
 
+  // Autodiff stack operations.
+  AdStackAllocaStmt *create_ad_stack(const DataType &dt, std::size_t max_size);
+  void ad_stack_push(AdStackAllocaStmt *stack, Stmt *val);
+  void ad_stack_pop(AdStackAllocaStmt *stack);
+  AdStackLoadTopStmt *ad_stack_load_top(AdStackAllocaStmt *stack);
+  AdStackLoadTopAdjStmt *ad_stack_load_top_adjoint(AdStackAllocaStmt *stack);
+  void ad_stack_accumulate_adjoint(AdStackAllocaStmt *stack, Stmt *val);
+
  private:
   std::unique_ptr<Block> root_{nullptr};
   InsertPoint insert_point_;

diff --git a/taichi/ir/statements.h b/taichi/ir/statements.h
@@ -1222,7 +1222,7 @@ class InternalFuncStmt : public Stmt {
 class AdStackAllocaStmt : public Stmt {
  public:
   DataType dt;
-  std::size_t max_size;  // TODO: 0 = adaptive
+  std::size_t max_size{0};  // 0 = adaptive
 
   AdStackAllocaStmt(const DataType &dt, std::size_t max_size)
       : dt(dt), max_size(max_size) {

diff --git a/taichi/ir/transforms.h b/taichi/ir/transforms.h
@@ -71,6 +71,13 @@ bool lower_access(IRNode *root,
 void auto_diff(IRNode *root,
                const CompileConfig &config,
                bool use_stack = false);
+/**
+ * Determine all adaptive AD-stacks' size. This pass is idempotent, i.e.,
+ * there are no side effects if called more than once or called when not needed.
+ * @return Whether the IR is modified, i.e., whether there exist adaptive
+ * AD-stacks before this pass.
+ */
+bool determine_ad_stack_size(IRNode *root, const CompileConfig &config);
 bool constant_fold(IRNode *root,
                    const CompileConfig &config,
                    const ConstantFoldPass::Args &args);
@@ -124,6 +131,7 @@ void offload_to_executable(IRNode *ir,
                            const CompileConfig &config,
                            Kernel *kernel,
                            bool verbose,
+                           bool determine_ad_stack_size,
                            bool lower_global_access,
                            bool make_thread_local,
                            bool make_block_local);

diff --git a/taichi/program/async_engine.cpp b/taichi/program/async_engine.cpp
@@ -154,6 +154,7 @@ void ExecutionQueue::enqueue(const TaskLaunchRecord &ker) {
           auto ir = stmt;
           offload_to_executable(
               ir, config, kernel, /*verbose=*/false,
+              /*determine_ad_stack_size=*/true,
               /*lower_global_access=*/true,
               /*make_thread_local=*/true,
               /*make_block_local=*/

diff --git a/taichi/program/compile_config.cpp b/taichi/program/compile_config.cpp
@@ -45,8 +45,6 @@ CompileConfig::CompileConfig() {
   cpu_max_num_threads = std::thread::hardware_concurrency();
   random_seed = 0;
 
-  ad_stack_size = 16;
-
   // LLVM backend options:
   print_struct_llvm_ir = false;
   print_kernel_llvm_ir = false;

diff --git a/taichi/program/compile_config.h b/taichi/program/compile_config.h
@@ -42,7 +42,10 @@ struct CompileConfig {
   int default_cpu_block_dim;
   int default_gpu_block_dim;
   int gpu_max_reg;
-  int ad_stack_size;
+  int ad_stack_size{0};  // 0 = adaptive
+  // The default size when the Taichi compiler is unable to automatically
+  // determine the autodiff stack size.
+  int default_ad_stack_size{32};
 
   int saturating_grid_dim;
   int max_block_dim;

diff --git a/taichi/transforms/compile_to_offloads.cpp b/taichi/transforms/compile_to_offloads.cpp
@@ -145,6 +145,7 @@ void offload_to_executable(IRNode *ir,
                            const CompileConfig &config,
                            Kernel *kernel,
                            bool verbose,
+                           bool determine_ad_stack_size,
                            bool lower_global_access,
                            bool make_thread_local,
                            bool make_block_local) {
@@ -224,6 +225,11 @@ void offload_to_executable(IRNode *ir,
   irpass::full_simplify(ir, config, {lower_global_access, kernel->program});
   print("Simplified IV");
 
+  if (determine_ad_stack_size) {
+    irpass::determine_ad_stack_size(ir, config);
+    print("Autodiff stack size determined");
+  }
+
   if (is_extension_supported(config.arch, Extension::quant)) {
     irpass::optimize_bit_struct_stores(ir, config, amgr.get());
     print("Bit struct stores optimized");
@@ -250,8 +256,10 @@ void compile_to_executable(IRNode *ir,
   compile_to_offloads(ir, config, kernel, verbose, vectorize, grad,
                       ad_use_stack, start_from_ast);
 
-  offload_to_executable(ir, config, kernel, verbose, lower_global_access,
-                        make_thread_local, make_block_local);
+  offload_to_executable(ir, config, kernel, verbose,
+                        /*determine_ad_stack_size=*/grad && ad_use_stack,
+                        lower_global_access, make_thread_local,
+                        make_block_local);
 }
 
 void compile_inline_function(IRNode *ir,

diff --git a/taichi/transforms/determine_ad_stack_size.cpp b/taichi/transforms/determine_ad_stack_size.cpp
@@ -0,0 +1,33 @@
+#include "taichi/ir/analysis.h"
+#include "taichi/ir/control_flow_graph.h"
+#include "taichi/ir/ir.h"
+#include "taichi/ir/statements.h"
+#include "taichi/ir/transforms.h"
+
+#include <queue>
+#include <unordered_map>
+
+namespace taichi {
+namespace lang {
+
+namespace irpass {
+
+bool determine_ad_stack_size(IRNode *root, const CompileConfig &config) {
+  // Selects AD-stack allocations whose size is still adaptive
+  // (max_size == 0).
+  const auto is_adaptive_ad_stack = [](Stmt *s) -> bool {
+    auto *ad_stack = s->cast<AdStackAllocaStmt>();
+    return ad_stack && ad_stack->max_size == 0;
+  };
+
+  const auto adaptive_stacks =
+      irpass::analysis::gather_statements(root, is_adaptive_ad_stack);
+  if (adaptive_stacks.empty()) {
+    // No AD-stacks with adaptive size: the IR is left unchanged.
+    return false;
+  }
+
+  // Build and simplify the control-flow graph, then let it assign a size to
+  // every adaptive AD-stack, falling back to the configured default.
+  auto cfg = analysis::build_cfg(root);
+  cfg->simplify_graph();
+  cfg->determine_ad_stack_size(config.default_ad_stack_size);
+  return true;
+}
+
+}  // namespace irpass
+
+}  // namespace lang
+}  // namespace taichi