Add a 'rolling_buffer' scheduling primitive #7925

Closed · wants to merge 4 commits
7 changes: 7 additions & 0 deletions include/tvm/te/schedule.h
@@ -251,6 +251,11 @@ class Stage : public ObjectRef {
* \return reference to self.
*/
TVM_DLL Stage& double_buffer(); // NOLINT(*)
/*!
* \brief Compute current stage with rolling buffering.
* \return reference to self.
*/
TVM_DLL Stage& rolling_buffer(); // NOLINT(*)
/*!
* \brief whether the stage has been scheduled.
* \return whether the stage has been scheduled.
@@ -493,6 +498,8 @@ class StageNode : public Object {
bool is_output{false};
/*! \brief Whether apply double buffer optimization to this stage */
bool double_buffer{false};
/*! \brief Whether apply rolling buffer optimization to this stage */
bool rolling_buffer{false};
/*!
* \brief The parent group of the current stage.
* The stage cannot be assigned to stages outside the group.
2 changes: 2 additions & 0 deletions include/tvm/tir/stmt.h
@@ -1270,6 +1270,8 @@ constexpr const char* double_buffer_scope = "double_buffer_scope";
* \brief Marks region used by double buffer write
*/
constexpr const char* double_buffer_write = "double_buffer_write";
/*! \brief Mark realization for rolling buffer optimization */
constexpr const char* rolling_buffer_scope = "rolling_buffer_scope";
/*! \brief Mark of scan update scope */
constexpr const char* scan_update_scope = "scan_update_scope";
/*! \brief Mark of scan init scope */
1 change: 1 addition & 0 deletions python/tvm/driver/build_module.py
@@ -221,6 +221,7 @@ def lower(
pass_list += [
tvm.tir.transform.VectorizeLoop(not disable_vectorize),
tvm.tir.transform.InjectVirtualThread(),
tvm.tir.transform.InjectRollingBuffer(),
tvm.tir.transform.InjectDoubleBuffer(),
tvm.tir.transform.StorageRewrite(),
tvm.tir.transform.UnrollLoop(),
8 changes: 8 additions & 0 deletions python/tvm/te/schedule.py
@@ -511,6 +511,14 @@ def double_buffer(self):
"""
_ffi_api.StageDoubleBuffer(self)

def rolling_buffer(self):
"""Compute the current stage via rolling buffering.

This can only be applied to an intermediate stage.
This will change the storage cost of the current stage.
"""
_ffi_api.StageRollingBuffer(self)


@tvm._ffi.register_object
class SpecializedCondition(Object):
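A minimal usage sketch of the new primitive (the tensor names, shapes and split factor below are illustrative, not taken from this PR's tests):

import tvm
from tvm import te

# Two chained 3-row window sums: the intermediate tensor B overlaps between
# consecutive tiles of C, which makes it a rolling-buffer candidate.
A = te.placeholder((12, 16), name="A")
B = te.compute((10, 16), lambda i, j: A[i, j] + A[i + 1, j] + A[i + 2, j], name="B")
C = te.compute((8, 16), lambda i, j: B[i, j] + B[i + 1, j] + B[i + 2, j], name="C")

sch = te.create_schedule([C.op])
oi, ii = sch[C].split(C.op.axis[0], 4)  # tile the consumer's height axis
sch[B].compute_at(sch[C], oi)           # compute B per tile -> overlapping windows
sch[B].rolling_buffer()                 # mark B to be rolled over its height axis

mod = tvm.lower(sch, [A, C])            # InjectRollingBuffer runs during lowering

Without rolling_buffer, each outer iteration would recompute the two rows of B it shares with the previous tile; with it, B is realized once above the outer loop as a 6-row circular window and only the new rows are written.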
238 changes: 238 additions & 0 deletions python/tvm/tir/transform/inject_rolling_buffer.py
@@ -0,0 +1,238 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
"""Inject rolling buffers through a TIR transformation."""
# pylint: disable=invalid-name,unused-argument,inconsistent-return-statements
from collections import defaultdict, namedtuple
import math

import tvm
from tvm import arith


def InjectRollingBuffer():
"""Inject rolling buffer statements.

Rolling buffers are buffers where one of the dimensions has been made into
a circular buffer. Two optimizations are implemented in order to accomplish
this: sliding window and storage folding. In particular, the sliding window
optimization is applied to the entire buffer (to avoid recomputing elements)
and storage folding is then applied to just the rolling dimension.

Rolling buffers must be inside a loop with only part of the buffer used per
iteration. The outermost axis will be rolled over.

For more information, see the RFC:
https://discuss.tvm.apache.org/t/rfc-introducing-a-rolling-buffer-scheduling-primitive/9836

Returns
-------
fpass : tvm.transform.Pass
The pass
"""
buffer_to_attrs = defaultdict(list)
rolling_buffers = set()
rolling_buffer_to_info = dict()
for_loops = list()
hoist_buffer_to_for = defaultdict(list)

RollingBufferInfo = namedtuple(
"RollingBufferInfo", ["rolling_axis", "rolling_extent", "axis_overlaps", "axis_iter_vars"]
)

def _pre_visit(stmt):
if isinstance(stmt, tvm.tir.For):
# Manage the stack of iter_vars
for_loops.append(stmt)

elif isinstance(stmt, tvm.tir.AttrStmt):
if isinstance(stmt.node, tvm.tir.Buffer):
if stmt.attr_key == "rolling_buffer_scope" and stmt.value.value:
# If the attribute is indicating that a buffer should be a rolling
                    # buffer, then update the rolling_buffers set to include the buffer
rolling_buffers.add(stmt.node)
# Keep a dictionary associating attribute statements with the buffers
# they reference. We'll need this if the buffer gets hoisted and we
# need to hoist all of its attributes at the same time.
buffer_to_attrs[stmt.node].append(stmt)

elif isinstance(stmt, tvm.tir.BufferRealize):
if stmt.buffer in rolling_buffers:
# If a BufferRealize has been identified as needing to be made into
# a rolling buffer, begin the analysis...
bound_iter_vars = []
bound_overlaps = []
# We use the bound information of the BufferRealize to calculate
# how we can legally roll
for bound in stmt.bounds:
divisor = 1
# Handle the case of fractional strides
# They take this form: floordiv(hh.outer, 2)
# Strip the floordiv and keep track of the divisor
if isinstance(bound.min, tvm.tir.FloorDiv):
divisor = bound.min.b.value
bound.min = bound.min.a
# If the bound is an int, we can't roll over it
if isinstance(bound.min, tvm.tir.IntImm):
iter_var = None
stride = 0
# If the bound is just a Var, that implies the stride is 1
elif isinstance(bound.min, tvm.tir.Var):
iter_var = bound.min
stride = 1
                    # Otherwise, it should be the iter var multiplied by the stride;
                    # anything else is unsupported striding, so assert
else:
assert isinstance(
bound.min, tvm.tir.Mul
), "Rolling buffer injection failed: the buffer striding is unsupported"
assert isinstance(
bound.min.a, tvm.tir.Var
), "Rolling buffer injection failed: the buffer striding is unsupported"
assert isinstance(
bound.min.b, tvm.tir.IntImm
), "Rolling buffer injection failed: the buffer striding is unsupported"
iter_var = bound.min.a
stride = bound.min.b.value
stride = math.ceil(stride / divisor)
bound_iter_vars.append(iter_var)
if iter_var is not None:
bound_overlaps.append(bound.extent.value - stride)
else:
bound_overlaps.append(0)

# Pick the outermost iter_var that's mentioned in the bounds
# to be the rolling axis
roll_iter_var = None
roll_axis = -1
for loop in for_loops:
iter_var = loop.loop_var
if iter_var in bound_iter_vars:

Contributor: Clarification question: why can't we just look at bound_iter_vars directly? Are there non-outermost iter_vars identified?

mbaret (Contributor Author): It's because we don't necessarily iterate over a tensor in the same order as its bounds (e.g. we don't have to go axis 0, 1, 2...).

roll_iter_var = iter_var
roll_axis = bound_iter_vars.index(iter_var)
break

# We must have found an axis to roll over
assert (
roll_iter_var is not None
), "Rolling buffer injection failed: no rolling axis found"
assert roll_axis != -1, "Rolling buffer injection failed: no rolling axis found"
rolling_buffer_info = RollingBufferInfo(
roll_axis, stmt.bounds[roll_axis].extent.value, bound_overlaps, bound_iter_vars
)
rolling_buffer_to_info[stmt.buffer] = rolling_buffer_info
new_bounds = []
for i, extent in enumerate(stmt.buffer.shape):
if i == rolling_buffer_info.rolling_axis:
new_bounds.append(tvm.ir.Range(rolling_buffer_info.rolling_extent))
else:
new_bounds.append(tvm.ir.Range(extent))
new_realize = tvm.tir.BufferRealize(
stmt.buffer, new_bounds, stmt.condition, stmt.body, stmt.span
)
hoist_buffer_to_for[iter_var].append(new_realize)

def _post_visit(stmt):
if isinstance(stmt, tvm.tir.For):
# Manage the stack of iter_vars
for_loops.pop()
# If the loop corresponds to an iter_var that needs a BufferRealize
# hoisting to its scope, perform the hoisting
if stmt.loop_var in hoist_buffer_to_for:
body = stmt
for realize in hoist_buffer_to_for[stmt.loop_var]:
attrs = buffer_to_attrs[realize.buffer]
new_realize = tvm.tir.BufferRealize(
realize.buffer, realize.bounds, realize.condition, body, realize.span
)
# The attributes attached to the BufferRealize need hoisting too
for attr in attrs:
if attr.attr_key == "rolling_buffer_scope":
continue
new_realize = tvm.tir.AttrStmt(
attr.node, attr.attr_key, attr.value, new_realize, attr.span
)
body = new_realize
return body
elif isinstance(stmt, tvm.tir.AttrStmt):
if stmt.node in rolling_buffers:
# Remove the attribute statements attached to rolling buffers
# because they will have been hoisted to the relevant rolling
# scope
return stmt.body
elif isinstance(stmt, tvm.tir.BufferRealize):
if stmt.buffer in rolling_buffers:
# Remove the original BufferRealize for rolling buffers
# because they will have been hoisted to the relevant rolling
# scope
return stmt.body
elif isinstance(stmt, tvm.tir.BufferStore):
if stmt.buffer in rolling_buffer_to_info:
rolling_buffer_info = rolling_buffer_to_info[stmt.buffer]
indices = []
# First modify the access indices to use modulo arithmetic
# for the rolling axis
for i, index in enumerate(stmt.indices):
if i == rolling_buffer_info.rolling_axis:
indices.append(tvm.tir.FloorMod(index, rolling_buffer_info.rolling_extent))
else:
indices.append(index)
buffer_store = tvm.tir.BufferStore(stmt.buffer, stmt.value, indices, stmt.span)
# Then wrap the BufferStores in some Ifs to avoid recomputing elements
for i, iter_var in enumerate(rolling_buffer_info.axis_iter_vars):
if iter_var is not None and rolling_buffer_info.axis_overlaps[i] > 0:
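                        # With the rolling iter_var pinned to 0, the minimum of the
                        # store index is the offset within the current window: offsets
                        # below the overlap were already written by the previous
                        # iteration, so skip them (except on the first iteration).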
dmap = {iter_var: arith.IntervalSet(0, 0)}
term_2 = arith.Analyzer().int_set(stmt.indices[i], dmap).min_value
buffer_store = tvm.tir.IfThenElse(
tvm.tir.Or(
iter_var < 1, term_2 >= rolling_buffer_info.axis_overlaps[i]
),
buffer_store,
None,
)
return buffer_store
elif isinstance(stmt, tvm.tir.BufferLoad):
if stmt.buffer in rolling_buffer_to_info:
rolling_buffer_info = rolling_buffer_to_info[stmt.buffer]
indices = []
# Modify the access indices to use modulo arithmetic
# for the rolling axis
for i, index in enumerate(stmt.indices):
if i == rolling_buffer_info.rolling_axis:
indices.append(tvm.tir.FloorMod(index, rolling_buffer_info.rolling_extent))
else:
indices.append(index)
return tvm.tir.BufferLoad(stmt.buffer, indices, stmt.span)

def _ftransform(f, mod, ctx):
return f.with_body(
tvm.tir.stmt_functor.ir_transform(
f.body,
_pre_visit,
_post_visit,
[
"tir.AttrStmt",
"tir.BufferRealize",
"tir.For",
"tir.BufferStore",
"tir.BufferLoad",
],
)
)

return tvm.tir.transform.prim_func_pass(
_ftransform, opt_level=0, name="tir.InjectRollingBuffer"
)
Comment on lines +236 to +238

jcf94 (Contributor) commented on Jun 7, 2021: Seems this is a pretty complex pass; would you consider rewriting it as a C++ implementation? (Not necessary in this PR; we can add a TODO here if the C++ migration is planned.)

mbaret (Contributor Author): Yes, I'd definitely consider doing that rewrite at some point. I was actually waiting to see how the new 'scheduling passes' would look for TensorIR so that I could potentially follow any such pattern there. Unfortunately I don't currently have the time, so I'd appreciate taking this in with the TODO. Perhaps we can revisit once TensorIR is complete?

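To make the effect of the pass concrete, here is a NumPy emulation of the loop nest it would produce for the two-stage sketch shown earlier on this page (an illustration under those assumed shapes and split factor, not the pass's actual TIR output): the rolled buffer keeps only a 6-row window, stores and loads index that axis modulo the window extent, and a guard skips the rows that overlap with the previous tile.

import numpy as np

A = np.random.rand(12, 16).astype("float32")
B = np.empty((6, 16), dtype="float32")   # rolled: a 6-row window instead of 10 rows
C = np.empty((8, 16), dtype="float32")

for i_outer in range(2):                 # outer loop from split(axis 0, 4): 8 = 2 * 4
    for bi in range(6):                  # producer loop over B's realize extent
        h = i_outer * 4 + bi
        # guard injected around the store: skip the 2 rows (the axis overlap)
        # already produced by the previous tile
        if i_outer < 1 or bi >= 2:
            B[h % 6] = A[h] + A[h + 1] + A[h + 2]
    for ci in range(4):                  # consumer loop; loads use the same modulo
        h = i_outer * 4 + ci
        C[h] = B[h % 6] + B[(h + 1) % 6] + B[(h + 2) % 6]

# Matches the unscheduled computation
B_ref = A[0:10] + A[1:11] + A[2:12]
C_ref = B_ref[0:8] + B_ref[1:9] + B_ref[2:10]
assert np.allclose(C, C_ref)

The rolled buffer occupies 6 rows instead of 10, and the two overlapping rows per tile are reused rather than recomputed.
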
3 changes: 2 additions & 1 deletion python/tvm/tir/transform/transform.py
@@ -15,9 +15,10 @@
# specific language governing permissions and limitations
# under the License.
"""Wrapping existing transformations."""
# pylint: disable=invalid-name
# pylint: disable=invalid-name,unused-import
from . import _ffi_api
from . import function_pass as _fpass
from .inject_rolling_buffer import InjectRollingBuffer


def Apply(ftransform):
3 changes: 2 additions & 1 deletion src/te/operation/compute_op.cc
@@ -484,7 +484,8 @@ ComputeLoopNest ComputeLoopNest::Create(const BaseComputeOpNode* self, const Sta
}
ret.init_nest = MakeLoopNest(stage, dom_map, begin_loop, true, skip_iter, &(ret.init_vmap),
debug_keep_trivial_loop);
ret.init_predicates = MakeBoundCheck(stage, dom_map, ret.init_vmap, true, skip_iter);
ret.init_predicates =
MakeBoundCheck(stage, dom_map, ret.init_vmap, !stage->rolling_buffer, skip_iter);
for (auto& e : ret.init_predicates) {
e = likely(e);
}
9 changes: 9 additions & 0 deletions src/te/schedule/schedule_lang.cc
@@ -423,6 +423,13 @@ Stage& Stage::double_buffer() {
return *this;
}

Stage& Stage::rolling_buffer() {
StageNode* self = operator->();
ICHECK(!self->is_output) << "Cannot apply rolling buffer on output";
self->rolling_buffer = true;
return *this;
}

Stage CopyStage(const Stage& s) {
ObjectPtr<StageNode> n = make_object<StageNode>(*s.operator->());
return Stage(n);
@@ -879,6 +886,8 @@ TVM_REGISTER_GLOBAL("te.StageStorageAlign").set_body_method(&Stage::storage_alig

TVM_REGISTER_GLOBAL("te.StageDoubleBuffer").set_body_method(&Stage::double_buffer);

TVM_REGISTER_GLOBAL("te.StageRollingBuffer").set_body_method(&Stage::rolling_buffer);

TVM_REGISTER_GLOBAL("te.ScheduleNormalize").set_body_method(&Schedule::normalize);

TVM_REGISTER_GLOBAL("te.ScheduleCreateGroup").set_body_method(&Schedule::create_group);
3 changes: 3 additions & 0 deletions src/te/schedule/schedule_ops.cc
@@ -54,6 +54,9 @@ Stmt MakePipeline(const Stage& s, const std::unordered_map<IterVar, Range>& dom_
pipeline = s->op->BuildRealize(s, dom_map, pipeline);
// use attribute to mark scope of the operation.
pipeline = AttrStmt(s->op, tir::attr::realize_scope, StringImm(s->scope), pipeline);
if (s->rolling_buffer) {
pipeline = AttrStmt(s->op, tir::attr::rolling_buffer_scope, Bool(true), pipeline);
}

return pipeline;
}
3 changes: 2 additions & 1 deletion src/te/schedule/schedule_postproc_to_primfunc.cc
@@ -70,7 +70,8 @@ class TensorToBufferMapper : public StmtExprMutator {
// TODO(tvm-team): remove realize_scope, turn the info into
// Buffer's scope field in this pass.
if (op->attr_key == tir::attr::realize_scope ||
op->attr_key == tir::attr::double_buffer_scope) {
op->attr_key == tir::attr::double_buffer_scope ||
op->attr_key == tir::attr::rolling_buffer_scope) {
Stmt body = op->body;
Operation operation = Downcast<Operation>(op->node);
for (int i = operation->num_outputs(); i != 0; --i) {