apache · vinx13 · Jun 10, 2022 · Jun 9, 2022 · Jun 9, 2022 · Jun 9, 2022
diff --git a/include/tvm/tir/stmt.h b/include/tvm/tir/stmt.h
@@ -1441,6 +1441,11 @@ constexpr const char* pipeline_exec_scope = "pipeline_exec_scope";
  */
 constexpr const char* device_scope = "device_scope";
 
+/*!
+ * \brief Mark that the attached statement runs asynchronously (see InjectPTXAsyncCopy).
+ */
+constexpr const char* async_scope = "async_scope";
+
 /*!
  * \brief Mark that the shape of TensorCore fragment
  */

diff --git a/include/tvm/tir/transform.h b/include/tvm/tir/transform.h
@@ -644,6 +644,12 @@ TVM_DLL Pass AnnotateEntryFunc();
  */
 TVM_DLL Pass Filter(runtime::TypedPackedFunc<bool(PrimFunc)> fcond);
 
+/*!
+ * \brief Pass to rewrite global to shared memory copy on CUDA with asynchronous copy.
+ * \return The pass.
+ */
+TVM_DLL Pass InjectPTXAsyncCopy();
+
 }  // namespace transform
 }  // namespace tir
 }  // namespace tvm

diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py
@@ -1599,6 +1599,13 @@ def terminate_self():
     sys.exit(-1)
 
 
+def is_ampere_or_newer():
+    """Tell whether the target NVIDIA GPU is Ampere (compute capability 8.x) or newer."""
+    compute_version = tvm.contrib.nvcc.get_target_compute_version()
+    major_version, _minor = tvm.contrib.nvcc.parse_compute_version(compute_version)
+    return 8 <= major_version
+
+
 def main():
     test_file = inspect.getsourcefile(sys._getframe(1))
     sys.exit(pytest.main([test_file] + sys.argv[1:]))
diff --git a/python/tvm/tir/transform/transform.py b/python/tvm/tir/transform/transform.py
@@ -825,3 +825,14 @@ def Filter(fcond: Callable):
         The result pass
     """
     return _ffi_api.Filter(fcond)  # type: ignore
+
+
+def InjectPTXAsyncCopy():
+    """Rewrite global to shared memory copy on CUDA with asynchronous copy.
+
+    Returns
+    -------
+    fpass : tvm.transform.Pass
+        The result pass
+    """
+    return _ffi_api.InjectPTXAsyncCopy()  # type: ignore
diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc
@@ -50,6 +50,7 @@ TVM_REGISTER_PASS_CONFIG_OPTION("tir.disable_storage_rewrite", Bool);
 TVM_REGISTER_PASS_CONFIG_OPTION("tir.is_entry_func", Bool);
 TVM_REGISTER_PASS_CONFIG_OPTION("tir.add_lower_pass", Array<Array<ObjectRef>>);
 TVM_REGISTER_PASS_CONFIG_OPTION("tir.debug_keep_trivial_loop", Bool);
+TVM_REGISTER_PASS_CONFIG_OPTION("tir.use_ptx_async_copy", Bool);
 
 using runtime::PackedFunc;
 using runtime::TVMArgs;
@@ -559,6 +560,13 @@ transform::Sequential MixedModulePassManager(IRModule mixed_mod, Target target)
   mixed_pass_list.push_back(tir::transform::InferFragment());
   mixed_pass_list.push_back(tir::transform::LowerThreadAllreduce());
 
+  bool use_ptx_async_copy =
+      pass_ctx->GetConfig<Bool>("tir.use_ptx_async_copy", Bool(false)).value();
+
+  if (use_ptx_async_copy) {
+    mixed_pass_list.push_back(tir::transform::InjectPTXAsyncCopy());
+  }
+
   bool unpacked_api = mixed_mod->GetAttr<relay::Executor>(tvm::attr::kExecutor)
                           .value_or(relay::Executor::Create("graph", {}))
                           ->GetAttr<Bool>("unpacked-api")

diff --git a/src/target/source/ptx.cc b/src/target/source/ptx.cc
@@ -651,7 +651,7 @@ std::string PrintCpAsyncAssembly(const std::string& shared_ptr,
       : "l"((void *)({smem_addr}))
     );
     __asm__ __volatile__(
-      "cp.async.cg.shared.global [%0], [%1], %2;"
+      "cp.async.{cg_or_ca}.shared.global [%0], [%1], %2;"
        :: "r"(addr), "l"((void*)({global_ptr})), "n"({bytes})
     );
   }
@@ -660,6 +660,7 @@ std::string PrintCpAsyncAssembly(const std::string& shared_ptr,
   replacer.register_rule("{smem_addr}", shared_ptr + " + " + shared_elem_offset);
   replacer.register_rule("{global_ptr}", global_ptr + " + " + global_elem_offset);
   replacer.register_rule("{bytes}", bytes);
+  replacer.register_rule("{cg_or_ca}", bytes == "16" ? "cg" : "ca");
   asm_code = replacer.rewrite(asm_code);
   return asm_code;
 }

diff --git a/src/tir/transforms/inject_ptx_async_copy.cc b/src/tir/transforms/inject_ptx_async_copy.cc
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \brief Replace copy from global to shared with async copy
+ * \file inject_ptx_async_copy.cc
+ */
+#include <tvm/tir/analysis.h>
+#include <tvm/tir/builtin.h>
+#include <tvm/tir/expr.h>
+#include <tvm/tir/stmt_functor.h>
+#include <tvm/tir/transform.h>
+
+#include "../ir/buffer_common.h"
+#include "storage_access.h"
+#include "tvm/tir/stmt.h"
+
+namespace tvm {
+namespace tir {
+
+/*! \brief Rewrites global -> shared memory copies found inside async_scope into ptx_cp_async. */
+class PTXAsyncCopyInjector : public StmtMutator {
+ public:
+  /*! \brief Drop an async_scope attr, visiting its body with async rewriting enabled. */
+  Stmt VisitStmt_(const AttrStmtNode* attr) override {
+    if (attr->attr_key == tir::attr::async_scope) {
+      ICHECK(in_async == false) << "Nested async scopes not supported";
+      in_async = true;
+      auto body = this->VisitStmt(attr->body);
+      in_async = false;
+      return body;
+    }
+    return StmtMutator::VisitStmt_(attr);
+  }
+
+  /*!
+   * \brief Rewrite a global -> shared / shared.dyn copy inside an async scope to ptx_cp_async.
+   *
+   * Only 4, 8 or 16 byte copies with scalar indices, or a Ramp load index and a Ramp or
+   * Ramp + Broadcast store index, are rewritten; other stores use the default visit.
+   */
+  Stmt VisitStmt_(const BufferStoreNode* store) override {
+    if (in_async && (store->buffer.scope() == "shared" || store->buffer.scope() == "shared.dyn")) {
+      if (auto* load = store->value.as<BufferLoadNode>()) {
+        if (load->buffer.scope() == "global") {
+          ICHECK(load->indices.size() == 1 && store->indices.size() == 1);
+          ICHECK(load->indices[0]->dtype.lanes() == store->indices[0]->dtype.lanes());
+
+          const int indices_lanes = load->indices[0]->dtype.lanes();
+          const int bytes = indices_lanes * load->buffer->dtype.bytes();
+
+          if (bytes == 4 || bytes == 8 || bytes == 16) {
+            auto dst_elem_type = GetPointerType(store->buffer->data->type_annotation);
+            auto src_elem_type = GetPointerType(load->buffer->data->type_annotation);
+            ICHECK(dst_elem_type.first && src_elem_type.first)
+                << "Both store and load buffer should have a pointer type annotation.";
+
+            int index_factor = 1;
+            if (dst_elem_type != src_elem_type) {
+              // The only case where src and dst have different dtypes is when the dst shared memory
+              // is a byte buffer generated by merging dynamic shared memory.
+              ICHECK(store->buffer.scope() == "shared.dyn");
+              ICHECK(dst_elem_type.second == DataType::UInt(8));
+              // BufferStore/Load have the "pointer reinterpret" semantics according to their
+              // "value" dtype. Their "indices" are supposed to be applied after such pointer cast,
+              // for example: ((*float16)(byte_buffer))[buffer->indices] = fp16_value;
+              // To replace BufferStore/Load with cp.async, we need to multiply the store index by
+              // the byte size of the "value" dtype, to get the correct offset into the byte buffer.
+              index_factor = src_elem_type.second.bytes();
+            }
+
+            // operator* matches the operand dtypes, while the tir::Mul constructor requires equal
+            // dtypes and would therefore reject offsets that are not int32 (e.g. int64).
+            auto make_copy = [&](PrimExpr dst_offset, PrimExpr src_offset) -> Stmt {
+              return Evaluate(
+                  Call(store->buffer->dtype, tvm::tir::builtin::ptx_cp_async(),
+                       {store->buffer->data, dst_offset * PrimExpr(index_factor),
+                        load->buffer->data, src_offset, PrimExpr(bytes)}));
+            };
+
+            if (indices_lanes == 1) return make_copy(store->indices[0], load->indices[0]);
+
+            // Only some vectorized indexing patterns are supported for now.
+            const auto* src_ramp = load->indices[0].as<RampNode>();
+            PrimExpr src_offset = src_ramp ? src_ramp->base : PrimExpr();
+            PrimExpr dst_offset;
+            if (const auto* ramp = store->indices[0].as<RampNode>()) {
+              dst_offset = ramp->base;
+            } else if (const auto* add = store->indices[0].as<AddNode>()) {
+              // The case where the dst buffer is a byte buffer generated by merging dynamic
+              // shared memory.
+              // A_shared.dyn[(ramp(...), 1, 8) + x8(17408))] = A_global[ramp(...),1, 8)]
+              const auto* ramp_a = add->a.as<RampNode>();
+              const auto* bcast_b = add->b.as<BroadcastNode>();
+              if (ramp_a && bcast_b) dst_offset = tir::Add(ramp_a->base, bcast_b->value);
+            }
+
+            if (src_offset.defined() && dst_offset.defined()) {
+              return make_copy(dst_offset, src_offset);
+            }
+          }
+        }
+      }
+    }
+    return StmtMutator::VisitStmt_(store);
+  }
+
+ private:
+  /*! \brief True while visiting the body of an async_scope attr. */
+  bool in_async{false};
+};
+
+namespace transform {
+
+Pass InjectPTXAsyncCopy() {
+  auto rewrite_body = [](PrimFunc func, IRModule mod, PassContext pass_ctx) {
+    PrimFuncNode* func_node = func.CopyOnWrite();
+    func_node->body = PTXAsyncCopyInjector()(func_node->body);
+    return func;
+  };
+  return CreatePrimFuncPass(rewrite_body, 0, "tir.InjectPTXAsyncCopy", {});
+}
+
+TVM_REGISTER_GLOBAL("tir.transform.InjectPTXAsyncCopy").set_body_typed(InjectPTXAsyncCopy);
+
+}  // namespace transform
+
+}  // namespace tir
+}  // namespace tvm
diff --git a/tests/python/unittest/test_tir_ptx_cp_async.py b/tests/python/unittest/test_tir_ptx_cp_async.py
@@ -40,8 +40,8 @@ def ptx_cp_async(A: T.Buffer[(32, 128), "float16"], B: T.Buffer[(32, 128), "floa
             )
 
         # TODO(masahi): Remove dtype requirement from TVMScript parser
-        T.evaluate(T.ptx_commit_group(dtype="float16"))
-        T.evaluate(T.ptx_wait_group(0, dtype="float16"))
+        T.evaluate(T.ptx_commit_group(dtype=""))
+        T.evaluate(T.ptx_wait_group(0, dtype=""))
 
         for i in range(128):
             B[tx, i] = A_shared[tx, i]

diff --git a/tests/python/unittest/test_tir_schedule_tensorize_ldmatrix_mma.py b/tests/python/unittest/test_tir_schedule_tensorize_ldmatrix_mma.py
@@ -76,12 +76,6 @@ def maybe_swap(i, j):
     return (a, b, c)
 
 
-def is_ampere_or_newer():
-    arch = tvm.contrib.nvcc.get_target_compute_version()
-    major, _ = tvm.contrib.nvcc.parse_compute_version(arch)
-    return major >= 8
-
-
 def run_test(
     k_inner,
     in_dtype,
@@ -117,7 +111,7 @@ def run_test(
         mma_store_intrin,
     )
 
-    if not is_ampere_or_newer():
+    if not tvm.testing.is_ampere_or_newer():
         return None
 
     f = tvm.build(sch.mod["main"], target="cuda", name="dense")