From 2c0db878c8f1872182dd3274a9a572e0406336ac Mon Sep 17 00:00:00 2001
From: Yuanming Hu
Date: Sat, 18 Jul 2020 20:48:07 -0400
Subject: [PATCH] use ptx63

---
 examples/mpm99.py                     |  2 +-
 taichi/backends/cuda/codegen_cuda.cpp |  2 +-
 taichi/backends/cuda/jit_cuda.cpp     |  7 +------
 taichi/llvm/llvm_context.cpp          |  3 +--
 taichi/runtime/llvm/locked_task.h     | 20 ++++++++++++++++++++
 taichi/runtime/llvm/runtime.cpp       | 20 +++++++++++++++-----
 6 files changed, 39 insertions(+), 15 deletions(-)

diff --git a/examples/mpm99.py b/examples/mpm99.py
index 2296b9d6af4a1a..064e931b64a3fa 100644
--- a/examples/mpm99.py
+++ b/examples/mpm99.py
@@ -1,6 +1,6 @@
 import taichi as ti
 import numpy as np
-ti.init(arch=ti.gpu)  # Try to run on GPU
+ti.init(arch=ti.gpu, print_kernel_nvptx=True, print_kernel_llvm_ir_optimized=True)  # Try to run on GPU
 quality = 1  # Use a larger value for higher-res simulations
 n_particles, n_grid = 9000 * quality ** 2, 128 * quality
 dx, inv_dx = 1 / n_grid, float(n_grid)
diff --git a/taichi/backends/cuda/codegen_cuda.cpp b/taichi/backends/cuda/codegen_cuda.cpp
index e7468b3cf19375..546dafff4aad40 100644
--- a/taichi/backends/cuda/codegen_cuda.cpp
+++ b/taichi/backends/cuda/codegen_cuda.cpp
@@ -429,7 +429,7 @@ class CodeGenLLVMCUDA : public CodeGenLLVM {
     auto type = llvm::ArrayType::get(llvm::Type::getInt8Ty(*llvm_context),
                                      stmt->bls_size);
     bls_buffer = new GlobalVariable(
-        *module, type, false, llvm::GlobalValue::InternalLinkage, nullptr,
+        *module, type, false, llvm::GlobalValue::ExternalLinkage, nullptr,
         "bls_buffer", nullptr, llvm::GlobalVariable::NotThreadLocal,
         3 /*addrspace=shared*/);
 #if LLVM_VERSION_MAJOR >= 10
diff --git a/taichi/backends/cuda/jit_cuda.cpp b/taichi/backends/cuda/jit_cuda.cpp
index 11f040864f7e39..ade4d293215e1f 100644
--- a/taichi/backends/cuda/jit_cuda.cpp
+++ b/taichi/backends/cuda/jit_cuda.cpp
@@ -112,7 +112,7 @@ class JITSessionCUDA : public JITSession {
 };
 
 std::string cuda_mattrs() {
-  return "+ptx50";
+  return "+ptx63";
 }
 
 std::string convert(std::string new_name) {
@@ -140,15 +140,10 @@ std::string convert(std::string new_name) {
 std::string JITSessionCUDA::compile_module_to_ptx(
     std::unique_ptr<llvm::Module> &module) {
   TI_AUTO_PROF
-  // Part of this function is borrowed from Halide::CodeGen_PTX_Dev.cpp
-  // TODO: enabling this leads to LLVM error "comdat global value has private
-  // linkage"
-  /*
   if (llvm::verifyModule(*module, &llvm::errs())) {
     module->print(llvm::errs(), nullptr);
     TI_ERROR("Module broken");
   }
-  */
 
   using namespace llvm;
 
diff --git a/taichi/llvm/llvm_context.cpp b/taichi/llvm/llvm_context.cpp
index 837e2e94248621..7cf72ac9245099 100644
--- a/taichi/llvm/llvm_context.cpp
+++ b/taichi/llvm/llvm_context.cpp
@@ -398,6 +398,7 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::clone_runtime_module() {
     patch_intrinsic("block_dim", Intrinsic::nvvm_read_ptx_sreg_ntid_x);
     patch_intrinsic("grid_dim", Intrinsic::nvvm_read_ptx_sreg_nctaid_x);
     patch_intrinsic("block_barrier", Intrinsic::nvvm_barrier0, false);
+    patch_intrinsic("warp_barrier", Intrinsic::nvvm_bar_warp_sync, false);
     patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false);
     patch_intrinsic("grid_memfence", Intrinsic::nvvm_membar_gl, false);
     patch_intrinsic("system_memfence", Intrinsic::nvvm_membar_sys, false);
@@ -429,8 +430,6 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::clone_runtime_module() {
 #endif
 
     // patch_intrinsic("sync_warp", Intrinsic::nvvm_bar_warp_sync, false);
-    // patch_intrinsic("warp_ballot", Intrinsic::nvvm_vote_ballot, false);
-    // patch_intrinsic("warp_active_mask", Intrinsic::nvvm_membar_cta, false);
     patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false);
 
     link_module_with_cuda_libdevice(data->runtime_module);
diff --git a/taichi/runtime/llvm/locked_task.h b/taichi/runtime/llvm/locked_task.h
index 4f1ce823ce5aa7..dcafc78ca25a39 100644
--- a/taichi/runtime/llvm/locked_task.h
+++ b/taichi/runtime/llvm/locked_task.h
@@ -12,6 +12,25 @@ class lock_guard {
     mutex_unlock_i32(lock);
 #else
     // CUDA
+
+    auto active_mask = cuda_active_mask();
+    auto remaining = active_mask;
+    while (remaining) {
+      auto leader = cttz_i32(remaining);
+      if (warp_idx() == leader) {
+        // Memory fences here are necessary since CUDA has a weakly ordered
+        // memory model across threads
+        mutex_lock_i32(lock);
+        grid_memfence();
+        func();
+        grid_memfence();
+        mutex_unlock_i32(lock);
+        grid_memfence();
+      }
+      warp_barrier(active_mask);
+      remaining ^= 1u << leader;
+    }
+    /*
     for (int i = 0; i < warp_size(); i++) {
       if (warp_idx() == i) {
         // Memory fences here are necessary since CUDA has a weakly ordered
@@ -24,6 +43,7 @@ class lock_guard {
         grid_memfence();
       }
     }
+    */
     // Unfortunately critical sections on CUDA has undefined behavior (deadlock
     // or not), if more than one thread in a warp try to acquire locks
     /*
diff --git a/taichi/runtime/llvm/runtime.cpp b/taichi/runtime/llvm/runtime.cpp
index 6acb8889cd47a0..8dd3e3b39f5688 100644
--- a/taichi/runtime/llvm/runtime.cpp
+++ b/taichi/runtime/llvm/runtime.cpp
@@ -903,17 +903,28 @@ int32 cttz_i32(i32 val) {
   return 0;
 }
 
-uint32 cuda_ballot(bool bit) {
+int32 cuda_ballot(bool bit) {
   return 0;
 }
 
-uint32 cuda_ballot_sync(uint32 mask, bool bit) {
+int32 cuda_ballot_sync(int32 mask, bool bit) {
   return 0;
 }
 
-int32 cuda_active_mask() {
+#if ARCH_cuda
+uint32 cuda_active_mask() {
+  /*
+  unsigned int mask;
+  asm volatile("activemask.b32 %0;" : "=r"(mask));
+  return mask;
+  */
   return cuda_ballot(true);
 }
+#else
+uint32 cuda_active_mask() {
+  return 0;
+}
+#endif
 
 int32 grid_dim() {
   return 0;
@@ -925,8 +936,7 @@ void sync_warp(uint32 mask) {
 }
 
 void block_barrier() {
 }
 
-int32 warp_active_mask() {
-  return 0;
+void warp_barrier(uint32 mask) {
 }
 
 void block_memfence() {
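
Note on the new locked_task.h critical-section path: each loop iteration elects the lowest-numbered lane still set in remaining as the leader (via cttz_i32), only that leader acquires the mutex and runs func(), and warp_barrier(active_mask) re-converges the participating lanes before the leader's bit is cleared. This sidesteps the undefined behavior, called out in the existing comment, of multiple threads in one warp contending for a lock. Below is a minimal stand-alone sketch of the same leader-election pattern written against raw CUDA intrinsics (__activemask, __ffs, __syncwarp, atomicCAS) instead of the Taichi runtime wrappers; the file, kernel, and variable names are illustrative assumptions, and it presumes an sm_70+ GPU with independent thread scheduling, the hardware generation that motivates the +ptx63 feature bump.

// lock_leader_demo.cu -- illustrative sketch, not Taichi runtime code.
#include <cstdio>

__device__ int g_lock = 0;     // 0 = free, 1 = held
__device__ int g_counter = 0;  // state protected by g_lock

__device__ void locked_increment() {
  // Lanes that reached this call together; also the re-convergence mask.
  unsigned active = __activemask();
  unsigned remaining = active;
  while (remaining) {
    int leader = __ffs(remaining) - 1;  // lowest remaining lane leads
    if ((threadIdx.x & 31) == leader) {
      while (atomicCAS(&g_lock, 0, 1) != 0) {
        // spin until the lock is acquired (safe on sm_70+)
      }
      __threadfence();         // CUDA is weakly ordered; fence around the body
      g_counter += 1;          // stand-in for the patch's func()
      __threadfence();
      atomicExch(&g_lock, 0);  // release
      __threadfence();
    }
    __syncwarp(active);         // keep participating lanes converged
    remaining ^= 1u << leader;  // retire this leader, elect the next
  }
}

__global__ void kernel() {
  locked_increment();
}

int main() {
  kernel<<<2, 64>>>();  // several warps contend on one lock
  cudaDeviceSynchronize();
  int counter = 0;
  cudaMemcpyFromSymbol(&counter, g_counter, sizeof(int));
  printf("counter = %d (expected 128)\n", counter);
  return 0;
}

The __threadfence() calls play the role of grid_memfence() in the patch: acquiring the lock by itself does not order the critical section's loads and stores for other threads under CUDA's weak memory model.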