From 3637bbc7fe30df2fcf30c54512f3f6b67a92a68b Mon Sep 17 00:00:00 2001
From: Yuanming Hu <yuanmhu@gmail.com>
Date: Mon, 20 Jul 2020 18:23:55 -0400
Subject: [PATCH] [CUDA] Upgrade to PTX 6.3 and add a few CUDA intrinsics

---
 examples/mpm99.py                 |  2 +-
 taichi/backends/cuda/jit_cuda.cpp |  1 +
 taichi/llvm/llvm_context.cpp      |  1 -
 taichi/runtime/llvm/locked_task.h | 20 --------------------
 taichi/runtime/llvm/runtime.cpp   |  6 ------
 5 files changed, 2 insertions(+), 28 deletions(-)

diff --git a/examples/mpm99.py b/examples/mpm99.py
index 064e931b64a3fa..2296b9d6af4a1a 100644
--- a/examples/mpm99.py
+++ b/examples/mpm99.py
@@ -1,6 +1,6 @@
 import taichi as ti
 import numpy as np
-ti.init(arch=ti.gpu, print_kernel_nvptx=True, print_kernel_llvm_ir_optimized=True) # Try to run on GPU
+ti.init(arch=ti.gpu) # Try to run on GPU
 quality = 1 # Use a larger value for higher-res simulations
 n_particles, n_grid = 9000 * quality ** 2, 128 * quality
 dx, inv_dx = 1 / n_grid, float(n_grid)
diff --git a/taichi/backends/cuda/jit_cuda.cpp b/taichi/backends/cuda/jit_cuda.cpp
index ade4d293215e1f..beba017e6fd1a4 100644
--- a/taichi/backends/cuda/jit_cuda.cpp
+++ b/taichi/backends/cuda/jit_cuda.cpp
@@ -140,6 +140,7 @@ std::string convert(std::string new_name) {
 std::string JITSessionCUDA::compile_module_to_ptx(
     std::unique_ptr<llvm::Module> &module) {
   TI_AUTO_PROF
+  // Parts of this function are borrowed from Halide (CodeGen_PTX_Dev.cpp).
   if (llvm::verifyModule(*module, &llvm::errs())) {
     module->print(llvm::errs(), nullptr);
     TI_ERROR("Module broken");
diff --git a/taichi/llvm/llvm_context.cpp b/taichi/llvm/llvm_context.cpp
index 7cf72ac9245099..1e1074c093d2c7 100644
--- a/taichi/llvm/llvm_context.cpp
+++ b/taichi/llvm/llvm_context.cpp
@@ -429,7 +429,6 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::clone_runtime_module() {
           {llvm::PointerType::get(get_data_type(DataType::f64), 0)});
 #endif
 
-      // patch_intrinsic("sync_warp", Intrinsic::nvvm_bar_warp_sync, false);
       patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false);
 
       link_module_with_cuda_libdevice(data->runtime_module);
diff --git a/taichi/runtime/llvm/locked_task.h b/taichi/runtime/llvm/locked_task.h
index dcafc78ca25a39..4f1ce823ce5aa7 100644
--- a/taichi/runtime/llvm/locked_task.h
+++ b/taichi/runtime/llvm/locked_task.h
@@ -12,25 +12,6 @@ class lock_guard {
     mutex_unlock_i32(lock);
 #else
     // CUDA
-
-    auto active_mask = cuda_active_mask();
-    auto remaining = active_mask;
-    while (remaining) {
-      auto leader = cttz_i32(remaining);
-      if (warp_idx() == leader) {
-        // Memory fences here are necessary since CUDA has a weakly ordered
-        // memory model across threads
-        mutex_lock_i32(lock);
-        grid_memfence();
-        func();
-        grid_memfence();
-        mutex_unlock_i32(lock);
-        grid_memfence();
-      }
-      warp_barrier(active_mask);
-      remaining ^= 1u << leader;
-    }
-    /*
     for (int i = 0; i < warp_size(); i++) {
       if (warp_idx() == i) {
         // Memory fences here are necessary since CUDA has a weakly ordered
@@ -43,7 +24,6 @@ class lock_guard {
         grid_memfence();
       }
     }
-    */
     // Unfortunately critical sections on CUDA has undefined behavior (deadlock
     // or not), if more than one thread in a warp try to acquire locks
     /*
diff --git a/taichi/runtime/llvm/runtime.cpp b/taichi/runtime/llvm/runtime.cpp
index 8dd3e3b39f5688..d08271ac388db4 100644
--- a/taichi/runtime/llvm/runtime.cpp
+++ b/taichi/runtime/llvm/runtime.cpp
@@ -913,12 +913,9 @@ int32 cuda_ballot_sync(int32 mask, bool bit) {
 
 #if ARCH_cuda
 uint32 cuda_active_mask() {
-  /*
   unsigned int mask;
   asm volatile("activemask.b32 %0;" : "=r"(mask));
   return mask;
-  */
-  return cuda_ballot(true);
 }
 #else
 uint32 cuda_active_mask() {
@@ -930,9 +927,6 @@ int32 grid_dim() {
   return 0;
 }
 
-void sync_warp(uint32 mask) {
-}
-
 void block_barrier() {
 }