[CUDA] Upgrade to PTX 6.3 and add a few CUDA intrinsics
yuanming-hu committed Jul 20, 2020
1 parent 2c0db87 · commit 3637bbc
Showing 5 changed files with 2 additions and 28 deletions.
examples/mpm99.py: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 import taichi as ti
 import numpy as np
-ti.init(arch=ti.gpu, print_kernel_nvptx=True, print_kernel_llvm_ir_optimized=True) # Try to run on GPU
+ti.init(arch=ti.gpu) # Try to run on GPU
 quality = 1 # Use a larger value for higher-res simulations
 n_particles, n_grid = 9000 * quality ** 2, 128 * quality
 dx, inv_dx = 1 / n_grid, float(n_grid)
taichi/backends/cuda/jit_cuda.cpp: 1 addition & 0 deletions
@@ -140,6 +140,7 @@ std::string convert(std::string new_name) {
 std::string JITSessionCUDA::compile_module_to_ptx(
     std::unique_ptr<llvm::Module> &module) {
   TI_AUTO_PROF
+  // Part of this function is borrowed from Halide::CodeGen_PTX_Dev.cpp
   if (llvm::verifyModule(*module, &llvm::errs())) {
     module->print(llvm::errs(), nullptr);
     TI_ERROR("Module broken");
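Note on where the "PTX 6.3" part of the title lives: with the LLVM NVPTX backend, the PTX ISA version is normally requested through the target feature string when the TargetMachine behind compile_module_to_ptx is created. The sketch below only illustrates that mechanism and is not Taichi's actual code; the "sm_75" CPU string, the "+ptx63" feature, and the function name make_nvptx_target_machine are assumptions chosen for the example, and header locations vary across LLVM versions.

#include <memory>
#include <string>

#include "llvm/Support/TargetRegistry.h"  // llvm/MC/TargetRegistry.h on LLVM >= 14
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetMachine.h"

std::unique_ptr<llvm::TargetMachine> make_nvptx_target_machine() {
  // Register the NVPTX backend (no-ops if already initialized).
  LLVMInitializeNVPTXTargetInfo();
  LLVMInitializeNVPTXTarget();
  LLVMInitializeNVPTXTargetMC();
  LLVMInitializeNVPTXAsmPrinter();

  std::string err;
  const std::string triple = "nvptx64-nvidia-cuda";
  const llvm::Target *target = llvm::TargetRegistry::lookupTarget(triple, err);
  if (!target)
    return nullptr;  // NVPTX backend not compiled into this LLVM build

  // "+ptx63" asks the backend to emit ".version 6.3" at the top of the PTX;
  // the CPU string selects the SM architecture the PTX is generated for.
  llvm::TargetOptions options;
  return std::unique_ptr<llvm::TargetMachine>(target->createTargetMachine(
      triple, /*CPU=*/"sm_75", /*Features=*/"+ptx63", options,
      llvm::Reloc::PIC_));
}

Emitting the actual PTX text is then a standard addPassesToEmitFile run over the verified module, which is roughly what compile_module_to_ptx does after the verifyModule check shown in the hunk.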
taichi/llvm/llvm_context.cpp: 0 additions & 1 deletion
@@ -429,7 +429,6 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::clone_runtime_module() {
         {llvm::PointerType::get(get_data_type(DataType::f64), 0)});
 #endif
 
-    // patch_intrinsic("sync_warp", Intrinsic::nvvm_bar_warp_sync, false);
     patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false);
 
     link_module_with_cuda_libdevice(data->runtime_module);
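The surviving patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false) call maps a placeholder runtime function onto an NVVM intrinsic at the LLVM IR level. As a rough, self-contained illustration of that idea (not Taichi's actual patch_intrinsic, whose signature and behavior differ), one can redirect every call of the placeholder to the intrinsic declaration:

#include "llvm/IR/Intrinsics.h"  // newer LLVM also needs llvm/IR/IntrinsicsNVPTX.h
#include "llvm/IR/Module.h"

// Hypothetical helper: rewire calls to a runtime placeholder function so they
// hit @llvm.nvvm.membar.cta, which lowers to the PTX "membar.cta"
// (block-level memory fence) instruction.
void patch_block_memfence(llvm::Module &module) {
  llvm::Function *placeholder = module.getFunction("block_memfence");
  if (!placeholder)
    return;  // runtime module does not declare the placeholder
  llvm::Function *intrin = llvm::Intrinsic::getDeclaration(
      &module, llvm::Intrinsic::nvvm_membar_cta);
  // Assuming both functions have type void(), every call site can simply be
  // redirected to the intrinsic and the placeholder dropped.
  placeholder->replaceAllUsesWith(intrin);
  placeholder->eraseFromParent();
}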
taichi/runtime/llvm/locked_task.h: 0 additions & 20 deletions
@@ -12,25 +12,6 @@ class lock_guard {
     mutex_unlock_i32(lock);
 #else
     // CUDA
-
-    auto active_mask = cuda_active_mask();
-    auto remaining = active_mask;
-    while (remaining) {
-      auto leader = cttz_i32(remaining);
-      if (warp_idx() == leader) {
-        // Memory fences here are necessary since CUDA has a weakly ordered
-        // memory model across threads
-        mutex_lock_i32(lock);
-        grid_memfence();
-        func();
-        grid_memfence();
-        mutex_unlock_i32(lock);
-        grid_memfence();
-      }
-      warp_barrier(active_mask);
-      remaining ^= 1u << leader;
-    }
-    /*
     for (int i = 0; i < warp_size(); i++) {
       if (warp_idx() == i) {
         // Memory fences here are necessary since CUDA has a weakly ordered
@@ -43,7 +24,6 @@
         grid_memfence();
       }
     }
-    */
     // Unfortunately critical sections on CUDA has undefined behavior (deadlock
     // or not), if more than one thread in a warp try to acquire locks
     /*
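The deleted block is a warp-leader election pattern: each iteration picks the lowest active lane, lets only that lane take the spinlock, and synchronizes the warp before moving on, which sidesteps the intra-warp lock hazard warned about in the comment retained below it. The following is a standalone CUDA sketch of the same idea written against the raw CUDA intrinsics (__activemask, __ffs, __syncwarp) rather than Taichi's runtime wrappers; the lock helpers, function names, and the 1-D block assumption are illustrative, not part of the Taichi runtime.

// Illustrative CUDA translation of the leader-election critical section.
// Assumes `lock` is a device-side int initialized to 0 and a 1-D thread block.
__device__ void mutex_lock(int *lock) {
  while (atomicCAS(lock, 0, 1) != 0) {
    // spin; only one lane per warp ever reaches this loop at a time
  }
}

__device__ void mutex_unlock(int *lock) {
  atomicExch(lock, 0);
}

template <typename Func>
__device__ void warp_safe_locked_task(int *lock, Func func) {
  unsigned int active_mask = __activemask();  // lanes that reached this call
  unsigned int remaining = active_mask;
  int lane = threadIdx.x % warpSize;
  while (remaining) {
    int leader = __ffs(remaining) - 1;        // lowest-numbered active lane
    if (lane == leader) {
      mutex_lock(lock);
      __threadfence();  // CUDA's memory model is weakly ordered across threads
      func();
      __threadfence();
      mutex_unlock(lock);
      __threadfence();
    }
    __syncwarp(active_mask);                  // keep participating lanes together
    remaining ^= 1u << leader;                // leader done; next lane's turn
  }
}

A caller inside a kernel would use it as warp_safe_locked_task(&node_lock, [&] { /* critical section */ }), where node_lock stands in for whatever per-node lock the runtime manages.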
taichi/runtime/llvm/runtime.cpp: 0 additions & 6 deletions
@@ -913,12 +913,9 @@ int32 cuda_ballot_sync(int32 mask, bool bit) {
 
 #if ARCH_cuda
 uint32 cuda_active_mask() {
-  /*
   unsigned int mask;
   asm volatile("activemask.b32 %0;" : "=r"(mask));
   return mask;
-  */
-  return cuda_ballot(true);
 }
 #else
 uint32 cuda_active_mask() {
@@ -930,9 +927,6 @@ int32 grid_dim() {
   return 0;
 }
 
-void sync_warp(uint32 mask) {
-}
-
 void block_barrier() {
 }
 
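The asm block that remains in cuda_active_mask is the inline-PTX spelling of CUDA's __activemask() intrinsic; activemask.b32 exists only from PTX ISA 6.2 onward, which is presumably why it travels together with the PTX version bump, while the cuda_ballot(true) line was a ballot-based approximation of the same mask. A standalone CUDA sketch of the two spellings (illustration only, not Taichi runtime code):

// Two ways to ask "which lanes of my warp are currently active?"
__device__ unsigned int active_mask_intrinsic() {
  return __activemask();  // CUDA 9+ intrinsic, compiles down to activemask.b32
}

__device__ unsigned int active_mask_inline_ptx() {
  unsigned int mask;
  // activemask.b32 requires the emitted PTX to declare ISA version 6.2 or
  // newer, hence the accompanying PTX upgrade.
  asm volatile("activemask.b32 %0;" : "=r"(mask));
  return mask;
}

One plausible reason to spell it as inline asm here is that runtime.cpp is compiled to LLVM IR by clang rather than through the CUDA headers, so the intrinsic form is not available; the trade-off is that the PTX ISA floor must be raised explicitly.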
