From 3637bbc7fe30df2fcf30c54512f3f6b67a92a68b Mon Sep 17 00:00:00 2001 From: Yuanming Hu Date: Mon, 20 Jul 2020 18:23:55 -0400 Subject: [PATCH] [CUDA] Upgrade to PTX 6.3 and add a few CUDA intrinsics --- examples/mpm99.py | 2 +- taichi/backends/cuda/jit_cuda.cpp | 1 + taichi/llvm/llvm_context.cpp | 1 - taichi/runtime/llvm/locked_task.h | 20 -------------------- taichi/runtime/llvm/runtime.cpp | 6 ------ 5 files changed, 2 insertions(+), 28 deletions(-) diff --git a/examples/mpm99.py b/examples/mpm99.py index 064e931b64a3fa..2296b9d6af4a1a 100644 --- a/examples/mpm99.py +++ b/examples/mpm99.py @@ -1,6 +1,6 @@ import taichi as ti import numpy as np -ti.init(arch=ti.gpu, print_kernel_nvptx=True, print_kernel_llvm_ir_optimized=True) # Try to run on GPU +ti.init(arch=ti.gpu) # Try to run on GPU quality = 1 # Use a larger value for higher-res simulations n_particles, n_grid = 9000 * quality ** 2, 128 * quality dx, inv_dx = 1 / n_grid, float(n_grid) diff --git a/taichi/backends/cuda/jit_cuda.cpp b/taichi/backends/cuda/jit_cuda.cpp index ade4d293215e1f..beba017e6fd1a4 100644 --- a/taichi/backends/cuda/jit_cuda.cpp +++ b/taichi/backends/cuda/jit_cuda.cpp @@ -140,6 +140,7 @@ std::string convert(std::string new_name) { std::string JITSessionCUDA::compile_module_to_ptx( std::unique_ptr &module) { TI_AUTO_PROF + // Part of this function is borrowed from Halide::CodeGen_PTX_Dev.cpp if (llvm::verifyModule(*module, &llvm::errs())) { module->print(llvm::errs(), nullptr); TI_ERROR("Module broken"); diff --git a/taichi/llvm/llvm_context.cpp b/taichi/llvm/llvm_context.cpp index 7cf72ac9245099..1e1074c093d2c7 100644 --- a/taichi/llvm/llvm_context.cpp +++ b/taichi/llvm/llvm_context.cpp @@ -429,7 +429,6 @@ std::unique_ptr TaichiLLVMContext::clone_runtime_module() { {llvm::PointerType::get(get_data_type(DataType::f64), 0)}); #endif - // patch_intrinsic("sync_warp", Intrinsic::nvvm_bar_warp_sync, false); patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false); link_module_with_cuda_libdevice(data->runtime_module); diff --git a/taichi/runtime/llvm/locked_task.h b/taichi/runtime/llvm/locked_task.h index dcafc78ca25a39..4f1ce823ce5aa7 100644 --- a/taichi/runtime/llvm/locked_task.h +++ b/taichi/runtime/llvm/locked_task.h @@ -12,25 +12,6 @@ class lock_guard { mutex_unlock_i32(lock); #else // CUDA - - auto active_mask = cuda_active_mask(); - auto remaining = active_mask; - while (remaining) { - auto leader = cttz_i32(remaining); - if (warp_idx() == leader) { - // Memory fences here are necessary since CUDA has a weakly ordered - // memory model across threads - mutex_lock_i32(lock); - grid_memfence(); - func(); - grid_memfence(); - mutex_unlock_i32(lock); - grid_memfence(); - } - warp_barrier(active_mask); - remaining ^= 1u << leader; - } - /* for (int i = 0; i < warp_size(); i++) { if (warp_idx() == i) { // Memory fences here are necessary since CUDA has a weakly ordered @@ -43,7 +24,6 @@ class lock_guard { grid_memfence(); } } - */ // Unfortunately critical sections on CUDA has undefined behavior (deadlock // or not), if more than one thread in a warp try to acquire locks /* diff --git a/taichi/runtime/llvm/runtime.cpp b/taichi/runtime/llvm/runtime.cpp index 8dd3e3b39f5688..d08271ac388db4 100644 --- a/taichi/runtime/llvm/runtime.cpp +++ b/taichi/runtime/llvm/runtime.cpp @@ -913,12 +913,9 @@ int32 cuda_ballot_sync(int32 mask, bool bit) { #if ARCH_cuda uint32 cuda_active_mask() { - /* unsigned int mask; asm volatile("activemask.b32 %0;" : "=r"(mask)); return mask; - */ - return cuda_ballot(true); } #else uint32 cuda_active_mask() { @@ -930,9 +927,6 @@ int32 grid_dim() { return 0; } -void sync_warp(uint32 mask) { -} - void block_barrier() { }