From 2c0db878c8f1872182dd3274a9a572e0406336ac Mon Sep 17 00:00:00 2001
From: Yuanming Hu
Date: Sat, 18 Jul 2020 20:48:07 -0400
Subject: [PATCH] use ptx63

---
 examples/mpm99.py                     |  2 +-
 taichi/backends/cuda/codegen_cuda.cpp |  2 +-
 taichi/backends/cuda/jit_cuda.cpp     |  7 +------
 taichi/llvm/llvm_context.cpp          |  3 +--
 taichi/runtime/llvm/locked_task.h     | 20 ++++++++++++++++++++
 taichi/runtime/llvm/runtime.cpp       | 20 +++++++++++++++-----
 6 files changed, 39 insertions(+), 15 deletions(-)

diff --git a/examples/mpm99.py b/examples/mpm99.py
index 2296b9d6af4a1a..064e931b64a3fa 100644
--- a/examples/mpm99.py
+++ b/examples/mpm99.py
@@ -1,6 +1,6 @@
 import taichi as ti
 import numpy as np
-ti.init(arch=ti.gpu)  # Try to run on GPU
+ti.init(arch=ti.gpu, print_kernel_nvptx=True, print_kernel_llvm_ir_optimized=True)  # Try to run on GPU
 quality = 1  # Use a larger value for higher-res simulations
 n_particles, n_grid = 9000 * quality ** 2, 128 * quality
 dx, inv_dx = 1 / n_grid, float(n_grid)
diff --git a/taichi/backends/cuda/codegen_cuda.cpp b/taichi/backends/cuda/codegen_cuda.cpp
index e7468b3cf19375..546dafff4aad40 100644
--- a/taichi/backends/cuda/codegen_cuda.cpp
+++ b/taichi/backends/cuda/codegen_cuda.cpp
@@ -429,7 +429,7 @@ class CodeGenLLVMCUDA : public CodeGenLLVM {
     auto type = llvm::ArrayType::get(llvm::Type::getInt8Ty(*llvm_context),
                                      stmt->bls_size);
     bls_buffer = new GlobalVariable(
-        *module, type, false, llvm::GlobalValue::InternalLinkage, nullptr,
+        *module, type, false, llvm::GlobalValue::ExternalLinkage, nullptr,
         "bls_buffer", nullptr, llvm::GlobalVariable::NotThreadLocal,
         3 /*addrspace=shared*/);
 #if LLVM_VERSION_MAJOR >= 10
diff --git a/taichi/backends/cuda/jit_cuda.cpp b/taichi/backends/cuda/jit_cuda.cpp
index 11f040864f7e39..ade4d293215e1f 100644
--- a/taichi/backends/cuda/jit_cuda.cpp
+++ b/taichi/backends/cuda/jit_cuda.cpp
@@ -112,7 +112,7 @@ class JITSessionCUDA : public JITSession {
 };
 
 std::string cuda_mattrs() {
-  return "+ptx50";
+  return "+ptx63";
 }
 
 std::string convert(std::string new_name) {
@@ -140,15 +140,10 @@ std::string convert(std::string new_name) {
 std::string JITSessionCUDA::compile_module_to_ptx(
     std::unique_ptr<llvm::Module> &module) {
   TI_AUTO_PROF
-  // Part of this function is borrowed from Halide::CodeGen_PTX_Dev.cpp
-  // TODO: enabling this leads to LLVM error "comdat global value has private
-  // linkage"
-  /*
   if (llvm::verifyModule(*module, &llvm::errs())) {
     module->print(llvm::errs(), nullptr);
     TI_ERROR("Module broken");
   }
-  */
 
   using namespace llvm;
 
diff --git a/taichi/llvm/llvm_context.cpp b/taichi/llvm/llvm_context.cpp
index 837e2e94248621..7cf72ac9245099 100644
--- a/taichi/llvm/llvm_context.cpp
+++ b/taichi/llvm/llvm_context.cpp
@@ -398,6 +398,7 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::clone_runtime_module() {
     patch_intrinsic("block_dim", Intrinsic::nvvm_read_ptx_sreg_ntid_x);
     patch_intrinsic("grid_dim", Intrinsic::nvvm_read_ptx_sreg_nctaid_x);
     patch_intrinsic("block_barrier", Intrinsic::nvvm_barrier0, false);
+    patch_intrinsic("warp_barrier", Intrinsic::nvvm_bar_warp_sync, false);
     patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false);
     patch_intrinsic("grid_memfence", Intrinsic::nvvm_membar_gl, false);
     patch_intrinsic("system_memfence", Intrinsic::nvvm_membar_sys, false);
@@ -429,8 +430,6 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::clone_runtime_module() {
 #endif
 
     // patch_intrinsic("sync_warp", Intrinsic::nvvm_bar_warp_sync, false);
-    // patch_intrinsic("warp_ballot", Intrinsic::nvvm_vote_ballot, false);
-    // patch_intrinsic("warp_active_mask", Intrinsic::nvvm_membar_cta, false);
     patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false);
 
     link_module_with_cuda_libdevice(data->runtime_module);
diff --git a/taichi/runtime/llvm/locked_task.h b/taichi/runtime/llvm/locked_task.h
index 4f1ce823ce5aa7..dcafc78ca25a39 100644
--- a/taichi/runtime/llvm/locked_task.h
+++ b/taichi/runtime/llvm/locked_task.h
@@ -12,6 +12,25 @@ class lock_guard {
     mutex_unlock_i32(lock);
 #else
     // CUDA
+
+    auto active_mask = cuda_active_mask();
+    auto remaining = active_mask;
+    while (remaining) {
+      auto leader = cttz_i32(remaining);
+      if (warp_idx() == leader) {
+        // Memory fences here are necessary since CUDA has a weakly ordered
+        // memory model across threads
+        mutex_lock_i32(lock);
+        grid_memfence();
+        func();
+        grid_memfence();
+        mutex_unlock_i32(lock);
+        grid_memfence();
+      }
+      warp_barrier(active_mask);
+      remaining ^= 1u << leader;
+    }
+    /*
     for (int i = 0; i < warp_size(); i++) {
       if (warp_idx() == i) {
         // Memory fences here are necessary since CUDA has a weakly ordered
@@ -24,6 +43,7 @@ class lock_guard {
         grid_memfence();
       }
     }
+    */
     // Unfortunately critical sections on CUDA has undefined behavior (deadlock
     // or not), if more than one thread in a warp try to acquire locks
     /*
diff --git a/taichi/runtime/llvm/runtime.cpp b/taichi/runtime/llvm/runtime.cpp
index 6acb8889cd47a0..8dd3e3b39f5688 100644
--- a/taichi/runtime/llvm/runtime.cpp
+++ b/taichi/runtime/llvm/runtime.cpp
@@ -903,17 +903,28 @@ int32 cttz_i32(i32 val) {
   return 0;
 }
 
-uint32 cuda_ballot(bool bit) {
+int32 cuda_ballot(bool bit) {
   return 0;
 }
 
-uint32 cuda_ballot_sync(uint32 mask, bool bit) {
+int32 cuda_ballot_sync(int32 mask, bool bit) {
   return 0;
 }
 
-int32 cuda_active_mask() {
+#if ARCH_cuda
+uint32 cuda_active_mask() {
+  /*
+  unsigned int mask;
+  asm volatile("activemask.b32 %0;" : "=r"(mask));
+  return mask;
+  */
   return cuda_ballot(true);
 }
+#else
+uint32 cuda_active_mask() {
+  return 0;
+}
+#endif
 
 int32 grid_dim() {
   return 0;
@@ -925,8 +936,7 @@ void sync_warp(uint32 mask) {
 }
 
 void block_barrier() {
 }
 
-int32 warp_active_mask() {
-  return 0;
+void warp_barrier(uint32 mask) {
 }
 
 void block_memfence() {
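
Note on the new locked_task.h critical-section path: each loop iteration elects the lowest-numbered lane still set in remaining as the leader (via cttz_i32), only that leader acquires the mutex and runs func(), and warp_barrier(active_mask) re-converges the participating lanes before the leader's bit is cleared. This sidesteps the undefined behavior, called out in the existing comment, of multiple threads in one warp contending for a lock. Below is a minimal stand-alone sketch of the same leader-election pattern written against raw CUDA intrinsics (__activemask, __ffs, __syncwarp, atomicCAS) instead of the Taichi runtime wrappers; the file, kernel, and variable names are illustrative assumptions, and it presumes an sm_70+ GPU with independent thread scheduling, the hardware generation that motivates the +ptx63 feature bump.

// lock_leader_demo.cu -- illustrative sketch, not Taichi runtime code.
#include <cstdio>

__device__ int g_lock = 0;     // 0 = free, 1 = held
__device__ int g_counter = 0;  // state protected by g_lock

__device__ void locked_increment() {
  // Lanes that reached this call together; also the re-convergence mask.
  unsigned active = __activemask();
  unsigned remaining = active;
  while (remaining) {
    int leader = __ffs(remaining) - 1;  // lowest remaining lane leads
    if ((threadIdx.x & 31) == leader) {
      while (atomicCAS(&g_lock, 0, 1) != 0) {
        // spin until the lock is acquired (safe on sm_70+)
      }
      __threadfence();         // CUDA is weakly ordered; fence around the body
      g_counter += 1;          // stand-in for the patch's func()
      __threadfence();
      atomicExch(&g_lock, 0);  // release
      __threadfence();
    }
    __syncwarp(active);         // keep participating lanes converged
    remaining ^= 1u << leader;  // retire this leader, elect the next
  }
}

__global__ void kernel() {
  locked_increment();
}

int main() {
  kernel<<<2, 64>>>();  // several warps contend on one lock
  cudaDeviceSynchronize();
  int counter = 0;
  cudaMemcpyFromSymbol(&counter, g_counter, sizeof(int));
  printf("counter = %d (expected 128)\n", counter);
  return 0;
}

The __threadfence() calls play the role of grid_memfence() in the patch: acquiring the lock by itself does not order the critical section's loads and stores for other threads under CUDA's weak memory model.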