Commit

use ptx63

yuanming-hu committed Jul 20, 2020
1 parent 3b4e7a1 commit 2c0db87

Showing 6 changed files with 39 additions and 15 deletions.
2 changes: 1 addition & 1 deletion examples/mpm99.py
@@ -1,6 +1,6 @@
 import taichi as ti
 import numpy as np
-ti.init(arch=ti.gpu) # Try to run on GPU
+ti.init(arch=ti.gpu, print_kernel_nvptx=True, print_kernel_llvm_ir_optimized=True) # Try to run on GPU
 quality = 1 # Use a larger value for higher-res simulations
 n_particles, n_grid = 9000 * quality ** 2, 128 * quality
 dx, inv_dx = 1 / n_grid, float(n_grid)
2 changes: 1 addition & 1 deletion taichi/backends/cuda/codegen_cuda.cpp
@@ -429,7 +429,7 @@ class CodeGenLLVMCUDA : public CodeGenLLVM {
     auto type = llvm::ArrayType::get(llvm::Type::getInt8Ty(*llvm_context),
                                      stmt->bls_size);
     bls_buffer = new GlobalVariable(
-        *module, type, false, llvm::GlobalValue::InternalLinkage, nullptr,
+        *module, type, false, llvm::GlobalValue::ExternalLinkage, nullptr,
         "bls_buffer", nullptr, llvm::GlobalVariable::NotThreadLocal,
         3 /*addrspace=shared*/);
 #if LLVM_VERSION_MAJOR >= 10
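Note: bls_buffer is Taichi's block-local storage (BLS), a per-thread-block scratch array in CUDA shared memory, which is what addrspace 3 denotes in NVPTX. A minimal CUDA-level sketch of what such a buffer amounts to; BLS_SIZE is a made-up placeholder for stmt->bls_size, not a constant in the codebase:

__global__ void kernel_with_bls() {
  constexpr int BLS_SIZE = 4096;           // placeholder for stmt->bls_size
  __shared__ char bls_buffer[BLS_SIZE];    // one instance per thread block
  bls_buffer[threadIdx.x % BLS_SIZE] = 0;  // visible to all threads in the block
}

The switch from InternalLinkage to ExternalLinkage appears to be what lets the module verifier (re-enabled in jit_cuda.cpp below) pass: the deleted TODO there records that verification used to fail with "comdat global value has private linkage".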
7 changes: 1 addition & 6 deletions taichi/backends/cuda/jit_cuda.cpp
@@ -112,7 +112,7 @@ class JITSessionCUDA : public JITSession {
 };
 
 std::string cuda_mattrs() {
-  return "+ptx50";
+  return "+ptx63";
 }
 
 std::string convert(std::string new_name) {
@@ -140,15 +140,10 @@ std::string convert(std::string new_name) {
 std::string JITSessionCUDA::compile_module_to_ptx(
     std::unique_ptr<llvm::Module> &module) {
   TI_AUTO_PROF
   // Part of this function is borrowed from Halide::CodeGen_PTX_Dev.cpp
-  // TODO: enabling this leads to LLVM error "comdat global value has private
-  // linkage"
-  /*
   if (llvm::verifyModule(*module, &llvm::errs())) {
     module->print(llvm::errs(), nullptr);
     TI_ERROR("Module broken");
   }
-  */
 
   using namespace llvm;
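Note: cuda_mattrs() supplies the NVPTX target-feature string, so this bumps the PTX ISA that LLVM may emit from 5.0 to 6.3. That matters for the rest of the commit: bar.warp.sync (the new warp_barrier below) needs PTX 6.0 or later, and activemask.b32 (commented out in runtime.cpp) needs 6.2 or later, both beyond the old +ptx50. A sketch of where a feature string like "+ptx63" plugs into LLVM; the triple, CPU, and codegen options here are illustrative, not Taichi's exact configuration:

// Sketch only (LLVM ~10 era API): the third createTargetMachine argument
// is the feature string, e.g. "+ptx63".
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"

llvm::TargetMachine *make_nvptx_target_machine() {
  std::string err;
  const std::string triple = "nvptx64-nvidia-cuda";
  auto *target = llvm::TargetRegistry::lookupTarget(triple, err);
  llvm::TargetOptions options;
  return target->createTargetMachine(
      triple, /*CPU=*/"sm_61", /*Features=*/"+ptx63", options,
      llvm::Reloc::PIC_, llvm::CodeModel::Small,
      llvm::CodeGenOpt::Aggressive);
}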
3 changes: 1 addition & 2 deletions taichi/llvm/llvm_context.cpp
@@ -398,6 +398,7 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::clone_runtime_module() {
     patch_intrinsic("block_dim", Intrinsic::nvvm_read_ptx_sreg_ntid_x);
     patch_intrinsic("grid_dim", Intrinsic::nvvm_read_ptx_sreg_nctaid_x);
     patch_intrinsic("block_barrier", Intrinsic::nvvm_barrier0, false);
+    patch_intrinsic("warp_barrier", Intrinsic::nvvm_bar_warp_sync, false);
     patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false);
     patch_intrinsic("grid_memfence", Intrinsic::nvvm_membar_gl, false);
     patch_intrinsic("system_memfence", Intrinsic::nvvm_membar_sys, false);
@@ -429,8 +430,6 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::clone_runtime_module() {
 #endif
 
   // patch_intrinsic("sync_warp", Intrinsic::nvvm_bar_warp_sync, false);
-  // patch_intrinsic("warp_ballot", Intrinsic::nvvm_vote_ballot, false);
-  // patch_intrinsic("warp_active_mask", Intrinsic::nvvm_membar_cta, false);
   patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false);
 
   link_module_with_cuda_libdevice(data->runtime_module);
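Note: patch_intrinsic swaps the body of a runtime stub (warp_barrier is declared in runtime.cpp below) for a call to the named LLVM intrinsic; Intrinsic::nvvm_bar_warp_sync lowers to the bar.warp.sync PTX instruction enabled by the +ptx63 bump. In CUDA C++ terms the new hook is __syncwarp — a sketch:

// Sketch: __syncwarp(mask) is the CUDA C++ spelling of
// llvm.nvvm.bar.warp.sync. Only the lanes whose bits are set in `mask`
// take part in the barrier; pass the full mask for a whole-warp sync.
__device__ void warp_barrier_demo(unsigned mask) {
  __syncwarp(mask);
}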
20 changes: 20 additions & 0 deletions taichi/runtime/llvm/locked_task.h
@@ -12,6 +12,25 @@ class lock_guard {
     mutex_unlock_i32(lock);
 #else
     // CUDA
+
+    auto active_mask = cuda_active_mask();
+    auto remaining = active_mask;
+    while (remaining) {
+      auto leader = cttz_i32(remaining);
+      if (warp_idx() == leader) {
+        // Memory fences here are necessary since CUDA has a weakly ordered
+        // memory model across threads
+        mutex_lock_i32(lock);
+        grid_memfence();
+        func();
+        grid_memfence();
+        mutex_unlock_i32(lock);
+        grid_memfence();
+      }
+      warp_barrier(active_mask);
+      remaining ^= 1u << leader;
+    }
+    /*
     for (int i = 0; i < warp_size(); i++) {
       if (warp_idx() == i) {
         // Memory fences here are necessary since CUDA has a weakly ordered
@@ -24,6 +43,7 @@ class lock_guard {
         grid_memfence();
       }
     }
+    */
     // Unfortunately, critical sections on CUDA have undefined behavior
     // (deadlock or not) if more than one thread in a warp tries to acquire
     // locks
     /*
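Note: this loop is the heart of the commit. As the comment in the file says, letting several threads of a warp contend for the same lock has undefined behavior on CUDA (it can deadlock), so the new code serializes them: cttz_i32 on the active mask elects the lowest live lane as leader, only the leader takes the lock and runs func(), warp_barrier reconverges the warp, and the leader's bit is cleared before the next round. A self-contained CUDA sketch of the same pattern, where an atomicCAS spin lock stands in for Taichi's mutex_lock_i32/mutex_unlock_i32 and __threadfence for grid_memfence:

__device__ void lock(int *mtx) {
  while (atomicCAS(mtx, 0, 1) != 0)
    ;                                    // spin until the lock is acquired
}

__device__ void unlock(int *mtx) {
  atomicExch(mtx, 0);
}

template <typename Func>
__device__ void locked_task(int *mtx, Func func) {
  unsigned active = __activemask();      // lanes of this warp that got here
  unsigned remaining = active;
  while (remaining) {
    int leader = __ffs(static_cast<int>(remaining)) - 1;  // lowest set bit, like cttz_i32
    if ((threadIdx.x & 31) == leader) {  // lane id, assuming a 1-D block
      lock(mtx);
      __threadfence();                   // CUDA's memory model is weakly ordered
      func();
      __threadfence();
      unlock(mtx);
      __threadfence();
    }
    __syncwarp(active);                  // reconverge before the next round
    remaining ^= 1u << leader;           // retire this leader
  }
}

One detail worth noting: every iteration synchronizes on the full active_mask rather than on remaining, so all originally active lanes participate in each warp_barrier; otherwise the barrier's membership would diverge between rounds.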
20 changes: 15 additions & 5 deletions taichi/runtime/llvm/runtime.cpp
@@ -903,17 +903,28 @@ int32 cttz_i32(i32 val) {
   return 0;
 }
 
-uint32 cuda_ballot(bool bit) {
+int32 cuda_ballot(bool bit) {
   return 0;
 }
 
-uint32 cuda_ballot_sync(uint32 mask, bool bit) {
+int32 cuda_ballot_sync(int32 mask, bool bit) {
   return 0;
 }
 
-int32 cuda_active_mask() {
+#if ARCH_cuda
+uint32 cuda_active_mask() {
+  /*
+  unsigned int mask;
+  asm volatile("activemask.b32 %0;" : "=r"(mask));
+  return mask;
+  */
+  return cuda_ballot(true);
+}
+#else
+uint32 cuda_active_mask() {
   return 0;
 }
+#endif
 
 int32 grid_dim() {
   return 0;
@@ -925,8 +936,7 @@ void sync_warp(uint32 mask) {
 void block_barrier() {
 }
 
-int32 warp_active_mask() {
-  return 0;
+void warp_barrier(uint32 mask) {
 }
 
 void block_memfence() {
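Note: these are the host-side stubs of the runtime; on CUDA the interesting ones are replaced with intrinsic calls via patch_intrinsic (see llvm_context.cpp above), which is why most bodies just return 0. The new cuda_active_mask derives the mask from cuda_ballot(true) — a warp vote in which every active lane raises its bit — rather than from the activemask.b32 inline PTX left commented out. A sketch of the two alternatives in plain CUDA; __ballot here is the legacy (pre-__ballot_sync) warp vote, and which intrinsic cuda_ballot actually lowers to is not part of this diff:

__device__ unsigned active_mask_via_ptx() {
  unsigned mask;
  asm volatile("activemask.b32 %0;" : "=r"(mask));  // needs PTX ISA >= 6.2
  return mask;
}

__device__ unsigned active_mask_via_ballot() {
  return __ballot(1);  // legacy warp vote: one bit per currently active lane
}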
