[CUDA] Upgrade to PTX 6.3 and add a few CUDA intrinsics
yuanming-hu committed Jul 20, 2020
1 parent 2c0db87 · commit 3637bbc
Showing 5 changed files with 2 additions and 28 deletions.
examples/mpm99.py: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 import taichi as ti
 import numpy as np
-ti.init(arch=ti.gpu, print_kernel_nvptx=True, print_kernel_llvm_ir_optimized=True) # Try to run on GPU
+ti.init(arch=ti.gpu) # Try to run on GPU
 quality = 1 # Use a larger value for higher-res simulations
 n_particles, n_grid = 9000 * quality ** 2, 128 * quality
 dx, inv_dx = 1 / n_grid, float(n_grid)
taichi/backends/cuda/jit_cuda.cpp: 1 addition & 0 deletions
@@ -140,6 +140,7 @@ std::string convert(std::string new_name) {
 std::string JITSessionCUDA::compile_module_to_ptx(
     std::unique_ptr<llvm::Module> &module) {
   TI_AUTO_PROF
+  // Part of this function is borrowed from Halide::CodeGen_PTX_Dev.cpp
   if (llvm::verifyModule(*module, &llvm::errs())) {
     module->print(llvm::errs(), nullptr);
     TI_ERROR("Module broken");
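Note on where the "PTX 6.3" part of the title lives: with the LLVM NVPTX backend, the PTX ISA version is normally requested through the target feature string when the TargetMachine behind compile_module_to_ptx is created. The sketch below only illustrates that mechanism and is not Taichi's actual code; the "sm_75" CPU string, the "+ptx63" feature, and the function name make_nvptx_target_machine are assumptions chosen for the example, and header locations vary across LLVM versions.

#include <memory>
#include <string>

#include "llvm/Support/TargetRegistry.h"  // llvm/MC/TargetRegistry.h on LLVM >= 14
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetMachine.h"

std::unique_ptr<llvm::TargetMachine> make_nvptx_target_machine() {
  // Register the NVPTX backend (no-ops if already initialized).
  LLVMInitializeNVPTXTargetInfo();
  LLVMInitializeNVPTXTarget();
  LLVMInitializeNVPTXTargetMC();
  LLVMInitializeNVPTXAsmPrinter();

  std::string err;
  const std::string triple = "nvptx64-nvidia-cuda";
  const llvm::Target *target = llvm::TargetRegistry::lookupTarget(triple, err);
  if (!target)
    return nullptr;  // NVPTX backend not compiled into this LLVM build

  // "+ptx63" asks the backend to emit ".version 6.3" at the top of the PTX;
  // the CPU string selects the SM architecture the PTX is generated for.
  llvm::TargetOptions options;
  return std::unique_ptr<llvm::TargetMachine>(target->createTargetMachine(
      triple, /*CPU=*/"sm_75", /*Features=*/"+ptx63", options,
      llvm::Reloc::PIC_));
}

Emitting the actual PTX text is then a standard addPassesToEmitFile run over the verified module, which is roughly what compile_module_to_ptx does after the verifyModule check shown in the hunk.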
taichi/llvm/llvm_context.cpp: 0 additions & 1 deletion
@@ -429,7 +429,6 @@ std::unique_ptr<llvm::Module> TaichiLLVMContext::clone_runtime_module() {
         {llvm::PointerType::get(get_data_type(DataType::f64), 0)});
 #endif
 
-    // patch_intrinsic("sync_warp", Intrinsic::nvvm_bar_warp_sync, false);
     patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false);
 
     link_module_with_cuda_libdevice(data->runtime_module);
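The surviving patch_intrinsic("block_memfence", Intrinsic::nvvm_membar_cta, false) call maps a placeholder runtime function onto an NVVM intrinsic at the LLVM IR level. As a rough, self-contained illustration of that idea (not Taichi's actual patch_intrinsic, whose signature and behavior differ), one can redirect every call of the placeholder to the intrinsic declaration:

#include "llvm/IR/Intrinsics.h"  // newer LLVM also needs llvm/IR/IntrinsicsNVPTX.h
#include "llvm/IR/Module.h"

// Hypothetical helper: rewire calls to a runtime placeholder function so they
// hit @llvm.nvvm.membar.cta, which lowers to the PTX "membar.cta"
// (block-level memory fence) instruction.
void patch_block_memfence(llvm::Module &module) {
  llvm::Function *placeholder = module.getFunction("block_memfence");
  if (!placeholder)
    return;  // runtime module does not declare the placeholder
  llvm::Function *intrin = llvm::Intrinsic::getDeclaration(
      &module, llvm::Intrinsic::nvvm_membar_cta);
  // Assuming both functions have type void(), every call site can simply be
  // redirected to the intrinsic and the placeholder dropped.
  placeholder->replaceAllUsesWith(intrin);
  placeholder->eraseFromParent();
}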
taichi/runtime/llvm/locked_task.h: 0 additions & 20 deletions
@@ -12,25 +12,6 @@ class lock_guard {
     mutex_unlock_i32(lock);
 #else
     // CUDA
-
-    auto active_mask = cuda_active_mask();
-    auto remaining = active_mask;
-    while (remaining) {
-      auto leader = cttz_i32(remaining);
-      if (warp_idx() == leader) {
-        // Memory fences here are necessary since CUDA has a weakly ordered
-        // memory model across threads
-        mutex_lock_i32(lock);
-        grid_memfence();
-        func();
-        grid_memfence();
-        mutex_unlock_i32(lock);
-        grid_memfence();
-      }
-      warp_barrier(active_mask);
-      remaining ^= 1u << leader;
-    }
-    /*
     for (int i = 0; i < warp_size(); i++) {
       if (warp_idx() == i) {
         // Memory fences here are necessary since CUDA has a weakly ordered
@@ -43,7 +24,6 @@
         grid_memfence();
       }
     }
-    */
     // Unfortunately critical sections on CUDA has undefined behavior (deadlock
     // or not), if more than one thread in a warp try to acquire locks
     /*
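The deleted block is a warp-leader election pattern: each iteration picks the lowest active lane, lets only that lane take the spinlock, and synchronizes the warp before moving on, which sidesteps the intra-warp lock hazard warned about in the comment retained below it. The following is a standalone CUDA sketch of the same idea written against the raw CUDA intrinsics (__activemask, __ffs, __syncwarp) rather than Taichi's runtime wrappers; the lock helpers, function names, and the 1-D block assumption are illustrative, not part of the Taichi runtime.

// Illustrative CUDA translation of the leader-election critical section.
// Assumes `lock` is a device-side int initialized to 0 and a 1-D thread block.
__device__ void mutex_lock(int *lock) {
  while (atomicCAS(lock, 0, 1) != 0) {
    // spin; only one lane per warp ever reaches this loop at a time
  }
}

__device__ void mutex_unlock(int *lock) {
  atomicExch(lock, 0);
}

template <typename Func>
__device__ void warp_safe_locked_task(int *lock, Func func) {
  unsigned int active_mask = __activemask();  // lanes that reached this call
  unsigned int remaining = active_mask;
  int lane = threadIdx.x % warpSize;
  while (remaining) {
    int leader = __ffs(remaining) - 1;        // lowest-numbered active lane
    if (lane == leader) {
      mutex_lock(lock);
      __threadfence();  // CUDA's memory model is weakly ordered across threads
      func();
      __threadfence();
      mutex_unlock(lock);
      __threadfence();
    }
    __syncwarp(active_mask);                  // keep participating lanes together
    remaining ^= 1u << leader;                // leader done; next lane's turn
  }
}

A caller inside a kernel would use it as warp_safe_locked_task(&node_lock, [&] { /* critical section */ }), where node_lock stands in for whatever per-node lock the runtime manages.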
taichi/runtime/llvm/runtime.cpp: 0 additions & 6 deletions
@@ -913,12 +913,9 @@ int32 cuda_ballot_sync(int32 mask, bool bit) {
 
 #if ARCH_cuda
 uint32 cuda_active_mask() {
-  /*
   unsigned int mask;
   asm volatile("activemask.b32 %0;" : "=r"(mask));
   return mask;
-  */
-  return cuda_ballot(true);
 }
 #else
 uint32 cuda_active_mask() {
@@ -930,9 +927,6 @@ int32 grid_dim() {
   return 0;
 }
 
-void sync_warp(uint32 mask) {
-}
-
 void block_barrier() {
 }
 
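The asm block that remains in cuda_active_mask is the inline-PTX spelling of CUDA's __activemask() intrinsic; activemask.b32 exists only from PTX ISA 6.2 onward, which is presumably why it travels together with the PTX version bump, while the cuda_ballot(true) line was a ballot-based approximation of the same mask. A standalone CUDA sketch of the two spellings (illustration only, not Taichi runtime code):

// Two ways to ask "which lanes of my warp are currently active?"
__device__ unsigned int active_mask_intrinsic() {
  return __activemask();  // CUDA 9+ intrinsic, compiles down to activemask.b32
}

__device__ unsigned int active_mask_inline_ptx() {
  unsigned int mask;
  // activemask.b32 requires the emitted PTX to declare ISA version 6.2 or
  // newer, hence the accompanying PTX upgrade.
  asm volatile("activemask.b32 %0;" : "=r"(mask));
  return mask;
}

One plausible reason to spell it as inline asm here is that runtime.cpp is compiled to LLVM IR by clang rather than through the CUDA headers, so the intrinsic form is not available; the trade-off is that the PTX ISA floor must be raised explicitly.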
