Skip to content

Commit

Permalink
Use typed instead of void pointers
Browse files Browse the repository at this point in the history
Because the size can be either 32 or 64 bit, this can catch a lot of
errors.

For instance:

uint64_t * remote_buffer;
uint64_t * remote_bar;
cuda::ptx::st_async(remote_buffer, 1, remote_bar);

would previously use the .b32 path because the `1` is an integer and
determines the type resolution.

Now, this will result in a compiler error.

Resolution is to either (a) change the value type, or (b) change the
buffer type.

a)
uint64_t * remote_buffer;
cuda::ptx::st_async(remote_buffer, uint64_t(1), remote_bar);

b)
int32_t * remote_buffer;
cuda::ptx::st_async(remote_buffer, 1, remote_bar);
  • Loading branch information
ahendriksen committed Nov 9, 2023
1 parent dc1d934 commit 76044b7
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,12 @@ __global__ void test_compilation() {
NV_IF_TARGET(NV_PROVIDES_SM_90, (
if (non_eliminated_false()) {
// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1.
auto overload = static_cast<void (*)(void* , const int32_t& , uint64_t* )>(cuda::ptx::st_async);
auto overload = static_cast<void (*)(int32_t* , const int32_t& , uint64_t* )>(cuda::ptx::st_async);
fn_ptr = reinterpret_cast<void*>(overload);
}
if (non_eliminated_false()) {
// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1.
auto overload = static_cast<void (*)(void* , const int64_t& , uint64_t* )>(cuda::ptx::st_async);
auto overload = static_cast<void (*)(int64_t* , const int64_t& , uint64_t* )>(cuda::ptx::st_async);
fn_ptr = reinterpret_cast<void*>(overload);
}
));
Expand All @@ -65,12 +65,12 @@ __global__ void test_compilation() {
NV_IF_TARGET(NV_PROVIDES_SM_90, (
if (non_eliminated_false()) {
// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2.
auto overload = static_cast<void (*)(void* , const int32_t (&)[2], uint64_t* )>(cuda::ptx::st_async);
auto overload = static_cast<void (*)(int32_t* , const int32_t (&)[2], uint64_t* )>(cuda::ptx::st_async);
fn_ptr = reinterpret_cast<void*>(overload);
}
if (non_eliminated_false()) {
// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2.
auto overload = static_cast<void (*)(void* , const int64_t (&)[2], uint64_t* )>(cuda::ptx::st_async);
auto overload = static_cast<void (*)(int64_t* , const int64_t (&)[2], uint64_t* )>(cuda::ptx::st_async);
fn_ptr = reinterpret_cast<void*>(overload);
}
));
Expand All @@ -80,7 +80,7 @@ __global__ void test_compilation() {
NV_IF_TARGET(NV_PROVIDES_SM_90, (
if (non_eliminated_false()) {
// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3.
auto overload = static_cast<void (*)(void* , const int32_t (&)[4], uint64_t* )>(cuda::ptx::st_async);
auto overload = static_cast<void (*)(int32_t* , const int32_t (&)[4], uint64_t* )>(cuda::ptx::st_async);
fn_ptr = reinterpret_cast<void*>(overload);
}
));
Expand Down
6 changes: 3 additions & 3 deletions libcudacxx/docs/extended_api/ptx.md
Original file line number Diff line number Diff line change
Expand Up @@ -309,22 +309,22 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release
// .type = { .b32, .b64 }
template <typename Type>
__device__ static inline void st_async(
void* addr,
Type* addr,
const Type& value,
uint64_t* remote_bar);
// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. PTX ISA 81, SM_90
// .type = { .b32, .b64 }
template <typename Type>
__device__ static inline void st_async(
void* addr,
Type* addr,
const Type (&value)[2],
uint64_t* remote_bar);
// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. PTX ISA 81, SM_90
template <typename B32>
__device__ static inline void st_async(
void* addr,
B32* addr,
const B32 (&value)[4],
uint64_t* remote_bar);
```
Expand Down
12 changes: 6 additions & 6 deletions libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h
Original file line number Diff line number Diff line change
Expand Up @@ -391,15 +391,15 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
// .type = { .b32, .b64 }
template <typename Type>
__device__ static inline void st_async(
void* addr,
Type* addr,
const Type& value,
uint64_t* remote_bar);
*/
#if __cccl_ptx_isa >= 810
extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_st_async_is_not_supported_before_SM_90__();
template <typename _Type>
_LIBCUDACXX_DEVICE static inline void st_async(
void* __addr,
_Type* __addr,
const _Type& __value,
_CUDA_VSTD::uint64_t* __remote_bar)
{
Expand Down Expand Up @@ -438,15 +438,15 @@ _LIBCUDACXX_DEVICE static inline void st_async(
// .type = { .b32, .b64 }
template <typename Type>
__device__ static inline void st_async(
void* addr,
Type* addr,
const Type (&value)[2],
uint64_t* remote_bar);
*/
#if __cccl_ptx_isa >= 810
extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_st_async_is_not_supported_before_SM_90__();
template <typename _Type>
_LIBCUDACXX_DEVICE static inline void st_async(
void* __addr,
_Type* __addr,
const _Type (&__value)[2],
_CUDA_VSTD::uint64_t* __remote_bar)
{
Expand Down Expand Up @@ -486,15 +486,15 @@ _LIBCUDACXX_DEVICE static inline void st_async(
// st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. PTX ISA 81, SM_90
template <typename B32>
__device__ static inline void st_async(
void* addr,
B32* addr,
const B32 (&value)[4],
uint64_t* remote_bar);
*/
#if __cccl_ptx_isa >= 810
extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_st_async_is_not_supported_before_SM_90__();
template <typename _B32>
_LIBCUDACXX_DEVICE static inline void st_async(
void* __addr,
_B32* __addr,
const _B32 (&__value)[4],
_CUDA_VSTD::uint64_t* __remote_bar)
{
Expand Down

0 comments on commit 76044b7

Please sign in to comment.