Use typed instead of void pointers

Because the size can be either 32 or 64 bit, this can catch a lot of errors. For instance: uint64_t * remote_buffer; uint64_t * remote_bar; cuda::ptx::st_async(remote_buffer, 1, remote_bar); would previously use the .b32 path because the `1` is an integer and determines the type resolution. Now, this will result in a compiler error. Resolution is to either (a) change the value type, or (b) change the buffer type. a) uint64_t * remote_buffer; cuda::ptx::st_async(remote_buffer, uint64_t(1), remote_bar); b) int32_t * remote_buffer; cuda::ptx::st_async(remote_buffer, 1, remote_bar);
gevtushenko · Nov 9, 2023 · 76044b7 · 76044b7
1 parent dc1d934
commit 76044b7
Show file tree

Hide file tree

Showing 3 changed files with 14 additions and 14 deletions.
diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.st.async.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.st.async.compile.pass.cpp
@@ -50,12 +50,12 @@ __global__ void test_compilation() {
   NV_IF_TARGET(NV_PROVIDES_SM_90, (
     if (non_eliminated_false()) {
       // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar];    // 1.
-      auto overload = static_cast<void (*)(void* , const int32_t& , uint64_t* )>(cuda::ptx::st_async);
+      auto overload = static_cast<void (*)(int32_t* , const int32_t& , uint64_t* )>(cuda::ptx::st_async);
       fn_ptr = reinterpret_cast<void*>(overload);
     }
     if (non_eliminated_false()) {
       // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar];    // 1.
-      auto overload = static_cast<void (*)(void* , const int64_t& , uint64_t* )>(cuda::ptx::st_async);
+      auto overload = static_cast<void (*)(int64_t* , const int64_t& , uint64_t* )>(cuda::ptx::st_async);
       fn_ptr = reinterpret_cast<void*>(overload);
     }
   ));
@@ -65,12 +65,12 @@ __global__ void test_compilation() {
   NV_IF_TARGET(NV_PROVIDES_SM_90, (
     if (non_eliminated_false()) {
       // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2.
-      auto overload = static_cast<void (*)(void* , const int32_t (&)[2], uint64_t* )>(cuda::ptx::st_async);
+      auto overload = static_cast<void (*)(int32_t* , const int32_t (&)[2], uint64_t* )>(cuda::ptx::st_async);
       fn_ptr = reinterpret_cast<void*>(overload);
     }
     if (non_eliminated_false()) {
       // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2.
-      auto overload = static_cast<void (*)(void* , const int64_t (&)[2], uint64_t* )>(cuda::ptx::st_async);
+      auto overload = static_cast<void (*)(int64_t* , const int64_t (&)[2], uint64_t* )>(cuda::ptx::st_async);
       fn_ptr = reinterpret_cast<void*>(overload);
     }
   ));
@@ -80,7 +80,7 @@ __global__ void test_compilation() {
   NV_IF_TARGET(NV_PROVIDES_SM_90, (
     if (non_eliminated_false()) {
       // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar];    // 3.
-      auto overload = static_cast<void (*)(void* , const int32_t (&)[4], uint64_t* )>(cuda::ptx::st_async);
+      auto overload = static_cast<void (*)(int32_t* , const int32_t (&)[4], uint64_t* )>(cuda::ptx::st_async);
       fn_ptr = reinterpret_cast<void*>(overload);
     }
   ));

diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md
@@ -309,22 +309,22 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release
 // .type      = { .b32, .b64 }
 template <typename Type>
 __device__ static inline void st_async(
-  void* addr,
+  Type* addr,
   const Type& value,
   uint64_t* remote_bar);
 
 // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2.  PTX ISA 81, SM_90
 // .type      = { .b32, .b64 }
 template <typename Type>
 __device__ static inline void st_async(
-  void* addr,
+  Type* addr,
   const Type (&value)[2],
   uint64_t* remote_bar);
 
 // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar];    // 3.  PTX ISA 81, SM_90
 template <typename B32>
 __device__ static inline void st_async(
-  void* addr,
+  B32* addr,
   const B32 (&value)[4],
   uint64_t* remote_bar);
 ```

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h
@@ -391,15 +391,15 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
 // .type      = { .b32, .b64 }
 template <typename Type>
 __device__ static inline void st_async(
-  void* addr,
+  Type* addr,
   const Type& value,
   uint64_t* remote_bar);
 */
 #if __cccl_ptx_isa >= 810
 extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_st_async_is_not_supported_before_SM_90__();
 template <typename _Type>
 _LIBCUDACXX_DEVICE static inline void st_async(
-  void* __addr,
+  _Type* __addr,
   const _Type& __value,
   _CUDA_VSTD::uint64_t* __remote_bar)
 {
@@ -438,15 +438,15 @@ _LIBCUDACXX_DEVICE static inline void st_async(
 // .type      = { .b32, .b64 }
 template <typename Type>
 __device__ static inline void st_async(
-  void* addr,
+  Type* addr,
   const Type (&value)[2],
   uint64_t* remote_bar);
 */
 #if __cccl_ptx_isa >= 810
 extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_st_async_is_not_supported_before_SM_90__();
 template <typename _Type>
 _LIBCUDACXX_DEVICE static inline void st_async(
-  void* __addr,
+  _Type* __addr,
   const _Type (&__value)[2],
   _CUDA_VSTD::uint64_t* __remote_bar)
 {
@@ -486,15 +486,15 @@ _LIBCUDACXX_DEVICE static inline void st_async(
 // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar];    // 3.  PTX ISA 81, SM_90
 template <typename B32>
 __device__ static inline void st_async(
-  void* addr,
+  B32* addr,
   const B32 (&value)[4],
   uint64_t* remote_bar);
 */
 #if __cccl_ptx_isa >= 810
 extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_st_async_is_not_supported_before_SM_90__();
 template <typename _B32>
 _LIBCUDACXX_DEVICE static inline void st_async(
-  void* __addr,
+  _B32* __addr,
   const _B32 (&__value)[4],
   _CUDA_VSTD::uint64_t* __remote_bar)
 {