From 76044b749e7a017507f1c9eafd6b10f0ef1d23f9 Mon Sep 17 00:00:00 2001
From: Allard Hendriksen <ahendriksen@nvidia.com>
Date: Thu, 9 Nov 2023 13:41:08 +0100
Subject: [PATCH] Use typed instead of void pointers

Because the size can be either 32 or 64 bit, this can catch a lot of
errors.

For instance:

uint64_t * remote_buffer;
uint64_t * remote_bar;
cuda::ptx::st_async(remote_buffer, 1, remote_bar);

would previously use the .b32 path because the `1` is an integer and
determines the type resolution.

Now, this will result in a compiler error.

Resolution is to either (a) change the value type, or (b) change the
buffer type.

a)
uint64_t * remote_buffer;
cuda::ptx::st_async(remote_buffer, uint64_t(1), remote_bar);

b)
int32_t * remote_buffer;
cuda::ptx::st_async(remote_buffer, 1, remote_bar);
---
 .../test/cuda/ptx/ptx.st.async.compile.pass.cpp      | 10 +++++-----
 libcudacxx/docs/extended_api/ptx.md                  |  6 +++---
 .../cuda/std/detail/libcxx/include/__cuda/ptx.h      | 12 ++++++------
 3 files changed, 14 insertions(+), 14 deletions(-)
diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.st.async.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.st.async.compile.pass.cpp
index f9c9d0f57b..0a95f9dcec 100644
--- a/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.st.async.compile.pass.cpp
+++ b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.st.async.compile.pass.cpp
@@ -50,12 +50,12 @@ __global__ void test_compilation() {
   NV_IF_TARGET(NV_PROVIDES_SM_90, (
     if (non_eliminated_false()) {
       // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar];    // 1.
-      auto overload = static_cast<void (*)(void* , const int32_t& , uint64_t* )>(cuda::ptx::st_async);
+      auto overload = static_cast<void (*)(int32_t* , const int32_t& , uint64_t* )>(cuda::ptx::st_async);
       fn_ptr = reinterpret_cast<void*>(overload);
     }
     if (non_eliminated_false()) {
       // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar];    // 1.
-      auto overload = static_cast<void (*)(void* , const int64_t& , uint64_t* )>(cuda::ptx::st_async);
+      auto overload = static_cast<void (*)(int64_t* , const int64_t& , uint64_t* )>(cuda::ptx::st_async);
       fn_ptr = reinterpret_cast<void*>(overload);
     }
   ));
@@ -65,12 +65,12 @@ __global__ void test_compilation() {
   NV_IF_TARGET(NV_PROVIDES_SM_90, (
     if (non_eliminated_false()) {
       // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2.
-      auto overload = static_cast<void (*)(void* , const int32_t (&)[2], uint64_t* )>(cuda::ptx::st_async);
+      auto overload = static_cast<void (*)(int32_t* , const int32_t (&)[2], uint64_t* )>(cuda::ptx::st_async);
       fn_ptr = reinterpret_cast<void*>(overload);
     }
     if (non_eliminated_false()) {
       // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2.
-      auto overload = static_cast<void (*)(void* , const int64_t (&)[2], uint64_t* )>(cuda::ptx::st_async);
+      auto overload = static_cast<void (*)(int64_t* , const int64_t (&)[2], uint64_t* )>(cuda::ptx::st_async);
       fn_ptr = reinterpret_cast<void*>(overload);
     }
   ));
@@ -80,7 +80,7 @@ __global__ void test_compilation() {
   NV_IF_TARGET(NV_PROVIDES_SM_90, (
     if (non_eliminated_false()) {
       // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar];    // 3.
-      auto overload = static_cast<void (*)(void* , const int32_t (&)[4], uint64_t* )>(cuda::ptx::st_async);
+      auto overload = static_cast<void (*)(int32_t* , const int32_t (&)[4], uint64_t* )>(cuda::ptx::st_async);
       fn_ptr = reinterpret_cast<void*>(overload);
     }
   ));
diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md
index 3cb50f9f7f..5bfc2d7fa7 100644
--- a/libcudacxx/docs/extended_api/ptx.md
+++ b/libcudacxx/docs/extended_api/ptx.md
@@ -309,7 +309,7 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release
 // .type      = { .b32, .b64 }
 template <typename Type>
 __device__ static inline void st_async(
-  void* addr,
+  Type* addr,
   const Type& value,
   uint64_t* remote_bar);
 
@@ -317,14 +317,14 @@ __device__ static inline void st_async(
 // .type      = { .b32, .b64 }
 template <typename Type>
 __device__ static inline void st_async(
-  void* addr,
+  Type* addr,
   const Type (&value)[2],
   uint64_t* remote_bar);
 
 // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar];    // 3.  PTX ISA 81, SM_90
 template <typename B32>
 __device__ static inline void st_async(
-  void* addr,
+  B32* addr,
   const B32 (&value)[4],
   uint64_t* remote_bar);
 ```
diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h
index 9b22963cb9..48dd076986 100644
--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h
+++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h
@@ -391,7 +391,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX
 // .type      = { .b32, .b64 }
 template <typename Type>
 __device__ static inline void st_async(
-  void* addr,
+  Type* addr,
   const Type& value,
   uint64_t* remote_bar);
 */
@@ -399,7 +399,7 @@ __device__ static inline void st_async(
 extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_st_async_is_not_supported_before_SM_90__();
 template <typename _Type>
 _LIBCUDACXX_DEVICE static inline void st_async(
-  void* __addr,
+  _Type* __addr,
   const _Type& __value,
   _CUDA_VSTD::uint64_t* __remote_bar)
 {
@@ -438,7 +438,7 @@ _LIBCUDACXX_DEVICE static inline void st_async(
 // .type      = { .b32, .b64 }
 template <typename Type>
 __device__ static inline void st_async(
-  void* addr,
+  Type* addr,
   const Type (&value)[2],
   uint64_t* remote_bar);
 */
@@ -446,7 +446,7 @@ __device__ static inline void st_async(
 extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_st_async_is_not_supported_before_SM_90__();
 template <typename _Type>
 _LIBCUDACXX_DEVICE static inline void st_async(
-  void* __addr,
+  _Type* __addr,
   const _Type (&__value)[2],
   _CUDA_VSTD::uint64_t* __remote_bar)
 {
@@ -486,7 +486,7 @@ _LIBCUDACXX_DEVICE static inline void st_async(
 // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar];    // 3.  PTX ISA 81, SM_90
 template <typename B32>
 __device__ static inline void st_async(
-  void* addr,
+  B32* addr,
   const B32 (&value)[4],
   uint64_t* remote_bar);
 */
@@ -494,7 +494,7 @@ __device__ static inline void st_async(
 extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_st_async_is_not_supported_before_SM_90__();
 template <typename _B32>
 _LIBCUDACXX_DEVICE static inline void st_async(
-  void* __addr,
+  _B32* __addr,
   const _B32 (&__value)[4],
   _CUDA_VSTD::uint64_t* __remote_bar)
 {