From 76044b749e7a017507f1c9eafd6b10f0ef1d23f9 Mon Sep 17 00:00:00 2001 From: Allard Hendriksen Date: Thu, 9 Nov 2023 13:41:08 +0100 Subject: [PATCH] Use typed instead of void pointers Because the size can be either 32 or 64 bit, this can catch a lot of errors. For instance: uint64_t * remote_buffer; uint64_t * remote_bar; cuda::ptx::st_async(remote_buffer, 1, remote_bar); would previously use the .b32 path because the `1` is an integer and determines the type resolution. Now, this will result in a compiler error. Resolution is to either (a) change the value type, or (b) change the buffer type. a) uint64_t * remote_buffer; cuda::ptx::st_async(remote_buffer, uint64_t(1), remote_bar); b) int32_t * remote_buffer; cuda::ptx::st_async(remote_buffer, 1, remote_bar); --- .../test/cuda/ptx/ptx.st.async.compile.pass.cpp | 10 +++++----- libcudacxx/docs/extended_api/ptx.md | 6 +++--- .../cuda/std/detail/libcxx/include/__cuda/ptx.h | 12 ++++++------ 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.st.async.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.st.async.compile.pass.cpp index f9c9d0f57b..0a95f9dcec 100644 --- a/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.st.async.compile.pass.cpp +++ b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.st.async.compile.pass.cpp @@ -50,12 +50,12 @@ __global__ void test_compilation() { NV_IF_TARGET(NV_PROVIDES_SM_90, ( if (non_eliminated_false()) { // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. - auto overload = static_cast(cuda::ptx::st_async); + auto overload = static_cast(cuda::ptx::st_async); fn_ptr = reinterpret_cast(overload); } if (non_eliminated_false()) { // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes{.type} [addr], value, [remote_bar]; // 1. - auto overload = static_cast(cuda::ptx::st_async); + auto overload = static_cast(cuda::ptx::st_async); fn_ptr = reinterpret_cast(overload); } )); @@ -65,12 +65,12 @@ __global__ void test_compilation() { NV_IF_TARGET(NV_PROVIDES_SM_90, ( if (non_eliminated_false()) { // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. - auto overload = static_cast(cuda::ptx::st_async); + auto overload = static_cast(cuda::ptx::st_async); fn_ptr = reinterpret_cast(overload); } if (non_eliminated_false()) { // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v2{.type} [addr], value, [remote_bar]; // 2. - auto overload = static_cast(cuda::ptx::st_async); + auto overload = static_cast(cuda::ptx::st_async); fn_ptr = reinterpret_cast(overload); } )); @@ -80,7 +80,7 @@ __global__ void test_compilation() { NV_IF_TARGET(NV_PROVIDES_SM_90, ( if (non_eliminated_false()) { // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. - auto overload = static_cast(cuda::ptx::st_async); + auto overload = static_cast(cuda::ptx::st_async); fn_ptr = reinterpret_cast(overload); } )); diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md index 3cb50f9f7f..5bfc2d7fa7 100644 --- a/libcudacxx/docs/extended_api/ptx.md +++ b/libcudacxx/docs/extended_api/ptx.md @@ -309,7 +309,7 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release // .type = { .b32, .b64 } template __device__ static inline void st_async( - void* addr, + Type* addr, const Type& value, uint64_t* remote_bar); @@ -317,14 +317,14 @@ __device__ static inline void st_async( // .type = { .b32, .b64 } template __device__ static inline void st_async( - void* addr, + Type* addr, const Type (&value)[2], uint64_t* remote_bar); // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. PTX ISA 81, SM_90 template __device__ static inline void st_async( - void* addr, + B32* addr, const B32 (&value)[4], uint64_t* remote_bar); ``` diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h index 9b22963cb9..48dd076986 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h @@ -391,7 +391,7 @@ _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX // .type = { .b32, .b64 } template __device__ static inline void st_async( - void* addr, + Type* addr, const Type& value, uint64_t* remote_bar); */ @@ -399,7 +399,7 @@ __device__ static inline void st_async( extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_st_async_is_not_supported_before_SM_90__(); template _LIBCUDACXX_DEVICE static inline void st_async( - void* __addr, + _Type* __addr, const _Type& __value, _CUDA_VSTD::uint64_t* __remote_bar) { @@ -438,7 +438,7 @@ _LIBCUDACXX_DEVICE static inline void st_async( // .type = { .b32, .b64 } template __device__ static inline void st_async( - void* addr, + Type* addr, const Type (&value)[2], uint64_t* remote_bar); */ @@ -446,7 +446,7 @@ __device__ static inline void st_async( extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_st_async_is_not_supported_before_SM_90__(); template _LIBCUDACXX_DEVICE static inline void st_async( - void* __addr, + _Type* __addr, const _Type (&__value)[2], _CUDA_VSTD::uint64_t* __remote_bar) { @@ -486,7 +486,7 @@ _LIBCUDACXX_DEVICE static inline void st_async( // st.async.weak.shared::cluster.mbarrier::complete_tx::bytes.v4.b32 [addr], value, [remote_bar]; // 3. PTX ISA 81, SM_90 template __device__ static inline void st_async( - void* addr, + B32* addr, const B32 (&value)[4], uint64_t* remote_bar); */ @@ -494,7 +494,7 @@ __device__ static inline void st_async( extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_st_async_is_not_supported_before_SM_90__(); template _LIBCUDACXX_DEVICE static inline void st_async( - void* __addr, + _B32* __addr, const _B32 (&__value)[4], _CUDA_VSTD::uint64_t* __remote_bar) {