Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PTX shfl_sync #3241

Open
wants to merge 16 commits into
base: main
Choose a base branch
from
13 changes: 7 additions & 6 deletions docs/libcudacxx/ptx/instructions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,23 +9,24 @@ PTX Instructions
instructions/barrier_cluster
instructions/cp_async_bulk
instructions/cp_async_bulk_commit_group
instructions/cp_async_bulk_wait_group
instructions/cp_async_bulk_tensor
instructions/cp_async_bulk_wait_group
instructions/cp_reduce_async_bulk
instructions/cp_reduce_async_bulk_tensor
instructions/fence
instructions/getctarank
instructions/mapa
instructions/mbarrier_init
instructions/mbarrier_arrive
instructions/mbarrier_expect_tx
instructions/mbarrier_init
instructions/mbarrier_test_wait
instructions/mbarrier_try_wait
instructions/red_async
instructions/shfl_sync
instructions/special_registers
instructions/st_async
instructions/tensormap_replace
instructions/tensormap_cp_fenceproxy
instructions/special_registers
instructions/tensormap_replace


Instructions by section
Expand Down Expand Up @@ -232,8 +233,8 @@ Instructions by section
- No
* - `shfl <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-deprecated>`__
- No
* - `shfl.s <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync>`__
- No
* - `shfl.sync <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync>`__
- Yes, CCCL 2.9.0 / CUDA 12.9
* - `prmt <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt>`__
- No
* - `ld <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld>`__
Expand Down
25 changes: 25 additions & 0 deletions docs/libcudacxx/ptx/instructions/generated/shfl_sync.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@

shfl.sync
^^^^^^^^^

.. code:: cuda

// PTX ISA 6.0
// shfl.sync.mode.b32 d[|p], a, b, c, membermask;
// .mode = { .up, .down, .bfly, .idx };

template<typename T>
[[nodiscard]] __device__ static inline
T shfl_sync(shfl_mode_t shfl_mode,
            T data,
            uint32_t lane_idx_offset,
            uint32_t clamp_segmask,
            uint32_t lane_mask,
            bool& pred) noexcept;

- ``shfl_mode`` is ``shfl_mode_up`` or ``shfl_mode_down`` or ``shfl_mode_bfly`` or ``shfl_mode_idx``
10 changes: 10 additions & 0 deletions docs/libcudacxx/ptx/instructions/shfl_sync.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@

.. _libcudacxx-ptx-instructions-shfl_sync:

shfl.sync
=========

- PTX ISA:
`shfl.sync <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync>`__

.. include:: generated/shfl_sync.rst
144 changes: 144 additions & 0 deletions libcudacxx/include/cuda/__ptx/instructions/shfl_sync.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_PTX_SHFL_SYNC_H
#define _CUDA_PTX_SHFL_SYNC_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#if _CCCL_STD_VER >= 2017

# include <cuda/__ptx/instructions/get_sreg.h>
# include <cuda/__ptx/ptx_dot_variants.h>
# include <cuda/std/__bit/bit_cast.h>
# include <cuda/std/__type_traits/is_integral.h>
# include <cuda/std/__type_traits/is_signed.h>
# include <cuda/std/cstdint>

# include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX

# if __cccl_ptx_isa >= 600

template <dot_shfl_mode _ShuffleMode>
_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t __shfl_sync_dst_lane(
  shfl_mode_t<_ShuffleMode> __shfl_mode,
  _CUDA_VSTD::uint32_t __lane_idx_offset,
  _CUDA_VSTD::uint32_t __clamp_segmask,
  _CUDA_VSTD::uint32_t __lane_mask)
{
  // Mirror of the PTX ISA pseudo-code for shfl.sync: computes the lane the
  // calling thread reads from and returns it as a one-hot 32-bit lane mask.
  // Used only by _CCCL_ASSERT in shfl_sync() to validate that the source lane
  // is a member of the caller-provided member mask (__lane_mask is the member
  // mask the caller will AND against the result).
  // Per the PTX ISA, c[4:0] is the clamp value and c[12:8] the segment mask.
  auto __lane     = get_sreg_laneid();
  auto __clamp    = __clamp_segmask & 0b11111;
  auto __segmask  = __clamp_segmask >> 8;
  auto __max_lane = (__lane & __segmask) | (__clamp & ~__segmask);
  // Signed arithmetic is deliberate: in .up mode (lane - offset) may go below
  // zero and must compare as out-of-segment rather than wrap around unsigned.
  _CUDA_VSTD::int32_t __j = 0;
  if constexpr (__shfl_mode == shfl_mode_idx)
  {
    // PTX ISA: minLane = lane & segmask (NOT the clamp value).
    auto __min_lane = (__lane & __segmask);
    __j             = static_cast<_CUDA_VSTD::int32_t>(__min_lane | (__lane_idx_offset & ~__segmask));
  }
  else if constexpr (__shfl_mode == shfl_mode_up)
  {
    __j = static_cast<_CUDA_VSTD::int32_t>(__lane) - static_cast<_CUDA_VSTD::int32_t>(__lane_idx_offset);
  }
  else if constexpr (__shfl_mode == shfl_mode_down)
  {
    __j = static_cast<_CUDA_VSTD::int32_t>(__lane + __lane_idx_offset);
  }
  else
  {
    __j = static_cast<_CUDA_VSTD::int32_t>(__lane ^ __lane_idx_offset);
  }
  // .up selects the computed lane when it does not fall below max_lane;
  // every other mode selects it when it does not exceed max_lane. Otherwise
  // the thread reads its own lane.
  auto __max_lane_s = static_cast<_CUDA_VSTD::int32_t>(__max_lane);
  auto __lane_s     = static_cast<_CUDA_VSTD::int32_t>(__lane);
  auto __dst        = (__shfl_mode == shfl_mode_up) ? (__j >= __max_lane_s ? __j : __lane_s) //
                                                    : (__j <= __max_lane_s ? __j : __lane_s);
  return (1u << __dst); // unsigned shift: 1 << 31 would be UB on a signed int
}

// Shuffles 4-byte __data across the lanes named by __lane_mask according to
// the compile-time shuffle mode tag (.up / .down / .bfly / .idx).
// __lane_idx_offset is the source-lane index (.idx) or offset (other modes);
// __clamp_segmask packs the clamp value in bits [4:0] and the segment mask in
// bits [12:8], exactly as the PTX shfl.sync 'c' operand. __pred receives the
// instruction's predicate output (true when the source lane was in-segment).
// Requires PTX ISA 6.0; all threads named in __lane_mask must reach this call.
template <typename _Tp, dot_shfl_mode _ShuffleMode>
_CCCL_NODISCARD _CCCL_DEVICE static inline _Tp shfl_sync(
  shfl_mode_t<_ShuffleMode> __shfl_mode,
  _Tp __data,
  _CUDA_VSTD::uint32_t __lane_idx_offset,
  _CUDA_VSTD::uint32_t __clamp_segmask,
  _CUDA_VSTD::uint32_t __lane_mask,
  bool& __pred) noexcept
{
  static_assert(sizeof(_Tp) == 4, "shfl.sync only accepts 4-byte data types");
  _CCCL_ASSERT(__lane_idx_offset < 32, "the lane index or offset must be less than the warp size");
  _CCCL_ASSERT((__clamp_segmask | 0b1111100011111) == 0b1111100011111,
               "clamp value + segmentation mask must be less or equal than 12 bits");
  _CCCL_ASSERT((__lane_mask & __activemask()) == __lane_mask, "lane mask must be a subset of the active mask");
  _CCCL_ASSERT(__shfl_sync_dst_lane(__shfl_mode, __lane_idx_offset, __clamp_segmask, __lane_mask) & __lane_mask,
               "the destination lane must be a member of the lane mask");
  // shfl.sync operates on b32 registers: round-trip arbitrary 4-byte types
  // through uint32_t with bit_cast.
  auto __data1 = _CUDA_VSTD::bit_cast<_CUDA_VSTD::uint32_t>(__data);
  _CUDA_VSTD::int32_t __pred1;
  _CUDA_VSTD::uint32_t __ret;
  // The .mode qualifier must be a literal in the asm string, so each mode gets
  // its own (otherwise identical) asm block. The predicate output is lowered
  // to an int via selp because inline asm cannot bind a .pred register.
  if constexpr (__shfl_mode == shfl_mode_idx)
  {
    asm volatile(
      "{ \n\t\t"
      ".reg .pred p; \n\t\t"
      "shfl.sync.idx.b32 %0|p, %2, %3, %4, %5; \n\t\t"
      "selp.s32 %1, 1, 0, p; \n\t"
      "}"
      : "=r"(__ret), "=r"(__pred1)
      : "r"(__data1), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
  }
  else if constexpr (__shfl_mode == shfl_mode_up)
  {
    asm volatile(
      "{ \n\t\t"
      ".reg .pred p; \n\t\t"
      "shfl.sync.up.b32 %0|p, %2, %3, %4, %5; \n\t\t"
      "selp.s32 %1, 1, 0, p; \n\t"
      "}"
      : "=r"(__ret), "=r"(__pred1)
      : "r"(__data1), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
  }
  else if constexpr (__shfl_mode == shfl_mode_down)
  {
    asm volatile(
      "{ \n\t\t"
      ".reg .pred p; \n\t\t"
      "shfl.sync.down.b32 %0|p, %2, %3, %4, %5; \n\t\t"
      "selp.s32 %1, 1, 0, p; \n\t"
      "}"
      : "=r"(__ret), "=r"(__pred1)
      : "r"(__data1), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
  }
  else
  {
    asm volatile(
      "{ \n\t\t"
      ".reg .pred p; \n\t\t"
      "shfl.sync.bfly.b32 %0|p, %2, %3, %4, %5; \n\t\t"
      "selp.s32 %1, 1, 0, p; \n\t"
      "}"
      : "=r"(__ret), "=r"(__pred1)
      : "r"(__data1), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
  }
  __pred = static_cast<bool>(__pred1);
  // bit_cast back to the caller's type (bit_cast to uint32_t here would fail
  // to compile for non-integral 4-byte types such as float).
  return _CUDA_VSTD::bit_cast<_Tp>(__ret);
}

# endif // __cccl_ptx_isa >= 600

_LIBCUDACXX_END_NAMESPACE_CUDA_PTX

#endif // _CCCL_STD_VER >= 2017
#endif // _CUDA_PTX_SHFL_SYNC_H
20 changes: 20 additions & 0 deletions libcudacxx/include/cuda/__ptx/ptx_dot_variants.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,26 @@ enum class dot_scope
sys
};

// Tag enumeration for the shfl.sync .mode qualifier
// (PTX ISA: .mode = { .up, .down, .bfly, .idx }).
enum class dot_shfl_mode
{
up,
down,
bfly,
idx
};

// Compile-time tag type carrying the shuffle mode, so shfl_sync() can select
// the matching asm block with `if constexpr` on an ordinary value parameter.
template <dot_shfl_mode __mode>
using shfl_mode_t = _CUDA_VSTD::integral_constant<dot_shfl_mode, __mode>;
using shfl_mode_up_t = shfl_mode_t<dot_shfl_mode::up>;
using shfl_mode_down_t = shfl_mode_t<dot_shfl_mode::down>;
using shfl_mode_bfly_t = shfl_mode_t<dot_shfl_mode::bfly>;
using shfl_mode_idx_t = shfl_mode_t<dot_shfl_mode::idx>;

// Tag values users pass as the first argument of cuda::ptx::shfl_sync().
static constexpr shfl_mode_up_t shfl_mode_up{};
static constexpr shfl_mode_down_t shfl_mode_down{};
static constexpr shfl_mode_bfly_t shfl_mode_bfly{};
static constexpr shfl_mode_idx_t shfl_mode_idx{};

enum class dot_op
{
add,
Expand Down
1 change: 1 addition & 0 deletions libcudacxx/include/cuda/ptx
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@
#include <cuda/__ptx/instructions/mbarrier_init.h>
#include <cuda/__ptx/instructions/mbarrier_wait.h>
#include <cuda/__ptx/instructions/red_async.h>
#include <cuda/__ptx/instructions/shfl_sync.h>
#include <cuda/__ptx/instructions/st_async.h>
#include <cuda/__ptx/instructions/tensormap_cp_fenceproxy.h>
#include <cuda/__ptx/instructions/tensormap_replace.h>
Expand Down
106 changes: 106 additions & 0 deletions libcudacxx/test/libcudacxx/cuda/ptx/manual/shfl_test.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
// UNSUPPORTED: libcpp-has-no-threads
// UNSUPPORTED: clang && !nvcc

// <cuda/ptx>

// Exercises all four shfl.sync modes with the full-warp member mask.
// Assumes a launch of exactly one 32-thread block (see the test driver).
__host__ __device__ void test_shfl_full_mask()
{
#if __cccl_ptx_isa >= 600 && __CUDA_ARCH__
  constexpr unsigned FullMask = 0xFFFFFFFF;
  auto data = threadIdx.x;
  bool pred1, pred2, pred3, pred4;
  // .idx: every lane reads lane 2.
  auto res1 = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_idx, data, 2 /*idx*/, 0b11111 /*clamp*/, FullMask, pred1);
  _CCCL_ASSERT(res1 == 2 && pred1, "shfl_mode_idx failed");

  // .up: lanes 0-1 are below the shift and keep their own value with pred=false.
  auto res2 = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_up, data, 2 /*offset*/, 0 /*clamp*/, FullMask, pred2);
  if (threadIdx.x <= 1)
  {
    _CCCL_ASSERT(res2 == threadIdx.x && !pred2, "shfl_mode_up failed");
  }
  else
  {
    _CCCL_ASSERT(res2 == threadIdx.x - 2 && pred2, "shfl_mode_up failed");
  }

  // .down: lanes 30-31 fall off the top of the warp and keep their own value.
  auto res3 = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_down, data, 2 /*offset*/, 0b11111 /*clamp*/, FullMask, pred3);
  if (threadIdx.x >= 30)
  {
    _CCCL_ASSERT(res3 == threadIdx.x && !pred3, "shfl_mode_down failed");
  }
  else
  {
    _CCCL_ASSERT(res3 == threadIdx.x + 2 && pred3, "shfl_mode_down failed");
  }

  // .bfly: butterfly exchange with XOR 2. Parentheses are required: `^` binds
  // looser than `==`, so `res4 == threadIdx.x ^ 2` would compare the wrong thing.
  auto res4 = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_bfly, data, 2 /*offset*/, 0b11111 /*clamp*/, FullMask, pred4);
  _CCCL_ASSERT(res4 == (threadIdx.x ^ 2) && pred4, "shfl_mode_bfly failed");
#endif // __cccl_ptx_isa >= 600
}

// Exercises shfl.sync with a partial member mask: only lanes 0-3 participate.
__host__ __device__ void test_shfl_partial_mask()
{
#if __cccl_ptx_isa >= 600 && __CUDA_ARCH__
  constexpr unsigned PartialMask = 0b1111;
  // Lanes outside the member mask must not reach the shuffle.
  if (threadIdx.x > 3)
  {
    return;
  }
  bool pred1 = false;
  auto data  = threadIdx.x;
  // .idx within the 4-lane group: everyone reads lane 2.
  auto res1 = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_idx, data, 2 /*idx*/, 0b11111 /*clamp*/, PartialMask, pred1);
  _CCCL_ASSERT(res1 == 2 && pred1, "shfl_mode_idx failed");
#endif // __cccl_ptx_isa >= 600
}

// Exercises all four shfl.sync modes with a segment mask that splits the warp
// into two 16-lane segments (segmask bits live at [12:8] of the 'c' operand).
// Assumes a launch of exactly one 32-thread block (see the test driver).
__host__ __device__ void test_shfl_partial_warp()
{
#if __cccl_ptx_isa >= 600 && __CUDA_ARCH__
  constexpr unsigned FullMask = 0xFFFFFFFF;
  unsigned max_lane_mask = 16;
  unsigned clamp         = 0b11111;
  unsigned clamp_segmark = (max_lane_mask << 8) | clamp;
  auto data = threadIdx.x;
  bool pred1, pred2, pred3, pred4;
  // .idx: each 16-lane segment reads lane 2 of its own segment.
  auto res1 = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_idx, data, 2 /*idx*/, clamp_segmark, FullMask, pred1);
  if (threadIdx.x < 16)
  {
    _CCCL_ASSERT(res1 == 2 && pred1, "shfl_mode_idx failed");
  }
  else
  {
    _CCCL_ASSERT(res1 == 16 + 2 && pred1, "shfl_mode_idx failed");
  }

  // .up: the bottom two lanes of each segment keep their value with pred=false.
  auto res2 = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_up, data, 2 /*offset*/, (max_lane_mask << 8), FullMask, pred2);
  if (threadIdx.x <= 1 || threadIdx.x == 16 || threadIdx.x == 17)
  {
    _CCCL_ASSERT(res2 == threadIdx.x && !pred2, "shfl_mode_up failed");
  }
  else
  {
    _CCCL_ASSERT(res2 == threadIdx.x - 2 && pred2, "shfl_mode_up failed");
  }

  // .down: the top two lanes of each segment keep their value with pred=false.
  auto res3 = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_down, data, 2 /*offset*/, clamp_segmark, FullMask, pred3);
  if (threadIdx.x == 14 || threadIdx.x == 15 || threadIdx.x >= 30)
  {
    _CCCL_ASSERT(res3 == threadIdx.x && !pred3, "shfl_mode_down failed");
  }
  else
  {
    _CCCL_ASSERT(res3 == threadIdx.x + 2 && pred3, "shfl_mode_down failed");
  }

  // .bfly: XOR-2 exchange stays within each segment. Parentheses are required:
  // `^` binds looser than `==`.
  auto res4 = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_bfly, data, 2 /*offset*/, clamp_segmark, FullMask, pred4);
  _CCCL_ASSERT(res4 == (threadIdx.x ^ 2) && pred4, "shfl_mode_bfly failed");
#endif // __cccl_ptx_isa >= 600
}
28 changes: 28 additions & 0 deletions libcudacxx/test/libcudacxx/cuda/ptx/ptx.shfl.compile.pass.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
// UNSUPPORTED: libcpp-has-no-threads
// UNSUPPORTED: clang && !nvcc
// UNSUPPORTED: c++98, c++03, c++11, c++14

// <cuda/ptx>

#include <cuda/ptx>
#include <cuda/std/utility>

#include "manual/shfl_test.h"

// Test driver: the shfl tests assume exactly one full warp, so on the host
// side the harness is told to launch 32 device threads. On the host pass the
// test bodies compile to no-ops (they are guarded by __CUDA_ARCH__).
int main(int, char**)
{
NV_IF_TARGET(NV_IS_HOST, cuda_thread_count = 32;)
test_shfl_full_mask();
test_shfl_partial_mask();
test_shfl_partial_warp();
return 0;
}
Loading