Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PTX shfl_sync #3241

Open
wants to merge 16 commits into
base: main
Choose a base branch
from
13 changes: 7 additions & 6 deletions docs/libcudacxx/ptx/instructions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,23 +9,24 @@ PTX Instructions
instructions/barrier_cluster
instructions/cp_async_bulk
instructions/cp_async_bulk_commit_group
instructions/cp_async_bulk_wait_group
instructions/cp_async_bulk_tensor
instructions/cp_async_bulk_wait_group
instructions/cp_reduce_async_bulk
instructions/cp_reduce_async_bulk_tensor
instructions/fence
instructions/getctarank
instructions/mapa
instructions/mbarrier_init
instructions/mbarrier_arrive
instructions/mbarrier_expect_tx
instructions/mbarrier_init
instructions/mbarrier_test_wait
instructions/mbarrier_try_wait
instructions/red_async
instructions/shfl_sync
instructions/special_registers
instructions/st_async
instructions/tensormap_replace
instructions/tensormap_cp_fenceproxy
instructions/special_registers
instructions/tensormap_replace


Instructions by section
Expand Down Expand Up @@ -232,8 +233,8 @@ Instructions by section
- No
* - `shfl <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-deprecated>`__
- No
* - `shfl.s <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync>`__
- No
* - `shfl.sync <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync>`__
- Yes, CCCL 2.9.0 / CUDA 12.9
* - `prmt <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt>`__
- No
* - `ld <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-ld>`__
Expand Down
25 changes: 25 additions & 0 deletions docs/libcudacxx/ptx/instructions/generated/shfl_sync.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@

shfl.sync
^^^^^^^^^

.. code:: cuda

// PTX ISA 6.0
// shfl.sync.mode.b32 d[|p], a, b, c, membermask;
// .mode = { .up, .down, .bfly, .idx };

template<typename T>
[[nodiscard]] __device__ static inline
T shfl_sync(shfl_mode_t shfl_mode,
            T data,
            uint32_t lane_idx_offset,
            uint32_t clamp_segmask,
            uint32_t lane_mask,
            bool& pred) noexcept;

- ``shfl_mode`` is ``shfl_mode_up`` or ``shfl_mode_down`` or ``shfl_mode_bfly`` or ``shfl_mode_idx``
10 changes: 10 additions & 0 deletions docs/libcudacxx/ptx/instructions/shfl_sync.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@

.. _libcudacxx-ptx-instructions-shfl_sync:

shfl.sync
=========

- PTX ISA:
`shfl.sync <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-sync>`__

.. include:: generated/shfl_sync.rst
144 changes: 144 additions & 0 deletions libcudacxx/include/cuda/__ptx/instructions/shfl_sync.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_PTX_SHFL_SYNC_H
#define _CUDA_PTX_SHFL_SYNC_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#if _CCCL_STD_VER >= 2017

# include <cuda/__ptx/instructions/get_sreg.h>
# include <cuda/__ptx/ptx_dot_variants.h>
# include <cuda/std/__bit/bit_cast.h>
# include <cuda/std/__type_traits/is_integral.h>
# include <cuda/std/__type_traits/is_signed.h>
# include <cuda/std/cstdint>

# include <nv/target> // __CUDA_MINIMUM_ARCH__ and friends

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX

# if __cccl_ptx_isa >= 600

template <dot_shfl_mode _ShuffleMode>
_CCCL_DEVICE static inline _CUDA_VSTD::uint32_t __shfl_sync_dst_lane(
  shfl_mode_t<_ShuffleMode> __shfl_mode,
  _CUDA_VSTD::uint32_t __lane_idx_offset,
  _CUDA_VSTD::uint32_t __clamp_segmask,
  _CUDA_VSTD::uint32_t __lane_mask)
{
  // Mirror of the PTX ISA pseudo-code for shfl.sync: computes the lane the
  // calling thread reads from and returns it as a one-hot 32-bit lane mask.
  // Used only by _CCCL_ASSERT in shfl_sync() to validate that the source lane
  // is a member of the caller-provided member mask (__lane_mask is the member
  // mask the caller will AND against the result).
  // Per the PTX ISA, c[4:0] is the clamp value and c[12:8] the segment mask.
  auto __lane     = get_sreg_laneid();
  auto __clamp    = __clamp_segmask & 0b11111;
  auto __segmask  = __clamp_segmask >> 8;
  auto __max_lane = (__lane & __segmask) | (__clamp & ~__segmask);
  // Signed arithmetic is deliberate: in .up mode (lane - offset) may go below
  // zero and must compare as out-of-segment rather than wrap around unsigned.
  _CUDA_VSTD::int32_t __j = 0;
  if constexpr (__shfl_mode == shfl_mode_idx)
  {
    // PTX ISA: minLane = lane & segmask (NOT the clamp value).
    auto __min_lane = (__lane & __segmask);
    __j             = static_cast<_CUDA_VSTD::int32_t>(__min_lane | (__lane_idx_offset & ~__segmask));
  }
  else if constexpr (__shfl_mode == shfl_mode_up)
  {
    __j = static_cast<_CUDA_VSTD::int32_t>(__lane) - static_cast<_CUDA_VSTD::int32_t>(__lane_idx_offset);
  }
  else if constexpr (__shfl_mode == shfl_mode_down)
  {
    __j = static_cast<_CUDA_VSTD::int32_t>(__lane + __lane_idx_offset);
  }
  else
  {
    __j = static_cast<_CUDA_VSTD::int32_t>(__lane ^ __lane_idx_offset);
  }
  // .up selects the computed lane when it does not fall below max_lane;
  // every other mode selects it when it does not exceed max_lane. Otherwise
  // the thread reads its own lane.
  auto __max_lane_s = static_cast<_CUDA_VSTD::int32_t>(__max_lane);
  auto __lane_s     = static_cast<_CUDA_VSTD::int32_t>(__lane);
  auto __dst        = (__shfl_mode == shfl_mode_up) ? (__j >= __max_lane_s ? __j : __lane_s) //
                                                    : (__j <= __max_lane_s ? __j : __lane_s);
  return (1u << __dst); // unsigned shift: 1 << 31 would be UB on a signed int
}

// Shuffles 4-byte __data across the lanes named by __lane_mask according to
// the compile-time shuffle mode tag (.up / .down / .bfly / .idx).
// __lane_idx_offset is the source-lane index (.idx) or offset (other modes);
// __clamp_segmask packs the clamp value in bits [4:0] and the segment mask in
// bits [12:8], exactly as the PTX shfl.sync 'c' operand. __pred receives the
// instruction's predicate output (true when the source lane was in-segment).
// Requires PTX ISA 6.0; all threads named in __lane_mask must reach this call.
template <typename _Tp, dot_shfl_mode _ShuffleMode>
_CCCL_NODISCARD _CCCL_DEVICE static inline _Tp shfl_sync(
  shfl_mode_t<_ShuffleMode> __shfl_mode,
  _Tp __data,
  _CUDA_VSTD::uint32_t __lane_idx_offset,
  _CUDA_VSTD::uint32_t __clamp_segmask,
  _CUDA_VSTD::uint32_t __lane_mask,
  bool& __pred) noexcept
{
  static_assert(sizeof(_Tp) == 4, "shfl.sync only accepts 4-byte data types");
  _CCCL_ASSERT(__lane_idx_offset < 32, "the lane index or offset must be less than the warp size");
  _CCCL_ASSERT((__clamp_segmask | 0b1111100011111) == 0b1111100011111,
               "clamp value + segmentation mask must be less or equal than 12 bits");
  _CCCL_ASSERT((__lane_mask & __activemask()) == __lane_mask, "lane mask must be a subset of the active mask");
  _CCCL_ASSERT(__shfl_sync_dst_lane(__shfl_mode, __lane_idx_offset, __clamp_segmask, __lane_mask) & __lane_mask,
               "the destination lane must be a member of the lane mask");
  // shfl.sync operates on b32 registers: round-trip arbitrary 4-byte types
  // through uint32_t with bit_cast.
  auto __data1 = _CUDA_VSTD::bit_cast<_CUDA_VSTD::uint32_t>(__data);
  _CUDA_VSTD::int32_t __pred1;
  _CUDA_VSTD::uint32_t __ret;
  // The .mode qualifier must be a literal in the asm string, so each mode gets
  // its own (otherwise identical) asm block. The predicate output is lowered
  // to an int via selp because inline asm cannot bind a .pred register.
  if constexpr (__shfl_mode == shfl_mode_idx)
  {
    asm volatile(
      "{ \n\t\t"
      ".reg .pred p; \n\t\t"
      "shfl.sync.idx.b32 %0|p, %2, %3, %4, %5; \n\t\t"
      "selp.s32 %1, 1, 0, p; \n\t"
      "}"
      : "=r"(__ret), "=r"(__pred1)
      : "r"(__data1), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
  }
  else if constexpr (__shfl_mode == shfl_mode_up)
  {
    asm volatile(
      "{ \n\t\t"
      ".reg .pred p; \n\t\t"
      "shfl.sync.up.b32 %0|p, %2, %3, %4, %5; \n\t\t"
      "selp.s32 %1, 1, 0, p; \n\t"
      "}"
      : "=r"(__ret), "=r"(__pred1)
      : "r"(__data1), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
  }
  else if constexpr (__shfl_mode == shfl_mode_down)
  {
    asm volatile(
      "{ \n\t\t"
      ".reg .pred p; \n\t\t"
      "shfl.sync.down.b32 %0|p, %2, %3, %4, %5; \n\t\t"
      "selp.s32 %1, 1, 0, p; \n\t"
      "}"
      : "=r"(__ret), "=r"(__pred1)
      : "r"(__data1), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
  }
  else
  {
    asm volatile(
      "{ \n\t\t"
      ".reg .pred p; \n\t\t"
      "shfl.sync.bfly.b32 %0|p, %2, %3, %4, %5; \n\t\t"
      "selp.s32 %1, 1, 0, p; \n\t"
      "}"
      : "=r"(__ret), "=r"(__pred1)
      : "r"(__data1), "r"(__lane_idx_offset), "r"(__clamp_segmask), "r"(__lane_mask));
  }
  __pred = static_cast<bool>(__pred1);
  // bit_cast back to the caller's type (bit_cast to uint32_t here would fail
  // to compile for non-integral 4-byte types such as float).
  return _CUDA_VSTD::bit_cast<_Tp>(__ret);
}

# endif // __cccl_ptx_isa >= 600

_LIBCUDACXX_END_NAMESPACE_CUDA_PTX

#endif // _CCCL_STD_VER >= 2017
#endif // _CUDA_PTX_SHFL_SYNC_H
20 changes: 20 additions & 0 deletions libcudacxx/include/cuda/__ptx/ptx_dot_variants.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,26 @@ enum class dot_scope
sys
};

// Tag enumeration for the shfl.sync .mode qualifier
// (PTX ISA: .mode = { .up, .down, .bfly, .idx }).
enum class dot_shfl_mode
{
up,
down,
bfly,
idx
};

// Compile-time tag type carrying the shuffle mode, so shfl_sync() can select
// the matching asm block with `if constexpr` on an ordinary value parameter.
template <dot_shfl_mode __mode>
using shfl_mode_t = _CUDA_VSTD::integral_constant<dot_shfl_mode, __mode>;
using shfl_mode_up_t = shfl_mode_t<dot_shfl_mode::up>;
using shfl_mode_down_t = shfl_mode_t<dot_shfl_mode::down>;
using shfl_mode_bfly_t = shfl_mode_t<dot_shfl_mode::bfly>;
using shfl_mode_idx_t = shfl_mode_t<dot_shfl_mode::idx>;

// Tag values users pass as the first argument of cuda::ptx::shfl_sync().
static constexpr shfl_mode_up_t shfl_mode_up{};
static constexpr shfl_mode_down_t shfl_mode_down{};
static constexpr shfl_mode_bfly_t shfl_mode_bfly{};
static constexpr shfl_mode_idx_t shfl_mode_idx{};

enum class dot_op
{
add,
Expand Down
1 change: 1 addition & 0 deletions libcudacxx/include/cuda/ptx
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@
#include <cuda/__ptx/instructions/mbarrier_init.h>
#include <cuda/__ptx/instructions/mbarrier_wait.h>
#include <cuda/__ptx/instructions/red_async.h>
#include <cuda/__ptx/instructions/shfl_sync.h>
#include <cuda/__ptx/instructions/st_async.h>
#include <cuda/__ptx/instructions/tensormap_cp_fenceproxy.h>
#include <cuda/__ptx/instructions/tensormap_replace.h>
Expand Down
106 changes: 106 additions & 0 deletions libcudacxx/test/libcudacxx/cuda/ptx/manual/shfl_test.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
// UNSUPPORTED: libcpp-has-no-threads
// UNSUPPORTED: clang && !nvcc

// <cuda/ptx>

// Exercises all four shfl.sync modes with the full-warp member mask.
// Assumes a launch of exactly one 32-thread block (see the test driver).
__host__ __device__ void test_shfl_full_mask()
{
#if __cccl_ptx_isa >= 600 && __CUDA_ARCH__
  constexpr unsigned FullMask = 0xFFFFFFFF;
  auto data = threadIdx.x;
  bool pred1, pred2, pred3, pred4;
  // .idx: every lane reads lane 2.
  auto res1 = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_idx, data, 2 /*idx*/, 0b11111 /*clamp*/, FullMask, pred1);
  _CCCL_ASSERT(res1 == 2 && pred1, "shfl_mode_idx failed");

  // .up: lanes 0-1 are below the shift and keep their own value with pred=false.
  auto res2 = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_up, data, 2 /*offset*/, 0 /*clamp*/, FullMask, pred2);
  if (threadIdx.x <= 1)
  {
    _CCCL_ASSERT(res2 == threadIdx.x && !pred2, "shfl_mode_up failed");
  }
  else
  {
    _CCCL_ASSERT(res2 == threadIdx.x - 2 && pred2, "shfl_mode_up failed");
  }

  // .down: lanes 30-31 fall off the top of the warp and keep their own value.
  auto res3 = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_down, data, 2 /*offset*/, 0b11111 /*clamp*/, FullMask, pred3);
  if (threadIdx.x >= 30)
  {
    _CCCL_ASSERT(res3 == threadIdx.x && !pred3, "shfl_mode_down failed");
  }
  else
  {
    _CCCL_ASSERT(res3 == threadIdx.x + 2 && pred3, "shfl_mode_down failed");
  }

  // .bfly: butterfly exchange with XOR 2. Parentheses are required: `^` binds
  // looser than `==`, so `res4 == threadIdx.x ^ 2` would compare the wrong thing.
  auto res4 = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_bfly, data, 2 /*offset*/, 0b11111 /*clamp*/, FullMask, pred4);
  _CCCL_ASSERT(res4 == (threadIdx.x ^ 2) && pred4, "shfl_mode_bfly failed");
#endif // __cccl_ptx_isa >= 600
}

// Exercises shfl.sync with a partial member mask: only lanes 0-3 participate.
__host__ __device__ void test_shfl_partial_mask()
{
#if __cccl_ptx_isa >= 600 && __CUDA_ARCH__
  constexpr unsigned PartialMask = 0b1111;
  // Lanes outside the member mask must not reach the shuffle.
  if (threadIdx.x > 3)
  {
    return;
  }
  bool pred1 = false;
  auto data  = threadIdx.x;
  // .idx within the 4-lane group: everyone reads lane 2.
  auto res1 = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_idx, data, 2 /*idx*/, 0b11111 /*clamp*/, PartialMask, pred1);
  _CCCL_ASSERT(res1 == 2 && pred1, "shfl_mode_idx failed");
#endif // __cccl_ptx_isa >= 600
}

// Exercises all four shfl.sync modes with a segment mask that splits the warp
// into two 16-lane segments (segmask bits live at [12:8] of the 'c' operand).
// Assumes a launch of exactly one 32-thread block (see the test driver).
__host__ __device__ void test_shfl_partial_warp()
{
#if __cccl_ptx_isa >= 600 && __CUDA_ARCH__
  constexpr unsigned FullMask = 0xFFFFFFFF;
  unsigned max_lane_mask = 16;
  unsigned clamp         = 0b11111;
  unsigned clamp_segmark = (max_lane_mask << 8) | clamp;
  auto data = threadIdx.x;
  bool pred1, pred2, pred3, pred4;
  // .idx: each 16-lane segment reads lane 2 of its own segment.
  auto res1 = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_idx, data, 2 /*idx*/, clamp_segmark, FullMask, pred1);
  if (threadIdx.x < 16)
  {
    _CCCL_ASSERT(res1 == 2 && pred1, "shfl_mode_idx failed");
  }
  else
  {
    _CCCL_ASSERT(res1 == 16 + 2 && pred1, "shfl_mode_idx failed");
  }

  // .up: the bottom two lanes of each segment keep their value with pred=false.
  auto res2 = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_up, data, 2 /*offset*/, (max_lane_mask << 8), FullMask, pred2);
  if (threadIdx.x <= 1 || threadIdx.x == 16 || threadIdx.x == 17)
  {
    _CCCL_ASSERT(res2 == threadIdx.x && !pred2, "shfl_mode_up failed");
  }
  else
  {
    _CCCL_ASSERT(res2 == threadIdx.x - 2 && pred2, "shfl_mode_up failed");
  }

  // .down: the top two lanes of each segment keep their value with pred=false.
  auto res3 = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_down, data, 2 /*offset*/, clamp_segmark, FullMask, pred3);
  if (threadIdx.x == 14 || threadIdx.x == 15 || threadIdx.x >= 30)
  {
    _CCCL_ASSERT(res3 == threadIdx.x && !pred3, "shfl_mode_down failed");
  }
  else
  {
    _CCCL_ASSERT(res3 == threadIdx.x + 2 && pred3, "shfl_mode_down failed");
  }

  // .bfly: XOR-2 exchange stays within each segment. Parentheses are required:
  // `^` binds looser than `==`.
  auto res4 = cuda::ptx::shfl_sync(cuda::ptx::shfl_mode_bfly, data, 2 /*offset*/, clamp_segmark, FullMask, pred4);
  _CCCL_ASSERT(res4 == (threadIdx.x ^ 2) && pred4, "shfl_mode_bfly failed");
#endif // __cccl_ptx_isa >= 600
}
28 changes: 28 additions & 0 deletions libcudacxx/test/libcudacxx/cuda/ptx/ptx.shfl.compile.pass.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
// UNSUPPORTED: libcpp-has-no-threads
// UNSUPPORTED: clang && !nvcc
// UNSUPPORTED: c++98, c++03, c++11, c++14

// <cuda/ptx>

#include <cuda/ptx>
#include <cuda/std/utility>

#include "manual/shfl_test.h"

// Test driver: the shfl tests assume exactly one full warp, so on the host
// side the harness is told to launch 32 device threads. On the host pass the
// test bodies compile to no-ops (they are guarded by __CUDA_ARCH__).
int main(int, char**)
{
NV_IF_TARGET(NV_IS_HOST, cuda_thread_count = 32;)
test_shfl_full_mask();
test_shfl_partial_mask();
test_shfl_partial_warp();
return 0;
}
Loading