Skip to content

Commit

Permalink
[Clang] Add handlers for 'match_any' and 'match_all' to gpuintrin.h (
Browse files Browse the repository at this point in the history
…#127504)

Summary:
These helpers are very useful but currently absent. They allow the user
to get a bitmask representing the matches within the warp. I have made
an executive decision to drop the `predicate` return from `match_all`
because it's easily testable with `match_all() == __activemask()`.
  • Loading branch information
jhuber6 authored Feb 17, 2025
1 parent a7a3568 commit 9a584b0
Show file tree
Hide file tree
Showing 5 changed files with 182 additions and 0 deletions.
56 changes: 56 additions & 0 deletions clang/lib/Headers/amdgpuintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,62 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __lo, __width));
}

// Returns a bitmask marking all lanes that have the same value of __x.
//
// __lane_mask: the lanes taking part in the operation.
// __x:         the 32-bit value each lane contributes.
//
// Each pass broadcasts the value obtained from __gpu_read_first_lane_u32 to the
// lanes that have not matched yet. Lanes holding that value record the mask
// reported by __gpu_lane_mask() inside the branch and then stop participating.
// The loop ends once the ballot reports that no lane is still unmatched.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
  // The result is a lane mask, which is 64 bits wide here; a 32-bit
  // accumulator would silently drop the upper lanes.
  uint64_t __match_mask = 0;

  bool __done = 0;
  while (__gpu_ballot(__lane_mask, !__done)) {
    if (!__done) {
      uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
      if (__first == __x) {
        __match_mask = __gpu_lane_mask();
        __done = 1;
      }
    }
  }
  // Bring the lanes in __lane_mask back together after the divergent loop.
  __gpu_sync_lane(__lane_mask);
  return __match_mask;
}

// Returns a bitmask marking all lanes that have the same value of __x.
//
// __lane_mask: the lanes taking part in the operation.
// __x:         the 64-bit value each lane contributes.
//
// Each pass broadcasts the value obtained from __gpu_read_first_lane_u64 to the
// lanes that have not matched yet. Lanes holding that value record the mask
// reported by __gpu_lane_mask() inside the branch and then stop participating.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
  uint64_t __match_mask = 0;

  bool __done = 0;
  // Keep iterating while at least one lane is still unmatched. The ballot must
  // be taken over '!__done': voting on '__done' yields zero on the very first
  // evaluation, so the loop body would never run and every lane would get 0.
  while (__gpu_ballot(__lane_mask, !__done)) {
    if (!__done) {
      uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
      if (__first == __x) {
        __match_mask = __gpu_lane_mask();
        __done = 1;
      }
    }
  }
  // Bring the lanes in __lane_mask back together after the divergent loop.
  __gpu_sync_lane(__lane_mask);
  return __match_mask;
}

// Returns the current lane mask if every lane contains __x.
//
// __lane_mask: the lanes taking part in the operation.
// __x:         the 32-bit value each lane contributes.
//
// Returns __gpu_lane_mask() when the ballot of lanes whose value equals the
// broadcast first-lane value is identical to __gpu_lane_mask(), otherwise 0.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
  // Use the 32-bit broadcast helper: the value is 32 bits wide, so routing it
  // through the 64-bit variant only adds a widening and a truncation.
  uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
  __gpu_sync_lane(__lane_mask);
  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
}

// Yields the current lane mask when all lanes agree on __x, and zero otherwise.
//
// __lane_mask: the lanes taking part in the operation.
// __x:         the 64-bit value each lane contributes.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
  // Broadcast one lane's value, then vote on which lanes hold the same value.
  uint64_t __leader = __gpu_read_first_lane_u64(__lane_mask, __x);
  uint64_t __agree = __gpu_ballot(__lane_mask, __x == __leader);
  __gpu_sync_lane(__lane_mask);

  // Any mismatch between the vote and the lane mask means "not all equal".
  if (__agree != __gpu_lane_mask())
    return 0ull;
  return __gpu_lane_mask();
}

// Returns true if the flat pointer points to AMDGPU 'shared' memory.
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
return __builtin_amdgcn_is_shared((void [[clang::address_space(0)]] *)((
Expand Down
74 changes: 74 additions & 0 deletions clang/lib/Headers/nvptxintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
#error "This file is intended for NVPTX targets or offloading to NVPTX"
#endif

// Give __CUDA_ARCH__ a value of 0 when it has not been defined already, so the
// '__CUDA_ARCH__ >= 700' checks used later in this header are valid
// expressions.
#ifndef __CUDA_ARCH__
#define __CUDA_ARCH__ 0
#endif

#include <stdint.h>

#if !defined(__cplusplus)
Expand Down Expand Up @@ -168,6 +172,76 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width));
}

// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
// Newer targets can use the dedicated CUDA support.
if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
return __nvvm_match_any_sync_i32(__lane_mask, __x);

uint32_t __match_mask = 0;
bool __done = 0;
while (__gpu_ballot(__lane_mask, !__done)) {
if (!__done) {
uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
if (__first == __x) {
__match_mask = __gpu_lane_mask();
__done = 1;
}
}
}
return __match_mask;
}

// Returns a bitmask marking all lanes that have the same value of __x.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
// Newer targets can use the dedicated CUDA support.
if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
return __nvvm_match_any_sync_i64(__lane_mask, __x);

uint64_t __match_mask = 0;

bool __done = 0;
while (__gpu_ballot(__lane_mask, __done)) {
if (!__done) {
uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
if (__first == __x) {
__match_mask = __gpu_lane_mask();
__done = 1;
}
}
}
__gpu_sync_lane(__lane_mask);
return __match_mask;
}

// Returns the current lane mask if every lane contains __x.
//
// __lane_mask: the lanes taking part in the operation.
// __x:         the 32-bit value each lane contributes.
//
// When the architecture check passes, the dedicated match intrinsic is used and
// its mask result is returned; the predicate output is discarded. Otherwise the
// result is __gpu_lane_mask() when the ballot of lanes whose value equals the
// broadcast first-lane value is identical to __gpu_lane_mask(), and 0 if not.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
  // Newer targets can use the dedicated CUDA support.
  if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700) {
    // Reserved-style name so a user macro cannot collide with this local.
    int __predicate;
    return __nvvm_match_all_sync_i32p(__lane_mask, __x, &__predicate);
  }

  // Use the 32-bit broadcast helper: the value is 32 bits wide, so routing it
  // through the 64-bit variant only adds a widening and a truncation.
  uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
}

// Returns the current lane mask if every lane contains __x.
//
// __lane_mask: the lanes taking part in the operation.
// __x:         the 64-bit value each lane contributes.
//
// When the architecture check passes, the dedicated match intrinsic is used and
// its mask result is returned; the predicate output is discarded. Otherwise the
// result is __gpu_lane_mask() when the ballot of lanes whose value equals the
// broadcast first-lane value is identical to __gpu_lane_mask(), and 0 if not.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
  // Newer targets can use the dedicated CUDA support.
  if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700) {
    // Reserved-style name so a user macro cannot collide with this local.
    int __predicate;
    return __nvvm_match_all_sync_i64p(__lane_mask, __x, &__predicate);
  }

  uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
}

// Returns true if the flat pointer points to CUDA 'shared' memory.
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
return __nvvm_isspacep_shared(ptr);
Expand Down
8 changes: 8 additions & 0 deletions libc/src/__support/GPU/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,14 @@ LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x,
return __gpu_shuffle_idx_u32(lane_mask, idx, x, width);
}

// Thin wrapper that forwards `lane_mask` and the 32-bit value `x` to
// `__gpu_match_any_u32` and hands back the mask it produces.
LIBC_INLINE uint64_t match_any(uint64_t lane_mask, uint32_t x) {
  const uint64_t matching_lanes = __gpu_match_any_u32(lane_mask, x);
  return matching_lanes;
}

// Thin wrapper that forwards `lane_mask` and the 32-bit value `x` to
// `__gpu_match_all_u32` and hands back the mask it produces.
LIBC_INLINE uint64_t match_all(uint64_t lane_mask, uint32_t x) {
  const uint64_t agreeing_lanes = __gpu_match_all_u32(lane_mask, x);
  return agreeing_lanes;
}

// Ends execution by calling `__gpu_exit`; declared as never returning.
[[noreturn]] LIBC_INLINE void end_program() {
  __gpu_exit();
}

LIBC_INLINE bool is_first_lane(uint64_t lane_mask) {
Expand Down
9 changes: 9 additions & 0 deletions libc/test/integration/src/__support/GPU/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,12 @@ add_integration_test(
LOADER_ARGS
--threads 64
)

# Integration test for the GPU match helpers: builds match.cpp as 'match_test'
# in the libc-support-gpu-tests suite and passes '--threads 64' to the loader.
add_integration_test(
match_test
SUITE libc-support-gpu-tests
SRCS
match.cpp
LOADER_ARGS
--threads 64
)
35 changes: 35 additions & 0 deletions libc/test/integration/src/__support/GPU/match.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
//===-- Test for the match operations on the GPU --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "src/__support/CPP/bit.h"
#include "src/__support/GPU/utils.h"
#include "test/IntegrationTest/test.h"

using namespace LIBC_NAMESPACE;

// Test to ensure that match any / match all work.
//
// Checks, using the mask from gpu::get_lane_mask():
//  - a per-lane unique value matches only the calling lane,
//  - a uniform value matches every lane in the mask,
//  - a two-way split at lane 16 yields the two halves of the mask,
//  - match_all returns the mask for a uniform value and 0 for unique values.
static void test_match() {
  uint64_t mask = gpu::get_lane_mask();
  EXPECT_EQ(1ull << gpu::get_lane_id(),
            gpu::match_any(mask, gpu::get_lane_id()));
  EXPECT_EQ(mask, gpu::match_any(mask, 1));

  // Derive both groups from the actual mask instead of hard-coding a 32-lane
  // layout: lanes 0-15 form one group and every remaining lane in the mask
  // forms the other, however many lanes the mask contains.
  const uint64_t low_lanes = 0xffffull;
  const bool in_low_group = gpu::get_lane_id() < 16;
  uint64_t expected = in_low_group ? (mask & low_lanes) : (mask & ~low_lanes);
  EXPECT_EQ(expected, gpu::match_any(mask, in_low_group));
  EXPECT_EQ(mask, gpu::match_all(mask, 1));
  EXPECT_EQ(0ull, gpu::match_all(mask, gpu::get_lane_id()));
}

// Test entry point. Only threads whose id is below gpu::get_lane_size() run
// the match checks; every thread returns 0.
TEST_MAIN(int argc, char **argv, char **envp) {
  const bool participates = gpu::get_thread_id() < gpu::get_lane_size();
  if (participates)
    test_match();

  return 0;
}

0 comments on commit 9a584b0

Please sign in to comment.