Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

release/20.x: [Clang] Add handlers for 'match_any' and 'match_all' to gpuintrin.h (#127504) #127704

Open
wants to merge 1 commit into
base: release/20.x
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions clang/lib/Headers/amdgpuintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,62 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __lo, __width));
}

// Returns a bitmask marking all lanes that have the same value of __x.
//
// Lanes are resolved one group per loop iteration: each lane that is not yet
// done compares its __x with the value obtained from
// __gpu_read_first_lane_u32 and, on a match, stores __gpu_lane_mask() as its
// result and stops participating. The loop ends once the ballot of lanes that
// are still not done comes back as zero.
// NOTE(review): this relies on __gpu_read_first_lane_u32 and __gpu_lane_mask
// reflecting only the lanes executing the enclosing branch - confirm against
// their definitions.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
  // Keep the accumulator as wide as the uint64_t return type so that no lane
  // bits above bit 31 are dropped when __gpu_lane_mask() is stored.
  uint64_t __match_mask = 0;

  bool __done = 0;
  while (__gpu_ballot(__lane_mask, !__done)) {
    if (!__done) {
      uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
      if (__first == __x) {
        __match_mask = __gpu_lane_mask();
        __done = 1;
      }
    }
  }
  __gpu_sync_lane(__lane_mask);
  return __match_mask;
}

// Returns a bitmask marking all lanes that have the same value of __x.
//
// Lanes are resolved one group per loop iteration: each lane that is not yet
// done compares its __x with the value obtained from
// __gpu_read_first_lane_u64 and, on a match, stores __gpu_lane_mask() as its
// result and stops participating. The loop ends once the ballot of lanes that
// are still not done comes back as zero.
// NOTE(review): this relies on __gpu_read_first_lane_u64 and __gpu_lane_mask
// reflecting only the lanes executing the enclosing branch - confirm against
// their definitions.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
  uint64_t __match_mask = 0;

  bool __done = 0;
  // Ballot on the lanes that are NOT done yet. Balloting on __done itself
  // yields zero on entry (every lane starts with __done == 0), which would
  // skip the loop entirely and return an empty mask.
  while (__gpu_ballot(__lane_mask, !__done)) {
    if (!__done) {
      uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
      if (__first == __x) {
        __match_mask = __gpu_lane_mask();
        __done = 1;
      }
    }
  }
  __gpu_sync_lane(__lane_mask);
  return __match_mask;
}

// Returns the current lane mask if every lane contains __x.
//
// Every lane compares its own __x with the value obtained from
// __gpu_read_first_lane_u32; the result is __gpu_lane_mask() when the ballot
// of that comparison equals __gpu_lane_mask(), and 0 otherwise.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
  // Use the 32-bit read to match the operand width instead of widening __x
  // through the 64-bit helper and truncating the result again.
  uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
  __gpu_sync_lane(__lane_mask);
  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
}

// Yields __gpu_lane_mask() when all lanes agree on __x, and zero otherwise.
//
// Agreement is determined by balloting each lane's comparison of its own __x
// against the value obtained from __gpu_read_first_lane_u64.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
  const uint64_t __leader = __gpu_read_first_lane_u64(__lane_mask, __x);
  const uint64_t __agree = __gpu_ballot(__lane_mask, __x == __leader);
  __gpu_sync_lane(__lane_mask);

  // Any disagreement leaves the ballot different from the lane mask.
  if (__agree != __gpu_lane_mask())
    return 0ull;
  return __gpu_lane_mask();
}

// Returns true if the flat pointer points to AMDGPU 'shared' memory.
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
return __builtin_amdgcn_is_shared((void [[clang::address_space(0)]] *)((
Expand Down
74 changes: 74 additions & 0 deletions clang/lib/Headers/nvptxintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
#error "This file is intended for NVPTX targets or offloading to NVPTX"
#endif

// Give __CUDA_ARCH__ a numeric default of 0 when it is not already defined so
// that the `__CUDA_ARCH__ >= 700` comparisons used in this header are valid
// expressions.
#ifndef __CUDA_ARCH__
#define __CUDA_ARCH__ 0
#endif

#include <stdint.h>

#if !defined(__cplusplus)
Expand Down Expand Up @@ -168,6 +172,76 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width));
}

// Computes the bitmask of lanes whose value of __x equals the caller's.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
  // Architectures at level 700 and above use the dedicated match intrinsic.
  if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
    return __nvvm_match_any_sync_i32(__lane_mask, __x);

  // Fallback: keep looping while any lane is unresolved. A lane resolves once
  // its __x equals the value obtained from __gpu_read_first_lane_u32.
  uint32_t __group = 0;
  bool __resolved = 0;
  while (__gpu_ballot(__lane_mask, !__resolved)) {
    if (__resolved)
      continue;

    const uint32_t __leader = __gpu_read_first_lane_u32(__lane_mask, __x);
    if (__leader != __x)
      continue;

    __group = __gpu_lane_mask();
    __resolved = 1;
  }
  return __group;
}

// Returns a bitmask marking all lanes that have the same value of __x.
//
// When the architecture level is 700 or above the dedicated match intrinsic
// is used. Otherwise lanes are resolved one group per loop iteration: each
// lane that is not yet done compares its __x with the value obtained from
// __gpu_read_first_lane_u64 and, on a match, stores __gpu_lane_mask() as its
// result and stops participating.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
  // Newer targets can use the dedicated CUDA support.
  if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
    return __nvvm_match_any_sync_i64(__lane_mask, __x);

  uint64_t __match_mask = 0;

  bool __done = 0;
  // Ballot on the lanes that are NOT done yet. Balloting on __done itself
  // yields zero on entry (every lane starts with __done == 0), which would
  // skip the loop entirely and return an empty mask.
  while (__gpu_ballot(__lane_mask, !__done)) {
    if (!__done) {
      uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
      if (__first == __x) {
        __match_mask = __gpu_lane_mask();
        __done = 1;
      }
    }
  }
  __gpu_sync_lane(__lane_mask);
  return __match_mask;
}

// Returns the current lane mask if every lane contains __x.
//
// When the architecture level is 700 or above the dedicated match intrinsic
// is used. Otherwise every lane compares its own __x with the value obtained
// from __gpu_read_first_lane_u32; the result is __gpu_lane_mask() when the
// ballot of that comparison equals __gpu_lane_mask(), and 0 otherwise.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
  // Newer targets can use the dedicated CUDA support.
  // The intrinsic's predicate output is not used here. The local carries a
  // reserved name, like every other local in this header, so a user macro
  // cannot collide with it.
  int __predicate;
  if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
    return __nvvm_match_all_sync_i32p(__lane_mask, __x, &__predicate);

  // Use the 32-bit read to match the operand width instead of widening __x
  // through the 64-bit helper and truncating the result again.
  uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
  uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
  return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
}

// Yields __gpu_lane_mask() when all lanes agree on __x, and zero otherwise.
_DEFAULT_FN_ATTRS static __inline__ uint64_t
__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
  // Architectures at level 700 and above use the dedicated match intrinsic.
  if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700) {
    int __pred; // Receives the intrinsic's predicate output; not used here.
    return __nvvm_match_all_sync_i64p(__lane_mask, __x, &__pred);
  }

  // Fallback: ballot each lane's comparison against the value obtained from
  // __gpu_read_first_lane_u64.
  const uint64_t __leader = __gpu_read_first_lane_u64(__lane_mask, __x);
  const uint64_t __agree = __gpu_ballot(__lane_mask, __x == __leader);
  if (__agree != __gpu_lane_mask())
    return 0ull;
  return __gpu_lane_mask();
}

// Returns true if the flat pointer points to CUDA 'shared' memory.
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
return __nvvm_isspacep_shared(ptr);
Expand Down
8 changes: 8 additions & 0 deletions libc/src/__support/GPU/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,14 @@ LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x,
return __gpu_shuffle_idx_u32(lane_mask, idx, x, width);
}

// Forwards to __gpu_match_any_u32 and returns its bitmask of the lanes that
// hold the same value of x.
LIBC_INLINE uint64_t match_any(uint64_t lane_mask, uint32_t x) {
  const uint64_t matching_lanes = __gpu_match_any_u32(lane_mask, x);
  return matching_lanes;
}

// Forwards to __gpu_match_all_u32 and returns its result: the current lane
// mask if every lane contains x.
LIBC_INLINE uint64_t match_all(uint64_t lane_mask, uint32_t x) {
  const uint64_t agreeing_lanes = __gpu_match_all_u32(lane_mask, x);
  return agreeing_lanes;
}

// Ends the program by calling __gpu_exit(); declared [[noreturn]].
[[noreturn]] LIBC_INLINE void end_program() {
  __gpu_exit();
}

LIBC_INLINE bool is_first_lane(uint64_t lane_mask) {
Expand Down
9 changes: 9 additions & 0 deletions libc/test/integration/src/__support/GPU/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,12 @@ add_integration_test(
LOADER_ARGS
--threads 64
)

# Integration test built from match.cpp in the libc-support-gpu-tests suite.
# The loader is invoked with 64 threads.
add_integration_test(
  match_test
  SUITE libc-support-gpu-tests
  SRCS
    match.cpp
  LOADER_ARGS
    --threads 64
)
35 changes: 35 additions & 0 deletions libc/test/integration/src/__support/GPU/match.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
//===-- Test for the match operations on the GPU --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "src/__support/CPP/bit.h"
#include "src/__support/GPU/utils.h"
#include "test/IntegrationTest/test.h"

using namespace LIBC_NAMESPACE;

// Test to ensure that match any / match all work.
//
// Checks, for the lanes in gpu::get_lane_mask():
//  - a value unique to each lane matches only that lane,
//  - a value shared by every lane matches the whole mask,
//  - a two-valued predicate splits the mask into the lanes below 16 and the
//    remaining lanes,
//  - match_all yields the mask for a uniform value and 0 for differing values.
static void test_match() {
  uint64_t mask = gpu::get_lane_mask();
  EXPECT_EQ(1ull << gpu::get_lane_id(),
            gpu::match_any(mask, gpu::get_lane_id()));
  EXPECT_EQ(mask, gpu::match_any(mask, 1));

  // Derive the two groups from the mask instead of hard-coding 32-lane
  // constants, so the upper group also covers lanes 32-63 when 64 lanes run.
  uint64_t expected =
      gpu::get_lane_id() < 16 ? (mask & 0xffffull) : (mask & ~0xffffull);
  EXPECT_EQ(expected, gpu::match_any(mask, gpu::get_lane_id() < 16));
  EXPECT_EQ(mask, gpu::match_all(mask, 1));
  EXPECT_EQ(0ull, gpu::match_all(mask, gpu::get_lane_id()));
}

// Entry point: runs test_match() only on threads whose id is below
// gpu::get_lane_size(); all other threads return immediately.
TEST_MAIN(int argc, char **argv, char **envp) {
  const bool in_first_group = gpu::get_thread_id() < gpu::get_lane_size();
  if (in_first_group)
    test_match();

  return 0;
}