release/20.x: [Clang] Add handlers for 'match_any' and 'match_all' to gpuintrin.h
(#127504)
#127704
base: release/20.x
Conversation
…llvm#127504)

Summary: These helpers are very useful but currently absent. They allow the user to get a bitmask representing the matches within the warp. I have made an executive decision to drop the `predicate` return from `match_all`, because it is easily testable with `match_all() == __activemask()`.

(cherry picked from commit 9a584b0)
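For readers wondering about the dropped `predicate`, here is a minimal sketch of the equivalence the summary relies on. It assumes a device-side translation unit where gpuintrin.h is available; `all_lanes_match` is a hypothetical helper for illustration, not part of the patch:

```c
#include <gpuintrin.h>

// Hypothetical helper: reconstructs the `predicate` output that CUDA's
// __match_all_sync() provides. __gpu_match_all_u32() returns the full
// lane mask when every active lane holds the same value and 0 otherwise,
// so comparing its result against the active lane mask yields the predicate.
static inline int all_lanes_match(uint32_t x) {
  uint64_t mask = __gpu_lane_mask();
  return __gpu_match_all_u32(mask, x) == mask;
}
```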
@jhuber6 What do you think about merging this PR to the release branch?
@llvm/pr-subscribers-libc Author: None (llvmbot) Changes: Backport 9a584b0, requested by @jhuber6. Full diff: https://github.com/llvm/llvm-project/pull/127704.diff (5 files affected):
diff --git a/clang/lib/Headers/amdgpuintrin.h b/clang/lib/Headers/amdgpuintrin.h
index 9dad99ffe9439..355e75d0b2d42 100644
--- a/clang/lib/Headers/amdgpuintrin.h
+++ b/clang/lib/Headers/amdgpuintrin.h
@@ -162,6 +162,62 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
((uint64_t)__gpu_shuffle_idx_u32(__lane_mask, __idx, __lo, __width));
}
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
+ uint32_t __match_mask = 0;
+
+ bool __done = 0;
+ while (__gpu_ballot(__lane_mask, !__done)) {
+ if (!__done) {
+ uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
+ if (__first == __x) {
+ __match_mask = __gpu_lane_mask();
+ __done = 1;
+ }
+ }
+ }
+ __gpu_sync_lane(__lane_mask);
+ return __match_mask;
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
+ uint64_t __match_mask = 0;
+
+ bool __done = 0;
+ while (__gpu_ballot(__lane_mask, !__done)) {
+ if (!__done) {
+ uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+ if (__first == __x) {
+ __match_mask = __gpu_lane_mask();
+ __done = 1;
+ }
+ }
+ }
+ __gpu_sync_lane(__lane_mask);
+ return __match_mask;
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
+ uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
+ uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
+ __gpu_sync_lane(__lane_mask);
+ return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
+ uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+ uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
+ __gpu_sync_lane(__lane_mask);
+ return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+}
+
// Returns true if the flat pointer points to AMDGPU 'shared' memory.
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
return __builtin_amdgcn_is_shared((void [[clang::address_space(0)]] *)((
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index 40fa2edebe975..f857a87b5f4c7 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -13,6 +13,10 @@
#error "This file is intended for NVPTX targets or offloading to NVPTX"
#endif
+#ifndef __CUDA_ARCH__
+#define __CUDA_ARCH__ 0
+#endif
+
#include <stdint.h>
#if !defined(__cplusplus)
@@ -168,6 +172,76 @@ __gpu_shuffle_idx_u64(uint64_t __lane_mask, uint32_t __idx, uint64_t __x,
((uint64_t)__gpu_shuffle_idx_u32(__mask, __idx, __lo, __width));
}
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u32(uint64_t __lane_mask, uint32_t __x) {
+ // Newer targets can use the dedicated CUDA support.
+ if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
+ return __nvvm_match_any_sync_i32(__lane_mask, __x);
+
+ uint32_t __match_mask = 0;
+ bool __done = 0;
+ while (__gpu_ballot(__lane_mask, !__done)) {
+ if (!__done) {
+ uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
+ if (__first == __x) {
+ __match_mask = __gpu_lane_mask();
+ __done = 1;
+ }
+ }
+ }
+ return __match_mask;
+}
+
+// Returns a bitmask marking all lanes that have the same value of __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_any_u64(uint64_t __lane_mask, uint64_t __x) {
+ // Newer targets can use the dedicated CUDA support.
+ if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
+ return __nvvm_match_any_sync_i64(__lane_mask, __x);
+
+ uint64_t __match_mask = 0;
+
+ bool __done = 0;
+ while (__gpu_ballot(__lane_mask, !__done)) {
+ if (!__done) {
+ uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+ if (__first == __x) {
+ __match_mask = __gpu_lane_mask();
+ __done = 1;
+ }
+ }
+ }
+ __gpu_sync_lane(__lane_mask);
+ return __match_mask;
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u32(uint64_t __lane_mask, uint32_t __x) {
+ // Newer targets can use the dedicated CUDA support.
+ int predicate;
+ if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
+ return __nvvm_match_all_sync_i32p(__lane_mask, __x, &predicate);
+
+ uint32_t __first = __gpu_read_first_lane_u32(__lane_mask, __x);
+ uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
+ return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+}
+
+// Returns the current lane mask if every lane contains __x.
+_DEFAULT_FN_ATTRS static __inline__ uint64_t
+__gpu_match_all_u64(uint64_t __lane_mask, uint64_t __x) {
+ // Newer targets can use the dedicated CUDA support.
+ int predicate;
+ if (__CUDA_ARCH__ >= 700 || __nvvm_reflect("__CUDA_ARCH") >= 700)
+ return __nvvm_match_all_sync_i64p(__lane_mask, __x, &predicate);
+
+ uint64_t __first = __gpu_read_first_lane_u64(__lane_mask, __x);
+ uint64_t __ballot = __gpu_ballot(__lane_mask, __x == __first);
+ return __ballot == __gpu_lane_mask() ? __gpu_lane_mask() : 0ull;
+}
+
// Returns true if the flat pointer points to CUDA 'shared' memory.
_DEFAULT_FN_ATTRS static __inline__ bool __gpu_is_ptr_local(void *ptr) {
return __nvvm_isspacep_shared(ptr);
diff --git a/libc/src/__support/GPU/utils.h b/libc/src/__support/GPU/utils.h
index 323c003f1ff07..0fd3a6498b865 100644
--- a/libc/src/__support/GPU/utils.h
+++ b/libc/src/__support/GPU/utils.h
@@ -92,6 +92,14 @@ LIBC_INLINE uint32_t shuffle(uint64_t lane_mask, uint32_t idx, uint32_t x,
return __gpu_shuffle_idx_u32(lane_mask, idx, x, width);
}
+LIBC_INLINE uint64_t match_any(uint64_t lane_mask, uint32_t x) {
+ return __gpu_match_any_u32(lane_mask, x);
+}
+
+LIBC_INLINE uint64_t match_all(uint64_t lane_mask, uint32_t x) {
+ return __gpu_match_all_u32(lane_mask, x);
+}
+
[[noreturn]] LIBC_INLINE void end_program() { __gpu_exit(); }
LIBC_INLINE bool is_first_lane(uint64_t lane_mask) {
diff --git a/libc/test/integration/src/__support/GPU/CMakeLists.txt b/libc/test/integration/src/__support/GPU/CMakeLists.txt
index 68bbc3849bc7e..e066830f6cc0d 100644
--- a/libc/test/integration/src/__support/GPU/CMakeLists.txt
+++ b/libc/test/integration/src/__support/GPU/CMakeLists.txt
@@ -18,3 +18,12 @@ add_integration_test(
LOADER_ARGS
--threads 64
)
+
+add_integration_test(
+ match_test
+ SUITE libc-support-gpu-tests
+ SRCS
+ match.cpp
+ LOADER_ARGS
+ --threads 64
+)
diff --git a/libc/test/integration/src/__support/GPU/match.cpp b/libc/test/integration/src/__support/GPU/match.cpp
new file mode 100644
index 0000000000000..0eadb1364eec7
--- /dev/null
+++ b/libc/test/integration/src/__support/GPU/match.cpp
@@ -0,0 +1,35 @@
+//===-- Test for the match operations on the GPU --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/bit.h"
+#include "src/__support/GPU/utils.h"
+#include "test/IntegrationTest/test.h"
+
+using namespace LIBC_NAMESPACE;
+
+// Test to ensure that match any / match all work.
+static void test_match() {
+ uint64_t mask = gpu::get_lane_mask();
+ EXPECT_EQ(1ull << gpu::get_lane_id(),
+ gpu::match_any(mask, gpu::get_lane_id()));
+ EXPECT_EQ(mask, gpu::match_any(mask, 1));
+
+ uint64_t expected = gpu::get_lane_id() < 16 ? 0xffff : 0xffff0000;
+ EXPECT_EQ(expected, gpu::match_any(mask, gpu::get_lane_id() < 16));
+ EXPECT_EQ(mask, gpu::match_all(mask, 1));
+ EXPECT_EQ(0ull, gpu::match_all(mask, gpu::get_lane_id()));
+}
+
+TEST_MAIN(int argc, char **argv, char **envp) {
+ if (gpu::get_thread_id() >= gpu::get_lane_size())
+ return 0;
+
+ test_match();
+
+ return 0;
+}
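As a usage illustration beyond the test above (a sketch, not part of the patch; it assumes `cpp::popcount`-style helpers such as `cpp::countr_zero` from the already-included bit.h header, and `process_key` is a hypothetical name), `match_any` makes it easy to elect one leader lane per distinct key within a warp:

```cpp
#include "src/__support/CPP/bit.h"
#include "src/__support/GPU/utils.h"

using namespace LIBC_NAMESPACE;

// Hypothetical example: group the warp's lanes by `key` and let the
// lowest-numbered lane of each group act on behalf of the whole group.
static void process_key(uint32_t key) {
  uint64_t mask = gpu::get_lane_mask();
  // Bitmask of every lane whose `key` matches this lane's `key`.
  uint64_t peers = gpu::match_any(mask, key);
  // The group leader is the lowest set bit among the matching lanes.
  if (gpu::get_lane_id() == static_cast<uint32_t>(cpp::countr_zero(peers))) {
    // e.g., issue one atomic update covering cpp::popcount(peers) lanes.
  }
}
```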
@llvm/pr-subscribers-backend-amdgpu, @llvm/pr-subscribers-backend-x86, and @llvm/pr-subscribers-clang received the same notification and diff as above.
Approving my own patch feels like a conflict of interest.
@shiltian What do you think about backporting this?
This looks like a new feature, so I'm not sure we need to backport it.
@jhuber6 Why do you want to backport this, and what's the impact if we don't?
Sorry, #127703 is the actually important one and I forgot to cherry-pick it; it fixes a test and incorrect behavior. I figured that if I was backporting that one I could merge this as well, but if that's too much then it's not a big deal.
Backport 9a584b0
Requested by: @jhuber6