Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Clang] Fix cross-lane scan when given divergent lanes #127703

Merged
merged 1 commit into from
Feb 19, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 49 additions & 25 deletions clang/lib/Headers/gpuintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -150,35 +150,33 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
__builtin_bit_cast(uint64_t, __x), __width));
}

// Gets the sum of all lanes inside the warp or wavefront.
//
// __DO_LANE_SUM(type, suffix) defines
//   type __gpu_lane_sum_<suffix>(uint64_t lane_mask, type x).
// The step starts at half the lane count and is halved every round; in each
// round a lane adds the value shuffled from lane (step + own lane id) to its
// running total. The total held by the first lane is then returned through
// __gpu_read_first_lane_<suffix>.
// NOTE(review): nothing in this body inspects __lane_mask for gaps, so it
// presumably expects a contiguous set of active lanes -- confirm with callers.
#define __DO_LANE_SUM(__type, __suffix) \
_DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_sum_##__suffix( \
uint64_t __lane_mask, __type __x) { \
for (uint32_t __step = __gpu_num_lanes() / 2; __step > 0; __step /= 2) { \
uint32_t __index = __step + __gpu_lane_id(); \
__x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x, \
__gpu_num_lanes()); \
} \
return __gpu_read_first_lane_##__suffix(__lane_mask, __x); \
}
__DO_LANE_SUM(uint32_t, u32); // uint32_t __gpu_lane_sum_u32(m, x)
__DO_LANE_SUM(uint64_t, u64); // uint64_t __gpu_lane_sum_u64(m, x)
__DO_LANE_SUM(float, f32); // float __gpu_lane_sum_f32(m, x)
__DO_LANE_SUM(double, f64); // double __gpu_lane_sum_f64(m, x)
#undef __DO_LANE_SUM

// Gets the accumulator scan of the threads in the warp or wavefront.
//
// __DO_LANE_SCAN(type, bitmask_type, suffix) defines
//   type __gpu_lane_scan_<suffix>(uint64_t lane_mask, type x)
// which returns, in every participating lane, the inclusive running sum of `x`
// over the lanes up to and including that lane. `bitmask_type` is the unsigned
// integer type with the same size as `type`; it is used to mask a shuffled
// value to zero without branching.
//
// Two strategies are used:
//  * If the lane mask has a gap (an inactive lane between two active ones) the
//    active lanes are visited one at a time in ascending order while a running
//    total is carried along.
//  * Otherwise a log-step scan is used: in round `step` each lane adds the
//    value of lane (id - step), masked to zero when id < step.
#define __DO_LANE_SCAN(__type, __bitmask_type, __suffix) \
  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_scan_##__suffix( \
      uint64_t __lane_mask, __type __x) { \
    /* After dropping trailing zeros a gap-free mask has the form 2^k - 1, */ \
    /* so (m & (m + 1)) is non-zero only when the mask contains a gap.     */ \
    uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask); \
    /* Always test the full 64-bit value: going through the u32 variant    */ \
    /* would drop a gap that only shows up in bits 32-63.                  */ \
    bool __divergent = \
        __gpu_read_first_lane_u64(__lane_mask, __first & (__first + 1)); \
    if (__divergent) { \
      __type __accum = 0; \
      /* `__mask &= __mask - 1` clears the lowest set bit each iteration. */ \
      for (uint64_t __mask = __lane_mask; __mask; __mask &= __mask - 1) { \
        uint32_t __index = __builtin_ctzll(__mask); \
        __type __tmp = __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x, \
                                                    __gpu_num_lanes()); \
        __x = __gpu_lane_id() == __index ? __accum + __tmp : __x; \
        __accum += __tmp; \
      } \
    } else { \
      for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
        uint32_t __index = __gpu_lane_id() - __step; \
        /* 1 when a source lane exists; negated below to an all-ones mask. */ \
        __bitmask_type __bitmask = __gpu_lane_id() >= __step; \
        __x += __builtin_bit_cast( \
            __type, \
            -__bitmask & __builtin_bit_cast(__bitmask_type, \
                                            __gpu_shuffle_idx_##__suffix( \
                                                __lane_mask, __index, __x, \
                                                __gpu_num_lanes()))); \
      } \
    } \
    return __x; \
  }
Expand All @@ -188,6 +186,32 @@ __DO_LANE_SCAN(float, uint32_t, f32); // float __gpu_lane_scan_f32(m, x)
__DO_LANE_SCAN(double, uint64_t, f64); // double __gpu_lane_scan_f64(m, x)
#undef __DO_LANE_SCAN

// Gets the sum of all lanes inside the warp or wavefront.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: why do you need to sink this macro?

// __DO_LANE_SUM(type, suffix) defines
//   type __gpu_lane_sum_<suffix>(uint64_t lane_mask, type x)
// returning the sum of `x` over the lanes in `lane_mask`.
//
// If the lane mask has a gap, the sum is taken from the scan: the scan result
// held by the highest active lane (bit 63 - clz(mask)) is the total, so it is
// shuffled out of that lane. Otherwise each lane repeatedly adds the value of
// lane (id + step) for step = 1, 2, 4, ... and the first lane's total is
// returned.
#define __DO_LANE_SUM(__type, __suffix) \
  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_sum_##__suffix( \
      uint64_t __lane_mask, __type __x) { \
    /* After dropping trailing zeros a gap-free mask has the form 2^k - 1, */ \
    /* so (m & (m + 1)) is non-zero only when the mask contains a gap.     */ \
    uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask); \
    /* Always test the full 64-bit value: going through the u32 variant    */ \
    /* would drop a gap that only shows up in bits 32-63.                  */ \
    bool __divergent = \
        __gpu_read_first_lane_u64(__lane_mask, __first & (__first + 1)); \
    if (__divergent) { \
      return __gpu_shuffle_idx_##__suffix( \
          __lane_mask, 63 - __builtin_clzll(__lane_mask), \
          __gpu_lane_scan_##__suffix(__lane_mask, __x), __gpu_num_lanes()); \
    } else { \
      for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
        uint32_t __index = __step + __gpu_lane_id(); \
        __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x, \
                                            __gpu_num_lanes()); \
      } \
      return __gpu_read_first_lane_##__suffix(__lane_mask, __x); \
    } \
  }
__DO_LANE_SUM(uint32_t, u32); // uint32_t __gpu_lane_sum_u32(m, x)
__DO_LANE_SUM(uint64_t, u64); // uint64_t __gpu_lane_sum_u64(m, x)
__DO_LANE_SUM(float, f32);    // float __gpu_lane_sum_f32(m, x)
__DO_LANE_SUM(double, f64);   // double __gpu_lane_sum_f64(m, x)
#undef __DO_LANE_SUM

_Pragma("omp end declare variant");
_Pragma("omp end declare target");

Expand Down
5 changes: 4 additions & 1 deletion clang/lib/Headers/nvptxintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -155,8 +155,11 @@ _DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
// Returns the value of `__x` held by lane `__idx`, or zero when `__idx` does
// not name a lane that is set in `__lane_mask`.
_DEFAULT_FN_ATTRS static __inline__ uint32_t
__gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
                      uint32_t __width) {
  // Mask out inactive lanes to match AMDGPU behavior.
  uint32_t __mask = (uint32_t)__lane_mask;
  // Shifting a 64-bit value by 64 or more is undefined, and callers can pass a
  // wrapped-around index (e.g. lane id minus step), so an out-of-range index is
  // treated as an inactive lane instead of being fed to the shift.
  bool __bitmask = __idx < 64 && ((__lane_mask >> __idx) & 1ull);
  // The shuffle itself is still executed unconditionally by every lane; only
  // its result is masked (-true is all ones, -false is zero).
  return -__bitmask &
         __nvvm_shfl_sync_idx_i32(__mask, __x, __idx,
                                  ((__gpu_num_lanes() - __width) << 8u) | 0x1f);
}

Expand Down
49 changes: 49 additions & 0 deletions libc/test/integration/src/__support/GPU/scan_reduce.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,59 @@ static void test_scan() {
EXPECT_EQ(z, gpu::get_lane_id() % 2 ? gpu::get_lane_id() / 2 + 1 : 0);
}

// Advances the 64-bit generator state stored at `rand_next` with three
// xor-shift steps (right 12, left 25, right 27), writes the new state back, and
// returns the upper 32 bits of that state multiplied by a fixed constant (the
// xorshift64* scheme). A zero state stays zero and always yields zero.
static uint32_t random(uint64_t *rand_next) {
  uint64_t state = *rand_next;
  state = state ^ (state >> 12);
  state = state ^ (state << 25);
  state = state ^ (state >> 27);
  *rand_next = state;

  const uint64_t scrambled = state * 0x2545F4914F6CDD1Dul;
  return static_cast<uint32_t>(scrambled >> 32);
}

// Scan operations can break down under thread divergence, so make sure that
// the function still works when only a random subset of lanes is active. Each
// round, the lanes that take the branch publish a random value to shared
// scratch memory, the first of them computes the expected running sums with a
// plain serial loop, and every lane compares gpu::scan against that reference.
static void test_scan_divergent() {
  static uint32_t input[64] = {0};
  static uint32_t result[64] = {0};
  uint64_t state = gpu::processor_clock() + __gpu_lane_id();

  for (int i = 0; i < 64; ++i) {
    uint64_t lanemask = gpu::get_lane_mask();
    // NOTE(review): random() returns 32 bits, so bits 32-63 of this test are
    // always zero and lanes with an id of 32 or more never take the branch.
    if (random(&state) & (1ull << gpu::get_lane_id())) {
      uint64_t divergent = gpu::get_lane_mask();
      uint32_t value = random(&state) % 256;
      input[gpu::get_lane_id()] = value;
      // The first lane is about to read the slots written by the other lanes
      // above, so synchronize before the reference scan the same way the
      // results are synchronized after it.
      gpu::sync_lane(divergent);

      if (gpu::is_first_lane(divergent)) {
        // Serial inclusive scan over all slots; lanes that skipped the branch
        // contribute the zero left behind by the reset below.
        uint32_t accumulator = 0;
        for (uint32_t lane = 0; lane < gpu::get_lane_size(); ++lane) {
          uint32_t tmp = input[lane];
          result[lane] = tmp + accumulator;
          accumulator += tmp;
        }
      }
      gpu::sync_lane(divergent);

      uint32_t scan = gpu::scan(divergent, value);
      EXPECT_EQ(scan, result[gpu::get_lane_id()]);
    }
    // Reset the scratch input for the next round once per lane group.
    if (gpu::is_first_lane(lanemask))
      __builtin_memset(input, 0, sizeof(input));
    gpu::sync_lane(lanemask);
  }
}

// Test entry point. Threads whose id is at or beyond the lane size do nothing;
// the remaining threads run the reduce, scan, and divergent-scan tests in that
// order. Always returns 0.
TEST_MAIN(int argc, char **argv, char **envp) {
  const bool participates = gpu::get_thread_id() < gpu::get_lane_size();
  if (participates) {
    test_reduce();
    test_scan();
    test_scan_divergent();
  }
  return 0;
}
Loading