diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 11c87e85cd497..efdc3d94ac0b3 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -150,35 +150,33 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
                                     __builtin_bit_cast(uint64_t, __x), __width));
 }
 
-// Gets the sum of all lanes inside the warp or wavefront.
-#define __DO_LANE_SUM(__type, __suffix) \
-  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_sum_##__suffix( \
-      uint64_t __lane_mask, __type __x) { \
-    for (uint32_t __step = __gpu_num_lanes() / 2; __step > 0; __step /= 2) { \
-      uint32_t __index = __step + __gpu_lane_id(); \
-      __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x, \
-                                          __gpu_num_lanes()); \
-    } \
-    return __gpu_read_first_lane_##__suffix(__lane_mask, __x); \
-  }
-__DO_LANE_SUM(uint32_t, u32); // uint32_t __gpu_lane_sum_u32(m, x)
-__DO_LANE_SUM(uint64_t, u64); // uint64_t __gpu_lane_sum_u64(m, x)
-__DO_LANE_SUM(float, f32); // float __gpu_lane_sum_f32(m, x)
-__DO_LANE_SUM(double, f64); // double __gpu_lane_sum_f64(m, x)
-#undef __DO_LANE_SUM
-
 // Gets the accumulator scan of the threads in the warp or wavefront.
 #define __DO_LANE_SCAN(__type, __bitmask_type, __suffix) \
   _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_scan_##__suffix( \
       uint64_t __lane_mask, uint32_t __x) { \
-    for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
-      uint32_t __index = __gpu_lane_id() - __step; \
-      __bitmask_type bitmask = __gpu_lane_id() >= __step; \
-      __x += __builtin_bit_cast( \
-          __type, -bitmask & __builtin_bit_cast(__bitmask_type, \
-                                                __gpu_shuffle_idx_##__suffix( \
-                                                    __lane_mask, __index, __x, \
-                                                    __gpu_num_lanes()))); \
+    uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask); \
+    bool __divergent = __gpu_read_first_lane_##__suffix( \
+        __lane_mask, __first & (__first + 1)); \
+    if (__divergent) { \
+      __type __accum = 0; \
+      for (uint64_t __mask = __lane_mask; __mask; __mask &= __mask - 1) { \
+        __type __index = __builtin_ctzll(__mask); \
+        __type __tmp = __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x, \
+                                                    __gpu_num_lanes()); \
+        __x = __gpu_lane_id() == __index ? __accum + __tmp : __x; \
+        __accum += __tmp; \
+      } \
+    } else { \
+      for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
+        uint32_t __index = __gpu_lane_id() - __step; \
+        __bitmask_type bitmask = __gpu_lane_id() >= __step; \
+        __x += __builtin_bit_cast( \
+            __type, \
+            -bitmask & __builtin_bit_cast(__bitmask_type, \
+                                          __gpu_shuffle_idx_##__suffix( \
+                                              __lane_mask, __index, __x, \
+                                              __gpu_num_lanes()))); \
+      } \
     } \
     return __x; \
   }
@@ -188,6 +186,32 @@ __DO_LANE_SCAN(float, uint32_t, f32); // float __gpu_lane_scan_f32(m, x)
 __DO_LANE_SCAN(double, uint64_t, f64); // double __gpu_lane_scan_f64(m, x)
 #undef __DO_LANE_SCAN
 
+// Gets the sum of all lanes inside the warp or wavefront.
+#define __DO_LANE_SUM(__type, __suffix) \
+  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_sum_##__suffix( \
+      uint64_t __lane_mask, __type __x) { \
+    uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask); \
+    bool __divergent = __gpu_read_first_lane_##__suffix( \
+        __lane_mask, __first & (__first + 1)); \
+    if (__divergent) { \
+      return __gpu_shuffle_idx_##__suffix( \
+          __lane_mask, 63 - __builtin_clzll(__lane_mask), \
+          __gpu_lane_scan_##__suffix(__lane_mask, __x), __gpu_num_lanes()); \
+    } else { \
+      for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
+        uint32_t __index = __step + __gpu_lane_id(); \
+        __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x, \
+                                            __gpu_num_lanes()); \
+      } \
+      return __gpu_read_first_lane_##__suffix(__lane_mask, __x); \
+    } \
+  }
+__DO_LANE_SUM(uint32_t, u32); // uint32_t __gpu_lane_sum_u32(m, x)
+__DO_LANE_SUM(uint64_t, u64); // uint64_t __gpu_lane_sum_u64(m, x)
+__DO_LANE_SUM(float, f32); // float __gpu_lane_sum_f32(m, x)
+__DO_LANE_SUM(double, f64); // double __gpu_lane_sum_f64(m, x)
+#undef __DO_LANE_SUM
+
 _Pragma("omp end declare variant");
 _Pragma("omp end declare target");
 
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index f857a87b5f4c7..29d0adcabc82f 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -155,8 +155,11 @@ _DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
 _DEFAULT_FN_ATTRS static __inline__ uint32_t
 __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
                       uint32_t __width) {
+  // Mask out inactive lanes to match AMDGPU behavior.
   uint32_t __mask = (uint32_t)__lane_mask;
-  return __nvvm_shfl_sync_idx_i32(__mask, __x, __idx,
+  bool __bitmask = (1ull << __idx) & __lane_mask;
+  return -__bitmask &
+         __nvvm_shfl_sync_idx_i32(__mask, __x, __idx,
                                   ((__gpu_num_lanes() - __width) << 8u) | 0x1f);
 }
 
diff --git a/libc/test/integration/src/__support/GPU/scan_reduce.cpp b/libc/test/integration/src/__support/GPU/scan_reduce.cpp
index bc621c3300cbe..1d50e1f99bf31 100644
--- a/libc/test/integration/src/__support/GPU/scan_reduce.cpp
+++ b/libc/test/integration/src/__support/GPU/scan_reduce.cpp
@@ -53,10 +53,59 @@ static void test_scan() {
   EXPECT_EQ(z, gpu::get_lane_id() % 2 ? gpu::get_lane_id() / 2 + 1 : 0);
 }
 
+static uint32_t random(uint64_t *rand_next) {
+  uint64_t x = *rand_next;
+  x ^= x >> 12;
+  x ^= x << 25;
+  x ^= x >> 27;
+  *rand_next = x;
+  return static_cast<uint32_t>((x * 0x2545F4914F6CDD1Dul) >> 32);
+}
+
+// Scan operations can break down under thread divergence, make sure that the
+// function works under some random divergence. We do this by trivially
+// implementing a scan with shared scratch memory and then comparing the
+// results.
+static void test_scan_divergent() {
+  static uint32_t input[64] = {0};
+  static uint32_t result[64] = {0};
+  uint64_t state = gpu::processor_clock() + __gpu_lane_id();
+
+  for (int i = 0; i < 64; ++i) {
+    uint64_t lanemask = gpu::get_lane_mask();
+    if (random(&state) & (1ull << gpu::get_lane_id())) {
+      uint64_t divergent = gpu::get_lane_mask();
+      uint32_t value = random(&state) % 256;
+      input[gpu::get_lane_id()] = value;
+
+      if (gpu::is_first_lane(divergent)) {
+        uint32_t accumulator = 0;
+        for (uint32_t lane = 0; lane < gpu::get_lane_size(); ++lane) {
+          uint32_t tmp = input[lane];
+          result[lane] = tmp + accumulator;
+          accumulator += tmp;
+        }
+      }
+      gpu::sync_lane(divergent);
+
+      uint32_t scan = gpu::scan(divergent, value);
+      EXPECT_EQ(scan, result[gpu::get_lane_id()]);
+    }
+    if (gpu::is_first_lane(lanemask))
+      __builtin_memset(input, 0, sizeof(input));
+    gpu::sync_lane(lanemask);
+  }
+}
+
 TEST_MAIN(int argc, char **argv, char **envp) {
+  if (gpu::get_thread_id() >= gpu::get_lane_size())
+    return 0;
+
   test_reduce();
   test_scan();
+  test_scan_divergent();
+
   return 0;
 }
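
The divergence check in the new __gpu_lane_scan_* works by shifting the lane mask down by its trailing zeros; the active lanes are contiguous exactly when the result has no gaps, which is what the __first & (__first + 1) test detects. When they are not contiguous, the scan walks the set bits of the mask in ascending lane order and hands each active lane the running total. Below is a minimal host-side sketch of that path, with the warp modeled as an array and the shuffle replaced by an array read; the helper names (is_divergent, lane_scan_model) are illustrative and not part of the header.

#include <cstdint>
#include <cstdio>

// True when the set bits of a non-zero lane mask are NOT contiguous,
// mirroring the "__first & (__first + 1)" test in __gpu_lane_scan_*.
static bool is_divergent(uint64_t lane_mask) {
  uint64_t first = lane_mask >> __builtin_ctzll(lane_mask);
  return first & (first + 1);
}

// Host-side model of the divergent scan path: visit the active lanes in
// ascending order and give each one the inclusive prefix sum so far.
static void lane_scan_model(uint64_t lane_mask, uint32_t x[64]) {
  uint32_t accum = 0;
  for (uint64_t mask = lane_mask; mask; mask &= mask - 1) {
    uint32_t index = __builtin_ctzll(mask); // lowest remaining active lane
    uint32_t tmp = x[index];                // stands in for the shuffle
    x[index] = accum + tmp;                 // inclusive result for that lane
    accum += tmp;
  }
}

int main() {
  uint64_t mask = 0b10110101; // lanes 0, 2, 4, 5, 7 are active (non-contiguous)
  uint32_t x[64] = {1, 0, 2, 0, 3, 4, 0, 5};
  if (is_divergent(mask))
    lane_scan_model(mask, x);
  for (int lane = 0; lane < 8; ++lane)
    printf("lane %d: %u\n", lane, x[lane]); // active lanes hold 1, 3, 6, 10, 15
  return 0;
}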
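
The divergent reduction in __gpu_lane_sum_* is built on top of that scan: after the inclusive scan, the highest active lane (63 - __builtin_clzll(__lane_mask)) holds the total, and shuffling its value out broadcasts the sum to every participating lane. A host-side sketch of the same composition, under the same modeling assumptions as above (lane_sum_model is an illustrative name):

#include <cstdint>
#include <cstdio>

// Model of the divergent __gpu_lane_sum_* path: inclusive scan over the
// active lanes, then broadcast the value held by the highest active lane.
static uint32_t lane_sum_model(uint64_t lane_mask, uint32_t x[64]) {
  uint32_t accum = 0;
  for (uint64_t mask = lane_mask; mask; mask &= mask - 1) {
    uint32_t index = __builtin_ctzll(mask);
    uint32_t tmp = x[index];
    x[index] = accum + tmp; // inclusive scan, as in the scan model
    accum += tmp;
  }
  uint32_t last = 63 - __builtin_clzll(lane_mask); // highest active lane
  return x[last];                                  // stands in for the shuffle
}

int main() {
  uint64_t mask = 0b10110101;
  uint32_t x[64] = {1, 0, 2, 0, 3, 4, 0, 5};
  printf("sum = %u\n", lane_sum_model(mask, x)); // prints "sum = 15"
  return 0;
}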
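
The nvptxintrin.h change makes a shuffle whose source lane is inactive return 0, matching the AMDGPU implementation: -__bitmask is all ones when bit __idx of __lane_mask is set and all zeros otherwise, so the AND either passes the shuffled value through or clears it. The same branch-free select-or-zero idiom in isolation (zero_if_inactive is a hypothetical helper, not part of the header):

#include <cstdint>
#include <cstdio>

// "Shuffled value if the source lane is active, else 0", without a branch:
// -(uint32_t)true == 0xFFFFFFFF and -(uint32_t)false == 0.
static uint32_t zero_if_inactive(uint64_t lane_mask, uint32_t idx,
                                 uint32_t shuffled) {
  bool active = (1ull << idx) & lane_mask;
  return -static_cast<uint32_t>(active) & shuffled;
}

int main() {
  uint64_t lane_mask = 0b1010; // lanes 1 and 3 are active
  printf("%u\n", zero_if_inactive(lane_mask, 1, 42)); // 42: source lane active
  printf("%u\n", zero_if_inactive(lane_mask, 2, 42)); // 0: source lane inactive
  return 0;
}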