diff --git a/clang/lib/Headers/gpuintrin.h b/clang/lib/Headers/gpuintrin.h
index 11c87e85cd497..efdc3d94ac0b3 100644
--- a/clang/lib/Headers/gpuintrin.h
+++ b/clang/lib/Headers/gpuintrin.h
@@ -150,35 +150,33 @@ __gpu_shuffle_idx_f64(uint64_t __lane_mask, uint32_t __idx, double __x,
                                     __builtin_bit_cast(uint64_t, __x), __width));
 }
 
-// Gets the sum of all lanes inside the warp or wavefront.
-#define __DO_LANE_SUM(__type, __suffix) \
-  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_sum_##__suffix( \
-      uint64_t __lane_mask, __type __x) { \
-    for (uint32_t __step = __gpu_num_lanes() / 2; __step > 0; __step /= 2) { \
-      uint32_t __index = __step + __gpu_lane_id(); \
-      __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x, \
-                                          __gpu_num_lanes()); \
-    } \
-    return __gpu_read_first_lane_##__suffix(__lane_mask, __x); \
-  }
-__DO_LANE_SUM(uint32_t, u32); // uint32_t __gpu_lane_sum_u32(m, x)
-__DO_LANE_SUM(uint64_t, u64); // uint64_t __gpu_lane_sum_u64(m, x)
-__DO_LANE_SUM(float, f32); // float __gpu_lane_sum_f32(m, x)
-__DO_LANE_SUM(double, f64); // double __gpu_lane_sum_f64(m, x)
-#undef __DO_LANE_SUM
-
 // Gets the accumulator scan of the threads in the warp or wavefront.
 #define __DO_LANE_SCAN(__type, __bitmask_type, __suffix) \
   _DEFAULT_FN_ATTRS static __inline__ uint32_t __gpu_lane_scan_##__suffix( \
       uint64_t __lane_mask, uint32_t __x) { \
-    for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
-      uint32_t __index = __gpu_lane_id() - __step; \
-      __bitmask_type bitmask = __gpu_lane_id() >= __step; \
-      __x += __builtin_bit_cast( \
-          __type, -bitmask & __builtin_bit_cast(__bitmask_type, \
-                                                __gpu_shuffle_idx_##__suffix( \
-                                                    __lane_mask, __index, __x, \
-                                                    __gpu_num_lanes()))); \
+    uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask); \
+    bool __divergent = __gpu_read_first_lane_##__suffix( \
+        __lane_mask, __first & (__first + 1)); \
+    if (__divergent) { \
+      __type __accum = 0; \
+      for (uint64_t __mask = __lane_mask; __mask; __mask &= __mask - 1) { \
+        __type __index = __builtin_ctzll(__mask); \
+        __type __tmp = __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x, \
+                                                    __gpu_num_lanes()); \
+        __x = __gpu_lane_id() == __index ? __accum + __tmp : __x; \
+        __accum += __tmp; \
+      } \
+    } else { \
+      for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
+        uint32_t __index = __gpu_lane_id() - __step; \
+        __bitmask_type bitmask = __gpu_lane_id() >= __step; \
+        __x += __builtin_bit_cast( \
+            __type, \
+            -bitmask & __builtin_bit_cast(__bitmask_type, \
+                                          __gpu_shuffle_idx_##__suffix( \
+                                              __lane_mask, __index, __x, \
+                                              __gpu_num_lanes()))); \
+      } \
     } \
     return __x; \
   }
@@ -188,6 +186,32 @@ __DO_LANE_SCAN(float, uint32_t, f32); // float __gpu_lane_scan_f32(m, x)
 __DO_LANE_SCAN(double, uint64_t, f64); // double __gpu_lane_scan_f64(m, x)
 #undef __DO_LANE_SCAN
 
+// Gets the sum of all lanes inside the warp or wavefront.
+#define __DO_LANE_SUM(__type, __suffix) \
+  _DEFAULT_FN_ATTRS static __inline__ __type __gpu_lane_sum_##__suffix( \
+      uint64_t __lane_mask, __type __x) { \
+    uint64_t __first = __lane_mask >> __builtin_ctzll(__lane_mask); \
+    bool __divergent = __gpu_read_first_lane_##__suffix( \
+        __lane_mask, __first & (__first + 1)); \
+    if (__divergent) { \
+      return __gpu_shuffle_idx_##__suffix( \
+          __lane_mask, 63 - __builtin_clzll(__lane_mask), \
+          __gpu_lane_scan_##__suffix(__lane_mask, __x), __gpu_num_lanes()); \
+    } else { \
+      for (uint32_t __step = 1; __step < __gpu_num_lanes(); __step *= 2) { \
+        uint32_t __index = __step + __gpu_lane_id(); \
+        __x += __gpu_shuffle_idx_##__suffix(__lane_mask, __index, __x, \
+                                            __gpu_num_lanes()); \
+      } \
+      return __gpu_read_first_lane_##__suffix(__lane_mask, __x); \
+    } \
+  }
+__DO_LANE_SUM(uint32_t, u32); // uint32_t __gpu_lane_sum_u32(m, x)
+__DO_LANE_SUM(uint64_t, u64); // uint64_t __gpu_lane_sum_u64(m, x)
+__DO_LANE_SUM(float, f32); // float __gpu_lane_sum_f32(m, x)
+__DO_LANE_SUM(double, f64); // double __gpu_lane_sum_f64(m, x)
+#undef __DO_LANE_SUM
+
 _Pragma("omp end declare variant");
 _Pragma("omp end declare target");
 
diff --git a/clang/lib/Headers/nvptxintrin.h b/clang/lib/Headers/nvptxintrin.h
index f857a87b5f4c7..29d0adcabc82f 100644
--- a/clang/lib/Headers/nvptxintrin.h
+++ b/clang/lib/Headers/nvptxintrin.h
@@ -155,8 +155,11 @@ _DEFAULT_FN_ATTRS static __inline__ void __gpu_sync_lane(uint64_t __lane_mask) {
 _DEFAULT_FN_ATTRS static __inline__ uint32_t
 __gpu_shuffle_idx_u32(uint64_t __lane_mask, uint32_t __idx, uint32_t __x,
                       uint32_t __width) {
+  // Mask out inactive lanes to match AMDGPU behavior.
   uint32_t __mask = (uint32_t)__lane_mask;
-  return __nvvm_shfl_sync_idx_i32(__mask, __x, __idx,
+  bool __bitmask = (1ull << __idx) & __lane_mask;
+  return -__bitmask &
+         __nvvm_shfl_sync_idx_i32(__mask, __x, __idx,
                                   ((__gpu_num_lanes() - __width) << 8u) | 0x1f);
 }
 
diff --git a/libc/test/integration/src/__support/GPU/scan_reduce.cpp b/libc/test/integration/src/__support/GPU/scan_reduce.cpp
index bc621c3300cbe..1d50e1f99bf31 100644
--- a/libc/test/integration/src/__support/GPU/scan_reduce.cpp
+++ b/libc/test/integration/src/__support/GPU/scan_reduce.cpp
@@ -53,10 +53,59 @@ static void test_scan() {
   EXPECT_EQ(z, gpu::get_lane_id() % 2 ? gpu::get_lane_id() / 2 + 1 : 0);
 }
 
+static uint32_t random(uint64_t *rand_next) {
+  uint64_t x = *rand_next;
+  x ^= x >> 12;
+  x ^= x << 25;
+  x ^= x >> 27;
+  *rand_next = x;
+  return static_cast<uint32_t>((x * 0x2545F4914F6CDD1Dul) >> 32);
+}
+
+// Scan operations can break down under thread divergence, make sure that the
+// function works under some random divergence. We do this by trivially
+// implementing a scan with shared scratch memory and then comparing the
+// results.
+static void test_scan_divergent() {
+  static uint32_t input[64] = {0};
+  static uint32_t result[64] = {0};
+  uint64_t state = gpu::processor_clock() + __gpu_lane_id();
+
+  for (int i = 0; i < 64; ++i) {
+    uint64_t lanemask = gpu::get_lane_mask();
+    if (random(&state) & (1ull << gpu::get_lane_id())) {
+      uint64_t divergent = gpu::get_lane_mask();
+      uint32_t value = random(&state) % 256;
+      input[gpu::get_lane_id()] = value;
+
+      if (gpu::is_first_lane(divergent)) {
+        uint32_t accumulator = 0;
+        for (uint32_t lane = 0; lane < gpu::get_lane_size(); ++lane) {
+          uint32_t tmp = input[lane];
+          result[lane] = tmp + accumulator;
+          accumulator += tmp;
+        }
+      }
+      gpu::sync_lane(divergent);
+
+      uint32_t scan = gpu::scan(divergent, value);
+      EXPECT_EQ(scan, result[gpu::get_lane_id()]);
+    }
+    if (gpu::is_first_lane(lanemask))
+      __builtin_memset(input, 0, sizeof(input));
+    gpu::sync_lane(lanemask);
+  }
+}
+
 TEST_MAIN(int argc, char **argv, char **envp) {
+  if (gpu::get_thread_id() >= gpu::get_lane_size())
+    return 0;
+
   test_reduce();
   test_scan();
+  test_scan_divergent();
+
   return 0;
 }
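
The divergence check in the new __gpu_lane_scan_* works by shifting the lane mask down by its trailing zeros; the active lanes are contiguous exactly when the result has no gaps, which is what the __first & (__first + 1) test detects. When they are not contiguous, the scan walks the set bits of the mask in ascending lane order and hands each active lane the running total. Below is a minimal host-side sketch of that path, with the warp modeled as an array and the shuffle replaced by an array read; the helper names (is_divergent, lane_scan_model) are illustrative and not part of the header.

#include <cstdint>
#include <cstdio>

// True when the set bits of a non-zero lane mask are NOT contiguous,
// mirroring the "__first & (__first + 1)" test in __gpu_lane_scan_*.
static bool is_divergent(uint64_t lane_mask) {
  uint64_t first = lane_mask >> __builtin_ctzll(lane_mask);
  return first & (first + 1);
}

// Host-side model of the divergent scan path: visit the active lanes in
// ascending order and give each one the inclusive prefix sum so far.
static void lane_scan_model(uint64_t lane_mask, uint32_t x[64]) {
  uint32_t accum = 0;
  for (uint64_t mask = lane_mask; mask; mask &= mask - 1) {
    uint32_t index = __builtin_ctzll(mask); // lowest remaining active lane
    uint32_t tmp = x[index];                // stands in for the shuffle
    x[index] = accum + tmp;                 // inclusive result for that lane
    accum += tmp;
  }
}

int main() {
  uint64_t mask = 0b10110101; // lanes 0, 2, 4, 5, 7 are active (non-contiguous)
  uint32_t x[64] = {1, 0, 2, 0, 3, 4, 0, 5};
  if (is_divergent(mask))
    lane_scan_model(mask, x);
  for (int lane = 0; lane < 8; ++lane)
    printf("lane %d: %u\n", lane, x[lane]); // active lanes hold 1, 3, 6, 10, 15
  return 0;
}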
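
The divergent reduction in __gpu_lane_sum_* is built on top of that scan: after the inclusive scan, the highest active lane (63 - __builtin_clzll(__lane_mask)) holds the total, and shuffling its value out broadcasts the sum to every participating lane. A host-side sketch of the same composition, under the same modeling assumptions as above (lane_sum_model is an illustrative name):

#include <cstdint>
#include <cstdio>

// Model of the divergent __gpu_lane_sum_* path: inclusive scan over the
// active lanes, then broadcast the value held by the highest active lane.
static uint32_t lane_sum_model(uint64_t lane_mask, uint32_t x[64]) {
  uint32_t accum = 0;
  for (uint64_t mask = lane_mask; mask; mask &= mask - 1) {
    uint32_t index = __builtin_ctzll(mask);
    uint32_t tmp = x[index];
    x[index] = accum + tmp; // inclusive scan, as in the scan model
    accum += tmp;
  }
  uint32_t last = 63 - __builtin_clzll(lane_mask); // highest active lane
  return x[last];                                  // stands in for the shuffle
}

int main() {
  uint64_t mask = 0b10110101;
  uint32_t x[64] = {1, 0, 2, 0, 3, 4, 0, 5};
  printf("sum = %u\n", lane_sum_model(mask, x)); // prints "sum = 15"
  return 0;
}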
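
The nvptxintrin.h change makes a shuffle whose source lane is inactive return 0, matching the AMDGPU implementation: -__bitmask is all ones when bit __idx of __lane_mask is set and all zeros otherwise, so the AND either passes the shuffled value through or clears it. The same branch-free select-or-zero idiom in isolation (zero_if_inactive is a hypothetical helper, not part of the header):

#include <cstdint>
#include <cstdio>

// "Shuffled value if the source lane is active, else 0", without a branch:
// -(uint32_t)true == 0xFFFFFFFF and -(uint32_t)false == 0.
static uint32_t zero_if_inactive(uint64_t lane_mask, uint32_t idx,
                                 uint32_t shuffled) {
  bool active = (1ull << idx) & lane_mask;
  return -static_cast<uint32_t>(active) & shuffled;
}

int main() {
  uint64_t lane_mask = 0b1010; // lanes 1 and 3 are active
  printf("%u\n", zero_if_inactive(lane_mask, 1, 42)); // 42: source lane active
  printf("%u\n", zero_if_inactive(lane_mask, 2, 42)); // 0: source lane inactive
  return 0;
}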