From 093294af1c751edeed2cce43ba000a5bbabec581 Mon Sep 17 00:00:00 2001
From: Chuang Zhu <chuangz@nvidia.com>
Date: Fri, 4 Aug 2023 09:33:34 +0000
Subject: [PATCH 1/6] Implement weighted_sample with raft_topk

---
 ...ighted_sample_without_replacement_func.cuh | 669 ++++++++++--------
 .../graph_ops/csr_add_self_loop_utils.cu      |   2 +-
 .../graph_sampling_test_utils.cu              |  54 +-
 ...ighted_sample_without_replacement_tests.cu |  16 +-
 ...aph_weighted_sample_without_replacement.py |  26 +-
 5 files changed, 438 insertions(+), 329 deletions(-)
diff --git a/cpp/src/wholegraph_ops/weighted_sample_without_replacement_func.cuh b/cpp/src/wholegraph_ops/weighted_sample_without_replacement_func.cuh
index 22a97fd19..c948d8cc3 100644
--- a/cpp/src/wholegraph_ops/weighted_sample_without_replacement_func.cuh
+++ b/cpp/src/wholegraph_ops/weighted_sample_without_replacement_func.cuh
@@ -15,21 +15,23 @@
  */
 #pragma once
 #include <cub/device/device_radix_sort.cuh>
+#include <driver_types.h>
+#include <raft/matrix/select_k.cuh>
 #include <random>
 #include <thrust/scan.h>
 
+#include "raft/matrix/detail/select_warpsort.cuh"
+#include "raft/util/cuda_dev_essentials.cuh"
+#include "wholememory_ops/output_memory_handle.hpp"
+#include "wholememory_ops/raft_random.cuh"
+#include "wholememory_ops/temp_memory_handle.hpp"
+#include "wholememory_ops/thrust_allocator.hpp"
 #include <raft/util/integer_utils.hpp>
 #include <wholememory/device_reference.cuh>
 #include <wholememory/env_func_ptrs.h>
 #include <wholememory/global_reference.h>
 #include <wholememory/tensor_description.h>
 
-#include "wholememory_ops/output_memory_handle.hpp"
-#include "wholememory_ops/raft_random.cuh"
-#include "wholememory_ops/temp_memory_handle.hpp"
-#include "wholememory_ops/thrust_allocator.hpp"
-
-#include "block_radix_topk.cuh"
 #include "cuda_macros.hpp"
 #include "error.hpp"
 #include "sample_comm.cuh"
@@ -53,39 +55,76 @@ __device__ __forceinline__ float gen_key_from_weight(const WeightType weight, PC
   return logk;
 }
 
+template <typename T, typename IdxT>  // T: value (key) type, IdxT: index type
+__device__ __host__ void set_buf_pointers(T* buf1,  // value buffer of pair 1
+                                          IdxT* idx_buf1,  // index buffer of pair 1
+                                          T* buf2,  // value buffer of pair 2
+                                          IdxT* idx_buf2,  // index buffer of pair 2
+                                          int pass,  // pass number; selects the pairing below
+                                          const T*& in_buf,  // [out] input-side values
+                                          const IdxT*& in_idx_buf,  // [out] input-side indices
+                                          T*& out_buf,  // [out] output-side values
+                                          IdxT*& out_idx_buf)  // [out] output-side indices
+{
+  if (pass == 0) {  // pass 0: only in_buf is set (to buf1); the other three become null
+    in_buf      = buf1;
+    in_idx_buf  = nullptr;
+    out_buf     = nullptr;
+    out_idx_buf = nullptr;
+
+  } else if (pass % 2 == 0) {  // even pass (2, 4, ...): input is pair 2, output is pair 1
+    in_buf      = buf2;
+    in_idx_buf  = idx_buf2;
+    out_buf     = buf1;
+    out_idx_buf = idx_buf1;
+  } else {  // odd pass (1, 3, ...): input is pair 1, output is pair 2
+    in_buf      = buf1;
+    in_idx_buf  = idx_buf1;
+    out_buf     = buf2;
+    out_idx_buf = idx_buf2;
+  }
+}
+
 template <typename IdType,
           typename LocalIdType,
           typename WeightType,
           typename WeightKeyType,
+          typename NeighborIdxType,
           typename WMIdType,
           typename WMOffsetType,
           typename WMWeightType,
           unsigned int BLOCK_SIZE,
-          bool NeedRandom = true,
-          bool Ascending  = false>
-__launch_bounds__(BLOCK_SIZE) __global__ void weighted_sample_without_replacement_large_kernel(
-  wholememory_gref_t wm_csr_row_ptr,
-  wholememory_array_description_t wm_csr_row_ptr_desc,
-  wholememory_gref_t wm_csr_col_ptr,
-  wholememory_array_description_t wm_csr_col_ptr_desc,
-  wholememory_gref_t wm_csr_weight_ptr,
-  wholememory_array_description_t wm_csr_weight_ptr_desc,
-  const IdType* input_nodes,
-  const int input_node_count,
-  const int max_sample_count,
-  unsigned long long random_seed,
-  const int* sample_offset,
-  wholememory_array_description_t sample_offset_desc,
-  const int* target_neighbor_offset,
-  WMIdType* output,
-  int* src_lid,
-  int64_t* out_edge_gid,
-  WeightKeyType* weight_keys_buff)
+          int BitsPerPass,
+          bool NeedRandom = true>
+__launch_bounds__(BLOCK_SIZE) __global__
+  void weighted_sample_without_replacement_large_raft_radix_kernel(
+    wholememory_gref_t wm_csr_row_ptr,
+    wholememory_array_description_t wm_csr_row_ptr_desc,
+    wholememory_gref_t wm_csr_col_ptr,
+    wholememory_array_description_t wm_csr_col_ptr_desc,
+    wholememory_gref_t wm_csr_weight_ptr,
+    wholememory_array_description_t wm_csr_weight_ptr_desc,
+    const IdType* input_nodes,
+    const int input_node_count,
+    const int max_sample_count,
+    unsigned long long random_seed,
+    const int* sample_offset,
+    wholememory_array_description_t sample_offset_desc,
+    const int* target_neighbor_offset,
+    WMIdType* output,
+    LocalIdType* src_lid,
+    int64_t* out_edge_gid,
+    WeightKeyType* weight_keys_buff0,
+    NeighborIdxType* local_idx_buff0,
+    WeightKeyType* weight_keys_buff1,
+    NeighborIdxType* local_idx_buff1,
+    WeightKeyType* weight_keys_out,
+    NeighborIdxType* local_idx_out,
+    const bool select_min = false)
 {
   int input_idx = blockIdx.x;
   if (input_idx >= input_node_count) return;
   int gidx = threadIdx.x + blockIdx.x * BLOCK_SIZE;
-
   wholememory::device_reference<WMOffsetType> csr_row_ptr_gen(wm_csr_row_ptr);
   wholememory::device_reference<WMIdType> csr_col_ptr_gen(wm_csr_col_ptr);
   wholememory::device_reference<WMWeightType> csr_weight_ptr_gen(wm_csr_weight_ptr);
@@ -93,9 +132,7 @@ __launch_bounds__(BLOCK_SIZE) __global__ void weighted_sample_without_replacemen
   int64_t start      = csr_row_ptr_gen[nid];
   int64_t end        = csr_row_ptr_gen[nid + 1];
   int neighbor_count = (int)(end - start);
-
-  WeightKeyType* weight_keys_local_buff = weight_keys_buff + target_neighbor_offset[input_idx];
-  int offset                            = sample_offset[input_idx];
+  int offset         = sample_offset[input_idx];
   if (neighbor_count <= max_sample_count) {
     for (int sample_id = threadIdx.x; sample_id < neighbor_count; sample_id += BLOCK_SIZE) {
       int neighbor_idx           = sample_id;
@@ -110,82 +147,103 @@ __launch_bounds__(BLOCK_SIZE) __global__ void weighted_sample_without_replacemen
   }
 
   PCGenerator rng(random_seed, (uint64_t)gidx, (uint64_t)0);
+  int buff_offset = target_neighbor_offset[input_idx];
+  weight_keys_buff0 += buff_offset;
+  local_idx_buff0 += buff_offset;
+  weight_keys_buff1 += buff_offset;
+  local_idx_buff1 += buff_offset;
+  weight_keys_out += input_idx * max_sample_count;
+  local_idx_out += input_idx * max_sample_count;
+
   for (int id = threadIdx.x; id < neighbor_count; id += BLOCK_SIZE) {
     WeightType thread_weight = csr_weight_ptr_gen[start + id];
-    weight_keys_local_buff[id] =
-      NeedRandom ? static_cast<WeightKeyType>(gen_key_from_weight(thread_weight, rng))
-                 : (static_cast<WeightKeyType>(thread_weight));
+    weight_keys_buff0[id]    = NeedRandom
+                                 ? static_cast<WeightKeyType>(gen_key_from_weight(thread_weight, rng))
+                                 : (static_cast<WeightKeyType>(thread_weight));
+    local_idx_buff0[id]      = id;
   }
 
+  constexpr int num_buckets =
+    raft::matrix::detail::select::radix::impl::calc_num_buckets<BitsPerPass>();
+  __shared__ raft::matrix::detail::select::radix::impl::Counter<WeightKeyType, NeighborIdxType>
+    counter;
+  __shared__ NeighborIdxType histogram[num_buckets];
+  if (threadIdx.x == 0) {
+    counter.k              = max_sample_count;
+    counter.len            = neighbor_count;
+    counter.previous_len   = neighbor_count;
+    counter.kth_value_bits = 0;
+    counter.out_cnt        = 0;
+    counter.out_back_cnt   = 0;
+  }
   __syncthreads();
+  const WeightKeyType* in_buf       = nullptr;
+  const NeighborIdxType* in_idx_buf = nullptr;
+  WeightKeyType* out_buf            = nullptr;
+  NeighborIdxType* out_idx_buf      = nullptr;
+  constexpr int num_passes =
+    raft::matrix::detail::select::radix::impl::calc_num_passes<WeightKeyType, BitsPerPass>();
+  for (int pass = 0; pass < num_passes; ++pass) {
+    set_buf_pointers(weight_keys_buff0,
+                     local_idx_buff0,
+                     weight_keys_buff1,
+                     local_idx_buff1,
+                     pass,
+                     in_buf,
+                     in_idx_buf,
+                     out_buf,
+                     out_idx_buf);
+    NeighborIdxType current_len = counter.len;
+    NeighborIdxType current_k   = counter.k;
+    raft::matrix::detail::select::radix::impl::
+      filter_and_histogram_for_one_block<WeightKeyType, NeighborIdxType, BitsPerPass>(
+        in_buf,
+        in_idx_buf,
+        out_buf,
+        out_idx_buf,
+        weight_keys_out,
+        local_idx_out,
+        &counter,
+        histogram,
+        select_min,
+        pass);
+    __syncthreads();
 
-  WeightKeyType topk_val;
-  bool topk_is_unique;
-
-  using BlockRadixSelectT =
-    std::conditional_t<Ascending,
-                       BlockRadixTopKGlobalMemory<WeightKeyType, BLOCK_SIZE, false>,
-                       BlockRadixTopKGlobalMemory<WeightKeyType, BLOCK_SIZE, true>>;
-  __shared__ typename BlockRadixSelectT::TempStorage share_storage;
-
-  BlockRadixSelectT{share_storage}.radixTopKGetThreshold(
-    weight_keys_local_buff, max_sample_count, neighbor_count, topk_val, topk_is_unique);
-  __shared__ int cnt;
-
-  if (threadIdx.x == 0) { cnt = 0; }
-  __syncthreads();
-
-  for (int i = threadIdx.x; i < max_sample_count; i += BLOCK_SIZE) {
-    if (src_lid) src_lid[offset + i] = (LocalIdType)input_idx;
-  }
+    raft::matrix::detail::select::radix::impl::scan<NeighborIdxType, BitsPerPass, BLOCK_SIZE>(
+      histogram);
+    __syncthreads();
 
-  // We use atomicAdd 1 operations instead of binaryScan to calculate the write
-  // index, since we do not need to keep the relative positions of element.
-
-  if (topk_is_unique) {
-    for (int neighbor_idx = threadIdx.x; neighbor_idx < neighbor_count;
-         neighbor_idx += BLOCK_SIZE) {
-      WeightKeyType key = weight_keys_local_buff[neighbor_idx];
-      bool has_topk     = Ascending ? (key <= topk_val) : (key >= topk_val);
-
-      if (has_topk) {
-        int write_index                = atomicAdd(&cnt, 1);
-        LocalIdType local_original_idx = neighbor_idx;
-        output[offset + write_index]   = csr_col_ptr_gen[start + local_original_idx];
-        if (out_edge_gid)
-          out_edge_gid[offset + write_index] = static_cast<IdType>(start + local_original_idx);
-      }
-    }
-  } else {
-    for (int neighbor_idx = threadIdx.x; neighbor_idx < neighbor_count;
-         neighbor_idx += BLOCK_SIZE) {
-      WeightKeyType key = weight_keys_local_buff[neighbor_idx];
-      bool has_topk     = Ascending ? (key < topk_val) : (key > topk_val);
-
-      if (has_topk) {
-        int write_index                = atomicAdd(&cnt, 1);
-        LocalIdType local_original_idx = neighbor_idx;
-        output[offset + write_index]   = csr_col_ptr_gen[start + local_original_idx];
-        if (out_edge_gid)
-          out_edge_gid[offset + write_index] = static_cast<IdType>(start + local_original_idx);
-      }
-    }
+    raft::matrix::detail::select::radix::impl::
+      choose_bucket<WeightKeyType, NeighborIdxType, BitsPerPass>(
+        &counter, histogram, current_k, pass);
+    if (threadIdx.x == 0) { counter.previous_len = current_len; }
     __syncthreads();
-    for (int neighbor_idx = threadIdx.x; neighbor_idx < neighbor_count;
-         neighbor_idx += BLOCK_SIZE) {
-      WeightKeyType key = weight_keys_local_buff[neighbor_idx];
-      bool has_topk     = (key == topk_val);
-
-      if (has_topk) {
-        int write_index = atomicAdd(&cnt, 1);
-        if (write_index >= max_sample_count) break;
-        LocalIdType local_original_idx = neighbor_idx;
-        output[offset + write_index]   = csr_col_ptr_gen[start + local_original_idx];
-        if (out_edge_gid)
-          out_edge_gid[offset + write_index] = static_cast<IdType>(start + local_original_idx);
-      }
+
+    if (counter.len == counter.k || pass == num_passes - 1) {
+      raft::matrix::detail::select::radix::impl::
+        last_filter<WeightKeyType, NeighborIdxType, BitsPerPass>(
+          pass == 0 ? weight_keys_buff0 : out_buf,
+          pass == 0 ? local_idx_buff0 : out_idx_buf,
+          weight_keys_out,
+          local_idx_out,
+          current_len,
+          max_sample_count,
+          &counter,
+          select_min,
+          pass);
+      break;
     }
   }
+  // The neighbor indices of the selected top-k entries are now in local_idx_out.
+  __syncthreads();
+  for (int sample_id = threadIdx.x; sample_id < max_sample_count; sample_id += BLOCK_SIZE) {
+    int original_neighbor_idx  = local_idx_out[sample_id];
+    IdType gid                 = csr_col_ptr_gen[start + original_neighbor_idx];
+    output[offset + sample_id] = gid;
+    if (src_lid) src_lid[offset + sample_id] = (LocalIdType)input_idx;
+    if (out_edge_gid)
+      out_edge_gid[offset + sample_id] = static_cast<int64_t>(start + original_neighbor_idx);
+  }
 }
 
 template <typename IdType, typename WMOffsetType, bool NeedNeighbor = false>
@@ -216,21 +274,30 @@ __global__ void get_sample_count_and_neighbor_count_without_replacement_kernel(
   }
 }
 
+// Lets queue.store() be called with a null key output, so that only the indices are used.
+struct null_store_t {};  // empty tag type; a null null_store_t* is passed as the key output
+struct null_store_op {   // functor passed as the third argument of queue.store()
+  template <typename Type, typename... UnusedArgs>
+  constexpr auto operator()(const Type& in, UnusedArgs...) const  // ignores all of its arguments
+  {
+    return null_store_t{};
+  }
+};
+
 // A-RES algorithmn
 // https://en.wikipedia.org/wiki/Reservoir_sampling#Algorithm_A-Res
-// max_sample_count should <=(BLOCK_SIZE*ITEMS_PER_THREAD*/4)  otherwise,need to
-// change the template parameters of BlockRadixTopK.
-template <typename IdType,
+template <template <int, bool, typename, typename> class WarpSortClass,
+          int Capacity,
+          typename IdType,
           typename LocalIdType,
           typename WeightType,
+          typename NeighborIdxType,
           typename WMIdType,
           typename WMOffsetType,
           typename WMWeightType,
-          unsigned int ITEMS_PER_THREAD,
-          unsigned int BLOCK_SIZE,
-          bool NeedRandom = true,
-          bool Ascending  = false>
-__launch_bounds__(BLOCK_SIZE) __global__ void weighted_sample_without_replacement_kernel(
+          bool NEED_RANDOM = true,
+          bool ASCENDING   = false>
+__launch_bounds__(256) __global__ void weighted_sample_without_replacement_raft_kernel(
   wholememory_gref_t wm_csr_row_ptr,
   wholememory_array_description_t wm_csr_row_ptr_desc,
   wholememory_gref_t wm_csr_col_ptr,
@@ -244,13 +311,12 @@ __launch_bounds__(BLOCK_SIZE) __global__ void weighted_sample_without_replacemen
   const int* sample_offset,
   wholememory_array_description_t sample_offset_desc,
   WMIdType* output,
-  int* src_lid,
+  LocalIdType* src_lid,
   int64_t* out_edge_gid)
 {
   int input_idx = blockIdx.x;
   if (input_idx >= input_node_count) return;
-  int gidx = threadIdx.x + blockIdx.x * BLOCK_SIZE;
-
+  int gidx = threadIdx.x + blockIdx.x * blockDim.x;
   wholememory::device_reference<WMOffsetType> csr_row_ptr_gen(wm_csr_row_ptr);
   wholememory::device_reference<WMIdType> csr_col_ptr_gen(wm_csr_col_ptr);
   wholememory::device_reference<WMWeightType> csr_weight_ptr_gen(wm_csr_weight_ptr);
@@ -258,86 +324,153 @@ __launch_bounds__(BLOCK_SIZE) __global__ void weighted_sample_without_replacemen
   IdType nid         = input_nodes[input_idx];
   int64_t start      = csr_row_ptr_gen[nid];
   int64_t end        = csr_row_ptr_gen[nid + 1];
-  int neighbor_count = (int)(end - start);
+  int neighbor_count = static_cast<int>(end - start);
   int offset         = sample_offset[input_idx];
   if (neighbor_count <= max_sample_count) {
-    for (int sample_id = threadIdx.x; sample_id < neighbor_count; sample_id += BLOCK_SIZE) {
+    for (int sample_id = threadIdx.x; sample_id < neighbor_count; sample_id += blockDim.x) {
       int neighbor_idx           = sample_id;
       int original_neighbor_idx  = neighbor_idx;
       IdType gid                 = csr_col_ptr_gen[start + original_neighbor_idx];
       output[offset + sample_id] = gid;
-      if (src_lid) src_lid[offset + sample_id] = (LocalIdType)input_idx;
+      if (src_lid) src_lid[offset + sample_id] = input_idx;
       if (out_edge_gid)
         out_edge_gid[offset + sample_id] = static_cast<int64_t>(start + original_neighbor_idx);
     }
     return;
   } else {
-    PCGenerator rng(random_seed, (uint64_t)gidx, (uint64_t)0);
-
-    float weight_keys[ITEMS_PER_THREAD];
-    int neighbor_idxs[ITEMS_PER_THREAD];
-
-    using BlockRadixTopKT =
-      std::conditional_t<Ascending,
-                         BlockRadixTopKRegister<float, BLOCK_SIZE, ITEMS_PER_THREAD, false, int>,
-                         BlockRadixTopKRegister<float, BLOCK_SIZE, ITEMS_PER_THREAD, true, int>>;
-
-    __shared__ typename BlockRadixTopKT::TempStorage sort_tmp_storage;
-
-    const int tx = threadIdx.x;
-#pragma unroll
-    for (int i = 0; i < ITEMS_PER_THREAD; i++) {
-      int idx = BLOCK_SIZE * i + tx;
+    extern __shared__ __align__(256) uint8_t smem_buf_bytes[];
+    using bq_t = raft::matrix::detail::select::warpsort::
+      block_sort<WarpSortClass, Capacity, ASCENDING, WeightType, NeighborIdxType>;
+
+    uint8_t* warp_smem = bq_t::queue_t::mem_required(blockDim.x) > 0 ? smem_buf_bytes : nullptr;
+    bq_t queue(max_sample_count, warp_smem);
+    PCGenerator rng(random_seed, static_cast<uint64_t>(gidx), static_cast<uint64_t>(0));
+    const int per_thread_lim = neighbor_count + raft::laneId();
+    for (int idx = threadIdx.x; idx < per_thread_lim; idx += blockDim.x) {
+      WeightType weight_key =
+        WarpSortClass<Capacity, ASCENDING, WeightType, NeighborIdxType>::kDummy;
       if (idx < neighbor_count) {
         WeightType thread_weight = csr_weight_ptr_gen[start + idx];
-        weight_keys[i] =
-          NeedRandom ? gen_key_from_weight(thread_weight, rng) : (float)thread_weight;
-        neighbor_idxs[i] = idx;
+        weight_key = NEED_RANDOM ? gen_key_from_weight(thread_weight, rng) : thread_weight;
       }
+      queue.add(weight_key, idx);
     }
-    const int valid_count = (neighbor_count < (BLOCK_SIZE * ITEMS_PER_THREAD))
-                              ? neighbor_count
-                              : (BLOCK_SIZE * ITEMS_PER_THREAD);
-    BlockRadixTopKT{sort_tmp_storage}.radixTopKToStriped(
-      weight_keys, neighbor_idxs, max_sample_count, valid_count);
+    queue.done(smem_buf_bytes);
+
     __syncthreads();
-    const int stride = BLOCK_SIZE * ITEMS_PER_THREAD - max_sample_count;
-
-    for (int idx_offset = ITEMS_PER_THREAD * BLOCK_SIZE; idx_offset < neighbor_count;
-         idx_offset += stride) {
-#pragma unroll
-      for (int i = 0; i < ITEMS_PER_THREAD; i++) {
-        int local_idx = BLOCK_SIZE * i + tx - max_sample_count;
-        // [0,BLOCK_SIZE*ITEMS_PER_THREAD-max_sample_count)
-        int target_idx = idx_offset + local_idx;
-        if (local_idx >= 0 && target_idx < neighbor_count) {
-          WeightType thread_weight = csr_weight_ptr_gen[start + target_idx];
-          weight_keys[i] =
-            NeedRandom ? gen_key_from_weight(thread_weight, rng) : (float)thread_weight;
-          neighbor_idxs[i] = target_idx;
-        }
+    NeighborIdxType* smem_topk_idx = reinterpret_cast<NeighborIdxType*>(smem_buf_bytes);
+    queue.store(static_cast<null_store_t*>(nullptr), smem_topk_idx, null_store_op{});
+    __syncthreads();
+    for (int idx = threadIdx.x; idx < max_sample_count; idx += blockDim.x) {
+      NeighborIdxType local_original_idx = static_cast<NeighborIdxType>(smem_topk_idx[idx]);
+      if (src_lid) { src_lid[offset + idx] = static_cast<LocalIdType>(input_idx); }
+      output[offset + idx] = csr_col_ptr_gen[start + local_original_idx];
+      if (out_edge_gid) {
+        out_edge_gid[offset + idx] = static_cast<int64_t>(start + local_original_idx);
       }
-      const int iter_valid_count = ((neighbor_count - idx_offset) >= stride)
-                                     ? (BLOCK_SIZE * ITEMS_PER_THREAD)
-                                     : (max_sample_count + neighbor_count - idx_offset);
-      BlockRadixTopKT{sort_tmp_storage}.radixTopKToStriped(
-        weight_keys, neighbor_idxs, max_sample_count, iter_valid_count);
-      __syncthreads();
     }
-#pragma unroll
-    for (int i = 0; i < ITEMS_PER_THREAD; i++) {
-      int idx = i * BLOCK_SIZE + tx;
-      if (idx < max_sample_count) {
-        if (src_lid) src_lid[offset + idx] = (LocalIdType)input_idx;
-        LocalIdType local_original_idx = neighbor_idxs[i];
-        output[offset + idx]           = csr_col_ptr_gen[start + local_original_idx];
-        if (out_edge_gid)
-          out_edge_gid[offset + idx] = static_cast<int64_t>(start + local_original_idx);
-      }
+  };
+}
+
+template <template <int, bool, typename, typename> class WarpSortClass,
+          int Capacity,  // compile-time capacity; halved recursively below while it is too large
+          typename IdType,
+          typename LocalIdType,
+          typename WeightType,
+          typename NeighborIdxType,
+          typename WMIdType,
+          typename WMOffsetType,
+          typename WMWeightType,
+          bool NEED_RANDOM = true,
+          bool ASCENDING   = false>
+void launch_kernel(wholememory_gref_t wm_csr_row_ptr,  // host-side Capacity dispatch + launch
+                   wholememory_array_description_t wm_csr_row_ptr_desc,
+                   wholememory_gref_t wm_csr_col_ptr,
+                   wholememory_array_description_t wm_csr_col_ptr_desc,
+                   wholememory_gref_t wm_csr_weight_ptr,
+                   wholememory_array_description_t wm_csr_weight_ptr_desc,
+                   const IdType* input_nodes,
+                   const int input_node_count,  // also used as the grid size of the launch
+                   const int max_sample_count,
+                   unsigned long long random_seed,
+                   const int* sample_offset,
+                   wholememory_array_description_t sample_offset_desc,
+                   WMIdType* output,
+                   LocalIdType* src_lid,
+                   int64_t* out_edge_gid,
+                   int block_dim,  // threads per block used for the launch
+                   int smem_size,  // dynamic shared-memory request; raised below if too small
+                   cudaStream_t stream)  // stream the kernel is enqueued on
+{  // NOTE(review): the launch below is not followed by an error check (e.g. cudaGetLastError)
+  const int capacity = raft::bound_by_power_of_two(max_sample_count);  // power-of-two bound
+  if constexpr (Capacity > 8) {  // Capacity <= 8 is the smallest instantiation: no recursion
+    if (capacity < Capacity) {  // bound is below this Capacity: retry with Capacity / 2
+      return launch_kernel<WarpSortClass,
+                           Capacity / 2,
+                           IdType,
+                           LocalIdType,
+                           WeightType,
+                           NeighborIdxType,
+                           WMIdType,
+                           WMOffsetType,
+                           WMWeightType,
+                           NEED_RANDOM,
+                           ASCENDING>(wm_csr_row_ptr,
+                                      wm_csr_row_ptr_desc,
+                                      wm_csr_col_ptr,
+                                      wm_csr_col_ptr_desc,
+                                      wm_csr_weight_ptr,
+                                      wm_csr_weight_ptr_desc,
+                                      input_nodes,
+                                      input_node_count,
+                                      max_sample_count,
+                                      random_seed,
+                                      sample_offset,
+                                      sample_offset_desc,
+                                      output,
+                                      src_lid,
+                                      out_edge_gid,
+                                      block_dim,
+                                      smem_size,
+                                      stream);
     }
   }
+  WHOLEMEMORY_EXPECTS(  // the bound must not exceed the Capacity finally chosen
+    capacity <= Capacity, "Requested max_sample_count is too large (%d)", max_sample_count);
+  smem_size = std::max<int>(  // never launch with less than WarpSortClass::mem_required()
+    smem_size, WarpSortClass<1, true, WeightType, NeighborIdxType>::mem_required(block_dim));
+  weighted_sample_without_replacement_raft_kernel<WarpSortClass,
+                                                  Capacity,
+                                                  IdType,
+                                                  LocalIdType,
+                                                  WeightType,
+                                                  NeighborIdxType,
+                                                  WMIdType,
+                                                  WMOffsetType,
+                                                  WMWeightType,
+                                                  NEED_RANDOM,
+                                                  ASCENDING>
+    <<<input_node_count, block_dim, smem_size, stream>>>(wm_csr_row_ptr,  // one block per node
+                                                         wm_csr_row_ptr_desc,
+                                                         wm_csr_col_ptr,
+                                                         wm_csr_col_ptr_desc,
+                                                         wm_csr_weight_ptr,
+                                                         wm_csr_weight_ptr_desc,
+                                                         input_nodes,
+                                                         input_node_count,
+                                                         max_sample_count,
+                                                         random_seed,
+                                                         sample_offset,
+                                                         sample_offset_desc,
+                                                         output,
+                                                         src_lid,
+                                                         out_edge_gid);
 }
 
+template <int Capacity, bool Ascending, class T, class IdxT>  // <int, bool, typename, typename>
+using WarpSortClassT =  // shorthand for raft's warp_sort_distributed_ext, arguments forwarded
+  raft::matrix::detail::select::warpsort::warp_sort_distributed_ext<Capacity, Ascending, T, IdxT>;
+
 template <typename IdType, typename WMIdType, typename WeightType>
 void wholegraph_csr_weighted_sample_without_replacement_func(
   wholememory_gref_t wm_csr_row_ptr,
@@ -372,12 +505,13 @@ void wholegraph_csr_weighted_sample_without_replacement_func(
                       "output_sample_offset_desc.dtype = %d",
                       output_sample_offset_desc.dtype);
 
-  constexpr int sample_count_threshold = 1024;
-  const bool need_neighbor_count       = max_sample_count > sample_count_threshold;
+  constexpr int sample_count_threshold = raft::matrix::detail::select::warpsort::kMaxCapacity;
+
+  const bool need_neighbor_count = max_sample_count > sample_count_threshold;
 
   wholememory_ops::temp_memory_handle gen_buffer_tmh(p_env_fns);
   int* tmp_sample_count_mem_pointer =
-    (int*)gen_buffer_tmh.device_malloc(center_node_count + 1, WHOLEMEMORY_DT_INT);
+    static_cast<int*>(gen_buffer_tmh.device_malloc(center_node_count + 1, WHOLEMEMORY_DT_INT));
 
   wholememory_ops::temp_memory_handle gen_neighbor_count_buffer_tmh(p_env_fns);
   int* tmp_neighbor_counts_mem_pointer = nullptr;
@@ -386,12 +520,12 @@ void wholegraph_csr_weighted_sample_without_replacement_func(
   int block_count = raft::div_rounding_up_safe<int>(center_node_count, thread_x);
 
   if (need_neighbor_count) {
-    tmp_neighbor_counts_mem_pointer =
-      (int*)gen_neighbor_count_buffer_tmh.device_malloc(center_node_count + 1, WHOLEMEMORY_DT_INT);
+    tmp_neighbor_counts_mem_pointer = static_cast<int*>(
+      gen_neighbor_count_buffer_tmh.device_malloc(center_node_count + 1, WHOLEMEMORY_DT_INT));
     get_sample_count_and_neighbor_count_without_replacement_kernel<IdType, int64_t, true>
       <<<block_count, thread_x, 0, stream>>>(wm_csr_row_ptr,
                                              wm_csr_row_ptr_desc,
-                                             (const IdType*)center_nodes,
+                                             static_cast<const IdType*>(center_nodes),
                                              center_node_count,
                                              tmp_sample_count_mem_pointer,
                                              tmp_neighbor_counts_mem_pointer,
@@ -400,7 +534,7 @@ void wholegraph_csr_weighted_sample_without_replacement_func(
     get_sample_count_and_neighbor_count_without_replacement_kernel<IdType, int64_t, false>
       <<<block_count, thread_x, 0, stream>>>(wm_csr_row_ptr,
                                              wm_csr_row_ptr_desc,
-                                             (const IdType*)center_nodes,
+                                             static_cast<const IdType*>(center_nodes),
                                              center_node_count,
                                              tmp_sample_count_mem_pointer,
                                              tmp_neighbor_counts_mem_pointer,
@@ -412,7 +546,7 @@ void wholegraph_csr_weighted_sample_without_replacement_func(
   thrust::exclusive_scan(thrust::cuda::par(thrust_allocator).on(stream),
                          tmp_sample_count_mem_pointer,
                          tmp_sample_count_mem_pointer + center_node_count + 1,
-                         (int*)output_sample_offset);
+                         static_cast<int*>(output_sample_offset));
 
   int count;
   WM_CUDA_CHECK(cudaMemcpyAsync(&count,
@@ -431,16 +565,16 @@ void wholegraph_csr_weighted_sample_without_replacement_func(
   if (output_center_localid_memory_context) {
     wholememory_ops::output_memory_handle gen_output_center_localid_buffer_mh(
       p_env_fns, output_center_localid_memory_context);
-    output_center_localid_ptr =
-      (int*)gen_output_center_localid_buffer_mh.device_malloc(count, WHOLEMEMORY_DT_INT);
+    output_center_localid_ptr = static_cast<int*>(
+      gen_output_center_localid_buffer_mh.device_malloc(count, WHOLEMEMORY_DT_INT));
   }
 
   int64_t* output_edge_gid_ptr = nullptr;
   if (output_edge_gid_memory_context) {
     wholememory_ops::output_memory_handle gen_output_edge_gid_buffer_mh(
       p_env_fns, output_edge_gid_memory_context);
-    output_edge_gid_ptr =
-      (int64_t*)gen_output_edge_gid_buffer_mh.device_malloc(count, WHOLEMEMORY_DT_INT64);
+    output_edge_gid_ptr = static_cast<int64_t*>(
+      gen_output_edge_gid_buffer_mh.device_malloc(count, WHOLEMEMORY_DT_INT64));
   }
 
   if (max_sample_count > sample_count_threshold) {
@@ -458,19 +592,42 @@ void wholegraph_csr_weighted_sample_without_replacement_func(
                                   stream));
     WM_CUDA_CHECK(cudaStreamSynchronize(stream));
 
-    wholememory_ops::temp_memory_handle gen_weights_buffer_tmh(p_env_fns);
-    WeightType* tmp_weights_buffer_mem_pointer = (WeightType*)gen_weights_buffer_tmh.device_malloc(
-      target_neighbor_counts, wm_csr_weight_ptr_desc.dtype);
-
-    constexpr int BLOCK_SIZE = 256;
-    weighted_sample_without_replacement_large_kernel<IdType,
-                                                     int,
-                                                     WeightType,
-                                                     WeightType,
-                                                     WMIdType,
-                                                     int64_t,
-                                                     WeightType,
-                                                     BLOCK_SIZE>
+    wholememory_ops::temp_memory_handle gen_weights_buffer0_tmh(p_env_fns);
+    WeightType* tmp_weights_buffer0_mem_pointer =
+      (WeightType*)gen_weights_buffer0_tmh.device_malloc(target_neighbor_counts,
+                                                         wm_csr_weight_ptr_desc.dtype);
+    wholememory_ops::temp_memory_handle gen_weights_buffer1_tmh(p_env_fns);
+    WeightType* tmp_weights_buffer1_mem_pointer =
+      (WeightType*)gen_weights_buffer1_tmh.device_malloc(target_neighbor_counts,
+                                                         wm_csr_weight_ptr_desc.dtype);
+    wholememory_ops::temp_memory_handle gen_weights_buffer_out_tmh(p_env_fns);
+    WeightType* tmp_weights_buffer_out_mem_pointer =
+      (WeightType*)gen_weights_buffer_out_tmh.device_malloc(center_node_count * max_sample_count,
+                                                            wm_csr_weight_ptr_desc.dtype);
+
+    auto local_idx_dtype = wholememory_dtype_t::WHOLEMEMORY_DT_INT;
+    wholememory_ops::temp_memory_handle local_idx_buffer0_tmh(p_env_fns);
+    int* local_idx_buffer0_mem_pointer = static_cast<int*>(
+      local_idx_buffer0_tmh.device_malloc(target_neighbor_counts, local_idx_dtype));
+    wholememory_ops::temp_memory_handle local_idx_buffer1_tmh(p_env_fns);
+    int* local_idx_buffer1_mem_pointer = static_cast<int*>(
+      local_idx_buffer1_tmh.device_malloc(target_neighbor_counts, local_idx_dtype));
+    wholememory_ops::temp_memory_handle local_idx_buffer_out_tmh(p_env_fns);
+    int* local_idx_buffer_out_mem_pointer =
+      static_cast<int*>(local_idx_buffer_out_tmh.device_malloc(center_node_count * max_sample_count,
+                                                               local_idx_dtype));
+    constexpr int BLOCK_SIZE  = 256;
+    constexpr int BitsPerPass = 8;
+    weighted_sample_without_replacement_large_raft_radix_kernel<IdType,
+                                                                int,
+                                                                WeightType,
+                                                                WeightType,
+                                                                int,
+                                                                WMIdType,
+                                                                int64_t,
+                                                                WeightType,
+                                                                BLOCK_SIZE,
+                                                                BitsPerPass>
       <<<center_node_count, BLOCK_SIZE, 0, stream>>>(wm_csr_row_ptr,
                                                      wm_csr_row_ptr_desc,
                                                      wm_csr_col_ptr,
@@ -487,7 +644,13 @@ void wholegraph_csr_weighted_sample_without_replacement_func(
                                                      (WMIdType*)output_dest_node_ptr,
                                                      (int*)output_center_localid_ptr,
                                                      (int64_t*)output_edge_gid_ptr,
-                                                     tmp_weights_buffer_mem_pointer);
+                                                     tmp_weights_buffer0_mem_pointer,
+                                                     local_idx_buffer0_mem_pointer,
+                                                     tmp_weights_buffer1_mem_pointer,
+                                                     local_idx_buffer1_mem_pointer,
+                                                     tmp_weights_buffer_out_mem_pointer,
+                                                     local_idx_buffer_out_mem_pointer,
+                                                     false);
 
     WM_CUDA_CHECK(cudaGetLastError());
     WM_CUDA_CHECK(cudaStreamSynchronize(stream));
@@ -502,10 +665,10 @@ void wholegraph_csr_weighted_sample_without_replacement_func(
                                              wm_csr_col_ptr_desc,
                                              (const IdType*)center_nodes,
                                              center_node_count,
-                                             (const int*)output_sample_offset,
+                                             static_cast<const int*>(output_sample_offset),
                                              output_sample_offset_desc,
                                              (WMIdType*)output_dest_node_ptr,
-                                             (int*)output_center_localid_ptr,
+                                             output_center_localid_ptr,
                                              (int64_t*)output_edge_gid_ptr);
 
     WM_CUDA_CHECK(cudaGetLastError());
@@ -513,91 +676,41 @@ void wholegraph_csr_weighted_sample_without_replacement_func(
     return;
   }
 
-  using weighted_sample_fun_type = void (*)(wholememory_gref_t,
-                                            wholememory_array_description_t,
-                                            wholememory_gref_t,
-                                            wholememory_array_description_t,
-                                            wholememory_gref_t,
-                                            wholememory_array_description_t,
-                                            const IdType*,
-                                            const int,
-                                            const int,
-                                            unsigned long long,
-                                            const int*,
-                                            wholememory_array_description_t,
-                                            WMIdType*,
-                                            int*,
-                                            int64_t*);
-
-  static const weighted_sample_fun_type func_array[4] = {
-    weighted_sample_without_replacement_kernel<IdType,
-                                               int,
-                                               WeightType,
-                                               WMIdType,
-                                               int64_t,
-                                               WeightType,
-                                               4,
-                                               128>,
-    weighted_sample_without_replacement_kernel<IdType,
-                                               int,
-                                               WeightType,
-                                               WMIdType,
-                                               int64_t,
-                                               WeightType,
-                                               4,
-                                               256>,
-    weighted_sample_without_replacement_kernel<IdType,
-                                               int,
-                                               WeightType,
-                                               WMIdType,
-                                               int64_t,
-                                               WeightType,
-                                               8,
-                                               256>,
-    weighted_sample_without_replacement_kernel<IdType,
-                                               int,
-                                               WeightType,
-                                               WMIdType,
-                                               int64_t,
-                                               WeightType,
-                                               8,
-                                               512>,
-  };
-
-  // 128,256,512,1024
-  // Maximum one-fourth ratio , however it  may not be a good way to choose a
-  // fun.
-
-  const int block_sizes[4] = {128, 256, 256, 512};
-  auto choose_fun_idx      = [](int max_sample_count) {
-    if (max_sample_count <= 128) {
-      // return (max_sample_count - 1) / 32;
-      return 0;
-    }
-    if (max_sample_count <= 256) { return 1; }
-    if (max_sample_count <= 512) { return 2; }
-    return 3;
-  };
-  int func_idx = choose_fun_idx(max_sample_count);
-
-  int block_size = block_sizes[func_idx];
-
-  func_array[func_idx]<<<center_node_count, block_size, 0, stream>>>(
-    wm_csr_row_ptr,
-    wm_csr_row_ptr_desc,
-    wm_csr_col_ptr,
-    wm_csr_col_ptr_desc,
-    wm_csr_weight_ptr,
-    wm_csr_weight_ptr_desc,
-    (const IdType*)center_nodes,
-    center_node_count,
-    max_sample_count,
-    random_seed,
-    (const int*)output_sample_offset,
-    output_sample_offset_desc,
-    (WMIdType*)output_dest_node_ptr,
-    (int*)output_center_localid_ptr,
-    (int64_t*)output_edge_gid_ptr);
+  constexpr int Capacity    = sample_count_threshold;
+  const int capacity        = raft::bound_by_power_of_two(max_sample_count);
+  constexpr int block_dim   = 128;
+  constexpr int num_of_warp = block_dim / raft::WarpSize;
+  int smem_size = raft::matrix::detail::select::warpsort::calc_smem_size_for_block_wide<float, int>(
+    num_of_warp, max_sample_count);
+  smem_size = raft::max(static_cast<int>(max_sample_count * sizeof(int)),
+                        smem_size);  // store values of topk-result
+
+  launch_kernel<WarpSortClassT,
+                Capacity,
+                IdType,
+                int,
+                WeightType,
+                int,
+                WMIdType,
+                int64_t,
+                WeightType>(wm_csr_row_ptr,
+                            wm_csr_row_ptr_desc,
+                            wm_csr_col_ptr,
+                            wm_csr_col_ptr_desc,
+                            wm_csr_weight_ptr,
+                            wm_csr_weight_ptr_desc,
+                            (const IdType*)center_nodes,
+                            center_node_count,
+                            max_sample_count,
+                            random_seed,
+                            static_cast<const int*>(output_sample_offset),
+                            output_sample_offset_desc,
+                            output_dest_node_ptr,
+                            output_center_localid_ptr,
+                            output_edge_gid_ptr,
+                            block_dim,
+                            smem_size,
+                            stream);
 
   WM_CUDA_CHECK(cudaGetLastError());
   WM_CUDA_CHECK(cudaStreamSynchronize(stream));
diff --git a/cpp/tests/graph_ops/csr_add_self_loop_utils.cu b/cpp/tests/graph_ops/csr_add_self_loop_utils.cu
index 77783c081..276d01db4 100644
--- a/cpp/tests/graph_ops/csr_add_self_loop_utils.cu
+++ b/cpp/tests/graph_ops/csr_add_self_loop_utils.cu
@@ -148,7 +148,7 @@ void host_get_csr_add_self_loop(int* host_csr_row_ptr,
 {
   for (int64_t row_id = 0; row_id < csr_row_ptr_array_desc.size - 1; row_id++) {
     int start                                   = host_csr_row_ptr[row_id];
-    int end                                     = host_csr_col_ptr[row_id + 1];
+    int end                                     = host_csr_row_ptr[row_id + 1];
     host_ref_output_csr_row_ptr[row_id]         = start + row_id;
     host_ref_output_csr_col_ptr[start + row_id] = row_id;
     for (int64_t j = start; j < end; j++) {
diff --git a/cpp/tests/wholegraph_ops/graph_sampling_test_utils.cu b/cpp/tests/wholegraph_ops/graph_sampling_test_utils.cu
index 4e60c0aec..54f3ab934 100644
--- a/cpp/tests/wholegraph_ops/graph_sampling_test_utils.cu
+++ b/cpp/tests/wholegraph_ops/graph_sampling_test_utils.cu
@@ -500,7 +500,7 @@ void wholegraph_csr_unweighted_sample_without_replacement_cpu(
 template <typename DataType>
 void check_value_same(void* value, void* ref, int64_t size)
 {
-  int64_t diff_count;
+  int64_t diff_count = 0;
 
   DataType* value_ptr = static_cast<DataType*>(value);
   DataType* ref_ptr   = static_cast<DataType*>(ref);
@@ -593,22 +593,8 @@ void host_weighted_sample_without_replacement(
 
   int64_t center_nodes_count = center_node_desc.size;
 
-  const int block_sizes[4]       = {128, 256, 256, 512};
-  const int items_per_threads[4] = {4, 4, 8, 8};
-  auto choose_fun_idx            = [](int max_sample_count) {
-    if (max_sample_count <= 128) {
-      // return (max_sample_count - 1) / 32;
-      return 0;
-    }
-    if (max_sample_count <= 256) { return 1; }
-    if (max_sample_count <= 512) { return 2; }
-    return 3;
-  };
-  int func_idx = choose_fun_idx(max_sample_count);
-
-  int block_size       = block_sizes[func_idx];
-  int items_per_thread = items_per_threads[func_idx];
-
+  int block_size = 128;
+  if (max_sample_count > 256) { block_size = 256; }
   for (int64_t i = 0; i < center_nodes_count; i++) {
     int output_id          = output_sample_offset_ptr[i];
     int output_local_id    = 0;
@@ -637,25 +623,27 @@ void host_weighted_sample_without_replacement(
       };
       std::priority_queue<std::pair<int, WeightType>, std::vector<std::pair<int, WeightType>>, cmp>
         small_heap;
+
+      auto consume_fun = [&](int id, PCGenerator& rng) {
+        WeightType edge_weight = csr_weight_ptr[start + id];
+        WeightType weight      = host_gen_key_from_weight(edge_weight, rng);
+        process_count++;
+        if (process_count <= max_sample_count) {
+          small_heap.push(std::make_pair(id, weight));
+        } else {
+          std::pair<int, WeightType> small_heap_top_ele = small_heap.top();
+          if (small_heap_top_ele.second < weight) {
+            small_heap.pop();
+            small_heap.push(std::make_pair(id, weight));
+          }
+        }
+      };
+
       for (int j = 0; j < block_size; j++) {
         int local_gidx = gidx + j;
         PCGenerator rng(random_seed, (uint64_t)local_gidx, (uint64_t)0);
-        for (int k = 0; k < items_per_thread; k++) {
-          int id = k * block_size + j;
-          if (id < neighbor_count) {
-            WeightType edge_weight = csr_weight_ptr[start + id];
-            WeightType weight      = host_gen_key_from_weight(edge_weight, rng);
-            process_count++;
-            if (process_count < max_sample_count) {
-              small_heap.push(std::make_pair(id, weight));
-            } else {
-              std::pair<int, WeightType> small_heap_top_ele = small_heap.top();
-              if (small_heap_top_ele.second < weight) {
-                small_heap.pop();
-                small_heap.push(std::make_pair(id, weight));
-              }
-            }
-          }
+        for (int id = j; id < neighbor_count; id += block_size) {
+          if (id < neighbor_count) { consume_fun(id, rng); }
         }
       }
 
diff --git a/cpp/tests/wholegraph_ops/wholegraph_csr_weighted_sample_without_replacement_tests.cu b/cpp/tests/wholegraph_ops/wholegraph_csr_weighted_sample_without_replacement_tests.cu
index bb0828ac9..fa8cd4f10 100644
--- a/cpp/tests/wholegraph_ops/wholegraph_csr_weighted_sample_without_replacement_tests.cu
+++ b/cpp/tests/wholegraph_ops/wholegraph_csr_weighted_sample_without_replacement_tests.cu
@@ -105,7 +105,7 @@ typedef struct WholeGraphCSRWeightedSampleWithoutReplacementTestParam {
 
   wholememory_memory_type_t memory_type                 = WHOLEMEMORY_MT_CHUNKED;
   wholememory_memory_location_t memory_location         = WHOLEMEMORY_ML_DEVICE;
-  int64_t max_sample_count                              = 50;
+  int64_t max_sample_count                              = 10;
   int64_t center_node_count                             = 512;
   int64_t graph_node_count                              = 9703LL;
   int64_t graph_edge_count                              = 104323L;
@@ -369,7 +369,13 @@ TEST_P(WholeGraphCSRWeightedSampleWithoutReplacementParameterTests, WeightedSamp
         random_seed);
 
       EXPECT_EQ(total_sample_count, host_total_sample_count);
-
+      wholegraph_ops::testing::segment_sort_output(
+        host_ref_output_sample_offset,
+        output_sample_offset_desc,
+        host_ref_output_dest_nodes,
+        wholememory_create_array_desc(host_total_sample_count, 0, csr_col_ptr_desc.dtype),
+        host_ref_output_global_edge_id,
+        wholememory_create_array_desc(host_total_sample_count, 0, WHOLEMEMORY_DT_INT64));
       wholegraph_ops::testing::host_check_two_array_same(host_output_sample_offset,
                                                          output_sample_offset_desc,
                                                          host_ref_output_sample_offset,
@@ -440,6 +446,12 @@ INSTANTIATE_TEST_SUITE_P(WholeGraphCSRWeightedSampleWithoutReplacementOpTests,
                                              .set_center_node_count(35)
                                              .set_graph_node_count(23289)
                                              .set_graph_edge_couont(689403),
+                                              WholeGraphCSRWeightedSampleWithoutReplacementTestParam()
+                                             .set_memory_type(WHOLEMEMORY_MT_CONTINUOUS)
+                                             .set_max_sample_count(300)
+                                             .set_center_node_count(256)
+                                             .set_graph_node_count(23200)
+                                             .set_graph_edge_couont(68940300),
                                            WholeGraphCSRWeightedSampleWithoutReplacementTestParam()
                                              .set_memory_type(WHOLEMEMORY_MT_CHUNKED)
                                              .set_center_node_type(WHOLEMEMORY_DT_INT64)));
diff --git a/python/pylibwholegraph/pylibwholegraph/tests/wholegraph_torch/ops/test_wholegraph_weighted_sample_without_replacement.py b/python/pylibwholegraph/pylibwholegraph/tests/wholegraph_torch/ops/test_wholegraph_weighted_sample_without_replacement.py
index 3727829bf..0f53044bd 100644
--- a/python/pylibwholegraph/pylibwholegraph/tests/wholegraph_torch/ops/test_wholegraph_weighted_sample_without_replacement.py
+++ b/python/pylibwholegraph/pylibwholegraph/tests/wholegraph_torch/ops/test_wholegraph_weighted_sample_without_replacement.py
@@ -46,14 +46,8 @@ def host_weighted_sample_without_replacement_func(
     output_center_localid_tensor = torch.empty((total_sample_count,), dtype=torch.int32)
     output_edge_gid_tensor = torch.empty((total_sample_count,), dtype=torch.int64)
     center_nodes_count = center_nodes.size(0)
-    block_sizes = [128, 256, 256, 512]
-    items_per_threads = [4, 4, 8, 8]
-    fun_idx = int((max_sample_count - 1) / 128)
-    if fun_idx > 3:
-        fun_idx = 3
+    block_size = 128 if max_sample_count <=256 else 256
 
-    block_size = block_sizes[fun_idx]
-    items_per_thread = items_per_threads[fun_idx]
 
     for i in range(center_nodes_count):
         node_id = center_nodes[i]
@@ -72,16 +66,18 @@ def host_weighted_sample_without_replacement_func(
             edge_weight_corresponding_ids = torch.tensor([], dtype=col_id_dtype)
             for j in range(block_size):
                 local_gidx = gidx + j
-                local_edge_weights = torch.empty(
-                    (items_per_thread,), dtype=csr_weight_dtype
+                local_edge_weights = torch.tensor( [],dtype=csr_weight_dtype
                 )
                 generated_edge_weight_count = 0
-                for k in range(items_per_thread):
-                    id = k * block_size + j
-                    if id < neighbor_count:
-                        local_edge_weights[k] = host_csr_weight_ptr[start + id]
-                        generated_edge_weight_count += 1
-                        edge_weight_corresponding_ids = torch.cat(
+                for id in range(j,neighbor_count,block_size):
+                    local_edge_weights = torch.cat(
+                    (
+                        local_edge_weights,
+                         torch.tensor([host_csr_weight_ptr[start + id]], dtype=csr_weight_dtype),
+                     )
+                    )
+                    generated_edge_weight_count += 1
+                    edge_weight_corresponding_ids = torch.cat(
                             (
                                 edge_weight_corresponding_ids,
                                 torch.tensor([id], dtype=col_id_dtype),

From 1a04933ad25b3476cb0b2e01b7fc0ed2bfde47a8 Mon Sep 17 00:00:00 2001
From: Chuang Zhu <chuangz@nvidia.com>
Date: Mon, 7 Aug 2023 10:20:52 +0000
Subject: [PATCH 2/6] Use cub::DeviceSegmentedSort to implement weighted
 sampling when sample_count is large

---
 ...ighted_sample_without_replacement_func.cuh | 315 +++++++-----------
 1 file changed, 127 insertions(+), 188 deletions(-)

diff --git a/cpp/src/wholegraph_ops/weighted_sample_without_replacement_func.cuh b/cpp/src/wholegraph_ops/weighted_sample_without_replacement_func.cuh
index c948d8cc3..a2915cd00 100644
--- a/cpp/src/wholegraph_ops/weighted_sample_without_replacement_func.cuh
+++ b/cpp/src/wholegraph_ops/weighted_sample_without_replacement_func.cuh
@@ -14,7 +14,9 @@
  * limitations under the License.
  */
 #pragma once
+#include <cstdlib>
 #include <cub/device/device_radix_sort.cuh>
+#include <cub/device/device_segmented_sort.cuh>
 #include <driver_types.h>
 #include <raft/matrix/select_k.cuh>
 #include <random>
@@ -55,88 +57,91 @@ __device__ __forceinline__ float gen_key_from_weight(const WeightType weight, PC
   return logk;
 }
 
-template <typename T, typename IdxT>
-__device__ __host__ void set_buf_pointers(T* buf1,
-                                          IdxT* idx_buf1,
-                                          T* buf2,
-                                          IdxT* idx_buf2,
-                                          int pass,
-                                          const T*& in_buf,
-                                          const IdxT*& in_idx_buf,
-                                          T*& out_buf,
-                                          IdxT*& out_idx_buf)
+template <typename IdType,
+          typename WeightType,
+          typename WeightKeyType,
+          typename NeighborIdxType,
+          typename WMIdType,
+          typename WMOffsetType,
+          typename WMWeightType,
+          unsigned int BLOCK_SIZE>
+__launch_bounds__(BLOCK_SIZE) __global__ void generate_weighted_keys_and_idxs_kernel(
+  wholememory_gref_t wm_csr_row_ptr,
+  wholememory_array_description_t wm_csr_row_ptr_desc,
+  wholememory_gref_t wm_csr_col_ptr,
+  wholememory_array_description_t wm_csr_col_ptr_desc,
+  wholememory_gref_t wm_csr_weight_ptr,
+  wholememory_array_description_t wm_csr_weight_ptr_desc,
+  const IdType* input_nodes,
+  const int input_node_count,
+  const int max_sample_count,
+  unsigned long long random_seed,
+  const int* target_neighbor_offset,
+  WeightKeyType* output_weighted_keys,
+  NeighborIdxType* output_idxs,
+  bool need_random = true)
 {
-  if (pass == 0) {
-    in_buf      = buf1;
-    in_idx_buf  = nullptr;
-    out_buf     = nullptr;
-    out_idx_buf = nullptr;
-
-  } else if (pass % 2 == 0) {
-    in_buf      = buf2;
-    in_idx_buf  = idx_buf2;
-    out_buf     = buf1;
-    out_idx_buf = idx_buf1;
-  } else {
-    in_buf      = buf1;
-    in_idx_buf  = idx_buf1;
-    out_buf     = buf2;
-    out_idx_buf = idx_buf2;
+  int input_idx = blockIdx.x;
+  if (input_idx >= input_node_count) return;
+  int gidx = threadIdx.x + blockIdx.x * BLOCK_SIZE;
+  wholememory::device_reference<WMOffsetType> csr_row_ptr_gen(wm_csr_row_ptr);
+  wholememory::device_reference<WMIdType> csr_col_ptr_gen(wm_csr_col_ptr);
+  wholememory::device_reference<WMWeightType> csr_weight_ptr_gen(wm_csr_weight_ptr);
+  IdType nid         = input_nodes[input_idx];
+  int64_t start      = csr_row_ptr_gen[nid];
+  int64_t end        = csr_row_ptr_gen[nid + 1];
+  int neighbor_count = (int)(end - start);
+  if (neighbor_count <= max_sample_count) { need_random = false; }
+
+  PCGenerator rng(random_seed, (uint64_t)gidx, (uint64_t)0);
+  int output_offset = target_neighbor_offset[input_idx];
+  output_weighted_keys += output_offset;
+  output_idxs += output_offset;
+  for (int id = threadIdx.x; id < neighbor_count; id += BLOCK_SIZE) {
+    WeightType thread_weight = csr_weight_ptr_gen[start + id];
+    output_weighted_keys[id] =
+      need_random ? static_cast<WeightKeyType>(gen_key_from_weight(thread_weight, rng))
+                  : (static_cast<WeightKeyType>(thread_weight));
+    output_idxs[id] = static_cast<NeighborIdxType>(id);
   }
 }
 
 template <typename IdType,
           typename LocalIdType,
-          typename WeightType,
-          typename WeightKeyType,
           typename NeighborIdxType,
           typename WMIdType,
           typename WMOffsetType,
-          typename WMWeightType,
-          unsigned int BLOCK_SIZE,
-          int BitsPerPass,
-          bool NeedRandom = true>
+          int BLOCK_SIZE>
 __launch_bounds__(BLOCK_SIZE) __global__
-  void weighted_sample_without_replacement_large_raft_radix_kernel(
-    wholememory_gref_t wm_csr_row_ptr,
-    wholememory_array_description_t wm_csr_row_ptr_desc,
-    wholememory_gref_t wm_csr_col_ptr,
-    wholememory_array_description_t wm_csr_col_ptr_desc,
-    wholememory_gref_t wm_csr_weight_ptr,
-    wholememory_array_description_t wm_csr_weight_ptr_desc,
-    const IdType* input_nodes,
-    const int input_node_count,
-    const int max_sample_count,
-    unsigned long long random_seed,
-    const int* sample_offset,
-    wholememory_array_description_t sample_offset_desc,
-    const int* target_neighbor_offset,
-    WMIdType* output,
-    LocalIdType* src_lid,
-    int64_t* out_edge_gid,
-    WeightKeyType* weight_keys_buff0,
-    NeighborIdxType* local_idx_buff0,
-    WeightKeyType* weight_keys_buff1,
-    NeighborIdxType* local_idx_buff1,
-    WeightKeyType* weight_keys_out,
-    NeighborIdxType* local_idx_out,
-    const bool select_min = false)
+  void weighted_sample_select_k_kernel(wholememory_gref_t wm_csr_row_ptr,
+                                       wholememory_array_description_t wm_csr_row_ptr_desc,
+                                       wholememory_gref_t wm_csr_col_ptr,
+                                       wholememory_array_description_t wm_csr_col_ptr_desc,
+                                       const IdType* input_nodes,
+                                       const int input_node_count,
+                                       const int max_sample_count,
+                                       const int* sample_offset,
+                                       wholememory_array_description_t sample_offset_desc,
+                                       const NeighborIdxType* sorted_idxs,
+                                       const int* target_neighbor_offset,
+                                       WMIdType* output,
+                                       LocalIdType* src_lid,
+                                       int64_t* out_edge_gid)
 {
   int input_idx = blockIdx.x;
   if (input_idx >= input_node_count) return;
-  int gidx = threadIdx.x + blockIdx.x * BLOCK_SIZE;
   wholememory::device_reference<WMOffsetType> csr_row_ptr_gen(wm_csr_row_ptr);
   wholememory::device_reference<WMIdType> csr_col_ptr_gen(wm_csr_col_ptr);
-  wholememory::device_reference<WMWeightType> csr_weight_ptr_gen(wm_csr_weight_ptr);
   IdType nid         = input_nodes[input_idx];
   int64_t start      = csr_row_ptr_gen[nid];
   int64_t end        = csr_row_ptr_gen[nid + 1];
   int neighbor_count = (int)(end - start);
-  int offset         = sample_offset[input_idx];
+
+  int offset = sample_offset[input_idx];
+
   if (neighbor_count <= max_sample_count) {
     for (int sample_id = threadIdx.x; sample_id < neighbor_count; sample_id += BLOCK_SIZE) {
-      int neighbor_idx           = sample_id;
-      int original_neighbor_idx  = neighbor_idx;
+      int original_neighbor_idx  = sample_id;
       IdType gid                 = csr_col_ptr_gen[start + original_neighbor_idx];
       output[offset + sample_id] = gid;
       if (src_lid) src_lid[offset + sample_id] = (LocalIdType)input_idx;
@@ -145,99 +150,9 @@ __launch_bounds__(BLOCK_SIZE) __global__
     }
     return;
   }
-
-  PCGenerator rng(random_seed, (uint64_t)gidx, (uint64_t)0);
-  int buff_offset = target_neighbor_offset[input_idx];
-  weight_keys_buff0 += buff_offset;
-  local_idx_buff0 += buff_offset;
-  weight_keys_buff1 += buff_offset;
-  local_idx_buff1 += buff_offset;
-  weight_keys_out += input_idx * max_sample_count;
-  local_idx_out += input_idx * max_sample_count;
-
-  for (int id = threadIdx.x; id < neighbor_count; id += BLOCK_SIZE) {
-    WeightType thread_weight = csr_weight_ptr_gen[start + id];
-    weight_keys_buff0[id]    = NeedRandom
-                                 ? static_cast<WeightKeyType>(gen_key_from_weight(thread_weight, rng))
-                                 : (static_cast<WeightKeyType>(thread_weight));
-    local_idx_buff0[id]      = id;
-  }
-
-  constexpr int num_buckets =
-    raft::matrix::detail::select::radix::impl::calc_num_buckets<BitsPerPass>();
-  __shared__ raft::matrix::detail::select::radix::impl::Counter<WeightKeyType, NeighborIdxType>
-    counter;
-  __shared__ NeighborIdxType histogram[num_buckets];
-  if (threadIdx.x == 0) {
-    counter.k              = max_sample_count;
-    counter.len            = neighbor_count;
-    counter.previous_len   = neighbor_count;
-    counter.kth_value_bits = 0;
-    counter.out_cnt        = 0;
-    counter.out_back_cnt   = 0;
-  }
-  __syncthreads();
-  const WeightKeyType* in_buf       = nullptr;
-  const NeighborIdxType* in_idx_buf = nullptr;
-  WeightKeyType* out_buf            = nullptr;
-  NeighborIdxType* out_idx_buf      = nullptr;
-  constexpr int num_passes =
-    raft::matrix::detail::select::radix::impl::calc_num_passes<WeightKeyType, BitsPerPass>();
-  for (int pass = 0; pass < num_passes; ++pass) {
-    set_buf_pointers(weight_keys_buff0,
-                     local_idx_buff0,
-                     weight_keys_buff1,
-                     local_idx_buff1,
-                     pass,
-                     in_buf,
-                     in_idx_buf,
-                     out_buf,
-                     out_idx_buf);
-    NeighborIdxType current_len = counter.len;
-    NeighborIdxType current_k   = counter.k;
-    raft::matrix::detail::select::radix::impl::
-      filter_and_histogram_for_one_block<WeightKeyType, NeighborIdxType, BitsPerPass>(
-        in_buf,
-        in_idx_buf,
-        out_buf,
-        out_idx_buf,
-        weight_keys_out,
-        local_idx_out,
-        &counter,
-        histogram,
-        select_min,
-        pass);
-    __syncthreads();
-
-    raft::matrix::detail::select::radix::impl::scan<NeighborIdxType, BitsPerPass, BLOCK_SIZE>(
-      histogram);
-    __syncthreads();
-
-    raft::matrix::detail::select::radix::impl::
-      choose_bucket<WeightKeyType, NeighborIdxType, BitsPerPass>(
-        &counter, histogram, current_k, pass);
-    if (threadIdx.x == 0) { counter.previous_len = current_len; }
-    __syncthreads();
-
-    if (counter.len == counter.k || pass == num_passes - 1) {
-      raft::matrix::detail::select::radix::impl::
-        last_filter<WeightKeyType, NeighborIdxType, BitsPerPass>(
-          pass == 0 ? weight_keys_buff0 : out_buf,
-          pass == 0 ? local_idx_buff0 : out_idx_buf,
-          weight_keys_out,
-          local_idx_out,
-          current_len,
-          max_sample_count,
-          &counter,
-          select_min,
-          pass);
-      break;
-    }
-  }
-  // topk  idx in local_idx_out
-  __syncthreads();
+  int neighbor_offset = target_neighbor_offset[input_idx];
   for (int sample_id = threadIdx.x; sample_id < max_sample_count; sample_id += BLOCK_SIZE) {
-    int original_neighbor_idx  = local_idx_out[sample_id];
+    int original_neighbor_idx  = sorted_idxs[neighbor_offset + sample_id];
     IdType gid                 = csr_col_ptr_gen[start + original_neighbor_idx];
     output[offset + sample_id] = gid;
     if (src_lid) src_lid[offset + sample_id] = (LocalIdType)input_idx;
@@ -583,7 +498,6 @@ void wholegraph_csr_weighted_sample_without_replacement_func(
                            tmp_neighbor_counts_mem_pointer,
                            tmp_neighbor_counts_mem_pointer + center_node_count + 1,
                            tmp_neighbor_counts_mem_pointer);
-    int* tmp_neighbor_counts_offset = tmp_neighbor_counts_mem_pointer;
     int target_neighbor_counts;
     WM_CUDA_CHECK(cudaMemcpyAsync(&target_neighbor_counts,
                                   ((int*)tmp_neighbor_counts_mem_pointer) + center_node_count,
@@ -600,34 +514,24 @@ void wholegraph_csr_weighted_sample_without_replacement_func(
     WeightType* tmp_weights_buffer1_mem_pointer =
       (WeightType*)gen_weights_buffer1_tmh.device_malloc(target_neighbor_counts,
                                                          wm_csr_weight_ptr_desc.dtype);
-    wholememory_ops::temp_memory_handle gen_weights_buffer_out_tmh(p_env_fns);
-    WeightType* tmp_weights_buffer_out_mem_pointer =
-      (WeightType*)gen_weights_buffer_out_tmh.device_malloc(center_node_count * max_sample_count,
-                                                            wm_csr_weight_ptr_desc.dtype);
 
-    auto local_idx_dtype = wholememory_dtype_t::WHOLEMEMORY_DT_INT;
+    auto neighbor_idx_dtype = wholememory_dtype_t::WHOLEMEMORY_DT_INT;
     wholememory_ops::temp_memory_handle local_idx_buffer0_tmh(p_env_fns);
     int* local_idx_buffer0_mem_pointer = static_cast<int*>(
-      local_idx_buffer0_tmh.device_malloc(target_neighbor_counts, local_idx_dtype));
+      local_idx_buffer0_tmh.device_malloc(target_neighbor_counts, neighbor_idx_dtype));
     wholememory_ops::temp_memory_handle local_idx_buffer1_tmh(p_env_fns);
     int* local_idx_buffer1_mem_pointer = static_cast<int*>(
-      local_idx_buffer1_tmh.device_malloc(target_neighbor_counts, local_idx_dtype));
-    wholememory_ops::temp_memory_handle local_idx_buffer_out_tmh(p_env_fns);
-    int* local_idx_buffer_out_mem_pointer =
-      static_cast<int*>(local_idx_buffer_out_tmh.device_malloc(center_node_count * max_sample_count,
-                                                               local_idx_dtype));
-    constexpr int BLOCK_SIZE  = 256;
-    constexpr int BitsPerPass = 8;
-    weighted_sample_without_replacement_large_raft_radix_kernel<IdType,
-                                                                int,
-                                                                WeightType,
-                                                                WeightType,
-                                                                int,
-                                                                WMIdType,
-                                                                int64_t,
-                                                                WeightType,
-                                                                BLOCK_SIZE,
-                                                                BitsPerPass>
+      local_idx_buffer1_tmh.device_malloc(target_neighbor_counts, neighbor_idx_dtype));
+
+    constexpr int BLOCK_SIZE = 256;
+    generate_weighted_keys_and_idxs_kernel<IdType,
+                                           WeightType,
+                                           WeightType,
+                                           int,
+                                           WMIdType,
+                                           int64_t,
+                                           WeightType,
+                                           BLOCK_SIZE>
       <<<center_node_count, BLOCK_SIZE, 0, stream>>>(wm_csr_row_ptr,
                                                      wm_csr_row_ptr_desc,
                                                      wm_csr_col_ptr,
@@ -638,19 +542,54 @@ void wholegraph_csr_weighted_sample_without_replacement_func(
                                                      center_node_count,
                                                      max_sample_count,
                                                      random_seed,
-                                                     (const int*)output_sample_offset,
-                                                     output_sample_offset_desc,
-                                                     tmp_neighbor_counts_offset,
-                                                     (WMIdType*)output_dest_node_ptr,
-                                                     (int*)output_center_localid_ptr,
-                                                     (int64_t*)output_edge_gid_ptr,
+                                                     tmp_neighbor_counts_mem_pointer,
                                                      tmp_weights_buffer0_mem_pointer,
                                                      local_idx_buffer0_mem_pointer,
-                                                     tmp_weights_buffer1_mem_pointer,
-                                                     local_idx_buffer1_mem_pointer,
-                                                     tmp_weights_buffer_out_mem_pointer,
-                                                     local_idx_buffer_out_mem_pointer,
-                                                     false);
+                                                     true);
+    cub::DoubleBuffer<WeightType> weighted_key_double_buffer{tmp_weights_buffer0_mem_pointer,
+                                                             tmp_weights_buffer1_mem_pointer};
+    cub::DoubleBuffer<int> neighbor_idx_double_buffer{local_idx_buffer0_mem_pointer,
+                                                      local_idx_buffer1_mem_pointer};
+    void* d_temp_storage      = nullptr;
+    size_t temp_storage_bytes = 0;
+
+    WM_CUDA_CHECK(cub::DeviceSegmentedSort::SortPairsDescending(d_temp_storage,
+                                                                temp_storage_bytes,
+                                                                weighted_key_double_buffer,
+                                                                neighbor_idx_double_buffer,
+                                                                target_neighbor_counts,
+                                                                center_node_count,
+                                                                tmp_neighbor_counts_mem_pointer,
+                                                                tmp_neighbor_counts_mem_pointer + 1,
+                                                                stream));
+    wholememory_ops::temp_memory_handle segment_sort_storge_tmp(p_env_fns);
+    d_temp_storage = segment_sort_storge_tmp.device_malloc(temp_storage_bytes, WHOLEMEMORY_DT_INT8);
+
+    WM_CUDA_CHECK(cub::DeviceSegmentedSort::SortPairsDescending(d_temp_storage,
+                                                                temp_storage_bytes,
+                                                                weighted_key_double_buffer,
+                                                                neighbor_idx_double_buffer,
+                                                                target_neighbor_counts,
+                                                                center_node_count,
+                                                                tmp_neighbor_counts_mem_pointer,
+                                                                tmp_neighbor_counts_mem_pointer + 1,
+                                                                stream));
+
+    weighted_sample_select_k_kernel<IdType, int, int, WMIdType, int64_t, BLOCK_SIZE>
+      <<<center_node_count, BLOCK_SIZE, 0, stream>>>(wm_csr_row_ptr,
+                                                     wm_csr_row_ptr_desc,
+                                                     wm_csr_col_ptr,
+                                                     wm_csr_col_ptr_desc,
+                                                     (const IdType*)center_nodes,
+                                                     center_node_count,
+                                                     max_sample_count,
+                                                     static_cast<const int*>(output_sample_offset),
+                                                     output_sample_offset_desc,
+                                                     neighbor_idx_double_buffer.Current(),
+                                                     tmp_neighbor_counts_mem_pointer,
+                                                     output_dest_node_ptr,
+                                                     output_center_localid_ptr,
+                                                     output_edge_gid_ptr);
 
     WM_CUDA_CHECK(cudaGetLastError());
     WM_CUDA_CHECK(cudaStreamSynchronize(stream));

From a051750a23cd10ff6b2e0dd913fdf2c916ee7591 Mon Sep 17 00:00:00 2001
From: Chuang Zhu <chuangz@nvidia.com>
Date: Tue, 8 Aug 2023 09:18:02 +0000
Subject: [PATCH 3/6] Use raft warp_sort to replace block_radix_topk

---
 cpp/src/wholegraph_ops/block_radix_topk.cuh   | 371 ------------------
 .../wholegraph_ops/block_topk_with_raft.cuh   | 109 +++++
 .../functions/embedding_cache_func.cuh        |  10 +-
 3 files changed, 113 insertions(+), 377 deletions(-)
 delete mode 100644 cpp/src/wholegraph_ops/block_radix_topk.cuh
 create mode 100644 cpp/src/wholegraph_ops/block_topk_with_raft.cuh

diff --git a/cpp/src/wholegraph_ops/block_radix_topk.cuh b/cpp/src/wholegraph_ops/block_radix_topk.cuh
deleted file mode 100644
index 624c07510..000000000
--- a/cpp/src/wholegraph_ops/block_radix_topk.cuh
+++ /dev/null
@@ -1,371 +0,0 @@
-/*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include <cub/block/block_load.cuh>
-#include <cub/block/block_scan.cuh>
-#include <cub/block/block_store.cuh>
-#include <cub/util_ptx.cuh>
-
-namespace wholegraph_ops {
-
-template <typename KeyT, int BLOCK_SIZE, bool GREATER = true, int RADIX_BITS = 8>
-class BlockRadixTopKGlobalMemory {
-  static_assert(cub::PowerOfTwo<RADIX_BITS>::VALUE && (RADIX_BITS <= (sizeof(KeyT) * 8)),
-                "RADIX_BITS should be power of 2, and <= (sizeof(KeyT) * 8)");
-  static_assert(cub::PowerOfTwo<BLOCK_SIZE>::VALUE, "BLOCK_SIZE should be power of 2");
-  using KeyTraits                            = cub::Traits<KeyT>;
-  using UnsignedBits                         = typename KeyTraits::UnsignedBits;
-  using BlockScanT                           = cub::BlockScan<int, BLOCK_SIZE>;
-  static constexpr int RADIX_SIZE            = (1 << RADIX_BITS);
-  static constexpr int SCAN_ITEMS_PER_THREAD = (RADIX_SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE;
-  using BinBlockLoad  = cub::BlockLoad<int, BLOCK_SIZE, SCAN_ITEMS_PER_THREAD>;
-  using BinBlockStore = cub::BlockStore<int, BLOCK_SIZE, SCAN_ITEMS_PER_THREAD>;
-  struct _TempStorage {
-    typename BlockScanT::TempStorage scan_storage;
-    union {
-      typename BinBlockLoad::TempStorage load_storage;
-      typename BinBlockStore::TempStorage store_storage;
-    } load_store;
-    union {
-      int shared_bins[RADIX_SIZE];
-    };
-    int share_target_k;
-    int share_bucket_id;
-  };
-
- public:
-  struct TempStorage : cub::Uninitialized<_TempStorage> {};
-  __device__ __forceinline__ BlockRadixTopKGlobalMemory(TempStorage& temp_storage)
-    : temp_storage_{temp_storage.Alias()}, tid_(threadIdx.x){};
-  __device__ __forceinline__ void radixTopKGetThreshold(
-    const KeyT* data, int k, int size, KeyT& topK, bool& topk_is_unique)
-  {
-    assert(k < size && k > 0);
-    int target_k             = k;
-    UnsignedBits key_pattern = 0;
-    int digit_pos            = sizeof(KeyT) * 8 - RADIX_BITS;
-    for (; digit_pos >= 0; digit_pos -= RADIX_BITS) {
-      UpdateSharedBins(data, size, digit_pos, key_pattern);
-      InclusiveScanBins();
-      UpdateTopK(digit_pos, target_k, key_pattern);
-      if (target_k == 0) break;
-    }
-    if (target_k == 0) {
-      key_pattern -= 1;
-      topk_is_unique = true;
-    } else {
-      topk_is_unique = false;
-    }
-    if (GREATER) key_pattern = ~key_pattern;
-    UnsignedBits topK_unsigned = KeyTraits::TwiddleOut(key_pattern);
-    topK                       = reinterpret_cast<KeyT&>(topK_unsigned);
-  }
-
- private:
-  __device__ __forceinline__ void UpdateSharedBins(const KeyT* key,
-                                                   int size,
-                                                   int digit_pos,
-                                                   UnsignedBits key_pattern)
-  {
-    for (int id = tid_; id < RADIX_SIZE; id += BLOCK_SIZE) {
-      temp_storage_.shared_bins[id] = 0;
-    }
-    cub::CTA_SYNC();
-    UnsignedBits key_mask = ((UnsignedBits)(-1)) << ((UnsignedBits)(digit_pos + RADIX_BITS));
-#pragma unroll
-    for (int idx = tid_; idx < size; idx += BLOCK_SIZE) {
-      KeyT key_data              = key[idx];
-      UnsignedBits twiddled_data = KeyTraits::TwiddleIn(reinterpret_cast<UnsignedBits&>(key_data));
-      if (GREATER) twiddled_data = ~twiddled_data;
-      UnsignedBits digit_in_radix = cub::BFE<UnsignedBits>(twiddled_data, digit_pos, RADIX_BITS);
-      if ((twiddled_data & key_mask) == (key_pattern & key_mask)) {
-        atomicAdd(&temp_storage_.shared_bins[digit_in_radix], 1);
-      }
-    }
-    cub::CTA_SYNC();
-  }
-  __device__ __forceinline__ void InclusiveScanBins()
-  {
-    int items[SCAN_ITEMS_PER_THREAD];
-    BinBlockLoad(temp_storage_.load_store.load_storage)
-      .Load(temp_storage_.shared_bins, items, RADIX_SIZE, 0);
-    cub::CTA_SYNC();
-    BlockScanT(temp_storage_.scan_storage).InclusiveSum(items, items);
-    cub::CTA_SYNC();
-    BinBlockStore(temp_storage_.load_store.store_storage)
-      .Store(temp_storage_.shared_bins, items, RADIX_SIZE);
-    cub::CTA_SYNC();
-  }
-  __device__ __forceinline__ void UpdateTopK(int digit_pos,
-                                             int& target_k,
-                                             UnsignedBits& target_pattern)
-  {
-    for (int idx = tid_; (idx < RADIX_SIZE); idx += BLOCK_SIZE) {
-      int prev_count = (idx == 0) ? 0 : temp_storage_.shared_bins[idx - 1];
-      int cur_count  = temp_storage_.shared_bins[idx];
-      if (prev_count <= target_k && cur_count > target_k) {
-        temp_storage_.share_target_k  = target_k - prev_count;
-        temp_storage_.share_bucket_id = idx;
-      }
-    }
-    cub::CTA_SYNC();
-    target_k                 = temp_storage_.share_target_k;
-    int target_bucket_id     = temp_storage_.share_bucket_id;
-    UnsignedBits key_segment = ((UnsignedBits)target_bucket_id) << ((UnsignedBits)digit_pos);
-    target_pattern |= key_segment;
-  }
-  _TempStorage& temp_storage_;
-  int tid_;
-};
-
-template <typename KeyT,
-          int BLOCK_SIZE,
-          int ITEMS_PER_THREAD,
-          bool GREATER    = true,
-          typename ValueT = cub::NullType,
-          int RADIX_BITS  = 8>
-class BlockRadixTopKRegister {
-  static_assert(cub::PowerOfTwo<RADIX_BITS>::VALUE && (RADIX_BITS <= (sizeof(KeyT) * 8)),
-                "RADIX_BITS should be power of 2, and <= (sizeof(KeyT) * 8)");
-  static_assert(cub::PowerOfTwo<BLOCK_SIZE>::VALUE, "BLOCK_SIZE should be power of 2");
-  using KeyTraits                            = cub::Traits<KeyT>;
-  using UnsignedBits                         = typename KeyTraits::UnsignedBits;
-  using BlockScanT                           = cub::BlockScan<int, BLOCK_SIZE>;
-  static constexpr int RADIX_SIZE            = (1 << RADIX_BITS);
-  static constexpr bool KEYS_ONLY            = std::is_same<ValueT, cub::NullType>::value;
-  static constexpr int SCAN_ITEMS_PER_THREAD = (RADIX_SIZE + BLOCK_SIZE - 1) / BLOCK_SIZE;
-  using BinBlockLoad       = cub::BlockLoad<int, BLOCK_SIZE, SCAN_ITEMS_PER_THREAD>;
-  using BinBlockStore      = cub::BlockStore<int, BLOCK_SIZE, SCAN_ITEMS_PER_THREAD>;
-  using BlockExchangeKey   = cub::BlockExchange<KeyT, BLOCK_SIZE, ITEMS_PER_THREAD>;
-  using BlockExchangeValue = cub::BlockExchange<ValueT, BLOCK_SIZE, ITEMS_PER_THREAD>;
-
-  using _ExchangeKeyTempStorage   = typename BlockExchangeKey::TempStorage;
-  using _ExchangeValueTempStorage = typename BlockExchangeValue::TempStorage;
-  typedef union ExchangeKeyTempStorageType {
-    _ExchangeKeyTempStorage key_storage;
-  } ExchKeyTempStorageType;
-  typedef union ExchangeKeyValueTempStorageType {
-    _ExchangeKeyTempStorage key_storage;
-    _ExchangeValueTempStorage value_storage;
-  } ExchKeyValueTempStorageType;
-  using _ExchangeType =
-    typename std::conditional<KEYS_ONLY, ExchKeyTempStorageType, ExchKeyValueTempStorageType>::type;
-
-  struct _TempStorage {
-    typename BlockScanT::TempStorage scan_storage;
-    union {
-      typename BinBlockLoad::TempStorage load_storage;
-      typename BinBlockStore::TempStorage store_storage;
-    } load_store;
-    union {
-      int shared_bins[RADIX_SIZE];
-      _ExchangeType exchange_storage;
-    };
-    int share_target_k;
-    int share_bucket_id;
-    int share_prev_count;
-  };
-
- public:
-  struct TempStorage : cub::Uninitialized<_TempStorage> {};
-  __device__ __forceinline__ BlockRadixTopKRegister(TempStorage& temp_storage)
-    : temp_storage_{temp_storage.Alias()}, tid_(threadIdx.x){};
-  __device__ __forceinline__ void radixTopKToStriped(KeyT (&keys)[ITEMS_PER_THREAD],
-                                                     const int k,
-                                                     const int valid_count)
-  {
-    if (k == valid_count) return;
-    TopKGenRank(keys, k, valid_count);
-    int is_valid[ITEMS_PER_THREAD];
-    GenValidArray(is_valid, k);
-    BlockExchangeKey{temp_storage_.exchange_storage.key_storage}.ScatterToStripedFlagged(
-      keys, keys, ranks_, is_valid);
-    cub::CTA_SYNC();
-  }
-  __device__ __forceinline__ void radixTopKToStriped(KeyT (&keys)[ITEMS_PER_THREAD],
-                                                     ValueT (&values)[ITEMS_PER_THREAD],
-                                                     const int k,
-                                                     const int valid_count)
-  {
-    if (k == valid_count) return;
-    TopKGenRank(keys, k, valid_count);
-    int is_valid[ITEMS_PER_THREAD];
-    GenValidArray(is_valid, k);
-    BlockExchangeKey{temp_storage_.exchange_storage.key_storage}.ScatterToStripedFlagged(
-      keys, keys, ranks_, is_valid);
-    cub::CTA_SYNC();
-    BlockExchangeValue{temp_storage_.exchange_storage.value_storage}.ScatterToStripedFlagged(
-      values, values, ranks_, is_valid);
-    cub::CTA_SYNC();
-  }
-
- private:
-  __device__ __forceinline__ void TopKGenRank(KeyT (&keys)[ITEMS_PER_THREAD],
-                                              const int k,
-                                              const int valid_count)
-  {
-    assert(k <= BLOCK_SIZE * ITEMS_PER_THREAD);
-    assert(k <= valid_count);
-    UnsignedBits(&unsigned_keys)[ITEMS_PER_THREAD] =
-      reinterpret_cast<UnsignedBits(&)[ITEMS_PER_THREAD]>(keys);
-    search_mask_ = 0;
-    top_k_mask_  = 0;
-
-#pragma unroll
-    for (unsigned int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) {
-      int idx            = KEY * BLOCK_SIZE + tid_;
-      unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]);
-      if (GREATER) unsigned_keys[KEY] = ~unsigned_keys[KEY];
-      if (idx < valid_count) search_mask_ |= (1U << KEY);
-    }
-
-    int target_k = k;
-    int prefix_k = 0;
-
-    for (int digit_pos = sizeof(KeyT) * 8 - RADIX_BITS; digit_pos >= 0; digit_pos -= RADIX_BITS) {
-      UpdateSharedBins(unsigned_keys, digit_pos, prefix_k);
-      InclusiveScanBins();
-      UpdateTopK(unsigned_keys, digit_pos, target_k, prefix_k, digit_pos == 0);
-      if (target_k == 0) break;
-    }
-
-#pragma unroll
-    for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) {
-      if (GREATER) unsigned_keys[KEY] = ~unsigned_keys[KEY];
-      unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]);
-    }
-  }
-  __device__ __forceinline__ void GenValidArray(int (&is_valid)[ITEMS_PER_THREAD], int k)
-  {
-#pragma unroll
-    for (unsigned int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) {
-      if ((top_k_mask_ & (1U << KEY)) && ranks_[KEY] < k) {
-        is_valid[KEY] = 1;
-      } else {
-        is_valid[KEY] = 0;
-      }
-    }
-  }
-  __device__ __forceinline__ void UpdateSharedBins(UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD],
-                                                   int digit_pos,
-                                                   int prefix_k)
-  {
-    for (int id = tid_; id < RADIX_SIZE; id += BLOCK_SIZE) {
-      temp_storage_.shared_bins[id] = 0;
-    }
-    cub::CTA_SYNC();
-// #define USE_MATCH
-#ifdef USE_MATCH
-    int lane_mask = cub::LaneMaskLt();
-#pragma unroll
-    for (unsigned int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) {
-      bool is_search = search_mask_ & (1U << KEY);
-      int bucket_idx = -1;
-      if (is_search) {
-        UnsignedBits digit_in_radix =
-          cub::BFE<UnsignedBits>(unsigned_keys[KEY], digit_pos, RADIX_BITS);
-        bucket_idx = (int)digit_in_radix;
-      }
-      int warp_match_mask       = __match_any_sync(0xffffffff, bucket_idx);
-      int same_count            = __popc(warp_match_mask);
-      int idx_in_same_bucket    = __popc(warp_match_mask & lane_mask);
-      int same_bucket_root_lane = __ffs(warp_match_mask) - 1;
-      int same_bucket_start_idx;
-      if (idx_in_same_bucket == 0 && is_search) {
-        same_bucket_start_idx = atomicAdd(&temp_storage_.shared_bins[bucket_idx], same_count);
-      }
-      same_bucket_start_idx =
-        __shfl_sync(0xffffffff, same_bucket_start_idx, same_bucket_root_lane, 32);
-      if (is_search) { ranks_[KEY] = same_bucket_start_idx + idx_in_same_bucket + prefix_k; }
-    }
-#else
-#pragma unroll
-    for (unsigned int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) {
-      bool is_search = search_mask_ & (1U << KEY);
-      int bucket_idx = -1;
-      if (is_search) {
-        UnsignedBits digit_in_radix =
-          cub::BFE<UnsignedBits>(unsigned_keys[KEY], digit_pos, RADIX_BITS);
-        bucket_idx  = (int)digit_in_radix;
-        ranks_[KEY] = atomicAdd(&temp_storage_.shared_bins[bucket_idx], 1) + prefix_k;
-      }
-    }
-#endif
-    cub::CTA_SYNC();
-  }
-  __device__ __forceinline__ void InclusiveScanBins()
-  {
-    int items[SCAN_ITEMS_PER_THREAD];
-    BinBlockLoad(temp_storage_.load_store.load_storage)
-      .Load(temp_storage_.shared_bins, items, RADIX_SIZE, 0);
-    cub::CTA_SYNC();
-    BlockScanT(temp_storage_.scan_storage).InclusiveSum(items, items);
-    cub::CTA_SYNC();
-    BinBlockStore(temp_storage_.load_store.store_storage)
-      .Store(temp_storage_.shared_bins, items, RADIX_SIZE);
-    cub::CTA_SYNC();
-  }
-  __device__ __forceinline__ void UpdateTopK(UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD],
-                                             int digit_pos,
-                                             int& target_k,
-                                             int& prefix_k,
-                                             bool mark_equal)
-  {
-    for (int idx = tid_; (idx < RADIX_SIZE); idx += BLOCK_SIZE) {
-      int prev_count = (idx == 0) ? 0 : temp_storage_.shared_bins[idx - 1];
-      int cur_count  = temp_storage_.shared_bins[idx];
-      if (prev_count <= target_k && cur_count > target_k) {
-        temp_storage_.share_target_k   = target_k - prev_count;
-        temp_storage_.share_bucket_id  = idx;
-        temp_storage_.share_prev_count = prev_count;
-      }
-    }
-    cub::CTA_SYNC();
-    target_k = temp_storage_.share_target_k;
-    prefix_k += temp_storage_.share_prev_count;
-    int target_bucket_id = temp_storage_.share_bucket_id;
-#pragma unroll
-    for (unsigned int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) {
-      if (search_mask_ & (1U << KEY)) {
-        UnsignedBits digit_in_radix =
-          cub::BFE<UnsignedBits>(unsigned_keys[KEY], digit_pos, RADIX_BITS);
-        if (digit_in_radix < target_bucket_id) {
-          top_k_mask_ |= (1U << KEY);
-          search_mask_ &= ~(1U << KEY);
-        } else if (digit_in_radix > target_bucket_id) {
-          search_mask_ &= ~(1U << KEY);
-        } else {
-          if (mark_equal) top_k_mask_ |= (1U << KEY);
-        }
-        if (digit_in_radix <= target_bucket_id) {
-          int prev_count =
-            (digit_in_radix == 0) ? 0 : temp_storage_.shared_bins[digit_in_radix - 1];
-          ranks_[KEY] += prev_count;
-        }
-      }
-    }
-    cub::CTA_SYNC();
-  }
-
-  _TempStorage& temp_storage_;
-  int tid_;
-  int ranks_[ITEMS_PER_THREAD];
-  unsigned int search_mask_;
-  unsigned int top_k_mask_;
-};
-
-}  // namespace wholegraph_ops
diff --git a/cpp/src/wholegraph_ops/block_topk_with_raft.cuh b/cpp/src/wholegraph_ops/block_topk_with_raft.cuh
new file mode 100644
index 000000000..db5a436b5
--- /dev/null
+++ b/cpp/src/wholegraph_ops/block_topk_with_raft.cuh
@@ -0,0 +1,157 @@
+#pragma once
+#include <cub/util_type.cuh>
+#include <raft/matrix/detail/select_k-inl.cuh>
+namespace wholegraph_ops {
+/**
+ * Shared-memory size in bytes computed from the warp count and k: room for
+ * ceildiv(num_of_warp, 2) * k keys of type T, rounded up to a multiple of 256 bytes,
+ * plus ceildiv(num_of_warp, 2) * k indices of type IdxT.
+ */
+template <typename T, typename IdxT>
+constexpr auto calc_smem_size_for_block_wide(int num_of_warp, int k) -> int
+{
+  return raft::Pow2<256>::roundUp(raft::ceildiv(num_of_warp, 2) * sizeof(T) * k) +
+         raft::ceildiv(num_of_warp, 2) * sizeof(IdxT) * k;
+}
+
+/**
+ * Block-wide top-k selection implemented with raft's warpsort block_sort.
+ *
+ * Items are passed in registers: keys[i] of thread t is treated as element
+ * i * BLOCK_SIZE + t, and only elements below valid_count take part in the selection.
+ * The k selected items come back in the same layout: result id (0 <= id < k) is written
+ * to slot id / BLOCK_SIZE of thread id % BLOCK_SIZE; all other slots keep their old content.
+ *
+ * TopKToStriped() contains block-wide barriers, so every thread of the block has to call it.
+ *
+ * @tparam KeyT             key type
+ * @tparam BLOCK_SIZE       number of threads in the block
+ * @tparam ITEMS_PER_THREAD number of register slots per thread
+ * @tparam MAXK             queue capacity: a power of two, at most warpsort::kMaxCapacity
+ * @tparam ASCENDING        forwarded to raft's block_sort
+ *                          NOTE(review): false presumably keeps the largest keys - confirm
+ * @tparam ValueT           payload type carried along with each key
+ * @tparam WarpSortClassT   raft warp-sort implementation used by block_sort
+ */
+template <typename KeyT,
+          int BLOCK_SIZE,
+          int ITEMS_PER_THREAD,
+          int MAXK,
+          bool ASCENDING  = false,
+          typename ValueT = cub::NullType,
+          template <int, bool, typename, typename> class WarpSortClassT =
+            raft::matrix::detail::select::warpsort::warp_sort_distributed_ext>
+class BlockTopkRaftWarpSort {
+  static_assert(MAXK <= raft::matrix::detail::select::warpsort::kMaxCapacity,
+                "MAXK should be smaller than warpsort::kMaxCapacity");
+  static_assert(MAXK >= 1 && raft::is_a_power_of_two(MAXK),
+                "MAXK should >=1 and is a power of two ");
+
+  using bq_t = raft::matrix::detail::select::warpsort::
+    block_sort<WarpSortClassT, MAXK, ASCENDING, KeyT, ValueT>;
+
+  static constexpr int WARP_SIZE = 32;
+  static constexpr int CAL_SMEM_SIZE =
+    calc_smem_size_for_block_wide<KeyT, ValueT>(BLOCK_SIZE / WARP_SIZE, MAXK);
+  static constexpr int SMEM_REQUIRED = bq_t::queue_t::mem_required(BLOCK_SIZE);
+  // All members of the union overlap: smem_buf_bytes1 is handed to the queue constructor,
+  // smem_buf_bytes0 to queue.done(), and store_keys/store_values receive queue.store().
+  struct _TempStorage {
+    union {
+      __align__(256) uint8_t smem_buf_bytes0[CAL_SMEM_SIZE];
+      __align__(256) uint8_t smem_buf_bytes1[SMEM_REQUIRED];
+      struct {
+        KeyT store_keys[MAXK];
+        ValueT store_values[MAXK];
+      };
+    };
+  };
+
+ public:
+  struct TempStorage : cub::Uninitialized<_TempStorage> {};
+
+  __device__ __forceinline__ BlockTopkRaftWarpSort(TempStorage& temp_storage)
+    : temp_storage_{temp_storage.Alias()}, tid_(threadIdx.x){};
+
+  /**
+   * Select k key/value pairs out of the block's valid_count pairs.
+   *
+   * @param keys        in: this thread's keys; out: the selected keys (see class comment)
+   * @param values      in: payloads matching keys; out: payloads of the selected keys
+   * @param k           number of pairs to select; must not exceed MAXK
+   * @param valid_count number of valid input elements in the whole block
+   */
+  __device__ __forceinline__ void TopKToStriped(KeyT (&keys)[ITEMS_PER_THREAD],
+                                                ValueT (&values)[ITEMS_PER_THREAD],
+                                                const int k,
+                                                const int valid_count)
+  {
+    bq_t queue(k, temp_storage_.smem_buf_bytes1);
+
+#pragma unroll
+    for (int i = 0; i < ITEMS_PER_THREAD; i++) {
+      // Slots past valid_count are fed as the sort's dummy key.
+      KeyT key = (i * BLOCK_SIZE + tid_) < valid_count
+                   ? keys[i]
+                   : WarpSortClassT<MAXK, ASCENDING, KeyT, ValueT>::kDummy;
+      queue.add(key, values[i]);
+    }
+    queue.done(temp_storage_.smem_buf_bytes0);
+    __syncthreads();
+    queue.store(temp_storage_.store_keys, temp_storage_.store_values);
+    __syncthreads();
+#pragma unroll
+    for (int i = 0; i < ITEMS_PER_THREAD; i++) {
+      int id = i * BLOCK_SIZE + tid_;
+      if (id < k) {
+        keys[i]   = temp_storage_.store_keys[id];
+        values[i] = temp_storage_.store_values[id];
+      }
+    }
+    // The temp storage is read above and is written again by the next call that reuses it,
+    // so all threads must be done reading before anyone returns.
+    __syncthreads();
+  }
+
+  /**
+   * Keys-only variant: select k keys out of the block's valid_count keys.
+   * The slot index is queued as payload and discarded afterwards.
+   *
+   * @param keys        in: this thread's keys; out: the selected keys (see class comment)
+   * @param k           number of keys to select; must not exceed MAXK
+   * @param valid_count number of valid input elements in the whole block
+   */
+  __device__ __forceinline__ void TopKToStriped(KeyT (&keys)[ITEMS_PER_THREAD],
+                                                const int k,
+                                                const int valid_count)
+  {
+    bq_t queue(k, temp_storage_.smem_buf_bytes1);
+
+#pragma unroll
+    for (int i = 0; i < ITEMS_PER_THREAD; i++) {
+      KeyT key = (i * BLOCK_SIZE + tid_) < valid_count
+                   ? keys[i]
+                   : WarpSortClassT<MAXK, ASCENDING, KeyT, ValueT>::kDummy;
+      queue.add(key, i);
+    }
+    queue.done(temp_storage_.smem_buf_bytes0);
+    __syncthreads();
+
+    queue.store(temp_storage_.store_keys, temp_storage_.store_values);
+    __syncthreads();
+
+#pragma unroll
+    for (int i = 0; i < ITEMS_PER_THREAD; i++) {
+      int id = i * BLOCK_SIZE + tid_;
+      if (id < k) { keys[i] = temp_storage_.store_keys[id]; }
+    }
+    // Same reuse hazard as in the key/value overload: finish reading before returning.
+    __syncthreads();
+  }
+
+ private:
+  _TempStorage& temp_storage_;
+  int tid_;
+};
+
+}  // namespace wholegraph_ops
diff --git a/cpp/src/wholememory_ops/functions/embedding_cache_func.cuh b/cpp/src/wholememory_ops/functions/embedding_cache_func.cuh
index 5a110a1e4..e76893db7 100644
--- a/cpp/src/wholememory_ops/functions/embedding_cache_func.cuh
+++ b/cpp/src/wholememory_ops/functions/embedding_cache_func.cuh
@@ -20,7 +20,7 @@
 
 #include <stdint.h>
 
-#include "wholegraph_ops//block_radix_topk.cuh"
+#include "wholegraph_ops/block_topk_with_raft.cuh"
 
 namespace wholememory_ops {
 
@@ -157,8 +157,7 @@ class CacheSetUpdater {
   static constexpr int kTopKRegisterCount = 4;
   static constexpr int kCacheSetSize      = CacheLineInfo::kCacheSetSize;
   static constexpr int kScaledCounterBits = 14;
-  using BlockTopK =
-    wholegraph_ops::BlockRadixTopKRegister<int64_t, kCacheSetSize, kTopKRegisterCount, true, int>;
+  using BlockTopK = wholegraph_ops::BlockTopkRaftWarpSort<int64_t, kCacheSetSize, kTopKRegisterCount, kCacheSetSize, false, int>;
   struct TempStorage : BlockTopK::TempStorage {};
   /**
    * From all invalid CacheSet, recompute lids to cache, and update cache_line_info.
@@ -287,8 +286,7 @@ class CacheSetUpdater {
       candidate_local_id_[1]  = cached_local_id;
     }
     BlockTopK(temp_storage)
-      .radixTopKToStriped(
-        candidate_lfu_count_, candidate_local_id_, kCacheSetSize, kCacheSetSize * 2);
+      .TopKToStriped(candidate_lfu_count_, candidate_local_id_, kCacheSetSize, kCacheSetSize * 2);
     // printf("[TopK merge dump] threadIdx.x=%d, lfu_count=%ld, lid=%d\n", threadIdx.x,
     // candidate_lfu_count_[0], candidate_local_id_[0]);
     match_flag     = WarpMatchLocalIDPairSync(candidate_local_id_[0], cached_local_id);
@@ -391,7 +389,7 @@ class CacheSetUpdater {
     int has_local_id_count           = (cached_local_id != -1) ? __popc(local_id_match_mask) : 0;
     if (StrideIdx == kTopKRegisterCount - 1) {
       BlockTopK(temp_storage)
-        .radixTopKToStriped(
+        .TopKToStriped(
           candidate_lfu_count_, candidate_local_id_, min(kCacheSetSize, valid_count), valid_count);
       valid_count = min(valid_count, kCacheSetSize);
     }

From b5ebc596a45f874fec3114a4c50a40bdbd7dad33 Mon Sep 17 00:00:00 2001
From: Chuang Zhu <chuangz@nvidia.com>
Date: Wed, 9 Aug 2023 07:02:28 +0000
Subject: [PATCH 4/6] Implement CacheSetUpdater using warp sort

---
 .../functions/embedding_cache_func.cuh        | 202 ++++++++----------
 1 file changed, 95 insertions(+), 107 deletions(-)

diff --git a/cpp/src/wholememory_ops/functions/embedding_cache_func.cuh b/cpp/src/wholememory_ops/functions/embedding_cache_func.cuh
index e76893db7..c46bf6e65 100644
--- a/cpp/src/wholememory_ops/functions/embedding_cache_func.cuh
+++ b/cpp/src/wholememory_ops/functions/embedding_cache_func.cuh
@@ -20,7 +20,7 @@
 
 #include <stdint.h>
 
-#include "wholegraph_ops/block_topk_with_raft.cuh"
+#include <raft/matrix/detail/select_k-inl.cuh>
 
 namespace wholememory_ops {
 
@@ -151,14 +151,30 @@ class CacheLineInfo {
   uint32_t lfu_count_;
 };
 
+
 template <typename NodeIDT>
 class CacheSetUpdater {
  public:
   static constexpr int kTopKRegisterCount = 4;
   static constexpr int kCacheSetSize      = CacheLineInfo::kCacheSetSize;
   static constexpr int kScaledCounterBits = 14;
-  using BlockTopK = wholegraph_ops::BlockTopkRaftWarpSort<int64_t, kCacheSetSize, kTopKRegisterCount, kCacheSetSize, false, int>;
-  struct TempStorage : BlockTopK::TempStorage {};
+
+ private:
+
+  using warp_bq_t =
+    raft::matrix::detail::select::warpsort::warp_sort_immediate<kCacheSetSize, false, int64_t, int>;
+
+  static constexpr int WARP_SIZE  = 32;
+  static constexpr int BLOCK_SIZE = kCacheSetSize;
+  static_assert(kCacheSetSize == WARP_SIZE,"only support CacheSetSize==32,and BLOCK_SIZE==32\n");
+
+ public:
+  struct TempStorage {
+    int64_t store_keys[kCacheSetSize];
+    int store_values[kCacheSetSize];
+  };
+
+  ;
   /**
    * From all invalid CacheSet, recompute lids to cache, and update cache_line_info.
    * NOTE: data are not loaded, need to load after this function
@@ -175,26 +191,14 @@ class CacheSetUpdater {
   {
     if (id_count <= 0) return;
     assert(cache_line_info.IsInValid());
-#pragma unroll
-    for (int i = 0; i < kTopKRegisterCount; i++) {
-      candidate_lfu_count_[i] = -1;
-      candidate_local_id_[i]  = -1;
-    }
-    int base_idx    = 0;
-    int valid_count = 0;
-    FillCandidate<0, false>(
-      nullptr, nullptr, memory_lfu_counter, base_idx, valid_count, 0, id_count, temp_storage, -1);
-    while (base_idx < id_count) {
-      FillCandidate<1, false>(
-        nullptr, nullptr, memory_lfu_counter, base_idx, valid_count, 0, id_count, temp_storage, -1);
-      FillCandidate<2, false>(
-        nullptr, nullptr, memory_lfu_counter, base_idx, valid_count, 0, id_count, temp_storage, -1);
-      FillCandidate<3, false>(
-        nullptr, nullptr, memory_lfu_counter, base_idx, valid_count, 0, id_count, temp_storage, -1);
-    }
+
+    // int base_idx    = 0;
+    // int valid_count = 0;
+
+    FillCandidate<false>(nullptr, nullptr, memory_lfu_counter, 0, id_count, temp_storage, -1);
     cache_line_info.ClearCacheLine();
-    cache_line_info.SetLocalID(candidate_local_id_[0]);
-    cache_line_info.SetScaleLfuCountSync(candidate_local_id_[0] >= 0 ? candidate_lfu_count_[0] : 0);
+    cache_line_info.SetLocalID(candidate_local_id_);
+    cache_line_info.SetScaleLfuCountSync(candidate_local_id_ >= 0 ? candidate_lfu_count_ : 0);
   }
   /**
    * Update cache set according to gids and inc_count
@@ -225,84 +229,58 @@ class CacheSetUpdater {
                                               int id_count)
   {
     if (id_count <= 0) return;
-#pragma unroll
-    for (int i = 0; i < kTopKRegisterCount; i++) {
-      candidate_lfu_count_[i] = -1;
-      candidate_local_id_[i]  = -1;
-    }
-    int base_idx           = 0;
-    int valid_count        = 0;
+
+    candidate_lfu_count_   = -1;
+    candidate_local_id_    = -1;
     int cached_local_id    = cache_line_info.LocalID();
-    int has_local_id_count = 0;
-    has_local_id_count += FillCandidate<0>(gids,
-                                           inc_count,
-                                           memory_lfu_counter,
-                                           base_idx,
-                                           valid_count,
-                                           set_start_id,
-                                           id_count,
-                                           temp_storage,
-                                           cached_local_id);
-    while (base_idx < id_count) {
-      has_local_id_count += FillCandidate<1>(gids,
-                                             inc_count,
-                                             memory_lfu_counter,
-                                             base_idx,
-                                             valid_count,
-                                             set_start_id,
-                                             id_count,
-                                             temp_storage,
-                                             cached_local_id);
-      has_local_id_count += FillCandidate<2>(gids,
-                                             inc_count,
-                                             memory_lfu_counter,
-                                             base_idx,
-                                             valid_count,
-                                             set_start_id,
-                                             id_count,
-                                             temp_storage,
-                                             cached_local_id);
-      has_local_id_count += FillCandidate<3>(gids,
-                                             inc_count,
-                                             memory_lfu_counter,
-                                             base_idx,
-                                             valid_count,
-                                             set_start_id,
-                                             id_count,
-                                             temp_storage,
-                                             cached_local_id);
-    }
-    // printf("[TopK init dump] threadIdx.x=%d, lfu_count=%ld, lid=%d\n", threadIdx.x,
-    // candidate_lfu_count_[0], candidate_local_id_[0]);
-    candidate_lfu_count_[1] = -1;
-    candidate_local_id_[1]  = -1;
+    int has_local_id_count = FillCandidate(
+      gids, inc_count, memory_lfu_counter, set_start_id, id_count, temp_storage, cached_local_id);
+
+    // printf("[TopK init dump] threadIdx.x=%d, lfu_count=%ld, lid=%d, has_local_id_count = %d \n",
+    //        threadIdx.x,
+    //        candidate_lfu_count_,
+    //        candidate_local_id_,
+    //        has_local_id_count);
+    int64_t candidate_lfu_count0 = -1;
+    int candidate_local_id0  = -1;
     unsigned int match_flag;
     // match_flag = WarpMatchLocalIDPairSync(candidate_local_id_[0], cached_local_id);
     int64_t estimated_lfu_count = cache_line_info.LfuCountSync();
     // Valid AND NOT exist in update list
+
     if (cached_local_id != -1 && has_local_id_count == 0) {
       // cached key not updated, use estimated lfu_count from cache
-      candidate_lfu_count_[1] = estimated_lfu_count;
-      candidate_local_id_[1]  = cached_local_id;
+      candidate_lfu_count0 = estimated_lfu_count;
+      candidate_local_id0  = cached_local_id;
+    }
+
+    warp_bq_t warp_queue(kCacheSetSize);
+    warp_queue.add(candidate_lfu_count_, candidate_local_id_);
+    warp_queue.add(candidate_lfu_count0, candidate_local_id0);
+    warp_queue.done();
+    warp_queue.store(temp_storage.store_keys, temp_storage.store_values);
+    __syncthreads();
+    if (threadIdx.x < kCacheSetSize) {
+      candidate_lfu_count_ = temp_storage.store_keys[threadIdx.x];
+      candidate_local_id_  = temp_storage.store_values[threadIdx.x];
     }
-    BlockTopK(temp_storage)
-      .TopKToStriped(candidate_lfu_count_, candidate_local_id_, kCacheSetSize, kCacheSetSize * 2);
+
     // printf("[TopK merge dump] threadIdx.x=%d, lfu_count=%ld, lid=%d\n", threadIdx.x,
     // candidate_lfu_count_[0], candidate_local_id_[0]);
-    match_flag     = WarpMatchLocalIDPairSync(candidate_local_id_[0], cached_local_id);
+    match_flag     = WarpMatchLocalIDPairSync(candidate_local_id_, cached_local_id);
     int from_lane  = -1;
     bool has_match = (cached_local_id >= 0 && match_flag != 0);
     if (has_match) from_lane = __ffs(match_flag) - 1;
     unsigned int can_update_mask   = __ballot_sync(0xFFFFFFFF, !has_match);
     unsigned int lower_thread_mask = (1U << threadIdx.x) - 1;
     int updatable_cache_line_rank  = !has_match ? __popc(can_update_mask & lower_thread_mask) : -1;
-    unsigned int new_match_flag = WarpMatchLocalIDPairSync(cached_local_id, candidate_local_id_[0]);
+    unsigned int new_match_flag    = WarpMatchLocalIDPairSync(cached_local_id, candidate_local_id_);
     // printf("tid=%d, cached_local_id=%d, candidate_local_id_=%d, new_match_flag=%x\n",
     //        threadIdx.x,
     //        cached_local_id,
-    //        candidate_local_id_[0],
+    //        candidate_local_id_,
     //        new_match_flag);
-    bool new_need_slot              = (candidate_local_id_[0] >= 0 && new_match_flag == 0);
+    bool new_need_slot              = (candidate_local_id_ >= 0 && new_match_flag == 0);
     unsigned int need_new_slot_mask = __ballot_sync(0xFFFFFFFF, new_need_slot);
     int insert_data_rank = new_need_slot ? __popc(need_new_slot_mask & lower_thread_mask) : -1;
     // printf("tid=%d, updatable_cache_line_rank=%d, insert_data_rank=%d\n", threadIdx.x,
@@ -313,8 +291,8 @@ class CacheSetUpdater {
       from_lane = __ffs(rank_match_flag) - 1;
     }
     int src_lane_idx      = from_lane >= 0 ? from_lane : 0;
-    int64_t new_lfu_count = __shfl_sync(0xFFFFFFFF, candidate_lfu_count_[0], src_lane_idx, 32);
-    int new_local_id      = __shfl_sync(0xFFFFFFFF, candidate_local_id_[0], src_lane_idx, 32);
+    int64_t new_lfu_count = __shfl_sync(0xFFFFFFFF, candidate_lfu_count_, src_lane_idx, 32);
+    int new_local_id      = __shfl_sync(0xFFFFFFFF, candidate_local_id_, src_lane_idx, 32);
     if (from_lane == -1) {
       new_local_id  = -1;
       new_lfu_count = 0;
@@ -323,7 +301,7 @@ class CacheSetUpdater {
     // new_lfu_count);
     if (NeedOutputLoadIDs && need_load_to_cache_ids != nullptr) {
       int new_cached_lid = -1;
-      if (new_need_slot) { new_cached_lid = candidate_local_id_[0]; }
+      if (new_need_slot) { new_cached_lid = candidate_local_id_; }
       unsigned int load_cache_mask = __ballot_sync(0xFFFFFFFF, new_cached_lid >= 0);
       int output_idx               = __popc(load_cache_mask & ((1 << threadIdx.x) - 1));
       int total_load_count         = __popc(load_cache_mask);
@@ -359,41 +337,51 @@ class CacheSetUpdater {
   }
 
  private:
-  int64_t candidate_lfu_count_[kTopKRegisterCount];
-  int candidate_local_id_[kTopKRegisterCount];
-  template <int StrideIdx, bool IncCounter = true>
+  int64_t candidate_lfu_count_;
+  int candidate_local_id_;
+  template <bool IncCounter = true>
   __device__ __forceinline__ int FillCandidate(const NodeIDT* gids,
                                                const int* inc_freq_count,
                                                int64_t* cache_set_coverage_counter,
-                                               int& base_idx,
-                                               int& valid_count,
                                                int64_t cache_set_start_id,
                                                int id_count,
                                                TempStorage& temp_storage,
                                                int cached_local_id)
   {
-    int const idx = base_idx + threadIdx.x;
-    valid_count += min(kCacheSetSize, max(0, id_count - base_idx));
-    int local_id = -1;
-    if (idx < id_count) {
-      local_id                        = gids != nullptr ? gids[idx] - cache_set_start_id : idx;
-      candidate_lfu_count_[StrideIdx] = cache_set_coverage_counter[local_id];
-      if (IncCounter) {
-        int id_inc_count = inc_freq_count != nullptr ? inc_freq_count[idx] : 1;
-        candidate_lfu_count_[StrideIdx] += id_inc_count;
-        cache_set_coverage_counter[local_id] = candidate_lfu_count_[StrideIdx];
+
+    warp_bq_t warp_queue(kCacheSetSize);
+    const int per_thread_lim = id_count + raft::laneId();
+
+    int has_local_id_count = 0;
+    for (int idx = threadIdx.x; idx < per_thread_lim; idx += BLOCK_SIZE) {
+      int local_id            = -1;
+      int64_t candidate_lfu_count = -1;
+      int candidate_local_id  = -1;
+      if (idx < id_count) {
+        local_id            = gids != nullptr ? gids[idx] - cache_set_start_id : idx;
+        candidate_lfu_count = cache_set_coverage_counter[local_id];
+        if (IncCounter) {
+          int id_inc_count = inc_freq_count != nullptr ? inc_freq_count[idx] : 1;
+          candidate_lfu_count += id_inc_count;
+          cache_set_coverage_counter[local_id] = candidate_lfu_count;
+        }
+        candidate_local_id = local_id;
       }
-      candidate_local_id_[StrideIdx] = local_id;
+      unsigned int local_id_match_mask = WarpMatchLocalIDPairSync(local_id, cached_local_id);
+      has_local_id_count += ((cached_local_id != -1) ? __popc(local_id_match_mask) : 0);
+      warp_queue.add(candidate_lfu_count, candidate_local_id);
     }
-    unsigned int local_id_match_mask = WarpMatchLocalIDPairSync(local_id, cached_local_id);
-    int has_local_id_count           = (cached_local_id != -1) ? __popc(local_id_match_mask) : 0;
-    if (StrideIdx == kTopKRegisterCount - 1) {
-      BlockTopK(temp_storage)
-        .TopKToStriped(
-          candidate_lfu_count_, candidate_local_id_, min(kCacheSetSize, valid_count), valid_count);
-      valid_count = min(valid_count, kCacheSetSize);
+
+    warp_queue.done();
+    warp_queue.store(temp_storage.store_keys, temp_storage.store_values);
+    __syncthreads();
+    if (threadIdx.x < kCacheSetSize) {
+      candidate_lfu_count_ = temp_storage.store_keys[threadIdx.x];
+      candidate_local_id_  = temp_storage.store_values[threadIdx.x];
     }
-    base_idx += kCacheSetSize;
+    __syncthreads();
+
+
     return has_local_id_count;
   }
 };

From 4be6fb937d9d3ae9d32334ac1e241fddf6012c67 Mon Sep 17 00:00:00 2001
From: Chuang Zhu <chuangz@nvidia.com>
Date: Wed, 9 Aug 2023 15:09:21 +0800
Subject: [PATCH 5/6] Remove block_topk_with_raft.cuh

---
 .../wholegraph_ops/block_topk_with_raft.cuh   | 109 ------------------
 1 file changed, 109 deletions(-)
 delete mode 100644 cpp/src/wholegraph_ops/block_topk_with_raft.cuh

diff --git a/cpp/src/wholegraph_ops/block_topk_with_raft.cuh b/cpp/src/wholegraph_ops/block_topk_with_raft.cuh
deleted file mode 100644
index db5a436b5..000000000
--- a/cpp/src/wholegraph_ops/block_topk_with_raft.cuh
+++ /dev/null
@@ -1,109 +0,0 @@
-#pragma once
-#include <raft/matrix/detail/select_k-inl.cuh>
-namespace wholegraph_ops {
-template <typename T, typename IdxT>
-constexpr auto calc_smem_size_for_block_wide(int num_of_warp, int k) -> int
-{
-  return raft::Pow2<256>::roundUp(raft::ceildiv(num_of_warp, 2) * sizeof(T) * k) +
-         raft::ceildiv(num_of_warp, 2) * sizeof(IdxT) * k;
-}
-
-template <typename KeyT,
-          int BLOCK_SIZE,
-          int ITEMS_PER_THREAD,
-          int MAXK,
-          bool ASCENDING  = false,
-          typename ValueT = cub::NullType,
-          template <int, bool, typename, typename> class WarpSortClassT =
-            raft::matrix::detail::select::warpsort::warp_sort_distributed_ext>
-class BlockTopkRaftWarpSort {
-  static_assert(MAXK <= raft::matrix::detail::select::warpsort::kMaxCapacity,
-                "MAXK should be smaller than warpsort::kMaxCapacity");
-  static_assert(MAXK >= 1 && raft::is_a_power_of_two(MAXK),
-                "MAXK should >=1 and is a power of two ");
-
-  using bq_t = raft::matrix::detail::select::warpsort::
-    block_sort<WarpSortClassT, MAXK, ASCENDING, KeyT, ValueT>;
-
-  static constexpr int WARP_SIZE = 32;
-  static constexpr int CAL_SMEM_SIZE =
-    calc_smem_size_for_block_wide<KeyT, ValueT>(BLOCK_SIZE / WARP_SIZE, MAXK);
-  static constexpr int SMEM_REQUIRED = bq_t::queue_t::mem_required(BLOCK_SIZE);
-  struct _TempStorage {
-    union {
-      __align__(256) uint8_t smem_buf_bytes0[CAL_SMEM_SIZE];
-      __align__(256) uint8_t smem_buf_bytes1[SMEM_REQUIRED];
-      struct {
-        KeyT store_keys[MAXK];
-        ValueT store_values[MAXK];
-      };
-    };
-  };
-
- public:
-  struct TempStorage : cub::Uninitialized<_TempStorage> {};
-
-  __device__ __forceinline__ BlockTopkRaftWarpSort(TempStorage& temp_storage)
-    : temp_storage_{temp_storage.Alias()}, tid_(threadIdx.x){};
-
-  __device__ __forceinline__ void TopKToStriped(KeyT (&keys)[ITEMS_PER_THREAD],
-                                                ValueT (&values)[ITEMS_PER_THREAD],
-                                                const int k,
-                                                const int valid_count)
-  {
-    bq_t queue(k, temp_storage_.smem_buf_bytes1);
-
-#pragma unroll
-    for (int i = 0; i < ITEMS_PER_THREAD; i++) {
-      KeyT key = (i * BLOCK_SIZE + tid_) < valid_count
-                   ? keys[i]
-                   : WarpSortClassT<MAXK, ASCENDING, KeyT, ValueT>::kDummy;
-      queue.add(key, values[i]);
-    }
-    queue.done(temp_storage_.smem_buf_bytes0);
-    __syncthreads();
-    queue.store(temp_storage_.store_keys, temp_storage_.store_values);
-    __syncthreads();
-#pragma unroll
-    for (int i = 0; i < ITEMS_PER_THREAD; i++) {
-      int id = i * BLOCK_SIZE + tid_;
-      if (id < k) {
-        keys[i]   = temp_storage_.store_keys[id];
-        values[i] = temp_storage_.store_values[id];
-      }
-    }
-  }
-
-  __device__ __forceinline__ void TopKToStriped(KeyT (&keys)[ITEMS_PER_THREAD],
-
-                                                const int k,
-                                                const int valid_count)
-  {
-    bq_t queue(k, temp_storage_.smem_buf_bytes1);
-
-#pragma unroll
-    for (int i = 0; i < ITEMS_PER_THREAD; i++) {
-      KeyT key = (i * BLOCK_SIZE + tid_) < valid_count
-                   ? keys[i]
-                   : WarpSortClassT<MAXK, ASCENDING, KeyT, ValueT>::kDummy;
-      queue.add(key, i);
-    }
-    queue.done(temp_storage_.smem_buf_bytes0);
-    __syncthreads();
-
-    queue.store(temp_storage_.store_keys, temp_storage_.store_values);
-    __syncthreads();
-
-#pragma unroll
-    for (int i = 0; i < ITEMS_PER_THREAD; i++) {
-      int id = i * BLOCK_SIZE + tid_;
-      if (id < k) { keys[i] = temp_storage_.store_keys[id]; }
-    }
-  }
-
- private:
-  _TempStorage& temp_storage_;
-  int tid_;
-};
-
-};  // namespace wholegraph_ops

From 09558094e641e996b5c45be3e571d67de18ca715 Mon Sep 17 00:00:00 2001
From: Chuang Zhu <chuangz@nvidia.com>
Date: Tue, 15 Aug 2023 13:42:43 +0000
Subject: [PATCH 6/6] Apply code style (formatting-only) fixes

---
 .../functions/embedding_cache_func.cuh        | 12 +++------
 ...ighted_sample_without_replacement_tests.cu |  2 +-
 ...aph_weighted_sample_without_replacement.py | 27 ++++++++++---------
 3 files changed, 19 insertions(+), 22 deletions(-)

diff --git a/cpp/src/wholememory_ops/functions/embedding_cache_func.cuh b/cpp/src/wholememory_ops/functions/embedding_cache_func.cuh
index c46bf6e65..e3589285f 100644
--- a/cpp/src/wholememory_ops/functions/embedding_cache_func.cuh
+++ b/cpp/src/wholememory_ops/functions/embedding_cache_func.cuh
@@ -151,7 +151,6 @@ class CacheLineInfo {
   uint32_t lfu_count_;
 };
 
-
 template <typename NodeIDT>
 class CacheSetUpdater {
  public:
@@ -160,13 +159,12 @@ class CacheSetUpdater {
   static constexpr int kScaledCounterBits = 14;
 
  private:
-
   using warp_bq_t =
     raft::matrix::detail::select::warpsort::warp_sort_immediate<kCacheSetSize, false, int64_t, int>;
 
   static constexpr int WARP_SIZE  = 32;
   static constexpr int BLOCK_SIZE = kCacheSetSize;
-  static_assert(kCacheSetSize == WARP_SIZE,"only support CacheSetSize==32,and BLOCK_SIZE==32\n");
+  static_assert(kCacheSetSize == WARP_SIZE, "only support CacheSetSize==32,and BLOCK_SIZE==32\n");
 
  public:
   struct TempStorage {
@@ -242,7 +240,7 @@ class CacheSetUpdater {
     //        candidate_local_id_,
     //        has_local_id_count);
     int64_t candidate_lfu_count0 = -1;
-    int candidate_local_id0  = -1;
+    int candidate_local_id0      = -1;
     unsigned int match_flag;
     // match_flag = WarpMatchLocalIDPairSync(candidate_local_id_[0], cached_local_id);
     int64_t estimated_lfu_count = cache_line_info.LfuCountSync();
@@ -348,15 +346,14 @@ class CacheSetUpdater {
                                                TempStorage& temp_storage,
                                                int cached_local_id)
   {
-
     warp_bq_t warp_queue(kCacheSetSize);
     const int per_thread_lim = id_count + raft::laneId();
 
     int has_local_id_count = 0;
     for (int idx = threadIdx.x; idx < per_thread_lim; idx += BLOCK_SIZE) {
-      int local_id            = -1;
+      int local_id                = -1;
       int64_t candidate_lfu_count = -1;
-      int candidate_local_id  = -1;
+      int candidate_local_id      = -1;
       if (idx < id_count) {
         local_id            = gids != nullptr ? gids[idx] - cache_set_start_id : idx;
         candidate_lfu_count = cache_set_coverage_counter[local_id];
@@ -381,7 +378,6 @@ class CacheSetUpdater {
     }
     __syncthreads();
 
-
     return has_local_id_count;
   }
 };
diff --git a/cpp/tests/wholegraph_ops/wholegraph_csr_weighted_sample_without_replacement_tests.cu b/cpp/tests/wholegraph_ops/wholegraph_csr_weighted_sample_without_replacement_tests.cu
index fa8cd4f10..eac1723af 100644
--- a/cpp/tests/wholegraph_ops/wholegraph_csr_weighted_sample_without_replacement_tests.cu
+++ b/cpp/tests/wholegraph_ops/wholegraph_csr_weighted_sample_without_replacement_tests.cu
@@ -446,7 +446,7 @@ INSTANTIATE_TEST_SUITE_P(WholeGraphCSRWeightedSampleWithoutReplacementOpTests,
                                              .set_center_node_count(35)
                                              .set_graph_node_count(23289)
                                              .set_graph_edge_couont(689403),
-                                              WholeGraphCSRWeightedSampleWithoutReplacementTestParam()
+                                           WholeGraphCSRWeightedSampleWithoutReplacementTestParam()
                                              .set_memory_type(WHOLEMEMORY_MT_CONTINUOUS)
                                              .set_max_sample_count(300)
                                              .set_center_node_count(256)
diff --git a/python/pylibwholegraph/pylibwholegraph/tests/wholegraph_torch/ops/test_wholegraph_weighted_sample_without_replacement.py b/python/pylibwholegraph/pylibwholegraph/tests/wholegraph_torch/ops/test_wholegraph_weighted_sample_without_replacement.py
index 0f53044bd..138163a87 100644
--- a/python/pylibwholegraph/pylibwholegraph/tests/wholegraph_torch/ops/test_wholegraph_weighted_sample_without_replacement.py
+++ b/python/pylibwholegraph/pylibwholegraph/tests/wholegraph_torch/ops/test_wholegraph_weighted_sample_without_replacement.py
@@ -46,8 +46,7 @@ def host_weighted_sample_without_replacement_func(
     output_center_localid_tensor = torch.empty((total_sample_count,), dtype=torch.int32)
     output_edge_gid_tensor = torch.empty((total_sample_count,), dtype=torch.int64)
     center_nodes_count = center_nodes.size(0)
-    block_size = 128 if max_sample_count <=256 else 256
-
+    block_size = 128 if max_sample_count <= 256 else 256
 
     for i in range(center_nodes_count):
         node_id = center_nodes[i]
@@ -66,23 +65,25 @@ def host_weighted_sample_without_replacement_func(
             edge_weight_corresponding_ids = torch.tensor([], dtype=col_id_dtype)
             for j in range(block_size):
                 local_gidx = gidx + j
-                local_edge_weights = torch.tensor( [],dtype=csr_weight_dtype
-                )
+                local_edge_weights = torch.tensor([], dtype=csr_weight_dtype)
                 generated_edge_weight_count = 0
-                for id in range(j,neighbor_count,block_size):
+                for id in range(j, neighbor_count, block_size):
                     local_edge_weights = torch.cat(
-                    (
-                        local_edge_weights,
-                         torch.tensor([host_csr_weight_ptr[start + id]], dtype=csr_weight_dtype),
-                     )
+                        (
+                            local_edge_weights,
+                            torch.tensor(
+                                [host_csr_weight_ptr[start + id]],
+                                dtype=csr_weight_dtype,
+                            ),
+                        )
                     )
                     generated_edge_weight_count += 1
                     edge_weight_corresponding_ids = torch.cat(
-                            (
-                                edge_weight_corresponding_ids,
-                                torch.tensor([id], dtype=col_id_dtype),
-                            )
+                        (
+                            edge_weight_corresponding_ids,
+                            torch.tensor([id], dtype=col_id_dtype),
                         )
+                    )
                 random_values = (
                     wg_ops.generate_exponential_distribution_negative_float_cpu(
                         random_seed, local_gidx, generated_edge_weight_count