From d4213757a79dbcfe6c7786663d00c3f69fcb1bb5 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 23 Feb 2023 13:52:40 +0100
Subject: [PATCH 01/25] Add a method to reconstruct the compressed index data

---
 .../raft/neighbors/detail/ivf_pq_build.cuh    | 285 +++++++++++++++---
 cpp/test/neighbors/ann_ivf_pq.cuh             |  67 +++-
 2 files changed, 311 insertions(+), 41 deletions(-)
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
index bf3014568a..e4ac26f4c0 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
@@ -557,6 +557,177 @@ void train_per_cluster(raft::device_resources const& handle,
   transpose_pq_centers(handle, index, pq_centers_tmp.data());
 }
 
+/**
+ * Decode a lvl-2 pq-encoded vector in the given list (cluster).
+ * One vector per thread.
+ * NB: this function only decodes the PQ (second level) enconding; to get the approximation of the
+ * original vector, you need to add the cluster centroid and apply the inverse matrix transform to
+ * the result of this function.
+ *
+ * @tparam PqBits
+ *
+ * @param[out] out_vector the destination for the decoded vector (one-per-thread).
+ * @param[in] in_list_data the encoded cluster data.
+ * @param[in] pq_centers the codebook
+ * @param[in] codebook_kind
+ * @param[in] in_ix in-cluster index of the vector to be decoded (one-per-thread).
+ * @param[in] cluster_ix label/id of the cluster (one-per-thread).
+ */
+template <uint32_t PqBits>
+__device__ void reconstruct_vector(
+  device_vector_view<float, uint32_t, row_major> out_vector,
+  device_mdspan<const uint8_t, list_spec<uint32_t>::list_extents, row_major> in_list_data,
+  device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers,
+  codebook_gen codebook_kind,
+  uint32_t in_ix,
+  uint32_t cluster_ix)
+{
+  using group_align         = Pow2<kIndexGroupSize>;
+  const uint32_t group_ix   = group_align::div(in_ix);
+  const uint32_t ingroup_ix = group_align::mod(in_ix);
+  const uint32_t pq_len     = pq_centers.extent(1);
+  const uint32_t pq_dim     = out_vector.extent(0) / pq_len;
+
+  using layout_t            = typename decltype(out_vector)::layout_type;
+  using accessor_t          = typename decltype(out_vector)::accessor_type;
+  auto reinterpreted_vector = mdspan<float, extent_2d<uint32_t>, layout_t, accessor_t>(
+    out_vector.data_handle(), extent_2d<uint32_t>{pq_dim, pq_len});
+
+  pq_vec_t code_chunk;
+  bitfield_view_t<PqBits> code_view{reinterpret_cast<uint8_t*>(&code_chunk)};
+  constexpr uint32_t kChunkSize = (sizeof(pq_vec_t) * 8u) / PqBits;
+  for (uint32_t j = 0, i = 0; j < pq_dim; i++) {
+    // read the chunk
+    code_chunk = *reinterpret_cast<const pq_vec_t*>(&in_list_data(group_ix, i, ingroup_ix, 0));
+    // read the codes, one/pq_dim at a time
+#pragma unroll
+    for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) {
+      uint32_t partition_ix;
+      switch (codebook_kind) {
+        case codebook_gen::PER_CLUSTER: {
+          partition_ix = cluster_ix;
+        } break;
+        case codebook_gen::PER_SUBSPACE: {
+          partition_ix = j;
+        } break;
+        default: __builtin_unreachable();
+      }
+      uint8_t code = code_view[k];
+      // read a piece of the reconstructed vector
+      for (uint32_t l = 0; l < pq_len; l++) {
+        reinterpreted_vector(j, l) = pq_centers(partition_ix, l, code);
+      }
+    }
+  }
+}
+
+template <uint32_t BlockSize, uint32_t PqBits>
+__launch_bounds__(BlockSize) __global__ void reconstruct_list_data_kernel(
+  device_matrix_view<float, uint32_t, row_major> out_vectors,
+  device_vector_view<const uint8_t* const, uint32_t, row_major> data_ptrs,
+  device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers,
+  device_matrix_view<const float, uint32_t, row_major> centers_rot,
+  codebook_gen codebook_kind,
+  uint32_t cluster_ix,
+  uint32_t n_skip)
+{
+  const auto out_dim = out_vectors.extent(1);
+  using layout_t     = typename decltype(out_vectors)::layout_type;
+  using accessor_t   = typename decltype(out_vectors)::accessor_type;
+
+  const uint32_t pq_dim = out_dim / pq_centers.extent(1);
+  auto pq_extents =
+    list_spec<uint32_t>{PqBits, pq_dim, true}.make_list_extents(out_vectors.extent(0) + n_skip + 1);
+  auto pq_dataset =
+    make_mdspan<const uint8_t, uint32_t, row_major, false, true>(data_ptrs[cluster_ix], pq_extents);
+
+  for (uint32_t ix = threadIdx.x + BlockSize * blockIdx.x; ix < out_vectors.extent(0);
+       ix += BlockSize) {
+    auto one_vector = mdspan<float, extent_1d<uint32_t>, layout_t, accessor_t>(
+      &out_vectors(ix, 0), extent_1d<uint32_t>{out_vectors.extent(1)});
+    reconstruct_vector<PqBits>(
+      one_vector, pq_dataset, pq_centers, codebook_kind, ix + n_skip, cluster_ix);
+    for (uint32_t j = 0; j < out_dim; j++) {
+      one_vector(j) += centers_rot(cluster_ix, j);
+    }
+  }
+}
+
+template <typename T, typename IdxT>
+void reconstruct_list_data(raft::device_resources const& res,
+                           const index<IdxT>& index,
+                           device_matrix_view<T, uint32_t, row_major> out_vectors,
+                           uint32_t label,
+                           uint32_t n_skip)
+{
+  auto n_rows = out_vectors.extent(0);
+  if (n_rows == 0) { return; }
+  // sic! I'm using the upper bound `list.size` instead of exact `list_sizes(label)`
+  // to avoid an extra device-host data copy and the stream sync.
+  RAFT_EXPECTS(n_skip + n_rows <= index.lists()[label]->size.load(),
+               "n_skip + output size must be not bigger than the cluster size.");
+
+  auto tmp = make_device_mdarray<float>(
+    res, res.get_workspace_resource(), make_extents<uint32_t>(n_rows, index.rot_dim()));
+
+  constexpr uint32_t kBlockSize = 256;
+  dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize), 1, 1);
+  dim3 threads(kBlockSize, 1, 1);
+  auto kernel = [](uint32_t pq_bits) {
+    switch (pq_bits) {
+      case 4: return reconstruct_list_data_kernel<kBlockSize, 4>;
+      case 5: return reconstruct_list_data_kernel<kBlockSize, 5>;
+      case 6: return reconstruct_list_data_kernel<kBlockSize, 6>;
+      case 7: return reconstruct_list_data_kernel<kBlockSize, 7>;
+      case 8: return reconstruct_list_data_kernel<kBlockSize, 8>;
+      default: RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 8]", pq_bits);
+    }
+  }(index.pq_bits());
+  kernel<<<blocks, threads, 0, res.get_stream()>>>(tmp.view(),
+                                                   index.data_ptrs(),
+                                                   index.pq_centers(),
+                                                   index.centers_rot(),
+                                                   index.codebook_kind(),
+                                                   label,
+                                                   n_skip);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+
+  float* out_float_ptr = nullptr;
+  rmm::device_uvector<float> out_float_buf(0, res.get_stream(), res.get_workspace_resource());
+  if constexpr (std::is_same_v<T, float>) {
+    out_float_ptr = out_vectors.data_handle();
+  } else {
+    out_float_buf.resize(size_t{n_rows} * size_t{index.dim()}, res.get_stream());
+    out_float_ptr = out_float_buf.data();
+  }
+  // Rotate the results back to the original space
+  float alpha = 1.0;
+  float beta  = 0.0;
+  linalg::gemm(res,
+               false,
+               false,
+               index.dim(),
+               n_rows,
+               index.rot_dim(),
+               &alpha,
+               index.rotation_matrix().data_handle(),
+               index.dim(),
+               tmp.data_handle(),
+               index.rot_dim(),
+               &beta,
+               out_float_ptr,
+               index.dim(),
+               res.get_stream());
+  // Transform the data to the original type, if necessary
+  if constexpr (!std::is_same_v<T, float>) {
+    linalg::map_k(out_vectors.data_handle(),
+                  out_float_buf.size(),
+                  utils::mapping<T>{},
+                  res.get_stream(),
+                  out_float_ptr);
+  }
+}
+
 /**
  * Compute the code: find the closest cluster in each pq_dim-subspace.
  *
@@ -625,6 +796,67 @@ __device__ auto compute_pq_code(
   return code;
 }
 
+/**
+ * Compute a PQ code for a single input vector per subwarp and write it into the
+ * appropriate cluster.
+ * Subwarp size here is the minimum between WarpSize and the codebook size.
+ *
+ * @tparam BlockSize
+ * @tparam PqBits
+ *
+ * @param[out] out_list_data an array of pointers to the database clusers.
+ * @param[in] in_vector input unencoded data, one-per-subwarp
+ * @param[in] pq_centers codebook
+ * @param[in] codebook_kind
+ * @param[in] out_ix in-cluster output index (where to write the encoded data), one-per-subwarp.
+ * @param[in] cluster_ix label/id of the cluster to fill, one-per-subwarp.
+ */
+template <uint32_t BlockSize, uint32_t PqBits>
+__device__ auto compute_and_write_pq_code(
+  device_mdspan<uint8_t, list_spec<uint32_t>::list_extents, row_major> out_list_data,
+  device_vector_view<const float, uint32_t, row_major> in_vector,
+  device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers,
+  codebook_gen codebook_kind,
+  uint32_t out_ix,
+  uint32_t cluster_ix)
+{
+  constexpr uint32_t kSubWarpSize = std::min<uint32_t>(WarpSize, 1u << PqBits);
+  using subwarp_align             = Pow2<kSubWarpSize>;
+  const uint32_t lane_id          = subwarp_align::mod(threadIdx.x);
+
+  using group_align         = Pow2<kIndexGroupSize>;
+  const uint32_t group_ix   = group_align::div(out_ix);
+  const uint32_t ingroup_ix = group_align::mod(out_ix);
+  const uint32_t pq_len     = pq_centers.extent(1);
+  const uint32_t pq_dim     = in_vector.extent(0) / pq_len;
+
+  using layout_t            = typename decltype(in_vector)::layout_type;
+  using accessor_t          = typename decltype(in_vector)::accessor_type;
+  auto reinterpreted_vector = mdspan<const float, extent_2d<uint32_t>, layout_t, accessor_t>(
+    in_vector.data_handle(), extent_2d<uint32_t>{pq_dim, pq_len});
+
+  __shared__ pq_vec_t codes[subwarp_align::div(BlockSize)];
+  pq_vec_t& code = codes[subwarp_align::div(threadIdx.x)];
+  bitfield_view_t<PqBits> out{reinterpret_cast<uint8_t*>(&code)};
+  constexpr uint32_t kChunkSize = (sizeof(pq_vec_t) * 8u) / PqBits;
+  for (uint32_t j = 0, i = 0; j < pq_dim; i++) {
+    // clear the chunk for writing
+    if (lane_id == 0) { code = pq_vec_t{}; }
+    // fill-in the values, one/pq_dim at a time
+#pragma unroll
+    for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) {
+      // find the label
+      auto l = compute_pq_code<kSubWarpSize>(
+        pq_centers, reinterpreted_vector, codebook_kind, j, cluster_ix);
+      if (lane_id == 0) { out[k] = l; }
+    }
+    // write the chunk into the dataset
+    if (lane_id == 0) {
+      *reinterpret_cast<pq_vec_t*>(&out_list_data(group_ix, i, ingroup_ix, 0)) = code;
+    }
+  }
+}
+
 template <uint32_t BlockSize, uint32_t PqBits, typename IdxT>
 __launch_bounds__(BlockSize) __global__ void process_and_fill_codes_kernel(
   device_matrix_view<const float, IdxT, row_major> new_vectors,
@@ -639,7 +871,7 @@ __launch_bounds__(BlockSize) __global__ void process_and_fill_codes_kernel(
   constexpr uint32_t kSubWarpSize = std::min<uint32_t>(WarpSize, 1u << PqBits);
   using subwarp_align             = Pow2<kSubWarpSize>;
   const uint32_t lane_id          = subwarp_align::mod(threadIdx.x);
-  const IdxT row_ix = subwarp_align::div(IdxT{threadIdx.x} + IdxT{blockDim.x} * IdxT{blockIdx.x});
+  const IdxT row_ix = subwarp_align::div(IdxT{threadIdx.x} + IdxT{BlockSize} * IdxT{blockIdx.x});
   if (row_ix >= new_vectors.extent(0)) { return; }
 
   const uint32_t cluster_ix = new_labels[row_ix];
@@ -647,7 +879,7 @@ __launch_bounds__(BlockSize) __global__ void process_and_fill_codes_kernel(
   if (lane_id == 0) { out_ix = atomicAdd(&list_sizes(cluster_ix), 1); }
   out_ix = shfl(out_ix, 0, kSubWarpSize);
 
-  // write the label
+  // write the label  (one record per subwarp)
   auto pq_indices = inds_ptrs(cluster_ix);
   if (lane_id == 0) {
     if (std::holds_alternative<IdxT>(src_offset_or_indices)) {
@@ -657,40 +889,21 @@ __launch_bounds__(BlockSize) __global__ void process_and_fill_codes_kernel(
     }
   }
 
-  // write the codes
-  using group_align         = Pow2<kIndexGroupSize>;
-  const uint32_t group_ix   = group_align::div(out_ix);
-  const uint32_t ingroup_ix = group_align::mod(out_ix);
-  const uint32_t pq_len     = pq_centers.extent(1);
-  const uint32_t pq_dim     = new_vectors.extent(1) / pq_len;
-
-  auto pq_extents = list_spec<uint32_t>{PqBits, pq_dim, true}.make_list_extents(out_ix + 1);
-  auto pq_extents_vectorized =
-    make_extents<uint32_t>(pq_extents.extent(0), pq_extents.extent(1), pq_extents.extent(2));
-  auto pq_dataset = make_mdspan<pq_vec_t, uint32_t, row_major, false, true>(
-    reinterpret_cast<pq_vec_t*>(data_ptrs[cluster_ix]), pq_extents_vectorized);
-
-  __shared__ pq_vec_t codes[subwarp_align::div(BlockSize)];
-  pq_vec_t& code = codes[subwarp_align::div(threadIdx.x)];
-  bitfield_view_t<PqBits> out{reinterpret_cast<uint8_t*>(&code)};
-  constexpr uint32_t kChunkSize = (sizeof(pq_vec_t) * 8u) / PqBits;
-  for (uint32_t j = 0, i = 0; j < pq_dim; i++) {
-    // clear the chunk for writing
-    if (lane_id == 0) { code = pq_vec_t{}; }
-    // fill-in the values, one/pq_dim at a time
-#pragma unroll
-    for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) {
-      // find the label
-      using layout_t   = typename decltype(new_vectors)::layout_type;
-      using accessor_t = typename decltype(new_vectors)::accessor_type;
-      auto one_vector  = mdspan<const float, extent_2d<uint32_t>, layout_t, accessor_t>(
-        &new_vectors(row_ix, 0), extent_2d<uint32_t>{pq_dim, pq_len});
-      auto l = compute_pq_code<kSubWarpSize>(pq_centers, one_vector, codebook_kind, j, cluster_ix);
-      if (lane_id == 0) { out[k] = l; }
-    }
-    // write the chunk into the dataset
-    if (lane_id == 0) { pq_dataset(group_ix, i, ingroup_ix) = code; }
-  }
+  // write the codes (one record per subwarp):
+  // 1. select input row
+  using layout_t    = typename decltype(new_vectors)::layout_type;
+  using accessor_t  = typename decltype(new_vectors)::accessor_type;
+  const auto in_dim = new_vectors.extent(1);
+  auto one_vector =
+    mdspan<const float, extent_1d<uint32_t>, layout_t, accessor_t>(&new_vectors(row_ix, 0), in_dim);
+  // 2. select output cluster
+  const uint32_t pq_dim = in_dim / pq_centers.extent(1);
+  auto pq_extents       = list_spec<uint32_t>{PqBits, pq_dim, true}.make_list_extents(out_ix + 1);
+  auto pq_dataset =
+    make_mdspan<uint8_t, uint32_t, row_major, false, true>(data_ptrs[cluster_ix], pq_extents);
+  // 3. compute and write the vector
+  compute_and_write_pq_code<BlockSize, PqBits>(
+    pq_dataset, one_vector, pq_centers, codebook_kind, out_ix, cluster_ix);
 }
 
 /**
diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh
index df295b8bcb..6ecdd8674c 100644
--- a/cpp/test/neighbors/ann_ivf_pq.cuh
+++ b/cpp/test/neighbors/ann_ivf_pq.cuh
@@ -22,6 +22,7 @@
 
 #include <raft/core/logger.hpp>
 #include <raft/distance/distance_types.hpp>
+#include <raft/matrix/gather.cuh>
 #include <raft/neighbors/ivf_pq.cuh>
 #include <raft/random/rng.cuh>
 #if defined RAFT_DISTANCE_COMPILED
@@ -139,7 +140,6 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
   {
   }
 
- protected:
   void gen_data()
   {
     database.resize(size_t{ps.num_db_vecs} * size_t{ps.dim}, stream_);
@@ -216,11 +216,70 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
     return ivf_pq::detail::deserialize<IdxT>(handle_, "ivf_pq_index");
   }
 
+  void check_reconstruction(const index<IdxT>& index,
+                            double compression_ratio,
+                            uint32_t label,
+                            uint32_t n_take,
+                            uint32_t n_skip)
+  {
+    auto rec_list = index.lists()[label];
+    auto dim      = index.dim();
+    n_take        = std::min<uint32_t>(n_take, rec_list->size.load());
+    n_skip        = std::min<uint32_t>(n_skip, rec_list->size.load() - n_take);
+
+    if (n_take == 0) { return; }
+
+    auto rec_data  = make_device_matrix<DataT>(handle_, n_take, dim);
+    auto orig_data = make_device_matrix<DataT>(handle_, n_take, dim);
+
+    rmm::mr::managed_memory_resource managed_memory;
+    auto dist =
+      make_device_mdarray<double>(handle_, &managed_memory, make_extents<uint32_t>(n_take));
+
+    ivf_pq::detail::reconstruct_list_data(handle_, index, rec_data.view(), label, n_skip);
+
+    matrix::gather(database.data(),
+                   IdxT{dim},
+                   IdxT{n_take},
+                   rec_list->indices.data_handle() + n_skip,
+                   IdxT{n_take},
+                   orig_data.data_handle(),
+                   stream_);
+
+    auto rec_data_view  = rec_data.view();
+    auto orig_data_view = orig_data.view();
+    linalg::map_offset(
+      handle_, dist.view(), [rec_data_view, orig_data_view, dim] __device__(uint32_t i) {
+        spatial::knn::detail::utils::mapping<float> f{};
+        double d = 0.0f;
+        for (uint32_t j = 0; j < dim; j++) {
+          double t = f(rec_data_view(i, j)) - f(orig_data_view(i, j));
+          d += t * t;
+        }
+        return sqrt(d / double(dim));
+      });
+    handle_.sync_stream();
+    for (uint32_t i = 0; i < n_take; i++) {
+      double d = dist(i);
+      // The theoretical estimate of the error is hard to come up with,
+      // the estimate below is based on experimentation + curse of dimensionality
+      ASSERT_LE(d, 0.04 * std::pow(2.0, compression_ratio))
+        << " (label = " << label << ", ix = " << (n_skip + i) << ")";
+    }
+  }
+
   template <typename BuildIndex>
   void run(BuildIndex build_index)
   {
     auto index = build_index();
 
+    double compression_ratio =
+      static_cast<double>(ps.dim * 8) / static_cast<double>(index.pq_dim() * index.pq_bits());
+
+    // check a small subset of data in a randomly chosen cluster to see if the data reconstruction
+    // works well.
+    check_reconstruction(index, compression_ratio, uint32_t(rand()) % index.n_lists(), 100, 7);
+
     size_t queries_size = ps.num_queries * ps.k;
     std::vector<IdxT> indices_ivf_pq(queries_size);
     std::vector<EvalT> distances_ivf_pq(queries_size);
@@ -244,11 +303,9 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
     // A very conservative lower bound on recall
     double min_recall =
       static_cast<double>(ps.search_params.n_probes) / static_cast<double>(ps.index_params.n_lists);
-    double low_precision_factor =
-      static_cast<double>(ps.dim * 8) / static_cast<double>(index.pq_dim() * index.pq_bits());
     // Using a heuristic to lower the required recall due to code-packing errors
     min_recall =
-      std::min(std::erfc(0.05 * low_precision_factor / std::max(min_recall, 0.5)), min_recall);
+      std::min(std::erfc(0.05 * compression_ratio / std::max(min_recall, 0.5)), min_recall);
     // Use explicit per-test min recall value if provided.
     min_recall = ps.min_recall.value_or(min_recall);
 
@@ -258,7 +315,7 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
                                 distances_ivf_pq,
                                 ps.num_queries,
                                 ps.k,
-                                0.0001 * low_precision_factor,
+                                0.0001 * compression_ratio,
                                 min_recall))
       << ps;
 

From c7b557419988155214b9cbd41fb854e0a2748821 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 23 Feb 2023 13:59:13 +0100
Subject: [PATCH 02/25] Fix a typo in the docs

---
 cpp/include/raft/neighbors/detail/ivf_pq_build.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
index e4ac26f4c0..90e6cbf668 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
@@ -560,7 +560,7 @@ void train_per_cluster(raft::device_resources const& handle,
 /**
  * Decode a lvl-2 pq-encoded vector in the given list (cluster).
  * One vector per thread.
- * NB: this function only decodes the PQ (second level) enconding; to get the approximation of the
+ * NB: this function only decodes the PQ (second level) encoding; to get the approximation of the
  * original vector, you need to add the cluster centroid and apply the inverse matrix transform to
  * the result of this function.
  *

From 087919f0b020982dbf9491063f72f7ed9e582909 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 23 Feb 2023 15:05:44 +0100
Subject: [PATCH 03/25] Relax the constraints a bit

---
 cpp/test/neighbors/ann_ivf_pq.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh
index 6ecdd8674c..d624c4a00a 100644
--- a/cpp/test/neighbors/ann_ivf_pq.cuh
+++ b/cpp/test/neighbors/ann_ivf_pq.cuh
@@ -263,7 +263,7 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
       double d = dist(i);
       // The theoretical estimate of the error is hard to come up with,
       // the estimate below is based on experimentation + curse of dimensionality
-      ASSERT_LE(d, 0.04 * std::pow(2.0, compression_ratio))
+      ASSERT_LE(d, 0.05 * std::pow(2.0, compression_ratio))
         << " (label = " << label << ", ix = " << (n_skip + i) << ")";
     }
   }

From b5e984486f96210b27a53fadbd752e91d8a0ac1b Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 23 Feb 2023 16:08:30 +0100
Subject: [PATCH 04/25] Add the public interface

---
 .../raft/neighbors/detail/ivf_pq_build.cuh    |  1 +
 cpp/include/raft/neighbors/ivf_pq.cuh         | 42 +++++++++++++++++++
 cpp/test/neighbors/ann_ivf_pq.cuh             |  2 +-
 3 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
index 90e6cbf668..96e93370e8 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
@@ -653,6 +653,7 @@ __launch_bounds__(BlockSize) __global__ void reconstruct_list_data_kernel(
   }
 }
 
+/** Decode the list data; see the public interface for the api and usage. */
 template <typename T, typename IdxT>
 void reconstruct_list_data(raft::device_resources const& res,
                            const index<IdxT>& index,
diff --git a/cpp/include/raft/neighbors/ivf_pq.cuh b/cpp/include/raft/neighbors/ivf_pq.cuh
index e2cc3c4728..9c85bc333f 100644
--- a/cpp/include/raft/neighbors/ivf_pq.cuh
+++ b/cpp/include/raft/neighbors/ivf_pq.cuh
@@ -75,6 +75,48 @@ auto build(raft::device_resources const& handle,
   return detail::build(handle, params, dataset, n_rows, dim);
 }
 
+/**
+ * @brief Decode `n_take` consecutive records of a single list (cluster) in the compressed index
+ * starting at given offset `n_skip`.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   // We will reconstruct the fourth cluster
+ *   uint32_t label = 3;
+ *   // Get the list size
+ *   uint32_t list_size = 0;
+ *   raft::copy(&list_size, index.list_sizes().data_handle() + label, 1, res.get_stream());
+ *   res.sync_stream();
+ *   // allocate the buffer for the output
+ *   auto decoded_vectors = raft::make_device_matrix<float>(res, list_size, index.dim());
+ *   // decode the whole list
+ *   ivf_pq::reconstruct_list_data(res, index, decoded_vectors.view(), label, 0);
+ * @endcode
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] res
+ * @param[in] index
+ * @param[out] out_vectors
+ *   the destination buffer [n_take, index.dim()].
+ *   The length `n_take` defines how many records to reconstruct,
+ *   it must be smaller than the list size.
+ * @param[in] label
+ *   The id of the list (cluster) to decode.
+ * @param[in] n_skip
+ *   How many records in the list to skip.
+ */
+template <typename T, typename IdxT>
+void reconstruct_list_data(raft::device_resources const& res,
+                           const index<IdxT>& index,
+                           device_matrix_view<T, uint32_t, row_major> out_vectors,
+                           uint32_t label,
+                           uint32_t n_skip)
+{
+  return detail::reconstruct_list_data(res, index, out_vectors, label, n_skip);
+}
+
 /**
  * @brief Build a new index containing the data of the original plus new extra vectors.
  *
diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh
index d624c4a00a..81eb266906 100644
--- a/cpp/test/neighbors/ann_ivf_pq.cuh
+++ b/cpp/test/neighbors/ann_ivf_pq.cuh
@@ -236,7 +236,7 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
     auto dist =
       make_device_mdarray<double>(handle_, &managed_memory, make_extents<uint32_t>(n_take));
 
-    ivf_pq::detail::reconstruct_list_data(handle_, index, rec_data.view(), label, n_skip);
+    ivf_pq::reconstruct_list_data(handle_, index, rec_data.view(), label, n_skip);
 
     matrix::gather(database.data(),
                    IdxT{dim},

From 2773a8607143c12a76bacc70cab42dca5a4532f1 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 15 Mar 2023 10:59:48 +0100
Subject: [PATCH 05/25] Fix the merge errors

---
 cpp/include/raft/neighbors/detail/ivf_pq_build.cuh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
index 4ab008c173..aa56f2baec 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
@@ -576,7 +576,7 @@ void train_per_cluster(raft::device_resources const& handle,
 template <uint32_t PqBits>
 __device__ void reconstruct_vector(
   device_vector_view<float, uint32_t, row_major> out_vector,
-  device_mdspan<const uint8_t, list_spec<uint32_t>::list_extents, row_major> in_list_data,
+  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> in_list_data,
   device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers,
   codebook_gen codebook_kind,
   uint32_t in_ix,
@@ -636,8 +636,8 @@ __launch_bounds__(BlockSize) __global__ void reconstruct_list_data_kernel(
   using accessor_t   = typename decltype(out_vectors)::accessor_type;
 
   const uint32_t pq_dim = out_dim / pq_centers.extent(1);
-  auto pq_extents =
-    list_spec<uint32_t>{PqBits, pq_dim, true}.make_list_extents(out_vectors.extent(0) + n_skip + 1);
+  auto pq_extents       = list_spec<uint32_t, uint32_t>{PqBits, pq_dim, true}.make_list_extents(
+    out_vectors.extent(0) + n_skip + 1);
   auto pq_dataset =
     make_mdspan<const uint8_t, uint32_t, row_major, false, true>(data_ptrs[cluster_ix], pq_extents);
 
@@ -814,7 +814,7 @@ __device__ auto compute_pq_code(
  */
 template <uint32_t BlockSize, uint32_t PqBits>
 __device__ auto compute_and_write_pq_code(
-  device_mdspan<uint8_t, list_spec<uint32_t>::list_extents, row_major> out_list_data,
+  device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> out_list_data,
   device_vector_view<const float, uint32_t, row_major> in_vector,
   device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers,
   codebook_gen codebook_kind,

From 82be40c7183553f0cab568eaaffe9a8d649e9410 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 15 Mar 2023 11:49:28 +0100
Subject: [PATCH 06/25] Add an option to reconstruct cluster data by in-cluster
 indices

---
 .../raft/neighbors/detail/ivf_pq_build.cuh    | 37 ++++++++------
 cpp/include/raft/neighbors/ivf_pq.cuh         | 50 +++++++++++++++++--
 2 files changed, 68 insertions(+), 19 deletions(-)

diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
index aa56f2baec..4c51423be4 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
@@ -625,19 +625,20 @@ template <uint32_t BlockSize, uint32_t PqBits>
 __launch_bounds__(BlockSize) __global__ void reconstruct_list_data_kernel(
   device_matrix_view<float, uint32_t, row_major> out_vectors,
   device_vector_view<const uint8_t* const, uint32_t, row_major> data_ptrs,
+  device_vector_view<const uint32_t, uint32_t, row_major> list_sizes,
   device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers,
   device_matrix_view<const float, uint32_t, row_major> centers_rot,
   codebook_gen codebook_kind,
   uint32_t cluster_ix,
-  uint32_t n_skip)
+  std::variant<uint32_t, const uint32_t*> offset_or_indices)
 {
   const auto out_dim = out_vectors.extent(1);
   using layout_t     = typename decltype(out_vectors)::layout_type;
   using accessor_t   = typename decltype(out_vectors)::accessor_type;
 
   const uint32_t pq_dim = out_dim / pq_centers.extent(1);
-  auto pq_extents       = list_spec<uint32_t, uint32_t>{PqBits, pq_dim, true}.make_list_extents(
-    out_vectors.extent(0) + n_skip + 1);
+  auto pq_extents =
+    list_spec<uint32_t, uint32_t>{PqBits, pq_dim, true}.make_list_extents(list_sizes[cluster_ix]);
   auto pq_dataset =
     make_mdspan<const uint8_t, uint32_t, row_major, false, true>(data_ptrs[cluster_ix], pq_extents);
 
@@ -645,8 +646,11 @@ __launch_bounds__(BlockSize) __global__ void reconstruct_list_data_kernel(
        ix += BlockSize) {
     auto one_vector = mdspan<float, extent_1d<uint32_t>, layout_t, accessor_t>(
       &out_vectors(ix, 0), extent_1d<uint32_t>{out_vectors.extent(1)});
+    const uint32_t src_ix = std::holds_alternative<uint32_t>(offset_or_indices)
+                              ? std::get<uint32_t>(offset_or_indices) + ix
+                              : std::get<const uint32_t*>(offset_or_indices)[ix];
     reconstruct_vector<PqBits>(
-      one_vector, pq_dataset, pq_centers, codebook_kind, ix + n_skip, cluster_ix);
+      one_vector, pq_dataset, pq_centers, codebook_kind, src_ix, cluster_ix);
     for (uint32_t j = 0; j < out_dim; j++) {
       one_vector(j) += centers_rot(cluster_ix, j);
     }
@@ -659,14 +663,17 @@ void reconstruct_list_data(raft::device_resources const& res,
                            const index<IdxT>& index,
                            device_matrix_view<T, uint32_t, row_major> out_vectors,
                            uint32_t label,
-                           uint32_t n_skip)
+                           std::variant<uint32_t, const uint32_t*> offset_or_indices)
 {
   auto n_rows = out_vectors.extent(0);
   if (n_rows == 0) { return; }
-  // sic! I'm using the upper bound `list.size` instead of exact `list_sizes(label)`
-  // to avoid an extra device-host data copy and the stream sync.
-  RAFT_EXPECTS(n_skip + n_rows <= index.lists()[label]->size.load(),
-               "n_skip + output size must be not bigger than the cluster size.");
+  if (std::holds_alternative<uint32_t>(offset_or_indices)) {
+    auto n_skip = std::get<uint32_t>(offset_or_indices);
+    // sic! I'm using the upper bound `list.size` instead of exact `list_sizes(label)`
+    // to avoid an extra device-host data copy and the stream sync.
+    RAFT_EXPECTS(n_skip + n_rows <= index.lists()[label]->size.load(),
+                 "offset + output size must be not bigger than the cluster size.");
+  }
 
   auto tmp = make_device_mdarray<float>(
     res, res.get_workspace_resource(), make_extents<uint32_t>(n_rows, index.rot_dim()));
@@ -686,11 +693,12 @@ void reconstruct_list_data(raft::device_resources const& res,
   }(index.pq_bits());
   kernel<<<blocks, threads, 0, res.get_stream()>>>(tmp.view(),
                                                    index.data_ptrs(),
+                                                   index.list_sizes(),
                                                    index.pq_centers(),
                                                    index.centers_rot(),
                                                    index.codebook_kind(),
                                                    label,
-                                                   n_skip);
+                                                   offset_or_indices);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 
   float* out_float_ptr = nullptr;
@@ -721,11 +729,10 @@ void reconstruct_list_data(raft::device_resources const& res,
                res.get_stream());
   // Transform the data to the original type, if necessary
   if constexpr (!std::is_same_v<T, float>) {
-    linalg::map_k(out_vectors.data_handle(),
-                  out_float_buf.size(),
-                  utils::mapping<T>{},
-                  res.get_stream(),
-                  out_float_ptr);
+    linalg::map(res,
+                out_vectors,
+                utils::mapping<T>{},
+                make_device_matrix_view<const float>(out_float_ptr, n_rows, index.dim()));
   }
 }
 
diff --git a/cpp/include/raft/neighbors/ivf_pq.cuh b/cpp/include/raft/neighbors/ivf_pq.cuh
index 1739a9450a..05e09f063b 100644
--- a/cpp/include/raft/neighbors/ivf_pq.cuh
+++ b/cpp/include/raft/neighbors/ivf_pq.cuh
@@ -213,7 +213,7 @@ auto build(raft::device_resources const& handle,
 
 /**
  * @brief Decode `n_take` consecutive records of a single list (cluster) in the compressed index
- * starting at given offset `n_skip`.
+ * starting at given `offset`.
  *
  * Usage example:
  * @code{.cpp}
@@ -240,7 +240,7 @@ auto build(raft::device_resources const& handle,
  *   it must be smaller than the list size.
  * @param[in] label
  *   The id of the list (cluster) to decode.
- * @param[in] n_skip
+ * @param[in] offset
  *   How many records in the list to skip.
  */
 template <typename T, typename IdxT>
@@ -248,9 +248,51 @@ void reconstruct_list_data(raft::device_resources const& res,
                            const index<IdxT>& index,
                            device_matrix_view<T, uint32_t, row_major> out_vectors,
                            uint32_t label,
-                           uint32_t n_skip)
+                           uint32_t offset)
+{
+  return detail::reconstruct_list_data(res, index, out_vectors, label, offset);
+}
+
+/**
+ * @brief Decode `n_take` consecutive records of a single list (cluster) in the compressed index
+ * starting at given offset `n_skip`.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   // We will reconstruct the fourth cluster
+ *   uint32_t label = 3;
+ *   // Get the list size
+ *   uint32_t list_size = 0;
+ *   raft::copy(&list_size, index.list_sizes().data_handle() + label, 1, res.get_stream());
+ *   res.sync_stream();
+ *   // allocate the buffer for the output
+ *   auto decoded_vectors = raft::make_device_matrix<float>(res, list_size, index.dim());
+ *   // decode the whole list
+ *   ivf_pq::reconstruct_list_data(res, index, decoded_vectors.view(), label, 0);
+ * @endcode
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] res
+ * @param[in] index
+ * @param[in] in_cluster_indices
+ *   The offsets of the selected indices within the cluster.
+ * @param[out] out_vectors
+ *   the destination buffer [n_take, index.dim()].
+ *   The length `n_take` defines how many records to reconstruct,
+ *   it must be smaller than the list size.
+ * @param[in] label
+ *   The id of the list (cluster) to decode.
+ */
+template <typename T, typename IdxT>
+void reconstruct_list_data(raft::device_resources const& res,
+                           const index<IdxT>& index,
+                           device_vector_view<const uint32_t> in_cluster_indices,
+                           device_matrix_view<T, uint32_t, row_major> out_vectors,
+                           uint32_t label)
 {
-  return detail::reconstruct_list_data(res, index, out_vectors, label, n_skip);
+  return detail::reconstruct_list_data(res, index, out_vectors, label, in_cluster_indices);
 }
 
 /**

From 5fc1c53fd85bbcfba9009407ecfdddd2af43c65f Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 15 Mar 2023 11:54:18 +0100
Subject: [PATCH 07/25] Update the docs for the new function

---
 cpp/include/raft/neighbors/ivf_pq.cuh | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/cpp/include/raft/neighbors/ivf_pq.cuh b/cpp/include/raft/neighbors/ivf_pq.cuh
index 05e09f063b..def127e24f 100644
--- a/cpp/include/raft/neighbors/ivf_pq.cuh
+++ b/cpp/include/raft/neighbors/ivf_pq.cuh
@@ -254,21 +254,23 @@ void reconstruct_list_data(raft::device_resources const& res,
 }
 
 /**
- * @brief Decode `n_take` consecutive records of a single list (cluster) in the compressed index
- * starting at given offset `n_skip`.
+ * @brief Decode a series of records of a single list (cluster) in the compressed index
+ * by their in-list offsets.
  *
  * Usage example:
  * @code{.cpp}
  *   // We will reconstruct the fourth cluster
  *   uint32_t label = 3;
- *   // Get the list size
- *   uint32_t list_size = 0;
- *   raft::copy(&list_size, index.list_sizes().data_handle() + label, 1, res.get_stream());
+ *   // Create the selection vector
+ *   auto selected_indices = raft::make_device_vector<uint32_t>(res, 4);
+ *   ... fill the indices ...
  *   res.sync_stream();
  *   // allocate the buffer for the output
- *   auto decoded_vectors = raft::make_device_matrix<float>(res, list_size, index.dim());
+ *   auto decoded_vectors = raft::make_device_matrix<float>(
+ *                             res, selected_indices.size(), index.dim());
  *   // decode the whole list
- *   ivf_pq::reconstruct_list_data(res, index, decoded_vectors.view(), label, 0);
+ *   ivf_pq::reconstruct_list_data(
+ *       res, index, selected_indices.view(), decoded_vectors.view(), label);
  * @endcode
  *
  * @tparam T data element type

From ddc446604901ff786a2fa17a85365f1221a5e7f6 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Wed, 15 Mar 2023 19:25:35 +0100
Subject: [PATCH 08/25] Detach reconstruction logic from the data reading logic

---
 .../raft/neighbors/detail/ivf_pq_build.cuh    | 173 ++++++++++++------
 1 file changed, 114 insertions(+), 59 deletions(-)

diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
index 4c51423be4..ef282b8b76 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
@@ -557,41 +557,99 @@ void train_per_cluster(raft::device_resources const& handle,
   transpose_pq_centers(handle, index, pq_centers_tmp.data());
 }
 
+struct reconstruct_vectors {
+  codebook_gen codebook_kind;
+  uint32_t cluster_ix;
+  uint32_t pq_len;
+  device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers;
+  device_mdspan<const float, extent_3d<uint32_t>, row_major> centers_rot;
+  device_mdspan<float, extent_3d<uint32_t>, row_major> out_vectors;
+
+  /**
+   * Create the functor to be passed to `run_on_list`.
+   *
+   * @param[out] out_vectors the destination for the decoded vectors.
+   * @param[in] pq_centers the codebook
+   * @param[in] centers_rot
+   * @param[in] codebook_kind
+   * @param[in] cluster_ix label/id of the cluster.
+   */
+  __device__ inline reconstruct_vectors(
+    device_matrix_view<float, uint32_t, row_major> out_vectors,
+    device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers,
+    device_matrix_view<const float, uint32_t, row_major> centers_rot,
+    codebook_gen codebook_kind,
+    uint32_t cluster_ix)
+    : codebook_kind{codebook_kind},
+      cluster_ix{cluster_ix},
+      pq_len{pq_centers.extent(1)},
+      pq_centers{pq_centers},
+      centers_rot{reinterpret_vectors(centers_rot, pq_centers)},
+      out_vectors{reinterpret_vectors(out_vectors, pq_centers)}
+  {
+  }
+
+  /**
+   * Decode j-th component of the i-th vector by its code and write it into a chunk of the output
+   * vectors (pq_len elements).
+   */
+  __device__ inline void operator()(uint8_t code, uint32_t i, uint32_t j)
+  {
+    uint32_t partition_ix;
+    switch (codebook_kind) {
+      case codebook_gen::PER_CLUSTER: {
+        partition_ix = cluster_ix;
+      } break;
+      case codebook_gen::PER_SUBSPACE: {
+        partition_ix = j;
+      } break;
+      default: __builtin_unreachable();
+    }
+    for (uint32_t k = 0; k < pq_len; k++) {
+      out_vectors(i, j, k) = pq_centers(partition_ix, k, code) + centers_rot(cluster_ix, j, k);
+    }
+  }
+
+ private:
+  template <typename T>
+  static __device__ auto reinterpret_vectors(
+    device_matrix_view<T, uint32_t, row_major> out_vectors,
+    device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers)
+    -> device_mdspan<T, extent_3d<uint32_t>, row_major>
+  {
+    const uint32_t pq_len = pq_centers.extent(1);
+    const uint32_t pq_dim = out_vectors.extent(1) / pq_len;
+    using layout_t        = typename decltype(out_vectors)::layout_type;
+    using accessor_t      = typename decltype(out_vectors)::accessor_type;
+    return mdspan<T, extent_3d<uint32_t>, layout_t, accessor_t>(
+      out_vectors.data_handle(), extent_3d<uint32_t>{out_vectors.extent(0), pq_dim, pq_len});
+  }
+};
+
 /**
- * Decode a lvl-2 pq-encoded vector in the given list (cluster).
- * One vector per thread.
- * NB: this function only decodes the PQ (second level) encoding; to get the approximation of the
- * original vector, you need to add the cluster centroid and apply the inverse matrix transform to
- * the result of this function.
+ * Process a single vector in a list.
  *
  * @tparam PqBits
+ * @tparam Action tells how to process a single vectors (e.g. reconstruct or just unpack)
  *
- * @param[out] out_vector the destination for the decoded vector (one-per-thread).
  * @param[in] in_list_data the encoded cluster data.
- * @param[in] pq_centers the codebook
- * @param[in] codebook_kind
  * @param[in] in_ix in-cluster index of the vector to be decoded (one-per-thread).
- * @param[in] cluster_ix label/id of the cluster (one-per-thread).
+ * @param[in] out_ix the output index passed to the action
+ * @param[in] pq_dim
+ * @param action a callable action to be invoked on each PQ code (component of the encoding)
+ *    type: void (uint8_t code, uint32_t out_ix, uint32_t j), where j = [0..pq_dim).
  */
-template <uint32_t PqBits>
-__device__ void reconstruct_vector(
-  device_vector_view<float, uint32_t, row_major> out_vector,
+template <uint32_t PqBits, typename Action>
+__device__ void run_on_vector(
   device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> in_list_data,
-  device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers,
-  codebook_gen codebook_kind,
   uint32_t in_ix,
-  uint32_t cluster_ix)
+  uint32_t out_ix,
+  uint32_t pq_dim,
+  Action action)
 {
   using group_align         = Pow2<kIndexGroupSize>;
   const uint32_t group_ix   = group_align::div(in_ix);
   const uint32_t ingroup_ix = group_align::mod(in_ix);
-  const uint32_t pq_len     = pq_centers.extent(1);
-  const uint32_t pq_dim     = out_vector.extent(0) / pq_len;
-
-  using layout_t            = typename decltype(out_vector)::layout_type;
-  using accessor_t          = typename decltype(out_vector)::accessor_type;
-  auto reinterpreted_vector = mdspan<float, extent_2d<uint32_t>, layout_t, accessor_t>(
-    out_vector.data_handle(), extent_2d<uint32_t>{pq_dim, pq_len});
 
   pq_vec_t code_chunk;
   bitfield_view_t<PqBits> code_view{reinterpret_cast<uint8_t*>(&code_chunk)};
@@ -602,25 +660,35 @@ __device__ void reconstruct_vector(
     // read the codes, one/pq_dim at a time
 #pragma unroll
     for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) {
-      uint32_t partition_ix;
-      switch (codebook_kind) {
-        case codebook_gen::PER_CLUSTER: {
-          partition_ix = cluster_ix;
-        } break;
-        case codebook_gen::PER_SUBSPACE: {
-          partition_ix = j;
-        } break;
-        default: __builtin_unreachable();
-      }
-      uint8_t code = code_view[k];
       // read a piece of the reconstructed vector
-      for (uint32_t l = 0; l < pq_len; l++) {
-        reinterpreted_vector(j, l) = pq_centers(partition_ix, l, code);
-      }
+      action(code_view[k], out_ix, j);
     }
   }
 }
 
+/** Process the given indices or a block of a single list (cluster). */
+template <uint32_t PqBits, typename Action>
+__device__ void run_on_list(device_vector_view<const uint8_t* const, uint32_t, row_major> data_ptrs,
+                            device_vector_view<const uint32_t, uint32_t, row_major> list_sizes,
+                            std::variant<uint32_t, const uint32_t*> offset_or_indices,
+                            uint32_t len,
+                            uint32_t cluster_ix,
+                            uint32_t pq_dim,
+                            Action action)
+{
+  auto pq_extents =
+    list_spec<uint32_t, uint32_t>{PqBits, pq_dim, true}.make_list_extents(list_sizes[cluster_ix]);
+  auto pq_dataset =
+    make_mdspan<const uint8_t, uint32_t, row_major, false, true>(data_ptrs[cluster_ix], pq_extents);
+
+  for (uint32_t ix = threadIdx.x + blockDim.x * blockIdx.x; ix < len; ix += blockDim.x) {
+    const uint32_t src_ix = std::holds_alternative<uint32_t>(offset_or_indices)
+                              ? std::get<uint32_t>(offset_or_indices) + ix
+                              : std::get<const uint32_t*>(offset_or_indices)[ix];
+    run_on_vector<PqBits>(pq_dataset, src_ix, ix, pq_dim, action);
+  }
+}
+
 template <uint32_t BlockSize, uint32_t PqBits>
 __launch_bounds__(BlockSize) __global__ void reconstruct_list_data_kernel(
   device_matrix_view<float, uint32_t, row_major> out_vectors,
@@ -632,29 +700,16 @@ __launch_bounds__(BlockSize) __global__ void reconstruct_list_data_kernel(
   uint32_t cluster_ix,
   std::variant<uint32_t, const uint32_t*> offset_or_indices)
 {
-  const auto out_dim = out_vectors.extent(1);
-  using layout_t     = typename decltype(out_vectors)::layout_type;
-  using accessor_t   = typename decltype(out_vectors)::accessor_type;
-
-  const uint32_t pq_dim = out_dim / pq_centers.extent(1);
-  auto pq_extents =
-    list_spec<uint32_t, uint32_t>{PqBits, pq_dim, true}.make_list_extents(list_sizes[cluster_ix]);
-  auto pq_dataset =
-    make_mdspan<const uint8_t, uint32_t, row_major, false, true>(data_ptrs[cluster_ix], pq_extents);
-
-  for (uint32_t ix = threadIdx.x + BlockSize * blockIdx.x; ix < out_vectors.extent(0);
-       ix += BlockSize) {
-    auto one_vector = mdspan<float, extent_1d<uint32_t>, layout_t, accessor_t>(
-      &out_vectors(ix, 0), extent_1d<uint32_t>{out_vectors.extent(1)});
-    const uint32_t src_ix = std::holds_alternative<uint32_t>(offset_or_indices)
-                              ? std::get<uint32_t>(offset_or_indices) + ix
-                              : std::get<const uint32_t*>(offset_or_indices)[ix];
-    reconstruct_vector<PqBits>(
-      one_vector, pq_dataset, pq_centers, codebook_kind, src_ix, cluster_ix);
-    for (uint32_t j = 0; j < out_dim; j++) {
-      one_vector(j) += centers_rot(cluster_ix, j);
-    }
-  }
+  const uint32_t pq_dim = out_vectors.extent(1) / pq_centers.extent(1);
+  auto reconstruct_action =
+    reconstruct_vectors{out_vectors, pq_centers, centers_rot, codebook_kind, cluster_ix};
+  run_on_list<PqBits>(data_ptrs,
+                      list_sizes,
+                      offset_or_indices,
+                      out_vectors.extent(0),
+                      cluster_ix,
+                      pq_dim,
+                      reconstruct_action);
 }
 
 /** Decode the list data; see the public interface for the api and usage. */

From 3a2c62235861e1e6aa7a1c64c4fa9e1e4036ae99 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 16 Mar 2023 07:23:46 +0100
Subject: [PATCH 09/25] Implement unpack_list_data

---
 .../raft/neighbors/detail/ivf_pq_build.cuh    | 209 ++++++++++++------
 1 file changed, 145 insertions(+), 64 deletions(-)

diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
index ef282b8b76..c19232a51d 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
@@ -557,6 +557,150 @@ void train_per_cluster(raft::device_resources const& handle,
   transpose_pq_centers(handle, index, pq_centers_tmp.data());
 }
 
+/**
+ * Process a single vector in a list.
+ *
+ * @tparam PqBits
+ * @tparam Action tells how to process a single vectors (e.g. reconstruct or just unpack)
+ *
+ * @param[in] in_list_data the encoded cluster data.
+ * @param[in] in_ix in-cluster index of the vector to be decoded (one-per-thread).
+ * @param[in] out_ix the output index passed to the action
+ * @param[in] pq_dim
+ * @param action a callable action to be invoked on each PQ code (component of the encoding)
+ *    type: void (uint8_t code, uint32_t out_ix, uint32_t j), where j = [0..pq_dim).
+ */
+template <uint32_t PqBits, typename Action>
+__device__ void run_on_vector(
+  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> in_list_data,
+  uint32_t in_ix,
+  uint32_t out_ix,
+  uint32_t pq_dim,
+  Action action)
+{
+  using group_align         = Pow2<kIndexGroupSize>;
+  const uint32_t group_ix   = group_align::div(in_ix);
+  const uint32_t ingroup_ix = group_align::mod(in_ix);
+
+  pq_vec_t code_chunk;
+  bitfield_view_t<PqBits> code_view{reinterpret_cast<uint8_t*>(&code_chunk)};
+  constexpr uint32_t kChunkSize = (sizeof(pq_vec_t) * 8u) / PqBits;
+  for (uint32_t j = 0, i = 0; j < pq_dim; i++) {
+    // read the chunk
+    code_chunk = *reinterpret_cast<const pq_vec_t*>(&in_list_data(group_ix, i, ingroup_ix, 0));
+    // read the codes, one/pq_dim at a time
+#pragma unroll
+    for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) {
+      // read a piece of the reconstructed vector
+      action(code_view[k], out_ix, j);
+    }
+  }
+}
+
+/** Process the given indices or a block of a single list (cluster). */
+template <uint32_t PqBits, typename Action>
+__device__ void run_on_list(device_vector_view<const uint8_t* const, uint32_t, row_major> data_ptrs,
+                            device_vector_view<const uint32_t, uint32_t, row_major> list_sizes,
+                            std::variant<uint32_t, const uint32_t*> offset_or_indices,
+                            uint32_t len,
+                            uint32_t cluster_ix,
+                            uint32_t pq_dim,
+                            Action action)
+{
+  auto pq_extents =
+    list_spec<uint32_t, uint32_t>{PqBits, pq_dim, true}.make_list_extents(list_sizes[cluster_ix]);
+  auto pq_dataset =
+    make_mdspan<const uint8_t, uint32_t, row_major, false, true>(data_ptrs[cluster_ix], pq_extents);
+
+  for (uint32_t ix = threadIdx.x + blockDim.x * blockIdx.x; ix < len; ix += blockDim.x) {
+    const uint32_t src_ix = std::holds_alternative<uint32_t>(offset_or_indices)
+                              ? std::get<uint32_t>(offset_or_indices) + ix
+                              : std::get<const uint32_t*>(offset_or_indices)[ix];
+    run_on_vector<PqBits>(pq_dataset, src_ix, ix, pq_dim, action);
+  }
+}
+
+/**
+ * A consumer for the `run_on_list` and `run_on_vec` that just flattens PQ codes
+ * one-per-byte. That is, independent of the code width (pq_bits), one code uses
+ * the whole byte, hence one vectors uses pq_dim bytes.
+ */
+struct unpack_codes {
+  device_matrix_view<uint8_t, uint32_t, row_major> out_codes;
+
+  /**
+   * Create a callable to be passed to `run_on_list`.
+   *
+   * @param[out] out_codes the destination for the read codes.
+   */
+  __device__ inline unpack_codes(device_matrix_view<uint8_t, uint32_t, row_major> out_codes)
+    : out_codes{out_codes}
+  {
+  }
+
+  /**  Write j-th component (code) of the i-th vector into the output array. */
+  __device__ inline void operator()(uint8_t code, uint32_t i, uint32_t j)
+  {
+    out_codes(i, j) = code;
+  }
+};
+
+template <uint32_t BlockSize, uint32_t PqBits>
+__launch_bounds__(BlockSize) __global__ void unpack_list_data_kernel(
+  device_matrix_view<uint8_t, uint32_t, row_major> out_codes,
+  device_vector_view<const uint8_t* const, uint32_t, row_major> data_ptrs,
+  device_vector_view<const uint32_t, uint32_t, row_major> list_sizes,
+  uint32_t cluster_ix,
+  std::variant<uint32_t, const uint32_t*> offset_or_indices)
+{
+  const uint32_t pq_dim = out_codes.extent(1);
+  auto unpack_action    = unpack_codes{out_codes};
+  run_on_list<PqBits>(data_ptrs,
+                      list_sizes,
+                      offset_or_indices,
+                      out_codes.extent(0),
+                      cluster_ix,
+                      pq_dim,
+                      unpack_action);
+}
+
+/** Decode the list data; see the public interface for the api and usage. */
+template <typename IdxT>
+void unpack_list_data(raft::device_resources const& res,
+                      const index<IdxT>& index,
+                      device_matrix_view<uint8_t, uint32_t, row_major> out_codes,
+                      uint32_t label,
+                      std::variant<uint32_t, const uint32_t*> offset_or_indices)
+{
+  auto n_rows = out_codes.extent(0);
+  if (n_rows == 0) { return; }
+  if (std::holds_alternative<uint32_t>(offset_or_indices)) {
+    auto n_skip = std::get<uint32_t>(offset_or_indices);
+    // sic! I'm using the upper bound `list.size` instead of exact `list_sizes(label)`
+    // to avoid an extra device-host data copy and the stream sync.
+    RAFT_EXPECTS(n_skip + n_rows <= index.lists()[label]->size.load(),
+                 "offset + output size must be not bigger than the cluster size.");
+  }
+
+  constexpr uint32_t kBlockSize = 256;
+  dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize), 1, 1);
+  dim3 threads(kBlockSize, 1, 1);
+  auto kernel = [](uint32_t pq_bits) {
+    switch (pq_bits) {
+      case 4: return unpack_list_data_kernel<kBlockSize, 4>;
+      case 5: return unpack_list_data_kernel<kBlockSize, 5>;
+      case 6: return unpack_list_data_kernel<kBlockSize, 6>;
+      case 7: return unpack_list_data_kernel<kBlockSize, 7>;
+      case 8: return unpack_list_data_kernel<kBlockSize, 8>;
+      default: RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 8]", pq_bits);
+    }
+  }(index.pq_bits());
+  kernel<<<blocks, threads, 0, res.get_stream()>>>(
+    out_codes, index.data_ptrs(), index.list_sizes(), label, offset_or_indices);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
+/** A consumer for the `run_on_list` and `run_on_vec` that approximates the original input data. */
 struct reconstruct_vectors {
   codebook_gen codebook_kind;
   uint32_t cluster_ix;
@@ -566,7 +710,7 @@ struct reconstruct_vectors {
   device_mdspan<float, extent_3d<uint32_t>, row_major> out_vectors;
 
   /**
-   * Create the functor to be passed to `run_on_list`.
+   * Create a callable to be passed to `run_on_list`.
    *
    * @param[out] out_vectors the destination for the decoded vectors.
    * @param[in] pq_centers the codebook
@@ -626,69 +770,6 @@ struct reconstruct_vectors {
   }
 };
 
-/**
- * Process a single vector in a list.
- *
- * @tparam PqBits
- * @tparam Action tells how to process a single vectors (e.g. reconstruct or just unpack)
- *
- * @param[in] in_list_data the encoded cluster data.
- * @param[in] in_ix in-cluster index of the vector to be decoded (one-per-thread).
- * @param[in] out_ix the output index passed to the action
- * @param[in] pq_dim
- * @param action a callable action to be invoked on each PQ code (component of the encoding)
- *    type: void (uint8_t code, uint32_t out_ix, uint32_t j), where j = [0..pq_dim).
- */
-template <uint32_t PqBits, typename Action>
-__device__ void run_on_vector(
-  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> in_list_data,
-  uint32_t in_ix,
-  uint32_t out_ix,
-  uint32_t pq_dim,
-  Action action)
-{
-  using group_align         = Pow2<kIndexGroupSize>;
-  const uint32_t group_ix   = group_align::div(in_ix);
-  const uint32_t ingroup_ix = group_align::mod(in_ix);
-
-  pq_vec_t code_chunk;
-  bitfield_view_t<PqBits> code_view{reinterpret_cast<uint8_t*>(&code_chunk)};
-  constexpr uint32_t kChunkSize = (sizeof(pq_vec_t) * 8u) / PqBits;
-  for (uint32_t j = 0, i = 0; j < pq_dim; i++) {
-    // read the chunk
-    code_chunk = *reinterpret_cast<const pq_vec_t*>(&in_list_data(group_ix, i, ingroup_ix, 0));
-    // read the codes, one/pq_dim at a time
-#pragma unroll
-    for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) {
-      // read a piece of the reconstructed vector
-      action(code_view[k], out_ix, j);
-    }
-  }
-}
-
-/** Process the given indices or a block of a single list (cluster). */
-template <uint32_t PqBits, typename Action>
-__device__ void run_on_list(device_vector_view<const uint8_t* const, uint32_t, row_major> data_ptrs,
-                            device_vector_view<const uint32_t, uint32_t, row_major> list_sizes,
-                            std::variant<uint32_t, const uint32_t*> offset_or_indices,
-                            uint32_t len,
-                            uint32_t cluster_ix,
-                            uint32_t pq_dim,
-                            Action action)
-{
-  auto pq_extents =
-    list_spec<uint32_t, uint32_t>{PqBits, pq_dim, true}.make_list_extents(list_sizes[cluster_ix]);
-  auto pq_dataset =
-    make_mdspan<const uint8_t, uint32_t, row_major, false, true>(data_ptrs[cluster_ix], pq_extents);
-
-  for (uint32_t ix = threadIdx.x + blockDim.x * blockIdx.x; ix < len; ix += blockDim.x) {
-    const uint32_t src_ix = std::holds_alternative<uint32_t>(offset_or_indices)
-                              ? std::get<uint32_t>(offset_or_indices) + ix
-                              : std::get<const uint32_t*>(offset_or_indices)[ix];
-    run_on_vector<PqBits>(pq_dataset, src_ix, ix, pq_dim, action);
-  }
-}
-
 template <uint32_t BlockSize, uint32_t PqBits>
 __launch_bounds__(BlockSize) __global__ void reconstruct_list_data_kernel(
   device_matrix_view<float, uint32_t, row_major> out_vectors,

From e7c35e791831e2c934c18c8c6048ceb835086e03 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 16 Mar 2023 07:27:01 +0100
Subject: [PATCH 10/25] Public interface for unpack_list_data

---
 cpp/include/raft/neighbors/ivf_pq.cuh | 83 +++++++++++++++++++++++++++
 1 file changed, 83 insertions(+)

diff --git a/cpp/include/raft/neighbors/ivf_pq.cuh b/cpp/include/raft/neighbors/ivf_pq.cuh
index def127e24f..c420123a01 100644
--- a/cpp/include/raft/neighbors/ivf_pq.cuh
+++ b/cpp/include/raft/neighbors/ivf_pq.cuh
@@ -211,6 +211,89 @@ auto build(raft::device_resources const& handle,
   return detail::build(handle, params, dataset, n_rows, dim);
 }
 
+/**
+ * @brief Unpack `n_take` consecutive records of a single list (cluster) in the compressed index
+ * starting at given `offset`, one code per byte (independently of pq_bits).
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   // We will unpack the fourth cluster
+ *   uint32_t label = 3;
+ *   // Get the list size
+ *   uint32_t list_size = 0;
+ *   raft::copy(&list_size, index.list_sizes().data_handle() + label, 1, res.get_stream());
+ *   res.sync_stream();
+ *   // allocate the buffer for the output
+ *   auto codes = raft::make_device_matrix<float>(res, list_size, index.pq_dim());
+ *   // unpack the whole list
+ *   ivf_pq::unpack_list_data(res, index, codes.view(), label, 0);
+ * @endcode
+ *
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] res
+ * @param[in] index
+ * @param[out] out_codes
+ *   the destination buffer [n_take, index.pq_dim()].
+ *   The length `n_take` defines how many records to unpack,
+ *   it must be smaller than the list size.
+ * @param[in] label
+ *   The id of the list (cluster) to decode.
+ * @param[in] offset
+ *   How many records in the list to skip.
+ */
+template <typename IdxT>
+void unpack_list_data(raft::device_resources const& res,
+                      const index<IdxT>& index,
+                      device_matrix_view<uint8_t, uint32_t, row_major> out_codes,
+                      uint32_t label,
+                      uint32_t offset)
+{
+  return detail::unpack_list_data(res, index, out_codes, label, offset);
+}
+
+/**
+ * @brief Unpack a series of records of a single list (cluster) in the compressed index
+ * by their in-list offsets, one code per byte (independently of pq_bits).
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   // We will unpack the fourth cluster
+ *   uint32_t label = 3;
+ *   // Create the selection vector
+ *   auto selected_indices = raft::make_device_vector<uint32_t>(res, 4);
+ *   ... fill the indices ...
+ *   res.sync_stream();
+ *   // allocate the buffer for the output
+ *   auto codes = raft::make_device_matrix<float>(res, selected_indices.size(), index.pq_dim());
+ *   // decode the whole list
+ *   ivf_pq::unpack_list_data(
+ *       res, index, selected_indices.view(), codes.view(), label);
+ * @endcode
+ *
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] res
+ * @param[in] index
+ * @param[in] in_cluster_indices
+ *   The offsets of the selected indices within the cluster.
+ * @param[out] out_codes
+ *   the destination buffer [n_take, index.pq_dim()].
+ *   The length `n_take` defines how many records to unpack,
+ *   it must be smaller than the list size.
+ * @param[in] label
+ *   The id of the list (cluster) to decode.
+ */
+template <typename IdxT>
+void unpack_list_data(raft::device_resources const& res,
+                      const index<IdxT>& index,
+                      device_vector_view<const uint32_t> in_cluster_indices,
+                      device_matrix_view<uint8_t, uint32_t, row_major> out_codes,
+                      uint32_t label)
+{
+  return detail::unpack_list_data(res, index, out_codes, label, in_cluster_indices);
+}
+
 /**
  * @brief Decode `n_take` consecutive records of a single list (cluster) in the compressed index
  * starting at given `offset`.

From 839cd489b68233099b235cea5a69d58a815a43ac Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 16 Mar 2023 07:30:43 +0100
Subject: [PATCH 11/25] Fix (unrelated to the PR) test condition being just a
 little bit too tight

---
 cpp/test/neighbors/ann_ivf_pq.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh
index b990e364c5..177dd28db9 100644
--- a/cpp/test/neighbors/ann_ivf_pq.cuh
+++ b/cpp/test/neighbors/ann_ivf_pq.cuh
@@ -271,7 +271,7 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
       double d = dist(i);
       // The theoretical estimate of the error is hard to come up with,
       // the estimate below is based on experimentation + curse of dimensionality
-      ASSERT_LE(d, 0.05 * std::pow(2.0, compression_ratio))
+      ASSERT_LE(d, 0.055 * std::pow(2.0, compression_ratio))
         << " (label = " << label << ", ix = " << (n_skip + i) << ")";
     }
   }

From 46d5a84280217cbdb2cd9793328f4c5367939d1a Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 16 Mar 2023 08:11:25 +0100
Subject: [PATCH 12/25] Fix a miswording in the docs

---
 cpp/include/raft/neighbors/detail/ivf_pq_build.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
index c19232a51d..3132a33f69 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
@@ -664,7 +664,7 @@ __launch_bounds__(BlockSize) __global__ void unpack_list_data_kernel(
                       unpack_action);
 }
 
-/** Decode the list data; see the public interface for the api and usage. */
+/** Unpack the list data; see the public interface for the api and usage. */
 template <typename IdxT>
 void unpack_list_data(raft::device_resources const& res,
                       const index<IdxT>& index,

From 9fc9818ebd49067aeba73c57f5cf4a633d111059 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 16 Mar 2023 10:50:31 +0100
Subject: [PATCH 13/25] Add public api for extending individual lists

---
 .../raft/neighbors/detail/ivf_pq_build.cuh    | 99 +++++++++++++++++++
 cpp/include/raft/neighbors/ivf_pq.cuh         | 60 +++++++++++
 cpp/test/neighbors/ann_ivf_pq.cuh             | 21 ++++
 3 files changed, 180 insertions(+)

diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
index 3132a33f69..638c8b4edd 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
@@ -1123,6 +1123,31 @@ void process_and_fill_codes(raft::device_resources const& handle,
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
+/**
+ * Write flat PQ codes into an existing list by the given offset.
+ *
+ * NB: no memory allocation happens here; the list must fit the data (offset + n_rows).
+ *
+ * @tparam IdxT
+ *
+ * @param[in] res
+ * @param[inout] index
+ * @param[in] codes flat PQ codes, one code per byte [n_rows, index.pq_dim()]
+ * @param[in] label
+ *   The id of the list (cluster) to decode.
+ * @param[in] offset how many records in the list to skip.
+ *
+ */
+template <typename IdxT>
+void pack_list_data(raft::device_resources const& res,
+                    index<IdxT>* index,
+                    device_matrix_view<const uint8_t, uint32_t, row_major> new_codes,
+                    uint32_t label,
+                    uint32_t offset)
+{
+  /** TODO: implementation is missing */
+}
+
 /** Update the state of the dependent index members. */
 template <typename IdxT>
 void recompute_internal_state(const raft::device_resources& res, index<IdxT>& index)
@@ -1176,6 +1201,80 @@ void recompute_internal_state(const raft::device_resources& res, index<IdxT>& in
   }
 }
 
+/**
+ * Extend one list of the index in-place, by the list label, skipping the classification and
+ * encoding steps.
+ * See the public interface for the api and usage.
+ */
+template <typename IdxT>
+void extend_list_with_codes(raft::device_resources const& res,
+                            index<IdxT>* index,
+                            device_matrix_view<const uint8_t, uint32_t, row_major> new_codes,
+                            device_vector_view<const IdxT, uint32_t, row_major> new_indices,
+                            uint32_t label)
+{
+  uint32_t n_rows = new_indices.extent(0);
+  uint32_t offset;
+  // Allocate the lists to fit the new data
+  copy(&offset, index->list_sizes().data_handle() + label, 1, res.get_stream());
+  res.sync_stream();
+  uint32_t new_size = offset + n_rows;
+  copy(index->list_sizes().data_handle() + label, &new_size, 1, res.get_stream());
+  auto spec = list_spec<uint32_t, IdxT>{
+    index->pq_bits(), index->pq_dim(), index->conservative_memory_allocation()};
+  auto& list = index->lists()[label];
+  ivf::resize_list(res, list, spec, new_size, offset);
+  copy(list->indices.data_handle() + offset, new_indices.data_handle(), n_rows, res.get_stream());
+
+  pack_list_data(res, index, new_codes, label, offset);
+
+  // Update the pointers and the sizes
+  recompute_internal_state(res, *index);
+}
+
+/**
+ * Extend one list of the index in-place, by the list label, skipping the classification step.
+ * See the public interface for the api and usage.
+ */
+template <typename T, typename IdxT>
+void extend_list(raft::device_resources const& res,
+                 index<IdxT>* index,
+                 device_matrix_view<const T, uint32_t, row_major> new_vectors,
+                 device_vector_view<const IdxT, uint32_t, row_major> new_indices,
+                 uint32_t label)
+{
+  uint32_t n_rows = new_indices.extent(0);
+  uint32_t offset;
+  // Allocate the lists to fit the new data
+  copy(&offset, index->list_sizes().data_handle() + label, 1, res.get_stream());
+  res.sync_stream();
+  uint32_t new_size = offset + n_rows;
+  copy(index->list_sizes().data_handle() + label, &new_size, 1, res.get_stream());
+  auto spec = list_spec<uint32_t, IdxT>{
+    index->pq_bits(), index->pq_dim(), index->conservative_memory_allocation()};
+  auto& list = index->lists()[label];
+  ivf::resize_list(res, list, spec, new_size, offset);
+  copy(list->indices.data_handle() + offset, new_indices.data_handle(), n_rows, res.get_stream());
+
+  /** TODO: implementation is missing */
+
+  // Update the pointers and the sizes
+  recompute_internal_state(res, *index);
+}
+
+/**
+ * Remove all data from a single list.
+ * See the public interface for the api and usage.
+ */
+template <typename IdxT>
+void erase_list(raft::device_resources const& res, index<IdxT>* index, uint32_t label)
+{
+  uint32_t zero = 0;
+  copy(index->list_sizes().data_handle() + label, &zero, 1, res.get_stream());
+  index->lists()[label].reset();
+  recompute_internal_state(res, *index);
+}
+
 /** Copy the state of an index into a new index, but share the list data among the two. */
 template <typename IdxT>
 auto clone(const raft::device_resources& res, const index<IdxT>& source) -> index<IdxT>
diff --git a/cpp/include/raft/neighbors/ivf_pq.cuh b/cpp/include/raft/neighbors/ivf_pq.cuh
index c420123a01..cfb7e0a773 100644
--- a/cpp/include/raft/neighbors/ivf_pq.cuh
+++ b/cpp/include/raft/neighbors/ivf_pq.cuh
@@ -446,6 +446,66 @@ void extend(raft::device_resources const& handle,
   detail::extend(handle, index, new_vectors, new_indices, n_rows);
 }
 
+/**
+ * @brief Extend one list of the index in-place, by the list label, skipping the classification and
+ * encoding steps.
+ *
+ * @tparam IdxT
+ *
+ * @param[in] res
+ * @param[inout] index
+ * @param[in] new_codes flat PQ codes, one code per byte [n_rows, index.pq_dim()]
+ * @param[in] new_indices source indices [n_rows]
+ * @param[in] label the id of the target list (cluster).
+ */
+template <typename IdxT>
+void extend_list_with_codes(raft::device_resources const& res,
+                            index<IdxT>* index,
+                            device_matrix_view<const uint8_t, uint32_t, row_major> new_codes,
+                            device_vector_view<const IdxT, uint32_t, row_major> new_indices,
+                            uint32_t label)
+{
+  detail::extend_list_with_codes(res, index, new_codes, new_indices, label);
+}
+
+/**
+ * @brief Extend one list of the index in-place, by the list label, skipping the classification
+ * step.
+ *
+ * @tparam T
+ * @tparam IdxT
+ *
+ * @param[in] res
+ * @param[inout] index
+ * @param[in] new_vectors data to encode [n_rows, index.dim()]
+ * @param[in] new_indices source indices [n_rows]
+ * @param[in] label the id of the target list (cluster).
+ *
+ */
+template <typename T, typename IdxT>
+void extend_list(raft::device_resources const& res,
+                 index<IdxT>* index,
+                 device_matrix_view<const T, uint32_t, row_major> new_vectors,
+                 device_vector_view<const IdxT, uint32_t, row_major> new_indices,
+                 uint32_t label)
+{
+  detail::extend_list(res, index, new_vectors, new_vectors, label);
+}
+
+/**
+ * @brief Remove all data from a single list (cluster) in the index.
+ *
+ * @tparam IdxT
+ * @param[in] res
+ * @param[inout] index
+ * @param[in] label the id of the target list (cluster).
+ */
+template <typename IdxT>
+void erase_list(raft::device_resources const& res, index<IdxT>* index, uint32_t label)
+{
+  detail::erase_list(res, index, label);
+}
+
 /**
  * @brief Search ANN using the constructed index.
  *
diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh
index 177dd28db9..14098bbce4 100644
--- a/cpp/test/neighbors/ann_ivf_pq.cuh
+++ b/cpp/test/neighbors/ann_ivf_pq.cuh
@@ -276,11 +276,32 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
     }
   }
 
+  void check_packing(index<IdxT>* index, uint32_t label)
+  {
+    auto rec_list = index->lists()[label];
+    auto n_rows   = rec_list->size.load();
+
+    if (n_rows == 0) { return; }
+
+    auto codes   = make_device_matrix<uint8_t>(handle_, n_rows, index->pq_dim());
+    auto indices = make_device_vector<IdxT>(handle_, n_rows);
+    copy(indices.data_handle(), rec_list->indices.data_handle(), n_rows, stream_);
+
+    ivf_pq::unpack_list_data(handle_, *index, codes.view(), label, 0);
+    ivf_pq::erase_list(handle_, index, label);
+    // NB: passing the type parameter because const->non-const implicit conversion of the mdspans
+    // breaks type inference
+    ivf_pq::extend_list_with_codes<IdxT>(handle_, index, codes.view(), indices.view(), label);
+  }
+
   template <typename BuildIndex>
   void run(BuildIndex build_index)
   {
     auto index = build_index();
 
+    // Dump and re-write codes for one label
+    check_packing(&index, 0);
+
     double compression_ratio =
       static_cast<double>(ps.dim * 8) / static_cast<double>(index.pq_dim() * index.pq_bits());
 

From ac8aa1832a9739d6d34e27664e0a7c44befa876d Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 16 Mar 2023 17:04:15 +0100
Subject: [PATCH 14/25] Implemented pack_list_data

---
 .../raft/neighbors/detail/ivf_pq_build.cuh    | 265 ++++++++++++------
 cpp/include/raft/neighbors/ivf_pq.cuh         |   4 +-
 2 files changed, 185 insertions(+), 84 deletions(-)

diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
index 638c8b4edd..96c68c50f4 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
@@ -597,31 +597,71 @@ __device__ void run_on_vector(
   }
 }
 
-/** Process the given indices or a block of a single list (cluster). */
 template <uint32_t PqBits, typename Action>
-__device__ void run_on_list(device_vector_view<const uint8_t* const, uint32_t, row_major> data_ptrs,
-                            device_vector_view<const uint32_t, uint32_t, row_major> list_sizes,
-                            std::variant<uint32_t, const uint32_t*> offset_or_indices,
-                            uint32_t len,
-                            uint32_t cluster_ix,
-                            uint32_t pq_dim,
-                            Action action)
+__device__ void write_vector(
+  device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> out_list_data,
+  uint32_t out_ix,
+  uint32_t in_ix,
+  uint32_t pq_dim,
+  Action action)
 {
-  auto pq_extents =
-    list_spec<uint32_t, uint32_t>{PqBits, pq_dim, true}.make_list_extents(list_sizes[cluster_ix]);
-  auto pq_dataset =
-    make_mdspan<const uint8_t, uint32_t, row_major, false, true>(data_ptrs[cluster_ix], pq_extents);
+  using group_align         = Pow2<kIndexGroupSize>;
+  const uint32_t group_ix   = group_align::div(out_ix);
+  const uint32_t ingroup_ix = group_align::mod(out_ix);
 
+  pq_vec_t code_chunk;
+  bitfield_view_t<PqBits> code_view{reinterpret_cast<uint8_t*>(&code_chunk)};
+  constexpr uint32_t kChunkSize = (sizeof(pq_vec_t) * 8u) / PqBits;
+  for (uint32_t j = 0, i = 0; j < pq_dim; i++) {
+    // clear the chunk
+    code_chunk = pq_vec_t{};
+    // write the codes, one/pq_dim at a time
+#pragma unroll
+    for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) {
+      // write a single code
+      code_view[k] = action(in_ix, j);
+    }
+    // write the chunk to the list
+    *reinterpret_cast<pq_vec_t*>(&out_list_data(group_ix, i, ingroup_ix, 0)) = code_chunk;
+  }
+}
+
+/** Process the given indices or a block of a single list (cluster). */
+template <uint32_t PqBits, typename Action>
+__device__ void run_on_list(
+  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> in_list_data,
+  std::variant<uint32_t, const uint32_t*> offset_or_indices,
+  uint32_t len,
+  uint32_t pq_dim,
+  Action action)
+{
   for (uint32_t ix = threadIdx.x + blockDim.x * blockIdx.x; ix < len; ix += blockDim.x) {
     const uint32_t src_ix = std::holds_alternative<uint32_t>(offset_or_indices)
                               ? std::get<uint32_t>(offset_or_indices) + ix
                               : std::get<const uint32_t*>(offset_or_indices)[ix];
-    run_on_vector<PqBits>(pq_dataset, src_ix, ix, pq_dim, action);
+    run_on_vector<PqBits>(in_list_data, src_ix, ix, pq_dim, action);
+  }
+}
+
+/** Process the given indices or a block of a single list (cluster). */
+template <uint32_t PqBits, typename Action>
+__device__ void write_list(
+  device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> out_list_data,
+  std::variant<uint32_t, const uint32_t*> offset_or_indices,
+  uint32_t len,
+  uint32_t pq_dim,
+  Action action)
+{
+  for (uint32_t ix = threadIdx.x + blockDim.x * blockIdx.x; ix < len; ix += blockDim.x) {
+    const uint32_t dst_ix = std::holds_alternative<uint32_t>(offset_or_indices)
+                              ? std::get<uint32_t>(offset_or_indices) + ix
+                              : std::get<const uint32_t*>(offset_or_indices)[ix];
+    write_vector<PqBits>(out_list_data, dst_ix, ix, pq_dim, action);
   }
 }
 
 /**
- * A consumer for the `run_on_list` and `run_on_vec` that just flattens PQ codes
+ * A consumer for the `run_on_list` and `run_on_vector` that just flattens PQ codes
  * one-per-byte. That is, independent of the code width (pq_bits), one code uses
  * the whole byte, hence one vectors uses pq_dim bytes.
  */
@@ -648,44 +688,37 @@ struct unpack_codes {
 template <uint32_t BlockSize, uint32_t PqBits>
 __launch_bounds__(BlockSize) __global__ void unpack_list_data_kernel(
   device_matrix_view<uint8_t, uint32_t, row_major> out_codes,
-  device_vector_view<const uint8_t* const, uint32_t, row_major> data_ptrs,
-  device_vector_view<const uint32_t, uint32_t, row_major> list_sizes,
-  uint32_t cluster_ix,
+  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> in_list_data,
   std::variant<uint32_t, const uint32_t*> offset_or_indices)
 {
   const uint32_t pq_dim = out_codes.extent(1);
   auto unpack_action    = unpack_codes{out_codes};
-  run_on_list<PqBits>(data_ptrs,
-                      list_sizes,
-                      offset_or_indices,
-                      out_codes.extent(0),
-                      cluster_ix,
-                      pq_dim,
-                      unpack_action);
+  run_on_list<PqBits>(in_list_data, offset_or_indices, out_codes.extent(0), pq_dim, unpack_action);
 }
 
-/** Unpack the list data; see the public interface for the api and usage. */
-template <typename IdxT>
-void unpack_list_data(raft::device_resources const& res,
-                      const index<IdxT>& index,
-                      device_matrix_view<uint8_t, uint32_t, row_major> out_codes,
-                      uint32_t label,
-                      std::variant<uint32_t, const uint32_t*> offset_or_indices)
+/**
+ * Unpack flat PQ codes from an existing list by the given offset.
+ *
+ * @param[out] codes flat PQ codes, one code per byte [n_rows, pq_dim]
+ * @param[in] list_data the packed ivf::list data.
+ * @param[in] offset_or_indices how many records in the list to skip or the exact indices.
+ * @param[in] pq_bits codebook size (1 << pq_bits)
+ * @param[in] stream
+ */
+inline void unpack_list_data(
+  device_matrix_view<uint8_t, uint32_t, row_major> codes,
+  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> list_data,
+  std::variant<uint32_t, const uint32_t*> offset_or_indices,
+  uint32_t pq_bits,
+  rmm::cuda_stream_view stream)
 {
-  auto n_rows = out_codes.extent(0);
+  auto n_rows = codes.extent(0);
   if (n_rows == 0) { return; }
-  if (std::holds_alternative<uint32_t>(offset_or_indices)) {
-    auto n_skip = std::get<uint32_t>(offset_or_indices);
-    // sic! I'm using the upper bound `list.size` instead of exact `list_sizes(label)`
-    // to avoid an extra device-host data copy and the stream sync.
-    RAFT_EXPECTS(n_skip + n_rows <= index.lists()[label]->size.load(),
-                 "offset + output size must be not bigger than the cluster size.");
-  }
 
   constexpr uint32_t kBlockSize = 256;
   dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize), 1, 1);
   dim3 threads(kBlockSize, 1, 1);
-  auto kernel = [](uint32_t pq_bits) {
+  auto kernel = [pq_bits]() {
     switch (pq_bits) {
       case 4: return unpack_list_data_kernel<kBlockSize, 4>;
       case 5: return unpack_list_data_kernel<kBlockSize, 5>;
@@ -694,13 +727,28 @@ void unpack_list_data(raft::device_resources const& res,
       case 8: return unpack_list_data_kernel<kBlockSize, 8>;
       default: RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 8]", pq_bits);
     }
-  }(index.pq_bits());
-  kernel<<<blocks, threads, 0, res.get_stream()>>>(
-    out_codes, index.data_ptrs(), index.list_sizes(), label, offset_or_indices);
+  }();
+  kernel<<<blocks, threads, 0, stream>>>(codes, list_data, offset_or_indices);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
-/** A consumer for the `run_on_list` and `run_on_vec` that approximates the original input data. */
+/** Unpack the list data; see the public interface for the api and usage. */
+template <typename IdxT>
+void unpack_list_data(raft::device_resources const& res,
+                      const index<IdxT>& index,
+                      device_matrix_view<uint8_t, uint32_t, row_major> out_codes,
+                      uint32_t label,
+                      std::variant<uint32_t, const uint32_t*> offset_or_indices)
+{
+  unpack_list_data(out_codes,
+                   index.lists()[label]->data.view(),
+                   offset_or_indices,
+                   index.pq_bits(),
+                   res.get_stream());
+}
+
+/** A consumer for the `run_on_list` and `run_on_vector` that approximates the original input data.
+ */
 struct reconstruct_vectors {
   codebook_gen codebook_kind;
   uint32_t cluster_ix;
@@ -773,8 +821,7 @@ struct reconstruct_vectors {
 template <uint32_t BlockSize, uint32_t PqBits>
 __launch_bounds__(BlockSize) __global__ void reconstruct_list_data_kernel(
   device_matrix_view<float, uint32_t, row_major> out_vectors,
-  device_vector_view<const uint8_t* const, uint32_t, row_major> data_ptrs,
-  device_vector_view<const uint32_t, uint32_t, row_major> list_sizes,
+  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> in_list_data,
   device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers,
   device_matrix_view<const float, uint32_t, row_major> centers_rot,
   codebook_gen codebook_kind,
@@ -784,13 +831,8 @@ __launch_bounds__(BlockSize) __global__ void reconstruct_list_data_kernel(
   const uint32_t pq_dim = out_vectors.extent(1) / pq_centers.extent(1);
   auto reconstruct_action =
     reconstruct_vectors{out_vectors, pq_centers, centers_rot, codebook_kind, cluster_ix};
-  run_on_list<PqBits>(data_ptrs,
-                      list_sizes,
-                      offset_or_indices,
-                      out_vectors.extent(0),
-                      cluster_ix,
-                      pq_dim,
-                      reconstruct_action);
+  run_on_list<PqBits>(
+    in_list_data, offset_or_indices, out_vectors.extent(0), pq_dim, reconstruct_action);
 }
 
 /** Decode the list data; see the public interface for the api and usage. */
@@ -803,11 +845,12 @@ void reconstruct_list_data(raft::device_resources const& res,
 {
   auto n_rows = out_vectors.extent(0);
   if (n_rows == 0) { return; }
+  auto& list = index.lists()[label];
   if (std::holds_alternative<uint32_t>(offset_or_indices)) {
     auto n_skip = std::get<uint32_t>(offset_or_indices);
     // sic! I'm using the upper bound `list.size` instead of exact `list_sizes(label)`
     // to avoid an extra device-host data copy and the stream sync.
-    RAFT_EXPECTS(n_skip + n_rows <= index.lists()[label]->size.load(),
+    RAFT_EXPECTS(n_skip + n_rows <= list->size.load(),
                  "offset + output size must be not bigger than the cluster size.");
   }
 
@@ -828,8 +871,7 @@ void reconstruct_list_data(raft::device_resources const& res,
     }
   }(index.pq_bits());
   kernel<<<blocks, threads, 0, res.get_stream()>>>(tmp.view(),
-                                                   index.data_ptrs(),
-                                                   index.list_sizes(),
+                                                   list->data.view(),
                                                    index.pq_centers(),
                                                    index.centers_rot(),
                                                    index.codebook_kind(),
@@ -940,6 +982,90 @@ __device__ auto compute_pq_code(
   return code;
 }
 
+/**
+ * A producer for the `write_list` and `write_vector` reads the codes byte-by-byte. That is,
+ * independent of the code width (pq_bits), one code uses the whole byte, hence one vectors uses
+ * pq_dim bytes.
+ */
+struct pass_codes {
+  device_matrix_view<const uint8_t, uint32_t, row_major> codes;
+
+  /**
+   * Create a callable to be passed to `run_on_list`.
+   *
+   * @param[in] codes the source codes.
+   */
+  __device__ inline pass_codes(device_matrix_view<const uint8_t, uint32_t, row_major> codes)
+    : codes{codes}
+  {
+  }
+
+  /** Read j-th component (code) of the i-th vector from the source. */
+  __device__ inline auto operator()(uint32_t i, uint32_t j) const -> uint8_t { return codes(i, j); }
+};
+
+template <uint32_t BlockSize, uint32_t PqBits>
+__launch_bounds__(BlockSize) __global__ void pack_list_data_kernel(
+  device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> list_data,
+  device_matrix_view<const uint8_t, uint32_t, row_major> codes,
+  std::variant<uint32_t, const uint32_t*> offset_or_indices)
+{
+  write_list<PqBits>(
+    list_data, offset_or_indices, codes.extent(0), codes.extent(1), pass_codes{codes});
+}
+
+/**
+ * Write flat PQ codes into an existing list by the given offset.
+ *
+ * NB: no memory allocation happens here; the list must fit the data (offset + n_rows).
+ *
+ * @param[out] list_data the packed ivf::list data.
+ * @param[in] codes flat PQ codes, one code per byte [n_rows, pq_dim]
+ * @param[in] offset_or_indices how many records in the list to skip or the exact indices.
+ * @param[in] pq_bits codebook size (1 << pq_bits)
+ * @param[in] stream
+ */
+inline void pack_list_data(
+  device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> list_data,
+  device_matrix_view<const uint8_t, uint32_t, row_major> codes,
+  std::variant<uint32_t, const uint32_t*> offset_or_indices,
+  uint32_t pq_bits,
+  rmm::cuda_stream_view stream)
+{
+  auto n_rows = codes.extent(0);
+  if (n_rows == 0) { return; }
+
+  constexpr uint32_t kBlockSize = 256;
+  dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize), 1, 1);
+  dim3 threads(kBlockSize, 1, 1);
+  auto kernel = [pq_bits]() {
+    switch (pq_bits) {
+      case 4: return pack_list_data_kernel<kBlockSize, 4>;
+      case 5: return pack_list_data_kernel<kBlockSize, 5>;
+      case 6: return pack_list_data_kernel<kBlockSize, 6>;
+      case 7: return pack_list_data_kernel<kBlockSize, 7>;
+      case 8: return pack_list_data_kernel<kBlockSize, 8>;
+      default: RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 8]", pq_bits);
+    }
+  }();
+  kernel<<<blocks, threads, 0, stream>>>(list_data, codes, offset_or_indices);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+}
+
+template <typename IdxT>
+void pack_list_data(raft::device_resources const& res,
+                    index<IdxT>* index,
+                    device_matrix_view<const uint8_t, uint32_t, row_major> new_codes,
+                    uint32_t label,
+                    std::variant<uint32_t, const uint32_t*> offset_or_indices)
+{
+  pack_list_data(index->lists()[label]->data.view(),
+                 new_codes,
+                 offset_or_indices,
+                 index->pq_bits(),
+                 res.get_stream());
+}
+
 /**
  * Compute a PQ code for a single input vector per subwarp and write it into the
  * appropriate cluster.
@@ -1123,31 +1249,6 @@ void process_and_fill_codes(raft::device_resources const& handle,
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
-/**
- * Write flat PQ codes into an existing list by the given offset.
- *
- * NB: no memory allocation happens here; the list must fit the data (offset + n_rows).
- *
- * @tparam IdxT
- *
- * @param[in] res
- * @param[inout] index
- * @param[in] codes flat PQ codes, one code per byte [n_rows, index.pq_dim()]
- * @param[in] label
- *   The id of the list (cluster) to decode.
- * @param[in] offset how many records in the list to skip.
- *
- */
-template <typename IdxT>
-void pack_list_data(raft::device_resources const& res,
-                    index<IdxT>* index,
-                    device_matrix_view<const uint8_t, uint32_t, row_major> new_codes,
-                    uint32_t label,
-                    uint32_t offset)
-{
-  /** TODO: implementation is missing */
-}
-
 /** Update the state of the dependent index members. */
 template <typename IdxT>
 void recompute_internal_state(const raft::device_resources& res, index<IdxT>& index)
@@ -1226,7 +1327,7 @@ void extend_list_with_codes(raft::device_resources const& res,
   ivf::resize_list(res, list, spec, new_size, offset);
   copy(list->indices.data_handle() + offset, new_indices.data_handle(), n_rows, res.get_stream());
 
-  pack_list_data(res, index, new_codes, label, offset);
+  pack_list_data<IdxT>(res, index, new_codes, label, offset);
 
   // Update the pointers and the sizes
   recompute_internal_state(res, *index);
diff --git a/cpp/include/raft/neighbors/ivf_pq.cuh b/cpp/include/raft/neighbors/ivf_pq.cuh
index 5905bd9c0d..eca57bf9d9 100644
--- a/cpp/include/raft/neighbors/ivf_pq.cuh
+++ b/cpp/include/raft/neighbors/ivf_pq.cuh
@@ -268,7 +268,7 @@ void unpack_list_data(raft::device_resources const& res,
                       uint32_t label,
                       uint32_t offset)
 {
-  return detail::unpack_list_data(res, index, out_codes, label, offset);
+  return detail::unpack_list_data<IdxT>(res, index, out_codes, label, offset);
 }
 
 /**
@@ -310,7 +310,7 @@ void unpack_list_data(raft::device_resources const& res,
                       device_matrix_view<uint8_t, uint32_t, row_major> out_codes,
                       uint32_t label)
 {
-  return detail::unpack_list_data(res, index, out_codes, label, in_cluster_indices);
+  return detail::unpack_list_data<IdxT>(res, index, out_codes, label, in_cluster_indices);
 }
 
 /**

From f62273ddcd4d2901a921c8d017d2f83bf586d7d0 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Thu, 16 Mar 2023 18:44:02 +0100
Subject: [PATCH 15/25] Reuse write_vector inside the
 process_and_fill_codes_kernel

---
 .../raft/neighbors/detail/ivf_pq_build.cuh    | 121 +++++++-----------
 1 file changed, 48 insertions(+), 73 deletions(-)

diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
index 96c68c50f4..a0ebf3e3c2 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
@@ -561,7 +561,7 @@ void train_per_cluster(raft::device_resources const& handle,
  * Process a single vector in a list.
  *
  * @tparam PqBits
- * @tparam Action tells how to process a single vectors (e.g. reconstruct or just unpack)
+ * @tparam Action tells how to process a single vector (e.g. reconstruct or just unpack)
  *
  * @param[in] in_list_data the encoded cluster data.
  * @param[in] in_ix in-cluster index of the vector to be decoded (one-per-thread).
@@ -597,7 +597,21 @@ __device__ void run_on_vector(
   }
 }
 
-template <uint32_t PqBits, typename Action>
+/**
+ * Process a single vector in a list.
+ *
+ * @tparam PqBits
+ * @tparam SubWarpSize how many threads work on the same ix (only the first thread writes data).
+ * @tparam Action tells how to process a single vector (e.g. encode or just pack)
+ *
+ * @param[in] out_list_data the encoded cluster data.
+ * @param[in] out_ix in-cluster index of the vector to be processed (one-per-SubWarpSize threads).
+ * @param[in] in_ix the input index passed to the action (one-per-SubWarpSize threads).
+ * @param[in] pq_dim
+ * @param action a callable action to be invoked on each PQ code (component of the encoding)
+ *    type: (uint32_t in_ix, uint32_t j) -> uint8_t, where j = [0..pq_dim).
+ */
+template <uint32_t PqBits, uint32_t SubWarpSize, typename Action>
 __device__ void write_vector(
   device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> out_list_data,
   uint32_t out_ix,
@@ -605,6 +619,8 @@ __device__ void write_vector(
   uint32_t pq_dim,
   Action action)
 {
+  const uint32_t lane_id = Pow2<SubWarpSize>::mod(threadIdx.x);
+
   using group_align         = Pow2<kIndexGroupSize>;
   const uint32_t group_ix   = group_align::div(out_ix);
   const uint32_t ingroup_ix = group_align::mod(out_ix);
@@ -614,15 +630,18 @@ __device__ void write_vector(
   constexpr uint32_t kChunkSize = (sizeof(pq_vec_t) * 8u) / PqBits;
   for (uint32_t j = 0, i = 0; j < pq_dim; i++) {
     // clear the chunk
-    code_chunk = pq_vec_t{};
+    if (lane_id == 0) { code_chunk = pq_vec_t{}; }
     // write the codes, one/pq_dim at a time
 #pragma unroll
     for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) {
       // write a single code
-      code_view[k] = action(in_ix, j);
+      uint8_t code = action(in_ix, j);
+      if (lane_id == 0) { code_view[k] = code; }
     }
     // write the chunk to the list
-    *reinterpret_cast<pq_vec_t*>(&out_list_data(group_ix, i, ingroup_ix, 0)) = code_chunk;
+    if (lane_id == 0) {
+      *reinterpret_cast<pq_vec_t*>(&out_list_data(group_ix, i, ingroup_ix, 0)) = code_chunk;
+    }
   }
 }
 
@@ -644,7 +663,7 @@ __device__ void run_on_list(
 }
 
 /** Process the given indices or a block of a single list (cluster). */
-template <uint32_t PqBits, typename Action>
+template <uint32_t PqBits, uint32_t SubWarpSize, typename Action>
 __device__ void write_list(
   device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> out_list_data,
   std::variant<uint32_t, const uint32_t*> offset_or_indices,
@@ -652,11 +671,14 @@ __device__ void write_list(
   uint32_t pq_dim,
   Action action)
 {
-  for (uint32_t ix = threadIdx.x + blockDim.x * blockIdx.x; ix < len; ix += blockDim.x) {
+  using subwarp_align = Pow2<SubWarpSize>;
+  uint32_t stride     = subwarp_align::div(blockDim.x);
+  uint32_t ix         = subwarp_align::div(threadIdx.x + blockDim.x * blockIdx.x);
+  for (; ix < len; ix += stride) {
     const uint32_t dst_ix = std::holds_alternative<uint32_t>(offset_or_indices)
                               ? std::get<uint32_t>(offset_or_indices) + ix
                               : std::get<const uint32_t*>(offset_or_indices)[ix];
-    write_vector<PqBits>(out_list_data, dst_ix, ix, pq_dim, action);
+    write_vector<PqBits, SubWarpSize>(out_list_data, dst_ix, ix, pq_dim, action);
   }
 }
 
@@ -919,7 +941,7 @@ void reconstruct_list_data(raft::device_resources const& res,
  *
  * @tparam SubWarpSize
  *   how many threads work on a single vector;
- *   bouded by either WarpSize or pq_book_size.
+ *   bounded by either WarpSize or pq_book_size.
  *
  * @param pq_centers
  *   - codebook_gen::PER_SUBSPACE: [pq_dim , pq_len, pq_book_size]
@@ -1010,7 +1032,7 @@ __launch_bounds__(BlockSize) __global__ void pack_list_data_kernel(
   device_matrix_view<const uint8_t, uint32_t, row_major> codes,
   std::variant<uint32_t, const uint32_t*> offset_or_indices)
 {
-  write_list<PqBits>(
+  write_list<PqBits, 1>(
     list_data, offset_or_indices, codes.extent(0), codes.extent(1), pass_codes{codes});
 }
 
@@ -1066,67 +1088,6 @@ void pack_list_data(raft::device_resources const& res,
                  res.get_stream());
 }
 
-/**
- * Compute a PQ code for a single input vector per subwarp and write it into the
- * appropriate cluster.
- * Subwarp size here is the minimum between WarpSize and the codebook size.
- *
- * @tparam BlockSize
- * @tparam PqBits
- *
- * @param[out] out_list_data an array of pointers to the database clusers.
- * @param[in] in_vector input unencoded data, one-per-subwarp
- * @param[in] pq_centers codebook
- * @param[in] codebook_kind
- * @param[in] out_ix in-cluster output index (where to write the encoded data), one-per-subwarp.
- * @param[in] cluster_ix label/id of the cluster to fill, one-per-subwarp.
- */
-template <uint32_t BlockSize, uint32_t PqBits>
-__device__ auto compute_and_write_pq_code(
-  device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> out_list_data,
-  device_vector_view<const float, uint32_t, row_major> in_vector,
-  device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers,
-  codebook_gen codebook_kind,
-  uint32_t out_ix,
-  uint32_t cluster_ix)
-{
-  constexpr uint32_t kSubWarpSize = std::min<uint32_t>(WarpSize, 1u << PqBits);
-  using subwarp_align             = Pow2<kSubWarpSize>;
-  const uint32_t lane_id          = subwarp_align::mod(threadIdx.x);
-
-  using group_align         = Pow2<kIndexGroupSize>;
-  const uint32_t group_ix   = group_align::div(out_ix);
-  const uint32_t ingroup_ix = group_align::mod(out_ix);
-  const uint32_t pq_len     = pq_centers.extent(1);
-  const uint32_t pq_dim     = in_vector.extent(0) / pq_len;
-
-  using layout_t            = typename decltype(in_vector)::layout_type;
-  using accessor_t          = typename decltype(in_vector)::accessor_type;
-  auto reinterpreted_vector = mdspan<const float, extent_2d<uint32_t>, layout_t, accessor_t>(
-    in_vector.data_handle(), extent_2d<uint32_t>{pq_dim, pq_len});
-
-  __shared__ pq_vec_t codes[subwarp_align::div(BlockSize)];
-  pq_vec_t& code = codes[subwarp_align::div(threadIdx.x)];
-  bitfield_view_t<PqBits> out{reinterpret_cast<uint8_t*>(&code)};
-  constexpr uint32_t kChunkSize = (sizeof(pq_vec_t) * 8u) / PqBits;
-  for (uint32_t j = 0, i = 0; j < pq_dim; i++) {
-    // clear the chunk for writing
-    if (lane_id == 0) { code = pq_vec_t{}; }
-    // fill-in the values, one/pq_dim at a time
-#pragma unroll
-    for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) {
-      // find the label
-      auto l = compute_pq_code<kSubWarpSize>(
-        pq_centers, reinterpreted_vector, codebook_kind, j, cluster_ix);
-      if (lane_id == 0) { out[k] = l; }
-    }
-    // write the chunk into the dataset
-    if (lane_id == 0) {
-      *reinterpret_cast<pq_vec_t*>(&out_list_data(group_ix, i, ingroup_ix, 0)) = code;
-    }
-  }
-}
-
 template <uint32_t BlockSize, uint32_t PqBits, typename IdxT>
 __launch_bounds__(BlockSize) __global__ void process_and_fill_codes_kernel(
   device_matrix_view<const float, IdxT, row_major> new_vectors,
@@ -1171,9 +1132,23 @@ __launch_bounds__(BlockSize) __global__ void process_and_fill_codes_kernel(
   auto pq_extents = list_spec<uint32_t, IdxT>{PqBits, pq_dim, true}.make_list_extents(out_ix + 1);
   auto pq_dataset =
     make_mdspan<uint8_t, uint32_t, row_major, false, true>(data_ptrs[cluster_ix], pq_extents);
+
   // 3. compute and write the vector
-  compute_and_write_pq_code<BlockSize, PqBits>(
-    pq_dataset, one_vector, pq_centers, codebook_kind, out_ix, cluster_ix);
+  const uint32_t pq_len     = pq_centers.extent(1);
+  auto reinterpreted_vector = mdspan<const float, extent_2d<uint32_t>, layout_t, accessor_t>(
+    one_vector.data_handle(), extent_2d<uint32_t>{pq_dim, pq_len});
+
+  write_vector<PqBits, kSubWarpSize>(
+    pq_dataset,
+    out_ix,
+    0,
+    pq_dim,
+    [pq_centers, reinterpreted_vector, codebook_kind, cluster_ix] __device__(
+      uint32_t, uint32_t j) -> uint8_t {
+      // find the label
+      return compute_pq_code<kSubWarpSize>(
+        pq_centers, reinterpreted_vector, codebook_kind, j, cluster_ix);
+    });
 }
 
 /**

From 38895bbf44b8c9aa28f39b0485b93a8be4cc02f6 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Fri, 17 Mar 2023 11:32:38 +0100
Subject: [PATCH 16/25] Initial implementation of extend_list (failing tests)

---
 .../raft/neighbors/detail/ivf_pq_build.cuh    | 366 +++++++++++-------
 cpp/include/raft/neighbors/ivf_pq.cuh         |   4 +-
 cpp/test/neighbors/ann_ivf_pq.cuh             |  87 ++++-
 3 files changed, 295 insertions(+), 162 deletions(-)

diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
index a0ebf3e3c2..ee602b05af 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
@@ -276,7 +276,7 @@ void flat_compute_residuals(
   device_matrix_view<const float, uint32_t, row_major> rotation_matrix,  // [rot_dim, dim]
   device_matrix_view<const float, uint32_t, row_major> centers,          // [n_lists, dim_ext]
   const T* dataset,                                                      // [n_rows, dim]
-  const uint32_t* labels,                                                // [n_rows]
+  std::variant<uint32_t, const uint32_t*> labels,                        // [n_rows]
   rmm::mr::device_memory_resource* device_memory)
 {
   auto stream  = handle.get_stream();
@@ -287,7 +287,9 @@ void flat_compute_residuals(
   linalg::map_offset(handle, tmp_view, [centers, dataset, labels, dim] __device__(size_t i) {
     auto row_ix = i / dim;
     auto el_ix  = i % dim;
-    auto label  = labels[row_ix];
+    auto label  = std::holds_alternative<uint32_t>(labels)
+                    ? std::get<uint32_t>(labels)
+                    : std::get<const uint32_t*>(labels)[row_ix];
     return utils::mapping<float>{}(dataset[i]) - centers(label, el_ix);
   });
 
@@ -557,6 +559,32 @@ void train_per_cluster(raft::device_resources const& handle,
   transpose_pq_centers(handle, index, pq_centers_tmp.data());
 }
 
+/**
+ * A helper function: given the dataset in the rotated space
+ *  [n_rows, rot_dim] = [n_rows, pq_dim * pq_len],
+ * reinterpret the last dimension as two: [n_rows, pq_dim, pq_len]
+ *
+ * @tparam T
+ * @tparam IdxT
+ *
+ * @param vectors input data [n_rows, rot_dim]
+ * @param pq_centers codebook (used to infer the structure - pq_len)
+ * @return reinterpreted vectors [n_rows, pq_dim, pq_len]
+ */
+template <typename T, typename IdxT>
+static __device__ auto reinterpret_vectors(
+  device_matrix_view<T, IdxT, row_major> vectors,
+  device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers)
+  -> device_mdspan<T, extent_3d<IdxT>, row_major>
+{
+  const uint32_t pq_len = pq_centers.extent(1);
+  const uint32_t pq_dim = vectors.extent(1) / pq_len;
+  using layout_t        = typename decltype(vectors)::layout_type;
+  using accessor_t      = typename decltype(vectors)::accessor_type;
+  return mdspan<T, extent_3d<IdxT>, layout_t, accessor_t>(
+    vectors.data_handle(), extent_3d<IdxT>{vectors.extent(0), pq_dim, pq_len});
+}
+
 /**
  * Process a single vector in a list.
  *
@@ -602,6 +630,7 @@ __device__ void run_on_vector(
  *
  * @tparam PqBits
  * @tparam SubWarpSize how many threads work on the same ix (only the first thread writes data).
+ * @tparam IdxT type of the index passed to the action
  * @tparam Action tells how to process a single vector (e.g. encode or just pack)
  *
  * @param[in] out_list_data the encoded cluster data.
@@ -611,11 +640,11 @@ __device__ void run_on_vector(
  * @param action a callable action to be invoked on each PQ code (component of the encoding)
  *    type: (uint32_t in_ix, uint32_t j) -> uint8_t, where j = [0..pq_dim).
  */
-template <uint32_t PqBits, uint32_t SubWarpSize, typename Action>
+template <uint32_t PqBits, uint32_t SubWarpSize, typename IdxT, typename Action>
 __device__ void write_vector(
   device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> out_list_data,
   uint32_t out_ix,
-  uint32_t in_ix,
+  IdxT in_ix,
   uint32_t pq_dim,
   Action action)
 {
@@ -823,21 +852,6 @@ struct reconstruct_vectors {
       out_vectors(i, j, k) = pq_centers(partition_ix, k, code) + centers_rot(cluster_ix, j, k);
     }
   }
-
- private:
-  template <typename T>
-  static __device__ auto reinterpret_vectors(
-    device_matrix_view<T, uint32_t, row_major> out_vectors,
-    device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers)
-    -> device_mdspan<T, extent_3d<uint32_t>, row_major>
-  {
-    const uint32_t pq_len = pq_centers.extent(1);
-    const uint32_t pq_dim = out_vectors.extent(1) / pq_len;
-    using layout_t        = typename decltype(out_vectors)::layout_type;
-    using accessor_t      = typename decltype(out_vectors)::accessor_type;
-    return mdspan<T, extent_3d<uint32_t>, layout_t, accessor_t>(
-      out_vectors.data_handle(), extent_3d<uint32_t>{out_vectors.extent(0), pq_dim, pq_len});
-  }
 };
 
 template <uint32_t BlockSize, uint32_t PqBits>
@@ -936,74 +950,6 @@ void reconstruct_list_data(raft::device_resources const& res,
   }
 }
 
-/**
- * Compute the code: find the closest cluster in each pq_dim-subspace.
- *
- * @tparam SubWarpSize
- *   how many threads work on a single vector;
- *   bounded by either WarpSize or pq_book_size.
- *
- * @param pq_centers
- *   - codebook_gen::PER_SUBSPACE: [pq_dim , pq_len, pq_book_size]
- *   - codebook_gen::PER_CLUSTER:  [n_lists, pq_len, pq_book_size]
- * @param new_vector a single input of length rot_dim, reinterpreted as [pq_dim, pq_len].
- *   the input must be already transformed to floats, rotated, and the level 1 cluster
- *   center must be already substructed (i.e. this is the residual of a single input vector).
- * @param codebook_kind
- * @param j index along pq_dim "dimension"
- * @param cluster_ix is used for PER_CLUSTER codebooks.
- */
-template <uint32_t SubWarpSize>
-__device__ auto compute_pq_code(
-  device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers,
-  device_mdspan<const float, extent_2d<uint32_t>, row_major> new_vector,
-  codebook_gen codebook_kind,
-  uint32_t j,
-  uint32_t cluster_ix) -> uint8_t
-{
-  using subwarp_align = Pow2<SubWarpSize>;
-  uint32_t lane_id    = subwarp_align::mod(laneId());
-  uint32_t partition_ix;
-  switch (codebook_kind) {
-    case codebook_gen::PER_CLUSTER: {
-      partition_ix = cluster_ix;
-    } break;
-    case codebook_gen::PER_SUBSPACE: {
-      partition_ix = j;
-    } break;
-    default: __builtin_unreachable();
-  }
-
-  const uint32_t pq_book_size = pq_centers.extent(2);
-  const uint32_t pq_len       = pq_centers.extent(1);
-  float min_dist              = std::numeric_limits<float>::infinity();
-  uint8_t code                = 0;
-  // calculate the distance for each PQ cluster, find the minimum for each thread
-  for (uint32_t i = lane_id; i < pq_book_size; i += subwarp_align::Value) {
-    // NB: the L2 quantifiers on residuals are always trained on L2 metric.
-    float d = 0.0f;
-    for (uint32_t k = 0; k < pq_len; k++) {
-      auto t = new_vector(j, k) - pq_centers(partition_ix, k, i);
-      d += t * t;
-    }
-    if (d < min_dist) {
-      min_dist = d;
-      code     = uint8_t(i);
-    }
-  }
-  // reduce among threads
-#pragma unroll
-  for (uint32_t stride = SubWarpSize >> 1; stride > 0; stride >>= 1) {
-    const auto other_dist = shfl_xor(min_dist, stride, SubWarpSize);
-    const auto other_code = shfl_xor(code, stride, SubWarpSize);
-    if (other_dist < min_dist) {
-      min_dist = other_dist;
-      code     = other_code;
-    }
-  }
-  return code;
-}
-
 /**
  * A producer for the `write_list` and `write_vector` reads the codes byte-by-byte. That is,
  * independent of the code width (pq_bits), one code uses the whole byte, hence one vectors uses
@@ -1088,6 +1034,96 @@ void pack_list_data(raft::device_resources const& res,
                  res.get_stream());
 }
 
+/**
+ *
+ * A producer for the `write_list` and `write_vector` that encodes level-1 input vector residuals
+ * into lvl-2 PQ codes.
+ * Computing a PQ code means finding the closest cluster in a pq_dim-subspace.
+ *
+ * @tparam SubWarpSize
+ *   how many threads work on a single vector;
+ *   bounded by either WarpSize or pq_book_size.
+ *
+ * @param pq_centers
+ *   - codebook_gen::PER_SUBSPACE: [pq_dim , pq_len, pq_book_size]
+ *   - codebook_gen::PER_CLUSTER:  [n_lists, pq_len, pq_book_size]
+ * @param new_vector a single input of length rot_dim, reinterpreted as [pq_dim, pq_len].
+ *   the input must be already transformed to floats, rotated, and the level 1 cluster
+ *   center must be already substructed (i.e. this is the residual of a single input vector).
+ * @param codebook_kind
+ * @param j index along pq_dim "dimension"
+ * @param cluster_ix is used for PER_CLUSTER codebooks.
+ */
+/**
+ */
+template <uint32_t SubWarpSize, typename IdxT>
+struct encode_vectors {
+  codebook_gen codebook_kind;
+  uint32_t cluster_ix;
+  device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers;
+  device_mdspan<const float, extent_3d<IdxT>, row_major> in_vectors;
+
+  __device__ inline encode_vectors(
+    device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers,
+    device_matrix_view<const float, IdxT, row_major> in_vectors,
+    codebook_gen codebook_kind,
+    uint32_t cluster_ix)
+    : codebook_kind{codebook_kind},
+      cluster_ix{cluster_ix},
+      pq_centers{pq_centers},
+      in_vectors{reinterpret_vectors(in_vectors, pq_centers)}
+  {
+  }
+
+  /**
+   * Decode j-th component of the i-th vector by its code and write it into a chunk of the output
+   * vectors (pq_len elements).
+   */
+  __device__ inline auto operator()(IdxT i, uint32_t j) -> uint8_t
+  {
+    uint32_t lane_id = Pow2<SubWarpSize>::mod(laneId());
+    uint32_t partition_ix;
+    switch (codebook_kind) {
+      case codebook_gen::PER_CLUSTER: {
+        partition_ix = cluster_ix;
+      } break;
+      case codebook_gen::PER_SUBSPACE: {
+        partition_ix = j;
+      } break;
+      default: __builtin_unreachable();
+    }
+
+    const uint32_t pq_book_size = pq_centers.extent(2);
+    const uint32_t pq_len       = pq_centers.extent(1);
+    float min_dist              = std::numeric_limits<float>::infinity();
+    uint8_t code                = 0;
+    // calculate the distance for each PQ cluster, find the minimum for each thread
+    for (uint32_t l = lane_id; l < pq_book_size; l += SubWarpSize) {
+      // NB: the L2 quantifiers on residuals are always trained on L2 metric.
+      float d = 0.0f;
+      for (uint32_t k = 0; k < pq_len; k++) {
+        auto t = in_vectors(i, j, k) - pq_centers(partition_ix, k, l);
+        d += t * t;
+      }
+      if (d < min_dist) {
+        min_dist = d;
+        code     = uint8_t(l);
+      }
+    }
+    // reduce among threads
+#pragma unroll
+    for (uint32_t stride = SubWarpSize >> 1; stride > 0; stride >>= 1) {
+      const auto other_dist = shfl_xor(min_dist, stride, SubWarpSize);
+      const auto other_code = shfl_xor(code, stride, SubWarpSize);
+      if (other_dist < min_dist) {
+        min_dist = other_dist;
+        code     = other_code;
+      }
+    }
+    return code;
+  }
+};
+
 template <uint32_t BlockSize, uint32_t PqBits, typename IdxT>
 __launch_bounds__(BlockSize) __global__ void process_and_fill_codes_kernel(
   device_matrix_view<const float, IdxT, row_major> new_vectors,
@@ -1121,34 +1157,79 @@ __launch_bounds__(BlockSize) __global__ void process_and_fill_codes_kernel(
   }
 
   // write the codes (one record per subwarp):
-  // 1. select input row
-  using layout_t    = typename decltype(new_vectors)::layout_type;
-  using accessor_t  = typename decltype(new_vectors)::accessor_type;
-  const auto in_dim = new_vectors.extent(1);
-  auto one_vector =
-    mdspan<const float, extent_1d<uint32_t>, layout_t, accessor_t>(&new_vectors(row_ix, 0), in_dim);
-  // 2. select output cluster
-  const uint32_t pq_dim = in_dim / pq_centers.extent(1);
+  const uint32_t pq_dim = new_vectors.extent(1) / pq_centers.extent(1);
   auto pq_extents = list_spec<uint32_t, IdxT>{PqBits, pq_dim, true}.make_list_extents(out_ix + 1);
   auto pq_dataset =
     make_mdspan<uint8_t, uint32_t, row_major, false, true>(data_ptrs[cluster_ix], pq_extents);
-
-  // 3. compute and write the vector
-  const uint32_t pq_len     = pq_centers.extent(1);
-  auto reinterpreted_vector = mdspan<const float, extent_2d<uint32_t>, layout_t, accessor_t>(
-    one_vector.data_handle(), extent_2d<uint32_t>{pq_dim, pq_len});
-
   write_vector<PqBits, kSubWarpSize>(
     pq_dataset,
     out_ix,
-    0,
+    row_ix,
     pq_dim,
-    [pq_centers, reinterpreted_vector, codebook_kind, cluster_ix] __device__(
-      uint32_t, uint32_t j) -> uint8_t {
-      // find the label
-      return compute_pq_code<kSubWarpSize>(
-        pq_centers, reinterpreted_vector, codebook_kind, j, cluster_ix);
-    });
+    encode_vectors<kSubWarpSize, IdxT>{pq_centers, new_vectors, codebook_kind, cluster_ix});
+}
+
+template <uint32_t BlockSize, uint32_t PqBits>
+__launch_bounds__(BlockSize) __global__ void encode_list_data_kernel(
+  device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> list_data,
+  device_matrix_view<const float, uint32_t, row_major> new_vectors,
+  device_mdspan<const float, extent_3d<uint32_t>, row_major> pq_centers,
+  codebook_gen codebook_kind,
+  uint32_t cluster_ix,
+  std::variant<uint32_t, const uint32_t*> offset_or_indices)
+{
+  constexpr uint32_t kSubWarpSize = std::min<uint32_t>(WarpSize, 1u << PqBits);
+  const uint32_t pq_dim           = new_vectors.extent(1) / pq_centers.extent(1);
+  auto encode_action =
+    encode_vectors<kSubWarpSize, uint32_t>{pq_centers, new_vectors, codebook_kind, cluster_ix};
+  write_list<PqBits, kSubWarpSize>(
+    list_data, offset_or_indices, new_vectors.extent(0), pq_dim, encode_action);
+}
+
+template <typename T, typename IdxT>
+void encode_list_data(raft::device_resources const& res,
+                      index<IdxT>* index,
+                      device_matrix_view<const T, uint32_t, row_major> new_vectors,
+                      uint32_t label,
+                      std::variant<uint32_t, const uint32_t*> offset_or_indices)
+{
+  auto n_rows = new_vectors.extent(0);
+  if (n_rows == 0) { return; }
+
+  auto mr = res.get_workspace_resource();
+
+  auto new_vectors_residual =
+    make_device_mdarray<float>(res, mr, make_extents<uint32_t>(n_rows, index->rot_dim()));
+
+  flat_compute_residuals<T, uint32_t>(res,
+                                      new_vectors_residual.data_handle(),
+                                      n_rows,
+                                      index->rotation_matrix(),
+                                      index->centers(),
+                                      new_vectors.data_handle(),
+                                      label,
+                                      mr);
+
+  constexpr uint32_t kBlockSize = 256;
+  dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize), 1, 1);
+  dim3 threads(kBlockSize, 1, 1);
+  auto kernel = [](uint32_t pq_bits) {
+    switch (pq_bits) {
+      case 4: return encode_list_data_kernel<kBlockSize, 4>;
+      case 5: return encode_list_data_kernel<kBlockSize, 5>;
+      case 6: return encode_list_data_kernel<kBlockSize, 6>;
+      case 7: return encode_list_data_kernel<kBlockSize, 7>;
+      case 8: return encode_list_data_kernel<kBlockSize, 8>;
+      default: RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 8]", pq_bits);
+    }
+  }(index->pq_bits());
+  kernel<<<blocks, threads, 0, res.get_stream()>>>(index->lists()[label]->data.view(),
+                                                   new_vectors_residual.view(),
+                                                   index->pq_centers(),
+                                                   index->codebook_kind(),
+                                                   label,
+                                                   offset_or_indices);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
 /**
@@ -1190,14 +1271,14 @@ void process_and_fill_codes(raft::device_resources const& handle,
   auto new_vectors_residual =
     make_device_mdarray<float>(handle, mr, make_extents<IdxT>(n_rows, index.rot_dim()));
 
-  flat_compute_residuals(handle,
-                         new_vectors_residual.data_handle(),
-                         n_rows,
-                         index.rotation_matrix(),
-                         index.centers(),
-                         new_vectors,
-                         new_labels,
-                         mr);
+  flat_compute_residuals<T, IdxT>(handle,
+                                  new_vectors_residual.data_handle(),
+                                  n_rows,
+                                  index.rotation_matrix(),
+                                  index.centers(),
+                                  new_vectors,
+                                  new_labels,
+                                  mr);
 
   constexpr uint32_t kBlockSize  = 256;
   const uint32_t threads_per_vec = std::min<uint32_t>(WarpSize, index.pq_book_size());
@@ -1278,16 +1359,16 @@ void recompute_internal_state(const raft::device_resources& res, index<IdxT>& in
 }
 
 /**
- * Extend one list of the index in-place, by the list label, skipping the classification and
- * encoding steps.
- * See the public interface for the api and usage.
+ * Helper function: allocate enough space in the list, compute the offset, at which to start
+ * writing, and fill-in indices.
+ *
+ * @return offset for writing the data
  */
 template <typename IdxT>
-void extend_list_with_codes(raft::device_resources const& res,
-                            index<IdxT>* index,
-                            device_matrix_view<const uint8_t, uint32_t, row_major> new_codes,
-                            device_vector_view<const IdxT, uint32_t, row_major> new_indices,
-                            uint32_t label)
+auto extend_list_prepare(raft::device_resources const& res,
+                         index<IdxT>* index,
+                         device_vector_view<const IdxT, uint32_t, row_major> new_indices,
+                         uint32_t label) -> uint32_t
 {
   uint32_t n_rows = new_indices.extent(0);
   uint32_t offset;
@@ -1301,9 +1382,25 @@ void extend_list_with_codes(raft::device_resources const& res,
   auto& list = index->lists()[label];
   ivf::resize_list(res, list, spec, new_size, offset);
   copy(list->indices.data_handle() + offset, new_indices.data_handle(), n_rows, res.get_stream());
+  return offset;
+}
 
+/**
+ * Extend one list of the index in-place, by the list label, skipping the classification and
+ * encoding steps.
+ * See the public interface for the api and usage.
+ */
+template <typename IdxT>
+void extend_list_with_codes(raft::device_resources const& res,
+                            index<IdxT>* index,
+                            device_matrix_view<const uint8_t, uint32_t, row_major> new_codes,
+                            device_vector_view<const IdxT, uint32_t, row_major> new_indices,
+                            uint32_t label)
+{
+  // Allocate memory and write indices
+  auto offset = extend_list_prepare(res, index, new_indices, label);
+  // Pack the data
   pack_list_data<IdxT>(res, index, new_codes, label, offset);
-
   // Update the pointers and the sizes
   recompute_internal_state(res, *index);
 }
@@ -1319,21 +1416,10 @@ void extend_list(raft::device_resources const& res,
                  device_vector_view<const IdxT, uint32_t, row_major> new_indices,
                  uint32_t label)
 {
-  uint32_t n_rows = new_indices.extent(0);
-  uint32_t offset;
-  // Allocate the lists to fit the new data
-  copy(&offset, index->list_sizes().data_handle() + label, 1, res.get_stream());
-  res.sync_stream();
-  uint32_t new_size = offset + n_rows;
-  copy(index->list_sizes().data_handle() + label, &new_size, 1, res.get_stream());
-  auto spec = list_spec<uint32_t, IdxT>{
-    index->pq_bits(), index->pq_dim(), index->conservative_memory_allocation()};
-  auto& list = index->lists()[label];
-  ivf::resize_list(res, list, spec, new_size, offset);
-  copy(list->indices.data_handle() + offset, new_indices.data_handle(), n_rows, res.get_stream());
-
-  /** TODO: implementation is missing */
-
+  // Allocate memory and write indices
+  auto offset = extend_list_prepare(res, index, new_indices, label);
+  // Encode the data
+  encode_list_data<T, IdxT>(res, index, new_vectors, label, offset);
   // Update the pointers and the sizes
   recompute_internal_state(res, *index);
 }
diff --git a/cpp/include/raft/neighbors/ivf_pq.cuh b/cpp/include/raft/neighbors/ivf_pq.cuh
index eca57bf9d9..464a00fb67 100644
--- a/cpp/include/raft/neighbors/ivf_pq.cuh
+++ b/cpp/include/raft/neighbors/ivf_pq.cuh
@@ -403,7 +403,7 @@ void reconstruct_list_data(raft::device_resources const& res,
  * @brief Build a new index containing the data of the original plus new extra vectors.
  *
  * Implementation note:
- *    The new data is clustered according to existing kmeans clusters, then the cluster
+ *    The new data is clustered according to existing kmeans clusters, the cluster
  *    centers are unchanged.
  *
  * Usage example:
@@ -508,7 +508,7 @@ void extend_list(raft::device_resources const& res,
                  device_vector_view<const IdxT, uint32_t, row_major> new_indices,
                  uint32_t label)
 {
-  detail::extend_list(res, index, new_vectors, new_vectors, label);
+  detail::extend_list(res, index, new_vectors, new_indices, label);
 }
 
 /**
diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh
index 4eea925741..52987f2227 100644
--- a/cpp/test/neighbors/ann_ivf_pq.cuh
+++ b/cpp/test/neighbors/ann_ivf_pq.cuh
@@ -22,6 +22,8 @@
 
 #include <raft/core/logger.hpp>
 #include <raft/distance/distance_types.hpp>
+#include <raft/linalg/map.cuh>
+#include <raft/linalg/map_reduce.cuh>
 #include <raft/matrix/gather.cuh>
 #include <raft/neighbors/ivf_pq.cuh>
 #include <raft/random/rng.cuh>
@@ -39,8 +41,6 @@
 #include <gtest/gtest.h>
 
 #include <cub/cub.cuh>
-#include <thrust/reduce.h>
-#include <thrust/sequence.h>
 
 #include <algorithm>
 #include <cstddef>
@@ -190,17 +190,15 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
 
   index<IdxT> build_2_extends()
   {
-    rmm::device_uvector<IdxT> db_indices(ps.num_db_vecs, stream_);
-    thrust::sequence(handle_.get_thrust_policy(),
-                     thrust::device_pointer_cast(db_indices.data()),
-                     thrust::device_pointer_cast(db_indices.data() + ps.num_db_vecs));
+    auto db_indices = make_device_vector<IdxT>(handle_, ps.num_db_vecs);
+    linalg::map_offset(handle_, db_indices.view(), identity_op{});
     handle_.sync_stream(stream_);
     auto size_1 = IdxT(ps.num_db_vecs) / 2;
     auto size_2 = IdxT(ps.num_db_vecs) - size_1;
     auto vecs_1 = database.data();
     auto vecs_2 = database.data() + size_t(size_1) * size_t(ps.dim);
-    auto inds_1 = db_indices.data();
-    auto inds_2 = db_indices.data() + size_t(size_1);
+    auto inds_1 = db_indices.data_handle();
+    auto inds_2 = db_indices.data_handle() + size_t(size_1);
 
     auto ipams              = ps.index_params;
     ipams.add_data_on_build = false;
@@ -233,10 +231,10 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
                             uint32_t n_take,
                             uint32_t n_skip)
   {
-    auto rec_list = index.lists()[label];
-    auto dim      = index.dim();
-    n_take        = std::min<uint32_t>(n_take, rec_list->size.load());
-    n_skip        = std::min<uint32_t>(n_skip, rec_list->size.load() - n_take);
+    auto& rec_list = index.lists()[label];
+    auto dim       = index.dim();
+    n_take         = std::min<uint32_t>(n_take, rec_list->size.load());
+    n_skip         = std::min<uint32_t>(n_skip, rec_list->size.load() - n_take);
 
     if (n_take == 0) { return; }
 
@@ -279,22 +277,68 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
     }
   }
 
+  void check_reconstruct_extend(index<IdxT>* index, double compression_ratio, uint32_t label)
+  {
+    // NB: this is not reference, the list is retained; the index will have to create a new list on
+    // `erase_list` op.
+    auto old_list = index->lists()[label];
+    auto n_rows   = old_list->size.load();
+    if (n_rows == 0) { return; }
+
+    auto vectors = make_device_matrix<DataT>(handle_, n_rows, index->dim());
+    auto indices = make_device_vector<IdxT>(handle_, n_rows);
+    copy(indices.data_handle(), old_list->indices.data_handle(), n_rows, stream_);
+
+    ivf_pq::reconstruct_list_data(handle_, *index, vectors.view(), label, 0);
+    ivf_pq::erase_list(handle_, index, label);
+    // NB: passing the type parameter because const->non-const implicit conversion of the mdspans
+    // breaks type inference
+    ivf_pq::extend_list<DataT, IdxT>(handle_, index, vectors.view(), indices.view(), label);
+
+    auto& new_list = index->lists()[label];
+    ASSERT_NE(old_list.get(), new_list.get())
+      << "The old list should have been shared and retained after ivf_pq index has erased the "
+         "corresponding cluster.";
+
+    auto n_codes = old_list->data.size();
+    rmm::mr::managed_memory_resource managed_memory;
+    rmm::device_scalar<int> errs(stream_, &managed_memory);
+    linalg::mapReduce(errs.data(),
+                      n_codes,
+                      0,
+                      compose_op{cast_op<int>{}, notequal_op{}},
+                      add_op{},
+                      stream_,
+                      old_list->data.data_handle(),
+                      new_list->data.data_handle());
+    auto err_value = errs.value(stream_);
+    auto err_rate  = double(err_value) / double(n_codes);
+    ASSERT_LE(err_rate, 0.01 * std::pow(2.0, compression_ratio))
+      << " (label = " << label << ", errors = " << err_value << ", size = " << n_codes << ")";
+  }
+
   void check_packing(index<IdxT>* index, uint32_t label)
   {
-    auto rec_list = index->lists()[label];
-    auto n_rows   = rec_list->size.load();
+    auto old_list = index->lists()[label];
+    auto n_rows   = old_list->size.load();
 
     if (n_rows == 0) { return; }
 
     auto codes   = make_device_matrix<uint8_t>(handle_, n_rows, index->pq_dim());
     auto indices = make_device_vector<IdxT>(handle_, n_rows);
-    copy(indices.data_handle(), rec_list->indices.data_handle(), n_rows, stream_);
+    copy(indices.data_handle(), old_list->indices.data_handle(), n_rows, stream_);
 
     ivf_pq::unpack_list_data(handle_, *index, codes.view(), label, 0);
     ivf_pq::erase_list(handle_, index, label);
-    // NB: passing the type parameter because const->non-const implicit conversion of the mdspans
-    // breaks type inference
     ivf_pq::extend_list_with_codes<IdxT>(handle_, index, codes.view(), indices.view(), label);
+
+    auto& new_list = index->lists()[label];
+    ASSERT_NE(old_list.get(), new_list.get())
+      << "The old list should have been shared and retained after ivf_pq index has erased the "
+         "corresponding cluster.";
+
+    ASSERT_TRUE(devArrMatch(
+      old_list->data.data_handle(), new_list->data.data_handle(), n_rows, Compare<uint8_t>{}));
   }
 
   template <typename BuildIndex>
@@ -302,12 +346,15 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
   {
     index<IdxT> index = build_index();
 
-    // Dump and re-write codes for one label
-    check_packing(&index, 0);
-
     double compression_ratio =
       static_cast<double>(ps.dim * 8) / static_cast<double>(index.pq_dim() * index.pq_bits());
 
+    // Reconstruct and re-write vectors for one label
+    check_reconstruct_extend(&index, compression_ratio, uint32_t(rand()) % index.n_lists());
+
+    // Dump and re-write codes for one label
+    check_packing(&index, uint32_t(rand()) % index.n_lists());
+
     // check a small subset of data in a randomly chosen cluster to see if the data reconstruction
     // works well.
     check_reconstruction(index, compression_ratio, uint32_t(rand()) % index.n_lists(), 100, 7);

From a674768bdb9ed256e9c1bf3915e42eb6177de0a9 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Fri, 17 Mar 2023 12:18:09 +0100
Subject: [PATCH 17/25] Adjust the scheduling of the encode_list_data_kernel

---
 cpp/include/raft/neighbors/detail/ivf_pq_build.cuh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
index ee602b05af..c11aecc73c 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
@@ -1210,8 +1210,9 @@ void encode_list_data(raft::device_resources const& res,
                                       label,
                                       mr);
 
-  constexpr uint32_t kBlockSize = 256;
-  dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize), 1, 1);
+  constexpr uint32_t kBlockSize  = 256;
+  const uint32_t threads_per_vec = std::min<uint32_t>(WarpSize, index->pq_book_size());
+  dim3 blocks(div_rounding_up_safe<uint32_t>(n_rows, kBlockSize / threads_per_vec), 1, 1);
   dim3 threads(kBlockSize, 1, 1);
   auto kernel = [](uint32_t pq_bits) {
     switch (pq_bits) {

From 1be2f6bfbbfb2151bf7c73651d549eef90892774 Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Fri, 17 Mar 2023 12:18:44 +0100
Subject: [PATCH 18/25] Factor code-packing out of the build file

---
 .../raft/neighbors/detail/ivf_pq_build.cuh    | 186 +--------------
 .../neighbors/detail/ivf_pq_codepacking.cuh   | 214 ++++++++++++++++++
 2 files changed, 215 insertions(+), 185 deletions(-)
 create mode 100644 cpp/include/raft/neighbors/detail/ivf_pq_codepacking.cuh

diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
index c11aecc73c..36ceccc36f 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
@@ -18,6 +18,7 @@
 
 #include <raft/spatial/knn/detail/ann_utils.cuh>
 
+#include <raft/neighbors/detail/ivf_pq_codepacking.cuh>
 #include <raft/neighbors/ivf_list.hpp>
 #include <raft/neighbors/ivf_pq_types.hpp>
 
@@ -60,63 +61,6 @@ namespace raft::neighbors::ivf_pq::detail {
 
 using namespace raft::spatial::knn::detail;  // NOLINT
 
-/** A chunk of PQ-encoded vector managed by one CUDA thread. */
-using pq_vec_t = TxN_t<uint8_t, kIndexGroupVecLen>::io_t;
-
-namespace {
-
-/**
- * This type mimics the `uint8_t&` for the indexing operator of `bitfield_view_t`.
- *
- * @tparam Bits number of bits comprising the value.
- */
-template <uint32_t Bits>
-struct bitfield_ref_t {
-  static_assert(Bits <= 8 && Bits > 0, "Bit code must fit one byte");
-  constexpr static uint8_t kMask = static_cast<uint8_t>((1u << Bits) - 1u);
-  uint8_t* ptr;
-  uint32_t offset;
-
-  constexpr operator uint8_t()  // NOLINT
-  {
-    auto pair = static_cast<uint16_t>(ptr[0]);
-    if (offset + Bits > 8) { pair |= static_cast<uint16_t>(ptr[1]) << 8; }
-    return static_cast<uint8_t>((pair >> offset) & kMask);
-  }
-
-  constexpr auto operator=(uint8_t code) -> bitfield_ref_t&
-  {
-    if (offset + Bits > 8) {
-      auto pair = static_cast<uint16_t>(ptr[0]);
-      pair |= static_cast<uint16_t>(ptr[1]) << 8;
-      pair &= ~(static_cast<uint16_t>(kMask) << offset);
-      pair |= static_cast<uint16_t>(code) << offset;
-      ptr[0] = static_cast<uint8_t>(Pow2<256>::mod(pair));
-      ptr[1] = static_cast<uint8_t>(Pow2<256>::div(pair));
-    } else {
-      ptr[0] = (ptr[0] & ~(kMask << offset)) | (code << offset);
-    }
-    return *this;
-  }
-};
-
-/**
- * View a byte array as an array of unsigned integers of custom small bit size.
- *
- * @tparam Bits number of bits comprising a single element of the array.
- */
-template <uint32_t Bits>
-struct bitfield_view_t {
-  static_assert(Bits <= 8 && Bits > 0, "Bit code must fit one byte");
-  uint8_t* raw;
-
-  constexpr auto operator[](uint32_t i) -> bitfield_ref_t<Bits>
-  {
-    uint32_t bit_offset = i * Bits;
-    return bitfield_ref_t<Bits>{raw + Pow2<8>::div(bit_offset), Pow2<8>::mod(bit_offset)};
-  }
-};
-
 template <uint32_t BlockDim, typename T, typename S>
 __launch_bounds__(BlockDim) __global__ void copy_warped_kernel(
   T* out, uint32_t ld_out, const S* in, uint32_t ld_in, uint32_t n_cols, size_t n_rows)
@@ -162,8 +106,6 @@ void copy_warped(T* out,
     <<<blocks, threads, 0, stream>>>(out, ld_out, in, ld_in, n_cols, n_rows);
 }
 
-}  // namespace
-
 /**
  * @brief Fill-in a random orthogonal transformation matrix.
  *
@@ -585,132 +527,6 @@ static __device__ auto reinterpret_vectors(
     vectors.data_handle(), extent_3d<IdxT>{vectors.extent(0), pq_dim, pq_len});
 }
 
-/**
- * Process a single vector in a list.
- *
- * @tparam PqBits
- * @tparam Action tells how to process a single vector (e.g. reconstruct or just unpack)
- *
- * @param[in] in_list_data the encoded cluster data.
- * @param[in] in_ix in-cluster index of the vector to be decoded (one-per-thread).
- * @param[in] out_ix the output index passed to the action
- * @param[in] pq_dim
- * @param action a callable action to be invoked on each PQ code (component of the encoding)
- *    type: void (uint8_t code, uint32_t out_ix, uint32_t j), where j = [0..pq_dim).
- */
-template <uint32_t PqBits, typename Action>
-__device__ void run_on_vector(
-  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> in_list_data,
-  uint32_t in_ix,
-  uint32_t out_ix,
-  uint32_t pq_dim,
-  Action action)
-{
-  using group_align         = Pow2<kIndexGroupSize>;
-  const uint32_t group_ix   = group_align::div(in_ix);
-  const uint32_t ingroup_ix = group_align::mod(in_ix);
-
-  pq_vec_t code_chunk;
-  bitfield_view_t<PqBits> code_view{reinterpret_cast<uint8_t*>(&code_chunk)};
-  constexpr uint32_t kChunkSize = (sizeof(pq_vec_t) * 8u) / PqBits;
-  for (uint32_t j = 0, i = 0; j < pq_dim; i++) {
-    // read the chunk
-    code_chunk = *reinterpret_cast<const pq_vec_t*>(&in_list_data(group_ix, i, ingroup_ix, 0));
-    // read the codes, one/pq_dim at a time
-#pragma unroll
-    for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) {
-      // read a piece of the reconstructed vector
-      action(code_view[k], out_ix, j);
-    }
-  }
-}
-
-/**
- * Process a single vector in a list.
- *
- * @tparam PqBits
- * @tparam SubWarpSize how many threads work on the same ix (only the first thread writes data).
- * @tparam IdxT type of the index passed to the action
- * @tparam Action tells how to process a single vector (e.g. encode or just pack)
- *
- * @param[in] out_list_data the encoded cluster data.
- * @param[in] out_ix in-cluster index of the vector to be processed (one-per-SubWarpSize threads).
- * @param[in] in_ix the input index passed to the action (one-per-SubWarpSize threads).
- * @param[in] pq_dim
- * @param action a callable action to be invoked on each PQ code (component of the encoding)
- *    type: (uint32_t in_ix, uint32_t j) -> uint8_t, where j = [0..pq_dim).
- */
-template <uint32_t PqBits, uint32_t SubWarpSize, typename IdxT, typename Action>
-__device__ void write_vector(
-  device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> out_list_data,
-  uint32_t out_ix,
-  IdxT in_ix,
-  uint32_t pq_dim,
-  Action action)
-{
-  const uint32_t lane_id = Pow2<SubWarpSize>::mod(threadIdx.x);
-
-  using group_align         = Pow2<kIndexGroupSize>;
-  const uint32_t group_ix   = group_align::div(out_ix);
-  const uint32_t ingroup_ix = group_align::mod(out_ix);
-
-  pq_vec_t code_chunk;
-  bitfield_view_t<PqBits> code_view{reinterpret_cast<uint8_t*>(&code_chunk)};
-  constexpr uint32_t kChunkSize = (sizeof(pq_vec_t) * 8u) / PqBits;
-  for (uint32_t j = 0, i = 0; j < pq_dim; i++) {
-    // clear the chunk
-    if (lane_id == 0) { code_chunk = pq_vec_t{}; }
-    // write the codes, one/pq_dim at a time
-#pragma unroll
-    for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) {
-      // write a single code
-      uint8_t code = action(in_ix, j);
-      if (lane_id == 0) { code_view[k] = code; }
-    }
-    // write the chunk to the list
-    if (lane_id == 0) {
-      *reinterpret_cast<pq_vec_t*>(&out_list_data(group_ix, i, ingroup_ix, 0)) = code_chunk;
-    }
-  }
-}
-
-/** Process the given indices or a block of a single list (cluster). */
-template <uint32_t PqBits, typename Action>
-__device__ void run_on_list(
-  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> in_list_data,
-  std::variant<uint32_t, const uint32_t*> offset_or_indices,
-  uint32_t len,
-  uint32_t pq_dim,
-  Action action)
-{
-  for (uint32_t ix = threadIdx.x + blockDim.x * blockIdx.x; ix < len; ix += blockDim.x) {
-    const uint32_t src_ix = std::holds_alternative<uint32_t>(offset_or_indices)
-                              ? std::get<uint32_t>(offset_or_indices) + ix
-                              : std::get<const uint32_t*>(offset_or_indices)[ix];
-    run_on_vector<PqBits>(in_list_data, src_ix, ix, pq_dim, action);
-  }
-}
-
-/** Process the given indices or a block of a single list (cluster). */
-template <uint32_t PqBits, uint32_t SubWarpSize, typename Action>
-__device__ void write_list(
-  device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> out_list_data,
-  std::variant<uint32_t, const uint32_t*> offset_or_indices,
-  uint32_t len,
-  uint32_t pq_dim,
-  Action action)
-{
-  using subwarp_align = Pow2<SubWarpSize>;
-  uint32_t stride     = subwarp_align::div(blockDim.x);
-  uint32_t ix         = subwarp_align::div(threadIdx.x + blockDim.x * blockIdx.x);
-  for (; ix < len; ix += stride) {
-    const uint32_t dst_ix = std::holds_alternative<uint32_t>(offset_or_indices)
-                              ? std::get<uint32_t>(offset_or_indices) + ix
-                              : std::get<const uint32_t*>(offset_or_indices)[ix];
-    write_vector<PqBits, SubWarpSize>(out_list_data, dst_ix, ix, pq_dim, action);
-  }
-}
-
 /**
  * A consumer for the `run_on_list` and `run_on_vector` that just flattens PQ codes
  * one-per-byte. That is, independent of the code width (pq_bits), one code uses
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_codepacking.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_codepacking.cuh
new file mode 100644
index 0000000000..52969dd176
--- /dev/null
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_codepacking.cuh
@@ -0,0 +1,214 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/neighbors/ivf_list.hpp>
+#include <raft/neighbors/ivf_pq_types.hpp>
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/device_atomics.cuh>
+#include <raft/util/integer_utils.hpp>
+#include <raft/util/pow2_utils.cuh>
+#include <raft/util/vectorized.cuh>
+
+#include <variant>
+
+namespace raft::neighbors::ivf_pq::detail {
+
+/** A chunk of PQ-encoded vector managed by one CUDA thread. */
+using pq_vec_t = TxN_t<uint8_t, kIndexGroupVecLen>::io_t;
+
+/**
+ * This type mimics the `uint8_t&` for the indexing operator of `bitfield_view_t`.
+ *
+ * @tparam Bits number of bits comprising the value.
+ */
+template <uint32_t Bits>
+struct bitfield_ref_t {
+  static_assert(Bits <= 8 && Bits > 0, "Bit code must fit one byte");
+  constexpr static uint8_t kMask = static_cast<uint8_t>((1u << Bits) - 1u);
+  uint8_t* ptr;
+  uint32_t offset;
+
+  constexpr operator uint8_t()  // NOLINT
+  {
+    auto pair = static_cast<uint16_t>(ptr[0]);
+    if (offset + Bits > 8) { pair |= static_cast<uint16_t>(ptr[1]) << 8; }
+    return static_cast<uint8_t>((pair >> offset) & kMask);
+  }
+
+  constexpr auto operator=(uint8_t code) -> bitfield_ref_t&
+  {
+    if (offset + Bits > 8) {
+      auto pair = static_cast<uint16_t>(ptr[0]);
+      pair |= static_cast<uint16_t>(ptr[1]) << 8;
+      pair &= ~(static_cast<uint16_t>(kMask) << offset);
+      pair |= static_cast<uint16_t>(code) << offset;
+      ptr[0] = static_cast<uint8_t>(Pow2<256>::mod(pair));
+      ptr[1] = static_cast<uint8_t>(Pow2<256>::div(pair));
+    } else {
+      ptr[0] = (ptr[0] & ~(kMask << offset)) | (code << offset);
+    }
+    return *this;
+  }
+};
+
+/**
+ * View a byte array as an array of unsigned integers of custom small bit size.
+ *
+ * @tparam Bits number of bits comprising a single element of the array.
+ */
+template <uint32_t Bits>
+struct bitfield_view_t {
+  static_assert(Bits <= 8 && Bits > 0, "Bit code must fit one byte");
+  uint8_t* raw;
+
+  constexpr auto operator[](uint32_t i) -> bitfield_ref_t<Bits>
+  {
+    uint32_t bit_offset = i * Bits;
+    return bitfield_ref_t<Bits>{raw + Pow2<8>::div(bit_offset), Pow2<8>::mod(bit_offset)};
+  }
+};
+
+/**
+ * Process a single vector in a list.
+ *
+ * @tparam PqBits
+ * @tparam Action tells how to process a single vector (e.g. reconstruct or just unpack)
+ *
+ * @param[in] in_list_data the encoded cluster data.
+ * @param[in] in_ix in-cluster index of the vector to be decoded (one-per-thread).
+ * @param[in] out_ix the output index passed to the action
+ * @param[in] pq_dim
+ * @param action a callable action to be invoked on each PQ code (component of the encoding)
+ *    type: void (uint8_t code, uint32_t out_ix, uint32_t j), where j = [0..pq_dim).
+ */
+template <uint32_t PqBits, typename Action>
+__device__ void run_on_vector(
+  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> in_list_data,
+  uint32_t in_ix,
+  uint32_t out_ix,
+  uint32_t pq_dim,
+  Action action)
+{
+  using group_align         = Pow2<kIndexGroupSize>;
+  const uint32_t group_ix   = group_align::div(in_ix);
+  const uint32_t ingroup_ix = group_align::mod(in_ix);
+
+  pq_vec_t code_chunk;
+  bitfield_view_t<PqBits> code_view{reinterpret_cast<uint8_t*>(&code_chunk)};
+  constexpr uint32_t kChunkSize = (sizeof(pq_vec_t) * 8u) / PqBits;
+  for (uint32_t j = 0, i = 0; j < pq_dim; i++) {
+    // read the chunk
+    code_chunk = *reinterpret_cast<const pq_vec_t*>(&in_list_data(group_ix, i, ingroup_ix, 0));
+    // read the codes, one/pq_dim at a time
+#pragma unroll
+    for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) {
+      // read a piece of the reconstructed vector
+      action(code_view[k], out_ix, j);
+    }
+  }
+}
+
+/**
+ * Process a single vector in a list.
+ *
+ * @tparam PqBits
+ * @tparam SubWarpSize how many threads work on the same ix (only the first thread writes data).
+ * @tparam IdxT type of the index passed to the action
+ * @tparam Action tells how to process a single vector (e.g. encode or just pack)
+ *
+ * @param[in] out_list_data the encoded cluster data.
+ * @param[in] out_ix in-cluster index of the vector to be processed (one-per-SubWarpSize threads).
+ * @param[in] in_ix the input index passed to the action (one-per-SubWarpSize threads).
+ * @param[in] pq_dim
+ * @param action a callable action to be invoked on each PQ code (component of the encoding)
+ *    type: (uint32_t in_ix, uint32_t j) -> uint8_t, where j = [0..pq_dim).
+ */
+template <uint32_t PqBits, uint32_t SubWarpSize, typename IdxT, typename Action>
+__device__ void write_vector(
+  device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> out_list_data,
+  uint32_t out_ix,
+  IdxT in_ix,
+  uint32_t pq_dim,
+  Action action)
+{
+  const uint32_t lane_id = Pow2<SubWarpSize>::mod(threadIdx.x);
+
+  using group_align         = Pow2<kIndexGroupSize>;
+  const uint32_t group_ix   = group_align::div(out_ix);
+  const uint32_t ingroup_ix = group_align::mod(out_ix);
+
+  pq_vec_t code_chunk;
+  bitfield_view_t<PqBits> code_view{reinterpret_cast<uint8_t*>(&code_chunk)};
+  constexpr uint32_t kChunkSize = (sizeof(pq_vec_t) * 8u) / PqBits;
+  for (uint32_t j = 0, i = 0; j < pq_dim; i++) {
+    // clear the chunk
+    if (lane_id == 0) { code_chunk = pq_vec_t{}; }
+    // write the codes, one/pq_dim at a time
+#pragma unroll
+    for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) {
+      // write a single code
+      uint8_t code = action(in_ix, j);
+      if (lane_id == 0) { code_view[k] = code; }
+    }
+    // write the chunk to the list
+    if (lane_id == 0) {
+      *reinterpret_cast<pq_vec_t*>(&out_list_data(group_ix, i, ingroup_ix, 0)) = code_chunk;
+    }
+  }
+}
+
+/** Process the given indices or a block of a single list (cluster). */
+template <uint32_t PqBits, typename Action>
+__device__ void run_on_list(
+  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> in_list_data,
+  std::variant<uint32_t, const uint32_t*> offset_or_indices,
+  uint32_t len,
+  uint32_t pq_dim,
+  Action action)
+{
+  for (uint32_t ix = threadIdx.x + blockDim.x * blockIdx.x; ix < len; ix += blockDim.x) {
+    const uint32_t src_ix = std::holds_alternative<uint32_t>(offset_or_indices)
+                              ? std::get<uint32_t>(offset_or_indices) + ix
+                              : std::get<const uint32_t*>(offset_or_indices)[ix];
+    run_on_vector<PqBits>(in_list_data, src_ix, ix, pq_dim, action);
+  }
+}
+
+/** Process the given indices or a block of a single list (cluster). */
+template <uint32_t PqBits, uint32_t SubWarpSize, typename Action>
+__device__ void write_list(
+  device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> out_list_data,
+  std::variant<uint32_t, const uint32_t*> offset_or_indices,
+  uint32_t len,
+  uint32_t pq_dim,
+  Action action)
+{
+  using subwarp_align = Pow2<SubWarpSize>;
+  uint32_t stride     = subwarp_align::div(blockDim.x);
+  uint32_t ix         = subwarp_align::div(threadIdx.x + blockDim.x * blockIdx.x);
+  for (; ix < len; ix += stride) {
+    const uint32_t dst_ix = std::holds_alternative<uint32_t>(offset_or_indices)
+                              ? std::get<uint32_t>(offset_or_indices) + ix
+                              : std::get<const uint32_t*>(offset_or_indices)[ix];
+    write_vector<PqBits, SubWarpSize>(out_list_data, dst_ix, ix, pq_dim, action);
+  }
+}
+
+}  // namespace raft::neighbors::ivf_pq::detail

From d6cad17a66136a986186581d31fa967449d933ec Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Fri, 17 Mar 2023 13:36:16 +0100
Subject: [PATCH 19/25] Fix failing tests

---
 cpp/test/neighbors/ann_ivf_pq.cuh | 114 ++++++++++++++++--------------
 1 file changed, 59 insertions(+), 55 deletions(-)

diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh
index 52987f2227..34dfcd83d2 100644
--- a/cpp/test/neighbors/ann_ivf_pq.cuh
+++ b/cpp/test/neighbors/ann_ivf_pq.cuh
@@ -116,6 +116,33 @@ inline auto operator<<(std::ostream& os, const ivf_pq_inputs& p) -> std::ostream
   return os;
 }
 
+template <typename T>
+void compare_vectors_l2(
+  const raft::device_resources& res, T a, T b, uint32_t label, double compression_ratio, double eps)
+{
+  auto n_rows = a.extent(0);
+  auto dim    = a.extent(1);
+  rmm::mr::managed_memory_resource managed_memory;
+  auto dist = make_device_mdarray<double>(res, &managed_memory, make_extents<uint32_t>(n_rows));
+  linalg::map_offset(res, dist.view(), [a, b, dim] __device__(uint32_t i) {
+    spatial::knn::detail::utils::mapping<float> f{};
+    double d = 0.0f;
+    for (uint32_t j = 0; j < dim; j++) {
+      double t = f(a(i, j)) - f(b(i, j));
+      d += t * t;
+    }
+    return sqrt(d / double(dim));
+  });
+  res.sync_stream();
+  for (uint32_t i = 0; i < n_rows; i++) {
+    double d = dist(i);
+    // The theoretical estimate of the error is hard to come up with,
+    // the estimate below is based on experimentation + curse of dimensionality
+    ASSERT_LE(d, eps * std::pow(2.0, compression_ratio))
+      << " (label = " << label << ", ix = " << i << ", eps = " << eps << ")";
+  }
+}
+
 template <typename IdxT>
 auto min_output_size(const raft::device_resources& handle,
                      const ivf_pq::index<IdxT>& index,
@@ -178,7 +205,7 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
     handle_.sync_stream(stream_);
   }
 
-  index<IdxT> build_only()
+  auto build_only()
   {
     auto ipams              = ps.index_params;
     ipams.add_data_on_build = true;
@@ -188,7 +215,7 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
     return ivf_pq::build<DataT, IdxT>(handle_, ipams, index_view);
   }
 
-  index<IdxT> build_2_extends()
+  auto build_2_extends()
   {
     auto db_indices = make_device_vector<IdxT>(handle_, ps.num_db_vecs);
     linalg::map_offset(handle_, db_indices.view(), identity_op{});
@@ -219,7 +246,7 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
     return idx;
   }
 
-  index<IdxT> build_serialize()
+  auto build_serialize()
   {
     ivf_pq::serialize<IdxT>(handle_, "ivf_pq_index", build_only());
     return ivf_pq::deserialize<IdxT>(handle_, "ivf_pq_index");
@@ -241,10 +268,6 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
     auto rec_data  = make_device_matrix<DataT>(handle_, n_take, dim);
     auto orig_data = make_device_matrix<DataT>(handle_, n_take, dim);
 
-    rmm::mr::managed_memory_resource managed_memory;
-    auto dist =
-      make_device_mdarray<double>(handle_, &managed_memory, make_extents<uint32_t>(n_take));
-
     ivf_pq::reconstruct_list_data(handle_, index, rec_data.view(), label, n_skip);
 
     matrix::gather(database.data(),
@@ -255,26 +278,7 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
                    orig_data.data_handle(),
                    stream_);
 
-    auto rec_data_view  = rec_data.view();
-    auto orig_data_view = orig_data.view();
-    linalg::map_offset(
-      handle_, dist.view(), [rec_data_view, orig_data_view, dim] __device__(uint32_t i) {
-        spatial::knn::detail::utils::mapping<float> f{};
-        double d = 0.0f;
-        for (uint32_t j = 0; j < dim; j++) {
-          double t = f(rec_data_view(i, j)) - f(orig_data_view(i, j));
-          d += t * t;
-        }
-        return sqrt(d / double(dim));
-      });
-    handle_.sync_stream();
-    for (uint32_t i = 0; i < n_take; i++) {
-      double d = dist(i);
-      // The theoretical estimate of the error is hard to come up with,
-      // the estimate below is based on experimentation + curse of dimensionality
-      ASSERT_LE(d, 0.055 * std::pow(2.0, compression_ratio))
-        << " (label = " << label << ", ix = " << (n_skip + i) << ")";
-    }
+    compare_vectors_l2(handle_, rec_data.view(), orig_data.view(), label, compression_ratio, 0.055);
   }
 
   void check_reconstruct_extend(index<IdxT>* index, double compression_ratio, uint32_t label)
@@ -285,36 +289,28 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
     auto n_rows   = old_list->size.load();
     if (n_rows == 0) { return; }
 
-    auto vectors = make_device_matrix<DataT>(handle_, n_rows, index->dim());
-    auto indices = make_device_vector<IdxT>(handle_, n_rows);
+    auto vectors_1 = make_device_matrix<EvalT>(handle_, n_rows, index->dim());
+    auto indices   = make_device_vector<IdxT>(handle_, n_rows);
     copy(indices.data_handle(), old_list->indices.data_handle(), n_rows, stream_);
 
-    ivf_pq::reconstruct_list_data(handle_, *index, vectors.view(), label, 0);
+    ivf_pq::reconstruct_list_data(handle_, *index, vectors_1.view(), label, 0);
     ivf_pq::erase_list(handle_, index, label);
     // NB: passing the type parameter because const->non-const implicit conversion of the mdspans
     // breaks type inference
-    ivf_pq::extend_list<DataT, IdxT>(handle_, index, vectors.view(), indices.view(), label);
+    ivf_pq::extend_list<EvalT, IdxT>(handle_, index, vectors_1.view(), indices.view(), label);
 
     auto& new_list = index->lists()[label];
     ASSERT_NE(old_list.get(), new_list.get())
       << "The old list should have been shared and retained after ivf_pq index has erased the "
          "corresponding cluster.";
 
-    auto n_codes = old_list->data.size();
-    rmm::mr::managed_memory_resource managed_memory;
-    rmm::device_scalar<int> errs(stream_, &managed_memory);
-    linalg::mapReduce(errs.data(),
-                      n_codes,
-                      0,
-                      compose_op{cast_op<int>{}, notequal_op{}},
-                      add_op{},
-                      stream_,
-                      old_list->data.data_handle(),
-                      new_list->data.data_handle());
-    auto err_value = errs.value(stream_);
-    auto err_rate  = double(err_value) / double(n_codes);
-    ASSERT_LE(err_rate, 0.01 * std::pow(2.0, compression_ratio))
-      << " (label = " << label << ", errors = " << err_value << ", size = " << n_codes << ")";
+    auto vectors_2 = make_device_matrix<EvalT>(handle_, n_rows, index->dim());
+    ivf_pq::reconstruct_list_data(handle_, *index, vectors_2.view(), label, 0);
+    // The code search is unstable, and there's high chance of repeating values of the lvl-2 codes.
+    // Hence, encoding-decoding chain often leads to altering both the PQ codes and the
+    // reconstructed data.
+    compare_vectors_l2(
+      handle_, vectors_1.view(), vectors_2.view(), label, compression_ratio, 0.025);
   }
 
   void check_packing(index<IdxT>* index, uint32_t label)
@@ -349,15 +345,23 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
     double compression_ratio =
       static_cast<double>(ps.dim * 8) / static_cast<double>(index.pq_dim() * index.pq_bits());
 
-    // Reconstruct and re-write vectors for one label
-    check_reconstruct_extend(&index, compression_ratio, uint32_t(rand()) % index.n_lists());
-
-    // Dump and re-write codes for one label
-    check_packing(&index, uint32_t(rand()) % index.n_lists());
-
-    // check a small subset of data in a randomly chosen cluster to see if the data reconstruction
-    // works well.
-    check_reconstruction(index, compression_ratio, uint32_t(rand()) % index.n_lists(), 100, 7);
+    for (uint32_t label = 0; label < index.n_lists(); label++) {
+      switch (label % 3) {
+        case 0: {
+          // Reconstruct and re-write vectors for one label
+          check_reconstruct_extend(&index, compression_ratio, label);
+        } break;
+        case 1: {
+          // Dump and re-write codes for one label
+          check_packing(&index, label);
+        } break;
+        default: {
+          // check a small subset of data in a randomly chosen cluster to see if the data
+          // reconstruction works well.
+          check_reconstruction(index, compression_ratio, label, 100, 7);
+        }
+      }
+    }
 
     size_t queries_size = ps.num_queries * ps.k;
     std::vector<IdxT> indices_ivf_pq(queries_size);

From 1504027193dd69e13d36b81d217d4235ccafea3e Mon Sep 17 00:00:00 2001
From: achirkin <achirkin@users.noreply.github.com>
Date: Fri, 17 Mar 2023 17:10:10 +0100
Subject: [PATCH 20/25] Relax the test criterion eps a little bit

---
 cpp/test/neighbors/ann_ivf_pq.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh
index 34dfcd83d2..7e87302606 100644
--- a/cpp/test/neighbors/ann_ivf_pq.cuh
+++ b/cpp/test/neighbors/ann_ivf_pq.cuh
@@ -278,7 +278,7 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
                    orig_data.data_handle(),
                    stream_);
 
-    compare_vectors_l2(handle_, rec_data.view(), orig_data.view(), label, compression_ratio, 0.055);
+    compare_vectors_l2(handle_, rec_data.view(), orig_data.view(), label, compression_ratio, 0.06);
   }
 
   void check_reconstruct_extend(index<IdxT>* index, double compression_ratio, uint32_t label)

From 09f1a0dbaade4441647e6a8f6200bdca8857b386 Mon Sep 17 00:00:00 2001
From: Tamas Bela Feher <tfeher@nvidia.com>
Date: Fri, 31 Mar 2023 21:08:44 +0200
Subject: [PATCH 21/25] Move ivf_pq helpers to separate file and namespace

---
 cpp/include/raft/neighbors/ivf_pq.cuh         | 229 -------------
 cpp/include/raft/neighbors/ivf_pq_helpers.cuh | 301 ++++++++++++++++++
 cpp/test/neighbors/ann_ivf_pq.cuh             |  19 +-
 3 files changed, 312 insertions(+), 237 deletions(-)
 create mode 100644 cpp/include/raft/neighbors/ivf_pq_helpers.cuh

diff --git a/cpp/include/raft/neighbors/ivf_pq.cuh b/cpp/include/raft/neighbors/ivf_pq.cuh
index 464a00fb67..0a3ce68dc2 100644
--- a/cpp/include/raft/neighbors/ivf_pq.cuh
+++ b/cpp/include/raft/neighbors/ivf_pq.cuh
@@ -230,175 +230,6 @@ auto build(raft::device_resources const& handle,
   return detail::build(handle, params, dataset, n_rows, dim);
 }
 
-/**
- * @brief Unpack `n_take` consecutive records of a single list (cluster) in the compressed index
- * starting at given `offset`, one code per byte (independently of pq_bits).
- *
- * Usage example:
- * @code{.cpp}
- *   // We will unpack the fourth cluster
- *   uint32_t label = 3;
- *   // Get the list size
- *   uint32_t list_size = 0;
- *   raft::copy(&list_size, index.list_sizes().data_handle() + label, 1, res.get_stream());
- *   res.sync_stream();
- *   // allocate the buffer for the output
- *   auto codes = raft::make_device_matrix<float>(res, list_size, index.pq_dim());
- *   // unpack the whole list
- *   ivf_pq::unpack_list_data(res, index, codes.view(), label, 0);
- * @endcode
- *
- * @tparam IdxT type of the indices in the source dataset
- *
- * @param[in] res
- * @param[in] index
- * @param[out] out_codes
- *   the destination buffer [n_take, index.pq_dim()].
- *   The length `n_take` defines how many records to unpack,
- *   it must be smaller than the list size.
- * @param[in] label
- *   The id of the list (cluster) to decode.
- * @param[in] offset
- *   How many records in the list to skip.
- */
-template <typename IdxT>
-void unpack_list_data(raft::device_resources const& res,
-                      const index<IdxT>& index,
-                      device_matrix_view<uint8_t, uint32_t, row_major> out_codes,
-                      uint32_t label,
-                      uint32_t offset)
-{
-  return detail::unpack_list_data<IdxT>(res, index, out_codes, label, offset);
-}
-
-/**
- * @brief Unpack a series of records of a single list (cluster) in the compressed index
- * by their in-list offsets, one code per byte (independently of pq_bits).
- *
- * Usage example:
- * @code{.cpp}
- *   // We will unpack the fourth cluster
- *   uint32_t label = 3;
- *   // Create the selection vector
- *   auto selected_indices = raft::make_device_vector<uint32_t>(res, 4);
- *   ... fill the indices ...
- *   res.sync_stream();
- *   // allocate the buffer for the output
- *   auto codes = raft::make_device_matrix<float>(res, selected_indices.size(), index.pq_dim());
- *   // decode the whole list
- *   ivf_pq::unpack_list_data(
- *       res, index, selected_indices.view(), codes.view(), label);
- * @endcode
- *
- * @tparam IdxT type of the indices in the source dataset
- *
- * @param[in] res
- * @param[in] index
- * @param[in] in_cluster_indices
- *   The offsets of the selected indices within the cluster.
- * @param[out] out_codes
- *   the destination buffer [n_take, index.pq_dim()].
- *   The length `n_take` defines how many records to unpack,
- *   it must be smaller than the list size.
- * @param[in] label
- *   The id of the list (cluster) to decode.
- */
-template <typename IdxT>
-void unpack_list_data(raft::device_resources const& res,
-                      const index<IdxT>& index,
-                      device_vector_view<const uint32_t> in_cluster_indices,
-                      device_matrix_view<uint8_t, uint32_t, row_major> out_codes,
-                      uint32_t label)
-{
-  return detail::unpack_list_data<IdxT>(res, index, out_codes, label, in_cluster_indices);
-}
-
-/**
- * @brief Decode `n_take` consecutive records of a single list (cluster) in the compressed index
- * starting at given `offset`.
- *
- * Usage example:
- * @code{.cpp}
- *   // We will reconstruct the fourth cluster
- *   uint32_t label = 3;
- *   // Get the list size
- *   uint32_t list_size = 0;
- *   raft::copy(&list_size, index.list_sizes().data_handle() + label, 1, res.get_stream());
- *   res.sync_stream();
- *   // allocate the buffer for the output
- *   auto decoded_vectors = raft::make_device_matrix<float>(res, list_size, index.dim());
- *   // decode the whole list
- *   ivf_pq::reconstruct_list_data(res, index, decoded_vectors.view(), label, 0);
- * @endcode
- *
- * @tparam T data element type
- * @tparam IdxT type of the indices in the source dataset
- *
- * @param[in] res
- * @param[in] index
- * @param[out] out_vectors
- *   the destination buffer [n_take, index.dim()].
- *   The length `n_take` defines how many records to reconstruct,
- *   it must be smaller than the list size.
- * @param[in] label
- *   The id of the list (cluster) to decode.
- * @param[in] offset
- *   How many records in the list to skip.
- */
-template <typename T, typename IdxT>
-void reconstruct_list_data(raft::device_resources const& res,
-                           const index<IdxT>& index,
-                           device_matrix_view<T, uint32_t, row_major> out_vectors,
-                           uint32_t label,
-                           uint32_t offset)
-{
-  return detail::reconstruct_list_data(res, index, out_vectors, label, offset);
-}
-
-/**
- * @brief Decode a series of records of a single list (cluster) in the compressed index
- * by their in-list offsets.
- *
- * Usage example:
- * @code{.cpp}
- *   // We will reconstruct the fourth cluster
- *   uint32_t label = 3;
- *   // Create the selection vector
- *   auto selected_indices = raft::make_device_vector<uint32_t>(res, 4);
- *   ... fill the indices ...
- *   res.sync_stream();
- *   // allocate the buffer for the output
- *   auto decoded_vectors = raft::make_device_matrix<float>(
- *                             res, selected_indices.size(), index.dim());
- *   // decode the whole list
- *   ivf_pq::reconstruct_list_data(
- *       res, index, selected_indices.view(), decoded_vectors.view(), label);
- * @endcode
- *
- * @tparam T data element type
- * @tparam IdxT type of the indices in the source dataset
- *
- * @param[in] res
- * @param[in] index
- * @param[in] in_cluster_indices
- *   The offsets of the selected indices within the cluster.
- * @param[out] out_vectors
- *   the destination buffer [n_take, index.dim()].
- *   The length `n_take` defines how many records to reconstruct,
- *   it must be smaller than the list size.
- * @param[in] label
- *   The id of the list (cluster) to decode.
- */
-template <typename T, typename IdxT>
-void reconstruct_list_data(raft::device_resources const& res,
-                           const index<IdxT>& index,
-                           device_vector_view<const uint32_t> in_cluster_indices,
-                           device_matrix_view<T, uint32_t, row_major> out_vectors,
-                           uint32_t label)
-{
-  return detail::reconstruct_list_data(res, index, out_vectors, label, in_cluster_indices);
-}
-
 /**
  * @brief Build a new index containing the data of the original plus new extra vectors.
  *
@@ -465,66 +296,6 @@ void extend(raft::device_resources const& handle,
   detail::extend(handle, idx, new_vectors, new_indices, n_rows);
 }
 
-/**
- * @brief Extend one list of the index in-place, by the list label, skipping the classification and
- * encoding steps.
- *
- * @tparam IdxT
- *
- * @param[in] res
- * @param[inout] index
- * @param[in] new_codes flat PQ codes, one code per byte [n_rows, index.pq_dim()]
- * @param[in] new_indices source indices [n_rows]
- * @param[in] label the id of the target list (cluster).
- */
-template <typename IdxT>
-void extend_list_with_codes(raft::device_resources const& res,
-                            index<IdxT>* index,
-                            device_matrix_view<const uint8_t, uint32_t, row_major> new_codes,
-                            device_vector_view<const IdxT, uint32_t, row_major> new_indices,
-                            uint32_t label)
-{
-  detail::extend_list_with_codes(res, index, new_codes, new_indices, label);
-}
-
-/**
- * @brief Extend one list of the index in-place, by the list label, skipping the classification
- * step.
- *
- * @tparam T
- * @tparam IdxT
- *
- * @param[in] res
- * @param[inout] index
- * @param[in] new_vectors data to encode [n_rows, index.dim()]
- * @param[in] new_indices source indices [n_rows]
- * @param[in] label the id of the target list (cluster).
- *
- */
-template <typename T, typename IdxT>
-void extend_list(raft::device_resources const& res,
-                 index<IdxT>* index,
-                 device_matrix_view<const T, uint32_t, row_major> new_vectors,
-                 device_vector_view<const IdxT, uint32_t, row_major> new_indices,
-                 uint32_t label)
-{
-  detail::extend_list(res, index, new_vectors, new_indices, label);
-}
-
-/**
- * @brief Remove all data from a single list (cluster) in the index.
- *
- * @tparam IdxT
- * @param[in] res
- * @param[inout] index
- * @param[in] label the id of the target list (cluster).
- */
-template <typename IdxT>
-void erase_list(raft::device_resources const& res, index<IdxT>* index, uint32_t label)
-{
-  detail::erase_list(res, index, label);
-}
-
 /**
  * @brief Search ANN using the constructed index.
  *
diff --git a/cpp/include/raft/neighbors/ivf_pq_helpers.cuh b/cpp/include/raft/neighbors/ivf_pq_helpers.cuh
new file mode 100644
index 0000000000..b51f0696d0
--- /dev/null
+++ b/cpp/include/raft/neighbors/ivf_pq_helpers.cuh
@@ -0,0 +1,301 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/neighbors/detail/ivf_pq_build.cuh>
+#include <raft/neighbors/ivf_pq_types.hpp>
+
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/device_resources.hpp>
+
+namespace raft::neighbors::ivf_pq::helpers {
+
+/**
+ * @defgroup ivf_pq_helpers Helper functions for manipulationg IVF PQ Index
+ * @{
+ */
+
+/**
+ * @brief Unpack `n_take` consecutive records of a single list (cluster) in the compressed index
+ * starting at given `offset`, one code per byte (independently of pq_bits).
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   // We will unpack the fourth cluster
+ *   uint32_t label = 3;
+ *   // Get the list size
+ *   uint32_t list_size = 0;
+ *   raft::copy(&list_size, index.list_sizes().data_handle() + label, 1, res.get_stream());
+ *   res.sync_stream();
+ *   // allocate the buffer for the output
+ *   auto codes = raft::make_device_matrix<float>(res, list_size, index.pq_dim());
+ *   // unpack the whole list
+ *   ivf_pq::helpers::unpack_list_data(res, index, codes.view(), label, 0);
+ * @endcode
+ *
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] res
+ * @param[in] index
+ * @param[out] out_codes
+ *   the destination buffer [n_take, index.pq_dim()].
+ *   The length `n_take` defines how many records to unpack,
+ *   it must be smaller than the list size.
+ * @param[in] label
+ *   The id of the list (cluster) to decode.
+ * @param[in] offset
+ *   How many records in the list to skip.
+ */
+template <typename IdxT>
+void unpack_list_data(raft::device_resources const& res,
+                      const index<IdxT>& index,
+                      device_matrix_view<uint8_t, uint32_t, row_major> out_codes,
+                      uint32_t label,
+                      uint32_t offset)
+{
+  return ivf_pq::detail::unpack_list_data<IdxT>(res, index, out_codes, label, offset);
+}
+
+/**
+ * @brief Unpack a series of records of a single list (cluster) in the compressed index
+ * by their in-list offsets, one code per byte (independently of pq_bits).
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   // We will unpack the fourth cluster
+ *   uint32_t label = 3;
+ *   // Create the selection vector
+ *   auto selected_indices = raft::make_device_vector<uint32_t>(res, 4);
+ *   ... fill the indices ...
+ *   res.sync_stream();
+ *   // allocate the buffer for the output
+ *   auto codes = raft::make_device_matrix<float>(res, selected_indices.size(), index.pq_dim());
+ *   // decode the whole list
+ *   ivf_pq::helpers::unpack_list_data(
+ *       res, index, selected_indices.view(), codes.view(), label);
+ * @endcode
+ *
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] res
+ * @param[in] index
+ * @param[in] in_cluster_indices
+ *   The offsets of the selected indices within the cluster.
+ * @param[out] out_codes
+ *   the destination buffer [n_take, index.pq_dim()].
+ *   The length `n_take` defines how many records to unpack,
+ *   it must be smaller than the list size.
+ * @param[in] label
+ *   The id of the list (cluster) to decode.
+ */
+template <typename IdxT>
+void unpack_list_data(raft::device_resources const& res,
+                      const index<IdxT>& index,
+                      device_vector_view<const uint32_t> in_cluster_indices,
+                      device_matrix_view<uint8_t, uint32_t, row_major> out_codes,
+                      uint32_t label)
+{
+  return ivf_pq::detail::unpack_list_data<IdxT>(res, index, out_codes, label, in_cluster_indices);
+}
+
+/**
+ * @brief Decode `n_take` consecutive records of a single list (cluster) in the compressed index
+ * starting at given `offset`.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   // We will reconstruct the fourth cluster
+ *   uint32_t label = 3;
+ *   // Get the list size
+ *   uint32_t list_size = 0;
+ *   raft::copy(&list_size, index.list_sizes().data_handle() + label, 1, res.get_stream());
+ *   res.sync_stream();
+ *   // allocate the buffer for the output
+ *   auto decoded_vectors = raft::make_device_matrix<float>(res, list_size, index.dim());
+ *   // decode the whole list
+ *   ivf_pq::helpers::reconstruct_list_data(res, index, decoded_vectors.view(), label, 0);
+ * @endcode
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] res
+ * @param[in] index
+ * @param[out] out_vectors
+ *   the destination buffer [n_take, index.dim()].
+ *   The length `n_take` defines how many records to reconstruct,
+ *   it must be smaller than the list size.
+ * @param[in] label
+ *   The id of the list (cluster) to decode.
+ * @param[in] offset
+ *   How many records in the list to skip.
+ */
+template <typename T, typename IdxT>
+void reconstruct_list_data(raft::device_resources const& res,
+                           const index<IdxT>& index,
+                           device_matrix_view<T, uint32_t, row_major> out_vectors,
+                           uint32_t label,
+                           uint32_t offset)
+{
+  return ivf_pq::detail::reconstruct_list_data(res, index, out_vectors, label, offset);
+}
+
+/**
+ * @brief Decode a series of records of a single list (cluster) in the compressed index
+ * by their in-list offsets.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   // We will reconstruct the fourth cluster
+ *   uint32_t label = 3;
+ *   // Create the selection vector
+ *   auto selected_indices = raft::make_device_vector<uint32_t>(res, 4);
+ *   ... fill the indices ...
+ *   res.sync_stream();
+ *   // allocate the buffer for the output
+ *   auto decoded_vectors = raft::make_device_matrix<float>(
+ *                             res, selected_indices.size(), index.dim());
+ *   // decode the whole list
+ *   ivf_pq::helpers::reconstruct_list_data(
+ *       res, index, selected_indices.view(), decoded_vectors.view(), label);
+ * @endcode
+ *
+ * @tparam T data element type
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] res
+ * @param[in] index
+ * @param[in] in_cluster_indices
+ *   The offsets of the selected indices within the cluster.
+ * @param[out] out_vectors
+ *   the destination buffer [n_take, index.dim()].
+ *   The length `n_take` defines how many records to reconstruct,
+ *   it must be smaller than the list size.
+ * @param[in] label
+ *   The id of the list (cluster) to decode.
+ */
+template <typename T, typename IdxT>
+void reconstruct_list_data(raft::device_resources const& res,
+                           const index<IdxT>& index,
+                           device_vector_view<const uint32_t> in_cluster_indices,
+                           device_matrix_view<T, uint32_t, row_major> out_vectors,
+                           uint32_t label)
+{
+  return ivf_pq::detail::reconstruct_list_data(res, index, out_vectors, label, in_cluster_indices);
+}
+
+/**
+ * @brief Extend one list of the index in-place, by the list label, skipping the classification and
+ * encoding steps.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   // We will extend the fourth cluster
+ *   uint32_t label = 3;
+ *   // We will fill 4 new vectors
+ *   uint32_t n_vec = 4;
+ *   // Indices of the new vectors
+ *   auto indices = raft::make_device_vector<uint32_t>(res, n_vec);
+ *   ... fill the indices ...
+ *   auto new_codes = raft::make_device_matrix<uint8_t, uint32_t, row_major> new_codes(
+ *       res, n_vec, index.pq_dim());
+ *   ... fill codes ...
+ *   // extend list with new codes
+ *   ivf_pq::helpers::extend_list_with_codes(
+ *       res, &index, codes.view(), indices.view(), label);
+ * @endcode
+ *
+ * @tparam IdxT
+ *
+ * @param[in] res
+ * @param[inout] index
+ * @param[in] new_codes flat PQ codes, one code per byte [n_rows, index.pq_dim()]
+ * @param[in] new_indices source indices [n_rows]
+ * @param[in] label the id of the target list (cluster).
+ */
+template <typename IdxT>
+void extend_list_with_codes(raft::device_resources const& res,
+                            index<IdxT>* index,
+                            device_matrix_view<const uint8_t, uint32_t, row_major> new_codes,
+                            device_vector_view<const IdxT, uint32_t, row_major> new_indices,
+                            uint32_t label)
+{
+  ivf_pq::detail::extend_list_with_codes(res, index, new_codes, new_indices, label);
+}
+
+/**
+ * @brief Extend one list of the index in-place, by the list label, skipping the classification
+ * step.
+ *
+ *  Usage example:
+ * @code{.cpp}
+ *   // We will extend the fourth cluster
+ *   uint32_t label = 3;
+ *   // We will extend with 4 new vectors
+ *   uint32_t n_vec = 4;
+ *   // Indices of the new vectors
+ *   auto indices = raft::make_device_vector<uint32_t>(res, n_vec);
+ *   ... fill the indices ...
+ *   auto new_vectors = raft::make_device_matrix<float, uint32_t, row_major> new_codes(
+ *       res, n_vec, index.dim());
+ *   ... fill vectors ...
+ *   // extend list with new vectors
+ *   ivf_pq::helpers::extend_list(
+ *       res, &index, new_vectors.view(), indices.view(), label);
+ * @endcode
+ *
+ * @tparam T
+ * @tparam IdxT
+ *
+ * @param[in] res
+ * @param[inout] index
+ * @param[in] new_vectors data to encode [n_rows, index.dim()]
+ * @param[in] new_indices source indices [n_rows]
+ * @param[in] label the id of the target list (cluster).
+ *
+ */
+template <typename T, typename IdxT>
+void extend_list(raft::device_resources const& res,
+                 index<IdxT>* index,
+                 device_matrix_view<const T, uint32_t, row_major> new_vectors,
+                 device_vector_view<const IdxT, uint32_t, row_major> new_indices,
+                 uint32_t label)
+{
+  ivf_pq::detail::extend_list(res, index, new_vectors, new_indices, label);
+}
+
+/**
+ * @brief Remove all data from a single list (cluster) in the index.
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   // We will erase the fourth cluster (label = 3)
+ *   ivf_pq::helpers::erase_list(res, &index, 3);
+ * @endcode
+ *
+ * @tparam IdxT
+ * @param[in] res
+ * @param[inout] index
+ * @param[in] label the id of the target list (cluster).
+ */
+template <typename IdxT>
+void erase_list(raft::device_resources const& res, index<IdxT>* index, uint32_t label)
+{
+  ivf_pq::detail::erase_list(res, index, label);
+}
+
+}  // namespace raft::neighbors::ivf_pq::helpers
diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh
index edc82f4172..101509a878 100644
--- a/cpp/test/neighbors/ann_ivf_pq.cuh
+++ b/cpp/test/neighbors/ann_ivf_pq.cuh
@@ -26,6 +26,7 @@
 #include <raft/linalg/map_reduce.cuh>
 #include <raft/matrix/gather.cuh>
 #include <raft/neighbors/ivf_pq.cuh>
+#include <raft/neighbors/ivf_pq_helpers.cuh>
 #include <raft/random/rng.cuh>
 #ifdef RAFT_COMPILED
 #include <raft/neighbors/specializations.cuh>
@@ -268,7 +269,7 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
     auto rec_data  = make_device_matrix<DataT>(handle_, n_take, dim);
     auto orig_data = make_device_matrix<DataT>(handle_, n_take, dim);
 
-    ivf_pq::reconstruct_list_data(handle_, index, rec_data.view(), label, n_skip);
+    ivf_pq::helpers::reconstruct_list_data(handle_, index, rec_data.view(), label, n_skip);
 
     matrix::gather(database.data(),
                    IdxT{dim},
@@ -293,11 +294,12 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
     auto indices   = make_device_vector<IdxT>(handle_, n_rows);
     copy(indices.data_handle(), old_list->indices.data_handle(), n_rows, stream_);
 
-    ivf_pq::reconstruct_list_data(handle_, *index, vectors_1.view(), label, 0);
-    ivf_pq::erase_list(handle_, index, label);
+    ivf_pq::helpers::reconstruct_list_data(handle_, *index, vectors_1.view(), label, 0);
+    ivf_pq::helpers::erase_list(handle_, index, label);
     // NB: passing the type parameter because const->non-const implicit conversion of the mdspans
     // breaks type inference
-    ivf_pq::extend_list<EvalT, IdxT>(handle_, index, vectors_1.view(), indices.view(), label);
+    ivf_pq::helpers::extend_list<EvalT, IdxT>(
+      handle_, index, vectors_1.view(), indices.view(), label);
 
     auto& new_list = index->lists()[label];
     ASSERT_NE(old_list.get(), new_list.get())
@@ -305,7 +307,7 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
          "corresponding cluster.";
 
     auto vectors_2 = make_device_matrix<EvalT>(handle_, n_rows, index->dim());
-    ivf_pq::reconstruct_list_data(handle_, *index, vectors_2.view(), label, 0);
+    ivf_pq::helpers::reconstruct_list_data(handle_, *index, vectors_2.view(), label, 0);
     // The code search is unstable, and there's high chance of repeating values of the lvl-2 codes.
     // Hence, encoding-decoding chain often leads to altering both the PQ codes and the
     // reconstructed data.
@@ -324,9 +326,10 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
     auto indices = make_device_vector<IdxT>(handle_, n_rows);
     copy(indices.data_handle(), old_list->indices.data_handle(), n_rows, stream_);
 
-    ivf_pq::unpack_list_data(handle_, *index, codes.view(), label, 0);
-    ivf_pq::erase_list(handle_, index, label);
-    ivf_pq::extend_list_with_codes<IdxT>(handle_, index, codes.view(), indices.view(), label);
+    ivf_pq::helpers::unpack_list_data(handle_, *index, codes.view(), label, 0);
+    ivf_pq::helpers::erase_list(handle_, index, label);
+    ivf_pq::helpers::extend_list_with_codes<IdxT>(
+      handle_, index, codes.view(), indices.view(), label);
 
     auto& new_list = index->lists()[label];
     ASSERT_NE(old_list.get(), new_list.get())

From 926f510aab3ec16844b9b18b4c04fcc51e8f1f7d Mon Sep 17 00:00:00 2001
From: Tamas Bela Feher <tfeher@nvidia.com>
Date: Fri, 31 Mar 2023 22:16:03 +0200
Subject: [PATCH 22/25] Add public API for pack_list_data

---
 cpp/include/raft/neighbors/ivf_pq_helpers.cuh | 34 +++++++++++++++++++
 cpp/test/neighbors/ann_ivf_pq.cuh             | 11 ++++++
 2 files changed, 45 insertions(+)

diff --git a/cpp/include/raft/neighbors/ivf_pq_helpers.cuh b/cpp/include/raft/neighbors/ivf_pq_helpers.cuh
index b51f0696d0..4655c60916 100644
--- a/cpp/include/raft/neighbors/ivf_pq_helpers.cuh
+++ b/cpp/include/raft/neighbors/ivf_pq_helpers.cuh
@@ -29,6 +29,40 @@ namespace raft::neighbors::ivf_pq::helpers {
  * @{
  */
 
+/**
+ * Write flat PQ codes into an existing list by the given offset.
+ *
+ * The list is identified by its label.
+ *
+ * NB: no memory allocation happens here; the list must fit the data (offset + n_vec).
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   // We will write into the 137th cluster
+ *   uint32_t label = 137;
+ *   // allocate the buffer for the input codes
+ *   auto codes = raft::make_device_matrix<float>(res, n_vec, index.pq_dim());
+ *   ... prepare n_vecs to pack into the list in codes ...
+ *   // write codes into the list starting from the 42nd position
+ *   ivf_pq::helpers::pack_list_data(res, &index, codes_to_pack, label, 42);
+ * @endcode
+ *
+ * @param[in] res
+ * @param[inout] index IVF-PQ index.
+ * @param[in] codes flat PQ codes, one code per byte [n_rows, pq_dim]
+ * @param[in] label The id of the list (cluster) into which we write.
+ * @param[in] offset how many records to skip before writing the data into the list
+ */
+template <typename IdxT>
+void pack_list_data(raft::device_resources const& res,
+                    index<IdxT>* index,
+                    device_matrix_view<const uint8_t, uint32_t, row_major> codes,
+                    uint32_t label,
+                    uint32_t offset)
+{
+  ivf_pq::detail::pack_list_data(res, index, codes, label, offset);
+}
+
 /**
  * @brief Unpack `n_take` consecutive records of a single list (cluster) in the compressed index
  * starting at given `offset`, one code per byte (independently of pq_bits).
diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh
index 101509a878..37534c0cc9 100644
--- a/cpp/test/neighbors/ann_ivf_pq.cuh
+++ b/cpp/test/neighbors/ann_ivf_pq.cuh
@@ -338,6 +338,17 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
 
     ASSERT_TRUE(devArrMatch(
       old_list->data.data_handle(), new_list->data.data_handle(), n_rows, Compare<uint8_t>{}));
+
+    // Pack a few vectors back to the list.
+    int row_offset = 9;
+    int n_vec      = 3;
+    ASSERT_TRUE(row_offset + n_vec < n_rows);
+    size_t offset      = row_offset * index->pq_dim();
+    auto codes_to_pack = make_device_matrix_view<const uint8_t, uint32_t>(
+      codes.data_handle() + offset, n_vec, index->pq_dim());
+    ivf_pq::helpers::pack_list_data(handle_, index, codes_to_pack, label, row_offset);
+    ASSERT_TRUE(devArrMatch(
+      old_list->data.data_handle(), new_list->data.data_handle(), n_rows, Compare<uint8_t>{}));
   }
 
   template <typename BuildIndex>

From 8e311d9876b375267d70d9abbd23fb0857f2b7b4 Mon Sep 17 00:00:00 2001
From: Tamas Bela Feher <tfeher@nvidia.com>
Date: Sun, 2 Apr 2023 21:39:39 +0200
Subject: [PATCH 23/25] Increase tolarance for vector reconstruction test

---
 cpp/test/neighbors/ann_ivf_pq.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh
index 37534c0cc9..b2528bbd47 100644
--- a/cpp/test/neighbors/ann_ivf_pq.cuh
+++ b/cpp/test/neighbors/ann_ivf_pq.cuh
@@ -139,7 +139,7 @@ void compare_vectors_l2(
     double d = dist(i);
     // The theoretical estimate of the error is hard to come up with,
     // the estimate below is based on experimentation + curse of dimensionality
-    ASSERT_LE(d, eps * std::pow(2.0, compression_ratio))
+    ASSERT_LE(d, 1.2 * eps * std::pow(2.0, compression_ratio))
       << " (label = " << label << ", ix = " << i << ", eps = " << eps << ")";
   }
 }

From 280c040149a32f2248e2938f3dc87cfdf74cb7d2 Mon Sep 17 00:00:00 2001
From: Tamas Bela Feher <tfeher@nvidia.com>
Date: Mon, 3 Apr 2023 12:06:41 +0200
Subject: [PATCH 24/25] Correct number of list elements to compare

---
 cpp/test/neighbors/ann_ivf_pq.cuh | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh
index b2528bbd47..707392efdf 100644
--- a/cpp/test/neighbors/ann_ivf_pq.cuh
+++ b/cpp/test/neighbors/ann_ivf_pq.cuh
@@ -335,9 +335,15 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
     ASSERT_NE(old_list.get(), new_list.get())
       << "The old list should have been shared and retained after ivf_pq index has erased the "
          "corresponding cluster.";
+    auto list_data_size = (n_rows / ivf_pq::kIndexGroupSize) * new_list->data.extent(1) *
+                          new_list->data.extent(2) * new_list->data.extent(3);
 
-    ASSERT_TRUE(devArrMatch(
-      old_list->data.data_handle(), new_list->data.data_handle(), n_rows, Compare<uint8_t>{}));
+    ASSERT_TRUE(old_list->data.size() >= list_data_size);
+    ASSERT_TRUE(new_list->data.size() >= list_data_size);
+    ASSERT_TRUE(devArrMatch(old_list->data.data_handle(),
+                            new_list->data.data_handle(),
+                            list_data_size,
+                            Compare<uint8_t>{}));
 
     // Pack a few vectors back to the list.
     int row_offset = 9;
@@ -347,8 +353,10 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
     auto codes_to_pack = make_device_matrix_view<const uint8_t, uint32_t>(
       codes.data_handle() + offset, n_vec, index->pq_dim());
     ivf_pq::helpers::pack_list_data(handle_, index, codes_to_pack, label, row_offset);
-    ASSERT_TRUE(devArrMatch(
-      old_list->data.data_handle(), new_list->data.data_handle(), n_rows, Compare<uint8_t>{}));
+    ASSERT_TRUE(devArrMatch(old_list->data.data_handle(),
+                            new_list->data.data_handle(),
+                            list_data_size,
+                            Compare<uint8_t>{}));
   }
 
   template <typename BuildIndex>

From a7e54196e97431345be1a676bfa1e1feeee07eda Mon Sep 17 00:00:00 2001
From: Tamas Bela Feher <tfeher@nvidia.com>
Date: Wed, 5 Apr 2023 14:11:04 +0200
Subject: [PATCH 25/25] Add ivf_pq::helpers::codepacker::pack / unpack

---
 cpp/include/raft/neighbors/ivf_pq_helpers.cuh | 78 ++++++++++++++++++-
 cpp/test/neighbors/ann_ivf_pq.cuh             | 16 ++++
 2 files changed, 92 insertions(+), 2 deletions(-)

diff --git a/cpp/include/raft/neighbors/ivf_pq_helpers.cuh b/cpp/include/raft/neighbors/ivf_pq_helpers.cuh
index 4655c60916..398bd545f1 100644
--- a/cpp/include/raft/neighbors/ivf_pq_helpers.cuh
+++ b/cpp/include/raft/neighbors/ivf_pq_helpers.cuh
@@ -23,12 +23,85 @@
 #include <raft/core/device_resources.hpp>
 
 namespace raft::neighbors::ivf_pq::helpers {
-
 /**
  * @defgroup ivf_pq_helpers Helper functions for manipulationg IVF PQ Index
  * @{
  */
 
+namespace codepacker {
+/**
+ * @brief Unpack `n_take` consecutive records of a single list (cluster) in the compressed index
+ * starting at given `offset`.
+ *
+ * Bit compression is removed, which means output will have pq_dim dimensional vectors (one code per
+ * byte, instead of ceildiv(pq_dim * pq_bits, 8) bytes of pq codes).
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   auto list_data = index.lists()[label]->data.view();
+ *   // allocate the buffer for the output
+ *   uint32_t n_take = 4;
+ *   auto codes = raft::make_device_matrix<uint8_t>(res, n_take, index.pq_dim());
+ *   uint32_t offset = 0;
+ *   // unpack n_take elements from the list
+ *   ivf_pq::helpers::codepacker::unpack(res, list_data, index.pq_bits(), offset, codes.view());
+ * @endcode
+ *
+ * @tparam IdxT type of the indices in the source dataset
+ *
+ * @param[in] res raft resource
+ * @param[in] list_data block to read from
+ * @param[in] pq_bits bit length of encoded vector elements
+ * @param[in] offset
+ *   How many records in the list to skip.
+ * @param[out] codes
+ *   the destination buffer [n_take, index.pq_dim()].
+ *   The length `n_take` defines how many records to unpack,
+ *   it must be smaller than the list size.
+ */
+inline void unpack(
+  raft::device_resources const& res,
+  device_mdspan<const uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> list_data,
+  uint32_t pq_bits,
+  uint32_t offset,
+  device_matrix_view<uint8_t, uint32_t, row_major> codes)
+{
+  ivf_pq::detail::unpack_list_data(codes, list_data, offset, pq_bits, res.get_stream());
+}
+
+/**
+ * Write flat PQ codes into an existing list by the given offset.
+ *
+ * NB: no memory allocation happens here; the list must fit the data (offset + n_vec).
+ *
+ * Usage example:
+ * @code{.cpp}
+ *   auto list_data  = index.lists()[label]->data.view();
+ *   // allocate the buffer for the input codes
+ *   auto codes = raft::make_device_matrix<uint8_t>(res, n_vec, index.pq_dim());
+ *   ... prepare n_vecs to pack into the list in codes ...
+ *   // write codes into the list starting from the 42nd position
+ *   ivf_pq::helpers::codepacker::pack(
+ *       res, make_const_mdspan(codes.view()), index.pq_bits(), 42, list_data);
+ * @endcode
+ *
+ * @param[in] res
+ * @param[in] codes flat PQ codes, one code per byte [n_vec, pq_dim]
+ * @param[in] pq_bits bit length of encoded vector elements
+ * @param[in] offset how many records to skip before writing the data into the list
+ * @param[in] list_data block to write into
+ */
+inline void pack(
+  raft::device_resources const& res,
+  device_matrix_view<const uint8_t, uint32_t, row_major> codes,
+  uint32_t pq_bits,
+  uint32_t offset,
+  device_mdspan<uint8_t, list_spec<uint32_t, uint32_t>::list_extents, row_major> list_data)
+{
+  ivf_pq::detail::pack_list_data(list_data, codes, offset, pq_bits, res.get_stream());
+}
+}  // namespace codepacker
+
 /**
  * Write flat PQ codes into an existing list by the given offset.
  *
@@ -41,7 +114,7 @@ namespace raft::neighbors::ivf_pq::helpers {
  *   // We will write into the 137th cluster
  *   uint32_t label = 137;
  *   // allocate the buffer for the input codes
- *   auto codes = raft::make_device_matrix<float>(res, n_vec, index.pq_dim());
+ *   auto codes = raft::make_device_matrix<const uint8_t>(res, n_vec, index.pq_dim());
  *   ... prepare n_vecs to pack into the list in codes ...
  *   // write codes into the list starting from the 42nd position
  *   ivf_pq::helpers::pack_list_data(res, &index, codes_to_pack, label, 42);
@@ -332,4 +405,5 @@ void erase_list(raft::device_resources const& res, index<IdxT>* index, uint32_t
   ivf_pq::detail::erase_list(res, index, label);
 }
 
+/** @} */
 }  // namespace raft::neighbors::ivf_pq::helpers
diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh
index 707392efdf..f90b6fb318 100644
--- a/cpp/test/neighbors/ann_ivf_pq.cuh
+++ b/cpp/test/neighbors/ann_ivf_pq.cuh
@@ -357,6 +357,22 @@ class ivf_pq_test : public ::testing::TestWithParam<ivf_pq_inputs> {
                             new_list->data.data_handle(),
                             list_data_size,
                             Compare<uint8_t>{}));
+
+    // Another test with the API that take list_data directly
+    auto list_data  = index->lists()[label]->data.view();
+    uint32_t n_take = 4;
+    ASSERT_TRUE(row_offset + n_take < n_rows);
+    auto codes2 = raft::make_device_matrix<uint8_t>(handle_, n_take, index->pq_dim());
+    ivf_pq::helpers::codepacker::unpack(
+      handle_, list_data, index->pq_bits(), row_offset, codes2.view());
+
+    // Write it back
+    ivf_pq::helpers::codepacker::pack(
+      handle_, make_const_mdspan(codes2.view()), index->pq_bits(), row_offset, list_data);
+    ASSERT_TRUE(devArrMatch(old_list->data.data_handle(),
+                            new_list->data.data_handle(),
+                            list_data_size,
+                            Compare<uint8_t>{}));
   }
 
   template <typename BuildIndex>