From d4213757a79dbcfe6c7786663d00c3f69fcb1bb5 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 23 Feb 2023 13:52:40 +0100 Subject: [PATCH 01/25] Add a method to reconstruct the compressed index data --- .../raft/neighbors/detail/ivf_pq_build.cuh | 285 +++++++++++++++--- cpp/test/neighbors/ann_ivf_pq.cuh | 67 +++- 2 files changed, 311 insertions(+), 41 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh index bf3014568a..e4ac26f4c0 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh @@ -557,6 +557,177 @@ void train_per_cluster(raft::device_resources const& handle, transpose_pq_centers(handle, index, pq_centers_tmp.data()); } +/** + * Decode a lvl-2 pq-encoded vector in the given list (cluster). + * One vector per thread. + * NB: this function only decodes the PQ (second level) enconding; to get the approximation of the + * original vector, you need to add the cluster centroid and apply the inverse matrix transform to + * the result of this function. + * + * @tparam PqBits + * + * @param[out] out_vector the destination for the decoded vector (one-per-thread). + * @param[in] in_list_data the encoded cluster data. + * @param[in] pq_centers the codebook + * @param[in] codebook_kind + * @param[in] in_ix in-cluster index of the vector to be decoded (one-per-thread). + * @param[in] cluster_ix label/id of the cluster (one-per-thread). + */ +template +__device__ void reconstruct_vector( + device_vector_view out_vector, + device_mdspan::list_extents, row_major> in_list_data, + device_mdspan, row_major> pq_centers, + codebook_gen codebook_kind, + uint32_t in_ix, + uint32_t cluster_ix) +{ + using group_align = Pow2; + const uint32_t group_ix = group_align::div(in_ix); + const uint32_t ingroup_ix = group_align::mod(in_ix); + const uint32_t pq_len = pq_centers.extent(1); + const uint32_t pq_dim = out_vector.extent(0) / pq_len; + + using layout_t = typename decltype(out_vector)::layout_type; + using accessor_t = typename decltype(out_vector)::accessor_type; + auto reinterpreted_vector = mdspan, layout_t, accessor_t>( + out_vector.data_handle(), extent_2d{pq_dim, pq_len}); + + pq_vec_t code_chunk; + bitfield_view_t code_view{reinterpret_cast(&code_chunk)}; + constexpr uint32_t kChunkSize = (sizeof(pq_vec_t) * 8u) / PqBits; + for (uint32_t j = 0, i = 0; j < pq_dim; i++) { + // read the chunk + code_chunk = *reinterpret_cast(&in_list_data(group_ix, i, ingroup_ix, 0)); + // read the codes, one/pq_dim at a time +#pragma unroll + for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) { + uint32_t partition_ix; + switch (codebook_kind) { + case codebook_gen::PER_CLUSTER: { + partition_ix = cluster_ix; + } break; + case codebook_gen::PER_SUBSPACE: { + partition_ix = j; + } break; + default: __builtin_unreachable(); + } + uint8_t code = code_view[k]; + // read a piece of the reconstructed vector + for (uint32_t l = 0; l < pq_len; l++) { + reinterpreted_vector(j, l) = pq_centers(partition_ix, l, code); + } + } + } +} + +template +__launch_bounds__(BlockSize) __global__ void reconstruct_list_data_kernel( + device_matrix_view out_vectors, + device_vector_view data_ptrs, + device_mdspan, row_major> pq_centers, + device_matrix_view centers_rot, + codebook_gen codebook_kind, + uint32_t cluster_ix, + uint32_t n_skip) +{ + const auto out_dim = out_vectors.extent(1); + using layout_t = typename decltype(out_vectors)::layout_type; + using accessor_t = typename decltype(out_vectors)::accessor_type; + + const uint32_t pq_dim = out_dim / pq_centers.extent(1); + auto pq_extents = + list_spec{PqBits, pq_dim, true}.make_list_extents(out_vectors.extent(0) + n_skip + 1); + auto pq_dataset = + make_mdspan(data_ptrs[cluster_ix], pq_extents); + + for (uint32_t ix = threadIdx.x + BlockSize * blockIdx.x; ix < out_vectors.extent(0); + ix += BlockSize) { + auto one_vector = mdspan, layout_t, accessor_t>( + &out_vectors(ix, 0), extent_1d{out_vectors.extent(1)}); + reconstruct_vector( + one_vector, pq_dataset, pq_centers, codebook_kind, ix + n_skip, cluster_ix); + for (uint32_t j = 0; j < out_dim; j++) { + one_vector(j) += centers_rot(cluster_ix, j); + } + } +} + +template +void reconstruct_list_data(raft::device_resources const& res, + const index& index, + device_matrix_view out_vectors, + uint32_t label, + uint32_t n_skip) +{ + auto n_rows = out_vectors.extent(0); + if (n_rows == 0) { return; } + // sic! I'm using the upper bound `list.size` instead of exact `list_sizes(label)` + // to avoid an extra device-host data copy and the stream sync. + RAFT_EXPECTS(n_skip + n_rows <= index.lists()[label]->size.load(), + "n_skip + output size must be not bigger than the cluster size."); + + auto tmp = make_device_mdarray( + res, res.get_workspace_resource(), make_extents(n_rows, index.rot_dim())); + + constexpr uint32_t kBlockSize = 256; + dim3 blocks(div_rounding_up_safe(n_rows, kBlockSize), 1, 1); + dim3 threads(kBlockSize, 1, 1); + auto kernel = [](uint32_t pq_bits) { + switch (pq_bits) { + case 4: return reconstruct_list_data_kernel; + case 5: return reconstruct_list_data_kernel; + case 6: return reconstruct_list_data_kernel; + case 7: return reconstruct_list_data_kernel; + case 8: return reconstruct_list_data_kernel; + default: RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 8]", pq_bits); + } + }(index.pq_bits()); + kernel<<>>(tmp.view(), + index.data_ptrs(), + index.pq_centers(), + index.centers_rot(), + index.codebook_kind(), + label, + n_skip); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + + float* out_float_ptr = nullptr; + rmm::device_uvector out_float_buf(0, res.get_stream(), res.get_workspace_resource()); + if constexpr (std::is_same_v) { + out_float_ptr = out_vectors.data_handle(); + } else { + out_float_buf.resize(size_t{n_rows} * size_t{index.dim()}, res.get_stream()); + out_float_ptr = out_float_buf.data(); + } + // Rotate the results back to the original space + float alpha = 1.0; + float beta = 0.0; + linalg::gemm(res, + false, + false, + index.dim(), + n_rows, + index.rot_dim(), + &alpha, + index.rotation_matrix().data_handle(), + index.dim(), + tmp.data_handle(), + index.rot_dim(), + &beta, + out_float_ptr, + index.dim(), + res.get_stream()); + // Transform the data to the original type, if necessary + if constexpr (!std::is_same_v) { + linalg::map_k(out_vectors.data_handle(), + out_float_buf.size(), + utils::mapping{}, + res.get_stream(), + out_float_ptr); + } +} + /** * Compute the code: find the closest cluster in each pq_dim-subspace. * @@ -625,6 +796,67 @@ __device__ auto compute_pq_code( return code; } +/** + * Compute a PQ code for a single input vector per subwarp and write it into the + * appropriate cluster. + * Subwarp size here is the minimum between WarpSize and the codebook size. + * + * @tparam BlockSize + * @tparam PqBits + * + * @param[out] out_list_data an array of pointers to the database clusers. + * @param[in] in_vector input unencoded data, one-per-subwarp + * @param[in] pq_centers codebook + * @param[in] codebook_kind + * @param[in] out_ix in-cluster output index (where to write the encoded data), one-per-subwarp. + * @param[in] cluster_ix label/id of the cluster to fill, one-per-subwarp. + */ +template +__device__ auto compute_and_write_pq_code( + device_mdspan::list_extents, row_major> out_list_data, + device_vector_view in_vector, + device_mdspan, row_major> pq_centers, + codebook_gen codebook_kind, + uint32_t out_ix, + uint32_t cluster_ix) +{ + constexpr uint32_t kSubWarpSize = std::min(WarpSize, 1u << PqBits); + using subwarp_align = Pow2; + const uint32_t lane_id = subwarp_align::mod(threadIdx.x); + + using group_align = Pow2; + const uint32_t group_ix = group_align::div(out_ix); + const uint32_t ingroup_ix = group_align::mod(out_ix); + const uint32_t pq_len = pq_centers.extent(1); + const uint32_t pq_dim = in_vector.extent(0) / pq_len; + + using layout_t = typename decltype(in_vector)::layout_type; + using accessor_t = typename decltype(in_vector)::accessor_type; + auto reinterpreted_vector = mdspan, layout_t, accessor_t>( + in_vector.data_handle(), extent_2d{pq_dim, pq_len}); + + __shared__ pq_vec_t codes[subwarp_align::div(BlockSize)]; + pq_vec_t& code = codes[subwarp_align::div(threadIdx.x)]; + bitfield_view_t out{reinterpret_cast(&code)}; + constexpr uint32_t kChunkSize = (sizeof(pq_vec_t) * 8u) / PqBits; + for (uint32_t j = 0, i = 0; j < pq_dim; i++) { + // clear the chunk for writing + if (lane_id == 0) { code = pq_vec_t{}; } + // fill-in the values, one/pq_dim at a time +#pragma unroll + for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) { + // find the label + auto l = compute_pq_code( + pq_centers, reinterpreted_vector, codebook_kind, j, cluster_ix); + if (lane_id == 0) { out[k] = l; } + } + // write the chunk into the dataset + if (lane_id == 0) { + *reinterpret_cast(&out_list_data(group_ix, i, ingroup_ix, 0)) = code; + } + } +} + template __launch_bounds__(BlockSize) __global__ void process_and_fill_codes_kernel( device_matrix_view new_vectors, @@ -639,7 +871,7 @@ __launch_bounds__(BlockSize) __global__ void process_and_fill_codes_kernel( constexpr uint32_t kSubWarpSize = std::min(WarpSize, 1u << PqBits); using subwarp_align = Pow2; const uint32_t lane_id = subwarp_align::mod(threadIdx.x); - const IdxT row_ix = subwarp_align::div(IdxT{threadIdx.x} + IdxT{blockDim.x} * IdxT{blockIdx.x}); + const IdxT row_ix = subwarp_align::div(IdxT{threadIdx.x} + IdxT{BlockSize} * IdxT{blockIdx.x}); if (row_ix >= new_vectors.extent(0)) { return; } const uint32_t cluster_ix = new_labels[row_ix]; @@ -647,7 +879,7 @@ __launch_bounds__(BlockSize) __global__ void process_and_fill_codes_kernel( if (lane_id == 0) { out_ix = atomicAdd(&list_sizes(cluster_ix), 1); } out_ix = shfl(out_ix, 0, kSubWarpSize); - // write the label + // write the label (one record per subwarp) auto pq_indices = inds_ptrs(cluster_ix); if (lane_id == 0) { if (std::holds_alternative(src_offset_or_indices)) { @@ -657,40 +889,21 @@ __launch_bounds__(BlockSize) __global__ void process_and_fill_codes_kernel( } } - // write the codes - using group_align = Pow2; - const uint32_t group_ix = group_align::div(out_ix); - const uint32_t ingroup_ix = group_align::mod(out_ix); - const uint32_t pq_len = pq_centers.extent(1); - const uint32_t pq_dim = new_vectors.extent(1) / pq_len; - - auto pq_extents = list_spec{PqBits, pq_dim, true}.make_list_extents(out_ix + 1); - auto pq_extents_vectorized = - make_extents(pq_extents.extent(0), pq_extents.extent(1), pq_extents.extent(2)); - auto pq_dataset = make_mdspan( - reinterpret_cast(data_ptrs[cluster_ix]), pq_extents_vectorized); - - __shared__ pq_vec_t codes[subwarp_align::div(BlockSize)]; - pq_vec_t& code = codes[subwarp_align::div(threadIdx.x)]; - bitfield_view_t out{reinterpret_cast(&code)}; - constexpr uint32_t kChunkSize = (sizeof(pq_vec_t) * 8u) / PqBits; - for (uint32_t j = 0, i = 0; j < pq_dim; i++) { - // clear the chunk for writing - if (lane_id == 0) { code = pq_vec_t{}; } - // fill-in the values, one/pq_dim at a time -#pragma unroll - for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) { - // find the label - using layout_t = typename decltype(new_vectors)::layout_type; - using accessor_t = typename decltype(new_vectors)::accessor_type; - auto one_vector = mdspan, layout_t, accessor_t>( - &new_vectors(row_ix, 0), extent_2d{pq_dim, pq_len}); - auto l = compute_pq_code(pq_centers, one_vector, codebook_kind, j, cluster_ix); - if (lane_id == 0) { out[k] = l; } - } - // write the chunk into the dataset - if (lane_id == 0) { pq_dataset(group_ix, i, ingroup_ix) = code; } - } + // write the codes (one record per subwarp): + // 1. select input row + using layout_t = typename decltype(new_vectors)::layout_type; + using accessor_t = typename decltype(new_vectors)::accessor_type; + const auto in_dim = new_vectors.extent(1); + auto one_vector = + mdspan, layout_t, accessor_t>(&new_vectors(row_ix, 0), in_dim); + // 2. select output cluster + const uint32_t pq_dim = in_dim / pq_centers.extent(1); + auto pq_extents = list_spec{PqBits, pq_dim, true}.make_list_extents(out_ix + 1); + auto pq_dataset = + make_mdspan(data_ptrs[cluster_ix], pq_extents); + // 3. compute and write the vector + compute_and_write_pq_code( + pq_dataset, one_vector, pq_centers, codebook_kind, out_ix, cluster_ix); } /** diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh index df295b8bcb..6ecdd8674c 100644 --- a/cpp/test/neighbors/ann_ivf_pq.cuh +++ b/cpp/test/neighbors/ann_ivf_pq.cuh @@ -22,6 +22,7 @@ #include #include +#include #include #include #if defined RAFT_DISTANCE_COMPILED @@ -139,7 +140,6 @@ class ivf_pq_test : public ::testing::TestWithParam { { } - protected: void gen_data() { database.resize(size_t{ps.num_db_vecs} * size_t{ps.dim}, stream_); @@ -216,11 +216,70 @@ class ivf_pq_test : public ::testing::TestWithParam { return ivf_pq::detail::deserialize(handle_, "ivf_pq_index"); } + void check_reconstruction(const index& index, + double compression_ratio, + uint32_t label, + uint32_t n_take, + uint32_t n_skip) + { + auto rec_list = index.lists()[label]; + auto dim = index.dim(); + n_take = std::min(n_take, rec_list->size.load()); + n_skip = std::min(n_skip, rec_list->size.load() - n_take); + + if (n_take == 0) { return; } + + auto rec_data = make_device_matrix(handle_, n_take, dim); + auto orig_data = make_device_matrix(handle_, n_take, dim); + + rmm::mr::managed_memory_resource managed_memory; + auto dist = + make_device_mdarray(handle_, &managed_memory, make_extents(n_take)); + + ivf_pq::detail::reconstruct_list_data(handle_, index, rec_data.view(), label, n_skip); + + matrix::gather(database.data(), + IdxT{dim}, + IdxT{n_take}, + rec_list->indices.data_handle() + n_skip, + IdxT{n_take}, + orig_data.data_handle(), + stream_); + + auto rec_data_view = rec_data.view(); + auto orig_data_view = orig_data.view(); + linalg::map_offset( + handle_, dist.view(), [rec_data_view, orig_data_view, dim] __device__(uint32_t i) { + spatial::knn::detail::utils::mapping f{}; + double d = 0.0f; + for (uint32_t j = 0; j < dim; j++) { + double t = f(rec_data_view(i, j)) - f(orig_data_view(i, j)); + d += t * t; + } + return sqrt(d / double(dim)); + }); + handle_.sync_stream(); + for (uint32_t i = 0; i < n_take; i++) { + double d = dist(i); + // The theoretical estimate of the error is hard to come up with, + // the estimate below is based on experimentation + curse of dimensionality + ASSERT_LE(d, 0.04 * std::pow(2.0, compression_ratio)) + << " (label = " << label << ", ix = " << (n_skip + i) << ")"; + } + } + template void run(BuildIndex build_index) { auto index = build_index(); + double compression_ratio = + static_cast(ps.dim * 8) / static_cast(index.pq_dim() * index.pq_bits()); + + // check a small subset of data in a randomly chosen cluster to see if the data reconstruction + // works well. + check_reconstruction(index, compression_ratio, uint32_t(rand()) % index.n_lists(), 100, 7); + size_t queries_size = ps.num_queries * ps.k; std::vector indices_ivf_pq(queries_size); std::vector distances_ivf_pq(queries_size); @@ -244,11 +303,9 @@ class ivf_pq_test : public ::testing::TestWithParam { // A very conservative lower bound on recall double min_recall = static_cast(ps.search_params.n_probes) / static_cast(ps.index_params.n_lists); - double low_precision_factor = - static_cast(ps.dim * 8) / static_cast(index.pq_dim() * index.pq_bits()); // Using a heuristic to lower the required recall due to code-packing errors min_recall = - std::min(std::erfc(0.05 * low_precision_factor / std::max(min_recall, 0.5)), min_recall); + std::min(std::erfc(0.05 * compression_ratio / std::max(min_recall, 0.5)), min_recall); // Use explicit per-test min recall value if provided. min_recall = ps.min_recall.value_or(min_recall); @@ -258,7 +315,7 @@ class ivf_pq_test : public ::testing::TestWithParam { distances_ivf_pq, ps.num_queries, ps.k, - 0.0001 * low_precision_factor, + 0.0001 * compression_ratio, min_recall)) << ps; From c7b557419988155214b9cbd41fb854e0a2748821 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 23 Feb 2023 13:59:13 +0100 Subject: [PATCH 02/25] Fix a typo in the docs --- cpp/include/raft/neighbors/detail/ivf_pq_build.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh index e4ac26f4c0..90e6cbf668 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh @@ -560,7 +560,7 @@ void train_per_cluster(raft::device_resources const& handle, /** * Decode a lvl-2 pq-encoded vector in the given list (cluster). * One vector per thread. - * NB: this function only decodes the PQ (second level) enconding; to get the approximation of the + * NB: this function only decodes the PQ (second level) encoding; to get the approximation of the * original vector, you need to add the cluster centroid and apply the inverse matrix transform to * the result of this function. * From 087919f0b020982dbf9491063f72f7ed9e582909 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 23 Feb 2023 15:05:44 +0100 Subject: [PATCH 03/25] Relax the constraints a bit --- cpp/test/neighbors/ann_ivf_pq.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh index 6ecdd8674c..d624c4a00a 100644 --- a/cpp/test/neighbors/ann_ivf_pq.cuh +++ b/cpp/test/neighbors/ann_ivf_pq.cuh @@ -263,7 +263,7 @@ class ivf_pq_test : public ::testing::TestWithParam { double d = dist(i); // The theoretical estimate of the error is hard to come up with, // the estimate below is based on experimentation + curse of dimensionality - ASSERT_LE(d, 0.04 * std::pow(2.0, compression_ratio)) + ASSERT_LE(d, 0.05 * std::pow(2.0, compression_ratio)) << " (label = " << label << ", ix = " << (n_skip + i) << ")"; } } From b5e984486f96210b27a53fadbd752e91d8a0ac1b Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 23 Feb 2023 16:08:30 +0100 Subject: [PATCH 04/25] Add the public interface --- .../raft/neighbors/detail/ivf_pq_build.cuh | 1 + cpp/include/raft/neighbors/ivf_pq.cuh | 42 +++++++++++++++++++ cpp/test/neighbors/ann_ivf_pq.cuh | 2 +- 3 files changed, 44 insertions(+), 1 deletion(-) diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh index 90e6cbf668..96e93370e8 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh @@ -653,6 +653,7 @@ __launch_bounds__(BlockSize) __global__ void reconstruct_list_data_kernel( } } +/** Decode the list data; see the public interface for the api and usage. */ template void reconstruct_list_data(raft::device_resources const& res, const index& index, diff --git a/cpp/include/raft/neighbors/ivf_pq.cuh b/cpp/include/raft/neighbors/ivf_pq.cuh index e2cc3c4728..9c85bc333f 100644 --- a/cpp/include/raft/neighbors/ivf_pq.cuh +++ b/cpp/include/raft/neighbors/ivf_pq.cuh @@ -75,6 +75,48 @@ auto build(raft::device_resources const& handle, return detail::build(handle, params, dataset, n_rows, dim); } +/** + * @brief Decode `n_take` consecutive records of a single list (cluster) in the compressed index + * starting at given offset `n_skip`. + * + * Usage example: + * @code{.cpp} + * // We will reconstruct the fourth cluster + * uint32_t label = 3; + * // Get the list size + * uint32_t list_size = 0; + * raft::copy(&list_size, index.list_sizes().data_handle() + label, 1, res.get_stream()); + * res.sync_stream(); + * // allocate the buffer for the output + * auto decoded_vectors = raft::make_device_matrix(res, list_size, index.dim()); + * // decode the whole list + * ivf_pq::reconstruct_list_data(res, index, decoded_vectors.view(), label, 0); + * @endcode + * + * @tparam T data element type + * @tparam IdxT type of the indices in the source dataset + * + * @param[in] res + * @param[in] index + * @param[out] out_vectors + * the destination buffer [n_take, index.dim()]. + * The length `n_take` defines how many records to reconstruct, + * it must be smaller than the list size. + * @param[in] label + * The id of the list (cluster) to decode. + * @param[in] n_skip + * How many records in the list to skip. + */ +template +void reconstruct_list_data(raft::device_resources const& res, + const index& index, + device_matrix_view out_vectors, + uint32_t label, + uint32_t n_skip) +{ + return detail::reconstruct_list_data(res, index, out_vectors, label, n_skip); +} + /** * @brief Build a new index containing the data of the original plus new extra vectors. * diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh index d624c4a00a..81eb266906 100644 --- a/cpp/test/neighbors/ann_ivf_pq.cuh +++ b/cpp/test/neighbors/ann_ivf_pq.cuh @@ -236,7 +236,7 @@ class ivf_pq_test : public ::testing::TestWithParam { auto dist = make_device_mdarray(handle_, &managed_memory, make_extents(n_take)); - ivf_pq::detail::reconstruct_list_data(handle_, index, rec_data.view(), label, n_skip); + ivf_pq::reconstruct_list_data(handle_, index, rec_data.view(), label, n_skip); matrix::gather(database.data(), IdxT{dim}, From 2773a8607143c12a76bacc70cab42dca5a4532f1 Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 15 Mar 2023 10:59:48 +0100 Subject: [PATCH 05/25] Fix the merge errors --- cpp/include/raft/neighbors/detail/ivf_pq_build.cuh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh index 4ab008c173..aa56f2baec 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh @@ -576,7 +576,7 @@ void train_per_cluster(raft::device_resources const& handle, template __device__ void reconstruct_vector( device_vector_view out_vector, - device_mdspan::list_extents, row_major> in_list_data, + device_mdspan::list_extents, row_major> in_list_data, device_mdspan, row_major> pq_centers, codebook_gen codebook_kind, uint32_t in_ix, @@ -636,8 +636,8 @@ __launch_bounds__(BlockSize) __global__ void reconstruct_list_data_kernel( using accessor_t = typename decltype(out_vectors)::accessor_type; const uint32_t pq_dim = out_dim / pq_centers.extent(1); - auto pq_extents = - list_spec{PqBits, pq_dim, true}.make_list_extents(out_vectors.extent(0) + n_skip + 1); + auto pq_extents = list_spec{PqBits, pq_dim, true}.make_list_extents( + out_vectors.extent(0) + n_skip + 1); auto pq_dataset = make_mdspan(data_ptrs[cluster_ix], pq_extents); @@ -814,7 +814,7 @@ __device__ auto compute_pq_code( */ template __device__ auto compute_and_write_pq_code( - device_mdspan::list_extents, row_major> out_list_data, + device_mdspan::list_extents, row_major> out_list_data, device_vector_view in_vector, device_mdspan, row_major> pq_centers, codebook_gen codebook_kind, From 82be40c7183553f0cab568eaaffe9a8d649e9410 Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 15 Mar 2023 11:49:28 +0100 Subject: [PATCH 06/25] Add an option to reconstruct cluster data by in-cluster indices --- .../raft/neighbors/detail/ivf_pq_build.cuh | 37 ++++++++------ cpp/include/raft/neighbors/ivf_pq.cuh | 50 +++++++++++++++++-- 2 files changed, 68 insertions(+), 19 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh index aa56f2baec..4c51423be4 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh @@ -625,19 +625,20 @@ template __launch_bounds__(BlockSize) __global__ void reconstruct_list_data_kernel( device_matrix_view out_vectors, device_vector_view data_ptrs, + device_vector_view list_sizes, device_mdspan, row_major> pq_centers, device_matrix_view centers_rot, codebook_gen codebook_kind, uint32_t cluster_ix, - uint32_t n_skip) + std::variant offset_or_indices) { const auto out_dim = out_vectors.extent(1); using layout_t = typename decltype(out_vectors)::layout_type; using accessor_t = typename decltype(out_vectors)::accessor_type; const uint32_t pq_dim = out_dim / pq_centers.extent(1); - auto pq_extents = list_spec{PqBits, pq_dim, true}.make_list_extents( - out_vectors.extent(0) + n_skip + 1); + auto pq_extents = + list_spec{PqBits, pq_dim, true}.make_list_extents(list_sizes[cluster_ix]); auto pq_dataset = make_mdspan(data_ptrs[cluster_ix], pq_extents); @@ -645,8 +646,11 @@ __launch_bounds__(BlockSize) __global__ void reconstruct_list_data_kernel( ix += BlockSize) { auto one_vector = mdspan, layout_t, accessor_t>( &out_vectors(ix, 0), extent_1d{out_vectors.extent(1)}); + const uint32_t src_ix = std::holds_alternative(offset_or_indices) + ? std::get(offset_or_indices) + ix + : std::get(offset_or_indices)[ix]; reconstruct_vector( - one_vector, pq_dataset, pq_centers, codebook_kind, ix + n_skip, cluster_ix); + one_vector, pq_dataset, pq_centers, codebook_kind, src_ix, cluster_ix); for (uint32_t j = 0; j < out_dim; j++) { one_vector(j) += centers_rot(cluster_ix, j); } @@ -659,14 +663,17 @@ void reconstruct_list_data(raft::device_resources const& res, const index& index, device_matrix_view out_vectors, uint32_t label, - uint32_t n_skip) + std::variant offset_or_indices) { auto n_rows = out_vectors.extent(0); if (n_rows == 0) { return; } - // sic! I'm using the upper bound `list.size` instead of exact `list_sizes(label)` - // to avoid an extra device-host data copy and the stream sync. - RAFT_EXPECTS(n_skip + n_rows <= index.lists()[label]->size.load(), - "n_skip + output size must be not bigger than the cluster size."); + if (std::holds_alternative(offset_or_indices)) { + auto n_skip = std::get(offset_or_indices); + // sic! I'm using the upper bound `list.size` instead of exact `list_sizes(label)` + // to avoid an extra device-host data copy and the stream sync. + RAFT_EXPECTS(n_skip + n_rows <= index.lists()[label]->size.load(), + "offset + output size must be not bigger than the cluster size."); + } auto tmp = make_device_mdarray( res, res.get_workspace_resource(), make_extents(n_rows, index.rot_dim())); @@ -686,11 +693,12 @@ void reconstruct_list_data(raft::device_resources const& res, }(index.pq_bits()); kernel<<>>(tmp.view(), index.data_ptrs(), + index.list_sizes(), index.pq_centers(), index.centers_rot(), index.codebook_kind(), label, - n_skip); + offset_or_indices); RAFT_CUDA_TRY(cudaPeekAtLastError()); float* out_float_ptr = nullptr; @@ -721,11 +729,10 @@ void reconstruct_list_data(raft::device_resources const& res, res.get_stream()); // Transform the data to the original type, if necessary if constexpr (!std::is_same_v) { - linalg::map_k(out_vectors.data_handle(), - out_float_buf.size(), - utils::mapping{}, - res.get_stream(), - out_float_ptr); + linalg::map(res, + out_vectors, + utils::mapping{}, + make_device_matrix_view(out_float_ptr, n_rows, index.dim())); } } diff --git a/cpp/include/raft/neighbors/ivf_pq.cuh b/cpp/include/raft/neighbors/ivf_pq.cuh index 1739a9450a..05e09f063b 100644 --- a/cpp/include/raft/neighbors/ivf_pq.cuh +++ b/cpp/include/raft/neighbors/ivf_pq.cuh @@ -213,7 +213,7 @@ auto build(raft::device_resources const& handle, /** * @brief Decode `n_take` consecutive records of a single list (cluster) in the compressed index - * starting at given offset `n_skip`. + * starting at given `offset`. * * Usage example: * @code{.cpp} @@ -240,7 +240,7 @@ auto build(raft::device_resources const& handle, * it must be smaller than the list size. * @param[in] label * The id of the list (cluster) to decode. - * @param[in] n_skip + * @param[in] offset * How many records in the list to skip. */ template @@ -248,9 +248,51 @@ void reconstruct_list_data(raft::device_resources const& res, const index& index, device_matrix_view out_vectors, uint32_t label, - uint32_t n_skip) + uint32_t offset) +{ + return detail::reconstruct_list_data(res, index, out_vectors, label, offset); +} + +/** + * @brief Decode `n_take` consecutive records of a single list (cluster) in the compressed index + * starting at given offset `n_skip`. + * + * Usage example: + * @code{.cpp} + * // We will reconstruct the fourth cluster + * uint32_t label = 3; + * // Get the list size + * uint32_t list_size = 0; + * raft::copy(&list_size, index.list_sizes().data_handle() + label, 1, res.get_stream()); + * res.sync_stream(); + * // allocate the buffer for the output + * auto decoded_vectors = raft::make_device_matrix(res, list_size, index.dim()); + * // decode the whole list + * ivf_pq::reconstruct_list_data(res, index, decoded_vectors.view(), label, 0); + * @endcode + * + * @tparam T data element type + * @tparam IdxT type of the indices in the source dataset + * + * @param[in] res + * @param[in] index + * @param[in] in_cluster_indices + * The offsets of the selected indices within the cluster. + * @param[out] out_vectors + * the destination buffer [n_take, index.dim()]. + * The length `n_take` defines how many records to reconstruct, + * it must be smaller than the list size. + * @param[in] label + * The id of the list (cluster) to decode. + */ +template +void reconstruct_list_data(raft::device_resources const& res, + const index& index, + device_vector_view in_cluster_indices, + device_matrix_view out_vectors, + uint32_t label) { - return detail::reconstruct_list_data(res, index, out_vectors, label, n_skip); + return detail::reconstruct_list_data(res, index, out_vectors, label, in_cluster_indices); } /** From 5fc1c53fd85bbcfba9009407ecfdddd2af43c65f Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 15 Mar 2023 11:54:18 +0100 Subject: [PATCH 07/25] Update the docs for the new function --- cpp/include/raft/neighbors/ivf_pq.cuh | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/cpp/include/raft/neighbors/ivf_pq.cuh b/cpp/include/raft/neighbors/ivf_pq.cuh index 05e09f063b..def127e24f 100644 --- a/cpp/include/raft/neighbors/ivf_pq.cuh +++ b/cpp/include/raft/neighbors/ivf_pq.cuh @@ -254,21 +254,23 @@ void reconstruct_list_data(raft::device_resources const& res, } /** - * @brief Decode `n_take` consecutive records of a single list (cluster) in the compressed index - * starting at given offset `n_skip`. + * @brief Decode a series of records of a single list (cluster) in the compressed index + * by their in-list offsets. * * Usage example: * @code{.cpp} * // We will reconstruct the fourth cluster * uint32_t label = 3; - * // Get the list size - * uint32_t list_size = 0; - * raft::copy(&list_size, index.list_sizes().data_handle() + label, 1, res.get_stream()); + * // Create the selection vector + * auto selected_indices = raft::make_device_vector(res, 4); + * ... fill the indices ... * res.sync_stream(); * // allocate the buffer for the output - * auto decoded_vectors = raft::make_device_matrix(res, list_size, index.dim()); + * auto decoded_vectors = raft::make_device_matrix( + * res, selected_indices.size(), index.dim()); * // decode the whole list - * ivf_pq::reconstruct_list_data(res, index, decoded_vectors.view(), label, 0); + * ivf_pq::reconstruct_list_data( + * res, index, selected_indices.view(), decoded_vectors.view(), label); * @endcode * * @tparam T data element type From ddc446604901ff786a2fa17a85365f1221a5e7f6 Mon Sep 17 00:00:00 2001 From: achirkin Date: Wed, 15 Mar 2023 19:25:35 +0100 Subject: [PATCH 08/25] Detach reconstruction logic from the data reading logic --- .../raft/neighbors/detail/ivf_pq_build.cuh | 173 ++++++++++++------ 1 file changed, 114 insertions(+), 59 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh index 4c51423be4..ef282b8b76 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh @@ -557,41 +557,99 @@ void train_per_cluster(raft::device_resources const& handle, transpose_pq_centers(handle, index, pq_centers_tmp.data()); } +struct reconstruct_vectors { + codebook_gen codebook_kind; + uint32_t cluster_ix; + uint32_t pq_len; + device_mdspan, row_major> pq_centers; + device_mdspan, row_major> centers_rot; + device_mdspan, row_major> out_vectors; + + /** + * Create the functor to be passed to `run_on_list`. + * + * @param[out] out_vectors the destination for the decoded vectors. + * @param[in] pq_centers the codebook + * @param[in] centers_rot + * @param[in] codebook_kind + * @param[in] cluster_ix label/id of the cluster. + */ + __device__ inline reconstruct_vectors( + device_matrix_view out_vectors, + device_mdspan, row_major> pq_centers, + device_matrix_view centers_rot, + codebook_gen codebook_kind, + uint32_t cluster_ix) + : codebook_kind{codebook_kind}, + cluster_ix{cluster_ix}, + pq_len{pq_centers.extent(1)}, + pq_centers{pq_centers}, + centers_rot{reinterpret_vectors(centers_rot, pq_centers)}, + out_vectors{reinterpret_vectors(out_vectors, pq_centers)} + { + } + + /** + * Decode j-th component of the i-th vector by its code and write it into a chunk of the output + * vectors (pq_len elements). + */ + __device__ inline void operator()(uint8_t code, uint32_t i, uint32_t j) + { + uint32_t partition_ix; + switch (codebook_kind) { + case codebook_gen::PER_CLUSTER: { + partition_ix = cluster_ix; + } break; + case codebook_gen::PER_SUBSPACE: { + partition_ix = j; + } break; + default: __builtin_unreachable(); + } + for (uint32_t k = 0; k < pq_len; k++) { + out_vectors(i, j, k) = pq_centers(partition_ix, k, code) + centers_rot(cluster_ix, j, k); + } + } + + private: + template + static __device__ auto reinterpret_vectors( + device_matrix_view out_vectors, + device_mdspan, row_major> pq_centers) + -> device_mdspan, row_major> + { + const uint32_t pq_len = pq_centers.extent(1); + const uint32_t pq_dim = out_vectors.extent(1) / pq_len; + using layout_t = typename decltype(out_vectors)::layout_type; + using accessor_t = typename decltype(out_vectors)::accessor_type; + return mdspan, layout_t, accessor_t>( + out_vectors.data_handle(), extent_3d{out_vectors.extent(0), pq_dim, pq_len}); + } +}; + /** - * Decode a lvl-2 pq-encoded vector in the given list (cluster). - * One vector per thread. - * NB: this function only decodes the PQ (second level) encoding; to get the approximation of the - * original vector, you need to add the cluster centroid and apply the inverse matrix transform to - * the result of this function. + * Process a single vector in a list. * * @tparam PqBits + * @tparam Action tells how to process a single vectors (e.g. reconstruct or just unpack) * - * @param[out] out_vector the destination for the decoded vector (one-per-thread). * @param[in] in_list_data the encoded cluster data. - * @param[in] pq_centers the codebook - * @param[in] codebook_kind * @param[in] in_ix in-cluster index of the vector to be decoded (one-per-thread). - * @param[in] cluster_ix label/id of the cluster (one-per-thread). + * @param[in] out_ix the output index passed to the action + * @param[in] pq_dim + * @param action a callable action to be invoked on each PQ code (component of the encoding) + * type: void (uint8_t code, uint32_t out_ix, uint32_t j), where j = [0..pq_dim). */ -template -__device__ void reconstruct_vector( - device_vector_view out_vector, +template +__device__ void run_on_vector( device_mdspan::list_extents, row_major> in_list_data, - device_mdspan, row_major> pq_centers, - codebook_gen codebook_kind, uint32_t in_ix, - uint32_t cluster_ix) + uint32_t out_ix, + uint32_t pq_dim, + Action action) { using group_align = Pow2; const uint32_t group_ix = group_align::div(in_ix); const uint32_t ingroup_ix = group_align::mod(in_ix); - const uint32_t pq_len = pq_centers.extent(1); - const uint32_t pq_dim = out_vector.extent(0) / pq_len; - - using layout_t = typename decltype(out_vector)::layout_type; - using accessor_t = typename decltype(out_vector)::accessor_type; - auto reinterpreted_vector = mdspan, layout_t, accessor_t>( - out_vector.data_handle(), extent_2d{pq_dim, pq_len}); pq_vec_t code_chunk; bitfield_view_t code_view{reinterpret_cast(&code_chunk)}; @@ -602,25 +660,35 @@ __device__ void reconstruct_vector( // read the codes, one/pq_dim at a time #pragma unroll for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) { - uint32_t partition_ix; - switch (codebook_kind) { - case codebook_gen::PER_CLUSTER: { - partition_ix = cluster_ix; - } break; - case codebook_gen::PER_SUBSPACE: { - partition_ix = j; - } break; - default: __builtin_unreachable(); - } - uint8_t code = code_view[k]; // read a piece of the reconstructed vector - for (uint32_t l = 0; l < pq_len; l++) { - reinterpreted_vector(j, l) = pq_centers(partition_ix, l, code); - } + action(code_view[k], out_ix, j); } } } +/** Process the given indices or a block of a single list (cluster). */ +template +__device__ void run_on_list(device_vector_view data_ptrs, + device_vector_view list_sizes, + std::variant offset_or_indices, + uint32_t len, + uint32_t cluster_ix, + uint32_t pq_dim, + Action action) +{ + auto pq_extents = + list_spec{PqBits, pq_dim, true}.make_list_extents(list_sizes[cluster_ix]); + auto pq_dataset = + make_mdspan(data_ptrs[cluster_ix], pq_extents); + + for (uint32_t ix = threadIdx.x + blockDim.x * blockIdx.x; ix < len; ix += blockDim.x) { + const uint32_t src_ix = std::holds_alternative(offset_or_indices) + ? std::get(offset_or_indices) + ix + : std::get(offset_or_indices)[ix]; + run_on_vector(pq_dataset, src_ix, ix, pq_dim, action); + } +} + template __launch_bounds__(BlockSize) __global__ void reconstruct_list_data_kernel( device_matrix_view out_vectors, @@ -632,29 +700,16 @@ __launch_bounds__(BlockSize) __global__ void reconstruct_list_data_kernel( uint32_t cluster_ix, std::variant offset_or_indices) { - const auto out_dim = out_vectors.extent(1); - using layout_t = typename decltype(out_vectors)::layout_type; - using accessor_t = typename decltype(out_vectors)::accessor_type; - - const uint32_t pq_dim = out_dim / pq_centers.extent(1); - auto pq_extents = - list_spec{PqBits, pq_dim, true}.make_list_extents(list_sizes[cluster_ix]); - auto pq_dataset = - make_mdspan(data_ptrs[cluster_ix], pq_extents); - - for (uint32_t ix = threadIdx.x + BlockSize * blockIdx.x; ix < out_vectors.extent(0); - ix += BlockSize) { - auto one_vector = mdspan, layout_t, accessor_t>( - &out_vectors(ix, 0), extent_1d{out_vectors.extent(1)}); - const uint32_t src_ix = std::holds_alternative(offset_or_indices) - ? std::get(offset_or_indices) + ix - : std::get(offset_or_indices)[ix]; - reconstruct_vector( - one_vector, pq_dataset, pq_centers, codebook_kind, src_ix, cluster_ix); - for (uint32_t j = 0; j < out_dim; j++) { - one_vector(j) += centers_rot(cluster_ix, j); - } - } + const uint32_t pq_dim = out_vectors.extent(1) / pq_centers.extent(1); + auto reconstruct_action = + reconstruct_vectors{out_vectors, pq_centers, centers_rot, codebook_kind, cluster_ix}; + run_on_list(data_ptrs, + list_sizes, + offset_or_indices, + out_vectors.extent(0), + cluster_ix, + pq_dim, + reconstruct_action); } /** Decode the list data; see the public interface for the api and usage. */ From 3a2c62235861e1e6aa7a1c64c4fa9e1e4036ae99 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 16 Mar 2023 07:23:46 +0100 Subject: [PATCH 09/25] Implement unpack_list_data --- .../raft/neighbors/detail/ivf_pq_build.cuh | 209 ++++++++++++------ 1 file changed, 145 insertions(+), 64 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh index ef282b8b76..c19232a51d 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh @@ -557,6 +557,150 @@ void train_per_cluster(raft::device_resources const& handle, transpose_pq_centers(handle, index, pq_centers_tmp.data()); } +/** + * Process a single vector in a list. + * + * @tparam PqBits + * @tparam Action tells how to process a single vectors (e.g. reconstruct or just unpack) + * + * @param[in] in_list_data the encoded cluster data. + * @param[in] in_ix in-cluster index of the vector to be decoded (one-per-thread). + * @param[in] out_ix the output index passed to the action + * @param[in] pq_dim + * @param action a callable action to be invoked on each PQ code (component of the encoding) + * type: void (uint8_t code, uint32_t out_ix, uint32_t j), where j = [0..pq_dim). + */ +template +__device__ void run_on_vector( + device_mdspan::list_extents, row_major> in_list_data, + uint32_t in_ix, + uint32_t out_ix, + uint32_t pq_dim, + Action action) +{ + using group_align = Pow2; + const uint32_t group_ix = group_align::div(in_ix); + const uint32_t ingroup_ix = group_align::mod(in_ix); + + pq_vec_t code_chunk; + bitfield_view_t code_view{reinterpret_cast(&code_chunk)}; + constexpr uint32_t kChunkSize = (sizeof(pq_vec_t) * 8u) / PqBits; + for (uint32_t j = 0, i = 0; j < pq_dim; i++) { + // read the chunk + code_chunk = *reinterpret_cast(&in_list_data(group_ix, i, ingroup_ix, 0)); + // read the codes, one/pq_dim at a time +#pragma unroll + for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) { + // read a piece of the reconstructed vector + action(code_view[k], out_ix, j); + } + } +} + +/** Process the given indices or a block of a single list (cluster). */ +template +__device__ void run_on_list(device_vector_view data_ptrs, + device_vector_view list_sizes, + std::variant offset_or_indices, + uint32_t len, + uint32_t cluster_ix, + uint32_t pq_dim, + Action action) +{ + auto pq_extents = + list_spec{PqBits, pq_dim, true}.make_list_extents(list_sizes[cluster_ix]); + auto pq_dataset = + make_mdspan(data_ptrs[cluster_ix], pq_extents); + + for (uint32_t ix = threadIdx.x + blockDim.x * blockIdx.x; ix < len; ix += blockDim.x) { + const uint32_t src_ix = std::holds_alternative(offset_or_indices) + ? std::get(offset_or_indices) + ix + : std::get(offset_or_indices)[ix]; + run_on_vector(pq_dataset, src_ix, ix, pq_dim, action); + } +} + +/** + * A consumer for the `run_on_list` and `run_on_vec` that just flattens PQ codes + * one-per-byte. That is, independent of the code width (pq_bits), one code uses + * the whole byte, hence one vectors uses pq_dim bytes. + */ +struct unpack_codes { + device_matrix_view out_codes; + + /** + * Create a callable to be passed to `run_on_list`. + * + * @param[out] out_codes the destination for the read codes. + */ + __device__ inline unpack_codes(device_matrix_view out_codes) + : out_codes{out_codes} + { + } + + /** Write j-th component (code) of the i-th vector into the output array. */ + __device__ inline void operator()(uint8_t code, uint32_t i, uint32_t j) + { + out_codes(i, j) = code; + } +}; + +template +__launch_bounds__(BlockSize) __global__ void unpack_list_data_kernel( + device_matrix_view out_codes, + device_vector_view data_ptrs, + device_vector_view list_sizes, + uint32_t cluster_ix, + std::variant offset_or_indices) +{ + const uint32_t pq_dim = out_codes.extent(1); + auto unpack_action = unpack_codes{out_codes}; + run_on_list(data_ptrs, + list_sizes, + offset_or_indices, + out_codes.extent(0), + cluster_ix, + pq_dim, + unpack_action); +} + +/** Decode the list data; see the public interface for the api and usage. */ +template +void unpack_list_data(raft::device_resources const& res, + const index& index, + device_matrix_view out_codes, + uint32_t label, + std::variant offset_or_indices) +{ + auto n_rows = out_codes.extent(0); + if (n_rows == 0) { return; } + if (std::holds_alternative(offset_or_indices)) { + auto n_skip = std::get(offset_or_indices); + // sic! I'm using the upper bound `list.size` instead of exact `list_sizes(label)` + // to avoid an extra device-host data copy and the stream sync. + RAFT_EXPECTS(n_skip + n_rows <= index.lists()[label]->size.load(), + "offset + output size must be not bigger than the cluster size."); + } + + constexpr uint32_t kBlockSize = 256; + dim3 blocks(div_rounding_up_safe(n_rows, kBlockSize), 1, 1); + dim3 threads(kBlockSize, 1, 1); + auto kernel = [](uint32_t pq_bits) { + switch (pq_bits) { + case 4: return unpack_list_data_kernel; + case 5: return unpack_list_data_kernel; + case 6: return unpack_list_data_kernel; + case 7: return unpack_list_data_kernel; + case 8: return unpack_list_data_kernel; + default: RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 8]", pq_bits); + } + }(index.pq_bits()); + kernel<<>>( + out_codes, index.data_ptrs(), index.list_sizes(), label, offset_or_indices); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +/** A consumer for the `run_on_list` and `run_on_vec` that approximates the original input data. */ struct reconstruct_vectors { codebook_gen codebook_kind; uint32_t cluster_ix; @@ -566,7 +710,7 @@ struct reconstruct_vectors { device_mdspan, row_major> out_vectors; /** - * Create the functor to be passed to `run_on_list`. + * Create a callable to be passed to `run_on_list`. * * @param[out] out_vectors the destination for the decoded vectors. * @param[in] pq_centers the codebook @@ -626,69 +770,6 @@ struct reconstruct_vectors { } }; -/** - * Process a single vector in a list. - * - * @tparam PqBits - * @tparam Action tells how to process a single vectors (e.g. reconstruct or just unpack) - * - * @param[in] in_list_data the encoded cluster data. - * @param[in] in_ix in-cluster index of the vector to be decoded (one-per-thread). - * @param[in] out_ix the output index passed to the action - * @param[in] pq_dim - * @param action a callable action to be invoked on each PQ code (component of the encoding) - * type: void (uint8_t code, uint32_t out_ix, uint32_t j), where j = [0..pq_dim). - */ -template -__device__ void run_on_vector( - device_mdspan::list_extents, row_major> in_list_data, - uint32_t in_ix, - uint32_t out_ix, - uint32_t pq_dim, - Action action) -{ - using group_align = Pow2; - const uint32_t group_ix = group_align::div(in_ix); - const uint32_t ingroup_ix = group_align::mod(in_ix); - - pq_vec_t code_chunk; - bitfield_view_t code_view{reinterpret_cast(&code_chunk)}; - constexpr uint32_t kChunkSize = (sizeof(pq_vec_t) * 8u) / PqBits; - for (uint32_t j = 0, i = 0; j < pq_dim; i++) { - // read the chunk - code_chunk = *reinterpret_cast(&in_list_data(group_ix, i, ingroup_ix, 0)); - // read the codes, one/pq_dim at a time -#pragma unroll - for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) { - // read a piece of the reconstructed vector - action(code_view[k], out_ix, j); - } - } -} - -/** Process the given indices or a block of a single list (cluster). */ -template -__device__ void run_on_list(device_vector_view data_ptrs, - device_vector_view list_sizes, - std::variant offset_or_indices, - uint32_t len, - uint32_t cluster_ix, - uint32_t pq_dim, - Action action) -{ - auto pq_extents = - list_spec{PqBits, pq_dim, true}.make_list_extents(list_sizes[cluster_ix]); - auto pq_dataset = - make_mdspan(data_ptrs[cluster_ix], pq_extents); - - for (uint32_t ix = threadIdx.x + blockDim.x * blockIdx.x; ix < len; ix += blockDim.x) { - const uint32_t src_ix = std::holds_alternative(offset_or_indices) - ? std::get(offset_or_indices) + ix - : std::get(offset_or_indices)[ix]; - run_on_vector(pq_dataset, src_ix, ix, pq_dim, action); - } -} - template __launch_bounds__(BlockSize) __global__ void reconstruct_list_data_kernel( device_matrix_view out_vectors, From e7c35e791831e2c934c18c8c6048ceb835086e03 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 16 Mar 2023 07:27:01 +0100 Subject: [PATCH 10/25] Public interface for unpack_list_data --- cpp/include/raft/neighbors/ivf_pq.cuh | 83 +++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/cpp/include/raft/neighbors/ivf_pq.cuh b/cpp/include/raft/neighbors/ivf_pq.cuh index def127e24f..c420123a01 100644 --- a/cpp/include/raft/neighbors/ivf_pq.cuh +++ b/cpp/include/raft/neighbors/ivf_pq.cuh @@ -211,6 +211,89 @@ auto build(raft::device_resources const& handle, return detail::build(handle, params, dataset, n_rows, dim); } +/** + * @brief Unpack `n_take` consecutive records of a single list (cluster) in the compressed index + * starting at given `offset`, one code per byte (independently of pq_bits). + * + * Usage example: + * @code{.cpp} + * // We will unpack the fourth cluster + * uint32_t label = 3; + * // Get the list size + * uint32_t list_size = 0; + * raft::copy(&list_size, index.list_sizes().data_handle() + label, 1, res.get_stream()); + * res.sync_stream(); + * // allocate the buffer for the output + * auto codes = raft::make_device_matrix(res, list_size, index.pq_dim()); + * // unpack the whole list + * ivf_pq::unpack_list_data(res, index, codes.view(), label, 0); + * @endcode + * + * @tparam IdxT type of the indices in the source dataset + * + * @param[in] res + * @param[in] index + * @param[out] out_codes + * the destination buffer [n_take, index.pq_dim()]. + * The length `n_take` defines how many records to unpack, + * it must be smaller than the list size. + * @param[in] label + * The id of the list (cluster) to decode. + * @param[in] offset + * How many records in the list to skip. + */ +template +void unpack_list_data(raft::device_resources const& res, + const index& index, + device_matrix_view out_codes, + uint32_t label, + uint32_t offset) +{ + return detail::unpack_list_data(res, index, out_codes, label, offset); +} + +/** + * @brief Unpack a series of records of a single list (cluster) in the compressed index + * by their in-list offsets, one code per byte (independently of pq_bits). + * + * Usage example: + * @code{.cpp} + * // We will unpack the fourth cluster + * uint32_t label = 3; + * // Create the selection vector + * auto selected_indices = raft::make_device_vector(res, 4); + * ... fill the indices ... + * res.sync_stream(); + * // allocate the buffer for the output + * auto codes = raft::make_device_matrix(res, selected_indices.size(), index.pq_dim()); + * // decode the whole list + * ivf_pq::unpack_list_data( + * res, index, selected_indices.view(), codes.view(), label); + * @endcode + * + * @tparam IdxT type of the indices in the source dataset + * + * @param[in] res + * @param[in] index + * @param[in] in_cluster_indices + * The offsets of the selected indices within the cluster. + * @param[out] out_codes + * the destination buffer [n_take, index.pq_dim()]. + * The length `n_take` defines how many records to unpack, + * it must be smaller than the list size. + * @param[in] label + * The id of the list (cluster) to decode. + */ +template +void unpack_list_data(raft::device_resources const& res, + const index& index, + device_vector_view in_cluster_indices, + device_matrix_view out_codes, + uint32_t label) +{ + return detail::unpack_list_data(res, index, out_codes, label, in_cluster_indices); +} + /** * @brief Decode `n_take` consecutive records of a single list (cluster) in the compressed index * starting at given `offset`. From 839cd489b68233099b235cea5a69d58a815a43ac Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 16 Mar 2023 07:30:43 +0100 Subject: [PATCH 11/25] Fix (unrelated to the PR) test condition being just a little bit too tight --- cpp/test/neighbors/ann_ivf_pq.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh index b990e364c5..177dd28db9 100644 --- a/cpp/test/neighbors/ann_ivf_pq.cuh +++ b/cpp/test/neighbors/ann_ivf_pq.cuh @@ -271,7 +271,7 @@ class ivf_pq_test : public ::testing::TestWithParam { double d = dist(i); // The theoretical estimate of the error is hard to come up with, // the estimate below is based on experimentation + curse of dimensionality - ASSERT_LE(d, 0.05 * std::pow(2.0, compression_ratio)) + ASSERT_LE(d, 0.055 * std::pow(2.0, compression_ratio)) << " (label = " << label << ", ix = " << (n_skip + i) << ")"; } } From 46d5a84280217cbdb2cd9793328f4c5367939d1a Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 16 Mar 2023 08:11:25 +0100 Subject: [PATCH 12/25] Fix a miswording in the docs --- cpp/include/raft/neighbors/detail/ivf_pq_build.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh index c19232a51d..3132a33f69 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh @@ -664,7 +664,7 @@ __launch_bounds__(BlockSize) __global__ void unpack_list_data_kernel( unpack_action); } -/** Decode the list data; see the public interface for the api and usage. */ +/** Unpack the list data; see the public interface for the api and usage. */ template void unpack_list_data(raft::device_resources const& res, const index& index, From 9fc9818ebd49067aeba73c57f5cf4a633d111059 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 16 Mar 2023 10:50:31 +0100 Subject: [PATCH 13/25] Add public api for extending individual lists --- .../raft/neighbors/detail/ivf_pq_build.cuh | 99 +++++++++++++++++++ cpp/include/raft/neighbors/ivf_pq.cuh | 60 +++++++++++ cpp/test/neighbors/ann_ivf_pq.cuh | 21 ++++ 3 files changed, 180 insertions(+) diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh index 3132a33f69..638c8b4edd 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh @@ -1123,6 +1123,31 @@ void process_and_fill_codes(raft::device_resources const& handle, RAFT_CUDA_TRY(cudaPeekAtLastError()); } +/** + * Write flat PQ codes into an existing list by the given offset. + * + * NB: no memory allocation happens here; the list must fit the data (offset + n_rows). + * + * @tparam IdxT + * + * @param[in] res + * @param[inout] index + * @param[in] codes flat PQ codes, one code per byte [n_rows, index.pq_dim()] + * @param[in] label + * The id of the list (cluster) to decode. + * @param[in] offset how many records in the list to skip. + * + */ +template +void pack_list_data(raft::device_resources const& res, + index* index, + device_matrix_view new_codes, + uint32_t label, + uint32_t offset) +{ + /** TODO: implementation is missing */ +} + /** Update the state of the dependent index members. */ template void recompute_internal_state(const raft::device_resources& res, index& index) @@ -1176,6 +1201,80 @@ void recompute_internal_state(const raft::device_resources& res, index& in } } +/** + * Extend one list of the index in-place, by the list label, skipping the classification and + * encoding steps. + * See the public interface for the api and usage. + */ +template +void extend_list_with_codes(raft::device_resources const& res, + index* index, + device_matrix_view new_codes, + device_vector_view new_indices, + uint32_t label) +{ + uint32_t n_rows = new_indices.extent(0); + uint32_t offset; + // Allocate the lists to fit the new data + copy(&offset, index->list_sizes().data_handle() + label, 1, res.get_stream()); + res.sync_stream(); + uint32_t new_size = offset + n_rows; + copy(index->list_sizes().data_handle() + label, &new_size, 1, res.get_stream()); + auto spec = list_spec{ + index->pq_bits(), index->pq_dim(), index->conservative_memory_allocation()}; + auto& list = index->lists()[label]; + ivf::resize_list(res, list, spec, new_size, offset); + copy(list->indices.data_handle() + offset, new_indices.data_handle(), n_rows, res.get_stream()); + + pack_list_data(res, index, new_codes, label, offset); + + // Update the pointers and the sizes + recompute_internal_state(res, *index); +} + +/** + * Extend one list of the index in-place, by the list label, skipping the classification step. + * See the public interface for the api and usage. + */ +template +void extend_list(raft::device_resources const& res, + index* index, + device_matrix_view new_vectors, + device_vector_view new_indices, + uint32_t label) +{ + uint32_t n_rows = new_indices.extent(0); + uint32_t offset; + // Allocate the lists to fit the new data + copy(&offset, index->list_sizes().data_handle() + label, 1, res.get_stream()); + res.sync_stream(); + uint32_t new_size = offset + n_rows; + copy(index->list_sizes().data_handle() + label, &new_size, 1, res.get_stream()); + auto spec = list_spec{ + index->pq_bits(), index->pq_dim(), index->conservative_memory_allocation()}; + auto& list = index->lists()[label]; + ivf::resize_list(res, list, spec, new_size, offset); + copy(list->indices.data_handle() + offset, new_indices.data_handle(), n_rows, res.get_stream()); + + /** TODO: implementation is missing */ + + // Update the pointers and the sizes + recompute_internal_state(res, *index); +} + +/** + * Remove all data from a single list. + * See the public interface for the api and usage. + */ +template +void erase_list(raft::device_resources const& res, index* index, uint32_t label) +{ + uint32_t zero = 0; + copy(index->list_sizes().data_handle() + label, &zero, 1, res.get_stream()); + index->lists()[label].reset(); + recompute_internal_state(res, *index); +} + /** Copy the state of an index into a new index, but share the list data among the two. */ template auto clone(const raft::device_resources& res, const index& source) -> index diff --git a/cpp/include/raft/neighbors/ivf_pq.cuh b/cpp/include/raft/neighbors/ivf_pq.cuh index c420123a01..cfb7e0a773 100644 --- a/cpp/include/raft/neighbors/ivf_pq.cuh +++ b/cpp/include/raft/neighbors/ivf_pq.cuh @@ -446,6 +446,66 @@ void extend(raft::device_resources const& handle, detail::extend(handle, index, new_vectors, new_indices, n_rows); } +/** + * @brief Extend one list of the index in-place, by the list label, skipping the classification and + * encoding steps. + * + * @tparam IdxT + * + * @param[in] res + * @param[inout] index + * @param[in] new_codes flat PQ codes, one code per byte [n_rows, index.pq_dim()] + * @param[in] new_indices source indices [n_rows] + * @param[in] label the id of the target list (cluster). + */ +template +void extend_list_with_codes(raft::device_resources const& res, + index* index, + device_matrix_view new_codes, + device_vector_view new_indices, + uint32_t label) +{ + detail::extend_list_with_codes(res, index, new_codes, new_indices, label); +} + +/** + * @brief Extend one list of the index in-place, by the list label, skipping the classification + * step. + * + * @tparam T + * @tparam IdxT + * + * @param[in] res + * @param[inout] index + * @param[in] new_vectors data to encode [n_rows, index.dim()] + * @param[in] new_indices source indices [n_rows] + * @param[in] label the id of the target list (cluster). + * + */ +template +void extend_list(raft::device_resources const& res, + index* index, + device_matrix_view new_vectors, + device_vector_view new_indices, + uint32_t label) +{ + detail::extend_list(res, index, new_vectors, new_vectors, label); +} + +/** + * @brief Remove all data from a single list (cluster) in the index. + * + * @tparam IdxT + * @param[in] res + * @param[inout] index + * @param[in] label the id of the target list (cluster). + */ +template +void erase_list(raft::device_resources const& res, index* index, uint32_t label) +{ + detail::erase_list(res, index, label); +} + /** * @brief Search ANN using the constructed index. * diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh index 177dd28db9..14098bbce4 100644 --- a/cpp/test/neighbors/ann_ivf_pq.cuh +++ b/cpp/test/neighbors/ann_ivf_pq.cuh @@ -276,11 +276,32 @@ class ivf_pq_test : public ::testing::TestWithParam { } } + void check_packing(index* index, uint32_t label) + { + auto rec_list = index->lists()[label]; + auto n_rows = rec_list->size.load(); + + if (n_rows == 0) { return; } + + auto codes = make_device_matrix(handle_, n_rows, index->pq_dim()); + auto indices = make_device_vector(handle_, n_rows); + copy(indices.data_handle(), rec_list->indices.data_handle(), n_rows, stream_); + + ivf_pq::unpack_list_data(handle_, *index, codes.view(), label, 0); + ivf_pq::erase_list(handle_, index, label); + // NB: passing the type parameter because const->non-const implicit conversion of the mdspans + // breaks type inference + ivf_pq::extend_list_with_codes(handle_, index, codes.view(), indices.view(), label); + } + template void run(BuildIndex build_index) { auto index = build_index(); + // Dump and re-write codes for one label + check_packing(&index, 0); + double compression_ratio = static_cast(ps.dim * 8) / static_cast(index.pq_dim() * index.pq_bits()); From ac8aa1832a9739d6d34e27664e0a7c44befa876d Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 16 Mar 2023 17:04:15 +0100 Subject: [PATCH 14/25] Implemented pack_list_data --- .../raft/neighbors/detail/ivf_pq_build.cuh | 265 ++++++++++++------ cpp/include/raft/neighbors/ivf_pq.cuh | 4 +- 2 files changed, 185 insertions(+), 84 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh index 638c8b4edd..96c68c50f4 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh @@ -597,31 +597,71 @@ __device__ void run_on_vector( } } -/** Process the given indices or a block of a single list (cluster). */ template -__device__ void run_on_list(device_vector_view data_ptrs, - device_vector_view list_sizes, - std::variant offset_or_indices, - uint32_t len, - uint32_t cluster_ix, - uint32_t pq_dim, - Action action) +__device__ void write_vector( + device_mdspan::list_extents, row_major> out_list_data, + uint32_t out_ix, + uint32_t in_ix, + uint32_t pq_dim, + Action action) { - auto pq_extents = - list_spec{PqBits, pq_dim, true}.make_list_extents(list_sizes[cluster_ix]); - auto pq_dataset = - make_mdspan(data_ptrs[cluster_ix], pq_extents); + using group_align = Pow2; + const uint32_t group_ix = group_align::div(out_ix); + const uint32_t ingroup_ix = group_align::mod(out_ix); + pq_vec_t code_chunk; + bitfield_view_t code_view{reinterpret_cast(&code_chunk)}; + constexpr uint32_t kChunkSize = (sizeof(pq_vec_t) * 8u) / PqBits; + for (uint32_t j = 0, i = 0; j < pq_dim; i++) { + // clear the chunk + code_chunk = pq_vec_t{}; + // write the codes, one/pq_dim at a time +#pragma unroll + for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) { + // write a single code + code_view[k] = action(in_ix, j); + } + // write the chunk to the list + *reinterpret_cast(&out_list_data(group_ix, i, ingroup_ix, 0)) = code_chunk; + } +} + +/** Process the given indices or a block of a single list (cluster). */ +template +__device__ void run_on_list( + device_mdspan::list_extents, row_major> in_list_data, + std::variant offset_or_indices, + uint32_t len, + uint32_t pq_dim, + Action action) +{ for (uint32_t ix = threadIdx.x + blockDim.x * blockIdx.x; ix < len; ix += blockDim.x) { const uint32_t src_ix = std::holds_alternative(offset_or_indices) ? std::get(offset_or_indices) + ix : std::get(offset_or_indices)[ix]; - run_on_vector(pq_dataset, src_ix, ix, pq_dim, action); + run_on_vector(in_list_data, src_ix, ix, pq_dim, action); + } +} + +/** Process the given indices or a block of a single list (cluster). */ +template +__device__ void write_list( + device_mdspan::list_extents, row_major> out_list_data, + std::variant offset_or_indices, + uint32_t len, + uint32_t pq_dim, + Action action) +{ + for (uint32_t ix = threadIdx.x + blockDim.x * blockIdx.x; ix < len; ix += blockDim.x) { + const uint32_t dst_ix = std::holds_alternative(offset_or_indices) + ? std::get(offset_or_indices) + ix + : std::get(offset_or_indices)[ix]; + write_vector(out_list_data, dst_ix, ix, pq_dim, action); } } /** - * A consumer for the `run_on_list` and `run_on_vec` that just flattens PQ codes + * A consumer for the `run_on_list` and `run_on_vector` that just flattens PQ codes * one-per-byte. That is, independent of the code width (pq_bits), one code uses * the whole byte, hence one vectors uses pq_dim bytes. */ @@ -648,44 +688,37 @@ struct unpack_codes { template __launch_bounds__(BlockSize) __global__ void unpack_list_data_kernel( device_matrix_view out_codes, - device_vector_view data_ptrs, - device_vector_view list_sizes, - uint32_t cluster_ix, + device_mdspan::list_extents, row_major> in_list_data, std::variant offset_or_indices) { const uint32_t pq_dim = out_codes.extent(1); auto unpack_action = unpack_codes{out_codes}; - run_on_list(data_ptrs, - list_sizes, - offset_or_indices, - out_codes.extent(0), - cluster_ix, - pq_dim, - unpack_action); + run_on_list(in_list_data, offset_or_indices, out_codes.extent(0), pq_dim, unpack_action); } -/** Unpack the list data; see the public interface for the api and usage. */ -template -void unpack_list_data(raft::device_resources const& res, - const index& index, - device_matrix_view out_codes, - uint32_t label, - std::variant offset_or_indices) +/** + * Unpack flat PQ codes from an existing list by the given offset. + * + * @param[out] codes flat PQ codes, one code per byte [n_rows, pq_dim] + * @param[in] list_data the packed ivf::list data. + * @param[in] offset_or_indices how many records in the list to skip or the exact indices. + * @param[in] pq_bits codebook size (1 << pq_bits) + * @param[in] stream + */ +inline void unpack_list_data( + device_matrix_view codes, + device_mdspan::list_extents, row_major> list_data, + std::variant offset_or_indices, + uint32_t pq_bits, + rmm::cuda_stream_view stream) { - auto n_rows = out_codes.extent(0); + auto n_rows = codes.extent(0); if (n_rows == 0) { return; } - if (std::holds_alternative(offset_or_indices)) { - auto n_skip = std::get(offset_or_indices); - // sic! I'm using the upper bound `list.size` instead of exact `list_sizes(label)` - // to avoid an extra device-host data copy and the stream sync. - RAFT_EXPECTS(n_skip + n_rows <= index.lists()[label]->size.load(), - "offset + output size must be not bigger than the cluster size."); - } constexpr uint32_t kBlockSize = 256; dim3 blocks(div_rounding_up_safe(n_rows, kBlockSize), 1, 1); dim3 threads(kBlockSize, 1, 1); - auto kernel = [](uint32_t pq_bits) { + auto kernel = [pq_bits]() { switch (pq_bits) { case 4: return unpack_list_data_kernel; case 5: return unpack_list_data_kernel; @@ -694,13 +727,28 @@ void unpack_list_data(raft::device_resources const& res, case 8: return unpack_list_data_kernel; default: RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 8]", pq_bits); } - }(index.pq_bits()); - kernel<<>>( - out_codes, index.data_ptrs(), index.list_sizes(), label, offset_or_indices); + }(); + kernel<<>>(codes, list_data, offset_or_indices); RAFT_CUDA_TRY(cudaPeekAtLastError()); } -/** A consumer for the `run_on_list` and `run_on_vec` that approximates the original input data. */ +/** Unpack the list data; see the public interface for the api and usage. */ +template +void unpack_list_data(raft::device_resources const& res, + const index& index, + device_matrix_view out_codes, + uint32_t label, + std::variant offset_or_indices) +{ + unpack_list_data(out_codes, + index.lists()[label]->data.view(), + offset_or_indices, + index.pq_bits(), + res.get_stream()); +} + +/** A consumer for the `run_on_list` and `run_on_vector` that approximates the original input data. + */ struct reconstruct_vectors { codebook_gen codebook_kind; uint32_t cluster_ix; @@ -773,8 +821,7 @@ struct reconstruct_vectors { template __launch_bounds__(BlockSize) __global__ void reconstruct_list_data_kernel( device_matrix_view out_vectors, - device_vector_view data_ptrs, - device_vector_view list_sizes, + device_mdspan::list_extents, row_major> in_list_data, device_mdspan, row_major> pq_centers, device_matrix_view centers_rot, codebook_gen codebook_kind, @@ -784,13 +831,8 @@ __launch_bounds__(BlockSize) __global__ void reconstruct_list_data_kernel( const uint32_t pq_dim = out_vectors.extent(1) / pq_centers.extent(1); auto reconstruct_action = reconstruct_vectors{out_vectors, pq_centers, centers_rot, codebook_kind, cluster_ix}; - run_on_list(data_ptrs, - list_sizes, - offset_or_indices, - out_vectors.extent(0), - cluster_ix, - pq_dim, - reconstruct_action); + run_on_list( + in_list_data, offset_or_indices, out_vectors.extent(0), pq_dim, reconstruct_action); } /** Decode the list data; see the public interface for the api and usage. */ @@ -803,11 +845,12 @@ void reconstruct_list_data(raft::device_resources const& res, { auto n_rows = out_vectors.extent(0); if (n_rows == 0) { return; } + auto& list = index.lists()[label]; if (std::holds_alternative(offset_or_indices)) { auto n_skip = std::get(offset_or_indices); // sic! I'm using the upper bound `list.size` instead of exact `list_sizes(label)` // to avoid an extra device-host data copy and the stream sync. - RAFT_EXPECTS(n_skip + n_rows <= index.lists()[label]->size.load(), + RAFT_EXPECTS(n_skip + n_rows <= list->size.load(), "offset + output size must be not bigger than the cluster size."); } @@ -828,8 +871,7 @@ void reconstruct_list_data(raft::device_resources const& res, } }(index.pq_bits()); kernel<<>>(tmp.view(), - index.data_ptrs(), - index.list_sizes(), + list->data.view(), index.pq_centers(), index.centers_rot(), index.codebook_kind(), @@ -940,6 +982,90 @@ __device__ auto compute_pq_code( return code; } +/** + * A producer for the `write_list` and `write_vector` reads the codes byte-by-byte. That is, + * independent of the code width (pq_bits), one code uses the whole byte, hence one vectors uses + * pq_dim bytes. + */ +struct pass_codes { + device_matrix_view codes; + + /** + * Create a callable to be passed to `run_on_list`. + * + * @param[in] codes the source codes. + */ + __device__ inline pass_codes(device_matrix_view codes) + : codes{codes} + { + } + + /** Read j-th component (code) of the i-th vector from the source. */ + __device__ inline auto operator()(uint32_t i, uint32_t j) const -> uint8_t { return codes(i, j); } +}; + +template +__launch_bounds__(BlockSize) __global__ void pack_list_data_kernel( + device_mdspan::list_extents, row_major> list_data, + device_matrix_view codes, + std::variant offset_or_indices) +{ + write_list( + list_data, offset_or_indices, codes.extent(0), codes.extent(1), pass_codes{codes}); +} + +/** + * Write flat PQ codes into an existing list by the given offset. + * + * NB: no memory allocation happens here; the list must fit the data (offset + n_rows). + * + * @param[out] list_data the packed ivf::list data. + * @param[in] codes flat PQ codes, one code per byte [n_rows, pq_dim] + * @param[in] offset_or_indices how many records in the list to skip or the exact indices. + * @param[in] pq_bits codebook size (1 << pq_bits) + * @param[in] stream + */ +inline void pack_list_data( + device_mdspan::list_extents, row_major> list_data, + device_matrix_view codes, + std::variant offset_or_indices, + uint32_t pq_bits, + rmm::cuda_stream_view stream) +{ + auto n_rows = codes.extent(0); + if (n_rows == 0) { return; } + + constexpr uint32_t kBlockSize = 256; + dim3 blocks(div_rounding_up_safe(n_rows, kBlockSize), 1, 1); + dim3 threads(kBlockSize, 1, 1); + auto kernel = [pq_bits]() { + switch (pq_bits) { + case 4: return pack_list_data_kernel; + case 5: return pack_list_data_kernel; + case 6: return pack_list_data_kernel; + case 7: return pack_list_data_kernel; + case 8: return pack_list_data_kernel; + default: RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 8]", pq_bits); + } + }(); + kernel<<>>(list_data, codes, offset_or_indices); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +template +void pack_list_data(raft::device_resources const& res, + index* index, + device_matrix_view new_codes, + uint32_t label, + std::variant offset_or_indices) +{ + pack_list_data(index->lists()[label]->data.view(), + new_codes, + offset_or_indices, + index->pq_bits(), + res.get_stream()); +} + /** * Compute a PQ code for a single input vector per subwarp and write it into the * appropriate cluster. @@ -1123,31 +1249,6 @@ void process_and_fill_codes(raft::device_resources const& handle, RAFT_CUDA_TRY(cudaPeekAtLastError()); } -/** - * Write flat PQ codes into an existing list by the given offset. - * - * NB: no memory allocation happens here; the list must fit the data (offset + n_rows). - * - * @tparam IdxT - * - * @param[in] res - * @param[inout] index - * @param[in] codes flat PQ codes, one code per byte [n_rows, index.pq_dim()] - * @param[in] label - * The id of the list (cluster) to decode. - * @param[in] offset how many records in the list to skip. - * - */ -template -void pack_list_data(raft::device_resources const& res, - index* index, - device_matrix_view new_codes, - uint32_t label, - uint32_t offset) -{ - /** TODO: implementation is missing */ -} - /** Update the state of the dependent index members. */ template void recompute_internal_state(const raft::device_resources& res, index& index) @@ -1226,7 +1327,7 @@ void extend_list_with_codes(raft::device_resources const& res, ivf::resize_list(res, list, spec, new_size, offset); copy(list->indices.data_handle() + offset, new_indices.data_handle(), n_rows, res.get_stream()); - pack_list_data(res, index, new_codes, label, offset); + pack_list_data(res, index, new_codes, label, offset); // Update the pointers and the sizes recompute_internal_state(res, *index); diff --git a/cpp/include/raft/neighbors/ivf_pq.cuh b/cpp/include/raft/neighbors/ivf_pq.cuh index 5905bd9c0d..eca57bf9d9 100644 --- a/cpp/include/raft/neighbors/ivf_pq.cuh +++ b/cpp/include/raft/neighbors/ivf_pq.cuh @@ -268,7 +268,7 @@ void unpack_list_data(raft::device_resources const& res, uint32_t label, uint32_t offset) { - return detail::unpack_list_data(res, index, out_codes, label, offset); + return detail::unpack_list_data(res, index, out_codes, label, offset); } /** @@ -310,7 +310,7 @@ void unpack_list_data(raft::device_resources const& res, device_matrix_view out_codes, uint32_t label) { - return detail::unpack_list_data(res, index, out_codes, label, in_cluster_indices); + return detail::unpack_list_data(res, index, out_codes, label, in_cluster_indices); } /** From f62273ddcd4d2901a921c8d017d2f83bf586d7d0 Mon Sep 17 00:00:00 2001 From: achirkin Date: Thu, 16 Mar 2023 18:44:02 +0100 Subject: [PATCH 15/25] Reuse write_vector inside the process_and_fill_codes_kernel --- .../raft/neighbors/detail/ivf_pq_build.cuh | 121 +++++++----------- 1 file changed, 48 insertions(+), 73 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh index 96c68c50f4..a0ebf3e3c2 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh @@ -561,7 +561,7 @@ void train_per_cluster(raft::device_resources const& handle, * Process a single vector in a list. * * @tparam PqBits - * @tparam Action tells how to process a single vectors (e.g. reconstruct or just unpack) + * @tparam Action tells how to process a single vector (e.g. reconstruct or just unpack) * * @param[in] in_list_data the encoded cluster data. * @param[in] in_ix in-cluster index of the vector to be decoded (one-per-thread). @@ -597,7 +597,21 @@ __device__ void run_on_vector( } } -template +/** + * Process a single vector in a list. + * + * @tparam PqBits + * @tparam SubWarpSize how many threads work on the same ix (only the first thread writes data). + * @tparam Action tells how to process a single vector (e.g. encode or just pack) + * + * @param[in] out_list_data the encoded cluster data. + * @param[in] out_ix in-cluster index of the vector to be processed (one-per-SubWarpSize threads). + * @param[in] in_ix the input index passed to the action (one-per-SubWarpSize threads). + * @param[in] pq_dim + * @param action a callable action to be invoked on each PQ code (component of the encoding) + * type: (uint32_t in_ix, uint32_t j) -> uint8_t, where j = [0..pq_dim). + */ +template __device__ void write_vector( device_mdspan::list_extents, row_major> out_list_data, uint32_t out_ix, @@ -605,6 +619,8 @@ __device__ void write_vector( uint32_t pq_dim, Action action) { + const uint32_t lane_id = Pow2::mod(threadIdx.x); + using group_align = Pow2; const uint32_t group_ix = group_align::div(out_ix); const uint32_t ingroup_ix = group_align::mod(out_ix); @@ -614,15 +630,18 @@ __device__ void write_vector( constexpr uint32_t kChunkSize = (sizeof(pq_vec_t) * 8u) / PqBits; for (uint32_t j = 0, i = 0; j < pq_dim; i++) { // clear the chunk - code_chunk = pq_vec_t{}; + if (lane_id == 0) { code_chunk = pq_vec_t{}; } // write the codes, one/pq_dim at a time #pragma unroll for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) { // write a single code - code_view[k] = action(in_ix, j); + uint8_t code = action(in_ix, j); + if (lane_id == 0) { code_view[k] = code; } } // write the chunk to the list - *reinterpret_cast(&out_list_data(group_ix, i, ingroup_ix, 0)) = code_chunk; + if (lane_id == 0) { + *reinterpret_cast(&out_list_data(group_ix, i, ingroup_ix, 0)) = code_chunk; + } } } @@ -644,7 +663,7 @@ __device__ void run_on_list( } /** Process the given indices or a block of a single list (cluster). */ -template +template __device__ void write_list( device_mdspan::list_extents, row_major> out_list_data, std::variant offset_or_indices, @@ -652,11 +671,14 @@ __device__ void write_list( uint32_t pq_dim, Action action) { - for (uint32_t ix = threadIdx.x + blockDim.x * blockIdx.x; ix < len; ix += blockDim.x) { + using subwarp_align = Pow2; + uint32_t stride = subwarp_align::div(blockDim.x); + uint32_t ix = subwarp_align::div(threadIdx.x + blockDim.x * blockIdx.x); + for (; ix < len; ix += stride) { const uint32_t dst_ix = std::holds_alternative(offset_or_indices) ? std::get(offset_or_indices) + ix : std::get(offset_or_indices)[ix]; - write_vector(out_list_data, dst_ix, ix, pq_dim, action); + write_vector(out_list_data, dst_ix, ix, pq_dim, action); } } @@ -919,7 +941,7 @@ void reconstruct_list_data(raft::device_resources const& res, * * @tparam SubWarpSize * how many threads work on a single vector; - * bouded by either WarpSize or pq_book_size. + * bounded by either WarpSize or pq_book_size. * * @param pq_centers * - codebook_gen::PER_SUBSPACE: [pq_dim , pq_len, pq_book_size] @@ -1010,7 +1032,7 @@ __launch_bounds__(BlockSize) __global__ void pack_list_data_kernel( device_matrix_view codes, std::variant offset_or_indices) { - write_list( + write_list( list_data, offset_or_indices, codes.extent(0), codes.extent(1), pass_codes{codes}); } @@ -1066,67 +1088,6 @@ void pack_list_data(raft::device_resources const& res, res.get_stream()); } -/** - * Compute a PQ code for a single input vector per subwarp and write it into the - * appropriate cluster. - * Subwarp size here is the minimum between WarpSize and the codebook size. - * - * @tparam BlockSize - * @tparam PqBits - * - * @param[out] out_list_data an array of pointers to the database clusers. - * @param[in] in_vector input unencoded data, one-per-subwarp - * @param[in] pq_centers codebook - * @param[in] codebook_kind - * @param[in] out_ix in-cluster output index (where to write the encoded data), one-per-subwarp. - * @param[in] cluster_ix label/id of the cluster to fill, one-per-subwarp. - */ -template -__device__ auto compute_and_write_pq_code( - device_mdspan::list_extents, row_major> out_list_data, - device_vector_view in_vector, - device_mdspan, row_major> pq_centers, - codebook_gen codebook_kind, - uint32_t out_ix, - uint32_t cluster_ix) -{ - constexpr uint32_t kSubWarpSize = std::min(WarpSize, 1u << PqBits); - using subwarp_align = Pow2; - const uint32_t lane_id = subwarp_align::mod(threadIdx.x); - - using group_align = Pow2; - const uint32_t group_ix = group_align::div(out_ix); - const uint32_t ingroup_ix = group_align::mod(out_ix); - const uint32_t pq_len = pq_centers.extent(1); - const uint32_t pq_dim = in_vector.extent(0) / pq_len; - - using layout_t = typename decltype(in_vector)::layout_type; - using accessor_t = typename decltype(in_vector)::accessor_type; - auto reinterpreted_vector = mdspan, layout_t, accessor_t>( - in_vector.data_handle(), extent_2d{pq_dim, pq_len}); - - __shared__ pq_vec_t codes[subwarp_align::div(BlockSize)]; - pq_vec_t& code = codes[subwarp_align::div(threadIdx.x)]; - bitfield_view_t out{reinterpret_cast(&code)}; - constexpr uint32_t kChunkSize = (sizeof(pq_vec_t) * 8u) / PqBits; - for (uint32_t j = 0, i = 0; j < pq_dim; i++) { - // clear the chunk for writing - if (lane_id == 0) { code = pq_vec_t{}; } - // fill-in the values, one/pq_dim at a time -#pragma unroll - for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) { - // find the label - auto l = compute_pq_code( - pq_centers, reinterpreted_vector, codebook_kind, j, cluster_ix); - if (lane_id == 0) { out[k] = l; } - } - // write the chunk into the dataset - if (lane_id == 0) { - *reinterpret_cast(&out_list_data(group_ix, i, ingroup_ix, 0)) = code; - } - } -} - template __launch_bounds__(BlockSize) __global__ void process_and_fill_codes_kernel( device_matrix_view new_vectors, @@ -1171,9 +1132,23 @@ __launch_bounds__(BlockSize) __global__ void process_and_fill_codes_kernel( auto pq_extents = list_spec{PqBits, pq_dim, true}.make_list_extents(out_ix + 1); auto pq_dataset = make_mdspan(data_ptrs[cluster_ix], pq_extents); + // 3. compute and write the vector - compute_and_write_pq_code( - pq_dataset, one_vector, pq_centers, codebook_kind, out_ix, cluster_ix); + const uint32_t pq_len = pq_centers.extent(1); + auto reinterpreted_vector = mdspan, layout_t, accessor_t>( + one_vector.data_handle(), extent_2d{pq_dim, pq_len}); + + write_vector( + pq_dataset, + out_ix, + 0, + pq_dim, + [pq_centers, reinterpreted_vector, codebook_kind, cluster_ix] __device__( + uint32_t, uint32_t j) -> uint8_t { + // find the label + return compute_pq_code( + pq_centers, reinterpreted_vector, codebook_kind, j, cluster_ix); + }); } /** From 38895bbf44b8c9aa28f39b0485b93a8be4cc02f6 Mon Sep 17 00:00:00 2001 From: achirkin Date: Fri, 17 Mar 2023 11:32:38 +0100 Subject: [PATCH 16/25] Initial implementation of extend_list (failing tests) --- .../raft/neighbors/detail/ivf_pq_build.cuh | 366 +++++++++++------- cpp/include/raft/neighbors/ivf_pq.cuh | 4 +- cpp/test/neighbors/ann_ivf_pq.cuh | 87 ++++- 3 files changed, 295 insertions(+), 162 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh index a0ebf3e3c2..ee602b05af 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh @@ -276,7 +276,7 @@ void flat_compute_residuals( device_matrix_view rotation_matrix, // [rot_dim, dim] device_matrix_view centers, // [n_lists, dim_ext] const T* dataset, // [n_rows, dim] - const uint32_t* labels, // [n_rows] + std::variant labels, // [n_rows] rmm::mr::device_memory_resource* device_memory) { auto stream = handle.get_stream(); @@ -287,7 +287,9 @@ void flat_compute_residuals( linalg::map_offset(handle, tmp_view, [centers, dataset, labels, dim] __device__(size_t i) { auto row_ix = i / dim; auto el_ix = i % dim; - auto label = labels[row_ix]; + auto label = std::holds_alternative(labels) + ? std::get(labels) + : std::get(labels)[row_ix]; return utils::mapping{}(dataset[i]) - centers(label, el_ix); }); @@ -557,6 +559,32 @@ void train_per_cluster(raft::device_resources const& handle, transpose_pq_centers(handle, index, pq_centers_tmp.data()); } +/** + * A helper function: given the dataset in the rotated space + * [n_rows, rot_dim] = [n_rows, pq_dim * pq_len], + * reinterpret the last dimension as two: [n_rows, pq_dim, pq_len] + * + * @tparam T + * @tparam IdxT + * + * @param vectors input data [n_rows, rot_dim] + * @param pq_centers codebook (used to infer the structure - pq_len) + * @return reinterpreted vectors [n_rows, pq_dim, pq_len] + */ +template +static __device__ auto reinterpret_vectors( + device_matrix_view vectors, + device_mdspan, row_major> pq_centers) + -> device_mdspan, row_major> +{ + const uint32_t pq_len = pq_centers.extent(1); + const uint32_t pq_dim = vectors.extent(1) / pq_len; + using layout_t = typename decltype(vectors)::layout_type; + using accessor_t = typename decltype(vectors)::accessor_type; + return mdspan, layout_t, accessor_t>( + vectors.data_handle(), extent_3d{vectors.extent(0), pq_dim, pq_len}); +} + /** * Process a single vector in a list. * @@ -602,6 +630,7 @@ __device__ void run_on_vector( * * @tparam PqBits * @tparam SubWarpSize how many threads work on the same ix (only the first thread writes data). + * @tparam IdxT type of the index passed to the action * @tparam Action tells how to process a single vector (e.g. encode or just pack) * * @param[in] out_list_data the encoded cluster data. @@ -611,11 +640,11 @@ __device__ void run_on_vector( * @param action a callable action to be invoked on each PQ code (component of the encoding) * type: (uint32_t in_ix, uint32_t j) -> uint8_t, where j = [0..pq_dim). */ -template +template __device__ void write_vector( device_mdspan::list_extents, row_major> out_list_data, uint32_t out_ix, - uint32_t in_ix, + IdxT in_ix, uint32_t pq_dim, Action action) { @@ -823,21 +852,6 @@ struct reconstruct_vectors { out_vectors(i, j, k) = pq_centers(partition_ix, k, code) + centers_rot(cluster_ix, j, k); } } - - private: - template - static __device__ auto reinterpret_vectors( - device_matrix_view out_vectors, - device_mdspan, row_major> pq_centers) - -> device_mdspan, row_major> - { - const uint32_t pq_len = pq_centers.extent(1); - const uint32_t pq_dim = out_vectors.extent(1) / pq_len; - using layout_t = typename decltype(out_vectors)::layout_type; - using accessor_t = typename decltype(out_vectors)::accessor_type; - return mdspan, layout_t, accessor_t>( - out_vectors.data_handle(), extent_3d{out_vectors.extent(0), pq_dim, pq_len}); - } }; template @@ -936,74 +950,6 @@ void reconstruct_list_data(raft::device_resources const& res, } } -/** - * Compute the code: find the closest cluster in each pq_dim-subspace. - * - * @tparam SubWarpSize - * how many threads work on a single vector; - * bounded by either WarpSize or pq_book_size. - * - * @param pq_centers - * - codebook_gen::PER_SUBSPACE: [pq_dim , pq_len, pq_book_size] - * - codebook_gen::PER_CLUSTER: [n_lists, pq_len, pq_book_size] - * @param new_vector a single input of length rot_dim, reinterpreted as [pq_dim, pq_len]. - * the input must be already transformed to floats, rotated, and the level 1 cluster - * center must be already substructed (i.e. this is the residual of a single input vector). - * @param codebook_kind - * @param j index along pq_dim "dimension" - * @param cluster_ix is used for PER_CLUSTER codebooks. - */ -template -__device__ auto compute_pq_code( - device_mdspan, row_major> pq_centers, - device_mdspan, row_major> new_vector, - codebook_gen codebook_kind, - uint32_t j, - uint32_t cluster_ix) -> uint8_t -{ - using subwarp_align = Pow2; - uint32_t lane_id = subwarp_align::mod(laneId()); - uint32_t partition_ix; - switch (codebook_kind) { - case codebook_gen::PER_CLUSTER: { - partition_ix = cluster_ix; - } break; - case codebook_gen::PER_SUBSPACE: { - partition_ix = j; - } break; - default: __builtin_unreachable(); - } - - const uint32_t pq_book_size = pq_centers.extent(2); - const uint32_t pq_len = pq_centers.extent(1); - float min_dist = std::numeric_limits::infinity(); - uint8_t code = 0; - // calculate the distance for each PQ cluster, find the minimum for each thread - for (uint32_t i = lane_id; i < pq_book_size; i += subwarp_align::Value) { - // NB: the L2 quantifiers on residuals are always trained on L2 metric. - float d = 0.0f; - for (uint32_t k = 0; k < pq_len; k++) { - auto t = new_vector(j, k) - pq_centers(partition_ix, k, i); - d += t * t; - } - if (d < min_dist) { - min_dist = d; - code = uint8_t(i); - } - } - // reduce among threads -#pragma unroll - for (uint32_t stride = SubWarpSize >> 1; stride > 0; stride >>= 1) { - const auto other_dist = shfl_xor(min_dist, stride, SubWarpSize); - const auto other_code = shfl_xor(code, stride, SubWarpSize); - if (other_dist < min_dist) { - min_dist = other_dist; - code = other_code; - } - } - return code; -} - /** * A producer for the `write_list` and `write_vector` reads the codes byte-by-byte. That is, * independent of the code width (pq_bits), one code uses the whole byte, hence one vectors uses @@ -1088,6 +1034,96 @@ void pack_list_data(raft::device_resources const& res, res.get_stream()); } +/** + * + * A producer for the `write_list` and `write_vector` that encodes level-1 input vector residuals + * into lvl-2 PQ codes. + * Computing a PQ code means finding the closest cluster in a pq_dim-subspace. + * + * @tparam SubWarpSize + * how many threads work on a single vector; + * bounded by either WarpSize or pq_book_size. + * + * @param pq_centers + * - codebook_gen::PER_SUBSPACE: [pq_dim , pq_len, pq_book_size] + * - codebook_gen::PER_CLUSTER: [n_lists, pq_len, pq_book_size] + * @param new_vector a single input of length rot_dim, reinterpreted as [pq_dim, pq_len]. + * the input must be already transformed to floats, rotated, and the level 1 cluster + * center must be already substructed (i.e. this is the residual of a single input vector). + * @param codebook_kind + * @param j index along pq_dim "dimension" + * @param cluster_ix is used for PER_CLUSTER codebooks. + */ +/** + */ +template +struct encode_vectors { + codebook_gen codebook_kind; + uint32_t cluster_ix; + device_mdspan, row_major> pq_centers; + device_mdspan, row_major> in_vectors; + + __device__ inline encode_vectors( + device_mdspan, row_major> pq_centers, + device_matrix_view in_vectors, + codebook_gen codebook_kind, + uint32_t cluster_ix) + : codebook_kind{codebook_kind}, + cluster_ix{cluster_ix}, + pq_centers{pq_centers}, + in_vectors{reinterpret_vectors(in_vectors, pq_centers)} + { + } + + /** + * Decode j-th component of the i-th vector by its code and write it into a chunk of the output + * vectors (pq_len elements). + */ + __device__ inline auto operator()(IdxT i, uint32_t j) -> uint8_t + { + uint32_t lane_id = Pow2::mod(laneId()); + uint32_t partition_ix; + switch (codebook_kind) { + case codebook_gen::PER_CLUSTER: { + partition_ix = cluster_ix; + } break; + case codebook_gen::PER_SUBSPACE: { + partition_ix = j; + } break; + default: __builtin_unreachable(); + } + + const uint32_t pq_book_size = pq_centers.extent(2); + const uint32_t pq_len = pq_centers.extent(1); + float min_dist = std::numeric_limits::infinity(); + uint8_t code = 0; + // calculate the distance for each PQ cluster, find the minimum for each thread + for (uint32_t l = lane_id; l < pq_book_size; l += SubWarpSize) { + // NB: the L2 quantifiers on residuals are always trained on L2 metric. + float d = 0.0f; + for (uint32_t k = 0; k < pq_len; k++) { + auto t = in_vectors(i, j, k) - pq_centers(partition_ix, k, l); + d += t * t; + } + if (d < min_dist) { + min_dist = d; + code = uint8_t(l); + } + } + // reduce among threads +#pragma unroll + for (uint32_t stride = SubWarpSize >> 1; stride > 0; stride >>= 1) { + const auto other_dist = shfl_xor(min_dist, stride, SubWarpSize); + const auto other_code = shfl_xor(code, stride, SubWarpSize); + if (other_dist < min_dist) { + min_dist = other_dist; + code = other_code; + } + } + return code; + } +}; + template __launch_bounds__(BlockSize) __global__ void process_and_fill_codes_kernel( device_matrix_view new_vectors, @@ -1121,34 +1157,79 @@ __launch_bounds__(BlockSize) __global__ void process_and_fill_codes_kernel( } // write the codes (one record per subwarp): - // 1. select input row - using layout_t = typename decltype(new_vectors)::layout_type; - using accessor_t = typename decltype(new_vectors)::accessor_type; - const auto in_dim = new_vectors.extent(1); - auto one_vector = - mdspan, layout_t, accessor_t>(&new_vectors(row_ix, 0), in_dim); - // 2. select output cluster - const uint32_t pq_dim = in_dim / pq_centers.extent(1); + const uint32_t pq_dim = new_vectors.extent(1) / pq_centers.extent(1); auto pq_extents = list_spec{PqBits, pq_dim, true}.make_list_extents(out_ix + 1); auto pq_dataset = make_mdspan(data_ptrs[cluster_ix], pq_extents); - - // 3. compute and write the vector - const uint32_t pq_len = pq_centers.extent(1); - auto reinterpreted_vector = mdspan, layout_t, accessor_t>( - one_vector.data_handle(), extent_2d{pq_dim, pq_len}); - write_vector( pq_dataset, out_ix, - 0, + row_ix, pq_dim, - [pq_centers, reinterpreted_vector, codebook_kind, cluster_ix] __device__( - uint32_t, uint32_t j) -> uint8_t { - // find the label - return compute_pq_code( - pq_centers, reinterpreted_vector, codebook_kind, j, cluster_ix); - }); + encode_vectors{pq_centers, new_vectors, codebook_kind, cluster_ix}); +} + +template +__launch_bounds__(BlockSize) __global__ void encode_list_data_kernel( + device_mdspan::list_extents, row_major> list_data, + device_matrix_view new_vectors, + device_mdspan, row_major> pq_centers, + codebook_gen codebook_kind, + uint32_t cluster_ix, + std::variant offset_or_indices) +{ + constexpr uint32_t kSubWarpSize = std::min(WarpSize, 1u << PqBits); + const uint32_t pq_dim = new_vectors.extent(1) / pq_centers.extent(1); + auto encode_action = + encode_vectors{pq_centers, new_vectors, codebook_kind, cluster_ix}; + write_list( + list_data, offset_or_indices, new_vectors.extent(0), pq_dim, encode_action); +} + +template +void encode_list_data(raft::device_resources const& res, + index* index, + device_matrix_view new_vectors, + uint32_t label, + std::variant offset_or_indices) +{ + auto n_rows = new_vectors.extent(0); + if (n_rows == 0) { return; } + + auto mr = res.get_workspace_resource(); + + auto new_vectors_residual = + make_device_mdarray(res, mr, make_extents(n_rows, index->rot_dim())); + + flat_compute_residuals(res, + new_vectors_residual.data_handle(), + n_rows, + index->rotation_matrix(), + index->centers(), + new_vectors.data_handle(), + label, + mr); + + constexpr uint32_t kBlockSize = 256; + dim3 blocks(div_rounding_up_safe(n_rows, kBlockSize), 1, 1); + dim3 threads(kBlockSize, 1, 1); + auto kernel = [](uint32_t pq_bits) { + switch (pq_bits) { + case 4: return encode_list_data_kernel; + case 5: return encode_list_data_kernel; + case 6: return encode_list_data_kernel; + case 7: return encode_list_data_kernel; + case 8: return encode_list_data_kernel; + default: RAFT_FAIL("Invalid pq_bits (%u), the value must be within [4, 8]", pq_bits); + } + }(index->pq_bits()); + kernel<<>>(index->lists()[label]->data.view(), + new_vectors_residual.view(), + index->pq_centers(), + index->codebook_kind(), + label, + offset_or_indices); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } /** @@ -1190,14 +1271,14 @@ void process_and_fill_codes(raft::device_resources const& handle, auto new_vectors_residual = make_device_mdarray(handle, mr, make_extents(n_rows, index.rot_dim())); - flat_compute_residuals(handle, - new_vectors_residual.data_handle(), - n_rows, - index.rotation_matrix(), - index.centers(), - new_vectors, - new_labels, - mr); + flat_compute_residuals(handle, + new_vectors_residual.data_handle(), + n_rows, + index.rotation_matrix(), + index.centers(), + new_vectors, + new_labels, + mr); constexpr uint32_t kBlockSize = 256; const uint32_t threads_per_vec = std::min(WarpSize, index.pq_book_size()); @@ -1278,16 +1359,16 @@ void recompute_internal_state(const raft::device_resources& res, index& in } /** - * Extend one list of the index in-place, by the list label, skipping the classification and - * encoding steps. - * See the public interface for the api and usage. + * Helper function: allocate enough space in the list, compute the offset, at which to start + * writing, and fill-in indices. + * + * @return offset for writing the data */ template -void extend_list_with_codes(raft::device_resources const& res, - index* index, - device_matrix_view new_codes, - device_vector_view new_indices, - uint32_t label) +auto extend_list_prepare(raft::device_resources const& res, + index* index, + device_vector_view new_indices, + uint32_t label) -> uint32_t { uint32_t n_rows = new_indices.extent(0); uint32_t offset; @@ -1301,9 +1382,25 @@ void extend_list_with_codes(raft::device_resources const& res, auto& list = index->lists()[label]; ivf::resize_list(res, list, spec, new_size, offset); copy(list->indices.data_handle() + offset, new_indices.data_handle(), n_rows, res.get_stream()); + return offset; +} +/** + * Extend one list of the index in-place, by the list label, skipping the classification and + * encoding steps. + * See the public interface for the api and usage. + */ +template +void extend_list_with_codes(raft::device_resources const& res, + index* index, + device_matrix_view new_codes, + device_vector_view new_indices, + uint32_t label) +{ + // Allocate memory and write indices + auto offset = extend_list_prepare(res, index, new_indices, label); + // Pack the data pack_list_data(res, index, new_codes, label, offset); - // Update the pointers and the sizes recompute_internal_state(res, *index); } @@ -1319,21 +1416,10 @@ void extend_list(raft::device_resources const& res, device_vector_view new_indices, uint32_t label) { - uint32_t n_rows = new_indices.extent(0); - uint32_t offset; - // Allocate the lists to fit the new data - copy(&offset, index->list_sizes().data_handle() + label, 1, res.get_stream()); - res.sync_stream(); - uint32_t new_size = offset + n_rows; - copy(index->list_sizes().data_handle() + label, &new_size, 1, res.get_stream()); - auto spec = list_spec{ - index->pq_bits(), index->pq_dim(), index->conservative_memory_allocation()}; - auto& list = index->lists()[label]; - ivf::resize_list(res, list, spec, new_size, offset); - copy(list->indices.data_handle() + offset, new_indices.data_handle(), n_rows, res.get_stream()); - - /** TODO: implementation is missing */ - + // Allocate memory and write indices + auto offset = extend_list_prepare(res, index, new_indices, label); + // Encode the data + encode_list_data(res, index, new_vectors, label, offset); // Update the pointers and the sizes recompute_internal_state(res, *index); } diff --git a/cpp/include/raft/neighbors/ivf_pq.cuh b/cpp/include/raft/neighbors/ivf_pq.cuh index eca57bf9d9..464a00fb67 100644 --- a/cpp/include/raft/neighbors/ivf_pq.cuh +++ b/cpp/include/raft/neighbors/ivf_pq.cuh @@ -403,7 +403,7 @@ void reconstruct_list_data(raft::device_resources const& res, * @brief Build a new index containing the data of the original plus new extra vectors. * * Implementation note: - * The new data is clustered according to existing kmeans clusters, then the cluster + * The new data is clustered according to existing kmeans clusters, the cluster * centers are unchanged. * * Usage example: @@ -508,7 +508,7 @@ void extend_list(raft::device_resources const& res, device_vector_view new_indices, uint32_t label) { - detail::extend_list(res, index, new_vectors, new_vectors, label); + detail::extend_list(res, index, new_vectors, new_indices, label); } /** diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh index 4eea925741..52987f2227 100644 --- a/cpp/test/neighbors/ann_ivf_pq.cuh +++ b/cpp/test/neighbors/ann_ivf_pq.cuh @@ -22,6 +22,8 @@ #include #include +#include +#include #include #include #include @@ -39,8 +41,6 @@ #include #include -#include -#include #include #include @@ -190,17 +190,15 @@ class ivf_pq_test : public ::testing::TestWithParam { index build_2_extends() { - rmm::device_uvector db_indices(ps.num_db_vecs, stream_); - thrust::sequence(handle_.get_thrust_policy(), - thrust::device_pointer_cast(db_indices.data()), - thrust::device_pointer_cast(db_indices.data() + ps.num_db_vecs)); + auto db_indices = make_device_vector(handle_, ps.num_db_vecs); + linalg::map_offset(handle_, db_indices.view(), identity_op{}); handle_.sync_stream(stream_); auto size_1 = IdxT(ps.num_db_vecs) / 2; auto size_2 = IdxT(ps.num_db_vecs) - size_1; auto vecs_1 = database.data(); auto vecs_2 = database.data() + size_t(size_1) * size_t(ps.dim); - auto inds_1 = db_indices.data(); - auto inds_2 = db_indices.data() + size_t(size_1); + auto inds_1 = db_indices.data_handle(); + auto inds_2 = db_indices.data_handle() + size_t(size_1); auto ipams = ps.index_params; ipams.add_data_on_build = false; @@ -233,10 +231,10 @@ class ivf_pq_test : public ::testing::TestWithParam { uint32_t n_take, uint32_t n_skip) { - auto rec_list = index.lists()[label]; - auto dim = index.dim(); - n_take = std::min(n_take, rec_list->size.load()); - n_skip = std::min(n_skip, rec_list->size.load() - n_take); + auto& rec_list = index.lists()[label]; + auto dim = index.dim(); + n_take = std::min(n_take, rec_list->size.load()); + n_skip = std::min(n_skip, rec_list->size.load() - n_take); if (n_take == 0) { return; } @@ -279,22 +277,68 @@ class ivf_pq_test : public ::testing::TestWithParam { } } + void check_reconstruct_extend(index* index, double compression_ratio, uint32_t label) + { + // NB: this is not reference, the list is retained; the index will have to create a new list on + // `erase_list` op. + auto old_list = index->lists()[label]; + auto n_rows = old_list->size.load(); + if (n_rows == 0) { return; } + + auto vectors = make_device_matrix(handle_, n_rows, index->dim()); + auto indices = make_device_vector(handle_, n_rows); + copy(indices.data_handle(), old_list->indices.data_handle(), n_rows, stream_); + + ivf_pq::reconstruct_list_data(handle_, *index, vectors.view(), label, 0); + ivf_pq::erase_list(handle_, index, label); + // NB: passing the type parameter because const->non-const implicit conversion of the mdspans + // breaks type inference + ivf_pq::extend_list(handle_, index, vectors.view(), indices.view(), label); + + auto& new_list = index->lists()[label]; + ASSERT_NE(old_list.get(), new_list.get()) + << "The old list should have been shared and retained after ivf_pq index has erased the " + "corresponding cluster."; + + auto n_codes = old_list->data.size(); + rmm::mr::managed_memory_resource managed_memory; + rmm::device_scalar errs(stream_, &managed_memory); + linalg::mapReduce(errs.data(), + n_codes, + 0, + compose_op{cast_op{}, notequal_op{}}, + add_op{}, + stream_, + old_list->data.data_handle(), + new_list->data.data_handle()); + auto err_value = errs.value(stream_); + auto err_rate = double(err_value) / double(n_codes); + ASSERT_LE(err_rate, 0.01 * std::pow(2.0, compression_ratio)) + << " (label = " << label << ", errors = " << err_value << ", size = " << n_codes << ")"; + } + void check_packing(index* index, uint32_t label) { - auto rec_list = index->lists()[label]; - auto n_rows = rec_list->size.load(); + auto old_list = index->lists()[label]; + auto n_rows = old_list->size.load(); if (n_rows == 0) { return; } auto codes = make_device_matrix(handle_, n_rows, index->pq_dim()); auto indices = make_device_vector(handle_, n_rows); - copy(indices.data_handle(), rec_list->indices.data_handle(), n_rows, stream_); + copy(indices.data_handle(), old_list->indices.data_handle(), n_rows, stream_); ivf_pq::unpack_list_data(handle_, *index, codes.view(), label, 0); ivf_pq::erase_list(handle_, index, label); - // NB: passing the type parameter because const->non-const implicit conversion of the mdspans - // breaks type inference ivf_pq::extend_list_with_codes(handle_, index, codes.view(), indices.view(), label); + + auto& new_list = index->lists()[label]; + ASSERT_NE(old_list.get(), new_list.get()) + << "The old list should have been shared and retained after ivf_pq index has erased the " + "corresponding cluster."; + + ASSERT_TRUE(devArrMatch( + old_list->data.data_handle(), new_list->data.data_handle(), n_rows, Compare{})); } template @@ -302,12 +346,15 @@ class ivf_pq_test : public ::testing::TestWithParam { { index index = build_index(); - // Dump and re-write codes for one label - check_packing(&index, 0); - double compression_ratio = static_cast(ps.dim * 8) / static_cast(index.pq_dim() * index.pq_bits()); + // Reconstruct and re-write vectors for one label + check_reconstruct_extend(&index, compression_ratio, uint32_t(rand()) % index.n_lists()); + + // Dump and re-write codes for one label + check_packing(&index, uint32_t(rand()) % index.n_lists()); + // check a small subset of data in a randomly chosen cluster to see if the data reconstruction // works well. check_reconstruction(index, compression_ratio, uint32_t(rand()) % index.n_lists(), 100, 7); From a674768bdb9ed256e9c1bf3915e42eb6177de0a9 Mon Sep 17 00:00:00 2001 From: achirkin Date: Fri, 17 Mar 2023 12:18:09 +0100 Subject: [PATCH 17/25] Adjust the scheduling of the encode_list_data_kernel --- cpp/include/raft/neighbors/detail/ivf_pq_build.cuh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh index ee602b05af..c11aecc73c 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh @@ -1210,8 +1210,9 @@ void encode_list_data(raft::device_resources const& res, label, mr); - constexpr uint32_t kBlockSize = 256; - dim3 blocks(div_rounding_up_safe(n_rows, kBlockSize), 1, 1); + constexpr uint32_t kBlockSize = 256; + const uint32_t threads_per_vec = std::min(WarpSize, index->pq_book_size()); + dim3 blocks(div_rounding_up_safe(n_rows, kBlockSize / threads_per_vec), 1, 1); dim3 threads(kBlockSize, 1, 1); auto kernel = [](uint32_t pq_bits) { switch (pq_bits) { From 1be2f6bfbbfb2151bf7c73651d549eef90892774 Mon Sep 17 00:00:00 2001 From: achirkin Date: Fri, 17 Mar 2023 12:18:44 +0100 Subject: [PATCH 18/25] Factor code-packing out of the build file --- .../raft/neighbors/detail/ivf_pq_build.cuh | 186 +-------------- .../neighbors/detail/ivf_pq_codepacking.cuh | 214 ++++++++++++++++++ 2 files changed, 215 insertions(+), 185 deletions(-) create mode 100644 cpp/include/raft/neighbors/detail/ivf_pq_codepacking.cuh diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh index c11aecc73c..36ceccc36f 100644 --- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh +++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh @@ -18,6 +18,7 @@ #include +#include #include #include @@ -60,63 +61,6 @@ namespace raft::neighbors::ivf_pq::detail { using namespace raft::spatial::knn::detail; // NOLINT -/** A chunk of PQ-encoded vector managed by one CUDA thread. */ -using pq_vec_t = TxN_t::io_t; - -namespace { - -/** - * This type mimics the `uint8_t&` for the indexing operator of `bitfield_view_t`. - * - * @tparam Bits number of bits comprising the value. - */ -template -struct bitfield_ref_t { - static_assert(Bits <= 8 && Bits > 0, "Bit code must fit one byte"); - constexpr static uint8_t kMask = static_cast((1u << Bits) - 1u); - uint8_t* ptr; - uint32_t offset; - - constexpr operator uint8_t() // NOLINT - { - auto pair = static_cast(ptr[0]); - if (offset + Bits > 8) { pair |= static_cast(ptr[1]) << 8; } - return static_cast((pair >> offset) & kMask); - } - - constexpr auto operator=(uint8_t code) -> bitfield_ref_t& - { - if (offset + Bits > 8) { - auto pair = static_cast(ptr[0]); - pair |= static_cast(ptr[1]) << 8; - pair &= ~(static_cast(kMask) << offset); - pair |= static_cast(code) << offset; - ptr[0] = static_cast(Pow2<256>::mod(pair)); - ptr[1] = static_cast(Pow2<256>::div(pair)); - } else { - ptr[0] = (ptr[0] & ~(kMask << offset)) | (code << offset); - } - return *this; - } -}; - -/** - * View a byte array as an array of unsigned integers of custom small bit size. - * - * @tparam Bits number of bits comprising a single element of the array. - */ -template -struct bitfield_view_t { - static_assert(Bits <= 8 && Bits > 0, "Bit code must fit one byte"); - uint8_t* raw; - - constexpr auto operator[](uint32_t i) -> bitfield_ref_t - { - uint32_t bit_offset = i * Bits; - return bitfield_ref_t{raw + Pow2<8>::div(bit_offset), Pow2<8>::mod(bit_offset)}; - } -}; - template __launch_bounds__(BlockDim) __global__ void copy_warped_kernel( T* out, uint32_t ld_out, const S* in, uint32_t ld_in, uint32_t n_cols, size_t n_rows) @@ -162,8 +106,6 @@ void copy_warped(T* out, <<>>(out, ld_out, in, ld_in, n_cols, n_rows); } -} // namespace - /** * @brief Fill-in a random orthogonal transformation matrix. * @@ -585,132 +527,6 @@ static __device__ auto reinterpret_vectors( vectors.data_handle(), extent_3d{vectors.extent(0), pq_dim, pq_len}); } -/** - * Process a single vector in a list. - * - * @tparam PqBits - * @tparam Action tells how to process a single vector (e.g. reconstruct or just unpack) - * - * @param[in] in_list_data the encoded cluster data. - * @param[in] in_ix in-cluster index of the vector to be decoded (one-per-thread). - * @param[in] out_ix the output index passed to the action - * @param[in] pq_dim - * @param action a callable action to be invoked on each PQ code (component of the encoding) - * type: void (uint8_t code, uint32_t out_ix, uint32_t j), where j = [0..pq_dim). - */ -template -__device__ void run_on_vector( - device_mdspan::list_extents, row_major> in_list_data, - uint32_t in_ix, - uint32_t out_ix, - uint32_t pq_dim, - Action action) -{ - using group_align = Pow2; - const uint32_t group_ix = group_align::div(in_ix); - const uint32_t ingroup_ix = group_align::mod(in_ix); - - pq_vec_t code_chunk; - bitfield_view_t code_view{reinterpret_cast(&code_chunk)}; - constexpr uint32_t kChunkSize = (sizeof(pq_vec_t) * 8u) / PqBits; - for (uint32_t j = 0, i = 0; j < pq_dim; i++) { - // read the chunk - code_chunk = *reinterpret_cast(&in_list_data(group_ix, i, ingroup_ix, 0)); - // read the codes, one/pq_dim at a time -#pragma unroll - for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) { - // read a piece of the reconstructed vector - action(code_view[k], out_ix, j); - } - } -} - -/** - * Process a single vector in a list. - * - * @tparam PqBits - * @tparam SubWarpSize how many threads work on the same ix (only the first thread writes data). - * @tparam IdxT type of the index passed to the action - * @tparam Action tells how to process a single vector (e.g. encode or just pack) - * - * @param[in] out_list_data the encoded cluster data. - * @param[in] out_ix in-cluster index of the vector to be processed (one-per-SubWarpSize threads). - * @param[in] in_ix the input index passed to the action (one-per-SubWarpSize threads). - * @param[in] pq_dim - * @param action a callable action to be invoked on each PQ code (component of the encoding) - * type: (uint32_t in_ix, uint32_t j) -> uint8_t, where j = [0..pq_dim). - */ -template -__device__ void write_vector( - device_mdspan::list_extents, row_major> out_list_data, - uint32_t out_ix, - IdxT in_ix, - uint32_t pq_dim, - Action action) -{ - const uint32_t lane_id = Pow2::mod(threadIdx.x); - - using group_align = Pow2; - const uint32_t group_ix = group_align::div(out_ix); - const uint32_t ingroup_ix = group_align::mod(out_ix); - - pq_vec_t code_chunk; - bitfield_view_t code_view{reinterpret_cast(&code_chunk)}; - constexpr uint32_t kChunkSize = (sizeof(pq_vec_t) * 8u) / PqBits; - for (uint32_t j = 0, i = 0; j < pq_dim; i++) { - // clear the chunk - if (lane_id == 0) { code_chunk = pq_vec_t{}; } - // write the codes, one/pq_dim at a time -#pragma unroll - for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) { - // write a single code - uint8_t code = action(in_ix, j); - if (lane_id == 0) { code_view[k] = code; } - } - // write the chunk to the list - if (lane_id == 0) { - *reinterpret_cast(&out_list_data(group_ix, i, ingroup_ix, 0)) = code_chunk; - } - } -} - -/** Process the given indices or a block of a single list (cluster). */ -template -__device__ void run_on_list( - device_mdspan::list_extents, row_major> in_list_data, - std::variant offset_or_indices, - uint32_t len, - uint32_t pq_dim, - Action action) -{ - for (uint32_t ix = threadIdx.x + blockDim.x * blockIdx.x; ix < len; ix += blockDim.x) { - const uint32_t src_ix = std::holds_alternative(offset_or_indices) - ? std::get(offset_or_indices) + ix - : std::get(offset_or_indices)[ix]; - run_on_vector(in_list_data, src_ix, ix, pq_dim, action); - } -} - -/** Process the given indices or a block of a single list (cluster). */ -template -__device__ void write_list( - device_mdspan::list_extents, row_major> out_list_data, - std::variant offset_or_indices, - uint32_t len, - uint32_t pq_dim, - Action action) -{ - using subwarp_align = Pow2; - uint32_t stride = subwarp_align::div(blockDim.x); - uint32_t ix = subwarp_align::div(threadIdx.x + blockDim.x * blockIdx.x); - for (; ix < len; ix += stride) { - const uint32_t dst_ix = std::holds_alternative(offset_or_indices) - ? std::get(offset_or_indices) + ix - : std::get(offset_or_indices)[ix]; - write_vector(out_list_data, dst_ix, ix, pq_dim, action); - } -} - /** * A consumer for the `run_on_list` and `run_on_vector` that just flattens PQ codes * one-per-byte. That is, independent of the code width (pq_bits), one code uses diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_codepacking.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_codepacking.cuh new file mode 100644 index 0000000000..52969dd176 --- /dev/null +++ b/cpp/include/raft/neighbors/detail/ivf_pq_codepacking.cuh @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +namespace raft::neighbors::ivf_pq::detail { + +/** A chunk of PQ-encoded vector managed by one CUDA thread. */ +using pq_vec_t = TxN_t::io_t; + +/** + * This type mimics the `uint8_t&` for the indexing operator of `bitfield_view_t`. + * + * @tparam Bits number of bits comprising the value. + */ +template +struct bitfield_ref_t { + static_assert(Bits <= 8 && Bits > 0, "Bit code must fit one byte"); + constexpr static uint8_t kMask = static_cast((1u << Bits) - 1u); + uint8_t* ptr; + uint32_t offset; + + constexpr operator uint8_t() // NOLINT + { + auto pair = static_cast(ptr[0]); + if (offset + Bits > 8) { pair |= static_cast(ptr[1]) << 8; } + return static_cast((pair >> offset) & kMask); + } + + constexpr auto operator=(uint8_t code) -> bitfield_ref_t& + { + if (offset + Bits > 8) { + auto pair = static_cast(ptr[0]); + pair |= static_cast(ptr[1]) << 8; + pair &= ~(static_cast(kMask) << offset); + pair |= static_cast(code) << offset; + ptr[0] = static_cast(Pow2<256>::mod(pair)); + ptr[1] = static_cast(Pow2<256>::div(pair)); + } else { + ptr[0] = (ptr[0] & ~(kMask << offset)) | (code << offset); + } + return *this; + } +}; + +/** + * View a byte array as an array of unsigned integers of custom small bit size. + * + * @tparam Bits number of bits comprising a single element of the array. + */ +template +struct bitfield_view_t { + static_assert(Bits <= 8 && Bits > 0, "Bit code must fit one byte"); + uint8_t* raw; + + constexpr auto operator[](uint32_t i) -> bitfield_ref_t + { + uint32_t bit_offset = i * Bits; + return bitfield_ref_t{raw + Pow2<8>::div(bit_offset), Pow2<8>::mod(bit_offset)}; + } +}; + +/** + * Process a single vector in a list. + * + * @tparam PqBits + * @tparam Action tells how to process a single vector (e.g. reconstruct or just unpack) + * + * @param[in] in_list_data the encoded cluster data. + * @param[in] in_ix in-cluster index of the vector to be decoded (one-per-thread). + * @param[in] out_ix the output index passed to the action + * @param[in] pq_dim + * @param action a callable action to be invoked on each PQ code (component of the encoding) + * type: void (uint8_t code, uint32_t out_ix, uint32_t j), where j = [0..pq_dim). + */ +template +__device__ void run_on_vector( + device_mdspan::list_extents, row_major> in_list_data, + uint32_t in_ix, + uint32_t out_ix, + uint32_t pq_dim, + Action action) +{ + using group_align = Pow2; + const uint32_t group_ix = group_align::div(in_ix); + const uint32_t ingroup_ix = group_align::mod(in_ix); + + pq_vec_t code_chunk; + bitfield_view_t code_view{reinterpret_cast(&code_chunk)}; + constexpr uint32_t kChunkSize = (sizeof(pq_vec_t) * 8u) / PqBits; + for (uint32_t j = 0, i = 0; j < pq_dim; i++) { + // read the chunk + code_chunk = *reinterpret_cast(&in_list_data(group_ix, i, ingroup_ix, 0)); + // read the codes, one/pq_dim at a time +#pragma unroll + for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) { + // read a piece of the reconstructed vector + action(code_view[k], out_ix, j); + } + } +} + +/** + * Process a single vector in a list. + * + * @tparam PqBits + * @tparam SubWarpSize how many threads work on the same ix (only the first thread writes data). + * @tparam IdxT type of the index passed to the action + * @tparam Action tells how to process a single vector (e.g. encode or just pack) + * + * @param[in] out_list_data the encoded cluster data. + * @param[in] out_ix in-cluster index of the vector to be processed (one-per-SubWarpSize threads). + * @param[in] in_ix the input index passed to the action (one-per-SubWarpSize threads). + * @param[in] pq_dim + * @param action a callable action to be invoked on each PQ code (component of the encoding) + * type: (uint32_t in_ix, uint32_t j) -> uint8_t, where j = [0..pq_dim). + */ +template +__device__ void write_vector( + device_mdspan::list_extents, row_major> out_list_data, + uint32_t out_ix, + IdxT in_ix, + uint32_t pq_dim, + Action action) +{ + const uint32_t lane_id = Pow2::mod(threadIdx.x); + + using group_align = Pow2; + const uint32_t group_ix = group_align::div(out_ix); + const uint32_t ingroup_ix = group_align::mod(out_ix); + + pq_vec_t code_chunk; + bitfield_view_t code_view{reinterpret_cast(&code_chunk)}; + constexpr uint32_t kChunkSize = (sizeof(pq_vec_t) * 8u) / PqBits; + for (uint32_t j = 0, i = 0; j < pq_dim; i++) { + // clear the chunk + if (lane_id == 0) { code_chunk = pq_vec_t{}; } + // write the codes, one/pq_dim at a time +#pragma unroll + for (uint32_t k = 0; k < kChunkSize && j < pq_dim; k++, j++) { + // write a single code + uint8_t code = action(in_ix, j); + if (lane_id == 0) { code_view[k] = code; } + } + // write the chunk to the list + if (lane_id == 0) { + *reinterpret_cast(&out_list_data(group_ix, i, ingroup_ix, 0)) = code_chunk; + } + } +} + +/** Process the given indices or a block of a single list (cluster). */ +template +__device__ void run_on_list( + device_mdspan::list_extents, row_major> in_list_data, + std::variant offset_or_indices, + uint32_t len, + uint32_t pq_dim, + Action action) +{ + for (uint32_t ix = threadIdx.x + blockDim.x * blockIdx.x; ix < len; ix += blockDim.x) { + const uint32_t src_ix = std::holds_alternative(offset_or_indices) + ? std::get(offset_or_indices) + ix + : std::get(offset_or_indices)[ix]; + run_on_vector(in_list_data, src_ix, ix, pq_dim, action); + } +} + +/** Process the given indices or a block of a single list (cluster). */ +template +__device__ void write_list( + device_mdspan::list_extents, row_major> out_list_data, + std::variant offset_or_indices, + uint32_t len, + uint32_t pq_dim, + Action action) +{ + using subwarp_align = Pow2; + uint32_t stride = subwarp_align::div(blockDim.x); + uint32_t ix = subwarp_align::div(threadIdx.x + blockDim.x * blockIdx.x); + for (; ix < len; ix += stride) { + const uint32_t dst_ix = std::holds_alternative(offset_or_indices) + ? std::get(offset_or_indices) + ix + : std::get(offset_or_indices)[ix]; + write_vector(out_list_data, dst_ix, ix, pq_dim, action); + } +} + +} // namespace raft::neighbors::ivf_pq::detail From d6cad17a66136a986186581d31fa967449d933ec Mon Sep 17 00:00:00 2001 From: achirkin Date: Fri, 17 Mar 2023 13:36:16 +0100 Subject: [PATCH 19/25] Fix failing tests --- cpp/test/neighbors/ann_ivf_pq.cuh | 114 ++++++++++++++++-------------- 1 file changed, 59 insertions(+), 55 deletions(-) diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh index 52987f2227..34dfcd83d2 100644 --- a/cpp/test/neighbors/ann_ivf_pq.cuh +++ b/cpp/test/neighbors/ann_ivf_pq.cuh @@ -116,6 +116,33 @@ inline auto operator<<(std::ostream& os, const ivf_pq_inputs& p) -> std::ostream return os; } +template +void compare_vectors_l2( + const raft::device_resources& res, T a, T b, uint32_t label, double compression_ratio, double eps) +{ + auto n_rows = a.extent(0); + auto dim = a.extent(1); + rmm::mr::managed_memory_resource managed_memory; + auto dist = make_device_mdarray(res, &managed_memory, make_extents(n_rows)); + linalg::map_offset(res, dist.view(), [a, b, dim] __device__(uint32_t i) { + spatial::knn::detail::utils::mapping f{}; + double d = 0.0f; + for (uint32_t j = 0; j < dim; j++) { + double t = f(a(i, j)) - f(b(i, j)); + d += t * t; + } + return sqrt(d / double(dim)); + }); + res.sync_stream(); + for (uint32_t i = 0; i < n_rows; i++) { + double d = dist(i); + // The theoretical estimate of the error is hard to come up with, + // the estimate below is based on experimentation + curse of dimensionality + ASSERT_LE(d, eps * std::pow(2.0, compression_ratio)) + << " (label = " << label << ", ix = " << i << ", eps = " << eps << ")"; + } +} + template auto min_output_size(const raft::device_resources& handle, const ivf_pq::index& index, @@ -178,7 +205,7 @@ class ivf_pq_test : public ::testing::TestWithParam { handle_.sync_stream(stream_); } - index build_only() + auto build_only() { auto ipams = ps.index_params; ipams.add_data_on_build = true; @@ -188,7 +215,7 @@ class ivf_pq_test : public ::testing::TestWithParam { return ivf_pq::build(handle_, ipams, index_view); } - index build_2_extends() + auto build_2_extends() { auto db_indices = make_device_vector(handle_, ps.num_db_vecs); linalg::map_offset(handle_, db_indices.view(), identity_op{}); @@ -219,7 +246,7 @@ class ivf_pq_test : public ::testing::TestWithParam { return idx; } - index build_serialize() + auto build_serialize() { ivf_pq::serialize(handle_, "ivf_pq_index", build_only()); return ivf_pq::deserialize(handle_, "ivf_pq_index"); @@ -241,10 +268,6 @@ class ivf_pq_test : public ::testing::TestWithParam { auto rec_data = make_device_matrix(handle_, n_take, dim); auto orig_data = make_device_matrix(handle_, n_take, dim); - rmm::mr::managed_memory_resource managed_memory; - auto dist = - make_device_mdarray(handle_, &managed_memory, make_extents(n_take)); - ivf_pq::reconstruct_list_data(handle_, index, rec_data.view(), label, n_skip); matrix::gather(database.data(), @@ -255,26 +278,7 @@ class ivf_pq_test : public ::testing::TestWithParam { orig_data.data_handle(), stream_); - auto rec_data_view = rec_data.view(); - auto orig_data_view = orig_data.view(); - linalg::map_offset( - handle_, dist.view(), [rec_data_view, orig_data_view, dim] __device__(uint32_t i) { - spatial::knn::detail::utils::mapping f{}; - double d = 0.0f; - for (uint32_t j = 0; j < dim; j++) { - double t = f(rec_data_view(i, j)) - f(orig_data_view(i, j)); - d += t * t; - } - return sqrt(d / double(dim)); - }); - handle_.sync_stream(); - for (uint32_t i = 0; i < n_take; i++) { - double d = dist(i); - // The theoretical estimate of the error is hard to come up with, - // the estimate below is based on experimentation + curse of dimensionality - ASSERT_LE(d, 0.055 * std::pow(2.0, compression_ratio)) - << " (label = " << label << ", ix = " << (n_skip + i) << ")"; - } + compare_vectors_l2(handle_, rec_data.view(), orig_data.view(), label, compression_ratio, 0.055); } void check_reconstruct_extend(index* index, double compression_ratio, uint32_t label) @@ -285,36 +289,28 @@ class ivf_pq_test : public ::testing::TestWithParam { auto n_rows = old_list->size.load(); if (n_rows == 0) { return; } - auto vectors = make_device_matrix(handle_, n_rows, index->dim()); - auto indices = make_device_vector(handle_, n_rows); + auto vectors_1 = make_device_matrix(handle_, n_rows, index->dim()); + auto indices = make_device_vector(handle_, n_rows); copy(indices.data_handle(), old_list->indices.data_handle(), n_rows, stream_); - ivf_pq::reconstruct_list_data(handle_, *index, vectors.view(), label, 0); + ivf_pq::reconstruct_list_data(handle_, *index, vectors_1.view(), label, 0); ivf_pq::erase_list(handle_, index, label); // NB: passing the type parameter because const->non-const implicit conversion of the mdspans // breaks type inference - ivf_pq::extend_list(handle_, index, vectors.view(), indices.view(), label); + ivf_pq::extend_list(handle_, index, vectors_1.view(), indices.view(), label); auto& new_list = index->lists()[label]; ASSERT_NE(old_list.get(), new_list.get()) << "The old list should have been shared and retained after ivf_pq index has erased the " "corresponding cluster."; - auto n_codes = old_list->data.size(); - rmm::mr::managed_memory_resource managed_memory; - rmm::device_scalar errs(stream_, &managed_memory); - linalg::mapReduce(errs.data(), - n_codes, - 0, - compose_op{cast_op{}, notequal_op{}}, - add_op{}, - stream_, - old_list->data.data_handle(), - new_list->data.data_handle()); - auto err_value = errs.value(stream_); - auto err_rate = double(err_value) / double(n_codes); - ASSERT_LE(err_rate, 0.01 * std::pow(2.0, compression_ratio)) - << " (label = " << label << ", errors = " << err_value << ", size = " << n_codes << ")"; + auto vectors_2 = make_device_matrix(handle_, n_rows, index->dim()); + ivf_pq::reconstruct_list_data(handle_, *index, vectors_2.view(), label, 0); + // The code search is unstable, and there's high chance of repeating values of the lvl-2 codes. + // Hence, encoding-decoding chain often leads to altering both the PQ codes and the + // reconstructed data. + compare_vectors_l2( + handle_, vectors_1.view(), vectors_2.view(), label, compression_ratio, 0.025); } void check_packing(index* index, uint32_t label) @@ -349,15 +345,23 @@ class ivf_pq_test : public ::testing::TestWithParam { double compression_ratio = static_cast(ps.dim * 8) / static_cast(index.pq_dim() * index.pq_bits()); - // Reconstruct and re-write vectors for one label - check_reconstruct_extend(&index, compression_ratio, uint32_t(rand()) % index.n_lists()); - - // Dump and re-write codes for one label - check_packing(&index, uint32_t(rand()) % index.n_lists()); - - // check a small subset of data in a randomly chosen cluster to see if the data reconstruction - // works well. - check_reconstruction(index, compression_ratio, uint32_t(rand()) % index.n_lists(), 100, 7); + for (uint32_t label = 0; label < index.n_lists(); label++) { + switch (label % 3) { + case 0: { + // Reconstruct and re-write vectors for one label + check_reconstruct_extend(&index, compression_ratio, label); + } break; + case 1: { + // Dump and re-write codes for one label + check_packing(&index, label); + } break; + default: { + // check a small subset of data in a randomly chosen cluster to see if the data + // reconstruction works well. + check_reconstruction(index, compression_ratio, label, 100, 7); + } + } + } size_t queries_size = ps.num_queries * ps.k; std::vector indices_ivf_pq(queries_size); From 1504027193dd69e13d36b81d217d4235ccafea3e Mon Sep 17 00:00:00 2001 From: achirkin Date: Fri, 17 Mar 2023 17:10:10 +0100 Subject: [PATCH 20/25] Relax the test criterion eps a little bit --- cpp/test/neighbors/ann_ivf_pq.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh index 34dfcd83d2..7e87302606 100644 --- a/cpp/test/neighbors/ann_ivf_pq.cuh +++ b/cpp/test/neighbors/ann_ivf_pq.cuh @@ -278,7 +278,7 @@ class ivf_pq_test : public ::testing::TestWithParam { orig_data.data_handle(), stream_); - compare_vectors_l2(handle_, rec_data.view(), orig_data.view(), label, compression_ratio, 0.055); + compare_vectors_l2(handle_, rec_data.view(), orig_data.view(), label, compression_ratio, 0.06); } void check_reconstruct_extend(index* index, double compression_ratio, uint32_t label) From 09f1a0dbaade4441647e6a8f6200bdca8857b386 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Fri, 31 Mar 2023 21:08:44 +0200 Subject: [PATCH 21/25] Move ivf_pq helpers to separate file and namespace --- cpp/include/raft/neighbors/ivf_pq.cuh | 229 ------------- cpp/include/raft/neighbors/ivf_pq_helpers.cuh | 301 ++++++++++++++++++ cpp/test/neighbors/ann_ivf_pq.cuh | 19 +- 3 files changed, 312 insertions(+), 237 deletions(-) create mode 100644 cpp/include/raft/neighbors/ivf_pq_helpers.cuh diff --git a/cpp/include/raft/neighbors/ivf_pq.cuh b/cpp/include/raft/neighbors/ivf_pq.cuh index 464a00fb67..0a3ce68dc2 100644 --- a/cpp/include/raft/neighbors/ivf_pq.cuh +++ b/cpp/include/raft/neighbors/ivf_pq.cuh @@ -230,175 +230,6 @@ auto build(raft::device_resources const& handle, return detail::build(handle, params, dataset, n_rows, dim); } -/** - * @brief Unpack `n_take` consecutive records of a single list (cluster) in the compressed index - * starting at given `offset`, one code per byte (independently of pq_bits). - * - * Usage example: - * @code{.cpp} - * // We will unpack the fourth cluster - * uint32_t label = 3; - * // Get the list size - * uint32_t list_size = 0; - * raft::copy(&list_size, index.list_sizes().data_handle() + label, 1, res.get_stream()); - * res.sync_stream(); - * // allocate the buffer for the output - * auto codes = raft::make_device_matrix(res, list_size, index.pq_dim()); - * // unpack the whole list - * ivf_pq::unpack_list_data(res, index, codes.view(), label, 0); - * @endcode - * - * @tparam IdxT type of the indices in the source dataset - * - * @param[in] res - * @param[in] index - * @param[out] out_codes - * the destination buffer [n_take, index.pq_dim()]. - * The length `n_take` defines how many records to unpack, - * it must be smaller than the list size. - * @param[in] label - * The id of the list (cluster) to decode. - * @param[in] offset - * How many records in the list to skip. - */ -template -void unpack_list_data(raft::device_resources const& res, - const index& index, - device_matrix_view out_codes, - uint32_t label, - uint32_t offset) -{ - return detail::unpack_list_data(res, index, out_codes, label, offset); -} - -/** - * @brief Unpack a series of records of a single list (cluster) in the compressed index - * by their in-list offsets, one code per byte (independently of pq_bits). - * - * Usage example: - * @code{.cpp} - * // We will unpack the fourth cluster - * uint32_t label = 3; - * // Create the selection vector - * auto selected_indices = raft::make_device_vector(res, 4); - * ... fill the indices ... - * res.sync_stream(); - * // allocate the buffer for the output - * auto codes = raft::make_device_matrix(res, selected_indices.size(), index.pq_dim()); - * // decode the whole list - * ivf_pq::unpack_list_data( - * res, index, selected_indices.view(), codes.view(), label); - * @endcode - * - * @tparam IdxT type of the indices in the source dataset - * - * @param[in] res - * @param[in] index - * @param[in] in_cluster_indices - * The offsets of the selected indices within the cluster. - * @param[out] out_codes - * the destination buffer [n_take, index.pq_dim()]. - * The length `n_take` defines how many records to unpack, - * it must be smaller than the list size. - * @param[in] label - * The id of the list (cluster) to decode. - */ -template -void unpack_list_data(raft::device_resources const& res, - const index& index, - device_vector_view in_cluster_indices, - device_matrix_view out_codes, - uint32_t label) -{ - return detail::unpack_list_data(res, index, out_codes, label, in_cluster_indices); -} - -/** - * @brief Decode `n_take` consecutive records of a single list (cluster) in the compressed index - * starting at given `offset`. - * - * Usage example: - * @code{.cpp} - * // We will reconstruct the fourth cluster - * uint32_t label = 3; - * // Get the list size - * uint32_t list_size = 0; - * raft::copy(&list_size, index.list_sizes().data_handle() + label, 1, res.get_stream()); - * res.sync_stream(); - * // allocate the buffer for the output - * auto decoded_vectors = raft::make_device_matrix(res, list_size, index.dim()); - * // decode the whole list - * ivf_pq::reconstruct_list_data(res, index, decoded_vectors.view(), label, 0); - * @endcode - * - * @tparam T data element type - * @tparam IdxT type of the indices in the source dataset - * - * @param[in] res - * @param[in] index - * @param[out] out_vectors - * the destination buffer [n_take, index.dim()]. - * The length `n_take` defines how many records to reconstruct, - * it must be smaller than the list size. - * @param[in] label - * The id of the list (cluster) to decode. - * @param[in] offset - * How many records in the list to skip. - */ -template -void reconstruct_list_data(raft::device_resources const& res, - const index& index, - device_matrix_view out_vectors, - uint32_t label, - uint32_t offset) -{ - return detail::reconstruct_list_data(res, index, out_vectors, label, offset); -} - -/** - * @brief Decode a series of records of a single list (cluster) in the compressed index - * by their in-list offsets. - * - * Usage example: - * @code{.cpp} - * // We will reconstruct the fourth cluster - * uint32_t label = 3; - * // Create the selection vector - * auto selected_indices = raft::make_device_vector(res, 4); - * ... fill the indices ... - * res.sync_stream(); - * // allocate the buffer for the output - * auto decoded_vectors = raft::make_device_matrix( - * res, selected_indices.size(), index.dim()); - * // decode the whole list - * ivf_pq::reconstruct_list_data( - * res, index, selected_indices.view(), decoded_vectors.view(), label); - * @endcode - * - * @tparam T data element type - * @tparam IdxT type of the indices in the source dataset - * - * @param[in] res - * @param[in] index - * @param[in] in_cluster_indices - * The offsets of the selected indices within the cluster. - * @param[out] out_vectors - * the destination buffer [n_take, index.dim()]. - * The length `n_take` defines how many records to reconstruct, - * it must be smaller than the list size. - * @param[in] label - * The id of the list (cluster) to decode. - */ -template -void reconstruct_list_data(raft::device_resources const& res, - const index& index, - device_vector_view in_cluster_indices, - device_matrix_view out_vectors, - uint32_t label) -{ - return detail::reconstruct_list_data(res, index, out_vectors, label, in_cluster_indices); -} - /** * @brief Build a new index containing the data of the original plus new extra vectors. * @@ -465,66 +296,6 @@ void extend(raft::device_resources const& handle, detail::extend(handle, idx, new_vectors, new_indices, n_rows); } -/** - * @brief Extend one list of the index in-place, by the list label, skipping the classification and - * encoding steps. - * - * @tparam IdxT - * - * @param[in] res - * @param[inout] index - * @param[in] new_codes flat PQ codes, one code per byte [n_rows, index.pq_dim()] - * @param[in] new_indices source indices [n_rows] - * @param[in] label the id of the target list (cluster). - */ -template -void extend_list_with_codes(raft::device_resources const& res, - index* index, - device_matrix_view new_codes, - device_vector_view new_indices, - uint32_t label) -{ - detail::extend_list_with_codes(res, index, new_codes, new_indices, label); -} - -/** - * @brief Extend one list of the index in-place, by the list label, skipping the classification - * step. - * - * @tparam T - * @tparam IdxT - * - * @param[in] res - * @param[inout] index - * @param[in] new_vectors data to encode [n_rows, index.dim()] - * @param[in] new_indices source indices [n_rows] - * @param[in] label the id of the target list (cluster). - * - */ -template -void extend_list(raft::device_resources const& res, - index* index, - device_matrix_view new_vectors, - device_vector_view new_indices, - uint32_t label) -{ - detail::extend_list(res, index, new_vectors, new_indices, label); -} - -/** - * @brief Remove all data from a single list (cluster) in the index. - * - * @tparam IdxT - * @param[in] res - * @param[inout] index - * @param[in] label the id of the target list (cluster). - */ -template -void erase_list(raft::device_resources const& res, index* index, uint32_t label) -{ - detail::erase_list(res, index, label); -} - /** * @brief Search ANN using the constructed index. * diff --git a/cpp/include/raft/neighbors/ivf_pq_helpers.cuh b/cpp/include/raft/neighbors/ivf_pq_helpers.cuh new file mode 100644 index 0000000000..b51f0696d0 --- /dev/null +++ b/cpp/include/raft/neighbors/ivf_pq_helpers.cuh @@ -0,0 +1,301 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include + +namespace raft::neighbors::ivf_pq::helpers { + +/** + * @defgroup ivf_pq_helpers Helper functions for manipulationg IVF PQ Index + * @{ + */ + +/** + * @brief Unpack `n_take` consecutive records of a single list (cluster) in the compressed index + * starting at given `offset`, one code per byte (independently of pq_bits). + * + * Usage example: + * @code{.cpp} + * // We will unpack the fourth cluster + * uint32_t label = 3; + * // Get the list size + * uint32_t list_size = 0; + * raft::copy(&list_size, index.list_sizes().data_handle() + label, 1, res.get_stream()); + * res.sync_stream(); + * // allocate the buffer for the output + * auto codes = raft::make_device_matrix(res, list_size, index.pq_dim()); + * // unpack the whole list + * ivf_pq::helpers::unpack_list_data(res, index, codes.view(), label, 0); + * @endcode + * + * @tparam IdxT type of the indices in the source dataset + * + * @param[in] res + * @param[in] index + * @param[out] out_codes + * the destination buffer [n_take, index.pq_dim()]. + * The length `n_take` defines how many records to unpack, + * it must be smaller than the list size. + * @param[in] label + * The id of the list (cluster) to decode. + * @param[in] offset + * How many records in the list to skip. + */ +template +void unpack_list_data(raft::device_resources const& res, + const index& index, + device_matrix_view out_codes, + uint32_t label, + uint32_t offset) +{ + return ivf_pq::detail::unpack_list_data(res, index, out_codes, label, offset); +} + +/** + * @brief Unpack a series of records of a single list (cluster) in the compressed index + * by their in-list offsets, one code per byte (independently of pq_bits). + * + * Usage example: + * @code{.cpp} + * // We will unpack the fourth cluster + * uint32_t label = 3; + * // Create the selection vector + * auto selected_indices = raft::make_device_vector(res, 4); + * ... fill the indices ... + * res.sync_stream(); + * // allocate the buffer for the output + * auto codes = raft::make_device_matrix(res, selected_indices.size(), index.pq_dim()); + * // decode the whole list + * ivf_pq::helpers::unpack_list_data( + * res, index, selected_indices.view(), codes.view(), label); + * @endcode + * + * @tparam IdxT type of the indices in the source dataset + * + * @param[in] res + * @param[in] index + * @param[in] in_cluster_indices + * The offsets of the selected indices within the cluster. + * @param[out] out_codes + * the destination buffer [n_take, index.pq_dim()]. + * The length `n_take` defines how many records to unpack, + * it must be smaller than the list size. + * @param[in] label + * The id of the list (cluster) to decode. + */ +template +void unpack_list_data(raft::device_resources const& res, + const index& index, + device_vector_view in_cluster_indices, + device_matrix_view out_codes, + uint32_t label) +{ + return ivf_pq::detail::unpack_list_data(res, index, out_codes, label, in_cluster_indices); +} + +/** + * @brief Decode `n_take` consecutive records of a single list (cluster) in the compressed index + * starting at given `offset`. + * + * Usage example: + * @code{.cpp} + * // We will reconstruct the fourth cluster + * uint32_t label = 3; + * // Get the list size + * uint32_t list_size = 0; + * raft::copy(&list_size, index.list_sizes().data_handle() + label, 1, res.get_stream()); + * res.sync_stream(); + * // allocate the buffer for the output + * auto decoded_vectors = raft::make_device_matrix(res, list_size, index.dim()); + * // decode the whole list + * ivf_pq::helpers::reconstruct_list_data(res, index, decoded_vectors.view(), label, 0); + * @endcode + * + * @tparam T data element type + * @tparam IdxT type of the indices in the source dataset + * + * @param[in] res + * @param[in] index + * @param[out] out_vectors + * the destination buffer [n_take, index.dim()]. + * The length `n_take` defines how many records to reconstruct, + * it must be smaller than the list size. + * @param[in] label + * The id of the list (cluster) to decode. + * @param[in] offset + * How many records in the list to skip. + */ +template +void reconstruct_list_data(raft::device_resources const& res, + const index& index, + device_matrix_view out_vectors, + uint32_t label, + uint32_t offset) +{ + return ivf_pq::detail::reconstruct_list_data(res, index, out_vectors, label, offset); +} + +/** + * @brief Decode a series of records of a single list (cluster) in the compressed index + * by their in-list offsets. + * + * Usage example: + * @code{.cpp} + * // We will reconstruct the fourth cluster + * uint32_t label = 3; + * // Create the selection vector + * auto selected_indices = raft::make_device_vector(res, 4); + * ... fill the indices ... + * res.sync_stream(); + * // allocate the buffer for the output + * auto decoded_vectors = raft::make_device_matrix( + * res, selected_indices.size(), index.dim()); + * // decode the whole list + * ivf_pq::helpers::reconstruct_list_data( + * res, index, selected_indices.view(), decoded_vectors.view(), label); + * @endcode + * + * @tparam T data element type + * @tparam IdxT type of the indices in the source dataset + * + * @param[in] res + * @param[in] index + * @param[in] in_cluster_indices + * The offsets of the selected indices within the cluster. + * @param[out] out_vectors + * the destination buffer [n_take, index.dim()]. + * The length `n_take` defines how many records to reconstruct, + * it must be smaller than the list size. + * @param[in] label + * The id of the list (cluster) to decode. + */ +template +void reconstruct_list_data(raft::device_resources const& res, + const index& index, + device_vector_view in_cluster_indices, + device_matrix_view out_vectors, + uint32_t label) +{ + return ivf_pq::detail::reconstruct_list_data(res, index, out_vectors, label, in_cluster_indices); +} + +/** + * @brief Extend one list of the index in-place, by the list label, skipping the classification and + * encoding steps. + * + * Usage example: + * @code{.cpp} + * // We will extend the fourth cluster + * uint32_t label = 3; + * // We will fill 4 new vectors + * uint32_t n_vec = 4; + * // Indices of the new vectors + * auto indices = raft::make_device_vector(res, n_vec); + * ... fill the indices ... + * auto new_codes = raft::make_device_matrix new_codes( + * res, n_vec, index.pq_dim()); + * ... fill codes ... + * // extend list with new codes + * ivf_pq::helpers::extend_list_with_codes( + * res, &index, codes.view(), indices.view(), label); + * @endcode + * + * @tparam IdxT + * + * @param[in] res + * @param[inout] index + * @param[in] new_codes flat PQ codes, one code per byte [n_rows, index.pq_dim()] + * @param[in] new_indices source indices [n_rows] + * @param[in] label the id of the target list (cluster). + */ +template +void extend_list_with_codes(raft::device_resources const& res, + index* index, + device_matrix_view new_codes, + device_vector_view new_indices, + uint32_t label) +{ + ivf_pq::detail::extend_list_with_codes(res, index, new_codes, new_indices, label); +} + +/** + * @brief Extend one list of the index in-place, by the list label, skipping the classification + * step. + * + * Usage example: + * @code{.cpp} + * // We will extend the fourth cluster + * uint32_t label = 3; + * // We will extend with 4 new vectors + * uint32_t n_vec = 4; + * // Indices of the new vectors + * auto indices = raft::make_device_vector(res, n_vec); + * ... fill the indices ... + * auto new_vectors = raft::make_device_matrix new_codes( + * res, n_vec, index.dim()); + * ... fill vectors ... + * // extend list with new vectors + * ivf_pq::helpers::extend_list( + * res, &index, new_vectors.view(), indices.view(), label); + * @endcode + * + * @tparam T + * @tparam IdxT + * + * @param[in] res + * @param[inout] index + * @param[in] new_vectors data to encode [n_rows, index.dim()] + * @param[in] new_indices source indices [n_rows] + * @param[in] label the id of the target list (cluster). + * + */ +template +void extend_list(raft::device_resources const& res, + index* index, + device_matrix_view new_vectors, + device_vector_view new_indices, + uint32_t label) +{ + ivf_pq::detail::extend_list(res, index, new_vectors, new_indices, label); +} + +/** + * @brief Remove all data from a single list (cluster) in the index. + * + * Usage example: + * @code{.cpp} + * // We will erase the fourth cluster (label = 3) + * ivf_pq::helpers::erase_list(res, &index, 3); + * @endcode + * + * @tparam IdxT + * @param[in] res + * @param[inout] index + * @param[in] label the id of the target list (cluster). + */ +template +void erase_list(raft::device_resources const& res, index* index, uint32_t label) +{ + ivf_pq::detail::erase_list(res, index, label); +} + +} // namespace raft::neighbors::ivf_pq::helpers diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh index edc82f4172..101509a878 100644 --- a/cpp/test/neighbors/ann_ivf_pq.cuh +++ b/cpp/test/neighbors/ann_ivf_pq.cuh @@ -26,6 +26,7 @@ #include #include #include +#include #include #ifdef RAFT_COMPILED #include @@ -268,7 +269,7 @@ class ivf_pq_test : public ::testing::TestWithParam { auto rec_data = make_device_matrix(handle_, n_take, dim); auto orig_data = make_device_matrix(handle_, n_take, dim); - ivf_pq::reconstruct_list_data(handle_, index, rec_data.view(), label, n_skip); + ivf_pq::helpers::reconstruct_list_data(handle_, index, rec_data.view(), label, n_skip); matrix::gather(database.data(), IdxT{dim}, @@ -293,11 +294,12 @@ class ivf_pq_test : public ::testing::TestWithParam { auto indices = make_device_vector(handle_, n_rows); copy(indices.data_handle(), old_list->indices.data_handle(), n_rows, stream_); - ivf_pq::reconstruct_list_data(handle_, *index, vectors_1.view(), label, 0); - ivf_pq::erase_list(handle_, index, label); + ivf_pq::helpers::reconstruct_list_data(handle_, *index, vectors_1.view(), label, 0); + ivf_pq::helpers::erase_list(handle_, index, label); // NB: passing the type parameter because const->non-const implicit conversion of the mdspans // breaks type inference - ivf_pq::extend_list(handle_, index, vectors_1.view(), indices.view(), label); + ivf_pq::helpers::extend_list( + handle_, index, vectors_1.view(), indices.view(), label); auto& new_list = index->lists()[label]; ASSERT_NE(old_list.get(), new_list.get()) @@ -305,7 +307,7 @@ class ivf_pq_test : public ::testing::TestWithParam { "corresponding cluster."; auto vectors_2 = make_device_matrix(handle_, n_rows, index->dim()); - ivf_pq::reconstruct_list_data(handle_, *index, vectors_2.view(), label, 0); + ivf_pq::helpers::reconstruct_list_data(handle_, *index, vectors_2.view(), label, 0); // The code search is unstable, and there's high chance of repeating values of the lvl-2 codes. // Hence, encoding-decoding chain often leads to altering both the PQ codes and the // reconstructed data. @@ -324,9 +326,10 @@ class ivf_pq_test : public ::testing::TestWithParam { auto indices = make_device_vector(handle_, n_rows); copy(indices.data_handle(), old_list->indices.data_handle(), n_rows, stream_); - ivf_pq::unpack_list_data(handle_, *index, codes.view(), label, 0); - ivf_pq::erase_list(handle_, index, label); - ivf_pq::extend_list_with_codes(handle_, index, codes.view(), indices.view(), label); + ivf_pq::helpers::unpack_list_data(handle_, *index, codes.view(), label, 0); + ivf_pq::helpers::erase_list(handle_, index, label); + ivf_pq::helpers::extend_list_with_codes( + handle_, index, codes.view(), indices.view(), label); auto& new_list = index->lists()[label]; ASSERT_NE(old_list.get(), new_list.get()) From 926f510aab3ec16844b9b18b4c04fcc51e8f1f7d Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Fri, 31 Mar 2023 22:16:03 +0200 Subject: [PATCH 22/25] Add public API for pack_list_data --- cpp/include/raft/neighbors/ivf_pq_helpers.cuh | 34 +++++++++++++++++++ cpp/test/neighbors/ann_ivf_pq.cuh | 11 ++++++ 2 files changed, 45 insertions(+) diff --git a/cpp/include/raft/neighbors/ivf_pq_helpers.cuh b/cpp/include/raft/neighbors/ivf_pq_helpers.cuh index b51f0696d0..4655c60916 100644 --- a/cpp/include/raft/neighbors/ivf_pq_helpers.cuh +++ b/cpp/include/raft/neighbors/ivf_pq_helpers.cuh @@ -29,6 +29,40 @@ namespace raft::neighbors::ivf_pq::helpers { * @{ */ +/** + * Write flat PQ codes into an existing list by the given offset. + * + * The list is identified by its label. + * + * NB: no memory allocation happens here; the list must fit the data (offset + n_vec). + * + * Usage example: + * @code{.cpp} + * // We will write into the 137th cluster + * uint32_t label = 137; + * // allocate the buffer for the input codes + * auto codes = raft::make_device_matrix(res, n_vec, index.pq_dim()); + * ... prepare n_vecs to pack into the list in codes ... + * // write codes into the list starting from the 42nd position + * ivf_pq::helpers::pack_list_data(res, &index, codes_to_pack, label, 42); + * @endcode + * + * @param[in] res + * @param[inout] index IVF-PQ index. + * @param[in] codes flat PQ codes, one code per byte [n_rows, pq_dim] + * @param[in] label The id of the list (cluster) into which we write. + * @param[in] offset how many records to skip before writing the data into the list + */ +template +void pack_list_data(raft::device_resources const& res, + index* index, + device_matrix_view codes, + uint32_t label, + uint32_t offset) +{ + ivf_pq::detail::pack_list_data(res, index, codes, label, offset); +} + /** * @brief Unpack `n_take` consecutive records of a single list (cluster) in the compressed index * starting at given `offset`, one code per byte (independently of pq_bits). diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh index 101509a878..37534c0cc9 100644 --- a/cpp/test/neighbors/ann_ivf_pq.cuh +++ b/cpp/test/neighbors/ann_ivf_pq.cuh @@ -338,6 +338,17 @@ class ivf_pq_test : public ::testing::TestWithParam { ASSERT_TRUE(devArrMatch( old_list->data.data_handle(), new_list->data.data_handle(), n_rows, Compare{})); + + // Pack a few vectors back to the list. + int row_offset = 9; + int n_vec = 3; + ASSERT_TRUE(row_offset + n_vec < n_rows); + size_t offset = row_offset * index->pq_dim(); + auto codes_to_pack = make_device_matrix_view( + codes.data_handle() + offset, n_vec, index->pq_dim()); + ivf_pq::helpers::pack_list_data(handle_, index, codes_to_pack, label, row_offset); + ASSERT_TRUE(devArrMatch( + old_list->data.data_handle(), new_list->data.data_handle(), n_rows, Compare{})); } template From 8e311d9876b375267d70d9abbd23fb0857f2b7b4 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Sun, 2 Apr 2023 21:39:39 +0200 Subject: [PATCH 23/25] Increase tolarance for vector reconstruction test --- cpp/test/neighbors/ann_ivf_pq.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh index 37534c0cc9..b2528bbd47 100644 --- a/cpp/test/neighbors/ann_ivf_pq.cuh +++ b/cpp/test/neighbors/ann_ivf_pq.cuh @@ -139,7 +139,7 @@ void compare_vectors_l2( double d = dist(i); // The theoretical estimate of the error is hard to come up with, // the estimate below is based on experimentation + curse of dimensionality - ASSERT_LE(d, eps * std::pow(2.0, compression_ratio)) + ASSERT_LE(d, 1.2 * eps * std::pow(2.0, compression_ratio)) << " (label = " << label << ", ix = " << i << ", eps = " << eps << ")"; } } From 280c040149a32f2248e2938f3dc87cfdf74cb7d2 Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Mon, 3 Apr 2023 12:06:41 +0200 Subject: [PATCH 24/25] Correct number of list elements to compare --- cpp/test/neighbors/ann_ivf_pq.cuh | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh index b2528bbd47..707392efdf 100644 --- a/cpp/test/neighbors/ann_ivf_pq.cuh +++ b/cpp/test/neighbors/ann_ivf_pq.cuh @@ -335,9 +335,15 @@ class ivf_pq_test : public ::testing::TestWithParam { ASSERT_NE(old_list.get(), new_list.get()) << "The old list should have been shared and retained after ivf_pq index has erased the " "corresponding cluster."; + auto list_data_size = (n_rows / ivf_pq::kIndexGroupSize) * new_list->data.extent(1) * + new_list->data.extent(2) * new_list->data.extent(3); - ASSERT_TRUE(devArrMatch( - old_list->data.data_handle(), new_list->data.data_handle(), n_rows, Compare{})); + ASSERT_TRUE(old_list->data.size() >= list_data_size); + ASSERT_TRUE(new_list->data.size() >= list_data_size); + ASSERT_TRUE(devArrMatch(old_list->data.data_handle(), + new_list->data.data_handle(), + list_data_size, + Compare{})); // Pack a few vectors back to the list. int row_offset = 9; @@ -347,8 +353,10 @@ class ivf_pq_test : public ::testing::TestWithParam { auto codes_to_pack = make_device_matrix_view( codes.data_handle() + offset, n_vec, index->pq_dim()); ivf_pq::helpers::pack_list_data(handle_, index, codes_to_pack, label, row_offset); - ASSERT_TRUE(devArrMatch( - old_list->data.data_handle(), new_list->data.data_handle(), n_rows, Compare{})); + ASSERT_TRUE(devArrMatch(old_list->data.data_handle(), + new_list->data.data_handle(), + list_data_size, + Compare{})); } template From a7e54196e97431345be1a676bfa1e1feeee07eda Mon Sep 17 00:00:00 2001 From: Tamas Bela Feher Date: Wed, 5 Apr 2023 14:11:04 +0200 Subject: [PATCH 25/25] Add ivf_pq::helpers::codepacker::pack / unpack --- cpp/include/raft/neighbors/ivf_pq_helpers.cuh | 78 ++++++++++++++++++- cpp/test/neighbors/ann_ivf_pq.cuh | 16 ++++ 2 files changed, 92 insertions(+), 2 deletions(-) diff --git a/cpp/include/raft/neighbors/ivf_pq_helpers.cuh b/cpp/include/raft/neighbors/ivf_pq_helpers.cuh index 4655c60916..398bd545f1 100644 --- a/cpp/include/raft/neighbors/ivf_pq_helpers.cuh +++ b/cpp/include/raft/neighbors/ivf_pq_helpers.cuh @@ -23,12 +23,85 @@ #include namespace raft::neighbors::ivf_pq::helpers { - /** * @defgroup ivf_pq_helpers Helper functions for manipulationg IVF PQ Index * @{ */ +namespace codepacker { +/** + * @brief Unpack `n_take` consecutive records of a single list (cluster) in the compressed index + * starting at given `offset`. + * + * Bit compression is removed, which means output will have pq_dim dimensional vectors (one code per + * byte, instead of ceildiv(pq_dim * pq_bits, 8) bytes of pq codes). + * + * Usage example: + * @code{.cpp} + * auto list_data = index.lists()[label]->data.view(); + * // allocate the buffer for the output + * uint32_t n_take = 4; + * auto codes = raft::make_device_matrix(res, n_take, index.pq_dim()); + * uint32_t offset = 0; + * // unpack n_take elements from the list + * ivf_pq::helpers::codepacker::unpack(res, list_data, index.pq_bits(), offset, codes.view()); + * @endcode + * + * @tparam IdxT type of the indices in the source dataset + * + * @param[in] res raft resource + * @param[in] list_data block to read from + * @param[in] pq_bits bit length of encoded vector elements + * @param[in] offset + * How many records in the list to skip. + * @param[out] codes + * the destination buffer [n_take, index.pq_dim()]. + * The length `n_take` defines how many records to unpack, + * it must be smaller than the list size. + */ +inline void unpack( + raft::device_resources const& res, + device_mdspan::list_extents, row_major> list_data, + uint32_t pq_bits, + uint32_t offset, + device_matrix_view codes) +{ + ivf_pq::detail::unpack_list_data(codes, list_data, offset, pq_bits, res.get_stream()); +} + +/** + * Write flat PQ codes into an existing list by the given offset. + * + * NB: no memory allocation happens here; the list must fit the data (offset + n_vec). + * + * Usage example: + * @code{.cpp} + * auto list_data = index.lists()[label]->data.view(); + * // allocate the buffer for the input codes + * auto codes = raft::make_device_matrix(res, n_vec, index.pq_dim()); + * ... prepare n_vecs to pack into the list in codes ... + * // write codes into the list starting from the 42nd position + * ivf_pq::helpers::codepacker::pack( + * res, make_const_mdspan(codes.view()), index.pq_bits(), 42, list_data); + * @endcode + * + * @param[in] res + * @param[in] codes flat PQ codes, one code per byte [n_vec, pq_dim] + * @param[in] pq_bits bit length of encoded vector elements + * @param[in] offset how many records to skip before writing the data into the list + * @param[in] list_data block to write into + */ +inline void pack( + raft::device_resources const& res, + device_matrix_view codes, + uint32_t pq_bits, + uint32_t offset, + device_mdspan::list_extents, row_major> list_data) +{ + ivf_pq::detail::pack_list_data(list_data, codes, offset, pq_bits, res.get_stream()); +} +} // namespace codepacker + /** * Write flat PQ codes into an existing list by the given offset. * @@ -41,7 +114,7 @@ namespace raft::neighbors::ivf_pq::helpers { * // We will write into the 137th cluster * uint32_t label = 137; * // allocate the buffer for the input codes - * auto codes = raft::make_device_matrix(res, n_vec, index.pq_dim()); + * auto codes = raft::make_device_matrix(res, n_vec, index.pq_dim()); * ... prepare n_vecs to pack into the list in codes ... * // write codes into the list starting from the 42nd position * ivf_pq::helpers::pack_list_data(res, &index, codes_to_pack, label, 42); @@ -332,4 +405,5 @@ void erase_list(raft::device_resources const& res, index* index, uint32_t ivf_pq::detail::erase_list(res, index, label); } +/** @} */ } // namespace raft::neighbors::ivf_pq::helpers diff --git a/cpp/test/neighbors/ann_ivf_pq.cuh b/cpp/test/neighbors/ann_ivf_pq.cuh index 707392efdf..f90b6fb318 100644 --- a/cpp/test/neighbors/ann_ivf_pq.cuh +++ b/cpp/test/neighbors/ann_ivf_pq.cuh @@ -357,6 +357,22 @@ class ivf_pq_test : public ::testing::TestWithParam { new_list->data.data_handle(), list_data_size, Compare{})); + + // Another test with the API that take list_data directly + auto list_data = index->lists()[label]->data.view(); + uint32_t n_take = 4; + ASSERT_TRUE(row_offset + n_take < n_rows); + auto codes2 = raft::make_device_matrix(handle_, n_take, index->pq_dim()); + ivf_pq::helpers::codepacker::unpack( + handle_, list_data, index->pq_bits(), row_offset, codes2.view()); + + // Write it back + ivf_pq::helpers::codepacker::pack( + handle_, make_const_mdspan(codes2.view()), index->pq_bits(), row_offset, list_data); + ASSERT_TRUE(devArrMatch(old_list->data.data_handle(), + new_list->data.data_handle(), + list_data_size, + Compare{})); } template