Skip to content

Commit

Permalink
sparse: accelerate the writing of index files (zilliztech#1017)
Browse files Browse the repository at this point in the history
Constructing the index file with the DAAT (document-at-a-time) cursor
method is too slow. Switch to the TAAT (term-at-a-time) traversal method instead.

Signed-off-by: Shawn Wang <shawn.wang@zilliz.com>
  • Loading branch information
sparknack authored and cqy123456 committed Jan 24, 2025
1 parent fc1300d commit a65a851
Showing 1 changed file with 26 additions and 14 deletions.
40 changes: 26 additions & 14 deletions src/index/sparse/sparse_inverted_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -181,25 +181,37 @@ class InvertedIndex : public BaseInvertedIndex<DType> {
}

auto dim_map_reverse = std::unordered_map<uint32_t, table_t>();
for (auto dim_it = dim_map_.begin(); dim_it != dim_map_.end(); ++dim_it) {
dim_map_reverse[dim_it->second] = dim_it->first;
for (const auto& [dim, idx] : dim_map_) {
dim_map_reverse[idx] = dim;
}

for (table_t vec_id = 0; vec_id < n_rows_internal_; ++vec_id) {
std::vector<std::pair<table_t, DType>> vec_row;
for (size_t i = 0; i < inverted_index_ids_.size(); ++i) {
if (cursors[i].cur_vec_id_ == vec_id) {
vec_row.emplace_back(dim_map_reverse[i], cursors[i].cur_vec_val());
cursors[i].next();
}
std::vector<size_t> row_sizes(n_rows_internal_, 0);
for (size_t i = 0; i < inverted_index_ids_.size(); ++i) {
for (const auto& id : inverted_index_ids_[i]) {
row_sizes[id]++;
}
}

SparseRow<DType> raw_row(vec_row);
writeBinaryPOD(writer, raw_row.size());
if (raw_row.size() == 0) {
continue;
std::vector<SparseRow<DType>> raw_rows(n_rows_internal_);
for (size_t i = 0; i < n_rows_internal_; ++i) {
raw_rows[i] = std::move(SparseRow<DType>(row_sizes[i]));
}

for (size_t i = 0; i < inverted_index_ids_.size(); ++i) {
const auto& ids = inverted_index_ids_[i];
const auto& vals = inverted_index_vals_[i];
const auto dim = dim_map_reverse[i];
for (size_t j = 0; j < ids.size(); ++j) {
raw_rows[ids[j]].set_at(raw_rows[ids[j]].size() - row_sizes[ids[j]], dim, vals[j]);
--row_sizes[ids[j]];
}
}

for (table_t vec_id = 0; vec_id < n_rows_internal_; ++vec_id) {
writeBinaryPOD(writer, raw_rows[vec_id].size());
if (raw_rows[vec_id].size() > 0) {
writer.write(raw_rows[vec_id].data(), raw_rows[vec_id].size() * SparseRow<DType>::element_size());
}
writer.write(raw_row.data(), raw_row.size() * SparseRow<DType>::element_size());
}

return Status::success;
Expand Down

0 comments on commit a65a851

Please sign in to comment.