Skip to content
This repository has been archived by the owner on Apr 4, 2023. It is now read-only.

Commit

Permalink
Merge #557
Browse files Browse the repository at this point in the history
557: Fasten documents deletion and update r=Kerollmops a=irevoire

When a document deletion occurs, instead of deleting the document we mark it as deleted in the new “soft deleted” bitmap. It is then removed from the search and all the other endpoints.

I ran the benchmarks against main;
```
% ./compare.sh indexing_main_83ad1aaf.json indexing_fasten-document-deletion_abab51fb.json
group                                                                     indexing_fasten-document-deletion_abab51fb    indexing_main_83ad1aaf
-----                                                                     ------------------------------------------    ----------------------
indexing/-geo-delete-facetedNumber-facetedGeo-searchable-                 1.05      2.0±0.40ms        ? ?/sec           1.00  1904.9±190.00µs        ? ?/sec
indexing/-movies-delete-facetedString-facetedNumber-searchable-           1.00     10.3±2.64ms        ? ?/sec           961.61      9.9±0.12s        ? ?/sec
indexing/-movies-delete-facetedString-facetedNumber-searchable-nested-    1.00     15.1±3.90ms        ? ?/sec           554.63      8.4±0.12s        ? ?/sec
indexing/-songs-delete-facetedString-facetedNumber-searchable-            1.00     45.1±7.53ms        ? ?/sec           710.15     32.0±0.10s        ? ?/sec
indexing/-wiki-delete-searchable-                                         1.00    277.8±7.97ms        ? ?/sec           1946.57    540.8±3.15s        ? ?/sec
indexing/Indexing geo_point                                               1.00      12.0±0.20s        ? ?/sec           1.03      12.4±0.19s        ? ?/sec
indexing/Indexing movies in three batches                                 1.00      19.3±0.30s        ? ?/sec           1.01      19.4±0.16s        ? ?/sec
indexing/Indexing movies with default settings                            1.00      18.8±0.09s        ? ?/sec           1.00      18.9±0.10s        ? ?/sec
indexing/Indexing nested movies with default settings                     1.00      25.9±0.19s        ? ?/sec           1.00      25.9±0.12s        ? ?/sec
indexing/Indexing nested movies without any facets                        1.00      24.8±0.17s        ? ?/sec           1.00      24.8±0.18s        ? ?/sec
indexing/Indexing songs in three batches with default settings            1.00      65.9±0.96s        ? ?/sec           1.03      67.8±0.82s        ? ?/sec
indexing/Indexing songs with default settings                             1.00      58.8±1.11s        ? ?/sec           1.02      59.9±2.09s        ? ?/sec
indexing/Indexing songs without any facets                                1.00      53.4±0.72s        ? ?/sec           1.01      54.2±0.88s        ? ?/sec
indexing/Indexing songs without faceted numbers                           1.00      57.9±1.17s        ? ?/sec           1.01      58.3±1.20s        ? ?/sec
indexing/Indexing wiki                                                    1.00   1065.2±13.26s        ? ?/sec           1.00   1065.8±12.66s        ? ?/sec
indexing/Indexing wiki in three batches                                   1.00    1182.4±6.20s        ? ?/sec           1.01    1190.8±8.48s        ? ?/sec
```

Most things do not change, we lost 0.1ms on the indexing of geo point (I don’t get why), and then we are between 500 and 1900 times faster when we delete documents.


Co-authored-by: Tamo <tamo@meilisearch.com>
  • Loading branch information
bors[bot] and irevoire authored Jul 4, 2022
2 parents c6f4775 + d0dbfb4 commit 672cb02
Show file tree
Hide file tree
Showing 10 changed files with 415 additions and 284 deletions.
2 changes: 2 additions & 0 deletions milli/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ pub enum Error {

#[derive(Error, Debug)]
pub enum InternalError {
#[error("Tried to access a soft deleted documents.")]
AccessingSoftDeletedDocument { document_id: DocumentId },
#[error("{}", HeedError::DatabaseClosing)]
DatabaseClosing,
#[error("Missing {} in the {db_name} database.", key.unwrap_or("key"))]
Expand Down
11 changes: 9 additions & 2 deletions milli/src/external_documents_ids.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,30 @@ use std::{fmt, str};

use fst::map::IndexedValue;
use fst::{IntoStreamer, Streamer};
use roaring::RoaringBitmap;

const DELETED_ID: u64 = u64::MAX;

pub struct ExternalDocumentsIds<'a> {
pub(crate) hard: fst::Map<Cow<'a, [u8]>>,
pub(crate) soft: fst::Map<Cow<'a, [u8]>>,
soft_deleted_docids: RoaringBitmap,
}

impl<'a> ExternalDocumentsIds<'a> {
pub fn new(
hard: fst::Map<Cow<'a, [u8]>>,
soft: fst::Map<Cow<'a, [u8]>>,
soft_deleted_docids: RoaringBitmap,
) -> ExternalDocumentsIds<'a> {
ExternalDocumentsIds { hard, soft }
ExternalDocumentsIds { hard, soft, soft_deleted_docids }
}

pub fn into_static(self) -> ExternalDocumentsIds<'static> {
ExternalDocumentsIds {
hard: self.hard.map_data(|c| Cow::Owned(c.into_owned())).unwrap(),
soft: self.soft.map_data(|c| Cow::Owned(c.into_owned())).unwrap(),
soft_deleted_docids: self.soft_deleted_docids,
}
}

Expand All @@ -36,7 +40,9 @@ impl<'a> ExternalDocumentsIds<'a> {
pub fn get<A: AsRef<[u8]>>(&self, external_id: A) -> Option<u32> {
let external_id = external_id.as_ref();
match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) {
Some(id) if id != DELETED_ID => Some(id.try_into().unwrap()),
Some(id) if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) => {
Some(id.try_into().unwrap())
}
_otherwise => None,
}
}
Expand Down Expand Up @@ -134,6 +140,7 @@ impl Default for ExternalDocumentsIds<'static> {
ExternalDocumentsIds {
hard: fst::Map::default().map_data(Cow::Owned).unwrap(),
soft: fst::Map::default().map_data(Cow::Owned).unwrap(),
soft_deleted_docids: RoaringBitmap::new(),
}
}
}
Expand Down
40 changes: 37 additions & 3 deletions milli/src/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ pub mod main_key {
pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields";
pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key";
pub const DOCUMENTS_IDS_KEY: &str = "documents-ids";
pub const SOFT_DELETED_DOCUMENTS_IDS_KEY: &str = "soft-deleted-documents-ids";
pub const HIDDEN_FACETED_FIELDS_KEY: &str = "hidden-faceted-fields";
pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields";
pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields";
Expand Down Expand Up @@ -254,6 +255,29 @@ impl Index {
Ok(count.unwrap_or_default())
}

/* deleted documents ids */

/// Writes the soft deleted documents ids.
pub(crate) fn put_soft_deleted_documents_ids(
&self,
wtxn: &mut RwTxn,
docids: &RoaringBitmap,
) -> heed::Result<()> {
self.main.put::<_, Str, RoaringBitmapCodec>(
wtxn,
main_key::SOFT_DELETED_DOCUMENTS_IDS_KEY,
docids,
)
}

/// Returns the soft deleted documents ids.
pub(crate) fn soft_deleted_documents_ids(&self, rtxn: &RoTxn) -> heed::Result<RoaringBitmap> {
Ok(self
.main
.get::<_, Str, RoaringBitmapCodec>(rtxn, main_key::SOFT_DELETED_DOCUMENTS_IDS_KEY)?
.unwrap_or_default())
}

/* primary key */

/// Writes the documents primary key, this is the field name that is used to store the id.
Expand All @@ -280,7 +304,7 @@ impl Index {
wtxn: &mut RwTxn,
external_documents_ids: &ExternalDocumentsIds<'a>,
) -> heed::Result<()> {
let ExternalDocumentsIds { hard, soft } = external_documents_ids;
let ExternalDocumentsIds { hard, soft, .. } = external_documents_ids;
let hard = hard.as_fst().as_bytes();
let soft = soft.as_fst().as_bytes();
self.main.put::<_, Str, ByteSlice>(
Expand Down Expand Up @@ -311,7 +335,8 @@ impl Index {
Some(soft) => fst::Map::new(soft)?.map_data(Cow::Borrowed)?,
None => fst::Map::default().map_data(Cow::Owned)?,
};
Ok(ExternalDocumentsIds::new(hard, soft))
let soft_deleted_docids = self.soft_deleted_documents_ids(rtxn)?;
Ok(ExternalDocumentsIds::new(hard, soft, soft_deleted_docids))
}

/* fields ids map */
Expand Down Expand Up @@ -929,9 +954,13 @@ impl Index {
rtxn: &'t RoTxn,
ids: impl IntoIterator<Item = DocumentId>,
) -> Result<Vec<(DocumentId, obkv::KvReaderU16<'t>)>> {
let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?;
let mut documents = Vec::new();

for id in ids {
if soft_deleted_documents.contains(id) {
return Err(InternalError::AccessingSoftDeletedDocument { document_id: id })?;
}
let kv = self
.documents
.get(rtxn, &BEU32::new(id))?
Expand All @@ -947,11 +976,16 @@ impl Index {
&self,
rtxn: &'t RoTxn,
) -> Result<impl Iterator<Item = heed::Result<(DocumentId, obkv::KvReaderU16<'t>)>>> {
let soft_deleted_docids = self.soft_deleted_documents_ids(rtxn)?;

Ok(self
.documents
.iter(rtxn)?
// we cast the BEU32 to a DocumentId
.map(|document| document.map(|(id, obkv)| (id.get(), obkv))))
.map(|document| document.map(|(id, obkv)| (id.get(), obkv)))
.filter(move |document| {
document.as_ref().map_or(true, |(id, _)| !soft_deleted_docids.contains(*id))
}))
}

pub fn facets_distribution<'a>(&'a self, rtxn: &'a RoTxn) -> FacetDistribution<'a> {
Expand Down
74 changes: 52 additions & 22 deletions milli/src/search/facet/filter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,7 @@ use roaring::RoaringBitmap;

use super::FacetNumberRange;
use crate::error::{Error, UserError};
use crate::heed_codec::facet::{
FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec,
};
use crate::heed_codec::facet::FacetLevelValueF64Codec;
use crate::{
distance_between_two_points, lat_lng_to_xyz, CboRoaringBitmapCodec, FieldId, Index, Result,
};
Expand Down Expand Up @@ -266,11 +264,12 @@ impl<'a> Filter<'a> {
fn evaluate_operator(
rtxn: &heed::RoTxn,
index: &Index,
numbers_db: heed::Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
strings_db: heed::Database<FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>,
field_id: FieldId,
operator: &Condition<'a>,
) -> Result<RoaringBitmap> {
let numbers_db = index.facet_id_f64_docids;
let strings_db = index.facet_id_string_docids;

// Make sure we always bound the ranges with the field id and the level,
// as the facets values are all in the same database and prefixed by the
// field id and the level.
Expand Down Expand Up @@ -309,9 +308,7 @@ impl<'a> Filter<'a> {
let all_numbers_ids = index.number_faceted_documents_ids(rtxn, field_id)?;
let all_strings_ids = index.string_faceted_documents_ids(rtxn, field_id)?;
let operator = Condition::Equal(val.clone());
let docids = Self::evaluate_operator(
rtxn, index, numbers_db, strings_db, field_id, &operator,
)?;
let docids = Self::evaluate_operator(rtxn, index, field_id, &operator)?;
return Ok((all_numbers_ids | all_strings_ids) - docids);
}
};
Expand Down Expand Up @@ -342,17 +339,27 @@ impl<'a> Filter<'a> {
}

pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result<RoaringBitmap> {
let numbers_db = index.facet_id_f64_docids;
let strings_db = index.facet_id_string_docids;
// to avoid doing this for each recursive call we're going to do it ONCE ahead of time
let soft_deleted_documents = index.soft_deleted_documents_ids(rtxn)?;
let filterable_fields = index.filterable_fields(rtxn)?;

// and finally we delete all the soft_deleted_documents, again, only once at the very end
self.inner_evaluate(rtxn, index, &filterable_fields)
.map(|result| result - soft_deleted_documents)
}

fn inner_evaluate(
&self,
rtxn: &heed::RoTxn,
index: &Index,
filterable_fields: &HashSet<String>,
) -> Result<RoaringBitmap> {
match &self.condition {
FilterCondition::Condition { fid, op } => {
let filterable_fields = index.filterable_fields(rtxn)?;

if crate::is_faceted(fid.value(), &filterable_fields) {
if crate::is_faceted(fid.value(), filterable_fields) {
let field_ids_map = index.fields_ids_map(rtxn)?;
if let Some(fid) = field_ids_map.id(fid.value()) {
Self::evaluate_operator(rtxn, index, numbers_db, strings_db, fid, &op)
Self::evaluate_operator(rtxn, index, fid, &op)
} else {
return Ok(RoaringBitmap::new());
}
Expand All @@ -371,25 +378,47 @@ impl<'a> Filter<'a> {
return Err(fid.as_external_error(
FilterError::AttributeNotFilterable {
attribute,
filterable_fields,
filterable_fields: filterable_fields.clone(),
},
))?;
}
}
}
}
FilterCondition::Or(lhs, rhs) => {
let lhs = Self::evaluate(&(lhs.as_ref().clone()).into(), rtxn, index)?;
let rhs = Self::evaluate(&(rhs.as_ref().clone()).into(), rtxn, index)?;
let lhs = Self::inner_evaluate(
&(lhs.as_ref().clone()).into(),
rtxn,
index,
filterable_fields,
)?;
let rhs = Self::inner_evaluate(
&(rhs.as_ref().clone()).into(),
rtxn,
index,
filterable_fields,
)?;
Ok(lhs | rhs)
}
FilterCondition::And(lhs, rhs) => {
let lhs = Self::evaluate(&(lhs.as_ref().clone()).into(), rtxn, index)?;
let rhs = Self::evaluate(&(rhs.as_ref().clone()).into(), rtxn, index)?;
let lhs = Self::inner_evaluate(
&(lhs.as_ref().clone()).into(),
rtxn,
index,
filterable_fields,
)?;
if lhs.is_empty() {
return Ok(lhs);
}
let rhs = Self::inner_evaluate(
&(rhs.as_ref().clone()).into(),
rtxn,
index,
filterable_fields,
)?;
Ok(lhs & rhs)
}
FilterCondition::GeoLowerThan { point, radius } => {
let filterable_fields = index.filterable_fields(rtxn)?;
if filterable_fields.contains("_geo") {
let base_point: [f64; 2] = [point[0].parse()?, point[1].parse()?];
if !(-90.0..=90.0).contains(&base_point[0]) {
Expand Down Expand Up @@ -422,16 +451,17 @@ impl<'a> Filter<'a> {
} else {
return Err(point[0].as_external_error(FilterError::AttributeNotFilterable {
attribute: "_geo",
filterable_fields,
filterable_fields: filterable_fields.clone(),
}))?;
}
}
FilterCondition::GeoGreaterThan { point, radius } => {
let result = Self::evaluate(
let result = Self::inner_evaluate(
&FilterCondition::GeoLowerThan { point: point.clone(), radius: radius.clone() }
.into(),
rtxn,
index,
filterable_fields,
)?;
let geo_faceted_doc_ids = index.geo_faceted_documents_ids(rtxn)?;
Ok(geo_faceted_doc_ids - result)
Expand Down
2 changes: 1 addition & 1 deletion milli/src/search/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,7 @@ impl<'a> Search<'a> {
) -> Result<SearchResult> {
let mut offset = self.offset;
let mut initial_candidates = RoaringBitmap::new();
let mut excluded_candidates = RoaringBitmap::new();
let mut excluded_candidates = self.index.soft_deleted_documents_ids(self.rtxn)?;
let mut documents_ids = Vec::new();

while let Some(FinalResult { candidates, bucket_candidates, .. }) =
Expand Down
35 changes: 30 additions & 5 deletions milli/src/update/available_documents_ids.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,16 @@ pub struct AvailableDocumentsIds {
}

impl AvailableDocumentsIds {
pub fn from_documents_ids(docids: &RoaringBitmap) -> AvailableDocumentsIds {
match docids.max() {
pub fn from_documents_ids(
docids: &RoaringBitmap,
soft_deleted_docids: &RoaringBitmap,
) -> AvailableDocumentsIds {
let used_docids = docids | soft_deleted_docids;

match used_docids.max() {
Some(last_id) => {
let mut available = RoaringBitmap::from_iter(0..last_id);
available -= docids;
available -= used_docids;

let iter = match last_id.checked_add(1) {
Some(id) => id..=u32::max_value(),
Expand Down Expand Up @@ -44,7 +49,7 @@ mod tests {
#[test]
fn empty() {
let base = RoaringBitmap::new();
let left = AvailableDocumentsIds::from_documents_ids(&base);
let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new());
let right = 0..=u32::max_value();
left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
}
Expand All @@ -57,8 +62,28 @@ mod tests {
base.insert(100);
base.insert(405);

let left = AvailableDocumentsIds::from_documents_ids(&base);
let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new());
let right = (0..=u32::max_value()).filter(|&n| n != 0 && n != 10 && n != 100 && n != 405);
left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
}

#[test]
fn soft_deleted() {
let mut base = RoaringBitmap::new();
base.insert(0);
base.insert(10);
base.insert(100);
base.insert(405);

let mut soft_deleted = RoaringBitmap::new();
soft_deleted.insert(1);
soft_deleted.insert(11);
soft_deleted.insert(101);
soft_deleted.insert(406);

let left = AvailableDocumentsIds::from_documents_ids(&base, &soft_deleted);
let right =
(0..=u32::max_value()).filter(|&n| ![0, 1, 10, 11, 100, 101, 405, 406].contains(&n));
left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
}
}
10 changes: 6 additions & 4 deletions milli/src/update/clear_documents.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
documents,
} = self.index;

let empty_roaring = RoaringBitmap::default();

// We retrieve the number of documents ids that we are deleting.
let number_of_documents = self.index.number_of_documents(self.wtxn)?;
let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
Expand All @@ -43,16 +45,16 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
self.index.put_words_fst(self.wtxn, &fst::Set::default())?;
self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?;
self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?;
self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?;
self.index.put_documents_ids(self.wtxn, &empty_roaring)?;
self.index.put_soft_deleted_documents_ids(self.wtxn, &empty_roaring)?;
self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?;
self.index.delete_geo_rtree(self.wtxn)?;
self.index.delete_geo_faceted_documents_ids(self.wtxn)?;

// We clean all the faceted documents ids.
let empty = RoaringBitmap::default();
for field_id in faceted_fields {
self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &empty)?;
self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &empty)?;
self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &empty_roaring)?;
self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &empty_roaring)?;
}

// Clear the other databases.
Expand Down
Loading

0 comments on commit 672cb02

Please sign in to comment.