From 138303922ff0eb048398e2b7f7639ba8cd52babb Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Wed, 8 Sep 2021 14:43:29 +0200 Subject: [PATCH 1/5] fix #1151 Fixes a off by one error in the stats for the index fast field in the multi value fast field. When retrieving the data range for a docid, `get(doc)..get(docid+1)` is requested. On creation the num_vals statistic was set to doc instead of docid + 1. In the multivaluelinearinterpol fast field the last value was therefore not serialized (and would return 0 instead in most cases). So the last document get(lastdoc)..get(lastdoc + 1) would return the invalid range `value..0`. This PR adds a proptest to cover this scenario. A combination of a large number values, since multilinear interpolation is only active for more than 5_000 values, and a merge is required. --- Cargo.toml | 2 + src/fastfield/multivalued/mod.rs | 113 +++++++++++++++++++++++++++++++ src/indexer/merger.rs | 2 + 3 files changed, 117 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index 5d9414beb8..8bdb09c53c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -65,6 +65,8 @@ maplit = "1.0.2" matches = "0.1.8" proptest = "1.0" criterion = "0.3.5" +test-env-log = "0.2.7" +env_logger = "0.9.0" [dev-dependencies.fail] version = "0.4" diff --git a/src/fastfield/multivalued/mod.rs b/src/fastfield/multivalued/mod.rs index d03313ddcc..832437f962 100644 --- a/src/fastfield/multivalued/mod.rs +++ b/src/fastfield/multivalued/mod.rs @@ -8,14 +8,22 @@ pub use self::writer::MultiValuedFastFieldWriter; mod tests { use crate::collector::TopDocs; + use crate::indexer::NoMergePolicy; use crate::query::QueryParser; use crate::schema::Cardinality; use crate::schema::Facet; use crate::schema::IntOptions; use crate::schema::Schema; use crate::schema::INDEXED; + use crate::Document; use crate::Index; + use crate::Term; use chrono::Duration; + use futures::executor::block_on; + use proptest::prop_oneof; + use proptest::proptest; + use proptest::strategy::Strategy; + use test_env_log::test; #[test] fn test_multivalued_u64() { @@ -225,6 +233,111 @@ mod tests { multi_value_reader.get_vals(3, &mut vals); assert_eq!(&vals, &[-5i64, -20i64, 1i64]); } + + fn test_multivalued_no_panic(ops: &[IndexingOp]) { + let mut schema_builder = Schema::builder(); + let field = schema_builder.add_u64_field( + "multifield", + IntOptions::default() + .set_fast(Cardinality::MultiValues) + .set_indexed(), + ); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let mut index_writer = index.writer_for_tests().unwrap(); + index_writer.set_merge_policy(Box::new(NoMergePolicy)); + + for &op in ops { + match op { + IndexingOp::AddDoc { id } => { + match id % 3 { + 0 => { + index_writer.add_document(doc!()); + } + 1 => { + let mut doc = Document::new(); + for _ in 0..5001 { + doc.add_u64(field, id as u64); + } + index_writer.add_document(doc); + } + _ => { + let mut doc = Document::new(); + doc.add_u64(field, id as u64); + index_writer.add_document(doc); + } + }; + } + IndexingOp::DeleteDoc { id } => { + index_writer.delete_term(Term::from_field_u64(field, id as u64)); + } + IndexingOp::Commit => { + index_writer.commit().unwrap(); + } + IndexingOp::Merge => { + let segment_ids = index + .searchable_segment_ids() + .expect("Searchable segments failed."); + if !segment_ids.is_empty() { + block_on(index_writer.merge(&segment_ids)).unwrap(); + assert!(index_writer.segment_updater().wait_merging_thread().is_ok()); + } + } + } + } + + assert!(index_writer.commit().is_ok()); + + // Merging the segments + { + let segment_ids = index + .searchable_segment_ids() + .expect("Searchable segments failed."); + if !segment_ids.is_empty() { + block_on(index_writer.merge(&segment_ids)).unwrap(); + assert!(index_writer.wait_merging_threads().is_ok()); + } + } + } + + #[derive(Debug, Clone, Copy)] + enum IndexingOp { + AddDoc { id: u32 }, + DeleteDoc { id: u32 }, + Commit, + Merge, + } + + fn operation_strategy() -> impl Strategy { + prop_oneof![ + (0u32..10u32).prop_map(|id| IndexingOp::DeleteDoc { id }), + (0u32..10u32).prop_map(|id| IndexingOp::AddDoc { id }), + (0u32..2u32).prop_map(|_| IndexingOp::Commit), + (0u32..1u32).prop_map(|_| IndexingOp::Merge), + ] + } + + proptest! { + #[test] + fn test_multivalued_proptest(ops in proptest::collection::vec(operation_strategy(), 1..10)) { + test_multivalued_no_panic(&ops[..]); + } + } + + #[test] + fn test_multivalued_proptest_off_by_one_bug_1151() { + use IndexingOp::*; + let ops = [ + AddDoc { id: 3 }, + AddDoc { id: 1 }, + AddDoc { id: 3 }, + Commit, + Merge, + ]; + + test_multivalued_no_panic(&ops[..]); + } + #[test] #[ignore] fn test_many_facets() { diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 295a5fde66..0402265068 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -516,6 +516,8 @@ impl IndexMerger { total_num_vals += u64s_reader.get_total_len(); } } + // analougus to offset.push below, for the (last docid .. last docid + 1) range + idx_num_vals += 1; let stats = FastFieldStats { max_value: total_num_vals, From 10aabf2f7c06cf3dbce82142be7abf68fdfb71fd Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 10 Sep 2021 12:58:11 +0200 Subject: [PATCH 2/5] add randomized merges in test --- src/fastfield/multivalued/mod.rs | 2 +- src/indexer/index_writer.rs | 17 +++++++++++++++-- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/fastfield/multivalued/mod.rs b/src/fastfield/multivalued/mod.rs index 832437f962..6c07340aab 100644 --- a/src/fastfield/multivalued/mod.rs +++ b/src/fastfield/multivalued/mod.rs @@ -278,7 +278,7 @@ mod tests { let segment_ids = index .searchable_segment_ids() .expect("Searchable segments failed."); - if !segment_ids.is_empty() { + if segment_ids.len() >= 2 { block_on(index_writer.merge(&segment_ids)).unwrap(); assert!(index_writer.segment_updater().wait_merging_thread().is_ok()); } diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index f498099b9f..30dd7f4f15 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -1361,6 +1361,7 @@ mod tests { AddDoc { id: u64 }, DeleteDoc { id: u64 }, Commit, + Merge, } fn operation_strategy() -> impl Strategy { @@ -1368,6 +1369,7 @@ mod tests { (0u64..10u64).prop_map(|id| IndexingOp::DeleteDoc { id }), (0u64..10u64).prop_map(|id| IndexingOp::AddDoc { id }), (0u64..2u64).prop_map(|_| IndexingOp::Commit), + (0u64..1u64).prop_map(|_| IndexingOp::Merge), ] } @@ -1393,7 +1395,7 @@ mod tests { fn test_operation_strategy( ops: &[IndexingOp], sort_index: bool, - force_merge: bool, + force_end_merge: bool, ) -> crate::Result<()> { let mut schema_builder = schema::Schema::builder(); let id_field = schema_builder.add_u64_field("id", FAST | INDEXED | STORED); @@ -1435,6 +1437,8 @@ mod tests { .settings(settings) .create_in_ram()?; let mut index_writer = index.writer_for_tests()?; + index_writer.set_merge_policy(Box::new(NoMergePolicy)); + for &op in ops { match op { IndexingOp::AddDoc { id } => { @@ -1448,12 +1452,21 @@ mod tests { IndexingOp::Commit => { index_writer.commit()?; } + IndexingOp::Merge => { + let segment_ids = index + .searchable_segment_ids() + .expect("Searchable segments failed."); + if segment_ids.len() >= 2 { + block_on(index_writer.merge(&segment_ids)).unwrap(); + assert!(index_writer.segment_updater().wait_merging_thread().is_ok()); + } + } } } index_writer.commit()?; let searcher = index.reader()?.searcher(); - if force_merge { + if force_end_merge { index_writer.wait_merging_threads()?; let mut index_writer = index.writer_for_tests()?; let segment_ids = index From 2a7ea40b249359d924560a44f48056ab9c77e951 Mon Sep 17 00:00:00 2001 From: PSeitz Date: Fri, 10 Sep 2021 13:40:43 +0200 Subject: [PATCH 3/5] explicit range Co-authored-by: Paul Masurel --- src/indexer/merger.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 0402265068..5e24be1130 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -516,7 +516,7 @@ impl IndexMerger { total_num_vals += u64s_reader.get_total_len(); } } - // analougus to offset.push below, for the (last docid .. last docid + 1) range + // analogous to offset.push below, for the `[last_docid .. last docid + 1)` range idx_num_vals += 1; let stats = FastFieldStats { From 75ef94b67087c1431eae1d33670bdea763d54481 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Fri, 10 Sep 2021 14:09:28 +0200 Subject: [PATCH 4/5] rename to num_docs --- src/indexer/merger.rs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 5e24be1130..6eb8c7fe22 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -501,10 +501,10 @@ impl IndexMerger { // // This is required by the bitpacker, as it needs to know // what should be the bit length use for bitpacking. - let mut idx_num_vals = 0; + let mut num_docs = 0; for (reader, u64s_reader) in reader_and_field_accessors.iter() { if let Some(delete_bitset) = reader.delete_bitset() { - idx_num_vals += reader.max_doc() as u64 - delete_bitset.len() as u64; + num_docs += reader.max_doc() as u64 - delete_bitset.len() as u64; for doc in 0u32..reader.max_doc() { if delete_bitset.is_alive(doc) { let num_vals = u64s_reader.get_len(doc) as u64; @@ -512,16 +512,17 @@ impl IndexMerger { } } } else { - idx_num_vals += reader.max_doc() as u64; + num_docs += reader.max_doc() as u64; total_num_vals += u64s_reader.get_total_len(); } } // analogous to offset.push below, for the `[last_docid .. last docid + 1)` range - idx_num_vals += 1; + num_docs += 1; let stats = FastFieldStats { max_value: total_num_vals, - num_vals: idx_num_vals, + // The fastfield offset index contains (num_docs + 1) values. + num_vals: num_docs, min_value: 0, }; // We can now create our `idx` serializer, and in a second pass, From 896cdd6cd06efc367f573a6de542de507996ca85 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Fri, 10 Sep 2021 22:12:21 +0900 Subject: [PATCH 5/5] Apply suggestions from code review --- src/indexer/merger.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 6eb8c7fe22..ef3874256f 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -516,13 +516,11 @@ impl IndexMerger { total_num_vals += u64s_reader.get_total_len(); } } - // analogous to offset.push below, for the `[last_docid .. last docid + 1)` range - num_docs += 1; let stats = FastFieldStats { max_value: total_num_vals, // The fastfield offset index contains (num_docs + 1) values. - num_vals: num_docs, + num_vals: num_docs + 1, min_value: 0, }; // We can now create our `idx` serializer, and in a second pass,