diff --git a/CHANGELOG.md b/CHANGELOG.md index b08955083d..caef5cb4c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,9 @@ Tantivy 0.17 ================================ - Change to non-strict schema. Ignore fields in data which are not defined in schema. Previously this returned an error. #1211 +- Facets are now always indexed. Existing indexes with indexed facets work out of the box. Indexes with facet fields that were marked with index: false will be broken (but they were already broken in a sense). (@fulmicoton) #1195 +- Bugfix for an issue that could, in theory, impact durability on some filesystems [#1224](https://github.com/quickwit-inc/tantivy/issues/1224) +- Reduce the number of fsync calls [#1225](https://github.com/quickwit-inc/tantivy/issues/1225) Tantivy 0.16.2 ================================ diff --git a/examples/faceted_search.rs b/examples/faceted_search.rs index 1026a42ed9..4b0edee14f 100644 --- a/examples/faceted_search.rs +++ b/examples/faceted_search.rs @@ -23,7 +23,7 @@ fn main() -> tantivy::Result<()> { let name = schema_builder.add_text_field("felin_name", TEXT | STORED); // this is our faceted field: its scientific classification - let classification = schema_builder.add_facet_field("classification", INDEXED); + let classification = schema_builder.add_facet_field("classification", FacetOptions::default()); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); diff --git a/examples/faceted_search_with_tweaked_score.rs b/examples/faceted_search_with_tweaked_score.rs index 638672976b..de2e2e3236 100644 --- a/examples/faceted_search_with_tweaked_score.rs +++ b/examples/faceted_search_with_tweaked_score.rs @@ -9,7 +9,7 @@ fn main() -> tantivy::Result<()> { let mut schema_builder = Schema::builder(); let title = schema_builder.add_text_field("title", STORED); - let ingredient = schema_builder.add_facet_field("ingredient", INDEXED); + let ingredient = schema_builder.add_facet_field("ingredient", FacetOptions::default()); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); diff --git a/src/collector/facet_collector.rs b/src/collector/facet_collector.rs index 99a4f74711..01701b995e 100644 --- a/src/collector/facet_collector.rs +++ b/src/collector/facet_collector.rs @@ -83,7 +83,7 @@ fn facet_depth(facet_bytes: &[u8]) -> usize { /// ```rust /// use tantivy::collector::FacetCollector; /// use tantivy::query::AllQuery; -/// use tantivy::schema::{Facet, Schema, INDEXED, TEXT}; +/// use tantivy::schema::{Facet, Schema, FacetOptions, TEXT}; /// use tantivy::{doc, Index}; /// /// fn example() -> tantivy::Result<()> { @@ -92,7 +92,7 @@ fn facet_depth(facet_bytes: &[u8]) -> usize { /// // Facet have their own specific type. /// // It is not a bad practise to put all of your /// // facet information in the same field.
-/// let facet = schema_builder.add_facet_field("facet", INDEXED); +/// let facet = schema_builder.add_facet_field("facet", FacetOptions::default()); /// let title = schema_builder.add_text_field("title", TEXT); /// let schema = schema_builder.build(); /// let index = Index::create_in_ram(schema); @@ -462,7 +462,7 @@ mod tests { use crate::collector::Count; use crate::core::Index; use crate::query::{AllQuery, QueryParser, TermQuery}; - use crate::schema::{Document, Facet, Field, IndexRecordOption, Schema, INDEXED}; + use crate::schema::{Document, Facet, FacetOptions, Field, IndexRecordOption, Schema}; use crate::Term; use rand::distributions::Uniform; use rand::prelude::SliceRandom; @@ -472,7 +472,7 @@ mod tests { #[test] fn test_facet_collector_drilldown() -> crate::Result<()> { let mut schema_builder = Schema::builder(); - let facet_field = schema_builder.add_facet_field("facet", INDEXED); + let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); @@ -533,7 +533,7 @@ mod tests { #[test] fn test_doc_unsorted_multifacet() -> crate::Result<()> { let mut schema_builder = Schema::builder(); - let facet_field = schema_builder.add_facet_field("facets", INDEXED); + let facet_field = schema_builder.add_facet_field("facets", FacetOptions::default()); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); let mut index_writer = index.writer_for_tests()?; @@ -558,7 +558,7 @@ mod tests { #[test] fn test_doc_search_by_facet() -> crate::Result<()> { let mut schema_builder = Schema::builder(); - let facet_field = schema_builder.add_facet_field("facet", INDEXED); + let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); let mut index_writer = index.writer_for_tests()?; @@ -615,7 +615,7 @@ mod tests { #[test] fn test_facet_collector_topk() { let mut schema_builder = Schema::builder(); - let facet_field = schema_builder.add_facet_field("facet", INDEXED); + let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); @@ -664,7 +664,7 @@ mod tests { #[test] fn test_facet_collector_topk_tie_break() -> crate::Result<()> { let mut schema_builder = Schema::builder(); - let facet_field = schema_builder.add_facet_field("facet", INDEXED); + let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); diff --git a/src/core/index_meta.rs b/src/core/index_meta.rs index 9a4c0b1c79..9fa7731e41 100644 --- a/src/core/index_meta.rs +++ b/src/core/index_meta.rs @@ -394,7 +394,7 @@ mod tests { let json = serde_json::ser::to_string(&index_metas).expect("serialization failed"); assert_eq!( json, - r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","tokenizer":"default"},"stored":false}}],"opstamp":0}"# + r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false}}],"opstamp":0}"# ); } } diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 
cad61dc7b9..c2dc288803 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -127,13 +127,17 @@ impl SegmentReader { self.fieldnorm_readers.get_field(field)?.ok_or_else(|| { let field_name = self.schema.get_field_name(field); let err_msg = format!( - "Field norm not found for field {:?}. Was it marked as indexed during indexing?", + "Field norm not found for field {:?}. Was the field set to record norm during indexing?", field_name ); crate::TantivyError::SchemaError(err_msg) }) } + pub(crate) fn fieldnorms_readers(&self) -> &FieldNormReaders { + &self.fieldnorm_readers + } + /// Accessor to the segment's `StoreReader`. pub fn get_store_reader(&self) -> io::Result { StoreReader::open(self.store_file.clone()) diff --git a/src/directory/directory.rs b/src/directory/directory.rs index 36c0172c59..66fa8cd96a 100644 --- a/src/directory/directory.rs +++ b/src/directory/directory.rs @@ -142,10 +142,16 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static { /// Opens a writer for the *virtual file* associated with /// a Path. /// - /// Right after this call, the file should be created - /// and any subsequent call to `open_read` for the + /// Right after this call, for the span of the execution of the program + /// the file should be created and any subsequent call to `open_read` for the /// same path should return a `FileSlice`. /// + /// However, depending on the directory implementation, + /// it might be required to call `sync_directory` to ensure + /// that the file is durably created. + /// (The semantics here are the same when dealing with + /// a posix filesystem.) + /// /// Write operations may be aggressively buffered. /// The client of this trait is responsible for calling flush /// to ensure that subsequent `read` operations @@ -176,6 +182,12 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static { /// The file may or may not previously exist. fn atomic_write(&self, path: &Path, data: &[u8]) -> io::Result<()>; + /// Sync the directory. + /// + /// This call is required to ensure that newly created files are + /// effectively stored durably. + fn sync_directory(&self) -> io::Result<()>; + /// Acquire a lock in the given directory. /// /// The method is blocking or not depending on the `Lock` object. diff --git a/src/directory/managed_directory.rs b/src/directory/managed_directory.rs index 3034558efd..381d4346f4 100644 --- a/src/directory/managed_directory.rs +++ b/src/directory/managed_directory.rs @@ -192,6 +192,7 @@ impl ManagedDirectory { for delete_file in &deleted_files { managed_paths_write.remove(delete_file); } + self.directory.sync_directory()?; save_managed_paths(self.directory.as_mut(), &meta_informations_wlock)?; } @@ -222,9 +223,22 @@ impl ManagedDirectory { .write() .expect("Managed file lock poisoned"); let has_changed = meta_wlock.managed_paths.insert(filepath.to_owned()); - if has_changed { - save_managed_paths(self.directory.as_ref(), &meta_wlock)?; + if !has_changed { + return Ok(()); + } + save_managed_paths(self.directory.as_ref(), &meta_wlock)?; + // This is not the first file we add. + // Therefore, we are sure that `.managed.json` has been already + // properly created and we do not need to sync its parent directory. + // + // (It might seem like a nicer solution to create the managed_json on the + // creation of the ManagedDirectory instance but it would actually + // prevent the use of read-only directories..) 
+ let managed_file_definitely_already_exists = meta_wlock.managed_paths.len() > 1; + if managed_file_definitely_already_exists { + return Ok(()); } + self.directory.sync_directory()?; Ok(()) } @@ -310,6 +324,11 @@ impl Directory for ManagedDirectory { fn watch(&self, watch_callback: WatchCallback) -> crate::Result { self.directory.watch(watch_callback) } + + fn sync_directory(&self) -> io::Result<()> { + self.directory.sync_directory()?; + Ok(()) + } } impl Clone for ManagedDirectory { diff --git a/src/directory/mmap_directory.rs b/src/directory/mmap_directory.rs index 3d11b0334c..80f9ce27b3 100644 --- a/src/directory/mmap_directory.rs +++ b/src/directory/mmap_directory.rs @@ -211,33 +211,6 @@ impl MmapDirectory { self.inner.root_path.join(relative_path) } - /// Sync the root directory. - /// In certain FS, this is required to persistently create - /// a file. - fn sync_directory(&self) -> Result<(), io::Error> { - let mut open_opts = OpenOptions::new(); - - // Linux needs read to be set, otherwise returns EINVAL - // write must not be set, or it fails with EISDIR - open_opts.read(true); - - // On Windows, opening a directory requires FILE_FLAG_BACKUP_SEMANTICS - // and calling sync_all() only works if write access is requested. - #[cfg(windows)] - { - use std::os::windows::fs::OpenOptionsExt; - use winapi::um::winbase; - - open_opts - .write(true) - .custom_flags(winbase::FILE_FLAG_BACKUP_SEMANTICS); - } - - let fd = open_opts.open(&self.inner.root_path)?; - fd.sync_all()?; - Ok(()) - } - /// Returns some statistical information /// about the Mmap cache. /// @@ -288,8 +261,7 @@ impl Write for SafeFileWriter { } fn flush(&mut self) -> io::Result<()> { - self.0.flush()?; - self.0.sync_all() + Ok(()) } } @@ -301,7 +273,9 @@ impl Seek for SafeFileWriter { impl TerminatingWrite for SafeFileWriter { fn terminate_ref(&mut self, _: AntiCallToken) -> io::Result<()> { - self.flush() + self.0.flush()?; + self.0.sync_data()?; + Ok(()) } } @@ -331,6 +305,7 @@ pub(crate) fn atomic_write(path: &Path, content: &[u8]) -> io::Result<()> { let mut tempfile = tempfile::Builder::new().tempfile_in(&parent_path)?; tempfile.write_all(content)?; tempfile.flush()?; + tempfile.as_file_mut().sync_data()?; tempfile.into_temp_path().persist(path)?; Ok(()) } @@ -365,22 +340,17 @@ impl Directory for MmapDirectory { /// removed before the file is deleted. fn delete(&self, path: &Path) -> result::Result<(), DeleteError> { let full_path = self.resolve_path(path); - match fs::remove_file(&full_path) { - Ok(_) => self.sync_directory().map_err(|e| DeleteError::IoError { - io_error: e, - filepath: path.to_path_buf(), - }), - Err(e) => { - if e.kind() == io::ErrorKind::NotFound { - Err(DeleteError::FileDoesNotExist(path.to_owned())) - } else { - Err(DeleteError::IoError { - io_error: e, - filepath: path.to_path_buf(), - }) + fs::remove_file(&full_path).map_err(|e| { + if e.kind() == io::ErrorKind::NotFound { + DeleteError::FileDoesNotExist(path.to_owned()) + } else { + DeleteError::IoError { + io_error: e, + filepath: path.to_path_buf(), } } - } + })?; + Ok(()) } fn exists(&self, path: &Path) -> Result { @@ -409,10 +379,13 @@ impl Directory for MmapDirectory { file.flush() .map_err(|io_error| OpenWriteError::wrap_io_error(io_error, path.to_path_buf()))?; - // Apparetntly, on some filesystem syncing the parent - // directory is required. - self.sync_directory() - .map_err(|io_err| OpenWriteError::wrap_io_error(io_err, path.to_path_buf()))?; + // Note we actually do not sync the parent directory here. 
+ // + // A newly created file, may, in some case, be created and even flushed to disk. + // and then lost... + // + // The file will only be durably written after we terminate AND + // sync_directory() is called. let writer = SafeFileWriter::new(file); Ok(BufWriter::new(Box::new(writer))) @@ -442,7 +415,7 @@ impl Directory for MmapDirectory { debug!("Atomic Write {:?}", path); let full_path = self.resolve_path(path); atomic_write(&full_path, content)?; - self.sync_directory() + Ok(()) } fn acquire_lock(&self, lock: &Lock) -> Result { @@ -468,6 +441,30 @@ impl Directory for MmapDirectory { fn watch(&self, watch_callback: WatchCallback) -> crate::Result { Ok(self.inner.watch(watch_callback)) } + + fn sync_directory(&self) -> Result<(), io::Error> { + let mut open_opts = OpenOptions::new(); + + // Linux needs read to be set, otherwise returns EINVAL + // write must not be set, or it fails with EISDIR + open_opts.read(true); + + // On Windows, opening a directory requires FILE_FLAG_BACKUP_SEMANTICS + // and calling sync_all() only works if write access is requested. + #[cfg(windows)] + { + use std::os::windows::fs::OpenOptionsExt; + use winapi::um::winbase; + + open_opts + .write(true) + .custom_flags(winbase::FILE_FLAG_BACKUP_SEMANTICS); + } + + let fd = open_opts.open(&self.inner.root_path)?; + fd.sync_data()?; + Ok(()) + } } #[cfg(test)] diff --git a/src/directory/ram_directory.rs b/src/directory/ram_directory.rs index a04c6f7478..3d33fd8d03 100644 --- a/src/directory/ram_directory.rs +++ b/src/directory/ram_directory.rs @@ -225,6 +225,10 @@ impl Directory for RamDirectory { fn watch(&self, watch_callback: WatchCallback) -> crate::Result { Ok(self.fs.write().unwrap().watch(watch_callback)) } + + fn sync_directory(&self) -> io::Result<()> { + Ok(()) + } } #[cfg(test)] diff --git a/src/fastfield/facet_reader.rs b/src/fastfield/facet_reader.rs index 54b62b70f4..ef354af614 100644 --- a/src/fastfield/facet_reader.rs +++ b/src/fastfield/facet_reader.rs @@ -84,14 +84,14 @@ impl FacetReader { mod tests { use crate::Index; use crate::{ - schema::{Facet, FacetOptions, SchemaBuilder, Value, INDEXED, STORED}, + schema::{Facet, FacetOptions, SchemaBuilder, Value, STORED}, DocAddress, Document, }; #[test] fn test_facet_only_indexed() -> crate::Result<()> { let mut schema_builder = SchemaBuilder::default(); - let facet_field = schema_builder.add_facet_field("facet", INDEXED); + let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); let mut index_writer = index.writer_for_tests()?; @@ -106,38 +106,15 @@ mod tests { facet_reader.facet_ords(0u32, &mut facet_ords); assert_eq!(&facet_ords, &[2u64]); let doc = searcher.doc(DocAddress::new(0u32, 0u32))?; - let value = doc.get_first(facet_field).and_then(Value::path); + let value = doc.get_first(facet_field).and_then(Value::facet); assert_eq!(value, None); Ok(()) } - #[test] - fn test_facet_only_stored() -> crate::Result<()> { - let mut schema_builder = SchemaBuilder::default(); - let facet_field = schema_builder.add_facet_field("facet", STORED); - let schema = schema_builder.build(); - let index = Index::create_in_ram(schema); - let mut index_writer = index.writer_for_tests()?; - index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?; - index_writer.commit()?; - let searcher = index.reader()?.searcher(); - let facet_reader = searcher - .segment_reader(0u32) - .facet_reader(facet_field) - .unwrap(); - let mut facet_ords = 
Vec::new(); - facet_reader.facet_ords(0u32, &mut facet_ords); - assert!(facet_ords.is_empty()); - let doc = searcher.doc(DocAddress::new(0u32, 0u32))?; - let value = doc.get_first(facet_field).and_then(Value::path); - assert_eq!(value, Some("/a/b".to_string())); - Ok(()) - } - #[test] fn test_facet_stored_and_indexed() -> crate::Result<()> { let mut schema_builder = SchemaBuilder::default(); - let facet_field = schema_builder.add_facet_field("facet", STORED | INDEXED); + let facet_field = schema_builder.add_facet_field("facet", STORED); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); let mut index_writer = index.writer_for_tests()?; @@ -152,38 +129,15 @@ mod tests { facet_reader.facet_ords(0u32, &mut facet_ords); assert_eq!(&facet_ords, &[2u64]); let doc = searcher.doc(DocAddress::new(0u32, 0u32))?; - let value = doc.get_first(facet_field).and_then(Value::path); - assert_eq!(value, Some("/a/b".to_string())); - Ok(()) - } - - #[test] - fn test_facet_neither_stored_and_indexed() -> crate::Result<()> { - let mut schema_builder = SchemaBuilder::default(); - let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); - let schema = schema_builder.build(); - let index = Index::create_in_ram(schema); - let mut index_writer = index.writer_for_tests()?; - index_writer.add_document(doc!(facet_field=>Facet::from_text("/a/b").unwrap()))?; - index_writer.commit()?; - let searcher = index.reader()?.searcher(); - let facet_reader = searcher - .segment_reader(0u32) - .facet_reader(facet_field) - .unwrap(); - let mut facet_ords = Vec::new(); - facet_reader.facet_ords(0u32, &mut facet_ords); - assert!(facet_ords.is_empty()); - let doc = searcher.doc(DocAddress::new(0u32, 0u32))?; - let value = doc.get_first(facet_field).and_then(Value::path); - assert_eq!(value, None); + let value: Option<&Facet> = doc.get_first(facet_field).and_then(Value::facet); + assert_eq!(value, Facet::from_text("/a/b").ok().as_ref()); Ok(()) } #[test] fn test_facet_not_populated_for_all_docs() -> crate::Result<()> { let mut schema_builder = SchemaBuilder::default(); - let facet_field = schema_builder.add_facet_field("facet", INDEXED); + let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); let mut index_writer = index.writer_for_tests()?; @@ -206,7 +160,7 @@ mod tests { #[test] fn test_facet_not_populated_for_any_docs() -> crate::Result<()> { let mut schema_builder = SchemaBuilder::default(); - let facet_field = schema_builder.add_facet_field("facet", INDEXED); + let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); let mut index_writer = index.writer_for_tests()?; diff --git a/src/fastfield/multivalued/mod.rs b/src/fastfield/multivalued/mod.rs index 5533b5cf3e..2590c6c500 100644 --- a/src/fastfield/multivalued/mod.rs +++ b/src/fastfield/multivalued/mod.rs @@ -12,9 +12,9 @@ mod tests { use crate::query::QueryParser; use crate::schema::Cardinality; use crate::schema::Facet; + use crate::schema::FacetOptions; use crate::schema::IntOptions; use crate::schema::Schema; - use crate::schema::INDEXED; use crate::Document; use crate::Index; use crate::Term; @@ -68,6 +68,7 @@ mod tests { IntOptions::default() .set_fast(Cardinality::MultiValues) .set_indexed() + .set_fieldnorm() .set_stored(), ); let time_i = @@ -334,7 +335,7 @@ mod tests { #[ignore] fn test_many_facets() -> 
crate::Result<()> { let mut schema_builder = Schema::builder(); - let field = schema_builder.add_facet_field("facetfield", INDEXED); + let field = schema_builder.add_facet_field("facetfield", FacetOptions::default()); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); let mut index_writer = index.writer_for_tests()?; diff --git a/src/fastfield/multivalued/reader.rs b/src/fastfield/multivalued/reader.rs index dd339f1511..3278d94c46 100644 --- a/src/fastfield/multivalued/reader.rs +++ b/src/fastfield/multivalued/reader.rs @@ -91,12 +91,12 @@ impl MultiValueLength for MultiValuedFastFieldReader { mod tests { use crate::core::Index; - use crate::schema::{Cardinality, Facet, IntOptions, Schema, INDEXED}; + use crate::schema::{Cardinality, Facet, FacetOptions, IntOptions, Schema}; #[test] fn test_multifastfield_reader() -> crate::Result<()> { let mut schema_builder = Schema::builder(); - let facet_field = schema_builder.add_facet_field("facets", INDEXED); + let facet_field = schema_builder.add_facet_field("facets", FacetOptions::default()); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); let mut index_writer = index.writer_for_tests()?; diff --git a/src/fieldnorm/mod.rs b/src/fieldnorm/mod.rs index 0b4faa9337..fcab458c7e 100644 --- a/src/fieldnorm/mod.rs +++ b/src/fieldnorm/mod.rs @@ -26,3 +26,66 @@ pub use self::serializer::FieldNormsSerializer; pub use self::writer::FieldNormsWriter; use self::code::{fieldnorm_to_id, id_to_fieldnorm}; + +#[cfg(test)] +mod tests { + use crate::directory::CompositeFile; + use crate::fieldnorm::FieldNormReader; + use crate::fieldnorm::FieldNormsSerializer; + use crate::fieldnorm::FieldNormsWriter; + use crate::{ + directory::{Directory, RamDirectory, WritePtr}, + schema::{STRING, TEXT}, + }; + use once_cell::sync::Lazy; + use std::path::Path; + + use crate::schema::{Field, Schema, STORED}; + + pub static SCHEMA: Lazy = Lazy::new(|| { + let mut schema_builder = Schema::builder(); + schema_builder.add_text_field("field", STORED); + schema_builder.add_text_field("txt_field", TEXT); + schema_builder.add_text_field("str_field", STRING); + schema_builder.build() + }); + + pub static FIELD: Lazy = Lazy::new(|| SCHEMA.get_field("field").unwrap()); + pub static TXT_FIELD: Lazy = Lazy::new(|| SCHEMA.get_field("txt_field").unwrap()); + pub static STR_FIELD: Lazy = Lazy::new(|| SCHEMA.get_field("str_field").unwrap()); + + #[test] + #[should_panic(expected = "Cannot register a given fieldnorm twice")] + pub fn test_should_panic_when_recording_fieldnorm_twice_for_same_doc() { + let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA); + fieldnorm_writers.record(0u32, *TXT_FIELD, 5); + fieldnorm_writers.record(0u32, *TXT_FIELD, 3); + } + + #[test] + pub fn test_fieldnorm() -> crate::Result<()> { + let path = Path::new("test"); + let directory: RamDirectory = RamDirectory::create(); + { + let write: WritePtr = directory.open_write(Path::new("test"))?; + let serializer = FieldNormsSerializer::from_write(write)?; + let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA); + fieldnorm_writers.record(2u32, *TXT_FIELD, 5); + fieldnorm_writers.record(3u32, *TXT_FIELD, 3); + fieldnorm_writers.serialize(serializer, None)?; + } + let file = directory.open_read(&path)?; + { + let fields_composite = CompositeFile::open(&file)?; + assert!(fields_composite.open_read(*FIELD).is_none()); + assert!(fields_composite.open_read(*STR_FIELD).is_none()); + let data = fields_composite.open_read(*TXT_FIELD).unwrap(); + let 
fieldnorm_reader = FieldNormReader::open(data)?; + assert_eq!(fieldnorm_reader.fieldnorm(0u32), 0u32); + assert_eq!(fieldnorm_reader.fieldnorm(1u32), 0u32); + assert_eq!(fieldnorm_reader.fieldnorm(2u32), 5u32); + assert_eq!(fieldnorm_reader.fieldnorm(3u32), 3u32); + } + Ok(()) + } +} diff --git a/src/fieldnorm/writer.rs b/src/fieldnorm/writer.rs index 5284c7021c..f802cb46ea 100644 --- a/src/fieldnorm/writer.rs +++ b/src/fieldnorm/writer.rs @@ -4,6 +4,7 @@ use super::fieldnorm_to_id; use super::FieldNormsSerializer; use crate::schema::Field; use crate::schema::Schema; +use std::cmp::Ordering; use std::{io, iter}; /// The `FieldNormsWriter` is in charge of tracking the fieldnorm byte @@ -12,8 +13,7 @@ use std::{io, iter}; /// `FieldNormsWriter` stores a Vec for each tracked field, using a /// byte per document per field. pub struct FieldNormsWriter { - fields: Vec, - fieldnorms_buffer: Vec>, + fieldnorms_buffers: Vec>>, } impl FieldNormsWriter { @@ -23,7 +23,7 @@ impl FieldNormsWriter { schema .fields() .filter_map(|(field, field_entry)| { - if field_entry.is_indexed() { + if field_entry.is_indexed() && field_entry.has_fieldnorms() { Some(field) } else { None @@ -42,18 +42,19 @@ impl FieldNormsWriter { .max() .map(|max_field_id| max_field_id as usize + 1) .unwrap_or(0); - FieldNormsWriter { - fields, - fieldnorms_buffer: iter::repeat_with(Vec::new) - .take(max_field) - .collect::>(), + let mut fieldnorms_buffers: Vec>> = + iter::repeat_with(|| None).take(max_field).collect(); + for field in fields { + fieldnorms_buffers[field.field_id() as usize] = Some(Vec::with_capacity(1_000)); } + FieldNormsWriter { fieldnorms_buffers } } /// The memory used inclusive childs pub fn mem_usage(&self) -> usize { - self.fieldnorms_buffer + self.fieldnorms_buffers .iter() + .flatten() .map(|buf| buf.capacity()) .sum() } @@ -62,8 +63,10 @@ impl FieldNormsWriter { /// /// Will extend with 0-bytes for documents that have not been seen. pub fn fill_up_to_max_doc(&mut self, max_doc: DocId) { - for field in self.fields.iter() { - self.fieldnorms_buffer[field.field_id() as usize].resize(max_doc as usize, 0u8); + for fieldnorms_buffer_opt in self.fieldnorms_buffers.iter_mut() { + if let Some(fieldnorms_buffer) = fieldnorms_buffer_opt.as_mut() { + fieldnorms_buffer.resize(max_doc as usize, 0u8); + } } } @@ -76,14 +79,24 @@ impl FieldNormsWriter { /// * field - the field being set /// * fieldnorm - the number of terms present in document `doc` in field `field` pub fn record(&mut self, doc: DocId, field: Field, fieldnorm: u32) { - let fieldnorm_buffer: &mut Vec = &mut self.fieldnorms_buffer[field.field_id() as usize]; - assert!( - fieldnorm_buffer.len() <= doc as usize, - "Cannot register a given fieldnorm twice" - ); - // we fill intermediary `DocId` as having a fieldnorm of 0. - fieldnorm_buffer.resize(doc as usize + 1, 0u8); - fieldnorm_buffer[doc as usize] = fieldnorm_to_id(fieldnorm); + if let Some(fieldnorm_buffer) = self + .fieldnorms_buffers + .get_mut(field.field_id() as usize) + .map(Option::as_mut) + .flatten() + { + match fieldnorm_buffer.len().cmp(&(doc as usize)) { + Ordering::Less => { + // we fill intermediary `DocId` as having a fieldnorm of 0. + fieldnorm_buffer.resize(doc as usize, 0u8); + } + Ordering::Equal => {} + Ordering::Greater => { + panic!("Cannot register a given fieldnorm twice") + } + } + fieldnorm_buffer.push(fieldnorm_to_id(fieldnorm)); + } } /// Serialize the seen fieldnorm values to the serializer for all fields. 
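Note on the `FieldNormsWriter` refactor above: each normed field now owns an optional byte buffer, documents must be recorded in increasing doc-id order, and any doc id skipped in between implicitly gets a fieldnorm of 0. The following is a minimal, self-contained sketch of that bookkeeping; it is not tantivy's actual API (the real `record` also converts the raw token count through `fieldnorm_to_id`), and the names are illustrative only.

```rust
use std::cmp::Ordering;

// One byte per document for a single field. Documents must be recorded in
// increasing doc-id order; gaps default to 0; recording a doc twice panics,
// mirroring the assertion in the patch above.
fn record(buffer: &mut Vec<u8>, doc: u32, fieldnorm_id: u8) {
    match buffer.len().cmp(&(doc as usize)) {
        // We fill intermediary doc ids as having a fieldnorm of 0.
        Ordering::Less => buffer.resize(doc as usize, 0u8),
        Ordering::Equal => {}
        Ordering::Greater => panic!("Cannot register a given fieldnorm twice"),
    }
    buffer.push(fieldnorm_id);
}

fn main() {
    let mut buffer = Vec::new();
    record(&mut buffer, 2, 5);
    record(&mut buffer, 3, 3);
    // Docs 0 and 1 were never recorded, so they default to 0.
    assert_eq!(buffer, vec![0, 0, 5, 3]);
}
```

Because gaps are zero-filled, the buffer index always equals the doc id, which is what lets the serialization step below hand each buffer to the serializer as-is, or remap it through the `doc_id_map` when the index is sorted.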
@@ -92,17 +105,26 @@ impl FieldNormsWriter { mut fieldnorms_serializer: FieldNormsSerializer, doc_id_map: Option<&DocIdMapping>, ) -> io::Result<()> { - for &field in self.fields.iter() { - let fieldnorm_values: &[u8] = &self.fieldnorms_buffer[field.field_id() as usize][..]; + for (field, fieldnorms_buffer) in self + .fieldnorms_buffers + .iter() + .enumerate() + .map(|(field_id, fieldnorms_buffer_opt)| { + fieldnorms_buffer_opt.as_ref().map(|fieldnorms_buffer| { + (Field::from_field_id(field_id as u32), fieldnorms_buffer) + }) + }) + .flatten() + { if let Some(doc_id_map) = doc_id_map { - let mut mapped_fieldnorm_values = vec![]; - mapped_fieldnorm_values.resize(fieldnorm_values.len(), 0u8); + // TODO is that a bug? + let mut mapped_fieldnorm_values = vec![0u8; doc_id_map.num_new_doc_ids()]; for (new_doc_id, old_doc_id) in doc_id_map.iter_old_doc_ids().enumerate() { - mapped_fieldnorm_values[new_doc_id] = fieldnorm_values[old_doc_id as usize]; + mapped_fieldnorm_values[new_doc_id] = fieldnorms_buffer[old_doc_id as usize]; } fieldnorms_serializer.serialize_field(field, &mapped_fieldnorm_values)?; } else { - fieldnorms_serializer.serialize_field(field, fieldnorm_values)?; + fieldnorms_serializer.serialize_field(field, fieldnorms_buffer)?; } } fieldnorms_serializer.close()?; diff --git a/src/indexer/doc_id_mapping.rs b/src/indexer/doc_id_mapping.rs index 6f622d4fbe..2115f27325 100644 --- a/src/indexer/doc_id_mapping.rs +++ b/src/indexer/doc_id_mapping.rs @@ -75,6 +75,9 @@ impl DocIdMapping { pub fn iter_old_doc_ids(&self) -> impl Iterator + Clone + '_ { self.new_doc_id_to_old.iter().cloned() } + pub fn num_new_doc_ids(&self) -> usize { + self.new_doc_id_to_old.len() + } } pub(crate) fn expect_field_id_for_sort_field( diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 3ee60c8752..fdec9ecd56 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -553,7 +553,7 @@ impl IndexWriter { // marks the segment updater as killed. From now on, all // segment updates will be ignored. self.segment_updater.kill(); - let document_receiver = self.operation_receiver(); + let document_receiver_res = self.operation_receiver(); // take the directory lock to create a new index_writer. let directory_lock = self @@ -579,7 +579,9 @@ impl IndexWriter { // // This will reach an end as the only document_sender // was dropped with the index_writer. 
- for _ in document_receiver {} + if let Ok(document_receiver) = document_receiver_res { + for _ in document_receiver {} + } Ok(self.committed_opstamp) } @@ -796,6 +798,7 @@ mod tests { use crate::query::TermQuery; use crate::schema::Cardinality; use crate::schema::Facet; + use crate::schema::FacetOptions; use crate::schema::IntOptions; use crate::schema::TextFieldIndexing; use crate::schema::TextOptions; @@ -1417,7 +1420,7 @@ mod tests { .set_fast(Cardinality::MultiValues) .set_stored(), ); - let facet_field = schema_builder.add_facet_field("facet", INDEXED); + let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let schema = schema_builder.build(); let settings = if sort_index { IndexSettings { diff --git a/src/indexer/index_writer_status.rs b/src/indexer/index_writer_status.rs index c837f99bc7..6b1ee26803 100644 --- a/src/indexer/index_writer_status.rs +++ b/src/indexer/index_writer_status.rs @@ -22,7 +22,7 @@ impl IndexWriterStatus { .receive_channel .read() .expect("This lock should never be poisoned"); - rlock.as_ref().map(|receiver| receiver.clone()) + rlock.as_ref().cloned() } /// Create an index writer bomb. diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 6d1f559190..0bbd776d8c 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -41,31 +41,56 @@ use tantivy_bitpacker::minmax; /// We do not allow segments with more than pub const MAX_DOC_LIMIT: u32 = 1 << 31; -fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::Result<u64> { - let mut total_tokens = 0u64; - let mut count: [usize; 256] = [0; 256]; - for reader in readers { - if reader.has_deletes() { - // if there are deletes, then we use an approximation - // using the fieldnorm - let fieldnorms_reader = reader.get_fieldnorms_reader(field)?; - for doc in reader.doc_ids_alive() { - let fieldnorm_id = fieldnorms_reader.fieldnorm_id(doc); - count[fieldnorm_id as usize] += 1; - } - } else { - total_tokens += reader.inverted_index(field)?.total_num_tokens(); - } +fn estimate_total_num_tokens_in_single_segment( + reader: &SegmentReader, + field: Field, +) -> crate::Result<u64> { + // There are no deletes. We can simply use the exact value saved into the posting list. + // Note that this value is not necessarily exact as it could have been the result of a merge between + // segments themselves containing deletes. + if !reader.has_deletes() { + return Ok(reader.inverted_index(field)?.total_num_tokens()); } - Ok(total_tokens - + count + + // When there are deletes, we use an approximation + // based on the fieldnorm, if it is available. + if let Some(fieldnorm_reader) = reader.fieldnorms_readers().get_field(field)? { + let mut count: [usize; 256] = [0; 256]; + for doc in reader.doc_ids_alive() { + let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc); + count[fieldnorm_id as usize] += 1; + } + let total_num_tokens = count .iter() .cloned() .enumerate() .map(|(fieldnorm_ord, count)| { count as u64 * u64::from(FieldNormReader::id_to_fieldnorm(fieldnorm_ord as u8)) }) - .sum::<u64>()) + .sum::<u64>(); + return Ok(total_num_tokens); + } + + // There are no fieldnorms available. + // Here we just do a pro-rata with the overall number of tokens and the ratio of + // documents alive. + let segment_num_tokens = reader.inverted_index(field)?.total_num_tokens(); + + if reader.max_doc() == 0 { + // That supposedly never happens, but let's be a bit defensive here.
+ return Ok(0u64); + } + let ratio = reader.num_docs() as f64 / reader.max_doc() as f64; + Ok((segment_num_tokens as f64 * ratio) as u64) +} + +fn estimate_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::Result { + let mut total_num_tokens: u64 = 0; + for reader in readers { + total_num_tokens += estimate_total_num_tokens_in_single_segment(reader, field)?; + } + Ok(total_num_tokens) } pub struct IndexMerger { @@ -851,10 +876,9 @@ impl IndexMerger { segment_map[*old_doc_id as usize] = Some(new_doc_id as DocId); } - // The total number of tokens will only be exact when there has been no deletes. - // - // Otherwise, we approximate by removing deleted documents proportionally. - let total_num_tokens: u64 = compute_total_num_tokens(&self.readers, indexed_field)?; + // Note that the total number of tokens is not exact. + // It is only used as a parameter in the BM25 formula. + let total_num_tokens: u64 = estimate_total_num_tokens(&self.readers, indexed_field)?; // Create the total list of doc ids // by stacking the doc ids from the different segment. @@ -1118,13 +1142,13 @@ mod tests { use crate::query::BooleanQuery; use crate::query::Scorer; use crate::query::TermQuery; - use crate::schema::Document; use crate::schema::Facet; use crate::schema::IndexRecordOption; use crate::schema::IntOptions; use crate::schema::Term; use crate::schema::TextFieldIndexing; use crate::schema::{Cardinality, TEXT}; + use crate::schema::{Document, FacetOptions}; use crate::DocAddress; use crate::IndexSettings; use crate::IndexSortByField; @@ -1650,7 +1674,7 @@ mod tests { // ranges between segments so that merge algorithm can't apply certain optimizations fn test_merge_facets(index_settings: Option, force_segment_value_overlap: bool) { let mut schema_builder = schema::Schema::builder(); - let facet_field = schema_builder.add_facet_field("facet", INDEXED); + let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let int_options = IntOptions::default() .set_fast(Cardinality::SingleValue) .set_indexed(); diff --git a/src/indexer/merger_sorted_index_test.rs b/src/indexer/merger_sorted_index_test.rs index 091e42e7c8..2f6921565c 100644 --- a/src/indexer/merger_sorted_index_test.rs +++ b/src/indexer/merger_sorted_index_test.rs @@ -1,22 +1,17 @@ #[cfg(test)] mod tests { + use crate::collector::TopDocs; + use crate::core::Index; + use crate::fastfield::MultiValuedFastFieldReader; use crate::fastfield::{AliveBitSet, FastFieldReader}; - use crate::schema::IndexRecordOption; - use crate::{ - collector::TopDocs, - schema::{Cardinality, TextFieldIndexing}, - }; - use crate::{core::Index, fastfield::MultiValuedFastFieldReader}; - use crate::{ - query::QueryParser, - schema::{IntOptions, TextOptions}, - }; - use crate::{schema::Facet, IndexSortByField}; - use crate::{schema::INDEXED, Order}; - use crate::{ - schema::{self, BytesOptions}, - DocAddress, + use crate::query::QueryParser; + use crate::schema::{ + self, BytesOptions, Cardinality, Facet, FacetOptions, IndexRecordOption, TextFieldIndexing, }; + use crate::schema::{IntOptions, TextOptions}; + use crate::DocAddress; + use crate::IndexSortByField; + use crate::Order; use crate::{DocSet, IndexSettings, Postings, Term}; use futures::executor::block_on; @@ -27,7 +22,7 @@ mod tests { .set_indexed(); let int_field = schema_builder.add_u64_field("intval", int_options); - let facet_field = schema_builder.add_facet_field("facet", INDEXED); + let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let schema 
= schema_builder.build(); @@ -79,7 +74,7 @@ mod tests { let bytes_options = BytesOptions::default().set_fast().set_indexed(); let bytes_field = schema_builder.add_bytes_field("bytes", bytes_options); - let facet_field = schema_builder.add_facet_field("facet", INDEXED); + let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default()); let multi_numbers = schema_builder.add_u64_field( "multi_numbers", diff --git a/src/indexer/segment_register.rs b/src/indexer/segment_register.rs index b20080f7b4..75a5316e4e 100644 --- a/src/indexer/segment_register.rs +++ b/src/indexer/segment_register.rs @@ -66,13 +66,10 @@ impl SegmentRegister { } pub fn segment_metas(&self) -> Vec { - let mut segment_ids: Vec = self - .segment_states + self.segment_states .values() .map(|segment_entry| segment_entry.meta().clone()) - .collect(); - segment_ids.sort_by_key(SegmentMeta::id); - segment_ids + .collect() } pub fn contains_all(&self, segment_ids: &[SegmentId]) -> bool { diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index f7f7518db7..cae64d796f 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -61,7 +61,9 @@ pub fn save_new_metas( payload: None, }, directory, - ) + )?; + directory.sync_directory()?; + Ok(()) } /// Save the index meta file. @@ -82,6 +84,7 @@ fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate::Result<()> io::ErrorKind::Other, msg.unwrap_or_else(|| "Undefined".to_string()) )))); + directory.sync_directory()?; directory.atomic_write(&META_FILEPATH, &buffer[..])?; debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas)); Ok(()) diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index b88a03421a..34228069d3 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -175,16 +175,9 @@ impl SegmentWriter { match *field_entry.field_type() { FieldType::HierarchicalFacet(_) => { term_buffer.set_field(field); - let facets = - field_values - .iter() - .flat_map(|field_value| match *field_value.value() { - Value::Facet(ref facet) => Some(facet.encoded_str()), - _ => { - panic!("Expected hierarchical facet"); - } - }); - for facet_str in facets { + for field_value in field_values { + let facet = field_value.value().facet().ok_or_else(make_schema_error)?; + let facet_str = facet.encoded_str(); let mut unordered_term_id_opt = None; FacetTokenizer .token_stream(facet_str) @@ -241,7 +234,6 @@ impl SegmentWriter { term_buffer, ) }; - self.fieldnorms_writer.record(doc_id, field, num_tokens); } FieldType::U64(_) => { diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 1e3b1c0569..69ecda7a90 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -47,7 +47,6 @@ pub mod tests { use crate::fieldnorm::FieldNormReader; use crate::indexer::operation::AddOperation; use crate::indexer::SegmentWriter; - use crate::merge_policy::NoMergePolicy; use crate::query::Scorer; use crate::schema::{Field, TextOptions}; use crate::schema::{IndexRecordOption, TextFieldIndexing}; @@ -154,10 +153,39 @@ pub mod tests { } #[test] - pub fn test_drop_token_that_are_too_long() -> crate::Result<()> { + pub fn test_index_max_length_token() -> crate::Result<()> { + let mut schema_builder = Schema::builder(); + let text_options = TextOptions::default().set_indexing_options( + TextFieldIndexing::default() + .set_index_option(IndexRecordOption::WithFreqsAndPositions) + .set_tokenizer("simple_no_truncation"), + ); + let text_field = schema_builder.add_text_field("text", text_options); 
+ let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + index + .tokenizers() + .register("simple_no_truncation", SimpleTokenizer); + let reader = index.reader()?; + let mut index_writer = index.writer_for_tests()?; + let ok_token_text: String = "A".repeat(MAX_TOKEN_LEN); - let mut exceeding_token_text: String = "A".repeat(MAX_TOKEN_LEN + 1); - exceeding_token_text.push_str(" hello"); + index_writer.add_document(doc!(text_field=>ok_token_text.clone()))?; + index_writer.commit()?; + reader.reload()?; + let searcher = reader.searcher(); + let segment_reader = searcher.segment_reader(0u32); + let inverted_index = segment_reader.inverted_index(text_field)?; + assert_eq!(inverted_index.terms().num_terms(), 1); + let mut bytes = vec![]; + assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?); + assert_eq!(&bytes[..], ok_token_text.as_bytes()); + + Ok(()) + } + + #[test] + pub fn test_drop_token_that_are_too_long() -> crate::Result<()> { let mut schema_builder = Schema::builder(); let text_options = TextOptions::default().set_indexing_options( TextFieldIndexing::default() @@ -170,33 +198,22 @@ pub mod tests { index .tokenizers() .register("simple_no_truncation", SimpleTokenizer); - let reader = index.reader().unwrap(); - let mut index_writer = index.writer_for_tests().unwrap(); - index_writer.set_merge_policy(Box::new(NoMergePolicy)); - { - index_writer.add_document(doc!(text_field=>exceeding_token_text))?; - index_writer.commit()?; - reader.reload()?; - let searcher = reader.searcher(); - let segment_reader = searcher.segment_reader(0u32); - let inverted_index = segment_reader.inverted_index(text_field)?; - assert_eq!(inverted_index.terms().num_terms(), 1); - let mut bytes = vec![]; - assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?); - assert_eq!(&bytes, b"hello"); - } - { - index_writer.add_document(doc!(text_field=>ok_token_text.clone()))?; - index_writer.commit()?; - reader.reload()?; - let searcher = reader.searcher(); - let segment_reader = searcher.segment_reader(1u32); - let inverted_index = segment_reader.inverted_index(text_field)?; - assert_eq!(inverted_index.terms().num_terms(), 1); - let mut bytes = vec![]; - assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?); - assert_eq!(&bytes[..], ok_token_text.as_bytes()); - } + let reader = index.reader()?; + let mut index_writer = index.writer_for_tests()?; + + let mut exceeding_token_text: String = "A".repeat(MAX_TOKEN_LEN + 1); + exceeding_token_text.push_str(" hello"); + index_writer.add_document(doc!(text_field=>exceeding_token_text))?; + index_writer.commit()?; + reader.reload()?; + let searcher = reader.searcher(); + let segment_reader = searcher.segment_reader(0u32); + let inverted_index = segment_reader.inverted_index(text_field)?; + assert_eq!(inverted_index.terms().num_terms(), 1); + let mut bytes = vec![]; + assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?); + assert_eq!(&bytes, b"hello"); + Ok(()) } diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index 6f9e0f31ff..1df91a7d4e 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -308,10 +308,8 @@ pub struct PostingsSerializer { fieldnorm_reader: Option, bm25_weight: Option, - - num_docs: u32, // Number of docs in the segment avg_fieldnorm: Score, // Average number of term in the field for that segment. - // this value is used to compute the block wand information. + // this value is used to compute the block wand information. 
} impl PostingsSerializer { @@ -321,10 +319,6 @@ impl PostingsSerializer { mode: IndexRecordOption, fieldnorm_reader: Option, ) -> PostingsSerializer { - let num_docs = fieldnorm_reader - .as_ref() - .map(|fieldnorm_reader| fieldnorm_reader.num_docs()) - .unwrap_or(0u32); PostingsSerializer { output_write: CountingWriter::wrap(write), @@ -339,21 +333,25 @@ impl PostingsSerializer { fieldnorm_reader, bm25_weight: None, - - num_docs, avg_fieldnorm, } } + /// Returns the number of documents in the segment currently being serialized. + /// This function may return `None` if there are no fieldnorm for that field. + fn num_docs_in_segment(&self) -> Option { + self.fieldnorm_reader + .as_ref() + .map(|reader| reader.num_docs()) + } + pub fn new_term(&mut self, term_doc_freq: u32) { - if self.mode.has_freq() && self.num_docs > 0 { - let bm25_weight = Bm25Weight::for_one_term( - term_doc_freq as u64, - self.num_docs as u64, - self.avg_fieldnorm, - ); - self.bm25_weight = Some(bm25_weight); + if !self.mode.has_freq() { + return; } + self.bm25_weight = self.num_docs_in_segment().map(|num_docs| { + Bm25Weight::for_one_term(term_doc_freq as u64, num_docs as u64, self.avg_fieldnorm) + }); } fn write_block(&mut self) { diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 259e78258f..086c52c0b7 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -587,6 +587,7 @@ mod test { use super::QueryParser; use super::QueryParserError; use crate::query::Query; + use crate::schema::FacetOptions; use crate::schema::Field; use crate::schema::{IndexRecordOption, TextFieldIndexing, TextOptions}; use crate::schema::{Schema, Term, INDEXED, STORED, STRING, TEXT}; @@ -615,8 +616,7 @@ mod test { schema_builder.add_text_field("with_stop_words", text_options); schema_builder.add_date_field("date", INDEXED); schema_builder.add_f64_field("float", INDEXED); - schema_builder.add_facet_field("facet", INDEXED); - schema_builder.add_facet_field("facet_not_indexed", STORED); + schema_builder.add_facet_field("facet", FacetOptions::default()); schema_builder.add_bytes_field("bytes", INDEXED); schema_builder.add_bytes_field("bytes_not_indexed", STORED); schema_builder.build() @@ -669,13 +669,6 @@ mod test { ); } - #[test] - fn test_parse_query_facet_not_indexed() { - let error = - parse_query_to_logical_ast("facet_not_indexed:/root/branch/leaf", false).unwrap_err(); - assert!(matches!(error, QueryParserError::FieldNotIndexed(_))); - } - #[test] pub fn test_parse_query_with_boost() { let mut query_parser = make_query_parser(); @@ -817,7 +810,7 @@ mod test { fn test_parse_bytes() { test_parse_query_to_logical_ast_helper( "bytes:YnVidQ==", - "Term(field=13,bytes=[98, 117, 98, 117])", + "Term(field=12,bytes=[98, 117, 98, 117])", false, ); } @@ -832,7 +825,7 @@ mod test { fn test_parse_bytes_phrase() { test_parse_query_to_logical_ast_helper( "bytes:\"YnVidQ==\"", - "Term(field=13,bytes=[98, 117, 98, 117])", + "Term(field=12,bytes=[98, 117, 98, 117])", false, ); } diff --git a/src/query/term_query/term_query.rs b/src/query/term_query/term_query.rs index 8f003e04d4..57fc16fb31 100644 --- a/src/query/term_query/term_query.rs +++ b/src/query/term_query/term_query.rs @@ -93,20 +93,20 @@ impl TermQuery { scoring_enabled: bool, ) -> crate::Result { let term = self.term.clone(); - let field_entry = searcher.schema().get_field_entry(term.field()); + let field_entry = searcher.schema().get_field_entry(self.term.field()); if !field_entry.is_indexed() { - return 
Err(crate::TantivyError::SchemaError(format!( - "Field {:?} is not indexed", - field_entry.name() - ))); + let error_msg = format!("Field {:?} is not indexed.", field_entry.name()); + return Err(crate::TantivyError::SchemaError(error_msg)); } - let bm25_weight; - if scoring_enabled { - bm25_weight = Bm25Weight::for_terms(searcher, &[term])?; + let has_fieldnorms = searcher + .schema() + .get_field_entry(self.term.field()) + .has_fieldnorms(); + let bm25_weight = if scoring_enabled { + Bm25Weight::for_terms(searcher, &[term])? } else { - bm25_weight = - Bm25Weight::new(Explanation::new("".to_string(), 1.0f32), 1.0f32); - } + Bm25Weight::new(Explanation::new("".to_string(), 1.0f32), 1.0f32) + }; let index_record_option = if scoring_enabled { self.index_record_option } else { @@ -116,6 +116,7 @@ impl TermQuery { self.term.clone(), index_record_option, bm25_weight, + has_fieldnorms, scoring_enabled, )) } diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index 23222424cc..46c6517808 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -17,6 +17,7 @@ pub struct TermWeight { index_record_option: IndexRecordOption, similarity_weight: Bm25Weight, scoring_enabled: bool, + has_fieldnorms: bool, } impl Weight for TermWeight { @@ -90,12 +91,14 @@ impl TermWeight { index_record_option: IndexRecordOption, similarity_weight: Bm25Weight, scoring_enabled: bool, + has_fieldnorms: bool, ) -> TermWeight { TermWeight { term, index_record_option, similarity_weight, scoring_enabled, + has_fieldnorms, } } @@ -106,7 +109,7 @@ impl TermWeight { ) -> crate::Result { let field = self.term.field(); let inverted_index = reader.inverted_index(field)?; - let fieldnorm_reader = if self.scoring_enabled { + let fieldnorm_reader = if self.scoring_enabled && self.has_fieldnorms { reader.get_fieldnorms_reader(field)? } else { FieldNormReader::constant(reader.max_doc(), 1) diff --git a/src/schema/bytes_options.rs b/src/schema/bytes_options.rs index 51e6d8757d..662faf8397 100644 --- a/src/schema/bytes_options.rs +++ b/src/schema/bytes_options.rs @@ -3,19 +3,51 @@ use std::ops::BitOr; use super::flags::{FastFlag, IndexedFlag, SchemaFlagList, StoredFlag}; /// Define how an a bytes field should be handled by tantivy. -#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)] +#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)] +#[serde(from = "BytesOptionsDeser")] pub struct BytesOptions { indexed: bool, + fieldnorms: bool, fast: bool, stored: bool, } +/// For backward compability we add an intermediary to interpret the +/// lack of fieldnorms attribute as "true" iff indexed. +/// +/// (Downstream, for the moment, this attribute is not used anyway if not indexed...) +/// Note that: newly serialized IntOptions will include the new attribute. +#[derive(Deserialize)] +struct BytesOptionsDeser { + indexed: bool, + #[serde(default)] + fieldnorms: Option, + fast: bool, + stored: bool, +} + +impl From for BytesOptions { + fn from(deser: BytesOptionsDeser) -> Self { + BytesOptions { + indexed: deser.indexed, + fieldnorms: deser.fieldnorms.unwrap_or(deser.indexed), + fast: deser.fast, + stored: deser.stored, + } + } +} + impl BytesOptions { /// Returns true iff the value is indexed. pub fn is_indexed(&self) -> bool { self.indexed } + /// Returns true iff the value is normed. + pub fn fieldnorms(&self) -> bool { + self.fieldnorms + } + /// Returns true iff the value is a fast field. 
pub fn is_fast(&self) -> bool { self.fast @@ -35,6 +67,15 @@ impl BytesOptions { self } + /// Set the field as normed. + /// + /// Setting an integer as normed will generate + /// the fieldnorm data for it. + pub fn set_fieldnorms(mut self) -> BytesOptions { + self.fieldnorms = true; + self + } + /// Set the field as a single-valued fast field. /// /// Fast fields are designed for random access. @@ -63,6 +104,7 @@ impl> BitOr for BytesOptions { let other = other.into(); BytesOptions { indexed: self.indexed | other.indexed, + fieldnorms: self.fieldnorms | other.fieldnorms, stored: self.stored | other.stored, fast: self.fast | other.fast, } @@ -79,6 +121,7 @@ impl From for BytesOptions { fn from(_: FastFlag) -> Self { BytesOptions { indexed: false, + fieldnorms: false, stored: false, fast: true, } @@ -89,6 +132,7 @@ impl From for BytesOptions { fn from(_: StoredFlag) -> Self { BytesOptions { indexed: false, + fieldnorms: false, stored: true, fast: false, } @@ -99,6 +143,7 @@ impl From for BytesOptions { fn from(_: IndexedFlag) -> Self { BytesOptions { indexed: true, + fieldnorms: true, stored: false, fast: false, } @@ -123,7 +168,10 @@ mod tests { #[test] fn test_bytes_option_fast_flag() { assert_eq!(BytesOptions::default().set_fast(), FAST.into()); - assert_eq!(BytesOptions::default().set_indexed(), INDEXED.into()); + assert_eq!( + BytesOptions::default().set_indexed().set_fieldnorms(), + INDEXED.into() + ); assert_eq!(BytesOptions::default().set_stored(), STORED.into()); } #[test] @@ -133,11 +181,17 @@ mod tests { (FAST | STORED).into() ); assert_eq!( - BytesOptions::default().set_indexed().set_fast(), + BytesOptions::default() + .set_indexed() + .set_fieldnorms() + .set_fast(), (INDEXED | FAST).into() ); assert_eq!( - BytesOptions::default().set_stored().set_indexed(), + BytesOptions::default() + .set_stored() + .set_fieldnorms() + .set_indexed(), (STORED | INDEXED).into() ); } @@ -147,8 +201,89 @@ mod tests { assert!(!BytesOptions::default().is_stored()); assert!(!BytesOptions::default().is_fast()); assert!(!BytesOptions::default().is_indexed()); + assert!(!BytesOptions::default().fieldnorms()); assert!(BytesOptions::default().set_stored().is_stored()); assert!(BytesOptions::default().set_fast().is_fast()); assert!(BytesOptions::default().set_indexed().is_indexed()); + assert!(BytesOptions::default().set_fieldnorms().fieldnorms()); + } + + #[test] + fn test_bytes_options_deser_if_fieldnorm_missing_indexed_true() { + let json = r#"{ + "indexed": true, + "fast": false, + "stored": false + }"#; + let bytes_options: BytesOptions = serde_json::from_str(json).unwrap(); + assert_eq!( + &bytes_options, + &BytesOptions { + indexed: true, + fieldnorms: true, + fast: false, + stored: false + } + ); + } + + #[test] + fn test_bytes_options_deser_if_fieldnorm_missing_indexed_false() { + let json = r#"{ + "indexed": false, + "stored": false, + "fast": false + }"#; + let bytes_options: BytesOptions = serde_json::from_str(json).unwrap(); + assert_eq!( + &bytes_options, + &BytesOptions { + indexed: false, + fieldnorms: false, + fast: false, + stored: false + } + ); + } + + #[test] + fn test_bytes_options_deser_if_fieldnorm_false_indexed_true() { + let json = r#"{ + "indexed": true, + "fieldnorms": false, + "fast": false, + "stored": false + }"#; + let bytes_options: BytesOptions = serde_json::from_str(json).unwrap(); + assert_eq!( + &bytes_options, + &BytesOptions { + indexed: true, + fieldnorms: false, + fast: false, + stored: false + } + ); + } + + #[test] + fn 
test_bytes_options_deser_if_fieldnorm_true_indexed_false() { + // this one is kind of useless, at least at the moment + let json = r#"{ + "indexed": false, + "fieldnorms": true, + "fast": false, + "stored": false + }"#; + let bytes_options: BytesOptions = serde_json::from_str(json).unwrap(); + assert_eq!( + &bytes_options, + &BytesOptions { + indexed: false, + fieldnorms: true, + fast: false, + stored: false + } + ); } } diff --git a/src/schema/facet_options.rs b/src/schema/facet_options.rs index 59fbf5a589..f5d78042c2 100644 --- a/src/schema/facet_options.rs +++ b/src/schema/facet_options.rs @@ -3,9 +3,10 @@ use serde::{Deserialize, Serialize}; use std::ops::BitOr; /// Define how a facet field should be handled by tantivy. +/// +/// Note that a Facet is always indexed and stored as a fastfield. #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)] pub struct FacetOptions { - indexed: bool, stored: bool, } @@ -15,11 +16,6 @@ impl FacetOptions { self.stored } - /// Returns true iff the value is indexed. - pub fn is_indexed(&self) -> bool { - self.indexed - } - /// Set the field as stored. /// /// Only the fields that are set as *stored* are @@ -28,15 +24,6 @@ impl FacetOptions { self.stored = true; self } - - /// Set the field as indexed. - /// - /// Setting a facet as indexed will generate - /// a walkable path. - pub fn set_indexed(mut self) -> FacetOptions { - self.indexed = true; - self - } } impl From<()> for FacetOptions { @@ -47,19 +34,7 @@ impl From<()> for FacetOptions { impl From for FacetOptions { fn from(_: StoredFlag) -> Self { - FacetOptions { - indexed: false, - stored: true, - } - } -} - -impl From for FacetOptions { - fn from(_: IndexedFlag) -> Self { - FacetOptions { - indexed: true, - stored: false, - } + FacetOptions { stored: true } } } @@ -69,7 +44,6 @@ impl> BitOr for FacetOptions { fn bitor(self, other: T) -> FacetOptions { let other = other.into(); FacetOptions { - indexed: self.indexed | other.indexed, stored: self.stored | other.stored, } } @@ -85,3 +59,20 @@ where Self::from(head_tail.head) | Self::from(head_tail.tail) } } + +impl From for FacetOptions { + fn from(_: IndexedFlag) -> Self { + FacetOptions { stored: false } + } +} + +#[cfg(test)] +mod tests { + use crate::schema::{FacetOptions, INDEXED}; + + #[test] + fn test_from_index_flag() { + let facet_option = FacetOptions::from(INDEXED); + assert_eq!(facet_option, FacetOptions::default()); + } +} diff --git a/src/schema/field_entry.rs b/src/schema/field_entry.rs index c53bd2039d..55929f464c 100644 --- a/src/schema/field_entry.rs +++ b/src/schema/field_entry.rs @@ -114,6 +114,11 @@ impl FieldEntry { self.field_type.is_indexed() } + /// Returns true iff the field is normed + pub fn has_fieldnorms(&self) -> bool { + self.field_type.has_fieldnorms() + } + /// Returns true iff the field is a int (signed or unsigned) fast field pub fn is_fast(&self) -> bool { match self.field_type { @@ -142,7 +147,10 @@ impl FieldEntry { #[cfg(test)] mod tests { use super::*; - use crate::schema::TEXT; + use crate::{ + schema::{Schema, STRING, TEXT}, + Index, + }; use serde_json; #[test] @@ -161,6 +169,7 @@ mod tests { "options": { "indexing": { "record": "position", + "fieldnorms": true, "tokenizer": "default" }, "stored": false @@ -187,6 +196,7 @@ mod tests { "options": { "indexing": { "record": "position", + "fieldnorms": true, "tokenizer": "default" }, "stored": false @@ -199,4 +209,19 @@ mod tests { _ => panic!("expected FieldType::Str"), } } + + #[test] + fn test_fieldnorms() -> crate::Result<()> { + let 
diff --git a/src/schema/field_entry.rs b/src/schema/field_entry.rs
index c53bd2039d..55929f464c 100644
--- a/src/schema/field_entry.rs
+++ b/src/schema/field_entry.rs
@@ -114,6 +114,11 @@ impl FieldEntry {
         self.field_type.is_indexed()
     }
 
+    /// Returns true iff the field is normed
+    pub fn has_fieldnorms(&self) -> bool {
+        self.field_type.has_fieldnorms()
+    }
+
     /// Returns true iff the field is a int (signed or unsigned) fast field
     pub fn is_fast(&self) -> bool {
         match self.field_type {
@@ -142,7 +147,10 @@ impl FieldEntry {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::schema::TEXT;
+    use crate::{
+        schema::{Schema, STRING, TEXT},
+        Index,
+    };
     use serde_json;
 
     #[test]
@@ -161,6 +169,7 @@ mod tests {
             "options": {
               "indexing": {
                 "record": "position",
+                "fieldnorms": true,
                 "tokenizer": "default"
               },
               "stored": false
@@ -187,6 +196,7 @@
             "options": {
               "indexing": {
                 "record": "position",
+                "fieldnorms": true,
                 "tokenizer": "default"
               },
               "stored": false
@@ -199,4 +209,19 @@
             _ => panic!("expected FieldType::Str"),
         }
     }
+
+    #[test]
+    fn test_fieldnorms() -> crate::Result<()> {
+        let mut schema_builder = Schema::builder();
+        let text = schema_builder.add_text_field("text", STRING);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+        let mut index_writer = index.writer_for_tests()?;
+        index_writer.add_document(doc!(text=>"abc"))?;
+        index_writer.commit()?;
+        let searcher = index.reader()?.searcher();
+        let err = searcher.segment_reader(0u32).get_fieldnorms_reader(text);
+        assert!(matches!(err, Err(crate::TantivyError::SchemaError(_))));
+        Ok(())
+    }
 }
diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs
index 6cc0f227e3..4cd86aefa7 100644
--- a/src/schema/field_type.rs
+++ b/src/schema/field_type.rs
@@ -92,11 +92,27 @@ impl FieldType {
             | FieldType::I64(ref int_options)
             | FieldType::F64(ref int_options) => int_options.is_indexed(),
             FieldType::Date(ref date_options) => date_options.is_indexed(),
-            FieldType::HierarchicalFacet(ref facet_options) => facet_options.is_indexed(),
+            FieldType::HierarchicalFacet(ref _facet_options) => true,
             FieldType::Bytes(ref bytes_options) => bytes_options.is_indexed(),
         }
     }
 
+    /// returns true iff the field is normed.
+    pub fn has_fieldnorms(&self) -> bool {
+        match *self {
+            FieldType::Str(ref text_options) => text_options
+                .get_indexing_options()
+                .map(|options| options.fieldnorms())
+                .unwrap_or(false),
+            FieldType::U64(ref int_options)
+            | FieldType::I64(ref int_options)
+            | FieldType::F64(ref int_options)
+            | FieldType::Date(ref int_options) => int_options.fieldnorms(),
+            FieldType::HierarchicalFacet(_) => false,
+            FieldType::Bytes(ref bytes_options) => bytes_options.fieldnorms(),
+        }
+    }
+
     /// Given a field configuration, return the maximal possible
     /// `IndexRecordOption` available.
     ///
@@ -116,13 +132,7 @@
                     None
                 }
             }
-            FieldType::HierarchicalFacet(ref facet_options) => {
-                if facet_options.is_indexed() {
-                    Some(IndexRecordOption::Basic)
-                } else {
-                    None
-                }
-            }
+            FieldType::HierarchicalFacet(ref _facet_options) => Some(IndexRecordOption::Basic),
             FieldType::Bytes(ref bytes_options) => {
                 if bytes_options.is_indexed() {
                     Some(IndexRecordOption::Basic)
diff --git a/src/schema/flags.rs b/src/schema/flags.rs
index 7c10e0c8be..106538aecc 100644
--- a/src/schema/flags.rs
+++ b/src/schema/flags.rs
@@ -20,7 +20,7 @@ pub const STORED: SchemaFlagList = SchemaFlagList {
 
 #[derive(Clone)]
 pub struct IndexedFlag;
-/// Flag to mark the field as indexed. An indexed field is searchable.
+/// Flag to mark the field as indexed. An indexed field is searchable and has a fieldnorm.
 ///
 /// The `INDEXED` flag can only be used when building `IntOptions` (`u64`, `i64` and `f64` fields)
 /// Of course, text fields can also be indexed... But this is expressed by using either the
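
Usage sketch (not part of the diff; field names are made up): the new `has_fieldnorms` accessor reflects the per-field setting, shown here with the `STRING`/`TEXT` defaults this patch establishes further down in `text_options.rs`.

```rust
use tantivy::schema::{Schema, STRING, TEXT};

fn inspect_fieldnorms() {
    let mut schema_builder = Schema::builder();
    let id = schema_builder.add_text_field("id", STRING);
    let body = schema_builder.add_text_field("body", TEXT);
    let schema = schema_builder.build();

    // STRING indexes raw tokens without fieldnorms; TEXT keeps them.
    assert!(!schema.get_field_entry(id).has_fieldnorms());
    assert!(schema.get_field_entry(body).has_fieldnorms());
}
```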
diff --git a/src/schema/int_options.rs b/src/schema/int_options.rs
index bb80492aa1..736dddf37f 100644
--- a/src/schema/int_options.rs
+++ b/src/schema/int_options.rs
@@ -15,14 +15,43 @@ pub enum Cardinality {
 }
 
 /// Define how an u64, i64, of f64 field should be handled by tantivy.
-#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, Default)]
+#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(from = "IntOptionsDeser")]
 pub struct IntOptions {
     indexed: bool,
+    // This boolean has no effect if the field is not marked as indexed too.
+    fieldnorms: bool, // This attribute only has an effect if indexed is true.
     #[serde(skip_serializing_if = "Option::is_none")]
     fast: Option<Cardinality>,
     stored: bool,
 }
 
+/// For backward compability we add an intermediary to interpret the
+/// lack of fieldnorms attribute as "true" iff indexed.
+///
+/// (Downstream, for the moment, this attribute is not used anyway if not indexed...)
+/// Note that: newly serialized IntOptions will include the new attribute.
+#[derive(Deserialize)]
+struct IntOptionsDeser {
+    indexed: bool,
+    #[serde(default)]
+    fieldnorms: Option<bool>, // This attribute only has an effect if indexed is true.
+    #[serde(default)]
+    fast: Option<Cardinality>,
+    stored: bool,
+}
+
+impl From<IntOptionsDeser> for IntOptions {
+    fn from(deser: IntOptionsDeser) -> Self {
+        IntOptions {
+            indexed: deser.indexed,
+            fieldnorms: deser.fieldnorms.unwrap_or(deser.indexed),
+            fast: deser.fast,
+            stored: deser.stored,
+        }
+    }
+}
+
 impl IntOptions {
     /// Returns true iff the value is stored.
     pub fn is_stored(&self) -> bool {
@@ -34,6 +63,11 @@
         self.indexed
     }
 
+    /// Returns true iff the field has fieldnorm.
+    pub fn fieldnorms(&self) -> bool {
+        self.fieldnorms && self.indexed
+    }
+
     /// Returns true iff the value is a fast field.
     pub fn is_fast(&self) -> bool {
         self.fast.is_some()
@@ -59,6 +93,15 @@
         self
     }
 
+    /// Set the field with fieldnorm.
+    ///
+    /// Setting an integer as fieldnorm will generate
+    /// the fieldnorm data for it.
+    pub fn set_fieldnorm(mut self) -> IntOptions {
+        self.fieldnorms = true;
+        self
+    }
+
     /// Set the field as a single-valued fast field.
     ///
     /// Fast fields are designed for random access.
@@ -79,6 +122,17 @@
     }
 }
 
+impl Default for IntOptions {
+    fn default() -> IntOptions {
+        IntOptions {
+            indexed: false,
+            fieldnorms: false,
+            stored: false,
+            fast: None,
+        }
+    }
+}
+
 impl From<()> for IntOptions {
     fn from(_: ()) -> IntOptions {
         IntOptions::default()
@@ -89,6 +143,7 @@ impl From<FastFlag> for IntOptions {
     fn from(_: FastFlag) -> Self {
         IntOptions {
             indexed: false,
+            fieldnorms: false,
             stored: false,
             fast: Some(Cardinality::SingleValue),
         }
@@ -99,6 +154,7 @@ impl From<StoredFlag> for IntOptions {
     fn from(_: StoredFlag) -> Self {
         IntOptions {
             indexed: false,
+            fieldnorms: false,
             stored: true,
             fast: None,
         }
@@ -109,6 +165,7 @@ impl From<IndexedFlag> for IntOptions {
     fn from(_: IndexedFlag) -> Self {
         IntOptions {
             indexed: true,
+            fieldnorms: true,
             stored: false,
             fast: None,
         }
@@ -122,6 +179,7 @@ impl<T: Into<IntOptions>> BitOr<T> for IntOptions {
         let other = other.into();
         IntOptions {
             indexed: self.indexed | other.indexed,
+            fieldnorms: self.fieldnorms | other.fieldnorms,
             stored: self.stored | other.stored,
             fast: self.fast.or(other.fast),
         }
@@ -138,3 +196,83 @@ where
         Self::from(head_tail.head) | Self::from(head_tail.tail)
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_int_options_deser_if_fieldnorm_missing_indexed_true() {
+        let json = r#"{
+            "indexed": true,
+            "stored": false
+        }"#;
+        let int_options: IntOptions = serde_json::from_str(json).unwrap();
+        assert_eq!(
+            &int_options,
+            &IntOptions {
+                indexed: true,
+                fieldnorms: true,
+                fast: None,
+                stored: false
+            }
+        );
+    }
+
+    #[test]
+    fn test_int_options_deser_if_fieldnorm_missing_indexed_false() {
+        let json = r#"{
+            "indexed": false,
+            "stored": false
+        }"#;
+        let int_options: IntOptions = serde_json::from_str(json).unwrap();
+        assert_eq!(
+            &int_options,
+            &IntOptions {
+                indexed: false,
+                fieldnorms: false,
+                fast: None,
+                stored: false
+            }
+        );
+    }
+
+    #[test]
+    fn test_int_options_deser_if_fieldnorm_false_indexed_true() {
+        let json = r#"{
+            "indexed": true,
+            "fieldnorms": false,
+            "stored": false
+        }"#;
+        let int_options: IntOptions = serde_json::from_str(json).unwrap();
+        assert_eq!(
+            &int_options,
+            &IntOptions {
+                indexed: true,
+                fieldnorms: false,
+                fast: None,
+                stored: false
+            }
+        );
+    }
+
+    #[test]
+    fn test_int_options_deser_if_fieldnorm_true_indexed_false() {
+        // this one is kind of useless, at least at the moment
+        let json = r#"{
+            "indexed": false,
+            "fieldnorms": true,
+            "stored": false
+        }"#;
+        let int_options: IntOptions = serde_json::from_str(json).unwrap();
+        assert_eq!(
+            &int_options,
+            &IntOptions {
+                indexed: false,
+                fieldnorms: true,
+                fast: None,
+                stored: false
+            }
+        );
+    }
+}
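
Usage sketch (not part of the diff; assumes `serde_json` is available to the caller): the `IntOptionsDeser` intermediary is what keeps schemas written by older tantivy versions loading, since a missing `fieldnorms` key falls back to the value of `indexed`.

```rust
use tantivy::schema::IntOptions;

fn load_legacy_int_options() -> serde_json::Result<()> {
    // Pre-0.17 serialization of an indexed u64 field: no "fieldnorms" key.
    let legacy_json = r#"{ "indexed": true, "stored": false }"#;
    let opts: IntOptions = serde_json::from_str(legacy_json)?;
    // fieldnorms is filled in from `indexed`, so scoring keeps working.
    assert!(opts.is_indexed());
    assert!(opts.fieldnorms());
    Ok(())
}
```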
diff --git a/src/schema/schema.rs b/src/schema/schema.rs
index 44b75c6c2d..6e1d42da9c 100644
--- a/src/schema/schema.rs
+++ b/src/schema/schema.rs
@@ -240,6 +240,11 @@ impl Schema {
         self.get_field_entry(field).name()
     }
 
+    /// Returns the number of fields in the schema.
+    pub fn num_fields(&self) -> usize {
+        self.0.fields.len()
+    }
+
     /// Return the list of all the `Field`s.
     pub fn fields(&self) -> impl Iterator<Item = (Field, &FieldEntry)> {
         self.0
@@ -427,6 +432,7 @@ mod tests {
             .set_fast(Cardinality::SingleValue);
         let score_options = IntOptions::default()
             .set_indexed()
+            .set_fieldnorm()
             .set_fast(Cardinality::SingleValue);
         schema_builder.add_text_field("title", TEXT);
         schema_builder.add_text_field("author", STRING);
@@ -442,6 +448,7 @@
             "options": {
               "indexing": {
                 "record": "position",
+                "fieldnorms": true,
                 "tokenizer": "default"
               },
               "stored": false
@@ -453,6 +460,7 @@
             "options": {
               "indexing": {
                 "record": "basic",
+                "fieldnorms": false,
                 "tokenizer": "raw"
               },
               "stored": false
@@ -463,6 +471,7 @@
             "type": "u64",
             "options": {
               "indexed": false,
+              "fieldnorms": false,
               "fast": "single",
               "stored": true
             }
@@ -472,6 +481,7 @@
             "type": "i64",
             "options": {
               "indexed": false,
+              "fieldnorms": false,
               "fast": "single",
               "stored": true
             }
@@ -481,6 +491,7 @@
             "type": "f64",
             "options": {
               "indexed": true,
+              "fieldnorms": true,
               "fast": "single",
               "stored": false
             }
@@ -743,6 +754,7 @@
         let timestamp_options = IntOptions::default()
             .set_stored()
             .set_indexed()
+            .set_fieldnorm()
             .set_fast(SingleValue);
         schema_builder.add_text_field("_id", id_options);
         schema_builder.add_date_field("_timestamp", timestamp_options);
@@ -754,6 +766,7 @@
             "options": {
               "indexing": {
                 "record": "position",
+                "fieldnorms": true,
                 "tokenizer": "default"
               },
               "stored": false
@@ -764,6 +777,7 @@
             "type": "i64",
             "options": {
               "indexed": false,
+              "fieldnorms": false,
               "fast": "single",
               "stored": true
             }
@@ -784,6 +798,7 @@
             "options": {
               "indexing": {
                 "record": "basic",
+                "fieldnorms": true,
                 "tokenizer": "raw"
               },
               "stored": true
@@ -794,6 +809,7 @@
             "type": "date",
             "options": {
               "indexed": true,
+              "fieldnorms": true,
               "fast": "single",
               "stored": true
             }
@@ -804,6 +820,7 @@
             "options": {
               "indexing": {
                 "record": "position",
+                "fieldnorms": true,
                 "tokenizer": "default"
               },
               "stored": false
@@ -814,6 +831,7 @@
             "type": "i64",
             "options": {
               "indexed": false,
+              "fieldnorms": false,
               "fast": "single",
               "stored": true
             }
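
Usage sketch (not part of the diff; field names are made up): the new `Schema::num_fields` accessor simply reports how many fields were declared.

```rust
use tantivy::schema::{Schema, STORED, TEXT};

fn count_fields() {
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("title", TEXT | STORED);
    schema_builder.add_text_field("body", TEXT);
    let schema = schema_builder.build();
    // Two fields were declared above.
    assert_eq!(schema.num_fields(), 2);
}
```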
diff --git a/src/schema/text_options.rs b/src/schema/text_options.rs
index 3a8516d2ba..ba972131cf 100644
--- a/src/schema/text_options.rs
+++ b/src/schema/text_options.rs
@@ -45,6 +45,7 @@ impl TextOptions {
 #[derive(Clone, PartialEq, Debug, Serialize, Deserialize)]
 pub struct TextFieldIndexing {
     record: IndexRecordOption,
+    fieldnorms: bool,
     tokenizer: Cow<'static, str>,
 }
 
@@ -53,6 +54,7 @@ impl Default for TextFieldIndexing {
         TextFieldIndexing {
             tokenizer: Cow::Borrowed("default"),
             record: IndexRecordOption::Basic,
+            fieldnorms: true,
         }
     }
 }
@@ -69,6 +71,17 @@
         &self.tokenizer
     }
 
+    /// Sets fieldnorms
+    pub fn set_fieldnorms(mut self, fieldnorms: bool) -> TextFieldIndexing {
+        self.fieldnorms = fieldnorms;
+        self
+    }
+
+    /// Returns true iff fieldnorms are stored.
+    pub fn fieldnorms(&self) -> bool {
+        self.fieldnorms
+    }
+
     /// Sets which information should be indexed with the tokens.
     ///
     /// See [IndexRecordOption](./enum.IndexRecordOption.html) for more detail.
@@ -89,6 +102,7 @@
 pub const STRING: TextOptions = TextOptions {
     indexing: Some(TextFieldIndexing {
         tokenizer: Cow::Borrowed("raw"),
+        fieldnorms: false,
         record: IndexRecordOption::Basic,
     }),
     stored: false,
@@ -98,6 +112,7 @@ pub const STRING: TextOptions = TextOptions {
 pub const TEXT: TextOptions = TextOptions {
     indexing: Some(TextFieldIndexing {
         tokenizer: Cow::Borrowed("default"),
+        fieldnorms: true,
         record: IndexRecordOption::WithFreqsAndPositions,
     }),
     stored: false,
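
Usage sketch (not part of the diff; the `title` field and the builder chain around `set_fieldnorms` are illustrative): `TextFieldIndexing::set_fieldnorms(false)` keeps a text field searchable with positions while skipping the fieldnorm data used for length normalization.

```rust
use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};

fn text_field_without_fieldnorms() -> Schema {
    let mut schema_builder = Schema::builder();
    let indexing = TextFieldIndexing::default()
        .set_tokenizer("default")
        .set_index_option(IndexRecordOption::WithFreqsAndPositions)
        // Do not generate fieldnorm data for this field.
        .set_fieldnorms(false);
    let options = TextOptions::default().set_indexing_options(indexing);
    schema_builder.add_text_field("title", options);
    schema_builder.build()
}
```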
diff --git a/src/schema/value.rs b/src/schema/value.rs
index b3b49a8eb5..73f643a89c 100644
--- a/src/schema/value.rs
+++ b/src/schema/value.rs
@@ -137,19 +137,18 @@ impl Value {
         }
     }
 
-    /// Returns the path value, provided the value is of the `Facet` type.
+    /// Returns the facet value, provided the value is of the `Facet` type.
     /// (Returns None if the value is not of the `Facet` type).
-    pub fn path(&self) -> Option<String> {
+    pub fn facet(&self) -> Option<&Facet> {
         if let Value::Facet(facet) = self {
-            Some(facet.to_path_string())
+            Some(facet)
         } else {
             None
         }
     }
 
     /// Returns the tokenized text, provided the value is of the `PreTokStr` type.
-    ///
-    /// Returns None if the value is not of the `PreTokStr` type.
+    /// (Returns None if the value is not of the `PreTokStr` type.)
     pub fn tokenized_text(&self) -> Option<&PreTokenizedString> {
         if let Value::PreTokStr(tokenized_text) = self {
             Some(tokenized_text)
@@ -159,8 +158,7 @@
     }
 
     /// Returns the u64-value, provided the value is of the `U64` type.
-    ///
-    /// Returns None if the value is not of the `U64` type.
+    /// (Returns None if the value is not of the `U64` type)
     pub fn u64_value(&self) -> Option<u64> {
         if let Value::U64(val) = self {
             Some(*val)
diff --git a/src/store/compression_lz4_block.rs b/src/store/compression_lz4_block.rs
index 6c126e6300..532eed2fa9 100644
--- a/src/store/compression_lz4_block.rs
+++ b/src/store/compression_lz4_block.rs
@@ -2,27 +2,30 @@ use std::io::{self};
 
 use core::convert::TryInto;
 use lz4_flex::{compress_into, decompress_into};
+use std::mem;
 
 #[inline]
+#[allow(clippy::uninit_vec)]
 pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
     compressed.clear();
-    let maximum_ouput_size = lz4_flex::block::get_maximum_output_size(uncompressed.len());
+    let maximum_ouput_size =
+        mem::size_of::<u32>() + lz4_flex::block::get_maximum_output_size(uncompressed.len());
     compressed.reserve(maximum_ouput_size);
-
     unsafe {
-        compressed.set_len(maximum_ouput_size + 4);
+        compressed.set_len(maximum_ouput_size);
     }
     let bytes_written = compress_into(uncompressed, &mut compressed[4..])
         .map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err.to_string()))?;
     let num_bytes = uncompressed.len() as u32;
     compressed[0..4].copy_from_slice(&num_bytes.to_le_bytes());
     unsafe {
-        compressed.set_len(bytes_written + 4);
+        compressed.set_len(bytes_written + mem::size_of::<u32>());
     }
     Ok(())
 }
 
 #[inline]
+#[allow(clippy::uninit_vec)]
 pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
     decompressed.clear();
     let uncompressed_size_bytes: &[u8; 4] = compressed
diff --git a/src/store/index/mod.rs b/src/store/index/mod.rs
index 9ef6692060..6535ff571f 100644
--- a/src/store/index/mod.rs
+++ b/src/store/index/mod.rs
@@ -47,7 +47,7 @@ mod tests {
 
     use crate::directory::OwnedBytes;
     use crate::indexer::NoMergePolicy;
-    use crate::schema::{SchemaBuilder, STORED, STRING};
+    use crate::schema::{SchemaBuilder, STORED, TEXT};
     use crate::store::index::Checkpoint;
     use crate::{DocAddress, DocId, Index, Term};
 
@@ -128,7 +128,7 @@
     #[test]
     fn test_merge_store_with_stacking_reproducing_issue969() -> crate::Result<()> {
         let mut schema_builder = SchemaBuilder::default();
-        let text = schema_builder.add_text_field("text", STORED | STRING);
+        let text = schema_builder.add_text_field("text", STORED | TEXT);
         let body = schema_builder.add_text_field("body", STORED);
         let schema = schema_builder.build();
         let index = Index::create_in_ram(schema);
diff --git a/src/store/index/skip_index.rs b/src/store/index/skip_index.rs
index b69df319aa..4eb3e3b326 100644
--- a/src/store/index/skip_index.rs
+++ b/src/store/index/skip_index.rs
@@ -19,9 +19,7 @@ impl<'a> Iterator for LayerCursor<'a> {
                 return None;
             }
             let (block_mut, remaining_mut) = (&mut self.block, &mut self.remaining);
-            if block_mut.deserialize(remaining_mut).is_err() {
-                return None;
-            }
+            block_mut.deserialize(remaining_mut).ok()?;
             self.cursor = 0;
         }
         let res = Some(self.block.get(self.cursor));