diff --git a/src/fastfield/bytes/mod.rs b/src/fastfield/bytes/mod.rs index 16b268e64a..8373c63e4b 100644 --- a/src/fastfield/bytes/mod.rs +++ b/src/fastfield/bytes/mod.rs @@ -7,7 +7,7 @@ pub use self::writer::BytesFastFieldWriter; #[cfg(test)] mod tests { use crate::schema::{BytesOptions, IndexRecordOption, Schema, Value}; - use crate::{query::TermQuery, schema::FAST, schema::INDEXED, schema::STORED}; + use crate::{query::TermQuery, schema::FAST, schema::INDEXED, schema::NORMED, schema::STORED}; use crate::{DocAddress, DocSet, Index, Searcher, Term}; use std::ops::Deref; @@ -80,7 +80,7 @@ mod tests { #[test] fn test_index_bytes() -> crate::Result<()> { - let searcher = create_index_for_test(INDEXED)?; + let searcher = create_index_for_test(INDEXED | NORMED)?; assert_eq!(searcher.num_docs(), 1); let field = searcher.schema().get_field("string_bytes").unwrap(); let term = Term::from_field_bytes(field, b"lucene".as_ref()); diff --git a/src/fastfield/multivalued/mod.rs b/src/fastfield/multivalued/mod.rs index 235255b689..c84a5dff88 100644 --- a/src/fastfield/multivalued/mod.rs +++ b/src/fastfield/multivalued/mod.rs @@ -59,6 +59,7 @@ mod tests { IntOptions::default() .set_fast(Cardinality::MultiValues) .set_indexed() + .set_normed() .set_stored(), ); let time_i = diff --git a/src/fieldnorm/mod.rs b/src/fieldnorm/mod.rs index 7450376c73..fcdec269bc 100644 --- a/src/fieldnorm/mod.rs +++ b/src/fieldnorm/mod.rs @@ -26,3 +26,97 @@ pub use self::serializer::FieldNormsSerializer; pub use self::writer::FieldNormsWriter; use self::code::{fieldnorm_to_id, id_to_fieldnorm}; + +#[cfg(test)] +mod tests { + use crate::common::CompositeFile; + use crate::fieldnorm::FieldNormReader; + use crate::fieldnorm::FieldNormsSerializer; + use crate::fieldnorm::FieldNormsWriter; + use crate::{ + directory::{Directory, RAMDirectory, WritePtr}, + schema::{STRING, TEXT}, + }; + use once_cell::sync::Lazy; + use std::{ + panic::{catch_unwind, AssertUnwindSafe}, + path::Path, + }; + + use 
crate::schema::{Field, Schema, STORED}; + + pub static SCHEMA: Lazy = Lazy::new(|| { + let mut schema_builder = Schema::builder(); + schema_builder.add_text_field("field", STORED); + schema_builder.add_text_field("txt_field", TEXT); + schema_builder.add_text_field("str_field", STRING); + schema_builder.build() + }); + + pub static FIELD: Lazy = Lazy::new(|| SCHEMA.get_field("field").unwrap()); + pub static TXT_FIELD: Lazy = Lazy::new(|| SCHEMA.get_field("txt_field").unwrap()); + pub static STR_FIELD: Lazy = Lazy::new(|| SCHEMA.get_field("str_field").unwrap()); + + #[ignore] + #[test] + pub fn test_fieldnorm_bug() -> crate::Result<()> { + let path = Path::new("test"); + let directory: RAMDirectory = RAMDirectory::create(); + { + let write: WritePtr = directory.open_write(Path::new("test"))?; + let serializer = FieldNormsSerializer::from_write(write)?; + let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA); + fieldnorm_writers.fill_up_to_max_doc(1u32); + fieldnorm_writers.record(0u32, *TXT_FIELD, 5); + fieldnorm_writers.record(1u32, *TXT_FIELD, 3); + fieldnorm_writers.serialize(serializer)?; + } + let file = directory.open_read(&path)?; + { + let fields_composite = CompositeFile::open(&file)?; + assert!(fields_composite.open_read(*FIELD).is_none()); + assert!(fields_composite.open_read(*TXT_FIELD).is_none()); + assert!(fields_composite.open_read(*STR_FIELD).is_none()); + let data = fields_composite.open_read(*TXT_FIELD).unwrap(); + let fieldnorm_reader = FieldNormReader::open(data)?; + assert_eq!(fieldnorm_reader.fieldnorm(0u32), 5u32); + assert_eq!(fieldnorm_reader.fieldnorm(1u32), 3u32); + } + Ok(()) + } + + #[test] + pub fn test_fieldnorm() -> crate::Result<()> { + let path = Path::new("test"); + let directory: RAMDirectory = RAMDirectory::create(); + { + let write: WritePtr = directory.open_write(Path::new("test"))?; + let serializer = FieldNormsSerializer::from_write(write)?; + let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA); + 
fieldnorm_writers.fill_up_to_max_doc(1u32); + fieldnorm_writers.record(1u32, *TXT_FIELD, 3); + fieldnorm_writers.serialize(serializer)?; + } + let file = directory.open_read(&path)?; + { + let fields_composite = CompositeFile::open(&file)?; + assert!(fields_composite.open_read(*FIELD).is_none()); + assert!(fields_composite.open_read(*STR_FIELD).is_none()); + let data = fields_composite.open_read(*TXT_FIELD).unwrap(); + let fieldnorm_reader = FieldNormReader::open(data)?; + assert_eq!(fieldnorm_reader.fieldnorm(1u32), 3u32); + } + Ok(()) + } + + #[test] + pub fn test_fail_fieldnorm_cannot_registered_twice() { + let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA); + fieldnorm_writers.fill_up_to_max_doc(1u32); + fieldnorm_writers.record(1u32, *TXT_FIELD, 5); + let result = catch_unwind(AssertUnwindSafe(|| { + fieldnorm_writers.record(1u32, *TXT_FIELD, 3) + })); + assert!(result.is_err()); + } +} diff --git a/src/fieldnorm/writer.rs b/src/fieldnorm/writer.rs index 061522e5c8..cdd1493fcf 100644 --- a/src/fieldnorm/writer.rs +++ b/src/fieldnorm/writer.rs @@ -23,7 +23,7 @@ impl FieldNormsWriter { schema .fields() .filter_map(|(field, field_entry)| { - if field_entry.is_indexed() { + if field_entry.is_indexed() && field_entry.is_normed() { Some(field) } else { None diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index dfe0c04c4b..79d63bf72a 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -211,7 +211,9 @@ impl SegmentWriter { ) }; - self.fieldnorms_writer.record(doc_id, field, num_tokens); + if field_entry.is_normed() { + self.fieldnorms_writer.record(doc_id, field, num_tokens); + } } FieldType::U64(_) => { for field_value in field_values { diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index a7c583c29a..1aa7fe6483 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -107,7 +107,9 @@ impl TermWeight { let field = 
self.term.field(); let inverted_index = reader.inverted_index(field)?; let fieldnorm_reader = if self.scoring_enabled { - reader.get_fieldnorms_reader(field)? + reader + .get_fieldnorms_reader(field) + .unwrap_or(FieldNormReader::constant(reader.max_doc(), 1)) } else { FieldNormReader::constant(reader.max_doc(), 1) }; diff --git a/src/schema/bytes_options.rs b/src/schema/bytes_options.rs index 5ab7501742..1c565ae6ff 100644 --- a/src/schema/bytes_options.rs +++ b/src/schema/bytes_options.rs @@ -1,11 +1,12 @@ use serde::{Deserialize, Serialize}; use std::ops::BitOr; -use super::flags::{FastFlag, IndexedFlag, SchemaFlagList, StoredFlag}; +use super::flags::{FastFlag, IndexedFlag, NormedFlag, SchemaFlagList, StoredFlag}; /// Define how an a bytes field should be handled by tantivy. #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct BytesOptions { indexed: bool, + normed: bool, fast: bool, stored: bool, } @@ -16,6 +17,11 @@ impl BytesOptions { self.indexed } + /// Returns true iff the value is normed. + pub fn is_normed(&self) -> bool { + self.normed + } + /// Returns true iff the value is a fast field. pub fn is_fast(&self) -> bool { self.fast @@ -35,6 +41,15 @@ impl BytesOptions { self } + /// Set the field as normed. + /// + /// Setting a bytes field as normed will generate + /// the fieldnorm data for it. + pub fn set_normed(mut self) -> BytesOptions { + self.normed = true; + self + } + /// Set the field as a single-valued fast field. /// /// Fast fields are designed for random access. 
@@ -60,6 +75,7 @@ impl Default for BytesOptions { fn default() -> BytesOptions { BytesOptions { indexed: false, + normed: false, fast: false, stored: false, } @@ -73,6 +89,7 @@ impl> BitOr for BytesOptions { let other = other.into(); BytesOptions { indexed: self.indexed | other.indexed, + normed: self.normed | other.normed, stored: self.stored | other.stored, fast: self.fast | other.fast, } @@ -89,6 +106,7 @@ impl From for BytesOptions { fn from(_: FastFlag) -> Self { BytesOptions { indexed: false, + normed: false, stored: false, fast: true, } @@ -99,6 +117,7 @@ impl From for BytesOptions { fn from(_: StoredFlag) -> Self { BytesOptions { indexed: false, + normed: false, stored: true, fast: false, } @@ -109,6 +128,18 @@ impl From for BytesOptions { fn from(_: IndexedFlag) -> Self { BytesOptions { indexed: true, + normed: false, + stored: false, + fast: false, + } + } +} + +impl From for BytesOptions { + fn from(_: NormedFlag) -> Self { + BytesOptions { + indexed: false, + normed: true, stored: false, fast: false, } @@ -128,12 +159,13 @@ where #[cfg(test)] mod tests { - use crate::schema::{BytesOptions, FAST, INDEXED, STORED}; + use crate::schema::{BytesOptions, FAST, INDEXED, NORMED, STORED}; #[test] fn test_bytes_option_fast_flag() { assert_eq!(BytesOptions::default().set_fast(), FAST.into()); assert_eq!(BytesOptions::default().set_indexed(), INDEXED.into()); + assert_eq!(BytesOptions::default().set_normed(), NORMED.into()); assert_eq!(BytesOptions::default().set_stored(), STORED.into()); } #[test] @@ -157,8 +189,10 @@ mod tests { assert!(!BytesOptions::default().is_stored()); assert!(!BytesOptions::default().is_fast()); assert!(!BytesOptions::default().is_indexed()); + assert!(!BytesOptions::default().is_normed()); assert!(BytesOptions::default().set_stored().is_stored()); assert!(BytesOptions::default().set_fast().is_fast()); assert!(BytesOptions::default().set_indexed().is_indexed()); + assert!(BytesOptions::default().set_normed().is_normed()); } } diff --git 
a/src/schema/field_entry.rs b/src/schema/field_entry.rs index 67256bfbd3..230169c63a 100644 --- a/src/schema/field_entry.rs +++ b/src/schema/field_entry.rs @@ -113,6 +113,11 @@ impl FieldEntry { } } + /// Returns true iff the field is normed + pub fn is_normed(&self) -> bool { + self.field_type.is_normed() + } + /// Returns true iff the field is a int (signed or unsigned) fast field pub fn is_fast(&self) -> bool { match self.field_type { diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index b3b9227047..c18ff1a6c2 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -93,6 +93,22 @@ impl FieldType { } } + /// returns true iff the field is normed. + pub fn is_normed(&self) -> bool { + match *self { + FieldType::Str(ref text_options) => text_options.get_indexing_options().map_or_else( + || false, + |opt| opt.index_option() != IndexRecordOption::Basic, + ), + FieldType::U64(ref int_options) + | FieldType::I64(ref int_options) + | FieldType::F64(ref int_options) + | FieldType::Date(ref int_options) => int_options.is_normed(), + FieldType::HierarchicalFacet(_) => false, + FieldType::Bytes(ref bytes_options) => bytes_options.is_normed(), + } + } + /// Given a field configuration, return the maximal possible /// `IndexRecordOption` available. /// diff --git a/src/schema/flags.rs b/src/schema/flags.rs index 758e680f5c..09dd95ff39 100644 --- a/src/schema/flags.rs +++ b/src/schema/flags.rs @@ -30,6 +30,18 @@ pub const INDEXED: SchemaFlagList = SchemaFlagList { tail: (), }; +#[derive(Clone)] +pub struct NormedFlag; +/// Flag to mark the field as normed. +/// +/// The `NORMED` flag can be used when building `IntOptions` or `BytesOptions`. +/// For text fields it is not consulted: whether a text field is normed is derived from +/// its indexing options (as set through e.g. the `STRING` or `TEXT` flags). 
+pub const NORMED: SchemaFlagList = SchemaFlagList { + head: NormedFlag, + tail: (), +}; + #[derive(Clone)] pub struct FastFlag; /// Flag to mark the field as a fast field (similar to Lucene's DocValues) diff --git a/src/schema/int_options.rs b/src/schema/int_options.rs index a5e3b86f27..e79e10c20d 100644 --- a/src/schema/int_options.rs +++ b/src/schema/int_options.rs @@ -2,6 +2,8 @@ use crate::schema::flags::{FastFlag, IndexedFlag, SchemaFlagList, StoredFlag}; use serde::{Deserialize, Serialize}; use std::ops::BitOr; +use super::flags::NormedFlag; + /// Express whether a field is single-value or multi-valued. #[derive(Clone, Copy, PartialEq, Eq, Debug, Serialize, Deserialize)] pub enum Cardinality { @@ -18,6 +20,7 @@ pub enum Cardinality { #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct IntOptions { indexed: bool, + normed: bool, #[serde(skip_serializing_if = "Option::is_none")] fast: Option, stored: bool, @@ -34,6 +37,11 @@ impl IntOptions { self.indexed } + /// Returns true iff the value is normed. + pub fn is_normed(&self) -> bool { + self.normed + } + /// Returns true iff the value is a fast field. pub fn is_fast(&self) -> bool { self.fast.is_some() @@ -57,6 +65,15 @@ impl IntOptions { self } + /// Set the field as normed. + /// + /// Setting an integer as normed will generate + /// the fieldnorm data for it. + pub fn set_normed(mut self) -> IntOptions { + self.normed = true; + self + } + /// Set the field as a single-valued fast field. /// /// Fast fields are designed for random access. 
@@ -81,6 +98,7 @@ impl Default for IntOptions { fn default() -> IntOptions { IntOptions { indexed: false, + normed: false, stored: false, fast: None, } @@ -97,6 +115,7 @@ impl From for IntOptions { fn from(_: FastFlag) -> Self { IntOptions { indexed: false, + normed: false, stored: false, fast: Some(Cardinality::SingleValue), } @@ -107,6 +126,7 @@ impl From for IntOptions { fn from(_: StoredFlag) -> Self { IntOptions { indexed: false, + normed: false, stored: true, fast: None, } @@ -117,6 +137,18 @@ impl From for IntOptions { fn from(_: IndexedFlag) -> Self { IntOptions { indexed: true, + normed: false, + stored: false, + fast: None, + } + } +} + +impl From for IntOptions { + fn from(_: NormedFlag) -> Self { + IntOptions { + indexed: false, + normed: true, stored: false, fast: None, } @@ -130,6 +162,7 @@ impl> BitOr for IntOptions { let other = other.into(); IntOptions { indexed: self.indexed | other.indexed, + normed: self.normed | other.normed, stored: self.stored | other.stored, fast: self.fast.or(other.fast), } diff --git a/src/schema/mod.rs b/src/schema/mod.rs index 1c1a62171b..26635552c5 100644 --- a/src/schema/mod.rs +++ b/src/schema/mod.rs @@ -146,7 +146,7 @@ pub use self::text_options::STRING; pub use self::text_options::TEXT; pub use self::bytes_options::BytesOptions; -pub use self::flags::{FAST, INDEXED, STORED}; +pub use self::flags::{FAST, INDEXED, NORMED, STORED}; pub use self::int_options::Cardinality; pub use self::int_options::IntOptions; diff --git a/src/schema/schema.rs b/src/schema/schema.rs index 70263a8f57..77ccf737f3 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -433,6 +433,7 @@ mod tests { .set_fast(Cardinality::SingleValue); let score_options = IntOptions::default() .set_indexed() + .set_normed() .set_fast(Cardinality::SingleValue); schema_builder.add_text_field("title", TEXT); schema_builder.add_text_field("author", STRING); @@ -469,6 +470,7 @@ mod tests { "type": "u64", "options": { "indexed": false, + "normed": false, 
"fast": "single", "stored": true } @@ -478,6 +480,7 @@ mod tests { "type": "i64", "options": { "indexed": false, + "normed": false, "fast": "single", "stored": true } @@ -487,6 +490,7 @@ mod tests { "type": "f64", "options": { "indexed": true, + "normed": true, "fast": "single", "stored": false } @@ -752,6 +756,7 @@ mod tests { let timestamp_options = IntOptions::default() .set_stored() .set_indexed() + .set_normed() .set_fast(SingleValue); schema_builder.add_text_field("_id", id_options); schema_builder.add_date_field("_timestamp", timestamp_options); @@ -773,6 +778,7 @@ mod tests { "type": "i64", "options": { "indexed": false, + "normed": false, "fast": "single", "stored": true } @@ -803,6 +809,7 @@ mod tests { "type": "date", "options": { "indexed": true, + "normed": true, "fast": "single", "stored": true } @@ -823,6 +830,7 @@ mod tests { "type": "i64", "options": { "indexed": false, + "normed": false, "fast": "single", "stored": true } diff --git a/src/space_usage/mod.rs b/src/space_usage/mod.rs index 3bad8f8b05..8256644947 100644 --- a/src/space_usage/mod.rs +++ b/src/space_usage/mod.rs @@ -296,7 +296,7 @@ mod test { use crate::core::Index; use crate::schema::Field; use crate::schema::Schema; - use crate::schema::{FAST, INDEXED, STORED, TEXT}; + use crate::schema::{FAST, INDEXED, NORMED, STORED, TEXT}; use crate::space_usage::ByteCount; use crate::space_usage::PerFieldSpaceUsage; use crate::Term; @@ -331,7 +331,7 @@ mod test { #[test] fn test_fast_indexed() { let mut schema_builder = Schema::builder(); - let name = schema_builder.add_u64_field("name", FAST | INDEXED); + let name = schema_builder.add_u64_field("name", FAST | INDEXED | NORMED); let schema = schema_builder.build(); let index = Index::create_in_ram(schema.clone()); @@ -448,7 +448,7 @@ mod test { #[test] fn test_deletes() -> crate::Result<()> { let mut schema_builder = Schema::builder(); - let name = schema_builder.add_u64_field("name", INDEXED); + let name = schema_builder.add_u64_field("name", 
INDEXED | NORMED); let schema = schema_builder.build(); let index = Index::create_in_ram(schema.clone()); diff --git a/src/store/index/mod.rs b/src/store/index/mod.rs index ae6431285f..a7d4d27e80 100644 --- a/src/store/index/mod.rs +++ b/src/store/index/mod.rs @@ -47,7 +47,7 @@ mod tests { use crate::directory::OwnedBytes; use crate::indexer::NoMergePolicy; - use crate::schema::{SchemaBuilder, STORED, STRING}; + use crate::schema::{SchemaBuilder, STORED, TEXT}; use crate::store::index::Checkpoint; use crate::{DocAddress, DocId, Index, Term}; @@ -128,7 +128,7 @@ mod tests { #[test] fn test_merge_store_with_stacking_reproducing_issue969() -> crate::Result<()> { let mut schema_builder = SchemaBuilder::default(); - let text = schema_builder.add_text_field("text", STORED | STRING); + let text = schema_builder.add_text_field("text", STORED | TEXT); let body = schema_builder.add_text_field("body", STORED); let schema = schema_builder.build(); let index = Index::create_in_ram(schema);