diff --git a/src/core/index_meta.rs b/src/core/index_meta.rs index 9a4c0b1c79..9fa7731e41 100644 --- a/src/core/index_meta.rs +++ b/src/core/index_meta.rs @@ -394,7 +394,7 @@ mod tests { let json = serde_json::ser::to_string(&index_metas).expect("serialization failed"); assert_eq!( json, - r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","tokenizer":"default"},"stored":false}}],"opstamp":0}"# + r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false}}],"opstamp":0}"# ); } } diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index cad61dc7b9..80411d0239 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -127,7 +127,7 @@ impl SegmentReader { self.fieldnorm_readers.get_field(field)?.ok_or_else(|| { let field_name = self.schema.get_field_name(field); let err_msg = format!( - "Field norm not found for field {:?}. Was it marked as indexed during indexing?", + "Field norm not found for field {:?}. 
Was it marked as normed during indexing?", field_name ); crate::TantivyError::SchemaError(err_msg) diff --git a/src/fastfield/bytes/mod.rs b/src/fastfield/bytes/mod.rs index c05dd68d7d..97862b29cc 100644 --- a/src/fastfield/bytes/mod.rs +++ b/src/fastfield/bytes/mod.rs @@ -7,7 +7,7 @@ pub use self::writer::BytesFastFieldWriter; #[cfg(test)] mod tests { use crate::schema::{BytesOptions, IndexRecordOption, Schema, Value}; - use crate::{query::TermQuery, schema::FAST, schema::INDEXED, schema::NORMED, schema::STORED}; + use crate::{query::TermQuery, schema::FAST, schema::INDEXED, schema::STORED}; use crate::{DocAddress, DocSet, Index, Searcher, Term}; use std::ops::Deref; @@ -80,7 +80,7 @@ mod tests { #[test] fn test_index_bytes() -> crate::Result<()> { - let searcher = create_index_for_test(INDEXED | NORMED)?; + let searcher = create_index_for_test(INDEXED)?; assert_eq!(searcher.num_docs(), 1); let field = searcher.schema().get_field("string_bytes").unwrap(); let term = Term::from_field_bytes(field, b"lucene".as_ref()); diff --git a/src/fastfield/multivalued/mod.rs b/src/fastfield/multivalued/mod.rs index 6e214f3b5b..5c9c8a32b1 100644 --- a/src/fastfield/multivalued/mod.rs +++ b/src/fastfield/multivalued/mod.rs @@ -68,7 +68,7 @@ mod tests { IntOptions::default() .set_fast(Cardinality::MultiValues) .set_indexed() - .set_normed() + .set_fieldnorm() .set_stored(), ); let time_i = diff --git a/src/fieldnorm/mod.rs b/src/fieldnorm/mod.rs index 372f10e4d4..9edc695819 100644 --- a/src/fieldnorm/mod.rs +++ b/src/fieldnorm/mod.rs @@ -29,19 +29,16 @@ use self::code::{fieldnorm_to_id, id_to_fieldnorm}; #[cfg(test)] mod tests { - use crate::common::CompositeFile; + use crate::directory::CompositeFile; use crate::fieldnorm::FieldNormReader; use crate::fieldnorm::FieldNormsSerializer; use crate::fieldnorm::FieldNormsWriter; use crate::{ - directory::{Directory, RAMDirectory, WritePtr}, + directory::{Directory, RamDirectory, WritePtr}, schema::{STRING, TEXT}, }; use 
once_cell::sync::Lazy; - use std::{ - panic::{catch_unwind, AssertUnwindSafe}, - path::Path, - }; + use std::path::Path; use crate::schema::{Field, Schema, STORED}; @@ -57,45 +54,25 @@ mod tests { pub static TXT_FIELD: Lazy = Lazy::new(|| SCHEMA.get_field("txt_field").unwrap()); pub static STR_FIELD: Lazy = Lazy::new(|| SCHEMA.get_field("str_field").unwrap()); - #[ignore] #[test] - pub fn test_fieldnorm_bug() -> crate::Result<()> { - let path = Path::new("test"); - let directory: RAMDirectory = RAMDirectory::create(); - { - let write: WritePtr = directory.open_write(Path::new("test"))?; - let serializer = FieldNormsSerializer::from_write(write)?; - let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA); - fieldnorm_writers.fill_up_to_max_doc(1u32); - fieldnorm_writers.record(0u32, *TXT_FIELD, 5); - fieldnorm_writers.record(1u32, *TXT_FIELD, 3); - fieldnorm_writers.serialize(serializer)?; - } - let file = directory.open_read(&path)?; - { - let fields_composite = CompositeFile::open(&file)?; - assert!(fields_composite.open_read(*FIELD).is_none()); - assert!(fields_composite.open_read(*TXT_FIELD).is_none()); - assert!(fields_composite.open_read(*STR_FIELD).is_none()); - let data = fields_composite.open_read(*TXT_FIELD).unwrap(); - let fieldnorm_reader = FieldNormReader::open(data)?; - assert_eq!(fieldnorm_reader.fieldnorm(0u32), 5u32); - assert_eq!(fieldnorm_reader.fieldnorm(1u32), 3u32); - } - Ok(()) + #[should_panic(expected = "Cannot register a given fieldnorm twice")] + pub fn test_should_panic_when_recording_fieldnorm_twice_for_same_doc() { + let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA); + fieldnorm_writers.record(0u32, *TXT_FIELD, 5); + fieldnorm_writers.record(0u32, *TXT_FIELD, 3); } #[test] pub fn test_fieldnorm() -> crate::Result<()> { let path = Path::new("test"); - let directory: RAMDirectory = RAMDirectory::create(); + let directory: RamDirectory = RamDirectory::create(); { let write: WritePtr = 
directory.open_write(Path::new("test"))?; let serializer = FieldNormsSerializer::from_write(write)?; let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA); - fieldnorm_writers.fill_up_to_max_doc(1u32); - fieldnorm_writers.record(1u32, *TXT_FIELD, 3); - fieldnorm_writers.serialize(serializer)?; + fieldnorm_writers.record(2u32, *TXT_FIELD, 5); + fieldnorm_writers.record(3u32, *TXT_FIELD, 3); + fieldnorm_writers.serialize(serializer, None)?; } let file = directory.open_read(&path)?; { @@ -104,19 +81,16 @@ mod tests { assert!(fields_composite.open_read(*STR_FIELD).is_none()); let data = fields_composite.open_read(*TXT_FIELD).unwrap(); let fieldnorm_reader = FieldNormReader::open(data)?; - assert_eq!(fieldnorm_reader.fieldnorm(1u32), 3u32); + assert_eq!(fieldnorm_reader.fieldnorm(0u32), 0u32); + assert_eq!(fieldnorm_reader.fieldnorm(1u32), 0u32); + assert_eq!(fieldnorm_reader.fieldnorm(2u32), 5u32); + assert_eq!(fieldnorm_reader.fieldnorm(3u32), 3u32); } Ok(()) } #[test] - pub fn test_fail_fieldnorm_cannot_registered_twice() { - let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA); - fieldnorm_writers.fill_up_to_max_doc(1u32); - fieldnorm_writers.record(1u32, *TXT_FIELD, 5); - let result = catch_unwind(AssertUnwindSafe(|| { - fieldnorm_writers.record(1u32, *TXT_FIELD, 3) - })); - assert!(result.is_err()); + pub fn test_retrieve_fields_with_norms() { + //TODO } } diff --git a/src/fieldnorm/writer.rs b/src/fieldnorm/writer.rs index ce756fac4c..dd774fc87a 100644 --- a/src/fieldnorm/writer.rs +++ b/src/fieldnorm/writer.rs @@ -23,7 +23,7 @@ impl FieldNormsWriter { schema .fields() .filter_map(|(field, field_entry)| { - if field_entry.is_indexed() && field_entry.is_normed() { + if field_entry.is_indexed() && field_entry.has_fieldnorms() { Some(field) } else { None diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 6d1f559190..19e17e565c 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -41,17 +41,29 @@ use 
tantivy_bitpacker::minmax; /// We do not allow segments with more than pub const MAX_DOC_LIMIT: u32 = 1 << 31; -fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::Result { +fn estimate_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::Result { let mut total_tokens = 0u64; let mut count: [usize; 256] = [0; 256]; for reader in readers { + // When there are deletes, we use an approximation either + // - by using the fieldnorm + // - or by multiplying the total number of tokens by the ratio of alive docs (1 - deleted docs / max doc). if reader.has_deletes() { - // if there are deletes, then we use an approximation - // using the fieldnorm - let fieldnorms_reader = reader.get_fieldnorms_reader(field)?; - for doc in reader.doc_ids_alive() { - let fieldnorm_id = fieldnorms_reader.fieldnorm_id(doc); - count[fieldnorm_id as usize] += 1; + if reader + .schema() + .get_field_entry(field) + .field_type() + .has_fieldnorms() + { + let fieldnorms_reader = reader.get_fieldnorms_reader(field)?; + for doc in reader.doc_ids_alive() { + let fieldnorm_id = fieldnorms_reader.fieldnorm_id(doc); + count[fieldnorm_id as usize] += 1; + } + } else { + let segment_num_tokens = reader.inverted_index(field)?.total_num_tokens(); + let ratio = 1f64 - reader.num_deleted_docs() as f64 / reader.max_doc() as f64; + total_tokens += (segment_num_tokens as f64 * ratio) as u64; } } else { total_tokens += reader.inverted_index(field)?.total_num_tokens(); @@ -851,10 +863,11 @@ impl IndexMerger { segment_map[*old_doc_id as usize] = Some(new_doc_id as DocId); } - // The total number of tokens will only be exact when there has been no deletes. + // The total number of tokens will only be exact when there has been no deletes + // and if the field has a norm. // // Otherwise, we approximate by removing deleted documents proportionally.
- let total_num_tokens: u64 = compute_total_num_tokens(&self.readers, indexed_field)?; + let total_num_tokens: u64 = estimate_total_num_tokens(&self.readers, indexed_field)?; // Create the total list of doc ids // by stacking the doc ids from the different segment. diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 491f0ca7aa..23747661f0 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -242,7 +242,7 @@ impl SegmentWriter { ) }; - if field_entry.is_normed() { + if field_entry.has_fieldnorms() { self.fieldnorms_writer.record(doc_id, field, num_tokens); } } diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index 6f9e0f31ff..1df91a7d4e 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -308,10 +308,8 @@ pub struct PostingsSerializer { fieldnorm_reader: Option, bm25_weight: Option, - - num_docs: u32, // Number of docs in the segment avg_fieldnorm: Score, // Average number of term in the field for that segment. - // this value is used to compute the block wand information. + // this value is used to compute the block wand information. } impl PostingsSerializer { @@ -321,10 +319,6 @@ impl PostingsSerializer { mode: IndexRecordOption, fieldnorm_reader: Option, ) -> PostingsSerializer { - let num_docs = fieldnorm_reader - .as_ref() - .map(|fieldnorm_reader| fieldnorm_reader.num_docs()) - .unwrap_or(0u32); PostingsSerializer { output_write: CountingWriter::wrap(write), @@ -339,21 +333,25 @@ impl PostingsSerializer { fieldnorm_reader, bm25_weight: None, - - num_docs, avg_fieldnorm, } } + /// Returns the number of documents in the segment currently being serialized. + /// This function may return `None` if there are no fieldnorm for that field. 
+ fn num_docs_in_segment(&self) -> Option { + self.fieldnorm_reader + .as_ref() + .map(|reader| reader.num_docs()) + } + pub fn new_term(&mut self, term_doc_freq: u32) { - if self.mode.has_freq() && self.num_docs > 0 { - let bm25_weight = Bm25Weight::for_one_term( - term_doc_freq as u64, - self.num_docs as u64, - self.avg_fieldnorm, - ); - self.bm25_weight = Some(bm25_weight); + if !self.mode.has_freq() { + return; } + self.bm25_weight = self.num_docs_in_segment().map(|num_docs| { + Bm25Weight::for_one_term(term_doc_freq as u64, num_docs as u64, self.avg_fieldnorm) + }); } fn write_block(&mut self) { diff --git a/src/query/term_query/term_query.rs b/src/query/term_query/term_query.rs index 8f003e04d4..57fc16fb31 100644 --- a/src/query/term_query/term_query.rs +++ b/src/query/term_query/term_query.rs @@ -93,20 +93,20 @@ impl TermQuery { scoring_enabled: bool, ) -> crate::Result { let term = self.term.clone(); - let field_entry = searcher.schema().get_field_entry(term.field()); + let field_entry = searcher.schema().get_field_entry(self.term.field()); if !field_entry.is_indexed() { - return Err(crate::TantivyError::SchemaError(format!( - "Field {:?} is not indexed", - field_entry.name() - ))); + let error_msg = format!("Field {:?} is not indexed.", field_entry.name()); + return Err(crate::TantivyError::SchemaError(error_msg)); } - let bm25_weight; - if scoring_enabled { - bm25_weight = Bm25Weight::for_terms(searcher, &[term])?; + let has_fieldnorms = searcher + .schema() + .get_field_entry(self.term.field()) + .has_fieldnorms(); + let bm25_weight = if scoring_enabled { + Bm25Weight::for_terms(searcher, &[term])? 
} else { - bm25_weight = - Bm25Weight::new(Explanation::new("".to_string(), 1.0f32), 1.0f32); - } + Bm25Weight::new(Explanation::new("".to_string(), 1.0f32), 1.0f32) + }; let index_record_option = if scoring_enabled { self.index_record_option } else { @@ -116,6 +116,7 @@ impl TermQuery { self.term.clone(), index_record_option, bm25_weight, scoring_enabled, + has_fieldnorms, )) } diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index a2ac1b4e8f..46c6517808 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -17,6 +17,7 @@ pub struct TermWeight { index_record_option: IndexRecordOption, similarity_weight: Bm25Weight, scoring_enabled: bool, + has_fieldnorms: bool, } impl Weight for TermWeight { @@ -90,12 +91,14 @@ impl TermWeight { index_record_option: IndexRecordOption, similarity_weight: Bm25Weight, scoring_enabled: bool, + has_fieldnorms: bool, ) -> TermWeight { TermWeight { term, index_record_option, similarity_weight, scoring_enabled, + has_fieldnorms, } } @@ -106,10 +109,8 @@ impl TermWeight { ) -> crate::Result { let field = self.term.field(); let inverted_index = reader.inverted_index(field)?; - let fieldnorm_reader = if self.scoring_enabled { - reader - .get_fieldnorms_reader(field) - .unwrap_or(FieldNormReader::constant(reader.max_doc(), 1)) + let fieldnorm_reader = if self.scoring_enabled && self.has_fieldnorms { + reader.get_fieldnorms_reader(field)? } else { FieldNormReader::constant(reader.max_doc(), 1) }; diff --git a/src/schema/bytes_options.rs b/src/schema/bytes_options.rs index 1c565ae6ff..a010b36954 100644 --- a/src/schema/bytes_options.rs +++ b/src/schema/bytes_options.rs @@ -1,12 +1,12 @@ use serde::{Deserialize, Serialize}; use std::ops::BitOr; -use super::flags::{FastFlag, IndexedFlag, NormedFlag, SchemaFlagList, StoredFlag}; +use super::flags::{FastFlag, IndexedFlag, SchemaFlagList, StoredFlag}; /// Define how an a bytes field should be handled by tantivy.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct BytesOptions { indexed: bool, - normed: bool, + fieldnorms: bool, fast: bool, stored: bool, } @@ -18,8 +18,8 @@ impl BytesOptions { } /// Returns true iff the value is normed. - pub fn is_normed(&self) -> bool { - self.normed + pub fn fieldnorms(&self) -> bool { + self.fieldnorms } /// Returns true iff the value is a fast field. @@ -45,8 +45,8 @@ impl BytesOptions { /// /// Setting an integer as normed will generate /// the fieldnorm data for it. - pub fn set_normed(mut self) -> BytesOptions { - self.normed = true; + pub fn set_fieldnorms(mut self) -> BytesOptions { + self.fieldnorms = true; self } @@ -75,7 +75,7 @@ impl Default for BytesOptions { fn default() -> BytesOptions { BytesOptions { indexed: false, - normed: false, + fieldnorms: false, fast: false, stored: false, } @@ -89,7 +89,7 @@ impl> BitOr for BytesOptions { let other = other.into(); BytesOptions { indexed: self.indexed | other.indexed, - normed: self.normed | other.normed, + fieldnorms: self.fieldnorms | other.fieldnorms, stored: self.stored | other.stored, fast: self.fast | other.fast, } @@ -106,7 +106,7 @@ impl From for BytesOptions { fn from(_: FastFlag) -> Self { BytesOptions { indexed: false, - normed: false, + fieldnorms: false, stored: false, fast: true, } @@ -117,7 +117,7 @@ impl From for BytesOptions { fn from(_: StoredFlag) -> Self { BytesOptions { indexed: false, - normed: false, + fieldnorms: false, stored: true, fast: false, } @@ -128,18 +128,7 @@ impl From for BytesOptions { fn from(_: IndexedFlag) -> Self { BytesOptions { indexed: true, - normed: false, - stored: false, - fast: false, - } - } -} - -impl From for BytesOptions { - fn from(_: NormedFlag) -> Self { - BytesOptions { - indexed: false, - normed: true, + fieldnorms: true, stored: false, fast: false, } @@ -159,13 +148,15 @@ where #[cfg(test)] mod tests { - use crate::schema::{BytesOptions, FAST, INDEXED, NORMED, STORED}; + use 
crate::schema::{BytesOptions, FAST, INDEXED, STORED}; #[test] fn test_bytes_option_fast_flag() { assert_eq!(BytesOptions::default().set_fast(), FAST.into()); - assert_eq!(BytesOptions::default().set_indexed(), INDEXED.into()); - assert_eq!(BytesOptions::default().set_normed(), NORMED.into()); + assert_eq!( + BytesOptions::default().set_indexed().set_fieldnorms(), + INDEXED.into() + ); assert_eq!(BytesOptions::default().set_stored(), STORED.into()); } #[test] @@ -175,11 +166,17 @@ mod tests { (FAST | STORED).into() ); assert_eq!( - BytesOptions::default().set_indexed().set_fast(), + BytesOptions::default() + .set_indexed() + .set_fieldnorms() + .set_fast(), (INDEXED | FAST).into() ); assert_eq!( - BytesOptions::default().set_stored().set_indexed(), + BytesOptions::default() + .set_stored() + .set_fieldnorms() + .set_indexed(), (STORED | INDEXED).into() ); } @@ -189,10 +186,10 @@ mod tests { assert!(!BytesOptions::default().is_stored()); assert!(!BytesOptions::default().is_fast()); assert!(!BytesOptions::default().is_indexed()); - assert!(!BytesOptions::default().is_normed()); + assert!(!BytesOptions::default().fieldnorms()); assert!(BytesOptions::default().set_stored().is_stored()); assert!(BytesOptions::default().set_fast().is_fast()); assert!(BytesOptions::default().set_indexed().is_indexed()); - assert!(BytesOptions::default().set_normed().is_normed()); + assert!(BytesOptions::default().set_fieldnorms().fieldnorms()); } } diff --git a/src/schema/field_entry.rs b/src/schema/field_entry.rs index 9a8a5f587b..55929f464c 100644 --- a/src/schema/field_entry.rs +++ b/src/schema/field_entry.rs @@ -115,8 +115,8 @@ impl FieldEntry { } /// Returns true iff the field is normed - pub fn is_normed(&self) -> bool { - self.field_type.is_normed() + pub fn has_fieldnorms(&self) -> bool { + self.field_type.has_fieldnorms() } /// Returns true iff the field is a int (signed or unsigned) fast field @@ -147,7 +147,10 @@ impl FieldEntry { #[cfg(test)] mod tests { use super::*; - use 
crate::schema::TEXT; + use crate::{ + schema::{Schema, STRING, TEXT}, + Index, + }; use serde_json; #[test] @@ -166,6 +169,7 @@ mod tests { "options": { "indexing": { "record": "position", + "fieldnorms": true, "tokenizer": "default" }, "stored": false @@ -192,6 +196,7 @@ mod tests { "options": { "indexing": { "record": "position", + "fieldnorms": true, "tokenizer": "default" }, "stored": false @@ -204,4 +209,19 @@ mod tests { _ => panic!("expected FieldType::Str"), } } + + #[test] + fn test_fieldnorms() -> crate::Result<()> { + let mut schema_builder = Schema::builder(); + let text = schema_builder.add_text_field("text", STRING); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + let mut index_writer = index.writer_for_tests()?; + index_writer.add_document(doc!(text=>"abc"))?; + index_writer.commit()?; + let searcher = index.reader()?.searcher(); + let err = searcher.segment_reader(0u32).get_fieldnorms_reader(text); + assert!(matches!(err, Err(crate::TantivyError::SchemaError(_)))); + Ok(()) + } } diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index a9b3ac4789..a4ce142ac4 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -98,18 +98,18 @@ impl FieldType { } /// returns true iff the field is normed. 
- pub fn is_normed(&self) -> bool { + pub fn has_fieldnorms(&self) -> bool { match *self { - FieldType::Str(ref text_options) => text_options.get_indexing_options().map_or_else( - || false, - |opt| opt.index_option() != IndexRecordOption::Basic, - ), + FieldType::Str(ref text_options) => text_options + .get_indexing_options() + .map(|options| options.fieldnorms()) + .unwrap_or(false), FieldType::U64(ref int_options) | FieldType::I64(ref int_options) | FieldType::F64(ref int_options) - | FieldType::Date(ref int_options) => int_options.is_normed(), + | FieldType::Date(ref int_options) => int_options.fieldnorms(), FieldType::HierarchicalFacet(_) => false, - FieldType::Bytes(ref bytes_options) => bytes_options.is_normed(), + FieldType::Bytes(ref bytes_options) => bytes_options.fieldnorms(), } } diff --git a/src/schema/flags.rs b/src/schema/flags.rs index 9e1937910f..106538aecc 100644 --- a/src/schema/flags.rs +++ b/src/schema/flags.rs @@ -20,7 +20,7 @@ pub const STORED: SchemaFlagList = SchemaFlagList { #[derive(Clone)] pub struct IndexedFlag; -/// Flag to mark the field as indexed. An indexed field is searchable. +/// Flag to mark the field as indexed. An indexed field is searchable and has a fieldnorm. /// /// The `INDEXED` flag can only be used when building `IntOptions` (`u64`, `i64` and `f64` fields) /// Of course, text fields can also be indexed... But this is expressed by using either the @@ -30,18 +30,6 @@ pub const INDEXED: SchemaFlagList = SchemaFlagList { tail: (), }; -#[derive(Clone)] -pub struct NormedFlag; -/// Flag to mark the field as indexed. -/// -/// The `INDEXED` flag can only be used when building `IntOptions` (`u64`, `i64` and `f64` fields) -/// Of course, text fields can also be indexed... But this is expressed by using either the -/// `STRING` (untokenized) or `TEXT` (tokenized with the english tokenizer) flags. 
-pub const NORMED: SchemaFlagList = SchemaFlagList { - head: NormedFlag, - tail: (), -}; - #[derive(Clone)] pub struct FastFlag; /// Flag to mark the field as a fast field (similar to Lucene's DocValues) diff --git a/src/schema/int_options.rs b/src/schema/int_options.rs index 7e8b11bc9f..8659fcdad9 100644 --- a/src/schema/int_options.rs +++ b/src/schema/int_options.rs @@ -2,8 +2,6 @@ use crate::schema::flags::{FastFlag, IndexedFlag, SchemaFlagList, StoredFlag}; use serde::{Deserialize, Serialize}; use std::ops::BitOr; -use super::flags::NormedFlag; - /// Express whether a field is single-value or multi-valued. #[derive(Clone, Copy, PartialEq, Eq, Debug, Serialize, Deserialize)] pub enum Cardinality { @@ -20,7 +18,7 @@ pub enum Cardinality { #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub struct IntOptions { indexed: bool, - normed: bool, + fieldnorms: bool, #[serde(skip_serializing_if = "Option::is_none")] fast: Option, stored: bool, @@ -37,9 +35,9 @@ impl IntOptions { self.indexed } - /// Returns true iff the value is normed. - pub fn is_normed(&self) -> bool { - self.normed + /// Returns true iff the field has fieldnorm. + pub fn fieldnorms(&self) -> bool { + self.fieldnorms } /// Returns true iff the value is a fast field. @@ -67,12 +65,12 @@ impl IntOptions { self } - /// Set the field as normed. + /// Set the field with fieldnorm. /// - /// Setting an integer as normed will generate + /// Setting an integer as fieldnorm will generate /// the fieldnorm data for it. 
- pub fn set_normed(mut self) -> IntOptions { - self.normed = true; + pub fn set_fieldnorm(mut self) -> IntOptions { + self.fieldnorms = true; self } @@ -100,7 +98,7 @@ impl Default for IntOptions { fn default() -> IntOptions { IntOptions { indexed: false, - normed: false, + fieldnorms: false, stored: false, fast: None, } @@ -117,7 +115,7 @@ impl From for IntOptions { fn from(_: FastFlag) -> Self { IntOptions { indexed: false, - normed: false, + fieldnorms: false, stored: false, fast: Some(Cardinality::SingleValue), } @@ -128,7 +126,7 @@ impl From for IntOptions { fn from(_: StoredFlag) -> Self { IntOptions { indexed: false, - normed: false, + fieldnorms: false, stored: true, fast: None, } @@ -139,18 +137,7 @@ impl From for IntOptions { fn from(_: IndexedFlag) -> Self { IntOptions { indexed: true, - normed: false, - stored: false, - fast: None, - } - } -} - -impl From for IntOptions { - fn from(_: NormedFlag) -> Self { - IntOptions { - indexed: false, - normed: true, + fieldnorms: true, stored: false, fast: None, } @@ -164,7 +151,7 @@ impl> BitOr for IntOptions { let other = other.into(); IntOptions { indexed: self.indexed | other.indexed, - normed: self.normed | other.normed, + fieldnorms: self.fieldnorms | other.fieldnorms, stored: self.stored | other.stored, fast: self.fast.or(other.fast), } diff --git a/src/schema/mod.rs b/src/schema/mod.rs index 359b1277eb..b1ea12e273 100644 --- a/src/schema/mod.rs +++ b/src/schema/mod.rs @@ -147,7 +147,7 @@ pub use self::text_options::STRING; pub use self::text_options::TEXT; pub use self::bytes_options::BytesOptions; -pub use self::flags::{FAST, INDEXED, NORMED, STORED}; +pub use self::flags::{FAST, INDEXED, STORED}; pub use self::int_options::Cardinality; pub use self::int_options::IntOptions; diff --git a/src/schema/schema.rs b/src/schema/schema.rs index 2c8174564d..d30f60c1eb 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -427,7 +427,7 @@ mod tests { .set_fast(Cardinality::SingleValue); let score_options 
= IntOptions::default() .set_indexed() - .set_normed() + .set_fieldnorm() .set_fast(Cardinality::SingleValue); schema_builder.add_text_field("title", TEXT); schema_builder.add_text_field("author", STRING); @@ -443,6 +443,7 @@ mod tests { "options": { "indexing": { "record": "position", + "fieldnorms": true, "tokenizer": "default" }, "stored": false @@ -454,6 +455,7 @@ mod tests { "options": { "indexing": { "record": "basic", + "fieldnorms": false, "tokenizer": "raw" }, "stored": false @@ -464,7 +466,7 @@ mod tests { "type": "u64", "options": { "indexed": false, - "normed": false, + "fieldnorms": false, "fast": "single", "stored": true } @@ -474,7 +476,7 @@ mod tests { "type": "i64", "options": { "indexed": false, - "normed": false, + "fieldnorms": false, "fast": "single", "stored": true } @@ -484,7 +486,7 @@ mod tests { "type": "f64", "options": { "indexed": true, - "normed": true, + "fieldnorms": true, "fast": "single", "stored": false } @@ -747,7 +749,7 @@ mod tests { let timestamp_options = IntOptions::default() .set_stored() .set_indexed() - .set_normed() + .set_fieldnorm() .set_fast(SingleValue); schema_builder.add_text_field("_id", id_options); schema_builder.add_date_field("_timestamp", timestamp_options); @@ -759,6 +761,7 @@ mod tests { "options": { "indexing": { "record": "position", + "fieldnorms": true, "tokenizer": "default" }, "stored": false @@ -769,7 +772,7 @@ mod tests { "type": "i64", "options": { "indexed": false, - "normed": false, + "fieldnorms": false, "fast": "single", "stored": true } @@ -790,6 +793,7 @@ mod tests { "options": { "indexing": { "record": "basic", + "fieldnorms": true, "tokenizer": "raw" }, "stored": true @@ -800,7 +804,7 @@ mod tests { "type": "date", "options": { "indexed": true, - "normed": true, + "fieldnorms": true, "fast": "single", "stored": true } @@ -811,6 +815,7 @@ mod tests { "options": { "indexing": { "record": "position", + "fieldnorms": true, "tokenizer": "default" }, "stored": false @@ -821,7 +826,7 @@ mod tests {
"type": "i64", "options": { "indexed": false, - "normed": false, + "fieldnorms": false, "fast": "single", "stored": true } diff --git a/src/schema/text_options.rs b/src/schema/text_options.rs index 3a8516d2ba..3ab9998e4b 100644 --- a/src/schema/text_options.rs +++ b/src/schema/text_options.rs @@ -45,6 +45,7 @@ impl TextOptions { #[derive(Clone, PartialEq, Debug, Serialize, Deserialize)] pub struct TextFieldIndexing { record: IndexRecordOption, + fieldnorms: bool, tokenizer: Cow<'static, str>, } @@ -53,6 +54,7 @@ impl Default for TextFieldIndexing { TextFieldIndexing { tokenizer: Cow::Borrowed("default"), record: IndexRecordOption::Basic, + fieldnorms: true, } } } @@ -69,6 +71,17 @@ impl TextFieldIndexing { &self.tokenizer } + /// Sets whether fieldnorms are recorded for this field. + pub fn set_fieldnorms(mut self, fieldnorms: bool) -> TextFieldIndexing { + self.fieldnorms = fieldnorms; + self + } + + /// Returns true iff fieldnorms are recorded for this field. + pub fn fieldnorms(&self) -> bool { + self.fieldnorms + } + /// Sets which information should be indexed with the tokens. /// /// See [IndexRecordOption](./enum.IndexRecordOption.html) for more detail.
@@ -89,6 +102,7 @@ impl TextFieldIndexing { pub const STRING: TextOptions = TextOptions { indexing: Some(TextFieldIndexing { tokenizer: Cow::Borrowed("raw"), + fieldnorms: false, record: IndexRecordOption::Basic, }), stored: false, @@ -98,6 +112,7 @@ pub const STRING: TextOptions = TextOptions { pub const TEXT: TextOptions = TextOptions { indexing: Some(TextFieldIndexing { tokenizer: Cow::Borrowed("default"), + fieldnorms: true, record: IndexRecordOption::WithFreqsAndPositions, }), stored: false, diff --git a/src/space_usage/mod.rs b/src/space_usage/mod.rs index 5235949dc1..7bf2fc84dc 100644 --- a/src/space_usage/mod.rs +++ b/src/space_usage/mod.rs @@ -288,7 +288,7 @@ mod test { use crate::core::Index; use crate::schema::Field; use crate::schema::Schema; - use crate::schema::{FAST, INDEXED, NORMED, STORED, TEXT}; + use crate::schema::{FAST, INDEXED, STORED, TEXT}; use crate::space_usage::ByteCount; use crate::space_usage::PerFieldSpaceUsage; use crate::Term; @@ -323,7 +323,7 @@ mod test { #[test] fn test_fast_indexed() -> crate::Result<()> { let mut schema_builder = Schema::builder(); - let name = schema_builder.add_u64_field("name", FAST | INDEXED | NORMED); + let name = schema_builder.add_u64_field("name", FAST | INDEXED); let schema = schema_builder.build(); let index = Index::create_in_ram(schema); @@ -440,7 +440,7 @@ mod test { #[test] fn test_deletes() -> crate::Result<()> { let mut schema_builder = Schema::builder(); - let name = schema_builder.add_u64_field("name", INDEXED | NORMED); + let name = schema_builder.add_u64_field("name", INDEXED); let schema = schema_builder.build(); let index = Index::create_in_ram(schema);