Finalize making fieldnorm optional for all field types.
fmassot committed Nov 30, 2021
1 parent bde91de commit de3efe1
Showing 20 changed files with 166 additions and 179 deletions.
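For readers skimming the diff: after this change, recording fieldnorms is an explicit schema option rather than a side effect of indexing. A minimal sketch of the builder surface, using only the `IntOptions::set_fieldnorm` call that appears in the test diff below (any other names here are illustrative):

    use tantivy::schema::{Cardinality, IntOptions, Schema};

    let mut schema_builder = Schema::builder();
    // An indexed u64 field that also records fieldnorms (now opt-in).
    schema_builder.add_u64_field(
        "rating",
        IntOptions::default()
            .set_fast(Cardinality::MultiValues)
            .set_indexed()
            .set_fieldnorm(),
    );
    // An indexed field without fieldnorms simply omits `.set_fieldnorm()`.
    schema_builder.add_u64_field("count", IntOptions::default().set_indexed());
    let _schema = schema_builder.build();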
2 changes: 1 addition & 1 deletion src/core/index_meta.rs
@@ -394,7 +394,7 @@ mod tests {
         let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
         assert_eq!(
             json,
-            r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","tokenizer":"default"},"stored":false}}],"opstamp":0}"#
+            r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false}}],"opstamp":0}"#
         );
     }
 }
2 changes: 1 addition & 1 deletion src/core/segment_reader.rs
@@ -127,7 +127,7 @@ impl SegmentReader {
         self.fieldnorm_readers.get_field(field)?.ok_or_else(|| {
             let field_name = self.schema.get_field_name(field);
             let err_msg = format!(
-                "Field norm not found for field {:?}. Was it marked as indexed during indexing?",
+                "Field norm not found for field {:?}. Was it marked as normed during indexing?",
                 field_name
             );
             crate::TantivyError::SchemaError(err_msg)
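Callers that ask for norms on a field that never recorded them now hit this `SchemaError`. A hedged sketch of the failure mode from the caller's side (`segment_reader` and `field` are assumed to come from an open index):

    match segment_reader.get_fieldnorms_reader(field) {
        Ok(norms) => println!("fieldnorm of doc 0: {}", norms.fieldnorm(0)),
        Err(tantivy::TantivyError::SchemaError(msg)) => {
            // Reached when the field was indexed without fieldnorms.
            eprintln!("no fieldnorms recorded: {}", msg);
        }
        Err(other) => eprintln!("other error: {}", other),
    }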
4 changes: 2 additions & 2 deletions src/fastfield/bytes/mod.rs
@@ -7,7 +7,7 @@ pub use self::writer::BytesFastFieldWriter;
 #[cfg(test)]
 mod tests {
     use crate::schema::{BytesOptions, IndexRecordOption, Schema, Value};
-    use crate::{query::TermQuery, schema::FAST, schema::INDEXED, schema::NORMED, schema::STORED};
+    use crate::{query::TermQuery, schema::FAST, schema::INDEXED, schema::STORED};
     use crate::{DocAddress, DocSet, Index, Searcher, Term};
     use std::ops::Deref;

@@ -80,7 +80,7 @@ mod tests {

     #[test]
     fn test_index_bytes() -> crate::Result<()> {
-        let searcher = create_index_for_test(INDEXED | NORMED)?;
+        let searcher = create_index_for_test(INDEXED)?;
         assert_eq!(searcher.num_docs(), 1);
         let field = searcher.schema().get_field("string_bytes").unwrap();
         let term = Term::from_field_bytes(field, b"lucene".as_ref());
2 changes: 1 addition & 1 deletion src/fastfield/multivalued/mod.rs
@@ -68,7 +68,7 @@ mod tests {
             IntOptions::default()
                 .set_fast(Cardinality::MultiValues)
                 .set_indexed()
-                .set_normed()
+                .set_fieldnorm()
                 .set_stored(),
         );
         let time_i =
58 changes: 17 additions & 41 deletions src/fieldnorm/mod.rs
@@ -29,17 +29,16 @@ use self::code::{fieldnorm_to_id, id_to_fieldnorm};

 #[cfg(test)]
 mod tests {
-    use crate::common::CompositeFile;
+    use crate::directory::CompositeFile;
     use crate::fieldnorm::FieldNormReader;
     use crate::fieldnorm::FieldNormsSerializer;
     use crate::fieldnorm::FieldNormsWriter;
     use crate::{
-        directory::{Directory, RAMDirectory, WritePtr},
+        directory::{RamDirectory, Directory, WritePtr},
         schema::{STRING, TEXT},
     };
     use once_cell::sync::Lazy;
     use std::{
-        panic::{catch_unwind, AssertUnwindSafe},
         path::Path,
     };

@@ -57,45 +56,25 @@ mod tests {
     pub static TXT_FIELD: Lazy<Field> = Lazy::new(|| SCHEMA.get_field("txt_field").unwrap());
     pub static STR_FIELD: Lazy<Field> = Lazy::new(|| SCHEMA.get_field("str_field").unwrap());

-    #[ignore]
     #[test]
-    pub fn test_fieldnorm_bug() -> crate::Result<()> {
-        let path = Path::new("test");
-        let directory: RAMDirectory = RAMDirectory::create();
-        {
-            let write: WritePtr = directory.open_write(Path::new("test"))?;
-            let serializer = FieldNormsSerializer::from_write(write)?;
-            let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA);
-            fieldnorm_writers.fill_up_to_max_doc(1u32);
-            fieldnorm_writers.record(0u32, *TXT_FIELD, 5);
-            fieldnorm_writers.record(1u32, *TXT_FIELD, 3);
-            fieldnorm_writers.serialize(serializer)?;
-        }
-        let file = directory.open_read(&path)?;
-        {
-            let fields_composite = CompositeFile::open(&file)?;
-            assert!(fields_composite.open_read(*FIELD).is_none());
-            assert!(fields_composite.open_read(*TXT_FIELD).is_none());
-            assert!(fields_composite.open_read(*STR_FIELD).is_none());
-            let data = fields_composite.open_read(*TXT_FIELD).unwrap();
-            let fieldnorm_reader = FieldNormReader::open(data)?;
-            assert_eq!(fieldnorm_reader.fieldnorm(0u32), 5u32);
-            assert_eq!(fieldnorm_reader.fieldnorm(1u32), 3u32);
-        }
-        Ok(())
+    #[should_panic(expected = "Cannot register a given fieldnorm twice")]
+    pub fn test_should_panic_when_recording_fieldnorm_twice_for_same_doc() {
+        let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA);
+        fieldnorm_writers.record(0u32, *TXT_FIELD, 5);
+        fieldnorm_writers.record(0u32, *TXT_FIELD, 3);
     }

     #[test]
     pub fn test_fieldnorm() -> crate::Result<()> {
         let path = Path::new("test");
-        let directory: RAMDirectory = RAMDirectory::create();
+        let directory: RamDirectory = RamDirectory::create();
         {
             let write: WritePtr = directory.open_write(Path::new("test"))?;
             let serializer = FieldNormsSerializer::from_write(write)?;
             let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA);
             fieldnorm_writers.fill_up_to_max_doc(1u32);
-            fieldnorm_writers.record(1u32, *TXT_FIELD, 3);
-            fieldnorm_writers.serialize(serializer)?;
+            fieldnorm_writers.record(2u32, *TXT_FIELD, 5);
+            fieldnorm_writers.record(3u32, *TXT_FIELD, 3);
+            fieldnorm_writers.serialize(serializer, None)?;
         }
         let file = directory.open_read(&path)?;
         {
@@ -104,19 +83,16 @@ mod tests {
             assert!(fields_composite.open_read(*STR_FIELD).is_none());
             let data = fields_composite.open_read(*TXT_FIELD).unwrap();
             let fieldnorm_reader = FieldNormReader::open(data)?;
-            assert_eq!(fieldnorm_reader.fieldnorm(1u32), 3u32);
+            assert_eq!(fieldnorm_reader.fieldnorm(0u32), 0u32);
+            assert_eq!(fieldnorm_reader.fieldnorm(1u32), 0u32);
+            assert_eq!(fieldnorm_reader.fieldnorm(2u32), 5u32);
+            assert_eq!(fieldnorm_reader.fieldnorm(3u32), 3u32);
         }
         Ok(())
     }

     #[test]
-    pub fn test_fail_fieldnorm_cannot_registered_twice() {
-        let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA);
-        fieldnorm_writers.fill_up_to_max_doc(1u32);
-        fieldnorm_writers.record(1u32, *TXT_FIELD, 5);
-        let result = catch_unwind(AssertUnwindSafe(|| {
-            fieldnorm_writers.record(1u32, *TXT_FIELD, 3)
-        }));
-        assert!(result.is_err());
+    pub fn test_retrieve_fields_with_norms() {
+        //TODO
     }
 }
2 changes: 1 addition & 1 deletion src/fieldnorm/writer.rs
@@ -23,7 +23,7 @@ impl FieldNormsWriter {
         schema
             .fields()
             .filter_map(|(field, field_entry)| {
-                if field_entry.is_indexed() && field_entry.is_normed() {
+                if field_entry.is_indexed() && field_entry.has_fieldnorms() {
                     Some(field)
                 } else {
                     None
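So after this commit the norms writer only allocates space for fields with both properties. A condensed restatement of the predicate as a sketch (variable names are illustrative):

    // Fields must be indexed *and* opted into fieldnorms to get a writer slot.
    let fields_with_norms: Vec<Field> = schema
        .fields()
        .filter(|(_, entry)| entry.is_indexed() && entry.has_fieldnorms())
        .map(|(field, _)| field)
        .collect();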
29 changes: 19 additions & 10 deletions src/indexer/merger.rs
@@ -41,18 +41,26 @@ use tantivy_bitpacker::minmax;
 /// We do not allow segments with more than
 pub const MAX_DOC_LIMIT: u32 = 1 << 31;

-fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::Result<u64> {
+fn estimate_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::Result<u64> {
     let mut total_tokens = 0u64;
     let mut count: [usize; 256] = [0; 256];
     for reader in readers {
+        // When there are deletes, we use an approximation either
+        // - by using the fieldnorm
+        // - or by multiplying the total number of tokens by a ratio (1 - deleted docs / num docs).
         if reader.has_deletes() {
-            // if there are deletes, then we use an approximation
-            // using the fieldnorm
-            let fieldnorms_reader = reader.get_fieldnorms_reader(field)?;
-            for doc in reader.doc_ids_alive() {
-                let fieldnorm_id = fieldnorms_reader.fieldnorm_id(doc);
-                count[fieldnorm_id as usize] += 1;
-            }
+            if reader.schema().get_field_entry(field).field_type().has_fieldnorms() {
+                let fieldnorms_reader = reader.get_fieldnorms_reader(field)?;
+                for doc in reader.doc_ids_alive() {
+                    let fieldnorm_id = fieldnorms_reader.fieldnorm_id(doc);
+                    count[fieldnorm_id as usize] += 1;
+                }
+            } else {
+                let segment_num_tokens = reader.inverted_index(field)?.total_num_tokens();
+                let ratio = 1f64 - reader.num_deleted_docs() as f64 / reader.num_docs() as f64;
+                total_tokens += (segment_num_tokens as f64 * ratio) as u64;
+            }
         } else {
             total_tokens += reader.inverted_index(field)?.total_num_tokens();
         }
@@ -851,10 +859,11 @@ impl IndexMerger {
             segment_map[*old_doc_id as usize] = Some(new_doc_id as DocId);
         }

-        // The total number of tokens will only be exact when there have been no deletes.
+        // The total number of tokens will only be exact when there have been no deletes
+        // and if the field has a norm.
         //
         // Otherwise, we approximate by removing deleted documents proportionally.
-        let total_num_tokens: u64 = compute_total_num_tokens(&self.readers, indexed_field)?;
+        let total_num_tokens: u64 = estimate_total_num_tokens(&self.readers, indexed_field)?;

         // Create the total list of doc ids
         // by stacking the doc ids from the different segment.
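The `else` branch's ratio estimate is plain arithmetic; a worked sketch under assumed numbers, mirroring the formula in the hunk above (not tantivy API):

    // Assumed: a segment with 10_000 tokens, 5_000 live docs, 1_000 deletions.
    let segment_num_tokens = 10_000u64;
    let num_deleted_docs = 1_000u32;
    let num_docs = 5_000u32;
    let ratio = 1f64 - num_deleted_docs as f64 / num_docs as f64; // 0.8
    let estimate = (segment_num_tokens as f64 * ratio) as u64;
    assert_eq!(estimate, 8_000);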
2 changes: 1 addition & 1 deletion src/indexer/segment_writer.rs
@@ -242,7 +242,7 @@ impl SegmentWriter {
             )
         };

-        if field_entry.is_normed() {
+        if field_entry.has_fieldnorms() {
             self.fieldnorms_writer.record(doc_id, field, num_tokens);
         }
     }
30 changes: 14 additions & 16 deletions src/postings/serializer.rs
@@ -308,10 +308,8 @@ pub struct PostingsSerializer<W: Write> {
     fieldnorm_reader: Option<FieldNormReader>,

     bm25_weight: Option<Bm25Weight>,
-
-    num_docs: u32, // Number of docs in the segment
     avg_fieldnorm: Score, // Average number of terms in the field for that segment.
-                          // this value is used to compute the block wand information.
+    // this value is used to compute the block wand information.
 }

 impl<W: Write> PostingsSerializer<W> {
@@ -321,10 +319,6 @@ impl<W: Write> PostingsSerializer<W> {
         mode: IndexRecordOption,
         fieldnorm_reader: Option<FieldNormReader>,
     ) -> PostingsSerializer<W> {
-        let num_docs = fieldnorm_reader
-            .as_ref()
-            .map(|fieldnorm_reader| fieldnorm_reader.num_docs())
-            .unwrap_or(0u32);
         PostingsSerializer {
             output_write: CountingWriter::wrap(write),

@@ -339,21 +333,25 @@ impl<W: Write> PostingsSerializer<W> {

             fieldnorm_reader,
             bm25_weight: None,
-
-            num_docs,
             avg_fieldnorm,
         }
     }

+    /// Returns the number of documents in the segment currently being serialized.
+    ///
+    /// This function may return `None` if there are no fieldnorms for that field.
+    fn num_docs_in_segment(&self) -> Option<u32> {
+        self.fieldnorm_reader
+            .as_ref()
+            .map(|reader| reader.num_docs())
+    }
+
     pub fn new_term(&mut self, term_doc_freq: u32) {
-        if self.mode.has_freq() && self.num_docs > 0 {
-            let bm25_weight = Bm25Weight::for_one_term(
-                term_doc_freq as u64,
-                self.num_docs as u64,
-                self.avg_fieldnorm,
-            );
-            self.bm25_weight = Some(bm25_weight);
+        if !self.mode.has_freq() {
+            return;
         }
+        self.bm25_weight = self.num_docs_in_segment().map(|num_docs| {
+            Bm25Weight::for_one_term(term_doc_freq as u64, num_docs as u64, self.avg_fieldnorm)
+        });
     }

     fn write_block(&mut self) {
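The net effect: when a field has no fieldnorm reader, `num_docs_in_segment()` is `None` and no BM25 weight is built for the term. A condensed sketch of that gating, using the types visible in this diff (the standalone function is illustrative):

    fn bm25_for_term(
        fieldnorm_reader: Option<&FieldNormReader>,
        term_doc_freq: u32,
        avg_fieldnorm: f32,
    ) -> Option<Bm25Weight> {
        // No norms recorded => no document count => no BM25 weight.
        fieldnorm_reader.map(|reader| {
            Bm25Weight::for_one_term(term_doc_freq as u64, reader.num_docs() as u64, avg_fieldnorm)
        })
    }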
25 changes: 14 additions & 11 deletions src/query/term_query/term_query.rs
@@ -93,20 +93,22 @@ impl TermQuery {
         scoring_enabled: bool,
     ) -> crate::Result<TermWeight> {
         let term = self.term.clone();
-        let field_entry = searcher.schema().get_field_entry(term.field());
+        let field_entry = searcher
+            .schema()
+            .get_field_entry(self.term.field());
         if !field_entry.is_indexed() {
-            return Err(crate::TantivyError::SchemaError(format!(
-                "Field {:?} is not indexed",
-                field_entry.name()
-            )));
+            let error_msg = format!("Field {:?} is not indexed.", field_entry.name());
+            return Err(crate::TantivyError::SchemaError(error_msg));
         }
-        let bm25_weight;
-        if scoring_enabled {
-            bm25_weight = Bm25Weight::for_terms(searcher, &[term])?;
+        let has_fieldnorms = searcher
+            .schema()
+            .get_field_entry(self.term.field())
+            .has_fieldnorms();
+        let bm25_weight = if scoring_enabled {
+            Bm25Weight::for_terms(searcher, &[term])?
         } else {
-            bm25_weight =
-                Bm25Weight::new(Explanation::new("<no score>".to_string(), 1.0f32), 1.0f32);
-        }
+            Bm25Weight::new(Explanation::new("<no score>".to_string(), 1.0f32), 1.0f32)
+        };
         let index_record_option = if scoring_enabled {
             self.index_record_option
         } else {
@@ -116,6 +118,7 @@ impl TermQuery {
             self.term.clone(),
             index_record_option,
             bm25_weight,
+            has_fieldnorms,
             scoring_enabled,
         ))
     }
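A hedged usage sketch of the two paths this method now prepares, assuming the method in the hunk above is `TermQuery::specialized_weight` (searcher construction elided):

    use tantivy::query::TermQuery;
    use tantivy::schema::IndexRecordOption;
    use tantivy::{Searcher, Term};

    fn term_weights(term: Term, searcher: &Searcher) -> tantivy::Result<()> {
        let query = TermQuery::new(term, IndexRecordOption::WithFreqs);
        // BM25 branch: exact scores require the field to carry fieldnorms.
        let _scored = query.specialized_weight(searcher, true)?;
        // Constant "<no score>" branch: scoring disabled.
        let _unscored = query.specialized_weight(searcher, false)?;
        Ok(())
    }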
9 changes: 5 additions & 4 deletions src/query/term_query/term_weight.rs
@@ -17,6 +17,7 @@ pub struct TermWeight {
     index_record_option: IndexRecordOption,
     similarity_weight: Bm25Weight,
     scoring_enabled: bool,
+    has_fieldnorms: bool,
 }

 impl Weight for TermWeight {
@@ -90,12 +91,14 @@ impl TermWeight {
         index_record_option: IndexRecordOption,
         similarity_weight: Bm25Weight,
         scoring_enabled: bool,
+        has_fieldnorms: bool,
     ) -> TermWeight {
         TermWeight {
             term,
             index_record_option,
             similarity_weight,
             scoring_enabled,
+            has_fieldnorms,
         }
     }

@@ -106,10 +109,8 @@ impl TermWeight {
     ) -> crate::Result<TermScorer> {
         let field = self.term.field();
         let inverted_index = reader.inverted_index(field)?;
-        let fieldnorm_reader = if self.scoring_enabled {
-            reader
-                .get_fieldnorms_reader(field)
-                .unwrap_or(FieldNormReader::constant(reader.max_doc(), 1))
+        let fieldnorm_reader = if self.scoring_enabled && self.has_fieldnorms {
+            reader.get_fieldnorms_reader(field)?
         } else {
             FieldNormReader::constant(reader.max_doc(), 1)
         };
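With scoring off or norms absent, every document is treated as having the same length. A minimal sketch of what the constant fallback yields, assuming `FieldNormReader::constant` behaves as the hunk suggests:

    // All docs report fieldnorm 1, so BM25 length normalization becomes a no-op.
    let norms = FieldNormReader::constant(reader.max_doc(), 1);
    assert_eq!(norms.fieldnorm(0), 1);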