Add a normed option on fields #1001

Closed
2 changes: 1 addition & 1 deletion src/core/index_meta.rs
@@ -394,7 +394,7 @@ mod tests {
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
assert_eq!(
json,
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","tokenizer":"default"},"stored":false}}],"opstamp":0}"#
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false}}],"opstamp":0}"#
);
}
}
2 changes: 1 addition & 1 deletion src/core/segment_reader.rs
@@ -127,7 +127,7 @@ impl SegmentReader {
self.fieldnorm_readers.get_field(field)?.ok_or_else(|| {
let field_name = self.schema.get_field_name(field);
let err_msg = format!(
"Field norm not found for field {:?}. Was it marked as indexed during indexing?",
"Field norm not found for field {:?}. Was it marked as normed during indexing?",
field_name
);
crate::TantivyError::SchemaError(err_msg)
1 change: 1 addition & 0 deletions src/fastfield/multivalued/mod.rs
@@ -68,6 +68,7 @@ mod tests {
IntOptions::default()
.set_fast(Cardinality::MultiValues)
.set_indexed()
.set_fieldnorm()
.set_stored(),
);
let time_i =
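For orientation, here is a minimal usage sketch (not part of the diff) showing how the new option composes with the other `IntOptions` builders, assuming `set_fieldnorm()` chains exactly as it does in the test above; the field name is hypothetical:

use tantivy::schema::{IntOptions, Schema};

fn main() {
    let mut schema_builder = Schema::builder();
    // Hypothetical u64 field that opts into fieldnorms via the new builder.
    let options = IntOptions::default()
        .set_indexed()
        .set_fieldnorm()
        .set_stored();
    schema_builder.add_u64_field("token_count", options);
    let schema = schema_builder.build();
    assert!(schema.get_field("token_count").is_some());
}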
68 changes: 68 additions & 0 deletions src/fieldnorm/mod.rs
@@ -26,3 +26,71 @@ pub use self::serializer::FieldNormsSerializer;
pub use self::writer::FieldNormsWriter;

use self::code::{fieldnorm_to_id, id_to_fieldnorm};

#[cfg(test)]
mod tests {
use crate::directory::CompositeFile;
use crate::fieldnorm::FieldNormReader;
use crate::fieldnorm::FieldNormsSerializer;
use crate::fieldnorm::FieldNormsWriter;
use crate::{
directory::{Directory, RamDirectory, WritePtr},
schema::{STRING, TEXT},
};
use once_cell::sync::Lazy;
use std::path::Path;

use crate::schema::{Field, Schema, STORED};

pub static SCHEMA: Lazy<Schema> = Lazy::new(|| {
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("field", STORED);
schema_builder.add_text_field("txt_field", TEXT);
schema_builder.add_text_field("str_field", STRING);
schema_builder.build()
});

pub static FIELD: Lazy<Field> = Lazy::new(|| SCHEMA.get_field("field").unwrap());
pub static TXT_FIELD: Lazy<Field> = Lazy::new(|| SCHEMA.get_field("txt_field").unwrap());
pub static STR_FIELD: Lazy<Field> = Lazy::new(|| SCHEMA.get_field("str_field").unwrap());

#[test]
#[should_panic(expected = "Cannot register a given fieldnorm twice")]
pub fn test_should_panic_when_recording_fieldnorm_twice_for_same_doc() {
let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA);
fieldnorm_writers.record(0u32, *TXT_FIELD, 5);
fieldnorm_writers.record(0u32, *TXT_FIELD, 3);
}

#[test]
pub fn test_fieldnorm() -> crate::Result<()> {
let path = Path::new("test");
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test"))?;
let serializer = FieldNormsSerializer::from_write(write)?;
let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA);
fieldnorm_writers.record(2u32, *TXT_FIELD, 5);
fieldnorm_writers.record(3u32, *TXT_FIELD, 3);
fieldnorm_writers.serialize(serializer, None)?;
}
let file = directory.open_read(&path)?;
{
let fields_composite = CompositeFile::open(&file)?;
assert!(fields_composite.open_read(*FIELD).is_none());
assert!(fields_composite.open_read(*STR_FIELD).is_none());
let data = fields_composite.open_read(*TXT_FIELD).unwrap();
let fieldnorm_reader = FieldNormReader::open(data)?;
assert_eq!(fieldnorm_reader.fieldnorm(0u32), 0u32);
assert_eq!(fieldnorm_reader.fieldnorm(1u32), 0u32);
assert_eq!(fieldnorm_reader.fieldnorm(2u32), 5u32);
assert_eq!(fieldnorm_reader.fieldnorm(3u32), 3u32);
}
Ok(())
}

#[test]
pub fn test_retrieve_fields_with_norms() {
//TODO
}
}
2 changes: 1 addition & 1 deletion src/fieldnorm/writer.rs
@@ -23,7 +23,7 @@ impl FieldNormsWriter {
schema
.fields()
.filter_map(|(field, field_entry)| {
if field_entry.is_indexed() {
if field_entry.is_indexed() && field_entry.has_fieldnorms() {
Some(field)
} else {
None
31 changes: 22 additions & 9 deletions src/indexer/merger.rs
@@ -41,17 +41,29 @@ use tantivy_bitpacker::minmax;
/// We do not allow segments with more than 2^31 documents (`MAX_DOC_LIMIT`).
pub const MAX_DOC_LIMIT: u32 = 1 << 31;

fn compute_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::Result<u64> {
fn estimate_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::Result<u64> {
let mut total_tokens = 0u64;
let mut count: [usize; 256] = [0; 256];
for reader in readers {
// When there are deletes, we use an approximation either
// - by using the fieldnorm
// - or by multiplying the total number of tokens by a ratio (1 - deleted docs / num docs).
if reader.has_deletes() {
// if there are deletes, then we use an approximation
// using the fieldnorm
let fieldnorms_reader = reader.get_fieldnorms_reader(field)?;
for doc in reader.doc_ids_alive() {
let fieldnorm_id = fieldnorms_reader.fieldnorm_id(doc);
count[fieldnorm_id as usize] += 1;
if reader
.schema()
.get_field_entry(field)
.field_type()
.has_fieldnorms()
{
let fieldnorms_reader = reader.get_fieldnorms_reader(field)?;
for doc in reader.doc_ids_alive() {
let fieldnorm_id = fieldnorms_reader.fieldnorm_id(doc);
count[fieldnorm_id as usize] += 1;
}
} else {
let segment_num_tokens = reader.inverted_index(field)?.total_num_tokens();
let ratio = 1f64 - reader.num_deleted_docs() as f64 / reader.num_docs() as f64;
total_tokens += (segment_num_tokens as f64 * ratio) as u64;
}
} else {
total_tokens += reader.inverted_index(field)?.total_num_tokens();
@@ -851,10 +863,11 @@ impl IndexMerger {
segment_map[*old_doc_id as usize] = Some(new_doc_id as DocId);
}

// The total number of tokens will only be exact when there have been no deletes.
// The total number of tokens will only be exact when there have been no deletes
// and if the field has a norm.
//
// Otherwise, we approximate by removing deleted documents proportionally.
let total_num_tokens: u64 = compute_total_num_tokens(&self.readers, indexed_field)?;
let total_num_tokens: u64 = estimate_total_num_tokens(&self.readers, indexed_field)?;

// Create the total list of doc ids
// by stacking the doc ids from the different segment.
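The `else` branch above estimates a merged segment's token count, when the field has no fieldnorms, by scaling each segment's stored total by its live-document ratio. A self-contained sketch of that arithmetic with hypothetical numbers:

fn main() {
    // Hypothetical segment: 10_000 tokens across 100 docs, 25 of them deleted.
    let segment_num_tokens = 10_000u64;
    let num_docs = 100u32;
    let num_deleted_docs = 25u32;
    // Same formula as the fallback path in estimate_total_num_tokens.
    let ratio = 1f64 - num_deleted_docs as f64 / num_docs as f64;
    let estimated = (segment_num_tokens as f64 * ratio) as u64;
    assert_eq!(estimated, 7_500); // 75% of the docs are alive, so keep 75% of the tokens
}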
4 changes: 3 additions & 1 deletion src/indexer/segment_writer.rs
@@ -242,7 +242,9 @@ impl SegmentWriter {
)
};

self.fieldnorms_writer.record(doc_id, field, num_tokens);
if field_entry.has_fieldnorms() {
self.fieldnorms_writer.record(doc_id, field, num_tokens);
}
}
FieldType::U64(_) => {
for field_value in field_values {
30 changes: 14 additions & 16 deletions src/postings/serializer.rs
@@ -308,10 +308,8 @@ pub struct PostingsSerializer<W: Write> {
fieldnorm_reader: Option<FieldNormReader>,

bm25_weight: Option<Bm25Weight>,

num_docs: u32, // Number of docs in the segment
avg_fieldnorm: Score, // Average number of terms in the field for that segment.
// this value is used to compute the block wand information.
}

impl<W: Write> PostingsSerializer<W> {
@@ -321,10 +319,6 @@ impl<W: Write> PostingsSerializer<W> {
mode: IndexRecordOption,
fieldnorm_reader: Option<FieldNormReader>,
) -> PostingsSerializer<W> {
let num_docs = fieldnorm_reader
.as_ref()
.map(|fieldnorm_reader| fieldnorm_reader.num_docs())
.unwrap_or(0u32);
PostingsSerializer {
output_write: CountingWriter::wrap(write),

@@ -339,21 +333,25 @@ impl<W: Write> PostingsSerializer<W> {

fieldnorm_reader,
bm25_weight: None,

num_docs,
avg_fieldnorm,
}
}

/// Returns the number of documents in the segment currently being serialized.
/// Returns `None` if there are no fieldnorms for that field.
fn num_docs_in_segment(&self) -> Option<u32> {
self.fieldnorm_reader
.as_ref()
.map(|reader| reader.num_docs())
}

pub fn new_term(&mut self, term_doc_freq: u32) {
if self.mode.has_freq() && self.num_docs > 0 {
let bm25_weight = Bm25Weight::for_one_term(
term_doc_freq as u64,
self.num_docs as u64,
self.avg_fieldnorm,
);
self.bm25_weight = Some(bm25_weight);
if !self.mode.has_freq() {
return;
}
self.bm25_weight = self.num_docs_in_segment().map(|num_docs| {
Bm25Weight::for_one_term(term_doc_freq as u64, num_docs as u64, self.avg_fieldnorm)
});
}

fn write_block(&mut self) {
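For context on why `new_term` needs a document count and an average fieldnorm at all: both feed the `Bm25Weight` used for scoring and block-WAND. A reference-only sketch of the standard (Lucene-style) BM25 computation — stated from general knowledge with the usual k1/b defaults, not copied from this codebase:

fn bm25(tf: f32, doc_freq: f32, num_docs: f32, fieldnorm: f32, avg_fieldnorm: f32) -> f32 {
    const K1: f32 = 1.2;
    const B: f32 = 0.75;
    // Inverse document frequency: rare terms score higher.
    let idf = (1f32 + (num_docs - doc_freq + 0.5) / (doc_freq + 0.5)).ln();
    // Length normalization: longer fields (larger fieldnorm) are penalized.
    let norm = K1 * (1f32 - B + B * fieldnorm / avg_fieldnorm);
    idf * tf * (K1 + 1f32) / (tf + norm)
}

fn main() {
    // Hypothetical segment: 100 docs, term in 10 of them, doc of average length.
    println!("{}", bm25(3.0, 10.0, 100.0, 12.0, 12.0));
}

With fieldnorms disabled, `num_docs_in_segment()` returns `None` and the serializer skips building the weight, matching the old `num_docs > 0` guard.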
23 changes: 12 additions & 11 deletions src/query/term_query/term_query.rs
@@ -93,20 +93,20 @@ impl TermQuery {
scoring_enabled: bool,
) -> crate::Result<TermWeight> {
let term = self.term.clone();
let field_entry = searcher.schema().get_field_entry(term.field());
let field_entry = searcher.schema().get_field_entry(self.term.field());
if !field_entry.is_indexed() {
return Err(crate::TantivyError::SchemaError(format!(
"Field {:?} is not indexed",
field_entry.name()
)));
let error_msg = format!("Field {:?} is not indexed.", field_entry.name());
return Err(crate::TantivyError::SchemaError(error_msg));
}
let bm25_weight;
if scoring_enabled {
bm25_weight = Bm25Weight::for_terms(searcher, &[term])?;
let has_fieldnorms = searcher
.schema()
.get_field_entry(self.term.field())
.has_fieldnorms();
let bm25_weight = if scoring_enabled {
Bm25Weight::for_terms(searcher, &[term])?
} else {
bm25_weight =
Bm25Weight::new(Explanation::new("<no score>".to_string(), 1.0f32), 1.0f32);
}
Bm25Weight::new(Explanation::new("<no score>".to_string(), 1.0f32), 1.0f32)
};
let index_record_option = if scoring_enabled {
self.index_record_option
} else {
@@ -116,6 +116,7 @@ impl TermQuery {
self.term.clone(),
index_record_option,
bm25_weight,
has_fieldnorms,
scoring_enabled,
))
}
5 changes: 4 additions & 1 deletion src/query/term_query/term_weight.rs
@@ -17,6 +17,7 @@ pub struct TermWeight {
index_record_option: IndexRecordOption,
similarity_weight: Bm25Weight,
scoring_enabled: bool,
has_fieldnorms: bool,
}

impl Weight for TermWeight {
@@ -90,12 +91,14 @@ impl TermWeight {
index_record_option: IndexRecordOption,
similarity_weight: Bm25Weight,
scoring_enabled: bool,
has_fieldnorms: bool,
) -> TermWeight {
TermWeight {
term,
index_record_option,
similarity_weight,
scoring_enabled,
has_fieldnorms,
}
}

@@ -106,7 +109,7 @@
) -> crate::Result<TermScorer> {
let field = self.term.field();
let inverted_index = reader.inverted_index(field)?;
let fieldnorm_reader = if self.scoring_enabled {
let fieldnorm_reader = if self.scoring_enabled && self.has_fieldnorms {
reader.get_fieldnorms_reader(field)?
} else {
FieldNormReader::constant(reader.max_doc(), 1)
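One consequence of the `has_fieldnorms` guard above: when norms are disabled, the scorer sees a constant fieldnorm of 1 for every document, so BM25's length normalization degenerates to a constant factor. A minimal sketch, assuming `FieldNormReader` and its `constant` constructor are reachable as in this diff:

use tantivy::fieldnorm::FieldNormReader;

fn main() {
    let max_doc = 8u32; // hypothetical number of docs in the segment
    let reader = FieldNormReader::constant(max_doc, 1);
    // Every document reports the same fieldnorm, regardless of its length.
    assert_eq!(reader.fieldnorm(0u32), 1u32);
    assert_eq!(reader.fieldnorm(7u32), 1u32);
}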