Skip to content

Commit

Permalink
Add a NORMED option on fields
Browse files Browse the repository at this point in the history
Make fieldnorm indexation optional:

* for all types except text => added a NORMED option
* for text fields
** if STRING, the field has no fieldnorm retained
** if TEXT, the field has its fieldnorm computed
  • Loading branch information
Laurent Pouget committed Mar 29, 2021
1 parent 114fbe2 commit 784c9a4
Show file tree
Hide file tree
Showing 15 changed files with 220 additions and 13 deletions.
4 changes: 2 additions & 2 deletions src/fastfield/bytes/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ pub use self::writer::BytesFastFieldWriter;
#[cfg(test)]
mod tests {
use crate::schema::{BytesOptions, IndexRecordOption, Schema, Value};
use crate::{query::TermQuery, schema::FAST, schema::INDEXED, schema::STORED};
use crate::{query::TermQuery, schema::FAST, schema::INDEXED, schema::NORMED, schema::STORED};
use crate::{DocAddress, DocSet, Index, Searcher, Term};
use std::ops::Deref;

Expand Down Expand Up @@ -80,7 +80,7 @@ mod tests {

#[test]
fn test_index_bytes() -> crate::Result<()> {
let searcher = create_index_for_test(INDEXED)?;
let searcher = create_index_for_test(INDEXED | NORMED)?;
assert_eq!(searcher.num_docs(), 1);
let field = searcher.schema().get_field("string_bytes").unwrap();
let term = Term::from_field_bytes(field, b"lucene".as_ref());
Expand Down
1 change: 1 addition & 0 deletions src/fastfield/multivalued/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ mod tests {
IntOptions::default()
.set_fast(Cardinality::MultiValues)
.set_indexed()
.set_normed()
.set_stored(),
);
let time_i =
Expand Down
94 changes: 94 additions & 0 deletions src/fieldnorm/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,97 @@ pub use self::serializer::FieldNormsSerializer;
pub use self::writer::FieldNormsWriter;

use self::code::{fieldnorm_to_id, id_to_fieldnorm};

#[cfg(test)]
mod tests {
use crate::common::CompositeFile;
use crate::fieldnorm::FieldNormReader;
use crate::fieldnorm::FieldNormsSerializer;
use crate::fieldnorm::FieldNormsWriter;
use crate::{
directory::{Directory, RAMDirectory, WritePtr},
schema::{STRING, TEXT},
};
use once_cell::sync::Lazy;
use std::{
panic::{catch_unwind, AssertUnwindSafe},
path::Path,
};

use crate::schema::{Field, Schema, STORED};

/// Schema shared by the fieldnorm tests, with one text field per case:
///
/// * `field`: `STORED` only; the tests expect no fieldnorm data for it.
/// * `txt_field`: `TEXT`; the tests expect fieldnorm data for it.
/// * `str_field`: `STRING`; the tests expect no fieldnorm data for it.
pub static SCHEMA: Lazy<Schema> = Lazy::new(|| {
    let mut builder = Schema::builder();
    builder.add_text_field("field", STORED);
    builder.add_text_field("txt_field", TEXT);
    builder.add_text_field("str_field", STRING);
    builder.build()
});

/// Handle on the stored-only `field` entry of `SCHEMA`.
pub static FIELD: Lazy<Field> = Lazy::new(|| SCHEMA.get_field("field").unwrap());
/// Handle on the `TEXT` entry `txt_field` of `SCHEMA`.
pub static TXT_FIELD: Lazy<Field> = Lazy::new(|| SCHEMA.get_field("txt_field").unwrap());
/// Handle on the `STRING` entry `str_field` of `SCHEMA`.
pub static STR_FIELD: Lazy<Field> = Lazy::new(|| SCHEMA.get_field("str_field").unwrap());

// Ignored test: records fieldnorms 5 and 3 for docs 0 and 1 of `txt_field`,
// serializes them into an in-memory directory and reads them back.
#[ignore]
#[test]
pub fn test_fieldnorm_bug() -> crate::Result<()> {
let path = Path::new("test");
let directory: RAMDirectory = RAMDirectory::create();
{
// Write phase: everything is scoped so the writer objects are dropped
// before the file is read back.
let write: WritePtr = directory.open_write(Path::new("test"))?;
let serializer = FieldNormsSerializer::from_write(write)?;
let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA);
fieldnorm_writers.fill_up_to_max_doc(1u32);
fieldnorm_writers.record(0u32, *TXT_FIELD, 5);
fieldnorm_writers.record(1u32, *TXT_FIELD, 3);
fieldnorm_writers.serialize(serializer)?;
}
let file = directory.open_read(&path)?;
{
// Read phase: look up the per-field entries of the composite file.
let fields_composite = CompositeFile::open(&file)?;
assert!(fields_composite.open_read(*FIELD).is_none());
// NOTE(review): this asserts that `txt_field` has NO fieldnorm data, yet the
// `unwrap()` two lines below requires the very same lookup to succeed. Both
// cannot hold, so the test cannot pass as written; presumably this
// assertion is a leftover. Confirm before removing `#[ignore]`.
assert!(fields_composite.open_read(*TXT_FIELD).is_none());
assert!(fields_composite.open_read(*STR_FIELD).is_none());
let data = fields_composite.open_read(*TXT_FIELD).unwrap();
let fieldnorm_reader = FieldNormReader::open(data)?;
assert_eq!(fieldnorm_reader.fieldnorm(0u32), 5u32);
assert_eq!(fieldnorm_reader.fieldnorm(1u32), 3u32);
}
Ok(())
}

/// Round-trips one fieldnorm through serialization.
///
/// Only `txt_field` (declared `TEXT`) is expected to have fieldnorm data in the
/// composite file; `field` and `str_field` are expected to have none.
#[test]
pub fn test_fieldnorm() -> crate::Result<()> {
    let path = Path::new("test");
    let directory = RAMDirectory::create();
    {
        // Write phase, scoped so the writer objects are dropped before reading.
        let sink = directory.open_write(path)?;
        let serializer = FieldNormsSerializer::from_write(sink)?;
        let mut norms_writer = FieldNormsWriter::for_schema(&SCHEMA);
        norms_writer.fill_up_to_max_doc(1u32);
        norms_writer.record(1u32, *TXT_FIELD, 3);
        norms_writer.serialize(serializer)?;
    }

    // Read phase: open the composite file and check each field's entry.
    let file = directory.open_read(path)?;
    let composite = CompositeFile::open(&file)?;
    assert!(composite.open_read(*FIELD).is_none());
    assert!(composite.open_read(*STR_FIELD).is_none());
    let txt_data = composite.open_read(*TXT_FIELD).unwrap();
    let norms_reader = FieldNormReader::open(txt_data)?;
    assert_eq!(norms_reader.fieldnorm(1u32), 3u32);
    Ok(())
}

/// Checks that recording a fieldnorm a second time for the same document and
/// field panics.
#[test]
pub fn test_fail_fieldnorm_cannot_registered_twice() {
    let mut norms_writer = FieldNormsWriter::for_schema(&SCHEMA);
    norms_writer.fill_up_to_max_doc(1u32);
    norms_writer.record(1u32, *TXT_FIELD, 5);
    // The writer is borrowed mutably across the unwind boundary, hence the
    // explicit `AssertUnwindSafe` wrapper.
    let second_record = AssertUnwindSafe(|| norms_writer.record(1u32, *TXT_FIELD, 3));
    let outcome = catch_unwind(second_record);
    assert!(outcome.is_err());
}
}
2 changes: 1 addition & 1 deletion src/fieldnorm/writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ impl FieldNormsWriter {
schema
.fields()
.filter_map(|(field, field_entry)| {
if field_entry.is_indexed() {
if field_entry.is_indexed() && field_entry.is_normed() {
Some(field)
} else {
None
Expand Down
4 changes: 3 additions & 1 deletion src/indexer/segment_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,9 @@ impl SegmentWriter {
)
};

self.fieldnorms_writer.record(doc_id, field, num_tokens);
if field_entry.is_normed() {
self.fieldnorms_writer.record(doc_id, field, num_tokens);
}
}
FieldType::U64(_) => {
for field_value in field_values {
Expand Down
4 changes: 3 additions & 1 deletion src/query/term_query/term_weight.rs
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,9 @@ impl TermWeight {
let field = self.term.field();
let inverted_index = reader.inverted_index(field)?;
let fieldnorm_reader = if self.scoring_enabled {
reader.get_fieldnorms_reader(field)?
reader
.get_fieldnorms_reader(field)
.unwrap_or(FieldNormReader::constant(reader.max_doc(), 1))
} else {
FieldNormReader::constant(reader.max_doc(), 1)
};
Expand Down
38 changes: 36 additions & 2 deletions src/schema/bytes_options.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
use serde::{Deserialize, Serialize};
use std::ops::BitOr;

use super::flags::{FastFlag, IndexedFlag, SchemaFlagList, StoredFlag};
use super::flags::{FastFlag, IndexedFlag, NormedFlag, SchemaFlagList, StoredFlag};
/// Define how an a bytes field should be handled by tantivy.
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub struct BytesOptions {
indexed: bool,
normed: bool,
fast: bool,
stored: bool,
}
Expand All @@ -16,6 +17,11 @@ impl BytesOptions {
self.indexed
}

/// Returns true iff the field is normed, i.e. its `normed` option is set
/// (through `set_normed()` or the `NORMED` flag).
pub fn is_normed(&self) -> bool {
self.normed
}

/// Returns true iff the value is a fast field.
pub fn is_fast(&self) -> bool {
self.fast
Expand All @@ -35,6 +41,15 @@ impl BytesOptions {
self
}

/// Set the field as normed.
///
/// Setting a bytes field as normed will generate
/// the fieldnorm data for it.
///
/// Note that `FieldNormsWriter` only keeps fields that are both
/// indexed and normed.
pub fn set_normed(mut self) -> BytesOptions {
self.normed = true;
self
}

/// Set the field as a single-valued fast field.
///
/// Fast fields are designed for random access.
Expand All @@ -60,6 +75,7 @@ impl Default for BytesOptions {
fn default() -> BytesOptions {
BytesOptions {
indexed: false,
normed: false,
fast: false,
stored: false,
}
Expand All @@ -73,6 +89,7 @@ impl<T: Into<BytesOptions>> BitOr<T> for BytesOptions {
let other = other.into();
BytesOptions {
indexed: self.indexed | other.indexed,
normed: self.normed | other.normed,
stored: self.stored | other.stored,
fast: self.fast | other.fast,
}
Expand All @@ -89,6 +106,7 @@ impl From<FastFlag> for BytesOptions {
fn from(_: FastFlag) -> Self {
BytesOptions {
indexed: false,
normed: false,
stored: false,
fast: true,
}
Expand All @@ -99,6 +117,7 @@ impl From<StoredFlag> for BytesOptions {
fn from(_: StoredFlag) -> Self {
BytesOptions {
indexed: false,
normed: false,
stored: true,
fast: false,
}
Expand All @@ -109,6 +128,18 @@ impl From<IndexedFlag> for BytesOptions {
fn from(_: IndexedFlag) -> Self {
BytesOptions {
indexed: true,
normed: false,
stored: false,
fast: false,
}
}
}

impl From<NormedFlag> for BytesOptions {
fn from(_: NormedFlag) -> Self {
BytesOptions {
indexed: false,
normed: true,
stored: false,
fast: false,
}
Expand All @@ -128,12 +159,13 @@ where

#[cfg(test)]
mod tests {
use crate::schema::{BytesOptions, FAST, INDEXED, STORED};
use crate::schema::{BytesOptions, FAST, INDEXED, NORMED, STORED};

#[test]
fn test_bytes_option_fast_flag() {
assert_eq!(BytesOptions::default().set_fast(), FAST.into());
assert_eq!(BytesOptions::default().set_indexed(), INDEXED.into());
assert_eq!(BytesOptions::default().set_normed(), NORMED.into());
assert_eq!(BytesOptions::default().set_stored(), STORED.into());
}
#[test]
Expand All @@ -157,8 +189,10 @@ mod tests {
assert!(!BytesOptions::default().is_stored());
assert!(!BytesOptions::default().is_fast());
assert!(!BytesOptions::default().is_indexed());
assert!(!BytesOptions::default().is_normed());
assert!(BytesOptions::default().set_stored().is_stored());
assert!(BytesOptions::default().set_fast().is_fast());
assert!(BytesOptions::default().set_indexed().is_indexed());
assert!(BytesOptions::default().set_normed().is_normed());
}
}
5 changes: 5 additions & 0 deletions src/schema/field_entry.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,11 @@ impl FieldEntry {
}
}

/// Returns true iff the field is normed.
///
/// The answer is delegated to the field's type (`FieldType::is_normed`).
pub fn is_normed(&self) -> bool {
self.field_type.is_normed()
}

/// Returns true iff the field is a int (signed or unsigned) fast field
pub fn is_fast(&self) -> bool {
match self.field_type {
Expand Down
16 changes: 16 additions & 0 deletions src/schema/field_type.rs
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,22 @@ impl FieldType {
}
}

/// Returns true iff the field is normed.
///
/// * Text fields carry no explicit flag: they count as normed when they have
///   indexing options whose record option is anything other than
///   `IndexRecordOption::Basic`. A text field without indexing options is
///   not normed.
/// * Integer, float, date and bytes fields report their own `normed` option.
/// * Hierarchical facets are never normed.
pub fn is_normed(&self) -> bool {
    match self {
        FieldType::Str(text_options) => text_options
            .get_indexing_options()
            .map(|indexing| indexing.index_option() != IndexRecordOption::Basic)
            .unwrap_or(false),
        FieldType::U64(int_options)
        | FieldType::I64(int_options)
        | FieldType::F64(int_options)
        | FieldType::Date(int_options) => int_options.is_normed(),
        FieldType::Bytes(bytes_options) => bytes_options.is_normed(),
        FieldType::HierarchicalFacet(_) => false,
    }
}

/// Given a field configuration, return the maximal possible
/// `IndexRecordOption` available.
///
Expand Down
12 changes: 12 additions & 0 deletions src/schema/flags.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,18 @@ pub const INDEXED: SchemaFlagList<IndexedFlag, ()> = SchemaFlagList {
tail: (),
};

#[derive(Clone)]
pub struct NormedFlag;
/// Flag to mark the field as normed.
///
/// Fieldnorm data is generated for a normed field, provided it is also indexed.
/// The `NORMED` flag is meant for non-text fields (for instance when building `BytesOptions`).
/// Text fields do not take this flag: whether they keep fieldnorms follows from their
/// indexing options (`TEXT` fields have fieldnorms, `STRING` fields do not).
pub const NORMED: SchemaFlagList<NormedFlag, ()> = SchemaFlagList {
head: NormedFlag,
tail: (),
};

#[derive(Clone)]
pub struct FastFlag;
/// Flag to mark the field as a fast field (similar to Lucene's DocValues)
Expand Down
Loading

0 comments on commit 784c9a4

Please sign in to comment.