From a8c1fbed0a4fa4454bd22bd2183cf70ce5523b90 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sat, 19 Nov 2022 17:37:17 +0100 Subject: [PATCH] Added a `expand_dots` JsonObjectOptions. Related with quickwit#2345. --- src/indexer/json_term_writer.rs | 76 +++++++++++++++++++++----- src/indexer/mod.rs | 28 +++++++++- src/indexer/segment_writer.rs | 13 +++-- src/query/query_parser/query_parser.rs | 58 ++++++++++---------- src/schema/json_object_options.rs | 30 +++++++++- src/schema/term.rs | 13 ++++- 6 files changed, 165 insertions(+), 53 deletions(-) diff --git a/src/indexer/json_term_writer.rs b/src/indexer/json_term_writer.rs index 71543acd37..989357804b 100644 --- a/src/indexer/json_term_writer.rs +++ b/src/indexer/json_term_writer.rs @@ -67,11 +67,12 @@ pub(crate) fn index_json_values<'a>( doc: DocId, json_values: impl Iterator>>, text_analyzer: &TextAnalyzer, + expand_dots_enabled: bool, term_buffer: &mut Term, postings_writer: &mut dyn PostingsWriter, ctx: &mut IndexingContext, ) -> crate::Result<()> { - let mut json_term_writer = JsonTermWriter::wrap(term_buffer); + let mut json_term_writer = JsonTermWriter::wrap(term_buffer, expand_dots_enabled); let mut positions_per_path: IndexingPositionsPerPath = Default::default(); for json_value_res in json_values { let json_value = json_value_res?; @@ -259,6 +260,7 @@ pub(crate) fn set_string_and_get_terms( pub struct JsonTermWriter<'a> { term_buffer: &'a mut Term, path_stack: Vec, + expand_dots_enabled: bool, } /// Splits a json path supplied to the query parser in such a way that @@ -298,23 +300,25 @@ impl<'a> JsonTermWriter<'a> { pub fn from_field_and_json_path( field: Field, json_path: &str, + expand_dots_enabled: bool, term_buffer: &'a mut Term, ) -> Self { term_buffer.set_field_and_type(field, Type::Json); - let mut json_term_writer = Self::wrap(term_buffer); + let mut json_term_writer = Self::wrap(term_buffer, expand_dots_enabled); for segment in split_json_path(json_path) { json_term_writer.push_path_segment(&segment); } json_term_writer } - pub fn wrap(term_buffer: &'a mut Term) -> Self { + pub fn wrap(term_buffer: &'a mut Term, expand_dots_enabled: bool) -> Self { term_buffer.clear_with_type(Type::Json); let mut path_stack = Vec::with_capacity(10); path_stack.push(0); Self { term_buffer, path_stack, + expand_dots_enabled, } } @@ -336,11 +340,24 @@ impl<'a> JsonTermWriter<'a> { self.trim_to_end_of_path(); let buffer = self.term_buffer.value_bytes_mut(); let buffer_len = buffer.len(); + if self.path_stack.len() > 1 { buffer[buffer_len - 1] = JSON_PATH_SEGMENT_SEP; } - self.term_buffer.append_bytes(segment.as_bytes()); - self.term_buffer.append_bytes(&[JSON_PATH_SEGMENT_SEP]); + if self.expand_dots_enabled && segment.as_bytes().contains(&b'.') { + // We need to replace `.` by JSON_PATH_SEGMENT_SEP. + self.term_buffer + .append_bytes(segment.as_bytes()) + .iter_mut() + .for_each(|byte| { + if *byte == b'.' { + *byte = JSON_PATH_SEGMENT_SEP; + } + }); + } else { + self.term_buffer.append_bytes(segment.as_bytes()); + } + self.term_buffer.push_byte(JSON_PATH_SEGMENT_SEP); self.path_stack.push(self.term_buffer.len_bytes()); } @@ -391,7 +408,7 @@ mod tests { fn test_json_writer() { let field = Field::from_field_id(1); let mut term = Term::with_type_and_field(Type::Json, field); - let mut json_writer = JsonTermWriter::wrap(&mut term); + let mut json_writer = JsonTermWriter::wrap(&mut term, false); json_writer.push_path_segment("attributes"); json_writer.push_path_segment("color"); json_writer.set_str("red"); @@ -425,7 +442,7 @@ mod tests { fn test_string_term() { let field = Field::from_field_id(1); let mut term = Term::with_type_and_field(Type::Json, field); - let mut json_writer = JsonTermWriter::wrap(&mut term); + let mut json_writer = JsonTermWriter::wrap(&mut term, false); json_writer.push_path_segment("color"); json_writer.set_str("red"); assert_eq!( @@ -438,7 +455,7 @@ mod tests { fn test_i64_term() { let field = Field::from_field_id(1); let mut term = Term::with_type_and_field(Type::Json, field); - let mut json_writer = JsonTermWriter::wrap(&mut term); + let mut json_writer = JsonTermWriter::wrap(&mut term, false); json_writer.push_path_segment("color"); json_writer.set_fast_value(-4i64); assert_eq!( @@ -451,7 +468,7 @@ mod tests { fn test_u64_term() { let field = Field::from_field_id(1); let mut term = Term::with_type_and_field(Type::Json, field); - let mut json_writer = JsonTermWriter::wrap(&mut term); + let mut json_writer = JsonTermWriter::wrap(&mut term, false); json_writer.push_path_segment("color"); json_writer.set_fast_value(4u64); assert_eq!( @@ -464,7 +481,7 @@ mod tests { fn test_f64_term() { let field = Field::from_field_id(1); let mut term = Term::with_type_and_field(Type::Json, field); - let mut json_writer = JsonTermWriter::wrap(&mut term); + let mut json_writer = JsonTermWriter::wrap(&mut term, false); json_writer.push_path_segment("color"); json_writer.set_fast_value(4.0f64); assert_eq!( @@ -477,7 +494,7 @@ mod tests { fn test_bool_term() { let field = Field::from_field_id(1); let mut term = Term::with_type_and_field(Type::Json, field); - let mut json_writer = JsonTermWriter::wrap(&mut term); + let mut json_writer = JsonTermWriter::wrap(&mut term, false); json_writer.push_path_segment("color"); json_writer.set_fast_value(true); assert_eq!( @@ -490,7 +507,7 @@ mod tests { fn test_push_after_set_path_segment() { let field = Field::from_field_id(1); let mut term = Term::with_type_and_field(Type::Json, field); - let mut json_writer = JsonTermWriter::wrap(&mut term); + let mut json_writer = JsonTermWriter::wrap(&mut term, false); json_writer.push_path_segment("attribute"); json_writer.set_str("something"); json_writer.push_path_segment("color"); @@ -505,7 +522,7 @@ mod tests { fn test_pop_segment() { let field = Field::from_field_id(1); let mut term = Term::with_type_and_field(Type::Json, field); - let mut json_writer = JsonTermWriter::wrap(&mut term); + let mut json_writer = JsonTermWriter::wrap(&mut term, false); json_writer.push_path_segment("color"); json_writer.push_path_segment("hue"); json_writer.pop_path_segment(); @@ -520,7 +537,7 @@ mod tests { fn test_json_writer_path() { let field = Field::from_field_id(1); let mut term = Term::with_type_and_field(Type::Json, field); - let mut json_writer = JsonTermWriter::wrap(&mut term); + let mut json_writer = JsonTermWriter::wrap(&mut term, false); json_writer.push_path_segment("color"); assert_eq!(json_writer.path(), b"color"); json_writer.push_path_segment("hue"); @@ -529,6 +546,37 @@ mod tests { assert_eq!(json_writer.path(), b"color\x01hue"); } + #[test] + fn test_json_path_expand_dots_disabled() { + let field = Field::from_field_id(1); + let mut term = Term::with_type_and_field(Type::Json, field); + let mut json_writer = JsonTermWriter::wrap(&mut term, false); + json_writer.push_path_segment("color.hue"); + assert_eq!(json_writer.path(), b"color.hue"); + } + + #[test] + fn test_json_path_expand_dots_enabled() { + let field = Field::from_field_id(1); + let mut term = Term::with_type_and_field(Type::Json, field); + let mut json_writer = JsonTermWriter::wrap(&mut term, true); + json_writer.push_path_segment("color.hue"); + assert_eq!(json_writer.path(), b"color\x01hue"); + } + + #[test] + fn test_json_path_expand_dots_enabled_pop_segment() { + let field = Field::from_field_id(1); + let mut term = Term::with_type_and_field(Type::Json, field); + let mut json_writer = JsonTermWriter::wrap(&mut term, true); + json_writer.push_path_segment("hello"); + assert_eq!(json_writer.path(), b"hello"); + json_writer.push_path_segment("color.hue"); + assert_eq!(json_writer.path(), b"hello\x01color\x01hue"); + json_writer.pop_path_segment(); + assert_eq!(json_writer.path(), b"hello"); + } + #[test] fn test_split_json_path_simple() { let json_path = split_json_path("titi.toto"); diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index c55c241f1b..36d911afb4 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -60,7 +60,7 @@ type AddBatchReceiver = channel::Receiver; mod tests_mmap { use crate::collector::Count; use crate::query::QueryParser; - use crate::schema::{Schema, STORED, TEXT}; + use crate::schema::{JsonObjectOptions, Schema, TEXT}; use crate::{Index, Term}; #[test] @@ -81,9 +81,9 @@ mod tests_mmap { } #[test] - fn test_json_field_espace() { + fn test_json_field_expand_dots_disabled_dot_escaped_required() { let mut schema_builder = Schema::builder(); - let json_field = schema_builder.add_json_field("json", TEXT | STORED); + let json_field = schema_builder.add_json_field("json", TEXT); let index = Index::create_in_ram(schema_builder.build()); let mut index_writer = index.writer_for_tests().unwrap(); let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"}); @@ -99,4 +99,26 @@ mod tests_mmap { let num_docs = searcher.search(&query, &Count).unwrap(); assert_eq!(num_docs, 1); } + + #[test] + fn test_json_field_expand_dots_enabled_dot_escape_not_required() { + let mut schema_builder = Schema::builder(); + let json_options: JsonObjectOptions = + JsonObjectOptions::from(TEXT).set_expand_dots_enabled(); + let json_field = schema_builder.add_json_field("json", json_options); + let index = Index::create_in_ram(schema_builder.build()); + let mut index_writer = index.writer_for_tests().unwrap(); + let json = serde_json::json!({"k8s.container.name": "prometheus", "val": "hello"}); + index_writer.add_document(doc!(json_field=>json)).unwrap(); + index_writer.commit().unwrap(); + let reader = index.reader().unwrap(); + let searcher = reader.searcher(); + assert_eq!(searcher.num_docs(), 1); + let parse_query = QueryParser::for_index(&index, Vec::new()); + let query = parse_query + .parse_query(r#"json.k8s.container.name:prometheus"#) + .unwrap(); + let num_docs = searcher.search(&query, &Count).unwrap(); + assert_eq!(num_docs, 1); + } } diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index c22e9dc56b..3bf99dc302 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -180,7 +180,7 @@ impl SegmentWriter { self.per_field_postings_writers.get_for_field_mut(field); term_buffer.clear_with_field_and_type(field_entry.field_type().value_type(), field); - match *field_entry.field_type() { + match field_entry.field_type() { FieldType::Facet(_) => { for value in values { let facet = value.as_facet().ok_or_else(make_schema_error)?; @@ -307,7 +307,7 @@ impl SegmentWriter { self.fieldnorms_writer.record(doc_id, field, num_vals); } } - FieldType::JsonObject(_) => { + FieldType::JsonObject(json_options) => { let text_analyzer = &self.per_field_text_analyzers[field.field_id() as usize]; let json_values_it = values.map(|value| value.as_json().ok_or_else(make_schema_error)); @@ -315,6 +315,7 @@ impl SegmentWriter { doc_id, json_values_it, text_analyzer, + json_options.is_expand_dots_enabled(), term_buffer, postings_writer, ctx, @@ -557,7 +558,7 @@ mod tests { let mut term = Term::with_type_and_field(Type::Json, json_field); let mut term_stream = term_dict.stream().unwrap(); - let mut json_term_writer = JsonTermWriter::wrap(&mut term); + let mut json_term_writer = JsonTermWriter::wrap(&mut term, false); json_term_writer.push_path_segment("bool"); json_term_writer.set_fast_value(true); @@ -648,7 +649,7 @@ mod tests { let segment_reader = searcher.segment_reader(0u32); let inv_index = segment_reader.inverted_index(json_field).unwrap(); let mut term = Term::with_type_and_field(Type::Json, json_field); - let mut json_term_writer = JsonTermWriter::wrap(&mut term); + let mut json_term_writer = JsonTermWriter::wrap(&mut term, false); json_term_writer.push_path_segment("mykey"); json_term_writer.set_str("token"); let term_info = inv_index @@ -692,7 +693,7 @@ mod tests { let segment_reader = searcher.segment_reader(0u32); let inv_index = segment_reader.inverted_index(json_field).unwrap(); let mut term = Term::with_type_and_field(Type::Json, json_field); - let mut json_term_writer = JsonTermWriter::wrap(&mut term); + let mut json_term_writer = JsonTermWriter::wrap(&mut term, false); json_term_writer.push_path_segment("mykey"); json_term_writer.set_str("two tokens"); let term_info = inv_index @@ -737,7 +738,7 @@ mod tests { let reader = index.reader().unwrap(); let searcher = reader.searcher(); let mut term = Term::with_type_and_field(Type::Json, json_field); - let mut json_term_writer = JsonTermWriter::wrap(&mut term); + let mut json_term_writer = JsonTermWriter::wrap(&mut term, false); json_term_writer.push_path_segment("mykey"); json_term_writer.push_path_segment("field"); json_term_writer.set_str("hello"); diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index a7f864e96a..c703cdef67 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -16,7 +16,8 @@ use crate::query::{ TermQuery, TermSetQuery, }; use crate::schema::{ - Facet, FacetParseError, Field, FieldType, IndexRecordOption, IntoIpv6Addr, Schema, Term, Type, + Facet, FacetParseError, Field, FieldType, IndexRecordOption, IntoIpv6Addr, JsonObjectOptions, + Schema, Term, Type, }; use crate::time::format_description::well_known::Rfc3339; use crate::time::OffsetDateTime; @@ -482,28 +483,14 @@ impl QueryParser { .into_iter() .collect()) } - FieldType::JsonObject(ref json_options) => { - let option = json_options.get_text_indexing_options().ok_or_else(|| { - // This should have been seen earlier really. - QueryParserError::FieldNotIndexed(field_name.to_string()) - })?; - let text_analyzer = - self.tokenizer_manager - .get(option.tokenizer()) - .ok_or_else(|| QueryParserError::UnknownTokenizer { - field: field_name.to_string(), - tokenizer: option.tokenizer().to_string(), - })?; - let index_record_option = option.index_option(); - generate_literals_for_json_object( - field_name, - field, - json_path, - phrase, - &text_analyzer, - index_record_option, - ) - } + FieldType::JsonObject(ref json_options) => generate_literals_for_json_object( + field_name, + field, + json_path, + phrase, + &self.tokenizer_manager, + json_options, + ), FieldType::Facet(_) => match Facet::from_text(phrase) { Ok(facet) => { let facet_term = Term::from_facet(field, &facet); @@ -767,17 +754,32 @@ fn generate_literals_for_json_object( field: Field, json_path: &str, phrase: &str, - text_analyzer: &TextAnalyzer, - index_record_option: IndexRecordOption, + tokenizer_manager: &TokenizerManager, + json_options: &JsonObjectOptions, ) -> Result, QueryParserError> { + let text_options = json_options.get_text_indexing_options().ok_or_else(|| { + // This should have been seen earlier really. + QueryParserError::FieldNotIndexed(field_name.to_string()) + })?; + let text_analyzer = tokenizer_manager + .get(text_options.tokenizer()) + .ok_or_else(|| QueryParserError::UnknownTokenizer { + field: field_name.to_string(), + tokenizer: text_options.tokenizer().to_string(), + })?; + let index_record_option = text_options.index_option(); let mut logical_literals = Vec::new(); let mut term = Term::with_capacity(100); - let mut json_term_writer = - JsonTermWriter::from_field_and_json_path(field, json_path, &mut term); + let mut json_term_writer = JsonTermWriter::from_field_and_json_path( + field, + json_path, + json_options.is_expand_dots_enabled(), + &mut term, + ); if let Some(term) = convert_to_fast_value_and_get_term(&mut json_term_writer, phrase) { logical_literals.push(LogicalLiteral::Term(term)); } - let terms = set_string_and_get_terms(&mut json_term_writer, phrase, text_analyzer); + let terms = set_string_and_get_terms(&mut json_term_writer, phrase, &text_analyzer); drop(json_term_writer); if terms.len() <= 1 { for (_, term) in terms { diff --git a/src/schema/json_object_options.rs b/src/schema/json_object_options.rs index d5829bddba..d6ea40b147 100644 --- a/src/schema/json_object_options.rs +++ b/src/schema/json_object_options.rs @@ -13,6 +13,8 @@ pub struct JsonObjectOptions { // If set to some, int, date, f64 and text will be indexed. // Text will use the TextFieldIndexing setting for indexing. indexing: Option, + + expand_dots_enabled: bool, } impl JsonObjectOptions { @@ -26,6 +28,29 @@ impl JsonObjectOptions { self.indexing.is_some() } + /// Returns `true` iff dots in json keys should be expanded. + /// + /// When expand_dots is enabled, json object like + /// `{"k8s.node.id": 5}` is processed as if it was + /// `{"k8s": {"node": {"id": 5}}}`. + /// It option has the merit of allowing users to + /// write queries like `k8s.node.id:5`. + /// On the other, enabling that feature can lead to + /// ambiguity. + /// + /// If disabled, the "." need to be escaped: + /// `k8s\.node\.id:5`. + pub fn is_expand_dots_enabled(&self) -> bool { + self.expand_dots_enabled + } + + /// Sets `expands_dots` to true. + /// See `is_expand_dots_enabled` for more information. + pub fn set_expand_dots_enabled(mut self) -> Self { + self.expand_dots_enabled = true; + self + } + /// Returns the text indexing options. /// /// If set to `Some` then both int and str values will be indexed. @@ -55,6 +80,7 @@ impl From for JsonObjectOptions { JsonObjectOptions { stored: true, indexing: None, + expand_dots_enabled: false, } } } @@ -69,10 +95,11 @@ impl> BitOr for JsonObjectOptions { type Output = JsonObjectOptions; fn bitor(self, other: T) -> Self { - let other = other.into(); + let other: JsonObjectOptions = other.into(); JsonObjectOptions { indexing: self.indexing.or(other.indexing), stored: self.stored | other.stored, + expand_dots_enabled: self.expand_dots_enabled | other.expand_dots_enabled, } } } @@ -93,6 +120,7 @@ impl From for JsonObjectOptions { JsonObjectOptions { stored: text_options.is_stored(), indexing: text_options.get_indexing_options().cloned(), + expand_dots_enabled: false, } } } diff --git a/src/schema/term.rs b/src/schema/term.rs index ba9a1ba822..ad393b1dbc 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -197,8 +197,19 @@ impl Term { } /// Appends value bytes to the Term. - pub fn append_bytes(&mut self, bytes: &[u8]) { + /// + /// This function returns the segment that has just been added. + #[inline] + pub fn append_bytes(&mut self, bytes: &[u8]) -> &mut [u8] { + let len_before = self.0.len(); self.0.extend_from_slice(bytes); + &mut self.0[len_before..] + } + + /// Appends a single byte to the term. + #[inline] + pub fn push_byte(&mut self, byte: u8) { + self.0.push(byte); } }