From d1f281e1c8edc3e8194d30433782805a99b1173e Mon Sep 17 00:00:00 2001 From: Mehul <65443164+infiniteregrets@users.noreply.github.com> Date: Thu, 7 Apr 2022 23:08:04 -0400 Subject: [PATCH] Add option for fieldnorm (#1215) * Add option for fieldnorm * Updated backward compatibility tests. Closes #1134 --- .../default_doc_mapper/field_mapping_entry.rs | 42 +++++++ ...2f93986378154c2ced8692b1c82b.expected.json | 4 + ...5eb4eab7af5fd6a01237efebe352.expected.json | 4 + ...3f14ad8abc59c64907d07d4ab421.expected.json | 116 ++++++++++++++++++ .../v0-cefa3f14ad8abc59c64907d07d4ab421.json | 116 ++++++++++++++++++ ...e15d8611ab4d84b0eaec0607907c.expected.json | 4 + .../index-metadata/unversioned.expected.json | 4 + ...93da53735bc58ec35c1648a4f14c.expected.json | 4 + ...3f5078aa6acf314c95936336fa68.expected.json | 4 + ...352caf8990d0e7a54582a4478cd5.expected.json | 4 + ...b1b7051ec08b3b442298cd7ba362.expected.json | 4 + ...f6216df5ed38e0548ac2012570e1.expected.json | 89 ++++++++++++++ .../v1-b26ff6216df5ed38e0548ac2012570e1.json | 89 ++++++++++++++ ...66301963300ef942997308b4f3ac.expected.json | 4 + 14 files changed, 488 insertions(+) create mode 100644 quickwit-metastore/test-data/file-backed-index/v0-cefa3f14ad8abc59c64907d07d4ab421.expected.json create mode 100644 quickwit-metastore/test-data/file-backed-index/v0-cefa3f14ad8abc59c64907d07d4ab421.json create mode 100644 quickwit-metastore/test-data/index-metadata/v1-b26ff6216df5ed38e0548ac2012570e1.expected.json create mode 100644 quickwit-metastore/test-data/index-metadata/v1-b26ff6216df5ed38e0548ac2012570e1.json diff --git a/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_entry.rs b/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_entry.rs index a2ca6bd312d..35fb9482e4e 100644 --- a/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_entry.rs +++ b/quickwit-doc-mapper/src/default_doc_mapper/field_mapping_entry.rs @@ -510,6 +510,8 @@ struct FieldMappingEntryForSerialization { #[serde(skip_serializing_if = "Option::is_none")] indexed: Option, #[serde(skip_serializing_if = "Option::is_none")] + fieldnorms: Option, + #[serde(skip_serializing_if = "Option::is_none")] tokenizer: Option, #[serde(skip_serializing_if = "Option::is_none")] record: Option, @@ -551,6 +553,7 @@ impl From for FieldMappingEntryForSerialization { let type_with_cardinality = value.mapping_type.type_with_cardinality(); let mut fast = false; let mut indexed = None; + let mut fieldnorms = None; let mut record = None; let mut stored = false; let mut tokenizer: Option = None; @@ -560,8 +563,11 @@ impl From for FieldMappingEntryForSerialization { if let Some(indexing_options) = text_options.get_indexing_options() { tokenizer = Some(indexing_options.tokenizer().to_owned()); record = Some(indexing_options.index_option()); + indexed = Some(true); + fieldnorms = Some(indexing_options.fieldnorms()); } else { indexed = Some(false); + fieldnorms = Some(false); } } FieldMappingType::I64(options, _) @@ -585,6 +591,7 @@ impl From for FieldMappingEntryForSerialization { type_with_cardinality, fast, indexed, + fieldnorms, record, stored, tokenizer, @@ -623,8 +630,10 @@ impl FieldMappingEntryForSerialization { ) } let mut options = TextOptions::default(); + if self.indexed.unwrap_or(true) { let mut indexing_options = TextFieldIndexing::default(); + indexing_options = indexing_options.set_fieldnorms(self.fieldnorms.unwrap_or(false)); if let Some(index_option) = self.record { indexing_options = indexing_options.set_index_option(index_option); } @@ -673,6 +682,9 @@ impl FieldMappingEntryForSerialization { } if self.indexed.unwrap_or(true) { options = options.set_indexed(); + if self.fieldnorms.unwrap_or(false) { + options = options.set_fieldnorms(); + } } if self.fast { options = options.set_fast(); @@ -720,6 +732,9 @@ impl FieldMappingEntryForSerialization { } if self.indexed.unwrap_or(true) { options = options.set_indexed(); + if self.fieldnorms.unwrap_or(false) { + options = options.set_fieldnorm(); + } } Ok(options) } @@ -821,6 +836,32 @@ mod tests { Ok(()) } + #[test] + fn test_deserialize_valid_fieldnorms() -> anyhow::Result<()> { + let result = serde_json::from_str::( + r#" + { + "name": "my_field_name", + "type": "text", + "stored": true, + "indexed": true, + "fieldnorms": true, + "record": "basic", + "tokenizer": "english" + }"#, + ); + match result.unwrap().mapping_type { + FieldMappingType::Text(options, _) => { + assert_eq!(options.is_stored(), true); + let index_options = options.get_indexing_options().unwrap(); + assert_eq!(index_options.fieldnorms(), true); + } + _ => panic!("wrong property type"), + } + + Ok(()) + } + #[test] fn test_error_on_text_with_invalid_options() -> anyhow::Result<()> { let result = serde_json::from_str::( @@ -963,6 +1004,7 @@ mod tests { match result.mapping_type { FieldMappingType::I64(options, cardinality) => { assert_eq!(options.is_indexed(), true); // default + assert_eq!(options.fieldnorms(), false); // default assert_eq!(options.is_fast(), false); // default assert_eq!(options.is_stored(), true); // default assert_eq!(cardinality, Cardinality::MultiValues); diff --git a/quickwit-metastore/test-data/file-backed-index/v0-9e6b2f93986378154c2ced8692b1c82b.expected.json b/quickwit-metastore/test-data/file-backed-index/v0-9e6b2f93986378154c2ced8692b1c82b.expected.json index cf763ab357f..1f04a43d9be 100644 --- a/quickwit-metastore/test-data/file-backed-index/v0-9e6b2f93986378154c2ced8692b1c82b.expected.json +++ b/quickwit-metastore/test-data/file-backed-index/v0-9e6b2f93986378154c2ced8692b1c82b.expected.json @@ -24,6 +24,8 @@ }, { "fast": false, + "fieldnorms": false, + "indexed": true, "name": "log_level", "record": "basic", "stored": true, @@ -32,6 +34,8 @@ }, { "fast": false, + "fieldnorms": false, + "indexed": true, "name": "message", "record": "position", "stored": true, diff --git a/quickwit-metastore/test-data/file-backed-index/v0-abe75eb4eab7af5fd6a01237efebe352.expected.json b/quickwit-metastore/test-data/file-backed-index/v0-abe75eb4eab7af5fd6a01237efebe352.expected.json index 686196e0d3e..af1e6666662 100644 --- a/quickwit-metastore/test-data/file-backed-index/v0-abe75eb4eab7af5fd6a01237efebe352.expected.json +++ b/quickwit-metastore/test-data/file-backed-index/v0-abe75eb4eab7af5fd6a01237efebe352.expected.json @@ -24,6 +24,8 @@ }, { "fast": false, + "fieldnorms": false, + "indexed": true, "name": "log_level", "record": "basic", "stored": true, @@ -32,6 +34,8 @@ }, { "fast": false, + "fieldnorms": false, + "indexed": true, "name": "message", "record": "position", "stored": true, diff --git a/quickwit-metastore/test-data/file-backed-index/v0-cefa3f14ad8abc59c64907d07d4ab421.expected.json b/quickwit-metastore/test-data/file-backed-index/v0-cefa3f14ad8abc59c64907d07d4ab421.expected.json new file mode 100644 index 00000000000..7a9cf79cd01 --- /dev/null +++ b/quickwit-metastore/test-data/file-backed-index/v0-cefa3f14ad8abc59c64907d07d4ab421.expected.json @@ -0,0 +1,116 @@ +{ + "index": { + "checkpoint": { + "kafka-source": { + "00000000000000000000": "00000000000000000042" + } + }, + "create_timestamp": 1789, + "doc_mapping": { + "field_mappings": [ + { + "fast": true, + "indexed": true, + "name": "tenant_id", + "stored": true, + "type": "u64" + }, + { + "fast": true, + "indexed": true, + "name": "timestamp", + "stored": true, + "type": "i64" + }, + { + "fast": false, + "fieldnorms": false, + "indexed": true, + "name": "log_level", + "record": "basic", + "stored": true, + "tokenizer": "raw", + "type": "text" + }, + { + "fast": false, + "fieldnorms": false, + "indexed": true, + "name": "message", + "record": "position", + "stored": true, + "tokenizer": "default", + "type": "text" + } + ], + "store_source": true, + "tag_fields": [ + "log_level", + "tenant_id" + ] + }, + "index_id": "my-index", + "index_uri": "s3://quickwit-indexes/my-index", + "indexing_settings": { + "commit_timeout_secs": 301, + "demux_enabled": true, + "demux_field": "tenant_id", + "merge_enabled": true, + "merge_policy": { + "demux_factor": 7, + "max_merge_factor": 11, + "merge_factor": 9 + }, + "resources": { + "heap_size": 3, + "num_threads": 3 + }, + "sort_field": "timestamp", + "sort_order": "asc", + "split_num_docs_target": 10000001, + "timestamp_field": "timestamp" + }, + "search_settings": { + "default_search_fields": [ + "message" + ] + }, + "sources": [ + { + "params": { + "client_params": {}, + "topic": "kafka-topic" + }, + "source_id": "kafka-source", + "source_type": "kafka" + } + ], + "update_timestamp": 1789, + "version": "1" + }, + "splits": [ + { + "create_timestamp": 3, + "demux_num_ops": 1, + "footer_offsets": { + "end": 2000, + "start": 1000 + }, + "num_docs": 12303, + "size_in_bytes": 234234, + "split_id": "split", + "split_state": "Published", + "tags": [ + "234", + "aaa" + ], + "time_range": { + "end": 130198, + "start": 121000 + }, + "update_timestamp": 1789, + "version": "1" + } + ], + "version": "0" +} diff --git a/quickwit-metastore/test-data/file-backed-index/v0-cefa3f14ad8abc59c64907d07d4ab421.json b/quickwit-metastore/test-data/file-backed-index/v0-cefa3f14ad8abc59c64907d07d4ab421.json new file mode 100644 index 00000000000..7a9cf79cd01 --- /dev/null +++ b/quickwit-metastore/test-data/file-backed-index/v0-cefa3f14ad8abc59c64907d07d4ab421.json @@ -0,0 +1,116 @@ +{ + "index": { + "checkpoint": { + "kafka-source": { + "00000000000000000000": "00000000000000000042" + } + }, + "create_timestamp": 1789, + "doc_mapping": { + "field_mappings": [ + { + "fast": true, + "indexed": true, + "name": "tenant_id", + "stored": true, + "type": "u64" + }, + { + "fast": true, + "indexed": true, + "name": "timestamp", + "stored": true, + "type": "i64" + }, + { + "fast": false, + "fieldnorms": false, + "indexed": true, + "name": "log_level", + "record": "basic", + "stored": true, + "tokenizer": "raw", + "type": "text" + }, + { + "fast": false, + "fieldnorms": false, + "indexed": true, + "name": "message", + "record": "position", + "stored": true, + "tokenizer": "default", + "type": "text" + } + ], + "store_source": true, + "tag_fields": [ + "log_level", + "tenant_id" + ] + }, + "index_id": "my-index", + "index_uri": "s3://quickwit-indexes/my-index", + "indexing_settings": { + "commit_timeout_secs": 301, + "demux_enabled": true, + "demux_field": "tenant_id", + "merge_enabled": true, + "merge_policy": { + "demux_factor": 7, + "max_merge_factor": 11, + "merge_factor": 9 + }, + "resources": { + "heap_size": 3, + "num_threads": 3 + }, + "sort_field": "timestamp", + "sort_order": "asc", + "split_num_docs_target": 10000001, + "timestamp_field": "timestamp" + }, + "search_settings": { + "default_search_fields": [ + "message" + ] + }, + "sources": [ + { + "params": { + "client_params": {}, + "topic": "kafka-topic" + }, + "source_id": "kafka-source", + "source_type": "kafka" + } + ], + "update_timestamp": 1789, + "version": "1" + }, + "splits": [ + { + "create_timestamp": 3, + "demux_num_ops": 1, + "footer_offsets": { + "end": 2000, + "start": 1000 + }, + "num_docs": 12303, + "size_in_bytes": 234234, + "split_id": "split", + "split_state": "Published", + "tags": [ + "234", + "aaa" + ], + "time_range": { + "end": 130198, + "start": 121000 + }, + "update_timestamp": 1789, + "version": "1" + } + ], + "version": "0" +} diff --git a/quickwit-metastore/test-data/file-backed-index/v0-f43be15d8611ab4d84b0eaec0607907c.expected.json b/quickwit-metastore/test-data/file-backed-index/v0-f43be15d8611ab4d84b0eaec0607907c.expected.json index 482092b9b5a..7a9cf79cd01 100644 --- a/quickwit-metastore/test-data/file-backed-index/v0-f43be15d8611ab4d84b0eaec0607907c.expected.json +++ b/quickwit-metastore/test-data/file-backed-index/v0-f43be15d8611ab4d84b0eaec0607907c.expected.json @@ -24,6 +24,8 @@ }, { "fast": false, + "fieldnorms": false, + "indexed": true, "name": "log_level", "record": "basic", "stored": true, @@ -32,6 +34,8 @@ }, { "fast": false, + "fieldnorms": false, + "indexed": true, "name": "message", "record": "position", "stored": true, diff --git a/quickwit-metastore/test-data/index-metadata/unversioned.expected.json b/quickwit-metastore/test-data/index-metadata/unversioned.expected.json index f59f3cc6226..8077a35cccb 100644 --- a/quickwit-metastore/test-data/index-metadata/unversioned.expected.json +++ b/quickwit-metastore/test-data/index-metadata/unversioned.expected.json @@ -23,6 +23,8 @@ }, { "fast": false, + "fieldnorms": false, + "indexed": true, "name": "log_level", "record": "basic", "stored": true, @@ -31,6 +33,8 @@ }, { "fast": false, + "fieldnorms": false, + "indexed": true, "name": "message", "record": "position", "stored": true, diff --git a/quickwit-metastore/test-data/index-metadata/v0-082493da53735bc58ec35c1648a4f14c.expected.json b/quickwit-metastore/test-data/index-metadata/v0-082493da53735bc58ec35c1648a4f14c.expected.json index 0ae745b849b..8c60cd5d0a8 100644 --- a/quickwit-metastore/test-data/index-metadata/v0-082493da53735bc58ec35c1648a4f14c.expected.json +++ b/quickwit-metastore/test-data/index-metadata/v0-082493da53735bc58ec35c1648a4f14c.expected.json @@ -23,6 +23,8 @@ }, { "fast": false, + "fieldnorms": false, + "indexed": true, "name": "log_level", "record": "basic", "stored": true, @@ -31,6 +33,8 @@ }, { "fast": false, + "fieldnorms": false, + "indexed": true, "name": "message", "record": "position", "stored": true, diff --git a/quickwit-metastore/test-data/index-metadata/v0-4a463f5078aa6acf314c95936336fa68.expected.json b/quickwit-metastore/test-data/index-metadata/v0-4a463f5078aa6acf314c95936336fa68.expected.json index 7cad484ad51..c848d4fc421 100644 --- a/quickwit-metastore/test-data/index-metadata/v0-4a463f5078aa6acf314c95936336fa68.expected.json +++ b/quickwit-metastore/test-data/index-metadata/v0-4a463f5078aa6acf314c95936336fa68.expected.json @@ -23,6 +23,8 @@ }, { "fast": false, + "fieldnorms": false, + "indexed": true, "name": "log_level", "record": "basic", "stored": true, @@ -31,6 +33,8 @@ }, { "fast": false, + "fieldnorms": false, + "indexed": true, "name": "message", "record": "position", "stored": true, diff --git a/quickwit-metastore/test-data/index-metadata/v0-cb4b352caf8990d0e7a54582a4478cd5.expected.json b/quickwit-metastore/test-data/index-metadata/v0-cb4b352caf8990d0e7a54582a4478cd5.expected.json index 0ae745b849b..8c60cd5d0a8 100644 --- a/quickwit-metastore/test-data/index-metadata/v0-cb4b352caf8990d0e7a54582a4478cd5.expected.json +++ b/quickwit-metastore/test-data/index-metadata/v0-cb4b352caf8990d0e7a54582a4478cd5.expected.json @@ -23,6 +23,8 @@ }, { "fast": false, + "fieldnorms": false, + "indexed": true, "name": "log_level", "record": "basic", "stored": true, @@ -31,6 +33,8 @@ }, { "fast": false, + "fieldnorms": false, + "indexed": true, "name": "message", "record": "position", "stored": true, diff --git a/quickwit-metastore/test-data/index-metadata/v1-9538b1b7051ec08b3b442298cd7ba362.expected.json b/quickwit-metastore/test-data/index-metadata/v1-9538b1b7051ec08b3b442298cd7ba362.expected.json index 45f343b6a31..4570ac086d9 100644 --- a/quickwit-metastore/test-data/index-metadata/v1-9538b1b7051ec08b3b442298cd7ba362.expected.json +++ b/quickwit-metastore/test-data/index-metadata/v1-9538b1b7051ec08b3b442298cd7ba362.expected.json @@ -23,6 +23,8 @@ }, { "fast": false, + "fieldnorms": false, + "indexed": true, "name": "log_level", "record": "basic", "stored": true, @@ -31,6 +33,8 @@ }, { "fast": false, + "fieldnorms": false, + "indexed": true, "name": "message", "record": "position", "stored": true, diff --git a/quickwit-metastore/test-data/index-metadata/v1-b26ff6216df5ed38e0548ac2012570e1.expected.json b/quickwit-metastore/test-data/index-metadata/v1-b26ff6216df5ed38e0548ac2012570e1.expected.json new file mode 100644 index 00000000000..ef99f63c159 --- /dev/null +++ b/quickwit-metastore/test-data/index-metadata/v1-b26ff6216df5ed38e0548ac2012570e1.expected.json @@ -0,0 +1,89 @@ +{ + "checkpoint": { + "kafka-source": { + "00000000000000000000": "00000000000000000042" + } + }, + "create_timestamp": 1789, + "doc_mapping": { + "field_mappings": [ + { + "fast": true, + "indexed": true, + "name": "tenant_id", + "stored": true, + "type": "u64" + }, + { + "fast": true, + "indexed": true, + "name": "timestamp", + "stored": true, + "type": "i64" + }, + { + "fast": false, + "fieldnorms": false, + "indexed": true, + "name": "log_level", + "record": "basic", + "stored": true, + "tokenizer": "raw", + "type": "text" + }, + { + "fast": false, + "fieldnorms": false, + "indexed": true, + "name": "message", + "record": "position", + "stored": true, + "tokenizer": "default", + "type": "text" + } + ], + "store_source": true, + "tag_fields": [ + "log_level", + "tenant_id" + ] + }, + "index_id": "my-index", + "index_uri": "s3://quickwit-indexes/my-index", + "indexing_settings": { + "commit_timeout_secs": 301, + "demux_enabled": true, + "demux_field": "tenant_id", + "merge_enabled": true, + "merge_policy": { + "demux_factor": 7, + "max_merge_factor": 11, + "merge_factor": 9 + }, + "resources": { + "heap_size": 3, + "num_threads": 3 + }, + "sort_field": "timestamp", + "sort_order": "asc", + "split_num_docs_target": 10000001, + "timestamp_field": "timestamp" + }, + "search_settings": { + "default_search_fields": [ + "message" + ] + }, + "sources": [ + { + "params": { + "client_params": {}, + "topic": "kafka-topic" + }, + "source_id": "kafka-source", + "source_type": "kafka" + } + ], + "update_timestamp": 1789, + "version": "1" +} diff --git a/quickwit-metastore/test-data/index-metadata/v1-b26ff6216df5ed38e0548ac2012570e1.json b/quickwit-metastore/test-data/index-metadata/v1-b26ff6216df5ed38e0548ac2012570e1.json new file mode 100644 index 00000000000..ef99f63c159 --- /dev/null +++ b/quickwit-metastore/test-data/index-metadata/v1-b26ff6216df5ed38e0548ac2012570e1.json @@ -0,0 +1,89 @@ +{ + "checkpoint": { + "kafka-source": { + "00000000000000000000": "00000000000000000042" + } + }, + "create_timestamp": 1789, + "doc_mapping": { + "field_mappings": [ + { + "fast": true, + "indexed": true, + "name": "tenant_id", + "stored": true, + "type": "u64" + }, + { + "fast": true, + "indexed": true, + "name": "timestamp", + "stored": true, + "type": "i64" + }, + { + "fast": false, + "fieldnorms": false, + "indexed": true, + "name": "log_level", + "record": "basic", + "stored": true, + "tokenizer": "raw", + "type": "text" + }, + { + "fast": false, + "fieldnorms": false, + "indexed": true, + "name": "message", + "record": "position", + "stored": true, + "tokenizer": "default", + "type": "text" + } + ], + "store_source": true, + "tag_fields": [ + "log_level", + "tenant_id" + ] + }, + "index_id": "my-index", + "index_uri": "s3://quickwit-indexes/my-index", + "indexing_settings": { + "commit_timeout_secs": 301, + "demux_enabled": true, + "demux_field": "tenant_id", + "merge_enabled": true, + "merge_policy": { + "demux_factor": 7, + "max_merge_factor": 11, + "merge_factor": 9 + }, + "resources": { + "heap_size": 3, + "num_threads": 3 + }, + "sort_field": "timestamp", + "sort_order": "asc", + "split_num_docs_target": 10000001, + "timestamp_field": "timestamp" + }, + "search_settings": { + "default_search_fields": [ + "message" + ] + }, + "sources": [ + { + "params": { + "client_params": {}, + "topic": "kafka-topic" + }, + "source_id": "kafka-source", + "source_type": "kafka" + } + ], + "update_timestamp": 1789, + "version": "1" +} diff --git a/quickwit-metastore/test-data/index-metadata/v1-e52f66301963300ef942997308b4f3ac.expected.json b/quickwit-metastore/test-data/index-metadata/v1-e52f66301963300ef942997308b4f3ac.expected.json index 3be56808d76..ef99f63c159 100644 --- a/quickwit-metastore/test-data/index-metadata/v1-e52f66301963300ef942997308b4f3ac.expected.json +++ b/quickwit-metastore/test-data/index-metadata/v1-e52f66301963300ef942997308b4f3ac.expected.json @@ -23,6 +23,8 @@ }, { "fast": false, + "fieldnorms": false, + "indexed": true, "name": "log_level", "record": "basic", "stored": true, @@ -31,6 +33,8 @@ }, { "fast": false, + "fieldnorms": false, + "indexed": true, "name": "message", "record": "position", "stored": true,