Skip to content

Commit d42fd68

Browse files
authored
[opt](invert index) Empty strings are not written to the index in the case of TOKENIZED (#28822)
1 parent b49671b commit d42fd68

File tree

3 files changed

+68
-15
lines changed

3 files changed

+68
-15
lines changed

be/src/olap/rowset/segment_v2/inverted_index_writer.cpp

+8-15
Original file line numberDiff line numberDiff line change
@@ -258,7 +258,6 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter {
258258
}
259259

260260
for (int i = 0; i < count; ++i) {
261-
new_fulltext_field(empty_value.c_str(), 0);
262261
RETURN_IF_ERROR(add_null_document());
263262
}
264263
}
@@ -305,13 +304,10 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter {
305304
get_parser_ignore_above_value_from_properties(_index_meta->properties());
306305
auto ignore_above = std::stoi(ignore_above_value);
307306
for (int i = 0; i < count; ++i) {
308-
// only ignore_above UNTOKENIZED strings
309-
if (_parser_type == InvertedIndexParserType::PARSER_NONE &&
310-
v->get_size() > ignore_above) {
311-
VLOG_DEBUG << "fulltext index value length can be at most "
312-
<< ignore_above_value << ", but got "
313-
<< "value length:" << v->get_size() << ", ignore this value";
314-
new_fulltext_field(empty_value.c_str(), 0);
307+
// add a null document for UNTOKENIZED strings longer than ignore_above, and for empty strings when a parser (tokenizer) is configured
308+
if ((_parser_type == InvertedIndexParserType::PARSER_NONE &&
309+
v->get_size() > ignore_above) ||
310+
(_parser_type != InvertedIndexParserType::PARSER_NONE && v->empty())) {
315311
RETURN_IF_ERROR(add_null_document());
316312
} else {
317313
new_fulltext_field(v->get_data(), v->get_size());
@@ -358,13 +354,10 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter {
358354
}
359355

360356
auto value = join(strings, " ");
361-
// only ignore_above UNTOKENIZED strings
362-
if (_parser_type == InvertedIndexParserType::PARSER_NONE &&
363-
value.length() > ignore_above) {
364-
VLOG_DEBUG << "fulltext index value length can be at most "
365-
<< ignore_above_value << ", but got "
366-
<< "value length:" << value.length() << ", ignore this value";
367-
new_fulltext_field(empty_value.c_str(), 0);
357+
// add a null document for UNTOKENIZED strings longer than ignore_above, and for empty strings when a parser (tokenizer) is configured
358+
if ((_parser_type == InvertedIndexParserType::PARSER_NONE &&
359+
value.length() > ignore_above) ||
360+
(_parser_type != InvertedIndexParserType::PARSER_NONE && value.empty())) {
368361
RETURN_IF_ERROR(add_null_document());
369362
} else {
370363
new_fulltext_field(value.c_str(), value.length());
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
-- This file is automatically generated. You should know what you did if you want to edit this
2+
-- !sql --
3+
1
4+
5+
-- !sql --
6+
0
7+
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
19+
suite("test_index_empty_string", "p0"){
    // Regression case for matching empty strings against inverted indexes.
    // Column `a` is indexed without a parser property; column `b` is indexed
    // with the "english" parser. Each column receives exactly one empty string.
    def indexTblName = "test_index_empty_string"

    // Drop any leftover table so the row counts below only reflect this run.
    sql "DROP TABLE IF EXISTS ${indexTblName}"
    // create 1 replica table
    sql """
        CREATE TABLE IF NOT EXISTS ${indexTblName}(
            `id` int(11) NOT NULL,
            `a` text NULL DEFAULT "",
            `b` text NULL DEFAULT "",
            INDEX a_idx(`a`) USING INVERTED COMMENT '',
            INDEX b_idx(`b`) USING INVERTED PROPERTIES("parser" = "english") COMMENT ''
        ) ENGINE=OLAP
        DUPLICATE KEY(`id`)
        COMMENT 'OLAP'
        DISTRIBUTED BY HASH(`id`) BUCKETS 1
        PROPERTIES(
            "replication_allocation" = "tag.location.default: 1"
        );
    """

    // Row 1 has an empty `a`; row 2 has an empty `b`.
    sql """
        INSERT INTO ${indexTblName} VALUES
        (1, '', '1'),
        (2, '2', '');
    """

    // Count the rows whose column matches the empty string, first on the
    // parser-less index, then on the "english"-parsed index. The expected
    // output recorded alongside this suite is 1 and 0, in that order.
    qt_sql "SELECT count() FROM ${indexTblName} WHERE a match '';"
    qt_sql "SELECT count() FROM ${indexTblName} WHERE b match '';"
}

0 commit comments

Comments
 (0)