Skip to content

Commit

Permalink
[opt](inverted index) performance optimization for need_read_data in …
Browse files Browse the repository at this point in the history
…compound #35346 #36292 (#36404)

pick from master
#35346
#36292
  • Loading branch information
zzzxl1993 authored Jun 20, 2024
1 parent 0be5331 commit dabd27e
Show file tree
Hide file tree
Showing 5 changed files with 254 additions and 13 deletions.
54 changes: 41 additions & 13 deletions be/src/olap/rowset/segment_v2/segment_iterator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,34 @@ Status SegmentIterator::_init_impl(const StorageReadOptions& opts) {
_storage_name_and_type[i] = std::make_pair(field_name, storage_type);
}
}

// find columns that definitely require reading data, such as functions that are not pushed down.
{
std::set<std::string> push_down_preds;
for (auto* pred : _col_predicates) {
if (!_check_apply_by_inverted_index(pred)) {
continue;
}
push_down_preds.insert(_gen_predicate_result_sign(pred));
}
for (auto* pred : _col_preds_except_leafnode_of_andnode) {
if (!_check_apply_by_inverted_index(pred)) {
continue;
}
push_down_preds.insert(_gen_predicate_result_sign(pred));
}
for (auto& preds_in_remaining_vconjuct : _column_pred_in_remaining_vconjunct) {
const auto& column_name = preds_in_remaining_vconjuct.first;
for (auto& pred_info : preds_in_remaining_vconjuct.second) {
auto column_sign = _gen_predicate_result_sign(&pred_info);
if (!push_down_preds.contains(column_sign)) {
auto cid = _opts.tablet_schema->field_index(column_name);
_need_read_data_indices[cid] = true;
}
}
}
}

return Status::OK();
}

Expand Down Expand Up @@ -891,6 +919,7 @@ Status SegmentIterator::_apply_inverted_index_except_leafnode_of_andnode(

Status SegmentIterator::_apply_index_except_leafnode_of_andnode() {
for (auto* pred : _col_preds_except_leafnode_of_andnode) {
auto column_id = pred->column_id();
auto pred_type = pred->type();
bool is_support = pred_type == PredicateType::EQ || pred_type == PredicateType::NE ||
pred_type == PredicateType::LT || pred_type == PredicateType::LE ||
Expand All @@ -899,6 +928,7 @@ Status SegmentIterator::_apply_index_except_leafnode_of_andnode() {
pred_type == PredicateType::IN_LIST ||
pred_type == PredicateType::NOT_IN_LIST;
if (!is_support) {
_need_read_data_indices[column_id] = true;
continue;
}

Expand All @@ -908,16 +938,17 @@ Status SegmentIterator::_apply_index_except_leafnode_of_andnode() {
if (can_apply_by_inverted_index) {
res = _apply_inverted_index_except_leafnode_of_andnode(pred, &bitmap);
} else {
_need_read_data_indices[column_id] = true;
continue;
}

bool need_remaining_after_evaluate = _column_has_fulltext_index(pred->column_id()) &&
bool need_remaining_after_evaluate = _column_has_fulltext_index(column_id) &&
PredicateTypeTraits::is_equal_or_list(pred_type);
if (!res.ok()) {
if (_downgrade_without_index(res, need_remaining_after_evaluate)) {
// downgrade without index query
_not_apply_index_pred.insert(pred->column_id());
_need_read_data_indices[pred->column_id()] = true;
_not_apply_index_pred.insert(column_id);
_need_read_data_indices[column_id] = true;
continue;
}
LOG(WARNING) << "failed to evaluate index"
Expand All @@ -928,17 +959,10 @@ Status SegmentIterator::_apply_index_except_leafnode_of_andnode() {

std::string pred_result_sign = _gen_predicate_result_sign(pred);
_rowid_result_for_index.emplace(pred_result_sign, std::make_pair(true, std::move(bitmap)));
}

for (auto* pred : _col_preds_except_leafnode_of_andnode) {
auto column_name = _schema->column(pred->column_id())->name();
if (!_remaining_conjunct_roots.empty() &&
_check_column_pred_all_push_down(column_name, true,
pred->type() == PredicateType::MATCH) &&
!pred->predicate_params()->marked_by_runtime_filter) {
// if column's need_read_data already set true, we can not set it to false now.
if (_need_read_data_indices.find(pred->column_id()) == _need_read_data_indices.end()) {
_need_read_data_indices[pred->column_id()] = false;
if (!pred->predicate_params()->marked_by_runtime_filter) {
if (!_need_read_data_indices.contains(column_id)) {
_need_read_data_indices[column_id] = false;
}
}
}
Expand Down Expand Up @@ -1928,6 +1952,10 @@ Status SegmentIterator::_read_columns_by_index(uint32_t nrows_read_limit, uint32
continue;
}

DBUG_EXECUTE_IF("segment_iterator._read_columns_by_index", {
return Status::Error<ErrorCode::INTERNAL_ERROR>("{} does not need to read data");
})

if (is_continuous) {
size_t rows_read = nrows_read;
_opts.stats->block_first_read_seek_num += 1;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !sql --
863

-- !sql --
210

-- !sql --
0

-- !sql --
819

-- !sql --
199

-- !sql --
713

-- !sql --
18

15 changes: 15 additions & 0 deletions regression-test/data/inverted_index_p0/test_need_read_data.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !sql --
8 \N
4 -10
13 -4
2 1
3 2
3 3
5 4
5 5
1 6
1 7
4 8
1 9

Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// Regression suite for the inverted-index "need_read_data" optimization.
//
// It loads an HTTP-log data set into a table with two inverted indexes, enables the
// BE debug point "segment_iterator._read_columns_by_index", and then runs count()
// queries whose predicates are MATCH / MATCH_PHRASE conditions on the indexed columns.
// While that debug point is enabled, the column-read path it guards returns an
// internal error, so the queries can only succeed when the predicate columns do not
// have to be read from the data pages.
//
// NOTE(review): the suite is tagged "nonConcurrent", presumably because the debug
// point is switched on for all BEs and would affect other suites - confirm.
suite("test_need_read_data_fault_injection", "nonConcurrent") {
    // Table under test; it is dropped and recreated on every run.
    def indexTbName = "test_need_read_data_fault_injection"

    sql "DROP TABLE IF EXISTS ${indexTbName}"
    sql """
CREATE TABLE ${indexTbName} (
`@timestamp` int(11) NULL COMMENT "",
`clientip` varchar(20) NULL COMMENT "",
`request` text NULL COMMENT "",
`status` int(11) NULL COMMENT "",
`size` int(11) NULL COMMENT "",
INDEX clientip_idx (`clientip`) USING INVERTED PROPERTIES("parser" = "english", "support_phrase" = "true") COMMENT '',
INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = "english", "support_phrase" = "true") COMMENT ''
) ENGINE=OLAP
DUPLICATE KEY(`@timestamp`)
COMMENT "OLAP"
DISTRIBUTED BY RANDOM BUCKETS 1
PROPERTIES (
"replication_allocation" = "tag.location.default: 1",
"disable_auto_compaction" = "true"
);
"""

    // Stream-loads `file_name` into `table_name`.
    //   label              - label prefix; a random UUID is appended to keep it unique
    //   read_flag          - value of the 'read_json_by_line' header
    //   format_flag        - value of the 'format' header
    //   ignore_failure     - with expected_succ_rows < 0, skip every check on the result
    //   expected_succ_rows - when >= 0, tolerate filtered rows and require exactly this
    //                        many loaded rows; when < 0, require a successful load
    //   load_to_single_tablet - accepted for signature compatibility; not used here
    def load_httplogs_data = {table_name, label, read_flag, format_flag, file_name, ignore_failure=false,
                              expected_succ_rows = -1, load_to_single_tablet = 'true' ->

        // load the json data
        streamLoad {
            table "${table_name}"

            // set http request header params
            set 'label', label + "_" + UUID.randomUUID().toString()
            set 'read_json_by_line', read_flag
            set 'format', format_flag
            file file_name // import json file
            time 10000 // limit inflight 10s
            if (expected_succ_rows >= 0) {
                // Rows may be filtered; only the loaded-row count is checked below.
                set 'max_filter_ratio', '1'
            }

            // Declaring a check callback disables the framework's default checks,
            // so every condition we care about has to be verified here.
            check { result, exception, startTime, endTime ->
                if (ignore_failure && expected_succ_rows < 0) { return }
                if (exception != null) {
                    throw exception
                }
                log.info("Stream load result: ${result}".toString())
                def json = parseJson(result)
                // Fail fast on a bad load instead of letting it surface later as a
                // confusing row-count mismatch in the queries.
                if (expected_succ_rows >= 0) {
                    assertEquals(expected_succ_rows as long, json.NumberLoadedRows as long)
                } else {
                    assertEquals("success", json.Status.toLowerCase())
                }
            }
        }
    }

    load_httplogs_data.call(indexTbName, 'test_need_read_data_fault_injection', 'true', 'json', 'documents-1000.json')

    sql "sync"

    try {
        GetDebugPoint().enableDebugPointForAllBEs("segment_iterator._read_columns_by_index")

        // Compound AND/OR combinations of MATCH_PHRASE on a single indexed column.
        qt_sql """ select count() from ${indexTbName} where (request match_phrase 'hm' or request match_phrase 'jpg' or request match_phrase 'gif'); """
        qt_sql """ select count() from ${indexTbName} where (request match_phrase 'hm' or request match_phrase 'jpg' and request match_phrase 'gif'); """
        qt_sql """ select count() from ${indexTbName} where (request match_phrase 'hm' and request match_phrase 'jpg' and request match_phrase 'gif'); """
        qt_sql """ select count() from ${indexTbName} where (request match_phrase 'hm' and request match_phrase 'jpg' or request match_phrase 'gif'); """

        // OR combinations of MATCH across the two indexed columns.
        qt_sql """ select count() from ${indexTbName} where (clientip match '1' or request match 'jpg' or clientip match '2'); """
        qt_sql """ select count() from ${indexTbName} where (clientip match '3' or request match 'gif' or clientip match '4'); """
        qt_sql """ select count() from ${indexTbName} where (clientip match 'images' or clientip match '5' or clientip match 'english'); """

    } finally {
        // Always switch the debug point off again, even if a query failed.
        GetDebugPoint().disableDebugPointForAllBEs("segment_iterator._read_columns_by_index")
    }
}
Loading

0 comments on commit dabd27e

Please sign in to comment.