Skip to content

Commit

Permalink
[fix](group commit) make group commit cancel in time (apache#36249)
Browse files Browse the repository at this point in the history
## Proposed changes

If the group commit time interval is larger than the load timeout, and there
is no new client load to reuse the internal group commit load, the group
commit cannot be cancelled in time because it is stuck waiting in `wait_for`:
```
#0  0x00007f33937a47aa in pthread_cond_timedwait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1  0x00005651105dbd05 in __gthread_cond_timedwait(pthread_cond_t*, pthread_mutex_t*, timespec const*) ()
#2  0x000056511063f385 in std::__condvar::wait_until(std::mutex&, timespec&) ()
#3  0x000056511063dc2e in std::cv_status std::condition_variable::__wait_until_impl<std::chrono::duration<long, std::ratio<1l, 1000000000l> > >(std::unique_lock<std::mutex>&, std::chrono::time_point<std::chrono::_V2::system_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > > const&) ()
#4  0x000056511063cedf in std::cv_status std::condition_variable::wait_until<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > >(std::unique_lock<std::mutex>&, std::chrono::time_point<std::chrono::_V2::steady_clock, std::chrono::duration<long, std::ratio<1l, 1000000000l> > > const&) ()
#5  0x0000565110824f48 in std::cv_status std::condition_variable::wait_for<long, std::ratio<1l, 1000l> >(std::unique_lock<std::mutex>&, std::chrono::duration<long, std::ratio<1l, 1000l> > const&) ()
#6  0x0000565113b5612a in doris::LoadBlockQueue::get_block(doris::RuntimeState*, doris::vectorized::Block*, bool*, bool*) ()
#7  0x000056513f900941 in doris::pipeline::GroupCommitOperatorX::get_block(doris::RuntimeState*, doris::vectorized::Block*, bool*) ()
#8  0x000056513c69c0b6 in doris::pipeline::ScanOperatorX<doris::pipeline::GroupCommitLocalState>::get_block_after_projects(doris::RuntimeState*, doris::vectorized::Block*, bool*) ()
#9  0x000056514009d5f1 in doris::pipeline::PipelineTask::execute(bool*) ()
#10 0x00005651400fb24a in doris::pipeline::TaskScheduler::_do_work(unsigned long) ()
```
  • Loading branch information
mymeiyi authored Jun 13, 2024
1 parent 9a125d3 commit 975beea
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 1 deletion.
2 changes: 1 addition & 1 deletion be/src/runtime/group_commit_mgr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ Status LoadBlockQueue::get_block(RuntimeState* runtime_state, vectorized::Block*
<< ", runtime_state=" << runtime_state;
}
}
_get_cond.wait_for(l, std::chrono::milliseconds(left_milliseconds));
_get_cond.wait_for(l, std::chrono::milliseconds(std::min(left_milliseconds, 10000L)));
}
if (runtime_state->is_cancelled()) {
auto st = runtime_state->cancel_reason();
Expand Down
55 changes: 55 additions & 0 deletions regression-test/suites/insert_p0/test_group_commit_timeout.groovy
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// Regression test: a group-commit insert must be cancelled promptly on timeout.
//
// The table is created with group_commit_interval_ms = 300000 while the global
// query/insert timeouts are lowered to 5. A sync_mode group-commit insert is
// then expected to fail with a timeout-cancel message, and the whole attempt
// must finish within 60000 ms, i.e. well before the 300000 ms interval elapses.
//
// NOTE(review): the "nonConcurrent" tag is presumably required because the
// suite modifies global session variables - confirm against the test framework.
suite("test_group_commit_timeout", "nonConcurrent") {
def tableName = "test_group_commit_timeout"
// Create the target table with a group commit interval far larger than the
// timeouts configured below.
sql """
CREATE TABLE if not exists ${tableName} (
`id` int(11) NOT NULL,
`name` varchar(100) NULL,
`score` int(11) NULL default "-1"
) ENGINE=OLAP
DUPLICATE KEY(`id`)
DISTRIBUTED BY HASH(`id`) BUCKETS 1
PROPERTIES (
"replication_num" = "1",
"group_commit_interval_ms" = "300000"
);
"""

// Remember the current timeout settings so the finally block can restore them.
def query_timeout = sql """show variables where variable_name = 'query_timeout';"""
def insert_timeout = sql """show variables where variable_name = 'insert_timeout';"""
logger.info("query_timeout: ${query_timeout}, insert_timeout: ${insert_timeout}")

// The timer starts before the SET statements, so the elapsed time checked
// below also includes the time spent changing the variables.
long start = System.currentTimeMillis()
try {
// Lower both timeouts so the insert below times out quickly.
sql "SET global query_timeout = 5"
sql "SET global insert_timeout = 5"

sql "set group_commit = sync_mode"
sql "insert into ${tableName} values(1, 'a', 10)"
// Only reached if the insert did not throw, which this test treats as a failure.
// NOTE(review): presumably the assertion failure is not an Exception subclass
// and therefore bypasses the catch below - confirm.
assertTrue(false)
} catch (Exception e) {
long end = System.currentTimeMillis()
logger.info("failed " + e.getMessage())
// The failure must be the timeout cancellation, not some unrelated error.
assertTrue(e.getMessage().contains("FragmentMgr cancel worker going to cancel timeout instance"))
// The cancellation must take effect within 60000 ms of the start timestamp.
assertTrue(end - start <= 60000)
} finally {
// Restore the original global timeouts; [0][1] reads the second column of the
// first row returned by `show variables` (presumably the variable's value).
sql "SET global query_timeout = ${query_timeout[0][1]}"
sql "SET global insert_timeout = ${insert_timeout[0][1]}"
}
}

0 comments on commit 975beea

Please sign in to comment.