From c770d797ca63cceb80234d734a3c8cb61f2d70af Mon Sep 17 00:00:00 2001 From: Amitanand Aiyer Date: Sat, 31 Aug 2024 14:31:50 +0800 Subject: [PATCH] [#23747] MetaCache: Callback should not be called while holding the lock Summary: Call callback in ScopeExit block only. Not while holding the lock. Without this fix, it is possible that a thread can get into a deadlock, trying to request a shared_lock on a mutex, while already holding an exclusive lock on the same mutex: This deadlock can be triggered if there are active read/write requests to a Table (from more than 1 thread) right after the table had a tablet-split. If there is only 1 thread, it is unlikely to run into the deadlock, as the thread notices -- as part of the callback -- that the table's partition info is stale. Having a different thread refresh the partition version before the main thread checks if the table version is stale, is likely necessary to trigger the stack trace seen below. e.g: ``` #0 pthread_cond_wait@@GLIBC_2.3.2 () at ../sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185 #1 0x00005640c3eb441b in std::__1::shared_timed_mutex::lock_shared() () #2 0x00005640c3ffcbff in yb::client::internal::MetaCache::LookupTabletByKey(std::__1::shared_ptr const&, std::__1::basic_string, std::__1::allocator > const&, std::__1::chrono::time_point > >, std::__1::function > const&)>, yb::StronglyTypedBool) () #3 0x00005640c3f7549a in yb::client::internal::Batcher::LookupTabletFor(yb::client::internal::InFlightOp*) () #4 0x00005640c401855e in yb::client::(anonymous namespace)::FlushBatcherAsync(std::__1::shared_ptr const&, boost::function, yb::client::YBSession::BatcherConfig, yb::StronglyTypedBool) () #5 0x00005640c401aa76 in yb::client::(anonymous namespace)::BatcherFlushDone(std::__1::shared_ptr const&, yb::Status const&, boost::function, yb::client::YBSession::BatcherConfig) () #6 0x00005640c401b371 in boost::detail::function::void_function_obj_invoker1 const&, yb::Status const&, boost::function, yb::client::YBSession::BatcherConfig), std::__1::shared_ptr const&, std::__1::placeholders::__ph<1> const&, boost::function, yb::client::YBSession::BatcherConfig&>, void, yb::Status const&>::invoke(boost::detail::function::function_buffer&, yb::Status const&) () #7 0x00005640c3f70398 in yb::client::internal::Batcher::Run() () #8 0x00005640c3f72656 in yb::client::internal::Batcher::FlushFinished() () #9 0x00005640c3f74a4d in yb::client::internal::Batcher::TabletLookupFinished(yb::client::internal::InFlightOp*, yb::Result >) () #10 0x00005640c3f759bc in std::__1::__function::__func, void (yb::Result > const&)>::operator()(yb::Result > const&) () #11 0x00005640c3fff05d in yb::client::internal::MetaCache::LookupTabletByKey(std::__1::shared_ptr const&, std::__1::basic_string, std::__1::allocator > const&, std::__1::chrono::time_point > >, std::__1::function > const&)>, yb::StronglyTypedBool) () ** Is holding an exclusive lock in MetaCache::LookupTabletByKey/DoLookupTabletByKey ** #12 0x00005640c3f7549a in yb::client::internal::Batcher::LookupTabletFor(yb::client::internal::InFlightOp*) () #13 0x00005640c401855e in yb::client::(anonymous namespace)::FlushBatcherAsync(std::__1::shared_ptr const&, boost::function, yb::client::YBSession::BatcherConfig, yb::StronglyTypedBool) () #14 0x00005640c4017130 in yb::client::YBSession::FlushAsync(boost::function) () #15 0x00005640c5225a0c in yb::tserver::PgClientServiceImpl::Perform(yb::tserver::PgPerformRequestPB const*, yb::tserver::PgPerformResponsePB*, yb::rpc::RpcContext) () #16 0x00005640c51c4487 in std::__1::__function::__func const&)::$_20, std::__1::allocator const&)::$_20>, void (std::__1::shared_ptr)>::operator()(std::__1::shared_ptr&&) () #17 0x00005640c51d374f in yb::tserver::PgClientServiceIf::Handle(std::__1::shared_ptr) () #18 0x00005640c4f5f420 in yb::rpc::ServicePoolImpl::Handle(std::__1::shared_ptr) () #19 0x00005640c4e845af in yb::rpc::InboundCall::InboundCallTask::Run() () #20 0x00005640c4f6e243 in yb::rpc::(anonymous namespace)::Worker::Execute() () #21 0x00005640c570ecb4 in yb::Thread::SuperviseThread(void*) () #22 0x00007f808b7c6694 in start_thread (arg=0x7f76d8caf700) at pthread_create.c:333 #23 0x00007f808bac341d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:109 ``` Jira: DB-12651 Test Plan: Jenkins yb_build.sh --cxx-test ql-stress-test QLStressTest.ReproMetaCacheDeadlock Reviewers: rthallam, hsunder, qhu, timur Reviewed By: hsunder Subscribers: svc_phabricator, ybase Differential Revision: https://phorge.dev.yugabyte.com/D37706 --- src/yb/client/meta_cache.cc | 24 +++++++++++++++++++----- src/yb/client/ql-stress-test.cc | 8 ++++++++ 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/src/yb/client/meta_cache.cc b/src/yb/client/meta_cache.cc index d6389807a1b9..b00e899ad3c0 100644 --- a/src/yb/client/meta_cache.cc +++ b/src/yb/client/meta_cache.cc @@ -112,6 +112,8 @@ DEFINE_test_flag(bool, force_master_lookup_all_tablets, false, "If set, force the client to go to the master for all tablet lookup " "instead of reading from cache."); +DEFINE_test_flag(int32, sleep_before_metacache_lookup_ms, 0, + "If set, will sleep in LookupTabletByKey for a random amount up to this value."); DEFINE_test_flag(double, simulate_lookup_timeout_probability, 0, "If set, mark an RPC as failed and force retry on the first attempt."); DEFINE_test_flag(double, simulate_lookup_partition_list_mismatch_probability, 0, @@ -2058,9 +2060,12 @@ bool MetaCache::DoLookupTabletByKey( LookupTabletCallback* callback, PartitionGroupStartKeyPtr* partition_group_start) { DCHECK_ONLY_NOTNULL(partition_group_start); RemoteTabletPtr tablet; - auto scope_exit = ScopeExit([callback, &tablet] { + Status status = Status::OK(); + auto scope_exit = ScopeExit([callback, &tablet, &status] { if (tablet) { (*callback)(tablet); + } else if (!status.ok()) { + (*callback)(status); } }); int64_t request_no; @@ -2087,13 +2092,13 @@ bool MetaCache::DoLookupTabletByKey( (PREDICT_FALSE(RandomActWithProbability( FLAGS_TEST_simulate_lookup_partition_list_mismatch_probability)) && table->table_type() != YBTableType::TRANSACTION_STATUS_TABLE_TYPE)) { - (*callback)(STATUS( + status = STATUS( TryAgain, Format( "MetaCache's table $0 partitions version does not match, cached: $1, got: $2, " "refresh required", table->ToString(), table_data->partition_list->version, partitions->version), - ClientError(ClientErrorCode::kTablePartitionListIsStale))); + ClientError(ClientErrorCode::kTablePartitionListIsStale)); return true; } @@ -2200,6 +2205,12 @@ void MetaCache::LookupTabletByKey(const std::shared_ptr& table, return; } + if (FLAGS_TEST_sleep_before_metacache_lookup_ms > 0) { + MonoDelta sleep_time = MonoDelta::FromMilliseconds(1) * + RandomUniformInt(1, FLAGS_TEST_sleep_before_metacache_lookup_ms); + SleepFor(sleep_time); + VLOG_WITH_FUNC(2) << "Slept for " << sleep_time; + } if (table->ArePartitionsStale()) { RefreshTablePartitions( table, @@ -2293,9 +2304,12 @@ bool MetaCache::DoLookupTabletById( UseCache use_cache, LookupTabletCallback* callback) { std::optional tablet = std::nullopt; - auto scope_exit = ScopeExit([callback, &tablet] { + Status status = Status::OK(); + auto scope_exit = ScopeExit([callback, &tablet, &status] { if (tablet) { (*callback)(*tablet); + } else if (!status.ok()) { + (*callback)(status); } }); int64_t request_no; @@ -2311,7 +2325,7 @@ bool MetaCache::DoLookupTabletById( if (use_cache) { if (!include_deleted) { tablet = std::nullopt; - (*callback)(STATUS(NotFound, "Tablet deleted")); + status = STATUS(NotFound, "Tablet deleted"); } return true; } diff --git a/src/yb/client/ql-stress-test.cc b/src/yb/client/ql-stress-test.cc index 399eedc71b01..c0d24c6d5fee 100644 --- a/src/yb/client/ql-stress-test.cc +++ b/src/yb/client/ql-stress-test.cc @@ -74,8 +74,10 @@ DECLARE_bool(detect_duplicates_for_retryable_requests); DECLARE_bool(enable_ondisk_compression); DECLARE_bool(ycql_enable_packed_row); DECLARE_double(TEST_respond_write_failed_probability); +DECLARE_double(TEST_simulate_lookup_partition_list_mismatch_probability); DECLARE_double(transaction_max_missed_heartbeat_periods); DECLARE_int32(TEST_max_write_waiters); +DECLARE_int32(TEST_sleep_before_metacache_lookup_ms); DECLARE_int32(client_read_write_timeout_ms); DECLARE_int32(log_cache_size_limit_mb); DECLARE_int32(log_min_seconds_to_retain); @@ -438,6 +440,12 @@ TEST_F(QLStressTest, RetryWritesWithRestarts) { TestRetryWrites(true /* restarts */); } +TEST_F(QLStressTest, ReproMetaCacheDeadlock) { + ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_simulate_lookup_partition_list_mismatch_probability) = 0.8; + ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_sleep_before_metacache_lookup_ms) = 50; + TestRetryWrites(true /* restarts */); +} + void SetTransactional(YBSchemaBuilder* builder) { TableProperties table_properties; table_properties.SetTransactional(true);