diff --git a/src/yb/client/ql-tablet-test.cc b/src/yb/client/ql-tablet-test.cc index 593e776a4dbb..d0081dbeee9e 100644 --- a/src/yb/client/ql-tablet-test.cc +++ b/src/yb/client/ql-tablet-test.cc @@ -78,6 +78,7 @@ #include "yb/util/shared_lock.h" #include "yb/util/status_format.h" #include "yb/util/stopwatch.h" +#include "yb/util/test_thread_holder.h" #include "yb/util/tsan_util.h" #include "yb/yql/cql/ql/util/statement_result.h" @@ -1940,33 +1941,44 @@ namespace { Status CalcKeysDistributionAcrossWorkers( tablet::Tablet* tablet, std::vector range_end_keys, const size_t num_workers, - const double min_max_keys_ratio_limit) { + const double min_max_keys_ratio_limit, tablet::Direction direction) { std::vector keys_per_worker(num_workers); auto iter = CreateRocksDBIterator( tablet->doc_db().regular, tablet->doc_db().key_bounds, docdb::BloomFilterMode::DONT_USE_BLOOM_FILTER, boost::none, rocksdb::kDefaultQueryId); - iter.SeekToFirst(); + if (direction == tablet::Direction::kForward) { + iter.SeekToFirst(); + } else { + iter.SeekToLast(); + } std::string current_range_start_key; + auto range_key_iter = range_end_keys.begin(); size_t current_range_idx = 0; size_t entries_in_current_range = 0; - while (iter.Valid() || current_range_idx < range_end_keys.size()) { - while (current_range_idx < range_end_keys.size() && - (!iter.Valid() || (!range_end_keys[current_range_idx].empty() && - iter.key() >= range_end_keys[current_range_idx]))) { + while (iter.Valid() || range_key_iter != range_end_keys.end()) { + while (range_key_iter != range_end_keys.end() && + (!iter.Valid() || (!range_key_iter->empty() && (direction == tablet::Direction::kForward + ? iter.key() >= *range_key_iter + : iter.key() <= *range_key_iter)))) { LOG(INFO) << "Range #" << current_range_idx << "[" << Slice(current_range_start_key).ToDebugHexString() << ", " - << Slice(range_end_keys[current_range_idx]).ToDebugHexString() << ") has " - << entries_in_current_range << " rocksdb records"; + << Slice(*range_key_iter).ToDebugHexString() << ") has " << entries_in_current_range + << " rocksdb records"; keys_per_worker[current_range_idx % num_workers] += entries_in_current_range; - current_range_start_key = range_end_keys[current_range_idx]; + current_range_start_key = *range_key_iter; + ++range_key_iter; ++current_range_idx; entries_in_current_range = 0; } if (iter.Valid()) { ++entries_in_current_range; - iter.Next(); + if (direction == tablet::Direction::kForward) { + iter.Next(); + } else { + iter.Prev(); + } } } RETURN_NOT_OK(iter.status()); @@ -1989,27 +2001,62 @@ Status CalcKeysDistributionAcrossWorkers( } // namespace TEST_P(QLTabletRf1TestToggleEnablePackedRow, GetTabletKeyRanges) { + constexpr auto kValueSize = 16; + constexpr auto kWriteBatchSize = 128; + constexpr auto kNumWriteThreads = 2; + constexpr auto kNumSstFiles = 4; constexpr auto kNumFlushes = 15; + constexpr auto kNumWorkers = 5; constexpr auto kMinMaxKeysRatioLimit = 0.75; FLAGS_db_block_size_bytes = 4_KB; FLAGS_db_write_buffer_size = 200_KB; - TestWorkload workload(cluster_.get()); - workload.set_table_name(kTable1Name); - workload.set_write_timeout_millis(30000); - workload.set_num_tablets(1); - workload.set_num_write_threads(2); - workload.set_write_batch_size(1); - workload.set_payload_bytes(16); - workload.Setup(); + TableHandle table; + + { + YBSchemaBuilder builder; + // TODO(get_table_key_ranges): also test for hash-based sharding as soon as it is properly + // supported by GetTabletKeyRanges. + builder.AddColumn(kKeyColumn)->Type(DataType::INT32)->PrimaryKey()->NotNull(); + builder.AddColumn(kValueColumn)->Type(DataType::STRING); + ASSERT_OK(table.Create(kTable1Name, /* num_tablets = */ 1, client_.get(), &builder)); + table.NewInsertOp(); + } + + std::atomic num_rows_inserted{0}; + TestThreadHolder write_threads; LOG(INFO) << "Starting workload ..."; Stopwatch s(Stopwatch::ALL_THREADS); s.start(); - workload.Start(); + + for (auto t = 0; t < kNumWriteThreads; ++t) { + write_threads.AddThreadFunctor( + [this, &stop_requested = write_threads.stop_flag(), &table, &num_rows_inserted] { + auto session = CreateSession(); + size_t num_ops_applied = 0; + + while (!stop_requested) { + auto insert = table.NewInsertOp(); + auto req = insert->mutable_request(); + const auto key = RandomUniformInt(); + + QLAddInt32RangeValue(req, key); + table.AddStringColumnValue(req, kValueColumn, RandomString(kValueSize)); + session->Apply(insert); + if (++num_ops_applied == kWriteBatchSize) { + const auto flush_status = session->TEST_FlushAndGetOpsErrors(); + ASSERT_OK(flush_status.status); + ASSERT_EQ(flush_status.errors.size(), 0); + num_rows_inserted.fetch_add(num_ops_applied); + num_ops_applied = 0; + } + } + }); + } const auto peers = ListTabletPeers(cluster_.get(), ListPeersFilter::kLeaders); ASSERT_EQ(peers.size(), 1); @@ -2023,91 +2070,104 @@ TEST_P(QLTabletRf1TestToggleEnablePackedRow, GetTabletKeyRanges) { std::this_thread::sleep_for(100ms); } - workload.StopAndJoin(); + write_threads.Stop(); s.stop(); LOG(INFO) << "Workload stopped, it took: " << AsString(s.elapsed()); - LOG(INFO) << "Rows inserted: " << workload.rows_inserted(); + LOG(INFO) << "Rows inserted: " << num_rows_inserted; LOG(INFO) << "Number of SST files: " << db->GetCurrentVersionNumSSTFiles(); LOG(INFO) << "SST data size: " << db->GetCurrentVersionSstFilesUncompressedSize(); + NO_PENDING_FATALS(); + const auto range_size_bytes = db->GetCurrentVersionSstFilesUncompressedSize() / 50; - for (uint32_t max_key_length : {1024, 16, 8, 4, 2}) { - LOG(INFO) << "max_key_length: " << max_key_length; + for (auto direction : {tablet::Direction::kForward, tablet::Direction::kBackward}) { + for (uint32_t max_key_length : {1024, 16, 8, 4, 2}) { + LOG(INFO) << "max_key_length: " << max_key_length << " direction: " << AsString(direction); - std::vector range_end_keys; - auto add_range_end_key = [&range_end_keys](Slice key) { - LOG(INFO) << "Got range end key: " << key.ToDebugHexString(); - range_end_keys.push_back(key.ToBuffer()); - }; + // List of range end/start keys depending on whether is_forward is true/false. + std::vector range_boundary_keys; + auto add_range_boundary_key = [&range_boundary_keys](Slice key) { + LOG(INFO) << "Got range boundary key: " << key.ToDebugHexString(); + range_boundary_keys.push_back(key.ToBuffer()); + }; - ASSERT_OK(tablet->TEST_GetTabletKeyRanges( - Slice(), Slice(), std::numeric_limits::max(), range_size_bytes, - tablet::IsForward::kTrue, max_key_length, add_range_end_key)); + ASSERT_OK(tablet->TEST_GetTabletKeyRanges( + Slice(), Slice(), std::numeric_limits::max(), range_size_bytes, + direction, max_key_length, add_range_boundary_key)); - LOG(INFO) << "Ranges count: " << range_end_keys.size(); + LOG(INFO) << "Ranges count: " << range_boundary_keys.size(); - // We should have at least 1 range. - ASSERT_GE(range_end_keys.size(), 1); + // We should have at least 1 range. + ASSERT_GE(range_boundary_keys.size(), 1); + + ASSERT_EQ(range_boundary_keys.back(), ""); + for (size_t i = 0; i + 2 < range_boundary_keys.size(); ++i) { + switch (direction) { + case tablet::Direction::kForward: + ASSERT_LT(range_boundary_keys[i], range_boundary_keys[i + 1]); + continue; + case tablet::Direction::kBackward: + ASSERT_GT(range_boundary_keys[i], range_boundary_keys[i + 1]); + continue; + } + FATAL_INVALID_ENUM_VALUE(tablet::Direction, direction); + } - ASSERT_EQ(range_end_keys.back(), ""); - for (size_t i = 0; i + 2 < range_end_keys.size(); ++i) { - ASSERT_LT(range_end_keys[i], range_end_keys[i + 1]); - } + // TODO(get_table_key_ranges): For now we are returning full DocKeys and skipping ones that + // are longer than max_key_length, because truncated DocKeys are not supported as lower/upper + // bounds for scans. So, if DocKeys are longer than max_key_length we can have very uneven + // distribution across ranges/workers. + const auto min_max_keys_ratio_limit = max_key_length > 8 ? kMinMaxKeysRatioLimit : 0; - // TODO(get_table_key_ranges): For now we are returning full DocKeys and skipping ones that - // are longer than max_key_length, because truncated DocKeys are not supported as lower/upper - // bounds for scans. So, if DocKeys are longer than max_key_length we can have very uneven - // distribution across ranges/workers. - const auto min_max_keys_ratio_limit = max_key_length > 8 ? kMinMaxKeysRatioLimit : 0; + ASSERT_OK(CalcKeysDistributionAcrossWorkers( + tablet.get(), range_boundary_keys, kNumWorkers, min_max_keys_ratio_limit, direction)); - ASSERT_OK(CalcKeysDistributionAcrossWorkers( - tablet.get(), range_end_keys, kNumWorkers, min_max_keys_ratio_limit)); + // Get ranges in multiple batches, verify it covers the whole space. + constexpr auto kNumRangesPerBatch = 5; + std::string start_key = ""; - // Get ranges in multiple batches, verify it covers the whole space. - constexpr auto kNumRangesPerBatch = 5; - std::string lower_bound = ""; + std::vector range_boundary_keys_from_batches; + for (;;) { + std::vector range_boundary_keys_batch; + LOG(INFO) << "Getting tablet key ranges starting from: " + << Slice(start_key).ToDebugHexString() << " direction: " << AsString(direction); - std::vector range_end_keys_from_batches; - for (;;) { - std::vector range_end_keys_batch; - LOG(INFO) << "Getting tablet key ranges starting from: " - << Slice(lower_bound).ToDebugHexString(); + ASSERT_OK(tablet->TEST_GetTabletKeyRanges( + direction == tablet::Direction::kForward ? start_key : Slice(), + direction == tablet::Direction::kForward ? Slice() : start_key, kNumRangesPerBatch, + range_size_bytes, direction, max_key_length, [&range_boundary_keys_batch](Slice key) { + LOG(INFO) << "Got range boundary key: " << key.ToDebugHexString(); + range_boundary_keys_batch.push_back(key.ToBuffer()); + })); - ASSERT_OK(tablet->TEST_GetTabletKeyRanges( - lower_bound, Slice(), kNumRangesPerBatch, range_size_bytes, tablet::IsForward::kTrue, - max_key_length, [&range_end_keys_batch](Slice key) { - LOG(INFO) << "Got range end key: " << key.ToDebugHexString(); - range_end_keys_batch.push_back(key.ToBuffer()); - })); + ASSERT_LE(range_boundary_keys_batch.size(), kNumRangesPerBatch); - ASSERT_LE(range_end_keys_batch.size(), kNumRangesPerBatch); + // We should have at least 1 range. + ASSERT_GE(range_boundary_keys_batch.size(), 1); - // We should have at least 1 range. - ASSERT_GE(range_end_keys_batch.size(), 1); + for (const auto& key : range_boundary_keys_batch) { + range_boundary_keys_from_batches.push_back(key); + } - for (const auto& key : range_end_keys_batch) { - range_end_keys_from_batches.push_back(key); - } + if (range_boundary_keys_batch.back().empty()) { + // We've reached the end. + break; + } - if (range_end_keys_batch.back().empty()) { - // We've reached the end. - break; - } + ASSERT_EQ(range_boundary_keys_batch.size(), kNumRangesPerBatch); - ASSERT_EQ(range_end_keys_batch.size(), kNumRangesPerBatch); + // Use last returned key as start key for next batch of ranges. + start_key = range_boundary_keys_batch.back(); + } - // Use last returned key as lower bound for next batch of ranges. - lower_bound = range_end_keys_batch.back(); + ASSERT_OK(CalcKeysDistributionAcrossWorkers( + tablet.get(), range_boundary_keys_from_batches, kNumWorkers, min_max_keys_ratio_limit, + direction)); } - - ASSERT_OK(CalcKeysDistributionAcrossWorkers( - tablet.get(), range_end_keys_from_batches, kNumWorkers, min_max_keys_ratio_limit)); } - // TODO(get_table_key_ranges): test getting ranges in reverse order. } - } // namespace client } // namespace yb diff --git a/src/yb/client/table_handle.cc b/src/yb/client/table_handle.cc index 79c8ccae65f6..ca0459adff12 100644 --- a/src/yb/client/table_handle.cc +++ b/src/yb/client/table_handle.cc @@ -58,6 +58,21 @@ Status TableHandle::Create(const YBTableName& table_name, .schema(&schema) .num_tablets(num_tablets); + if (schema.num_hash_key_columns() == 0) { + // Setup range key columns for range-sharded tables. + std::vector range_column_names; + range_column_names.reserve(schema.num_range_key_columns()); + auto& columns = schema.columns(); + for (size_t i = 0; i < schema.num_key_columns(); ++i) { + auto& column_schema = columns[i]; + CHECK(column_schema.is_key()); + if (!column_schema.is_hash_key()) { + range_column_names.push_back(column_schema.name()); + } + } + table_creator->set_range_partition_columns(range_column_names); + } + // Setup Index properties. if (index_info) { table_creator->indexed_table_id(index_info->indexed_table_id()) diff --git a/src/yb/common/pgsql_protocol.proto b/src/yb/common/pgsql_protocol.proto index 27244b966a22..94a80595e06e 100644 --- a/src/yb/common/pgsql_protocol.proto +++ b/src/yb/common/pgsql_protocol.proto @@ -372,8 +372,7 @@ message GetTabletKeyRangesEmbeddedRequestPB { optional bytes lower_bound_key = 1; optional bytes upper_bound_key = 2; optional uint64 range_size_bytes = 3; - optional bool is_forward = 4 [ default = true ]; - optional uint32 max_key_length = 5; + optional uint32 max_key_length = 4; } // TODO(neil) The protocol for select needs to be changed accordingly when we introduce and cache diff --git a/src/yb/integration-tests/external_mini_cluster.h b/src/yb/integration-tests/external_mini_cluster.h index c1ffb90fa236..86229cf95d45 100644 --- a/src/yb/integration-tests/external_mini_cluster.h +++ b/src/yb/integration-tests/external_mini_cluster.h @@ -457,6 +457,7 @@ class ExternalMiniCluster : public MiniClusterBase { Result GetSplitKey(const ExternalTabletServer& ts, const yb::TabletId& tablet_id, bool fail_on_response_error = true); + // Flushes all tablets if tablets_ids is empty. Status FlushTabletsOnSingleTServer( ExternalTabletServer* ts, const std::vector tablet_ids, tserver::FlushTabletsRequestPB_Operation operation); diff --git a/src/yb/rocksdb/iterator.h b/src/yb/rocksdb/iterator.h index e6c08a45ed41..dfd2d6b43d5f 100644 --- a/src/yb/rocksdb/iterator.h +++ b/src/yb/rocksdb/iterator.h @@ -128,6 +128,10 @@ class Iterator : public Cleanable { return Entry().Valid(); } + std::string KeyDebugHexString() const { + return Valid() ? key().ToDebugHexString() : ""; + } + // Same as Valid(), but returns error if there was a read error. // For hot paths consider using Valid() in a loop and checking status after the loop. yb::Result CheckedValid() const { diff --git a/src/yb/tablet/tablet.cc b/src/yb/tablet/tablet.cc index 8b298887f81a..da6e8d8bdf51 100644 --- a/src/yb/tablet/tablet.cc +++ b/src/yb/tablet/tablet.cc @@ -4602,19 +4602,43 @@ Status Tablet::ProcessPgsqlGetTableKeyRangesRequest( const PgsqlReadRequestPB& req, PgsqlReadRequestResult* result) const { const auto& get_key_ranges_req = req.get_tablet_key_ranges_request(); result->num_rows_read = 0; - const auto status = GetTabletKeyRanges( + bool has_reached_end = false; + Status key_error; + auto status = GetTabletKeyRanges( get_key_ranges_req.lower_bound_key(), get_key_ranges_req.upper_bound_key(), req.limit(), - get_key_ranges_req.range_size_bytes(), IsForward(get_key_ranges_req.is_forward()), - get_key_ranges_req.max_key_length(), [result](Slice key) { + get_key_ranges_req.range_size_bytes(), + req.is_forward_scan() ? Direction::kForward : Direction::kBackward, + get_key_ranges_req.max_key_length(), + [result, &has_reached_end, &key_error](Slice key) { + if (has_reached_end && key_error.ok()) { + key_error = STATUS_FORMAT( + InternalError, "Got key after reaching boundary: $0", key.ToDebugHexString()); + YB_LOG_EVERY_N_SECS(DFATAL, 60) << key_error; + } pggate::WriteBinaryColumn(key, result->rows_data); ++result->num_rows_read; + has_reached_end |= key.empty(); }, req.table_id()); + if (status.ok() && !key_error.ok()) { + status = key_error; + } if (!status.ok()) { result->response.set_status(PgsqlResponsePB::PGSQL_STATUS_RUNTIME_ERROR); StatusToPB(status, result->response.add_error_status()); return Status::OK(); } - RETURN_NOT_OK(CreatePagingStateForRead(req, result->num_rows_read, &result->response)); + + if ((!req.has_limit() || result->num_rows_read < req.limit()) && !has_reached_end) { + auto* paging_state = result->response.mutable_paging_state(); + paging_state->set_next_partition_key( + req.is_forward_scan() ? metadata_->partition()->partition_key_end() + : metadata_->partition()->partition_key_start()); + if (req.has_paging_state()) { + paging_state->set_total_num_rows_read( + req.paging_state().total_num_rows_read() + result->num_rows_read); + } + } + result->response.set_status(PgsqlResponsePB::PGSQL_STATUS_OK); return Status::OK(); } @@ -4625,23 +4649,23 @@ Result Tablet::GetTableInfo(ColocationId colocation_id) const { namespace { -// Skips `num_keys_to_skip` keys using `skip_key_func` until `is_reached_end_key_func` return true. -// Stops at first key that satisfies `is_reached_end_key_func`. +// Skips `num_keys_to_skip` keys using `skip_key_func` until `no_more_keys_func` return true. +// Stops at first key that satisfies `no_more_keys_func`. // Can skip more keys if `has_key_changed_func` returns false. -template +template bool SkipKeys( - const size_t num_keys_to_skip, IsReachedEndKeyFunc is_reached_end_key_func, + const size_t num_keys_to_skip, NoMoreKeysFunc no_more_keys_func, SkipKeyFunc skip_key_func, HasKeyChangedFunc has_key_changed_func) { - bool reached_end_key = is_reached_end_key_func(); - for (size_t i = 0; i < num_keys_to_skip && !reached_end_key; ++i) { + bool no_more_keys = no_more_keys_func(); + for (size_t i = 0; i < num_keys_to_skip && !no_more_keys; ++i) { skip_key_func(); - reached_end_key = is_reached_end_key_func(); + no_more_keys = no_more_keys_func(); } - while (!has_key_changed_func() && !reached_end_key) { + while (!has_key_changed_func() && !no_more_keys) { skip_key_func(); - reached_end_key = is_reached_end_key_func(); + no_more_keys = no_more_keys_func(); } - return reached_end_key; + return no_more_keys; } std::string IncrementedCopy(Slice key) { @@ -4674,29 +4698,214 @@ Status Tablet::AbortSQLTransactions(CoarseTimePoint deadline) const { Status Tablet::GetTabletKeyRanges( const Slice lower_bound_key, const Slice upper_bound_key, const uint64_t max_num_ranges, - const uint64_t range_size_bytes, const IsForward is_forward, const uint32_t max_key_length, + const uint64_t range_size_bytes, const Direction direction, const uint32_t max_key_length, WriteBuffer* keys_buffer, const TableId& colocated_table_id) const { if (table_type_ != PGSQL_TABLE_TYPE) { return STATUS_FORMAT( NotSupported, "GetTabletKeyRanges is only supported for YSQL, tablet_id: ", tablet_id()); } + if (!metadata_->colocated()) { + return STATUS_FORMAT( + NotSupported, + "GetTabletKeyRanges is only supported for colocated tables, tablet_id: ", tablet_id()); + } + if (!metadata_->partition_schema()->IsRangePartitioning()) { + return STATUS_FORMAT( + NotSupported, + "GetTabletKeyRanges is only supported for range-sharded tables, tablet_id: ", tablet_id()); + } return GetTabletKeyRanges( - lower_bound_key, upper_bound_key, max_num_ranges, range_size_bytes, is_forward, - max_key_length, + lower_bound_key, upper_bound_key, max_num_ranges, range_size_bytes, direction, max_key_length, [&keys_buffer](Slice key) { pggate::WriteBinaryColumn(key, keys_buffer); }, colocated_table_id); } +namespace { + +// Retrieves from the data block full doc key pointed by index_iter. +// If its length less or equal to max_key_length, pushes the key to the callback, updates +// last_key and returns true. +// Otherwise, skips the data block and returns false. +template +Result RetrieveFullDocKey( + const std::string& log_prefix, rocksdb::Iterator* index_iter, rocksdb::Iterator* data_iter, + std::string* last_key, const uint32_t max_key_length, Callback callback) { + RSTATUS_DCHECK( + index_iter->Valid(), InternalError, + "Not expected to reach this line when index iterator is invalid"); + auto data_entry = data_iter->Seek(index_iter->key()); + RSTATUS_DCHECK( + data_entry.Valid(), InternalError, + Format( + "Invalid data iterator after seeking to key from SST index: $0", + index_iter->KeyDebugHexString())); + + for (; data_entry.Valid(); + data_entry = (direction == Direction::kForward ? data_iter->Next() : data_iter->Prev())) { + VLOG_WITH_FUNC(2) << log_prefix << "data_entry.key: " << data_entry.key.ToDebugHexString(); + // Use encoded doc key to avoid breaking data related to the same row into halves. + const auto doc_key_size_result = + dockv::DocKey::EncodedSize(data_entry.key, dockv::DocKeyPart::kWholeDocKey); + + if (!doc_key_size_result.ok()) { + YB_LOG_EVERY_N_SECS(WARNING, 60) + << "Failed to get encoded size of key: " << data_entry.key.ToDebugHexString() << "." + << doc_key_size_result.status(); + continue; + } + if (doc_key_size_result.get() > max_key_length) { + // Skip keys longer than max_key_length. Also skip the whole block for efficiency. + return false; + } + const auto key = data_entry.key.Prefix(doc_key_size_result.get()); + if (direction == Direction::kBackward && !last_key->empty() && key >= *last_key) { + // Skips keys larger than last_key (can only happen during reverse scan). + continue; + } + VLOG_WITH_FUNC(2) << log_prefix << "key: " << key.ToDebugHexString(); + callback(key); + *last_key = key.ToBuffer(); + return true; + } + return false; +} + +template < + Direction direction, typename NoMoreKeysFunc, typename SkipKeyFunc, typename HasKeyChangedFunc, + typename Callback> +Status DoGetTabletKeyRanges( + const std::string& log_prefix, rocksdb::Iterator* index_iter, rocksdb::Iterator* data_iter, + std::string* last_key, NoMoreKeysFunc no_more_keys_func, SkipKeyFunc skip_key_func, + HasKeyChangedFunc has_key_changed_func, const uint64_t max_num_ranges, + uint64_t num_blocks_to_skip, const uint64_t num_blocks_to_skip_after_first, + const uint32_t max_key_length, const Slice key_to_add_as_last, Callback callback) { + uint64_t num_keys = 1; + + while (num_keys <= max_num_ranges) { + const auto no_more_keys = SkipKeys( + num_blocks_to_skip, no_more_keys_func, skip_key_func, has_key_changed_func); + num_blocks_to_skip = num_blocks_to_skip_after_first; + + VLOG_WITH_FUNC(2) << log_prefix << "last_key: " << Slice(*last_key).ToDebugHexString() + << ", index_iter: " << index_iter->KeyDebugHexString() + << ", no_more_keys: " << no_more_keys + << ", num_keys: " << num_keys; + + if (no_more_keys) { + VLOG_WITH_FUNC(2) << log_prefix << "key: " << key_to_add_as_last.ToDebugHexString(); + callback(key_to_add_as_last); + break; + } + + if (VERIFY_RESULT(RetrieveFullDocKey( + log_prefix, index_iter, data_iter, last_key, max_key_length, callback))) { + ++num_keys; + } + } + return Status::OK(); +} + +// Not able to use typename Callback as template parameter because partial function template +// specification is not allowed. +template +Status DoGetTabletKeyRanges( + const std::string& log_prefix, rocksdb::Iterator* index_iter, rocksdb::Iterator* data_iter, + Slice lower_bound_key, Slice upper_bound_key, const uint64_t max_num_ranges, + const uint64_t num_blocks_to_skip, const uint32_t max_key_length, + std::function callback, const bool use_empty_as_last_key); + +template<> +Status DoGetTabletKeyRanges( + const std::string& log_prefix, rocksdb::Iterator* index_iter, rocksdb::Iterator* data_iter, + Slice lower_bound_key, Slice upper_bound_key, const uint64_t max_num_ranges, + const uint64_t num_blocks_to_skip, const uint32_t max_key_length, + std::function callback, const bool use_empty_as_last_key) { + if (lower_bound_key.empty()) { + index_iter->SeekToFirst(); + } else { + index_iter->Seek(lower_bound_key); + } + VLOG_WITH_FUNC(2) << log_prefix << "index_iter: " << index_iter->KeyDebugHexString(); + + // First index key is the last key in first data block, so we treat it as we've already + // skipped one data block. + const auto treat_first_block_as_skipped = + index_iter->Valid() && index_iter->key() != lower_bound_key; + + std::string last_key = lower_bound_key.ToBuffer(); + + auto has_iter_key_changed_func = [&index_iter, &last_key]() { + if (!index_iter->Valid()) { + return true; + } + return index_iter->key() > last_key; + }; + + auto no_more_keys_func = [&index_iter, upper_bound_key]() { + return !index_iter->Valid() || + (!upper_bound_key.empty() && index_iter->key().GreaterOrEqual(upper_bound_key)); + }; + + auto skip_key_func = [&index_iter]() { index_iter->Next(); }; + + return DoGetTabletKeyRanges( + log_prefix, index_iter, data_iter, &last_key, no_more_keys_func, skip_key_func, + has_iter_key_changed_func, max_num_ranges, num_blocks_to_skip - treat_first_block_as_skipped, + num_blocks_to_skip, max_key_length, use_empty_as_last_key ? Slice{} : upper_bound_key, + callback); +} + +template<> +Status DoGetTabletKeyRanges( + const std::string& log_prefix, rocksdb::Iterator* index_iter, rocksdb::Iterator* data_iter, + Slice lower_bound_key, Slice upper_bound_key, const uint64_t max_num_ranges, + const uint64_t num_blocks_to_skip, const uint32_t max_key_length, + std::function callback, const bool use_empty_as_last_key) { + if (upper_bound_key.empty()) { + index_iter->SeekToLast(); + } else { + index_iter->Seek(upper_bound_key); + if (!index_iter->Valid()) { + index_iter->SeekToLast(); + } + } + + VLOG_WITH_FUNC(2) << log_prefix << "index_iter: " << index_iter->KeyDebugHexString(); + + std::string last_key = upper_bound_key.ToBuffer(); + + auto has_iter_key_changed_func = [&index_iter, &last_key]() { + if (!index_iter->Valid()) { + return true; + } + return (last_key.empty() && !index_iter->key().empty()) || index_iter->key() < last_key; + }; + + auto no_more_keys_func = [&index_iter, lower_bound_key]() { + return !index_iter->Valid() || lower_bound_key.GreaterOrEqual(index_iter->key()); + }; + + auto skip_key_func = [&index_iter]() { index_iter->Prev(); }; + + return DoGetTabletKeyRanges( + log_prefix, index_iter, data_iter, &last_key, no_more_keys_func, skip_key_func, + has_iter_key_changed_func, max_num_ranges, num_blocks_to_skip, num_blocks_to_skip, + max_key_length, use_empty_as_last_key ? Slice{} : upper_bound_key, callback); +} + +} // namespace + Status Tablet::GetTabletKeyRanges( Slice lower_bound_key, Slice upper_bound_key, uint64_t max_num_ranges, - const uint64_t range_size_bytes, const IsForward is_forward, uint32_t max_key_length, + const uint64_t range_size_bytes, const Direction direction, uint32_t max_key_length, std::function callback, const TableId& colocated_table_id) const { - VLOG_WITH_FUNC(2) << "lower_bound_key: " << lower_bound_key.ToDebugHexString() - << " upper_bound_key: " << upper_bound_key.ToDebugHexString() - << " max_num_ranges: " << max_num_ranges - << " range_size_bytes: " << range_size_bytes - << " max_key_length: " << max_key_length; + VLOG_WITH_PREFIX_AND_FUNC(2) << "lower_bound_key: " << lower_bound_key.ToDebugHexString() + << " upper_bound_key: " << upper_bound_key.ToDebugHexString() + << " max_num_ranges: " << max_num_ranges + << " range_size_bytes: " << range_size_bytes + << " max_key_length: " << max_key_length + << " direction: " << AsString(direction); if (max_num_ranges == 0) { max_num_ranges = std::numeric_limits::max(); } @@ -4707,20 +4916,6 @@ Status Tablet::GetTabletKeyRanges( auto pending_op = CreateScopedRWOperationNotBlockingRocksDbShutdownStart(); RETURN_NOT_OK(pending_op); - const auto num_blocks_to_skip = std::max(range_size_bytes / FLAGS_db_block_size_bytes, 1); - - rocksdb::ReadOptions read_options; - // An index block contains one entry per data block, where the key is a string >= last key in that - // data block and < the first key in the successive data block. The value is the BlockHandle - // (file offset and length) for the data block. - // So, we need to skip last key in each SST index because there are no data keys after the last - // index key. - auto index_iter = regular_db_->NewIndexIterator(read_options, rocksdb::SkipLastEntry::kTrue); - - // TODO(get_table_key_ranges): consider reusing index_iter and BlockHandle to avoid double SST - // index seek. - auto data_iter = std::unique_ptr(regular_db_->NewIterator(read_options)); - std::string encoded_partition_key_start; std::string encoded_partition_key_end; Slice partition_lower_bound_key; @@ -4746,121 +4941,82 @@ Status Tablet::GetTabletKeyRanges( partition_lower_bound_key = encoded_partition_key_start; partition_upper_bound_key = encoded_partition_key_end; } - // Whether it make sense to continue fetching ranges for the caller if we reach end of - // tablet(table) or upper/lower bound. - bool use_empty_as_end_key; - if (partition_lower_bound_key > lower_bound_key) { + VLOG_WITH_PREFIX_AND_FUNC(2) << "partition_lower_bound_key: " + << Slice(partition_lower_bound_key).ToDebugHexString() + << " partition_upper_bound_key: " + << Slice(partition_upper_bound_key).ToDebugHexString(); + + // Whether it make sense to continue fetching ranges for the caller if we reach end/start of + // tablet(table) or upper/lower bound (depending on whether is_forward is true/false). + std::optional use_empty_as_last_key; + + if (partition_lower_bound_key <= lower_bound_key) { + if (direction == Direction::kBackward) { + // Reaching lower bound means we can't have more keys in next tablet. + use_empty_as_last_key = true; + } + } else { lower_bound_key = partition_lower_bound_key; + if (direction == Direction::kBackward) { + // Partition lower bound is higher than lower_bound_key, we can have more keys in the next (in + // reverse order) tablet (unless colocated). + use_empty_as_last_key = is_colocated; + } } + if (partition_upper_bound_key.empty() || - (!upper_bound_key.empty() && partition_upper_bound_key >= upper_bound_key)) { - // Reaching partition upper bound means we can't have more keys in next tablet. - use_empty_as_end_key = true; + (!upper_bound_key.empty() && upper_bound_key <= partition_upper_bound_key)) { + if (direction == Direction::kForward) { + // Reaching upper bound means we can't have more keys in next tablet. + use_empty_as_last_key = true; + } } else { - // Partition upper bound is less than upper_bound_key, we can have more keys in the next - // tablet (unless colocated). - use_empty_as_end_key = is_colocated; upper_bound_key = partition_upper_bound_key; + if (direction == Direction::kForward) { + // Partition upper bound is less than upper_bound_key, we can have more keys in the next + // tablet (unless colocated). + use_empty_as_last_key = is_colocated; + } } - return is_forward ? GetTabletKeyRangesForward( - index_iter.get(), data_iter.get(), lower_bound_key, upper_bound_key, - max_num_ranges, num_blocks_to_skip, max_key_length, std::move(callback), - use_empty_as_end_key) - : GetTabletKeyRangesBackward( - index_iter.get(), lower_bound_key, upper_bound_key, max_num_ranges, - num_blocks_to_skip, max_key_length, std::move(callback)); -} - -Status Tablet::GetTabletKeyRangesForward( - rocksdb::Iterator* index_iter, rocksdb::Iterator* data_iter, Slice lower_bound_key, - Slice upper_bound_key, const uint64_t max_num_ranges, const uint64_t num_blocks_to_skip, - const uint32_t max_key_length, std::function callback, - const bool use_empty_as_end_key) const { - // TODO(get_table_key_ranges): As of 2023-09 we get full encoded doc keys and skip keys longer - // than max_key_length. Consider reworking that to allow using key prefixes as lower/upper bound - // for scan, currently it is not accepted by QLRocksDBStorage::GetIterator. - - if (lower_bound_key.empty()) { - index_iter->SeekToFirst(); - } else { - index_iter->Seek(lower_bound_key); - } - - // First index key is the last key in first data block, so we treat it as we've already - // skipped one data block. - bool treat_block_as_skipped = index_iter->Valid() && index_iter->key() != lower_bound_key; - - std::string last_key = lower_bound_key.ToBuffer(); - - uint64_t num_keys = 1; + RSTATUS_DCHECK( + use_empty_as_last_key.has_value(), InternalError, "use_empty_as_last_key is not set"); - auto has_iter_key_changed_func = [&index_iter, &last_key]() { - if (!index_iter->Valid()) { - return true; - } - return index_iter->key() > last_key; - }; + VLOG_WITH_PREFIX_AND_FUNC(2) << "lower_bound_key: " << lower_bound_key.ToDebugHexString() + << " upper_bound_key: " << upper_bound_key.ToDebugHexString() + << " use_empty_as_last_key: " << *use_empty_as_last_key; - while (num_keys <= max_num_ranges) { - const auto reached_end_key = SkipKeys( - num_blocks_to_skip - treat_block_as_skipped, - [&index_iter, upper_bound_key]() { - return !index_iter->Valid() || - (!upper_bound_key.empty() && index_iter->key().GreaterOrEqual(upper_bound_key)); - }, - [&index_iter]() { index_iter->Next(); }, - has_iter_key_changed_func); - treat_block_as_skipped = false; - - if (reached_end_key) { - if (use_empty_as_end_key) { - callback(""); - } else { - callback(upper_bound_key); - } - break; - } + const auto num_blocks_to_skip = std::max(range_size_bytes / FLAGS_db_block_size_bytes, 1); - auto data_entry = data_iter->Seek(index_iter->key()); - RSTATUS_DCHECK( - data_entry.Valid(), InternalError, - Format( - "Invalid data iterator after seeking to key from SST index: $0", - index_iter->key().ToDebugHexString())); - - for (; data_entry.Valid(); data_entry = data_iter->Next()) { - // Use encoded doc key to avoid breaking data related to the same row into halves. - const auto doc_key_size_result = - dockv::DocKey::EncodedSize(data_entry.key, dockv::DocKeyPart::kWholeDocKey); - - if (!doc_key_size_result.ok()) { - YB_LOG_EVERY_N_SECS(WARNING, 60) - << "Failed to get encoded size of key: " << data_entry.key.ToDebugHexString() << "." - << doc_key_size_result.status(); - continue; - } - if (doc_key_size_result.get() > max_key_length) { - // Skip keys longer than max_key_length. - continue; - } - const auto key = data_entry.key.Prefix(doc_key_size_result.get()); - callback(key); - last_key = key.ToBuffer(); - ++num_keys; - break; - } - } + rocksdb::ReadOptions read_options; + // An index block contains one entry per data block, where the key is a string >= last key in that + // data block and < the first key in the successive data block. The value is the BlockHandle + // (file offset and length) for the data block. + // So, we need to skip last key in each SST index because there are no data keys after the last + // index key. + auto index_iter = regular_db_->NewIndexIterator(read_options, rocksdb::SkipLastEntry::kTrue); - return Status::OK(); -} + // TODO(get_table_key_ranges): consider reusing index_iter and BlockHandle to avoid double SST + // index seek. + auto data_iter = std::unique_ptr(regular_db_->NewIterator(read_options)); -Status Tablet::GetTabletKeyRangesBackward( - rocksdb::Iterator* index_iter, Slice lower_bound_key, Slice upper_bound_key, - const uint64_t max_num_ranges, const uint64_t num_blocks_to_skip, const uint32_t max_key_length, - std::function callback) const { - return STATUS(NotSupported, "Tablet::GetTabletKeyRanges in backward order is not yet supported"); + // TODO(get_table_key_ranges): As of 2023-09 we get full encoded doc keys and skip keys longer + // than max_key_length. Consider reworking that to allow using key prefixes as lower/upper bound + // for scan, currently it is not accepted by QLRocksDBStorage::GetIterator. + switch (direction) { + case Direction::kForward: + return DoGetTabletKeyRanges( + LogPrefix(), index_iter.get(), data_iter.get(), lower_bound_key, upper_bound_key, + max_num_ranges, num_blocks_to_skip, max_key_length, std::move(callback), + *use_empty_as_last_key); + case Direction::kBackward: + return DoGetTabletKeyRanges( + LogPrefix(), index_iter.get(), data_iter.get(), lower_bound_key, upper_bound_key, + max_num_ranges, num_blocks_to_skip, max_key_length, std::move(callback), + *use_empty_as_last_key); + } + FATAL_INVALID_ENUM_VALUE(Direction, direction); } // ------------------------------------------------------------------------------------------------ diff --git a/src/yb/tablet/tablet.h b/src/yb/tablet/tablet.h index 1f22baf82d61..533fdf493c9f 100644 --- a/src/yb/tablet/tablet.h +++ b/src/yb/tablet/tablet.h @@ -98,7 +98,7 @@ namespace tablet { YB_STRONGLY_TYPED_BOOL(BlockingRocksDbShutdownStart); YB_STRONGLY_TYPED_BOOL(FlushOnShutdown); YB_STRONGLY_TYPED_BOOL(IncludeIntents); -YB_STRONGLY_TYPED_BOOL(IsForward) +YB_DEFINE_ENUM(Direction, (kForward)(kBackward)); inline FlushFlags operator|(FlushFlags lhs, FlushFlags rhs) { return static_cast(to_underlying(lhs) | to_underlying(rhs)); @@ -923,27 +923,27 @@ class Tablet : public AbstractTablet, Result GetTableInfo(ColocationId colocation_id) const override; // Breaks tablet data into ranges of approximately range_size_bytes each, at most into - // `max_num_ranges` and adds to `keys_buffer` a list of these ranges end keys. + // `max_num_ranges` and adds to `keys_buffer` a list of these ranges boundary keys (depending on + // is_forward). // // It is guaranteed that returned keys are at most max_key_length bytes. - // lower_bound_key is inclusive, upper_bound_key is exclusive. They are adjusted by this function + // Both lower_bound_key and upper_bound_key are inclusive. They are adjusted by this function // to be within tablet boundaries (key_bounds_ if set or based on metadata()->partition() if - // key_bounds_ is not set) and to be no longer than max_key_length. Due to truncation the last - // returned key can be outside tablet partition boundaries. + // key_bounds_ is not set) and to be no longer than max_key_length. // // If `is_forward` is set, list will consist of: - // - 1st_range_end_key \in (adjusted_lower_bound_key, adjusted_upper_bound_key) - // - 2nd_range_end_key \in (1st_range_end_key, adjusted_upper_bound_key) + // - 1st_range_boundary_key \in (adjusted_lower_bound_key, adjusted_upper_bound_key) + // - 2nd_range_boundary_key \in (1st_range_boundary_key, adjusted_upper_bound_key) // ... - // - nth_range_end_key \in ((n-1)th_range_end_key, adjusted_upper_bound_key] + // - nth_range_boundary_key \in ((n-1)th_range_boundary_key, adjusted_upper_bound_key] // or empty key iff next tablet can't have more keys (meaning we've already reached // specified upper_bound_key or the end of the last tablet). // // If `is_forward` is not set, list will consist of: - // - 1st_range_end_key \in (adjusted_lower_bound_key, adjusted_upper_bound_key) - // - 2nd_range_end_key \in (adjusted_lower_bound_key, 1st_range_end_key) + // - 1st_range_boundary_key \in (adjusted_lower_bound_key, adjusted_upper_bound_key) + // - 2nd_range_boundary_key \in (adjusted_lower_bound_key, 1st_range_boundary_key) // ... - // - nth_range_end_key \in [adjusted_lower_bound_key, (n-1)_th_range_key) + // - nth_range_boundary_key \in [adjusted_lower_bound_key, (n-1)_th_range_boundary_key) // or empty key iff next tablet can't have more keys (meaning we've already reached // specified lower_bound_key or the beginning of the first tablet). // @@ -951,15 +951,15 @@ class Tablet : public AbstractTablet, // If max_num_ranges is 0, nothing will be written to the `keys_buffer`. Status GetTabletKeyRanges( Slice lower_bound_key, Slice upper_bound_key, uint64_t max_num_ranges, - uint64_t range_size_bytes, IsForward is_forward, uint32_t max_key_length, + uint64_t range_size_bytes, Direction direction, uint32_t max_key_length, WriteBuffer* keys_buffer, const TableId& colocated_table_id = "") const; Status TEST_GetTabletKeyRanges( Slice lower_bound_key, Slice upper_bound_key, uint64_t max_num_ranges, - uint64_t range_size_bytes, IsForward is_forward, uint32_t max_key_length, + uint64_t range_size_bytes, Direction direction, uint32_t max_key_length, std::function callback, const TableId& colocated_table_id = "") const { return GetTabletKeyRanges( - lower_bound_key, upper_bound_key, max_num_ranges, range_size_bytes, is_forward, + lower_bound_key, upper_bound_key, max_num_ranges, range_size_bytes, direction, max_key_length, std::move(callback), colocated_table_id); } @@ -1055,20 +1055,9 @@ class Tablet : public AbstractTablet, Status GetTabletKeyRanges( Slice lower_bound_key, Slice upper_bound_key, uint64_t max_num_ranges, - uint64_t range_size_bytes, IsForward is_forward, uint32_t max_key_length, + uint64_t range_size_bytes, Direction direction, uint32_t max_key_length, std::function callback, const TableId& colocated_table_id) const; - Status GetTabletKeyRangesForward( - rocksdb::Iterator* index_iter, rocksdb::Iterator* data_iter, Slice lower_bound_key, - Slice upper_bound_key, uint64_t max_num_ranges, uint64_t num_blocks_to_skip, - uint32_t max_key_length, std::function callback, - bool use_empty_as_end_key) const; - - Status GetTabletKeyRangesBackward( - rocksdb::Iterator* index_iter, Slice lower_bound_key, Slice upper_bound_key, - uint64_t max_num_ranges, uint64_t num_blocks_to_skip, uint32_t max_key_length, - std::function callback) const; - Status ProcessPgsqlGetTableKeyRangesRequest( const PgsqlReadRequestPB& req, PgsqlReadRequestResult* result) const; diff --git a/src/yb/tserver/pg_client_session.cc b/src/yb/tserver/pg_client_session.cc index c3c154dc78d3..16ec2a09fa10 100644 --- a/src/yb/tserver/pg_client_session.cc +++ b/src/yb/tserver/pg_client_session.cc @@ -1819,8 +1819,12 @@ void PgClientSession::GetTableKeyRanges( read_request->set_limit(max_num_ranges); } + read_request->set_is_forward_scan(is_forward); auto* req = read_request->mutable_get_tablet_key_ranges_request(); + // IsInclusive is actually ignored by Tablet::GetTabletKeyRanges, and it always treats both + // boundaries as inclusive. But we are setting it here to avoid check failures inside + // YBPgsqlReadOp. if (!lower_bound_key.empty()) { read_request->mutable_lower_bound()->mutable_key()->assign( lower_bound_key.cdata(), lower_bound_key.size()); @@ -1831,10 +1835,9 @@ void PgClientSession::GetTableKeyRanges( if (!upper_bound_key.empty()) { read_request->mutable_upper_bound()->mutable_key()->assign( upper_bound_key.cdata(), upper_bound_key.size()); - read_request->mutable_upper_bound()->set_is_inclusive(false); + read_request->mutable_upper_bound()->set_is_inclusive(true); req->mutable_upper_bound_key()->assign(upper_bound_key.cdata(), upper_bound_key.size()); } - req->set_is_forward(is_forward); req->set_range_size_bytes(range_size_bytes); req->set_max_key_length(max_key_length); diff --git a/src/yb/tserver/tablet_service.cc b/src/yb/tserver/tablet_service.cc index 00018fbed50a..00132ffc4ae5 100644 --- a/src/yb/tserver/tablet_service.cc +++ b/src/yb/tserver/tablet_service.cc @@ -3238,8 +3238,9 @@ void TabletServiceImpl::GetTabletKeyRanges( const auto& tablet = leader_tablet_peer.tablet; RETURN_NOT_OK(tablet->GetTabletKeyRanges( req->lower_bound_key(), req->upper_bound_key(), req->max_num_ranges(), - req->range_size_bytes(), tablet::IsForward(req->is_forward()), req->max_key_length(), - &context.sidecars().Start())); + req->range_size_bytes(), + req->is_forward() ? tablet::Direction::kForward : tablet::Direction::kBackward, + req->max_key_length(), &context.sidecars().Start())); return Status::OK(); }); } diff --git a/src/yb/yql/pggate/test/pggate_test_select.cc b/src/yb/yql/pggate/test/pggate_test_select.cc index 642f3cd3fc6d..4970837e0724 100644 --- a/src/yb/yql/pggate/test/pggate_test_select.cc +++ b/src/yb/yql/pggate/test/pggate_test_select.cc @@ -321,87 +321,146 @@ class PggateTestSelectWithYsql : public PggateTestSelect { namespace { -Status CheckRanges(const std::vector& end_keys) { +Status CheckRanges(const std::vector& end_keys, const bool is_forward) { SCHECK_GT(end_keys.size(), 0, InternalError, "No key ranges"); for (size_t i = 0; i + 1 < end_keys.size() - 1; ++i) { - SCHECK_LT(end_keys[i], end_keys[i + 1], InternalError, "Wrong range keys order"); + SCHECK( + is_forward ? end_keys[i] < end_keys[i + 1] : end_keys[i] > end_keys[i + 1], InternalError, + Format( + "Wrong range keys order, expected '$0' $1 '$2'", Slice(end_keys[i]).ToDebugHexString(), + is_forward ? "<" : ">", Slice(end_keys[i + 1]).ToDebugHexString())); } - SCHECK_EQ(end_keys.back(), "", InternalError, "Wrong last range end key"); + SCHECK( + end_keys.back().empty(), InternalError, + Format("Wrong last range end key: '$0'", Slice(end_keys.back()).ToDebugHexString())); return Status::OK(); } -Status TestGetTableKeyRanges( +Result TestGetTableKeyRanges( YBCPgOid database_oid, YBCPgOid table_oid, Slice lower_bound_key, Slice upper_bound_key, - uint64_t max_num_ranges, uint64_t range_size_bytes, uint32_t max_key_length, - std::vector* end_keys) { - end_keys->clear(); + uint64_t range_size_bytes, uint32_t max_key_length, std::string* min_key = nullptr, + std::string* max_key = nullptr) { + if (min_key) { + min_key->clear(); + } + if (max_key) { + max_key->clear(); + } + + std::vector end_keys; + std::function func = - [&end_keys](const char* key, size_t key_size) { + [&end_keys, min_key, max_key](const char* key, size_t key_size) { LOG(INFO) << "Range end key: " << Slice(key, key_size).ToDebugHexString(); - end_keys->push_back(std::string(key, key_size)); + std::string key_str(key, key_size); + end_keys.push_back(key_str); + if (key_size == 0) { + return; + } + if (min_key && (min_key->empty() || key_str < *min_key)) { + *min_key = key; + } + if (max_key && (max_key->empty() || key_str > *max_key)) { + *max_key = key; + } }; - uint64_t current_tserver_ht = 0; - /* Request server HT on the first call for the key ranges */ - CHECK_YBC_STATUS(YBCGetTableKeyRanges( - database_oid, table_oid, lower_bound_key.cdata(), lower_bound_key.size(), - upper_bound_key.cdata(), upper_bound_key.size(), max_num_ranges, range_size_bytes, - /* is_forward = */ true, max_key_length, ¤t_tserver_ht, - &InvokeFunctionWithKeyPtrAndSize, &func)); - LOG(INFO) << "Got " << end_keys->size() << " ranges"; - LOG(INFO) << "current tserver HT: " << HybridTime(current_tserver_ht).ToString(); - SCHECK_GT(current_tserver_ht, 0, InternalError, "No tserver hybrid time"); + std::unordered_map num_boundaries_by_direction; + for (const auto is_forward : {false, true}) { + LOG_WITH_FUNC(INFO) << "lower_bound_key: " << lower_bound_key.ToDebugHexString() + << " upper_bound_key: " << upper_bound_key.ToDebugHexString() + << " range_size_bytes: " << range_size_bytes + << " max_key_length: " << max_key_length << " is_forward: " << is_forward; - RETURN_NOT_OK(CheckRanges(*end_keys)); + uint64_t current_tserver_ht = 0; + /* Request server HT on the first call for the key ranges */ + end_keys.clear(); + CHECK_YBC_STATUS(YBCGetTableKeyRanges( + database_oid, table_oid, lower_bound_key.cdata(), lower_bound_key.size(), + upper_bound_key.cdata(), upper_bound_key.size(), std::numeric_limits::max(), + range_size_bytes, is_forward, max_key_length, ¤t_tserver_ht, + &InvokeFunctionWithKeyPtrAndSize, &func)); + LOG(INFO) << "Got " << end_keys.size() << " ranges"; + LOG(INFO) << "current tserver HT: " << HybridTime(current_tserver_ht).ToString(); + SCHECK_GT(current_tserver_ht, 0, InternalError, "No tserver hybrid time"); - max_num_ranges = end_keys->size() / 3; + RETURN_NOT_OK(CheckRanges(end_keys, is_forward)); - if (max_num_ranges == 0) { - // Only test pagination when we have enough ranges to break them into 3 pieces. - return Status::OK(); - } + const auto num_boundaries_received = end_keys.size(); + const auto num_ranges_limit = num_boundaries_received / 3; - end_keys->clear(); + if (num_ranges_limit == 0) { + // Only test pagination when we have enough ranges to break them into 3 pieces. + continue; + } - std::string lower_bound; + end_keys.clear(); - for (;;) { - const auto prev_size = end_keys->size(); + std::string bound; - LOG(INFO) << "Starting with: " << Slice(lower_bound).ToDebugHexString(); - CHECK_YBC_STATUS(YBCGetTableKeyRanges( - database_oid, table_oid, lower_bound.data(), lower_bound.size(), nullptr, 0, max_num_ranges, - range_size_bytes, true, max_key_length, nullptr /* current_tserver_ht */, - &InvokeFunctionWithKeyPtrAndSize, &func)); + for (;;) { + const auto prev_size = end_keys.size(); - const auto size_diff = end_keys->size() - prev_size; + LOG(INFO) << "Starting with: " << Slice(bound).ToDebugHexString(); - LOG(INFO) << "Got " << size_diff << " ranges"; + CHECK_YBC_STATUS(YBCGetTableKeyRanges( + database_oid, table_oid, is_forward ? bound.data() : nullptr, + is_forward ? bound.size() : 0, is_forward ? nullptr : bound.data(), + is_forward ? 0 : bound.size(), num_ranges_limit, range_size_bytes, is_forward, + max_key_length, /* current_tserver_ht = */ nullptr, &InvokeFunctionWithKeyPtrAndSize, + &func)); - SCHECK_GT(size_diff, 0, InternalError, "Expected some ranges"); + const auto size_diff = end_keys.size() - prev_size; - if (end_keys->back().empty()) { - SCHECK_LE( - size_diff, max_num_ranges, InternalError, - "Expected no more than specified number of ranges"); - break; + LOG(INFO) << "Got " << size_diff << " ranges (limited by " << num_ranges_limit << ")"; + + SCHECK_GT(size_diff, 0, InternalError, "Expected some ranges"); + + if (end_keys.back().empty()) { + SCHECK_LE( + size_diff, num_ranges_limit, InternalError, + "Expected no more than specified number of ranges"); + break; + } + + SCHECK_EQ( + size_diff, num_ranges_limit, InternalError, + "Expected specified number of ranges except for the last response"); + + bound = end_keys.back(); } - SCHECK_EQ( - size_diff, max_num_ranges, InternalError, - "Expected specified number of ranges except for the last response"); + const int64_t num_boundaries_diff = num_boundaries_received - end_keys.size(); + SCHECK( + abs(num_boundaries_diff) <= 2, InternalError, + Format( + "Expected approximately the same number of ranges independently of paging but got " + "without paging: $0, with paging: $1", + num_boundaries_received, end_keys.size())); - lower_bound = end_keys->back(); + RETURN_NOT_OK(CheckRanges(end_keys, is_forward)); + + num_boundaries_by_direction[is_forward] = num_boundaries_received; } - RETURN_NOT_OK(CheckRanges(*end_keys)); + const int64_t num_boundaries_diff = + num_boundaries_by_direction[true] - num_boundaries_by_direction[false]; + SCHECK( + abs(num_boundaries_diff) <= 2, InternalError, + Format( + "Expected approximately the same number of ranges independently of direction but got " + "forward: $0, backward: $1", + num_boundaries_by_direction[true], num_boundaries_by_direction[false], end_keys.size())); - return Status::OK(); + return std::min(num_boundaries_by_direction[true], num_boundaries_by_direction[false]); } } // namespace -TEST_F_EX(PggateTestSelect, GetTableKeyRanges, PggateTestSelectWithYsql) { +// TODO(get_table_key_ranges): Enable this test as part of +// https://github.com/yugabyte/yugabyte-db/issues/21090 +TEST_F_EX( + PggateTestSelect, YB_DISABLE_TEST(GetRangeShardedTableKeyRanges), PggateTestSelectWithYsql) { constexpr auto kDatabaseName = "yugabyte"; constexpr auto kMaxKeyLength = 1_KB; constexpr auto kRangeSizeBytes = 16_KB; @@ -429,18 +488,15 @@ TEST_F_EX(PggateTestSelect, GetTableKeyRanges, PggateTestSelectWithYsql) { ASSERT_OK(cluster_->WaitForAllIntentsApplied(30s * kTimeMultiplier)); - std::vector end_keys; - ASSERT_OK(TestGetTableKeyRanges( - db_oid, table_oid, Slice(), Slice(), std::numeric_limits::max(), kRangeSizeBytes, - kMaxKeyLength, &end_keys)); + db_oid, table_oid, Slice(), Slice(), kRangeSizeBytes, kMaxKeyLength)); std::string upper_bound; ASSERT_TRUE(strings::ByteStringFromAscii("488000022C21", &upper_bound)); ASSERT_OK(TestGetTableKeyRanges( - db_oid, table_oid, Slice(), upper_bound, std::numeric_limits::max(), - kRangeSizeBytes, kMaxKeyLength, &end_keys)); + db_oid, table_oid, Slice(), upper_bound, kRangeSizeBytes, kMaxKeyLength)); + } TEST_F_EX(PggateTestSelect, GetColocatedTableKeyRanges, PggateTestSelectWithYsql) { @@ -449,6 +505,8 @@ TEST_F_EX(PggateTestSelect, GetColocatedTableKeyRanges, PggateTestSelectWithYsql constexpr auto kMaxKeyLength = 1_KB; constexpr auto kRangeSizeBytes = 16_KB; constexpr auto kNumTables = 3; + constexpr auto kNumRows = 5000; + constexpr auto kMinNumRangesExpected = 10; ASSERT_OK(Init( "GetColocatedTableKeyRanges", kNumOfTablets, /* replication_factor = */ 0, kDatabaseName)); @@ -466,10 +524,14 @@ TEST_F_EX(PggateTestSelect, GetColocatedTableKeyRanges, PggateTestSelectWithYsql } for (int i = 0; i < kNumTables; ++i) { ASSERT_OK(conn.ExecuteFormat( - "INSERT INTO t$0 SELECT i, 1 FROM (SELECT generate_series(1, 3000) i) tmp;", i)); + "INSERT INTO t$0 SELECT i, 1 FROM (SELECT generate_series(1, $1) i) tmp;", i, kNumRows)); } ASSERT_OK(cluster_->WaitForAllIntentsApplied(30s * kTimeMultiplier)); + for (size_t ts_idx = 0; ts_idx < cluster_->num_tablet_servers(); ++ts_idx) { + ASSERT_OK(cluster_->FlushTabletsOnSingleTServer( + cluster_->tablet_server(ts_idx), {}, tserver::FlushTabletsRequestPB::FLUSH)); + } std::vector> min_max_keys; @@ -477,19 +539,14 @@ TEST_F_EX(PggateTestSelect, GetColocatedTableKeyRanges, PggateTestSelectWithYsql const auto table_oid = ASSERT_RESULT(conn.FetchRow( Format("SELECT oid from pg_class WHERE relname='t$0'", i))); - std::vector end_keys; + std::string min_key; + std::string max_key; + ASSERT_GE( + ASSERT_RESULT(TestGetTableKeyRanges( + db_oid, table_oid, Slice(), Slice(), kRangeSizeBytes, kMaxKeyLength, &min_key, + &max_key)), + kMinNumRangesExpected); - ASSERT_OK(TestGetTableKeyRanges( - db_oid, table_oid, Slice(), Slice(), std::numeric_limits::max(), kRangeSizeBytes, - kMaxKeyLength, &end_keys)); - - ASSERT_GT(end_keys.size(), 0); - if (end_keys.size() == 1) { - // If there is only one range covering the whole table we have nothing more to check. - continue; - } - std::string min_key = end_keys.front(); - std::string max_key = end_keys[end_keys.size() - 2]; for (const auto& min_max_key : min_max_keys) { ASSERT_TRUE( (min_key < min_max_key.first || min_key > min_max_key.second) && diff --git a/src/yb/yql/pggate/ybc_pggate.cc b/src/yb/yql/pggate/ybc_pggate.cc index 44fed259ebd1..6bfd6a53bcaf 100644 --- a/src/yb/yql/pggate/ybc_pggate.cc +++ b/src/yb/yql/pggate/ybc_pggate.cc @@ -2056,10 +2056,6 @@ YBCStatus YBCGetTableKeyRanges( } auto& encoded_table_range_slices = res->encoded_range_end_keys; - if (!is_forward) { - return ToYBCStatus( - STATUS(NotSupported, "YBCGetTableKeyRanges is not supported yet for reverse order")); - } if (current_tserver_ht) { *current_tserver_ht = res->current_ht.ToUint64();