From 6deb64873bcab745db98276f541f123164b42ed3 Mon Sep 17 00:00:00 2001 From: vporyadke Date: Thu, 23 May 2024 15:10:33 +0200 Subject: [PATCH 1/2] only give counter resource to tablets that never had normal metrics (#4763) --- ydb/core/mind/hive/hive_ut.cpp | 90 ++++++++++++++++++++++++++++++ ydb/core/mind/hive/tablet_info.cpp | 18 ++++-- ydb/core/mind/hive/tablet_info.h | 2 +- ydb/core/protos/metrics.proto | 1 + ydb/core/util/metrics.h | 3 + 5 files changed, 107 insertions(+), 7 deletions(-) diff --git a/ydb/core/mind/hive/hive_ut.cpp b/ydb/core/mind/hive/hive_ut.cpp index 58367a60403f..eb8589d8a995 100644 --- a/ydb/core/mind/hive/hive_ut.cpp +++ b/ydb/core/mind/hive/hive_ut.cpp @@ -4507,6 +4507,96 @@ Y_UNIT_TEST_SUITE(THiveTest) { UNIT_ASSERT_LE(movedToFirstNode, TABLETS_PER_NODE / 2); } + Y_UNIT_TEST(TestHiveNoBalancingWithLowResourceUsage) { + static constexpr ui64 NUM_NODES = 5; + static constexpr ui64 NUM_TABLETS = 100; + TTestBasicRuntime runtime(NUM_NODES, false); + Setup(runtime, true, 1, [](TAppPrepare& app) { + app.HiveConfig.SetTabletKickCooldownPeriod(0); + app.HiveConfig.SetResourceChangeReactionPeriod(0); + app.HiveConfig.SetMetricsWindowSize(1); + }); + const int nodeBase = runtime.GetNodeId(0); + TActorId senderA = runtime.AllocateEdgeActor(); + const ui64 hiveTablet = MakeDefaultHiveID(); + const ui64 testerTablet = MakeTabletID(false, 1); + + auto getDistribution = [hiveTablet, nodeBase, senderA, &runtime]() -> std::array, NUM_NODES> { + std::array, NUM_NODES> nodeTablets = {}; + { + runtime.SendToPipe(hiveTablet, senderA, new TEvHive::TEvRequestHiveInfo()); + TAutoPtr handle; + TEvHive::TEvResponseHiveInfo* response = runtime.GrabEdgeEventRethrow(handle); + for (const NKikimrHive::TTabletInfo& tablet : response->Record.GetTablets()) { + UNIT_ASSERT_C(((int)tablet.GetNodeID() - nodeBase >= 0) && (tablet.GetNodeID() - nodeBase < NUM_NODES), + "nodeId# " << tablet.GetNodeID() << " nodeBase# " << nodeBase); + nodeTablets[tablet.GetNodeID() - nodeBase].push_back(tablet.GetTabletID()); + } + } + return nodeTablets; + }; + + CreateTestBootstrapper(runtime, CreateTestTabletInfo(hiveTablet, TTabletTypes::Hive), &CreateDefaultHive); + + // wait for creation of nodes + { + TDispatchOptions options; + options.FinalEvents.emplace_back(TEvLocal::EvStatus, NUM_NODES); + runtime.DispatchEvents(options); + } + + TTabletTypes::EType tabletType = TTabletTypes::Dummy; + std::vector tablets; + tablets.reserve(NUM_TABLETS); + for (size_t i = 0; i < NUM_TABLETS; ++i) { + THolder ev(new TEvHive::TEvCreateTablet(testerTablet, 100500 + i, tabletType, BINDED_CHANNELS)); + ev->Record.SetObjectId(i); + ui64 tabletId = SendCreateTestTablet(runtime, hiveTablet, testerTablet, std::move(ev), 0, true); + MakeSureTabletIsUp(runtime, tabletId, 0); + tablets.push_back(tabletId); + } + + auto initialDistribution = getDistribution(); + + // report small metrics for some tablets + auto rand = CreateDeterministicRandomProvider(777); + for (auto tablet : tablets) { + THolder metrics = MakeHolder(); + NKikimrHive::TTabletMetrics* metric = metrics->Record.AddTabletMetrics(); + metric->SetTabletID(tablet); + if (rand->GenRand() % 2) { + metric->MutableResourceUsage()->SetCPU(1001); // 1% core + metric->MutableResourceUsage()->SetMemory(150'000); // 150kb + } else { + metric->MutableResourceUsage()->SetCPU(999); + metric->MutableResourceUsage()->SetMemory(100'000); + } + + runtime.SendToPipe(hiveTablet, senderA, metrics.Release()); + } + + { + TDispatchOptions options; + options.FinalEvents.emplace_back(NHive::TEvPrivate::EvBalancerOut); + runtime.DispatchEvents(options, TDuration::Seconds(10)); + } + + // Check that balancer moved no tablets + auto newDistribution = getDistribution(); + + UNIT_ASSERT_EQUAL(initialDistribution, newDistribution); + + { + auto request = std::make_unique(); + request->Record.SetReturnMetrics(true); + runtime.SendToPipe(hiveTablet, senderA, request.release()); + TAutoPtr handle; + TEvHive::TEvResponseHiveDomainStats* response = runtime.GrabEdgeEventRethrow(handle); + ui64 totalCounter = response->Record.GetDomainStats(0).GetMetrics().GetCounter(); + UNIT_ASSERT_VALUES_EQUAL(totalCounter, 0); + } + } + Y_UNIT_TEST(TestHiveBalancerWithImmovableTablets) { static constexpr ui64 TABLETS_PER_NODE = 10; TTestBasicRuntime runtime(3, false); diff --git a/ydb/core/mind/hive/tablet_info.cpp b/ydb/core/mind/hive/tablet_info.cpp index 4598bface6ec..aa0dcdde9275 100644 --- a/ydb/core/mind/hive/tablet_info.cpp +++ b/ydb/core/mind/hive/tablet_info.cpp @@ -398,14 +398,21 @@ TResourceRawValues TTabletInfo::GetResourceMaximumValues() const { } } -i64 TTabletInfo::GetCounterValue(const NKikimrTabletBase::TMetrics& metrics, const TVector& allowedMetricIds) { - if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kCPUFieldNumber) != allowedMetricIds.end() && THive::IsValidMetricsCPU(metrics)) { +i64 TTabletInfo::GetCounterValue() const { + const auto& allowedMetricIds = GetTabletAllowedMetricIds(); + if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kCPUFieldNumber) != allowedMetricIds.end() + && (ResourceMetricsAggregates.MaximumCPU.GetAllTimeMaximum() > 0 + || ResourceValues.GetCPU() > 0)) { return 0; } - if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kMemoryFieldNumber) != allowedMetricIds.end() && THive::IsValidMetricsMemory(metrics)) { + if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kMemoryFieldNumber) != allowedMetricIds.end() + && (ResourceMetricsAggregates.MaximumMemory.GetAllTimeMaximum() > 0 + || ResourceValues.GetMemory() > 0)) { return 0; } - if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kNetworkFieldNumber) != allowedMetricIds.end() && THive::IsValidMetricsNetwork(metrics)) { + if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kNetworkFieldNumber) != allowedMetricIds.end() + && (ResourceMetricsAggregates.MaximumNetwork.GetAllTimeMaximum() > 0 + || ResourceValues.GetNetwork() > 0)) { return 0; } return 1; @@ -446,8 +453,7 @@ void TTabletInfo::FilterRawValues(TResourceNormalizedValues& values) const { } void TTabletInfo::ActualizeCounter() { - auto value = GetCounterValue(ResourceValues, GetTabletAllowedMetricIds()); - ResourceValues.SetCounter(value); + ResourceValues.SetCounter(GetCounterValue()); } const TNodeFilter& TTabletInfo::GetNodeFilter() const { diff --git a/ydb/core/mind/hive/tablet_info.h b/ydb/core/mind/hive/tablet_info.h index 2afb5f6010a8..8d722440b57d 100644 --- a/ydb/core/mind/hive/tablet_info.h +++ b/ydb/core/mind/hive/tablet_info.h @@ -230,7 +230,7 @@ struct TTabletInfo { void UpdateResourceUsage(const NKikimrTabletBase::TMetrics& metrics); TResourceRawValues GetResourceCurrentValues() const; TResourceRawValues GetResourceMaximumValues() const; - static i64 GetCounterValue(const NKikimrTabletBase::TMetrics& metrics, const TVector& allowedMetricIds); + i64 GetCounterValue() const; void FilterRawValues(TResourceRawValues& values) const; void FilterRawValues(TResourceNormalizedValues& values) const; void ActualizeCounter(); diff --git a/ydb/core/protos/metrics.proto b/ydb/core/protos/metrics.proto index 017014b8118b..d1e3f91d9430 100644 --- a/ydb/core/protos/metrics.proto +++ b/ydb/core/protos/metrics.proto @@ -3,4 +3,5 @@ package NKikimrMetricsProto; message TMaximumValueUI64 { optional uint64 LastBucketStartTime = 1; repeated uint64 Values = 2; + optional uint64 AllTimeMaximum = 3; } diff --git a/ydb/core/util/metrics.h b/ydb/core/util/metrics.h index cae931733faf..f822100aa395 100644 --- a/ydb/core/util/metrics.h +++ b/ydb/core/util/metrics.h @@ -395,6 +395,9 @@ class TMaximumValueVariableWindowUI64 : public NKikimrMetricsProto::TMaximumValu using TProto = NKikimrMetricsProto::TMaximumValueUI64; void SetValue(TType value, TInstant now = TInstant::Now()) { + if (TProto::GetAllTimeMaximum() > 0 || MaximumValue > 0) { // ignoring initial value + TProto::SetAllTimeMaximum(std::max(value, TProto::GetAllTimeMaximum())); + } TDuration elapsedCurrentBucket = now - TInstant::MilliSeconds(TProto::GetLastBucketStartTime()); if (TProto::ValuesSize() == 0 || elapsedCurrentBucket >= BucketDuration) { size_t bucketsPassed = 0; From 5d350b6c2325d0d5acaa66a24a85513cca43d0f6 Mon Sep 17 00:00:00 2001 From: Alexander Zalyalov Date: Fri, 24 May 2024 11:46:58 +0000 Subject: [PATCH 2/2] fix test build after cherrypick --- ydb/core/mind/hive/hive_ut.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ydb/core/mind/hive/hive_ut.cpp b/ydb/core/mind/hive/hive_ut.cpp index eb8589d8a995..d8eec27f09bd 100644 --- a/ydb/core/mind/hive/hive_ut.cpp +++ b/ydb/core/mind/hive/hive_ut.cpp @@ -4518,8 +4518,8 @@ Y_UNIT_TEST_SUITE(THiveTest) { }); const int nodeBase = runtime.GetNodeId(0); TActorId senderA = runtime.AllocateEdgeActor(); - const ui64 hiveTablet = MakeDefaultHiveID(); - const ui64 testerTablet = MakeTabletID(false, 1); + const ui64 hiveTablet = MakeDefaultHiveID(0); + const ui64 testerTablet = MakeDefaultHiveID(1); auto getDistribution = [hiveTablet, nodeBase, senderA, &runtime]() -> std::array, NUM_NODES> { std::array, NUM_NODES> nodeTablets = {};