From 9d8fecd3f7231fcfc19f1f72816171ad66af04ec Mon Sep 17 00:00:00 2001 From: vporyadke Date: Wed, 31 Jan 2024 15:15:07 +0100 Subject: [PATCH] storage balancer info in hive ui & sensors KIKIMR-2190 (#1200) --- ydb/core/mind/hive/hive_impl.cpp | 5 ++++- ydb/core/mind/hive/hive_impl.h | 6 ++++++ ydb/core/mind/hive/hive_ut.cpp | 1 + ydb/core/mind/hive/monitoring.cpp | 15 +++++++++++++++ ydb/core/mind/hive/tx__update_tablet_groups.cpp | 1 + ydb/core/protos/config.proto | 9 +++++---- ydb/core/protos/counters_hive.proto | 2 ++ 7 files changed, 34 insertions(+), 5 deletions(-) diff --git a/ydb/core/mind/hive/hive_impl.cpp b/ydb/core/mind/hive/hive_impl.cpp index 76fdcdeb1c92..a866ea08e3e4 100644 --- a/ydb/core/mind/hive/hive_impl.cpp +++ b/ydb/core/mind/hive/hive_impl.cpp @@ -2382,7 +2382,9 @@ void THive::Handle(TEvPrivate::TEvProcessStorageBalancer::TPtr&) { auto& [stats, pool] = *std::max_element(poolStats.begin(), poolStats.end(), [](const TPoolStat& lhs, const TPoolStat& rhs) { return lhs.first.Scatter < rhs.first.Scatter; }); - if (stats.Scatter > GetMinStorageScatterToBalance()) { + StorageScatter = stats.Scatter; + TabletCounters->Simple()[NHive::COUNTER_STORAGE_SCATTER].Set(StorageScatter * 100); + if (StorageScatter > GetMinStorageScatterToBalance()) { BLOG_D("Storage Scatter = " << stats.Scatter << " in pool " << pool.Name << ", starting StorageBalancer"); ui64 numReassigns = 1; auto it = pool.Groups.find(stats.MaxUsageGroupId); @@ -2395,6 +2397,7 @@ void THive::Handle(TEvPrivate::TEvProcessStorageBalancer::TPtr&) { } StartHiveStorageBalancer({ .NumReassigns = numReassigns, + .MaxInFlight = GetStorageBalancerInflight(), .StoragePool = pool.Name }); } diff --git a/ydb/core/mind/hive/hive_impl.h b/ydb/core/mind/hive/hive_impl.h index 33da8f8f31e8..587219a41ef0 100644 --- a/ydb/core/mind/hive/hive_impl.h +++ b/ydb/core/mind/hive/hive_impl.h @@ -233,6 +233,7 @@ class THive : public TActor, public TTabletExecutedFlat, public THiveShar friend class TTxTabletOwnersReply; friend class TTxRequestTabletOwners; friend class TTxUpdateTabletsObject; + friend class TTxUpdateTabletGroups; friend class TDeleteTabletActor; @@ -327,6 +328,7 @@ class THive : public TActor, public TTabletExecutedFlat, public THiveShar ui32 DataCenters = 1; ui32 RegisteredDataCenters = 1; TObjectDistributions ObjectDistributions; + double StorageScatter = 0; bool AreWeRootHive() const { return RootHiveId == HiveId; } bool AreWeSubDomainHive() const { return RootHiveId != HiveId; } @@ -925,6 +927,10 @@ class THive : public TActor, public TTabletExecutedFlat, public THiveShar return CurrentConfig.GetMinStorageScatterToBalance(); } + ui64 GetStorageBalancerInflight() const { + return CurrentConfig.GetStorageBalancerInflight(); + } + static void ActualizeRestartStatistics(google::protobuf::RepeatedField& restartTimestamps, ui64 barrier); static ui64 GetRestartsPerPeriod(const google::protobuf::RepeatedField& restartTimestamps, ui64 barrier); static bool IsSystemTablet(TTabletTypes::EType type); diff --git a/ydb/core/mind/hive/hive_ut.cpp b/ydb/core/mind/hive/hive_ut.cpp index ca238edb21ec..58367a60403f 100644 --- a/ydb/core/mind/hive/hive_ut.cpp +++ b/ydb/core/mind/hive/hive_ut.cpp @@ -2786,6 +2786,7 @@ Y_UNIT_TEST_SUITE(THiveTest) { Setup(runtime, true, 2, [](TAppPrepare& app) { app.HiveConfig.SetMinPeriodBetweenReassign(0); app.HiveConfig.SetStorageInfoRefreshFrequency(200); + app.HiveConfig.SetMinStorageScatterToBalance(0.5); }); const ui64 hiveTablet = MakeDefaultHiveID(0); const ui64 testerTablet = MakeDefaultHiveID(1); diff --git a/ydb/core/mind/hive/monitoring.cpp b/ydb/core/mind/hive/monitoring.cpp index e3757b674d9d..09c8040aa082 100644 --- a/ydb/core/mind/hive/monitoring.cpp +++ b/ydb/core/mind/hive/monitoring.cpp @@ -828,6 +828,12 @@ class TTxMonEvent_Settings : public TTransactionBase { UpdateConfig(db, "MaxWarmUpPeriod"); UpdateConfig(db, "WarmUpEnabled"); UpdateConfig(db, "ObjectImbalanceToBalance"); + UpdateConfig(db, "ChannelBalanceStrategy"); + UpdateConfig(db, "MaxChannelHistorySize"); + UpdateConfig(db, "StorageInfoRefreshFrequency"); + UpdateConfig(db, "MinStorageScatterToBalance"); + UpdateConfig(db, "MinGroupUsageToBalance"); + UpdateConfig(db, "StorageBalancerInflight"); if (params.contains("BalancerIgnoreTabletTypes")) { TVector tabletTypeNames = SplitString(params.Get("BalancerIgnoreTabletTypes"), ";"); @@ -1111,6 +1117,12 @@ class TTxMonEvent_Settings : public TTransactionBase { ShowConfig(out, "MaxWarmUpPeriod"); ShowConfig(out, "WarmUpEnabled"); ShowConfig(out, "ObjectImbalanceToBalance"); + ShowConfig(out, "ChannelBalanceStrategy"); + ShowConfig(out, "MaxChannelHistorySize"); + ShowConfig(out, "StorageInfoRefreshFrequency"); + ShowConfig(out, "MinStorageScatterToBalance"); + ShowConfig(out, "MinGroupUsageToBalance"); + ShowConfig(out, "StorageBalancerInflight"); ShowConfigForBalancerIgnoreTabletTypes(out); out << "
"; @@ -1405,6 +1417,7 @@ class TTxMonEvent_Landing : public TTransactionBase { out << "Network"; out << "MaxUsage"; out << "Imbalance"; + out << "Storage"; out << "
"; out << "
"; out << ""; @@ -1896,6 +1909,7 @@ function fillDataShort(result) { $('#waitQueue').html(result.WaitQueueSize); $('#maxUsage').html(result.MaxUsage); $('#objectImbalance').html(result.ObjectImbalance); + $('#storageScatter').html(result.StorageScatter); $('#resourceTotalCounter').html(result.ResourceTotal.Counter); $('#resourceTotalCPU').html(result.ResourceTotal.CPU); @@ -2205,6 +2219,7 @@ class TTxMonEvent_LandingData : public TTransactionBase { jsonData["ScatterHtml"]["Memory"] = std::get(scatterHtml); jsonData["ScatterHtml"]["Network"] = std::get(scatterHtml); jsonData["ObjectImbalance"] = GetValueWithColoredGlyph(Self->ObjectDistributions.GetMaxImbalance(), Self->GetObjectImbalanceToBalance()); + jsonData["StorageScatter"] = GetValueWithColoredGlyph(Self->StorageScatter, Self->GetMinStorageScatterToBalance()); jsonData["WarmUp"] = Self->WarmUp; if (Cgi.Get("nodes") == "1") { diff --git a/ydb/core/mind/hive/tx__update_tablet_groups.cpp b/ydb/core/mind/hive/tx__update_tablet_groups.cpp index be2900c19f7d..5511594bf32e 100644 --- a/ydb/core/mind/hive/tx__update_tablet_groups.cpp +++ b/ydb/core/mind/hive/tx__update_tablet_groups.cpp @@ -285,6 +285,7 @@ class TTxUpdateTabletGroups : public TTransactionBase { // Use best effort to kill currently running tablet SideEffects.Register(CreateTabletKiller(TabletId, /* nodeId */ 0, tablet->KnownGeneration)); } + SideEffects.Callback([counters = Self->TabletCounters] { counters->Cumulative()[NHive::COUNTER_TABLETS_STORAGE_REASSIGNED].Increment(1); }); } if (needToIncreaseGeneration) { tablet->IncreaseGeneration(); diff --git a/ydb/core/protos/config.proto b/ydb/core/protos/config.proto index cf31b90b8c4a..9f0f5a1e52a7 100644 --- a/ydb/core/protos/config.proto +++ b/ydb/core/protos/config.proto @@ -1375,9 +1375,9 @@ message THiveConfig { } enum EHiveChannelBalanceStrategy { - HIVE_CHANNEL_BALANCE_STRATEGY_HEAVIEST = 1; - HIVE_CHANNEL_BALANCE_STRATEGY_RANDOM = 2; - HIVE_CHANNEL_BALANCE_STRATEGY_WEIGHTED_RANDOM = 3; + HIVE_CHANNEL_BALANCE_STRATEGY_HEAVIEST = 0; + HIVE_CHANNEL_BALANCE_STRATEGY_RANDOM = 1; + HIVE_CHANNEL_BALANCE_STRATEGY_WEIGHTED_RANDOM = 2; } enum EHiveNodeSelectStrategy { @@ -1460,8 +1460,9 @@ message THiveConfig { optional EHiveChannelBalanceStrategy ChannelBalanceStrategy = 68 [default = HIVE_CHANNEL_BALANCE_STRATEGY_WEIGHTED_RANDOM]; optional uint64 MaxChannelHistorySize = 69 [default = 200]; optional uint64 StorageInfoRefreshFrequency = 70 [default = 600000]; // send a query to BSC every x milliseconds - optional double MinStorageScatterToBalance = 71 [default = 0.5]; + optional double MinStorageScatterToBalance = 71 [default = 999]; // storage balancer trigger is disabled by default optional double MinGroupUsageToBalance = 72 [default = 0.1]; + optional uint64 StorageBalancerInflight = 73 [default = 1]; } message TColumnShardConfig { diff --git a/ydb/core/protos/counters_hive.proto b/ydb/core/protos/counters_hive.proto index a936a0aa7c51..47dc89070b26 100644 --- a/ydb/core/protos/counters_hive.proto +++ b/ydb/core/protos/counters_hive.proto @@ -28,6 +28,7 @@ enum ESimpleCounters { COUNTER_BALANCE_OBJECT_IMBALANCE = 18 [(CounterOpts) = {Name: "BalanceObjectImbalance"}]; COUNTER_IMBALANCED_OBJECTS = 19 [(CounterOpts) = {Name: "ImbalancedObjects"}]; COUNTER_WORST_OBJECT_VARIANCE = 20 [(CounterOpts) = {Name: "WorstObjectVariance"}]; + COUNTER_STORAGE_SCATTER = 21 [(CounterOpts) = {Name: "StorageScatter"}]; } enum ECumulativeCounters { @@ -44,6 +45,7 @@ enum ECumulativeCounters { COUNTER_SUGGESTED_SCALE_UP = 10 [(CounterOpts) = {Name: "SuggestedScaleUp"}]; COUNTER_SUGGESTED_SCALE_DOWN = 11 [(CounterOpts) = {Name: "SuggestedScaleDown"}]; COUNTER_STORAGE_BALANCER_EXECUTED = 12 [(CounterOpts) = {Name: "StorageBalancerExecuted"}]; + COUNTER_TABLETS_STORAGE_REASSIGNED = 13 [(CounterOpts) = {Name: "TabletsStorageReassigned"}]; } enum EPercentileCounters {
BalancerRunsMoves