Skip to content

Commit

Permalink
skip irrelevant tablets in balancing (#6033)
Browse files Browse the repository at this point in the history
  • Loading branch information
vporyadke authored Jul 1, 2024
1 parent 88e58ec commit 823fbb3
Show file tree
Hide file tree
Showing 4 changed files with 96 additions and 7 deletions.
2 changes: 1 addition & 1 deletion ydb/core/mind/hive/balancer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ class THiveBalancer : public NActors::TActorBootstrapped<THiveBalancer>, public
for (TTabletInfo* tablet : nodeTablets) {
if (tablet->IsGoodForBalancer(now) &&
(!Settings.FilterObjectId || tablet->GetObjectId() == *Settings.FilterObjectId) &&
tablet->HasAllowedMetric(Settings.ResourceToBalance)) {
tablet->HasMetric(Settings.ResourceToBalance)) {
tablet->UpdateWeight();
tablets.emplace_back(tablet);
}
Expand Down
80 changes: 80 additions & 0 deletions ydb/core/mind/hive/hive_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4606,6 +4606,86 @@ Y_UNIT_TEST_SUITE(THiveTest) {
UNIT_ASSERT_LE(movedToFirstNode, TABLETS_PER_NODE / 2);
}

Y_UNIT_TEST(TestHiveBalancerDifferentResources2) {
    // Tablets on the first node report high CPU usage but cannot be moved
    // (they are created with AllowedNodeIDs restricted to that node);
    // other tablets have default low metrics.
    // Nothing should be moved!
    static constexpr ui64 TABLETS_PER_NODE = 5;
    static constexpr ui64 NUM_NODES = 3;
    TTestBasicRuntime runtime(NUM_NODES, false);
    Setup(runtime, true, 1, [](TAppPrepare& app) {
        // Zero periods so the balancer is not held back by cooldowns during the test.
        app.HiveConfig.SetTabletKickCooldownPeriod(0);
        app.HiveConfig.SetResourceChangeReactionPeriod(0);
    });
    const int nodeBase = runtime.GetNodeId(0);
    TActorId senderA = runtime.AllocateEdgeActor();
    const ui64 hiveTablet = MakeDefaultHiveID();
    const ui64 testerTablet = MakeTabletID(false, 1);

    // Asks Hive for its tablet list and returns, per node index
    // (node id minus nodeBase), the sorted ids of the tablets placed there.
    auto getDistribution = [hiveTablet, nodeBase, senderA, &runtime]() -> std::array<std::vector<ui64>, NUM_NODES> {
        std::array<std::vector<ui64>, NUM_NODES> nodeTablets = {};
        {
            runtime.SendToPipe(hiveTablet, senderA, new TEvHive::TEvRequestHiveInfo());
            TAutoPtr<IEventHandle> handle;
            TEvHive::TEvResponseHiveInfo* response = runtime.GrabEdgeEventRethrow<TEvHive::TEvResponseHiveInfo>(handle);
            for (const NKikimrHive::TTabletInfo& tablet : response->Record.GetTablets()) {
                // The index must fit the array below, so bound it by NUM_NODES
                // rather than by a separate literal.
                UNIT_ASSERT_C((static_cast<int>(tablet.GetNodeID()) - nodeBase >= 0) && (tablet.GetNodeID() - nodeBase < NUM_NODES),
                              "nodeId# " << tablet.GetNodeID() << " nodeBase# " << nodeBase);
                nodeTablets[tablet.GetNodeID() - nodeBase].push_back(tablet.GetTabletID());
            }
        }
        // Sort so that two snapshots can be compared irrespective of response order.
        for (auto& tablets : nodeTablets) {
            std::sort(tablets.begin(), tablets.end());
        }
        return nodeTablets;
    };

    CreateTestBootstrapper(runtime, CreateTestTabletInfo(hiveTablet, TTabletTypes::Hive), &CreateDefaultHive);

    // wait for creation of nodes
    {
        TDispatchOptions options;
        options.FinalEvents.emplace_back(TEvLocal::EvStatus, NUM_NODES);
        runtime.DispatchEvents(options);
    }

    TTabletTypes::EType tabletType = TTabletTypes::Dummy;
    for (size_t i = 0; i < NUM_NODES * TABLETS_PER_NODE; ++i) {
        THolder<TEvHive::TEvCreateTablet> ev(new TEvHive::TEvCreateTablet(testerTablet, 100500 + i, tabletType, BINDED_CHANNELS));
        ev->Record.SetObjectId(i);
        // Every NUM_NODES-th tablet is restricted to the first node only.
        if (i % NUM_NODES == 0) {
            ev->Record.AddAllowedNodeIDs(nodeBase);
        }
        ui64 tabletId = SendCreateTestTablet(runtime, hiveTablet, testerTablet, std::move(ev), 0, true);
        MakeSureTabletIsUp(runtime, tabletId, 0);
    }

    // Check initial distribution
    auto initialDistribution = getDistribution();
    for (size_t i = 0; i < NUM_NODES; ++i) {
        UNIT_ASSERT_VALUES_EQUAL(initialDistribution[i].size(), TABLETS_PER_NODE);
    }

    // Report CPU usage for every tablet currently on the first node
    // (presumably exactly the restricted ones, given the even distribution asserted above).
    for (auto tabletId : initialDistribution[0]) {
        THolder<TEvHive::TEvTabletMetrics> metrics = MakeHolder<TEvHive::TEvTabletMetrics>();
        NKikimrHive::TTabletMetrics* cpu = metrics->Record.AddTabletMetrics();
        cpu->SetTabletID(tabletId);
        cpu->MutableResourceUsage()->SetCPU(1'500'000);

        runtime.SendToPipe(hiveTablet, senderA, metrics.Release());
    }

    // Wait (up to 10 seconds) for a balancer run to finish.
    {
        TDispatchOptions options;
        options.FinalEvents.emplace_back(NHive::TEvPrivate::EvBalancerOut);
        runtime.DispatchEvents(options, TDuration::Seconds(10));
    }

    // Check nothing happened
    auto newDistribution = getDistribution();
    UNIT_ASSERT_EQUAL(initialDistribution, newDistribution);
}

Y_UNIT_TEST(TestHiveNoBalancingWithLowResourceUsage) {
static constexpr ui64 NUM_NODES = 5;
static constexpr ui64 NUM_TABLETS = 100;
Expand Down
7 changes: 7 additions & 0 deletions ydb/core/mind/hive/tablet_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,13 @@ bool TTabletInfo::HasAllowedMetric(EResourceToBalance resource) const {
return HasAllowedMetric(GetTabletAllowedMetricIds(), resource);
}

// Returns true only when `resource` passes HasAllowedMetric() for this tablet
// and the tablet's current raw value for that resource is strictly positive.
bool TTabletInfo::HasMetric(EResourceToBalance resource) const {
    const bool allowed = HasAllowedMetric(resource);
    // Short-circuit: the current values are read only for an allowed metric.
    return allowed && ExtractResourceUsage(GetResourceCurrentValues(), resource) > 0;
}

void TTabletInfo::UpdateResourceUsage(const NKikimrTabletBase::TMetrics& metrics) {
TInstant now = TActivationContext::Now();
const TVector<i64>& allowedMetricIds(GetTabletAllowedMetricIds());
Expand Down
14 changes: 8 additions & 6 deletions ydb/core/mind/hive/tablet_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,7 @@ struct TTabletInfo {
const TVector<i64>& GetTabletAllowedMetricIds() const;
static bool HasAllowedMetric(const TVector<i64>& allowedMetricIds, EResourceToBalance resource);
bool HasAllowedMetric(EResourceToBalance resource) const;
bool HasMetric(EResourceToBalance resource) const;

void UpdateResourceUsage(const NKikimrTabletBase::TMetrics& metrics);
TResourceRawValues GetResourceCurrentValues() const;
Expand All @@ -244,13 +245,14 @@ struct TTabletInfo {
return ExtractResourceUsage(normValues, resource);
}

static double ExtractResourceUsage(const TResourceNormalizedValues& normValues, EResourceToBalance resource = EResourceToBalance::ComputeResources) {
template <typename ResourceTypes>
static auto ExtractResourceUsage(const ResourceTypes& values, EResourceToBalance resource = EResourceToBalance::ComputeResources) {
switch (resource) {
case EResourceToBalance::CPU: return std::get<NMetrics::EResource::CPU>(normValues);
case EResourceToBalance::Memory: return std::get<NMetrics::EResource::Memory>(normValues);
case EResourceToBalance::Network: return std::get<NMetrics::EResource::Network>(normValues);
case EResourceToBalance::Counter: return std::get<NMetrics::EResource::Counter>(normValues);
case EResourceToBalance::ComputeResources: return max(normValues);
case EResourceToBalance::CPU: return std::get<NMetrics::EResource::CPU>(values);
case EResourceToBalance::Memory: return std::get<NMetrics::EResource::Memory>(values);
case EResourceToBalance::Network: return std::get<NMetrics::EResource::Network>(values);
case EResourceToBalance::Counter: return std::get<NMetrics::EResource::Counter>(values);
case EResourceToBalance::ComputeResources: return max(values);
}
}

Expand Down

0 comments on commit 823fbb3

Please sign in to comment.