Skip to content

Commit

Permalink
only give counter resource to tablets that never had normal metrics (y…
Browse files Browse the repository at this point in the history
  • Loading branch information
vporyadke committed May 24, 2024
1 parent 2631f5b commit 6deb648
Show file tree
Hide file tree
Showing 5 changed files with 107 additions and 7 deletions.
90 changes: 90 additions & 0 deletions ydb/core/mind/hive/hive_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4507,6 +4507,96 @@ Y_UNIT_TEST_SUITE(THiveTest) {
UNIT_ASSERT_LE(movedToFirstNode, TABLETS_PER_NODE / 2);
}

// Scenario: NUM_TABLETS dummy tablets are created on a NUM_NODES-node cluster,
// every tablet reports small CPU/memory usage, and the test waits for the
// balancer to finish. It then checks that
//   1. the per-node tablet distribution is unchanged, and
//   2. the Counter metric reported in the first domain's stats is 0.
Y_UNIT_TEST(TestHiveNoBalancingWithLowResourceUsage) {
    static constexpr ui64 NUM_NODES = 5;
    static constexpr ui64 NUM_TABLETS = 100;
    TTestBasicRuntime runtime(NUM_NODES, false);
    Setup(runtime, true, 1, [](TAppPrepare& app) {
        app.HiveConfig.SetTabletKickCooldownPeriod(0);
        app.HiveConfig.SetResourceChangeReactionPeriod(0);
        app.HiveConfig.SetMetricsWindowSize(1);
    });
    const int nodeBase = runtime.GetNodeId(0);
    TActorId senderA = runtime.AllocateEdgeActor();
    const ui64 hiveTablet = MakeDefaultHiveID();
    const ui64 testerTablet = MakeTabletID(false, 1);

    // Requests the tablet list from Hive and groups tablet ids by zero-based
    // node index (node id minus the id of node 0).
    auto getDistribution = [hiveTablet, nodeBase, senderA, &runtime]() -> std::array<std::vector<ui64>, NUM_NODES> {
        std::array<std::vector<ui64>, NUM_NODES> nodeTablets = {};
        {
            runtime.SendToPipe(hiveTablet, senderA, new TEvHive::TEvRequestHiveInfo());
            TAutoPtr<IEventHandle> handle;
            TEvHive::TEvResponseHiveInfo* response = runtime.GrabEdgeEventRethrow<TEvHive::TEvResponseHiveInfo>(handle);
            for (const NKikimrHive::TTabletInfo& tablet : response->Record.GetTablets()) {
                // Compute the index once in a signed 64-bit type so that the range
                // check and the array access use exactly the same value and no
                // signed/unsigned conversion is involved in the comparison.
                const i64 nodeIndex = static_cast<i64>(tablet.GetNodeID()) - nodeBase;
                UNIT_ASSERT_C(nodeIndex >= 0 && nodeIndex < static_cast<i64>(NUM_NODES),
                              "nodeId# " << tablet.GetNodeID() << " nodeBase# " << nodeBase);
                nodeTablets[nodeIndex].push_back(tablet.GetTabletID());
            }
        }
        return nodeTablets;
    };

    CreateTestBootstrapper(runtime, CreateTestTabletInfo(hiveTablet, TTabletTypes::Hive), &CreateDefaultHive);

    // wait for creation of nodes
    {
        TDispatchOptions options;
        options.FinalEvents.emplace_back(TEvLocal::EvStatus, NUM_NODES);
        runtime.DispatchEvents(options);
    }

    TTabletTypes::EType tabletType = TTabletTypes::Dummy;
    std::vector<ui64> tablets;
    tablets.reserve(NUM_TABLETS);
    for (size_t i = 0; i < NUM_TABLETS; ++i) {
        THolder<TEvHive::TEvCreateTablet> ev(new TEvHive::TEvCreateTablet(testerTablet, 100500 + i, tabletType, BINDED_CHANNELS));
        ev->Record.SetObjectId(i);
        ui64 tabletId = SendCreateTestTablet(runtime, hiveTablet, testerTablet, std::move(ev), 0, true);
        MakeSureTabletIsUp(runtime, tabletId, 0);
        tablets.push_back(tabletId);
    }

    auto initialDistribution = getDistribution();

    // report small metrics for every tablet; a fixed-seed generator picks one of
    // two low-usage profiles per tablet so the run is reproducible
    auto rng = CreateDeterministicRandomProvider(777);
    for (auto tablet : tablets) {
        THolder<TEvHive::TEvTabletMetrics> metrics = MakeHolder<TEvHive::TEvTabletMetrics>();
        NKikimrHive::TTabletMetrics* metric = metrics->Record.AddTabletMetrics();
        metric->SetTabletID(tablet);
        if (rng->GenRand() % 2) {
            metric->MutableResourceUsage()->SetCPU(1001); // 1% core
            metric->MutableResourceUsage()->SetMemory(150'000); // 150kb
        } else {
            metric->MutableResourceUsage()->SetCPU(999);
            metric->MutableResourceUsage()->SetMemory(100'000);
        }

        runtime.SendToPipe(hiveTablet, senderA, metrics.Release());
    }

    // wait until the balancer reports completion
    {
        TDispatchOptions options;
        options.FinalEvents.emplace_back(NHive::TEvPrivate::EvBalancerOut);
        runtime.DispatchEvents(options, TDuration::Seconds(10));
    }

    // Check that balancer moved no tablets
    auto newDistribution = getDistribution();

    UNIT_ASSERT_EQUAL(initialDistribution, newDistribution);

    // Check the Counter metric aggregated in the domain stats
    {
        auto request = std::make_unique<TEvHive::TEvRequestHiveDomainStats>();
        request->Record.SetReturnMetrics(true);
        runtime.SendToPipe(hiveTablet, senderA, request.release());
        TAutoPtr<IEventHandle> handle;
        TEvHive::TEvResponseHiveDomainStats* response = runtime.GrabEdgeEventRethrow<TEvHive::TEvResponseHiveDomainStats>(handle);
        // Indexing an empty repeated field is not a checked operation, so fail
        // with a readable message instead of reading element 0 blindly.
        UNIT_ASSERT_C(response->Record.DomainStatsSize() > 0, "Hive returned no domain stats");
        ui64 totalCounter = response->Record.GetDomainStats(0).GetMetrics().GetCounter();
        UNIT_ASSERT_VALUES_EQUAL(totalCounter, 0);
    }
}

Y_UNIT_TEST(TestHiveBalancerWithImmovableTablets) {
static constexpr ui64 TABLETS_PER_NODE = 10;
TTestBasicRuntime runtime(3, false);
Expand Down
18 changes: 12 additions & 6 deletions ydb/core/mind/hive/tablet_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -398,14 +398,21 @@ TResourceRawValues TTabletInfo::GetResourceMaximumValues() const {
}
}

i64 TTabletInfo::GetCounterValue(const NKikimrTabletBase::TMetrics& metrics, const TVector<i64>& allowedMetricIds) {
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kCPUFieldNumber) != allowedMetricIds.end() && THive::IsValidMetricsCPU(metrics)) {
i64 TTabletInfo::GetCounterValue() const {
const auto& allowedMetricIds = GetTabletAllowedMetricIds();
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kCPUFieldNumber) != allowedMetricIds.end()
&& (ResourceMetricsAggregates.MaximumCPU.GetAllTimeMaximum() > 0
|| ResourceValues.GetCPU() > 0)) {
return 0;
}
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kMemoryFieldNumber) != allowedMetricIds.end() && THive::IsValidMetricsMemory(metrics)) {
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kMemoryFieldNumber) != allowedMetricIds.end()
&& (ResourceMetricsAggregates.MaximumMemory.GetAllTimeMaximum() > 0
|| ResourceValues.GetMemory() > 0)) {
return 0;
}
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kNetworkFieldNumber) != allowedMetricIds.end() && THive::IsValidMetricsNetwork(metrics)) {
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kNetworkFieldNumber) != allowedMetricIds.end()
&& (ResourceMetricsAggregates.MaximumNetwork.GetAllTimeMaximum() > 0
|| ResourceValues.GetNetwork() > 0)) {
return 0;
}
return 1;
Expand Down Expand Up @@ -446,8 +453,7 @@ void TTabletInfo::FilterRawValues(TResourceNormalizedValues& values) const {
}

// Recomputes the counter value via GetCounterValue and stores it in the
// Counter field of ResourceValues.
// NOTE(review): this hunk lists both variants of the body: the two-argument
// GetCounterValue(ResourceValues, GetTabletAllowedMetricIds()) call with its
// temporary, and the argument-less GetCounterValue() call. Presumably only the
// latter remains after the change (the header hunk declares
// `i64 GetCounterValue() const`) — confirm against the actual file.
void TTabletInfo::ActualizeCounter() {
auto value = GetCounterValue(ResourceValues, GetTabletAllowedMetricIds());
ResourceValues.SetCounter(value);
ResourceValues.SetCounter(GetCounterValue());
}

const TNodeFilter& TTabletInfo::GetNodeFilter() const {
Expand Down
2 changes: 1 addition & 1 deletion ydb/core/mind/hive/tablet_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ struct TTabletInfo {
void UpdateResourceUsage(const NKikimrTabletBase::TMetrics& metrics);
TResourceRawValues GetResourceCurrentValues() const;
TResourceRawValues GetResourceMaximumValues() const;
static i64 GetCounterValue(const NKikimrTabletBase::TMetrics& metrics, const TVector<i64>& allowedMetricIds);
i64 GetCounterValue() const;
void FilterRawValues(TResourceRawValues& values) const;
void FilterRawValues(TResourceNormalizedValues& values) const;
void ActualizeCounter();
Expand Down
1 change: 1 addition & 0 deletions ydb/core/protos/metrics.proto
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ package NKikimrMetricsProto;
// Serialized state of a bucketed maximum-value tracker; it is the base class of
// TMaximumValueVariableWindowUI64 in ydb/core/util/metrics.h.
message TMaximumValueUI64 {
// Start time of the current bucket. SetValue() reads it back through
// TInstant::MilliSeconds(), so the value is in milliseconds.
optional uint64 LastBucketStartTime = 1;
// Stored values; presumably one entry per bucket of the window — confirm
// against the rest of SetValue(), which is not fully visible here.
repeated uint64 Values = 2;
// Largest value passed to SetValue(). It is only updated once either this
// field or the tracker's current MaximumValue is already positive, which
// makes the tracker ignore its initial value.
optional uint64 AllTimeMaximum = 3;
}
3 changes: 3 additions & 0 deletions ydb/core/util/metrics.h
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,9 @@ class TMaximumValueVariableWindowUI64 : public NKikimrMetricsProto::TMaximumValu
using TProto = NKikimrMetricsProto::TMaximumValueUI64;

void SetValue(TType value, TInstant now = TInstant::Now()) {
if (TProto::GetAllTimeMaximum() > 0 || MaximumValue > 0) { // ignoring initial value
TProto::SetAllTimeMaximum(std::max(value, TProto::GetAllTimeMaximum()));
}
TDuration elapsedCurrentBucket = now - TInstant::MilliSeconds(TProto::GetLastBucketStartTime());
if (TProto::ValuesSize() == 0 || elapsedCurrentBucket >= BucketDuration) {
size_t bucketsPassed = 0;
Expand Down

0 comments on commit 6deb648

Please sign in to comment.