Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

only give counter resource to tablets that never had normal metrics #4829

Merged
merged 2 commits into from
May 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 90 additions & 0 deletions ydb/core/mind/hive/hive_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4507,6 +4507,96 @@ Y_UNIT_TEST_SUITE(THiveTest) {
UNIT_ASSERT_LE(movedToFirstNode, TABLETS_PER_NODE / 2);
}

// Scenario: start a NUM_NODES-node runtime, create NUM_TABLETS Dummy tablets, report low
// CPU/memory usage for every one of them, wait for a balancer-out event, and then verify that
// (a) the tablet-to-node distribution reported by Hive is unchanged and
// (b) the Counter metric in the first domain's stats is 0.
Y_UNIT_TEST(TestHiveNoBalancingWithLowResourceUsage) {
static constexpr ui64 NUM_NODES = 5;
static constexpr ui64 NUM_TABLETS = 100;
TTestBasicRuntime runtime(NUM_NODES, false);
// Hive config for this test: kick cooldown and resource-change reaction periods are set to 0
// and the metrics window size to 1.
Setup(runtime, true, 1, [](TAppPrepare& app) {
app.HiveConfig.SetTabletKickCooldownPeriod(0);
app.HiveConfig.SetResourceChangeReactionPeriod(0);
app.HiveConfig.SetMetricsWindowSize(1);
});
// Node id of runtime node 0; used below to turn node ids into 0-based array indices.
const int nodeBase = runtime.GetNodeId(0);
TActorId senderA = runtime.AllocateEdgeActor();
const ui64 hiveTablet = MakeDefaultHiveID(0);
const ui64 testerTablet = MakeDefaultHiveID(1);

// Asks Hive for its tablet list and groups the tablet ids by node index
// (node id minus nodeBase). Asserts that every index falls in [0, NUM_NODES).
auto getDistribution = [hiveTablet, nodeBase, senderA, &runtime]() -> std::array<std::vector<ui64>, NUM_NODES> {
std::array<std::vector<ui64>, NUM_NODES> nodeTablets = {};
{
runtime.SendToPipe(hiveTablet, senderA, new TEvHive::TEvRequestHiveInfo());
TAutoPtr<IEventHandle> handle;
TEvHive::TEvResponseHiveInfo* response = runtime.GrabEdgeEventRethrow<TEvHive::TEvResponseHiveInfo>(handle);
for (const NKikimrHive::TTabletInfo& tablet : response->Record.GetTablets()) {
UNIT_ASSERT_C(((int)tablet.GetNodeID() - nodeBase >= 0) && (tablet.GetNodeID() - nodeBase < NUM_NODES),
"nodeId# " << tablet.GetNodeID() << " nodeBase# " << nodeBase);
nodeTablets[tablet.GetNodeID() - nodeBase].push_back(tablet.GetTabletID());
}
}
return nodeTablets;
};

CreateTestBootstrapper(runtime, CreateTestTabletInfo(hiveTablet, TTabletTypes::Hive), &CreateDefaultHive);

// Wait for creation of nodes: dispatch until NUM_NODES TEvLocal::EvStatus events have been seen.
{
TDispatchOptions options;
options.FinalEvents.emplace_back(TEvLocal::EvStatus, NUM_NODES);
runtime.DispatchEvents(options);
}

// Create the tablets one at a time (owner index 100500 + i, object id i) and make sure each
// one is up before creating the next.
TTabletTypes::EType tabletType = TTabletTypes::Dummy;
std::vector<ui64> tablets;
tablets.reserve(NUM_TABLETS);
for (size_t i = 0; i < NUM_TABLETS; ++i) {
THolder<TEvHive::TEvCreateTablet> ev(new TEvHive::TEvCreateTablet(testerTablet, 100500 + i, tabletType, BINDED_CHANNELS));
ev->Record.SetObjectId(i);
ui64 tabletId = SendCreateTestTablet(runtime, hiveTablet, testerTablet, std::move(ev), 0, true);
MakeSureTabletIsUp(runtime, tabletId, 0);
tablets.push_back(tabletId);
}

// Snapshot of the placement before any metrics are reported.
auto initialDistribution = getDistribution();

// Report small metrics for every tablet. A deterministic random provider (seed 777) picks,
// per tablet, between two profiles: CPU 1001 / memory 150'000 or CPU 999 / memory 100'000.
// NOTE(review): presumably the two profiles sit just above and just below Hive's thresholds
// for treating a metric as meaningful; the units implied by the inline comments are not
// verifiable from this file - confirm against the Hive metrics code.
auto rand = CreateDeterministicRandomProvider(777);
for (auto tablet : tablets) {
THolder<TEvHive::TEvTabletMetrics> metrics = MakeHolder<TEvHive::TEvTabletMetrics>();
NKikimrHive::TTabletMetrics* metric = metrics->Record.AddTabletMetrics();
metric->SetTabletID(tablet);
if (rand->GenRand() % 2) {
metric->MutableResourceUsage()->SetCPU(1001); // 1% core
metric->MutableResourceUsage()->SetMemory(150'000); // 150kb
} else {
metric->MutableResourceUsage()->SetCPU(999);
metric->MutableResourceUsage()->SetMemory(100'000);
}

runtime.SendToPipe(hiveTablet, senderA, metrics.Release());
}

// Dispatch until an NHive::TEvPrivate::EvBalancerOut event is observed; a 10-second TDuration
// is passed as the second argument (presumably the dispatch time limit - confirm).
{
TDispatchOptions options;
options.FinalEvents.emplace_back(NHive::TEvPrivate::EvBalancerOut);
runtime.DispatchEvents(options, TDuration::Seconds(10));
}

// Check that balancer moved no tablets
auto newDistribution = getDistribution();

UNIT_ASSERT_EQUAL(initialDistribution, newDistribution);

// Request domain stats with metrics included and check that the Counter metric of the first
// domain entry is 0.
{
auto request = std::make_unique<TEvHive::TEvRequestHiveDomainStats>();
request->Record.SetReturnMetrics(true);
runtime.SendToPipe(hiveTablet, senderA, request.release());
TAutoPtr<IEventHandle> handle;
TEvHive::TEvResponseHiveDomainStats* response = runtime.GrabEdgeEventRethrow<TEvHive::TEvResponseHiveDomainStats>(handle);
ui64 totalCounter = response->Record.GetDomainStats(0).GetMetrics().GetCounter();
UNIT_ASSERT_VALUES_EQUAL(totalCounter, 0);
}
}

Y_UNIT_TEST(TestHiveBalancerWithImmovableTablets) {
static constexpr ui64 TABLETS_PER_NODE = 10;
TTestBasicRuntime runtime(3, false);
Expand Down
18 changes: 12 additions & 6 deletions ydb/core/mind/hive/tablet_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -398,14 +398,21 @@ TResourceRawValues TTabletInfo::GetResourceMaximumValues() const {
}
}

i64 TTabletInfo::GetCounterValue(const NKikimrTabletBase::TMetrics& metrics, const TVector<i64>& allowedMetricIds) {
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kCPUFieldNumber) != allowedMetricIds.end() && THive::IsValidMetricsCPU(metrics)) {
i64 TTabletInfo::GetCounterValue() const {
const auto& allowedMetricIds = GetTabletAllowedMetricIds();
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kCPUFieldNumber) != allowedMetricIds.end()
&& (ResourceMetricsAggregates.MaximumCPU.GetAllTimeMaximum() > 0
|| ResourceValues.GetCPU() > 0)) {
return 0;
}
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kMemoryFieldNumber) != allowedMetricIds.end() && THive::IsValidMetricsMemory(metrics)) {
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kMemoryFieldNumber) != allowedMetricIds.end()
&& (ResourceMetricsAggregates.MaximumMemory.GetAllTimeMaximum() > 0
|| ResourceValues.GetMemory() > 0)) {
return 0;
}
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kNetworkFieldNumber) != allowedMetricIds.end() && THive::IsValidMetricsNetwork(metrics)) {
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kNetworkFieldNumber) != allowedMetricIds.end()
&& (ResourceMetricsAggregates.MaximumNetwork.GetAllTimeMaximum() > 0
|| ResourceValues.GetNetwork() > 0)) {
return 0;
}
return 1;
Expand Down Expand Up @@ -446,8 +453,7 @@ void TTabletInfo::FilterRawValues(TResourceNormalizedValues& values) const {
}

// Stores the tablet's counter value into ResourceValues via SetCounter.
// NOTE(review): this is a diff rendering without +/- markers, so old and new lines appear
// together. Judging by the "12 additions & 6 deletions" summary for this file and the
// GetCounterValue signature change shown for tablet_info.h, the two-statement form that calls
// the two-argument GetCounterValue looks like the removed code and the single SetCounter call
// with the argument-less GetCounterValue() looks like its replacement - confirm against the
// actual diff.
void TTabletInfo::ActualizeCounter() {
auto value = GetCounterValue(ResourceValues, GetTabletAllowedMetricIds());
ResourceValues.SetCounter(value);
ResourceValues.SetCounter(GetCounterValue());
}

const TNodeFilter& TTabletInfo::GetNodeFilter() const {
Expand Down
2 changes: 1 addition & 1 deletion ydb/core/mind/hive/tablet_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ struct TTabletInfo {
void UpdateResourceUsage(const NKikimrTabletBase::TMetrics& metrics);
TResourceRawValues GetResourceCurrentValues() const;
TResourceRawValues GetResourceMaximumValues() const;
static i64 GetCounterValue(const NKikimrTabletBase::TMetrics& metrics, const TVector<i64>& allowedMetricIds);
i64 GetCounterValue() const;
void FilterRawValues(TResourceRawValues& values) const;
void FilterRawValues(TResourceNormalizedValues& values) const;
void ActualizeCounter();
Expand Down
1 change: 1 addition & 0 deletions ydb/core/protos/metrics.proto
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ package NKikimrMetricsProto;
// Serialized state of a maximum-value tracker. TMaximumValueVariableWindowUI64 in
// ydb/core/util/metrics.h uses this message as its TProto.
message TMaximumValueUI64 {
// Start of the current bucket; SetValue reads it through TInstant::MilliSeconds(), i.e. it is
// interpreted as a millisecond timestamp.
optional uint64 LastBucketStartTime = 1;
// Stored values; SetValue checks whether this list is empty when deciding how to handle
// buckets (presumably one entry per time bucket - confirm).
repeated uint64 Values = 2;
// Running maximum maintained by SetValue as max(value, AllTimeMaximum). It is only updated
// once AllTimeMaximum or the tracker's MaximumValue is already above zero, which the code
// comments as "ignoring initial value".
optional uint64 AllTimeMaximum = 3;
}
3 changes: 3 additions & 0 deletions ydb/core/util/metrics.h
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,9 @@ class TMaximumValueVariableWindowUI64 : public NKikimrMetricsProto::TMaximumValu
using TProto = NKikimrMetricsProto::TMaximumValueUI64;

void SetValue(TType value, TInstant now = TInstant::Now()) {
if (TProto::GetAllTimeMaximum() > 0 || MaximumValue > 0) { // ignoring initial value
TProto::SetAllTimeMaximum(std::max(value, TProto::GetAllTimeMaximum()));
}
TDuration elapsedCurrentBucket = now - TInstant::MilliSeconds(TProto::GetLastBucketStartTime());
if (TProto::ValuesSize() == 0 || elapsedCurrentBucket >= BucketDuration) {
size_t bucketsPassed = 0;
Expand Down
Loading