Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

only give counter resource to tablets that never had normal metrics #4763

Merged
merged 2 commits into from
May 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 90 additions & 0 deletions ydb/core/mind/hive/hive_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4594,6 +4594,96 @@ Y_UNIT_TEST_SUITE(THiveTest) {
UNIT_ASSERT_LE(movedToFirstNode, TABLETS_PER_NODE / 2);
}

// Checks that Hive's balancer leaves tablet placement untouched when tablets report
// only small CPU/memory usage, and that the domain-wide Counter metric returned by
// Hive is 0 after those metrics have been reported.
Y_UNIT_TEST(TestHiveNoBalancingWithLowResourceUsage) {
    static constexpr ui64 NUM_NODES = 5;
    static constexpr ui64 NUM_TABLETS = 100;
    TTestBasicRuntime runtime(NUM_NODES, false);
    // NOTE(review): zero cooldown/reaction periods and a metrics window of 1 are presumably
    // chosen so that reported metrics take effect immediately — confirm against the Hive config.
    Setup(runtime, true, 1, [](TAppPrepare& app) {
        app.HiveConfig.SetTabletKickCooldownPeriod(0);
        app.HiveConfig.SetResourceChangeReactionPeriod(0);
        app.HiveConfig.SetMetricsWindowSize(1);
    });
    const int nodeBase = runtime.GetNodeId(0);
    TActorId senderA = runtime.AllocateEdgeActor();
    const ui64 hiveTablet = MakeDefaultHiveID();
    const ui64 testerTablet = MakeTabletID(false, 1);

    // Asks Hive for its tablet list and groups tablet ids by node.
    // Slot i of the result holds the tablets reported on node (nodeBase + i).
    auto getDistribution = [hiveTablet, nodeBase, senderA, &runtime]() -> std::array<std::vector<ui64>, NUM_NODES> {
        std::array<std::vector<ui64>, NUM_NODES> nodeTablets = {};
        {
            runtime.SendToPipe(hiveTablet, senderA, new TEvHive::TEvRequestHiveInfo());
            TAutoPtr<IEventHandle> handle;
            TEvHive::TEvResponseHiveInfo* response = runtime.GrabEdgeEventRethrow<TEvHive::TEvResponseHiveInfo>(handle);
            for (const NKikimrHive::TTabletInfo& tablet : response->Record.GetTablets()) {
                // Every tablet must be placed on one of the NUM_NODES test nodes,
                // otherwise the index below would be out of range.
                UNIT_ASSERT_C(((int)tablet.GetNodeID() - nodeBase >= 0) && (tablet.GetNodeID() - nodeBase < NUM_NODES),
                        "nodeId# " << tablet.GetNodeID() << " nodeBase# " << nodeBase);
                nodeTablets[tablet.GetNodeID() - nodeBase].push_back(tablet.GetTabletID());
            }
        }
        return nodeTablets;
    };

    CreateTestBootstrapper(runtime, CreateTestTabletInfo(hiveTablet, TTabletTypes::Hive), &CreateDefaultHive);

    // wait for creation of nodes
    {
        TDispatchOptions options;
        options.FinalEvents.emplace_back(TEvLocal::EvStatus, NUM_NODES);
        runtime.DispatchEvents(options);
    }

    // Create NUM_TABLETS dummy tablets, each with its own object id, and wait for each to come up.
    TTabletTypes::EType tabletType = TTabletTypes::Dummy;
    std::vector<ui64> tablets;
    tablets.reserve(NUM_TABLETS);
    for (size_t i = 0; i < NUM_TABLETS; ++i) {
        THolder<TEvHive::TEvCreateTablet> ev(new TEvHive::TEvCreateTablet(testerTablet, 100500 + i, tabletType, BINDED_CHANNELS));
        ev->Record.SetObjectId(i);
        ui64 tabletId = SendCreateTestTablet(runtime, hiveTablet, testerTablet, std::move(ev), 0, true);
        MakeSureTabletIsUp(runtime, tabletId, 0);
        tablets.push_back(tabletId);
    }

    // Placement before any metrics are reported; used as the baseline below.
    auto initialDistribution = getDistribution();

    // Report small metrics for every tablet; a fixed-seed random provider picks
    // one of two low-usage profiles per tablet so the run is reproducible.
    auto rand = CreateDeterministicRandomProvider(777);
    for (auto tablet : tablets) {
        THolder<TEvHive::TEvTabletMetrics> metrics = MakeHolder<TEvHive::TEvTabletMetrics>();
        NKikimrHive::TTabletMetrics* metric = metrics->Record.AddTabletMetrics();
        metric->SetTabletID(tablet);
        if (rand->GenRand() % 2) {
            metric->MutableResourceUsage()->SetCPU(1001); // 1% core
            metric->MutableResourceUsage()->SetMemory(150'000); // 150kb
        } else {
            metric->MutableResourceUsage()->SetCPU(999);
            metric->MutableResourceUsage()->SetMemory(100'000);
        }

        runtime.SendToPipe(hiveTablet, senderA, metrics.Release());
    }

    // Run the runtime until a balancer-out event is seen (bounded by 10 seconds).
    {
        TDispatchOptions options;
        options.FinalEvents.emplace_back(NHive::TEvPrivate::EvBalancerOut);
        runtime.DispatchEvents(options, TDuration::Seconds(10));
    }

    // Check that balancer moved no tablets
    auto newDistribution = getDistribution();

    UNIT_ASSERT_EQUAL(initialDistribution, newDistribution);

    // Every tablet has reported metrics above, so the test expects the
    // aggregated Counter metric of the first domain to be 0.
    {
        auto request = std::make_unique<TEvHive::TEvRequestHiveDomainStats>();
        request->Record.SetReturnMetrics(true);
        runtime.SendToPipe(hiveTablet, senderA, request.release());
        TAutoPtr<IEventHandle> handle;
        TEvHive::TEvResponseHiveDomainStats* response = runtime.GrabEdgeEventRethrow<TEvHive::TEvResponseHiveDomainStats>(handle);
        ui64 totalCounter = response->Record.GetDomainStats(0).GetMetrics().GetCounter();
        UNIT_ASSERT_VALUES_EQUAL(totalCounter, 0);
    }
}

Y_UNIT_TEST(TestHiveBalancerWithImmovableTablets) {
static constexpr ui64 TABLETS_PER_NODE = 10;
TTestBasicRuntime runtime(3, false);
Expand Down
18 changes: 12 additions & 6 deletions ydb/core/mind/hive/tablet_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -426,14 +426,21 @@ TResourceRawValues TTabletInfo::GetResourceMaximumValues() const {
}
}

i64 TTabletInfo::GetCounterValue(const NKikimrTabletBase::TMetrics& metrics, const TVector<i64>& allowedMetricIds) {
if (HasAllowedMetric(allowedMetricIds, EResourceToBalance::CPU) && THive::IsValidMetricsCPU(metrics)) {
i64 TTabletInfo::GetCounterValue() const {
const auto& allowedMetricIds = GetTabletAllowedMetricIds();
if (HasAllowedMetric(allowedMetricIds, EResourceToBalance::CPU)
&& (ResourceMetricsAggregates.MaximumCPU.GetAllTimeMaximum() > 0
|| ResourceValues.GetCPU() > 0)) {
return 0;
}
if (HasAllowedMetric(allowedMetricIds, EResourceToBalance::Memory) && THive::IsValidMetricsMemory(metrics)) {
if (HasAllowedMetric(allowedMetricIds, EResourceToBalance::Memory)
&& (ResourceMetricsAggregates.MaximumMemory.GetAllTimeMaximum() > 0
|| ResourceValues.GetMemory() > 0)) {
return 0;
}
if (HasAllowedMetric(allowedMetricIds, EResourceToBalance::Network) && THive::IsValidMetricsNetwork(metrics)) {
if (HasAllowedMetric(allowedMetricIds, EResourceToBalance::Network)
&& (ResourceMetricsAggregates.MaximumNetwork.GetAllTimeMaximum() > 0
|| ResourceValues.GetNetwork() > 0)) {
return 0;
}
return 1;
Expand Down Expand Up @@ -474,8 +481,7 @@ void TTabletInfo::FilterRawValues(TResourceNormalizedValues& values) const {
}

// Recomputes this tablet's Counter value with GetCounterValue() and stores it
// in ResourceValues.
void TTabletInfo::ActualizeCounter() {
    // A single store is enough: the earlier call through the two-argument
    // GetCounterValue overload was overwritten by this one right away.
    ResourceValues.SetCounter(GetCounterValue());
}

const TNodeFilter& TTabletInfo::GetNodeFilter() const {
Expand Down
2 changes: 1 addition & 1 deletion ydb/core/mind/hive/tablet_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ struct TTabletInfo {
void UpdateResourceUsage(const NKikimrTabletBase::TMetrics& metrics);
TResourceRawValues GetResourceCurrentValues() const;
TResourceRawValues GetResourceMaximumValues() const;
static i64 GetCounterValue(const NKikimrTabletBase::TMetrics& metrics, const TVector<i64>& allowedMetricIds);
i64 GetCounterValue() const;
void FilterRawValues(TResourceRawValues& values) const;
void FilterRawValues(TResourceNormalizedValues& values) const;
void ActualizeCounter();
Expand Down
1 change: 1 addition & 0 deletions ydb/core/protos/metrics.proto
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ package NKikimrMetricsProto;
// Serialized state of a maximum-value tracker; ydb/core/util/metrics.h uses this
// message as TProto in TMaximumValueVariableWindowUI64.
message TMaximumValueUI64 {
// Start time of the current bucket; metrics.h reads it through TInstant::MilliSeconds().
optional uint64 LastBucketStartTime = 1;
// Per-bucket values — presumably the maximum observed in each bucket; confirm in metrics.h.
repeated uint64 Values = 2;
// Largest value passed to SetValue() so far. SetValue() raises it to max(value, current),
// but only once AllTimeMaximum or the tracker's MaximumValue is already non-zero,
// so the initial value is ignored.
optional uint64 AllTimeMaximum = 3;
}
3 changes: 3 additions & 0 deletions ydb/core/util/metrics.h
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,9 @@ class TMaximumValueVariableWindowUI64 : public NKikimrMetricsProto::TMaximumValu
using TProto = NKikimrMetricsProto::TMaximumValueUI64;

void SetValue(TType value, TInstant now = TInstant::Now()) {
if (TProto::GetAllTimeMaximum() > 0 || MaximumValue > 0) { // ignoring initial value
Copy link
Member

@adameat adameat May 23, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why don't you put it at the end of the function?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's easier to make this check before we update MaximumValue.
I want to ignore the initial value because it is set by Hive, not actually reported by the tablet.

TProto::SetAllTimeMaximum(std::max(value, TProto::GetAllTimeMaximum()));
}
TDuration elapsedCurrentBucket = now - TInstant::MilliSeconds(TProto::GetLastBucketStartTime());
if (TProto::ValuesSize() == 0 || elapsedCurrentBucket >= BucketDuration) {
size_t bucketsPassed = 0;
Expand Down
Loading