Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tablets with disabled metrics ignore metric-related balancers #4052

Merged
merged 3 commits into from
Apr 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion ydb/core/mind/hive/balancer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,9 @@ class THiveBalancer : public NActors::TActorBootstrapped<THiveBalancer>, public
std::vector<TTabletInfo*> tablets;
tablets.reserve(nodeTablets.size());
for (TTabletInfo* tablet : nodeTablets) {
if (tablet->IsGoodForBalancer(now) && (!Settings.FilterObjectId || tablet->GetObjectId() == *Settings.FilterObjectId)) {
if (tablet->IsGoodForBalancer(now) &&
(!Settings.FilterObjectId || tablet->GetObjectId() == *Settings.FilterObjectId) &&
tablet->HasAllowedMetric(Settings.ResourceToBalance)) {
tablet->UpdateWeight();
tablets.emplace_back(tablet);
}
Expand Down
4 changes: 2 additions & 2 deletions ydb/core/mind/hive/hive.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ constexpr std::size_t EBalancerTypeSize = static_cast<std::size_t>(EBalancerType
TString EBalancerTypeName(EBalancerType value);

enum class EResourceToBalance {
Dominant,
ComputeResources,
Counter,
CPU,
Memory,
Expand Down Expand Up @@ -292,7 +292,7 @@ struct TBalancerSettings {
bool RecheckOnFinish = false;
ui64 MaxInFlight = 1;
const std::vector<TNodeId> FilterNodeIds = {};
EResourceToBalance ResourceToBalance = EResourceToBalance::Dominant;
EResourceToBalance ResourceToBalance = EResourceToBalance::ComputeResources;
std::optional<TFullObjectId> FilterObjectId;
};

Expand Down
2 changes: 1 addition & 1 deletion ydb/core/mind/hive/hive_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2373,7 +2373,7 @@ void THive::Handle(TEvPrivate::TEvProcessTabletBalancer::TPtr&) {
case EResourceToBalance::Network:
balancerType = EBalancerType::ScatterNetwork;
break;
case EResourceToBalance::Dominant:
case EResourceToBalance::ComputeResources:
balancerType = EBalancerType::Scatter;
break;
}
Expand Down
6 changes: 3 additions & 3 deletions ydb/core/mind/hive/node_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ i32 TNodeInfo::GetPriorityForTablet(const TTabletInfo& tablet) const {

bool TNodeInfo::IsAbleToRunTablet(const TTabletInfo& tablet, TTabletDebugState* debugState) const {
if (tablet.IsAliveOnLocal(Local)) {
return !IsOverloaded();
return !(IsOverloaded() && tablet.HasAllowedMetric(EResourceToBalance::ComputeResources));
}
if (tablet.IsLeader()) {
const TLeaderTabletInfo& leader = tablet.AsLeader();
Expand Down Expand Up @@ -280,7 +280,7 @@ bool TNodeInfo::IsAbleToRunTablet(const TTabletInfo& tablet, TTabletDebugState*
}
}

if (tablet.IsAlive() && IsOverloaded()) {
if (tablet.IsAlive() && IsOverloaded() && tablet.HasAllowedMetric(EResourceToBalance::ComputeResources)) {
// we don't move already running tablet to another overloaded node
if (debugState) {
debugState->NodesWithoutResources++;
Expand Down Expand Up @@ -434,7 +434,7 @@ double TNodeInfo::GetNodeUsageForTablet(const TTabletInfo& tablet) const {

double TNodeInfo::GetNodeUsage(const TResourceNormalizedValues& normValues, EResourceToBalance resource) const {
double usage = TTabletInfo::ExtractResourceUsage(normValues, resource);
if (resource == EResourceToBalance::Dominant && AveragedNodeTotalUsage.IsValueStable()) {
if (resource == EResourceToBalance::ComputeResources && AveragedNodeTotalUsage.IsValueStable()) {
usage = std::max(usage, AveragedNodeTotalUsage.GetValue());
}
return usage;
Expand Down
4 changes: 2 additions & 2 deletions ydb/core/mind/hive/node_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -243,9 +243,9 @@ struct TNodeInfo {
}

double GetNodeUsageForTablet(const TTabletInfo& tablet) const;
double GetNodeUsage(EResourceToBalance resource = EResourceToBalance::Dominant) const;
double GetNodeUsage(EResourceToBalance resource = EResourceToBalance::ComputeResources) const;
double GetNodeUsage(const TResourceNormalizedValues& normValues,
EResourceToBalance resource = EResourceToBalance::Dominant) const;
EResourceToBalance resource = EResourceToBalance::ComputeResources) const;

ui64 GetTabletsRunningByType(TTabletTypes::EType tabletType) const;

Expand Down
49 changes: 37 additions & 12 deletions ydb/core/mind/hive/tablet_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -310,12 +310,37 @@ const TVector<i64>& TTabletInfo::GetTabletAllowedMetricIds() const {
return Hive.GetTabletTypeAllowedMetricIds(GetLeader().Type);
}

bool TTabletInfo::HasAllowedMetric(const TVector<i64>& allowedMetricIds, EResourceToBalance resource) {
switch (resource) {
case EResourceToBalance::ComputeResources: {
auto isComputeMetric = [](i64 metricId) {
return metricId == NKikimrTabletBase::TMetrics::kCPUFieldNumber ||
metricId == NKikimrTabletBase::TMetrics::kMemoryFieldNumber ||
metricId == NKikimrTabletBase::TMetrics::kNetworkFieldNumber;
};
return AnyOf(allowedMetricIds.begin(), allowedMetricIds.end(), isComputeMetric);
}
case EResourceToBalance::Counter:
return true;
case EResourceToBalance::CPU:
return Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kCPUFieldNumber) != allowedMetricIds.end();
case EResourceToBalance::Memory:
return Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kMemoryFieldNumber) != allowedMetricIds.end();
case EResourceToBalance::Network:
return Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kNetworkFieldNumber) != allowedMetricIds.end();
}
}

// Instance-level convenience overload: looks up this tablet's allowed metric
// ids via GetTabletAllowedMetricIds() and forwards them, together with
// `resource`, to the static HasAllowedMetric overload.
bool TTabletInfo::HasAllowedMetric(EResourceToBalance resource) const {
    const TVector<i64>& ownAllowedMetricIds = GetTabletAllowedMetricIds();
    return HasAllowedMetric(ownAllowedMetricIds, resource);
}

void TTabletInfo::UpdateResourceUsage(const NKikimrTabletBase::TMetrics& metrics) {
TInstant now = TActivationContext::Now();
const TVector<i64>& allowedMetricIds(GetTabletAllowedMetricIds());
auto before = ResourceValues;
auto maximum = GetResourceMaximumValues();
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kCPUFieldNumber) != allowedMetricIds.end()) {
if (HasAllowedMetric(allowedMetricIds, EResourceToBalance::CPU)) {
if (metrics.HasCPU()) {
if (metrics.GetCPU() > static_cast<ui64>(std::get<NMetrics::EResource::CPU>(maximum))) {
BLOG_W("Ignoring too high CPU metric (" << metrics.GetCPU() << ") for tablet " << ToString());
Expand All @@ -325,7 +350,7 @@ void TTabletInfo::UpdateResourceUsage(const NKikimrTabletBase::TMetrics& metrics
}
}
}
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kMemoryFieldNumber) != allowedMetricIds.end()) {
if (HasAllowedMetric(allowedMetricIds, EResourceToBalance::Memory)) {
if (metrics.HasMemory()) {
if (metrics.GetMemory() > static_cast<ui64>(std::get<NMetrics::EResource::Memory>(maximum))) {
BLOG_W("Ignoring too high Memory metric (" << metrics.GetMemory() << ") for tablet " << ToString());
Expand All @@ -335,7 +360,7 @@ void TTabletInfo::UpdateResourceUsage(const NKikimrTabletBase::TMetrics& metrics
}
}
}
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kNetworkFieldNumber) != allowedMetricIds.end()) {
if (HasAllowedMetric(allowedMetricIds, EResourceToBalance::Network)) {
if (metrics.HasNetwork()) {
if (metrics.GetNetwork() > static_cast<ui64>(std::get<NMetrics::EResource::Network>(maximum))) {
BLOG_W("Ignoring too high Network metric (" << metrics.GetNetwork() << ") for tablet " << ToString());
Expand Down Expand Up @@ -396,13 +421,13 @@ TResourceRawValues TTabletInfo::GetResourceMaximumValues() const {
}

i64 TTabletInfo::GetCounterValue(const NKikimrTabletBase::TMetrics& metrics, const TVector<i64>& allowedMetricIds) {
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kCPUFieldNumber) != allowedMetricIds.end() && THive::IsValidMetricsCPU(metrics)) {
if (HasAllowedMetric(allowedMetricIds, EResourceToBalance::CPU) && THive::IsValidMetricsCPU(metrics)) {
return 0;
}
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kMemoryFieldNumber) != allowedMetricIds.end() && THive::IsValidMetricsMemory(metrics)) {
if (HasAllowedMetric(allowedMetricIds, EResourceToBalance::Memory) && THive::IsValidMetricsMemory(metrics)) {
return 0;
}
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kNetworkFieldNumber) != allowedMetricIds.end() && THive::IsValidMetricsNetwork(metrics)) {
if (HasAllowedMetric(allowedMetricIds, EResourceToBalance::Network) && THive::IsValidMetricsNetwork(metrics)) {
return 0;
}
return 1;
Expand All @@ -414,13 +439,13 @@ void TTabletInfo::FilterRawValues(TResourceRawValues& values) const {
if (metrics.GetCounter() == 0) {
std::get<NMetrics::EResource::Counter>(values) = 0;
}
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kCPUFieldNumber) == allowedMetricIds.end() || !THive::IsValidMetricsCPU(metrics)) {
if (!HasAllowedMetric(allowedMetricIds, EResourceToBalance::CPU) || !THive::IsValidMetricsCPU(metrics)) {
std::get<NMetrics::EResource::CPU>(values) = 0;
}
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kMemoryFieldNumber) == allowedMetricIds.end() || !THive::IsValidMetricsMemory(metrics)) {
if (!HasAllowedMetric(allowedMetricIds, EResourceToBalance::Memory) || !THive::IsValidMetricsMemory(metrics)) {
std::get<NMetrics::EResource::Memory>(values) = 0;
}
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kNetworkFieldNumber) == allowedMetricIds.end() || !THive::IsValidMetricsNetwork(metrics)) {
if (!HasAllowedMetric(allowedMetricIds, EResourceToBalance::Network) || !THive::IsValidMetricsNetwork(metrics)) {
std::get<NMetrics::EResource::Network>(values) = 0;
}
}
Expand All @@ -431,13 +456,13 @@ void TTabletInfo::FilterRawValues(TResourceNormalizedValues& values) const {
if (metrics.GetCounter() == 0) {
std::get<NMetrics::EResource::Counter>(values) = 0;
}
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kCPUFieldNumber) == allowedMetricIds.end() || !THive::IsValidMetricsCPU(metrics)) {
if (!HasAllowedMetric(allowedMetricIds, EResourceToBalance::CPU) || !THive::IsValidMetricsCPU(metrics)) {
std::get<NMetrics::EResource::CPU>(values) = 0;
}
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kMemoryFieldNumber) == allowedMetricIds.end() || !THive::IsValidMetricsMemory(metrics)) {
if (!HasAllowedMetric(allowedMetricIds, EResourceToBalance::Memory) || !THive::IsValidMetricsMemory(metrics)) {
std::get<NMetrics::EResource::Memory>(values) = 0;
}
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kNetworkFieldNumber) == allowedMetricIds.end() || !THive::IsValidMetricsNetwork(metrics)) {
if (!HasAllowedMetric(allowedMetricIds, EResourceToBalance::Network) || !THive::IsValidMetricsNetwork(metrics)) {
std::get<NMetrics::EResource::Network>(values) = 0;
}
}
Expand Down
8 changes: 5 additions & 3 deletions ydb/core/mind/hive/tablet_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,8 @@ struct TTabletInfo {
void BecomeUnknown(TNodeInfo* node);
bool Kick();
const TVector<i64>& GetTabletAllowedMetricIds() const;
static bool HasAllowedMetric(const TVector<i64>& allowedMetricIds, EResourceToBalance resource);
bool HasAllowedMetric(EResourceToBalance resource) const;

void UpdateResourceUsage(const NKikimrTabletBase::TMetrics& metrics);
TResourceRawValues GetResourceCurrentValues() const;
Expand All @@ -237,18 +239,18 @@ struct TTabletInfo {
void ActualizeCounter();

template <typename ResourcesType>
static double GetUsage(const ResourcesType& current, const ResourcesType& maximum, EResourceToBalance resource = EResourceToBalance::Dominant) {
static double GetUsage(const ResourcesType& current, const ResourcesType& maximum, EResourceToBalance resource = EResourceToBalance::ComputeResources) {
auto normValues = NormalizeRawValues(current, maximum);
return ExtractResourceUsage(normValues, resource);
}

static double ExtractResourceUsage(const TResourceNormalizedValues& normValues, EResourceToBalance resource = EResourceToBalance::Dominant) {
static double ExtractResourceUsage(const TResourceNormalizedValues& normValues, EResourceToBalance resource = EResourceToBalance::ComputeResources) {
switch (resource) {
case EResourceToBalance::CPU: return std::get<NMetrics::EResource::CPU>(normValues);
case EResourceToBalance::Memory: return std::get<NMetrics::EResource::Memory>(normValues);
case EResourceToBalance::Network: return std::get<NMetrics::EResource::Network>(normValues);
case EResourceToBalance::Counter: return std::get<NMetrics::EResource::Counter>(normValues);
case EResourceToBalance::Dominant: return max(normValues);
case EResourceToBalance::ComputeResources: return max(normValues);
}
}

Expand Down
Loading