Skip to content

Commit

Permalink
tablets with disabled metrics ignore metric-related balancers (ydb-pl…
Browse files Browse the repository at this point in the history
  • Loading branch information
vporyadke authored Apr 25, 2024
1 parent 7b8da5c commit 8a1ba16
Show file tree
Hide file tree
Showing 7 changed files with 53 additions and 24 deletions.
4 changes: 3 additions & 1 deletion ydb/core/mind/hive/balancer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,9 @@ class THiveBalancer : public NActors::TActorBootstrapped<THiveBalancer>, public
std::vector<TTabletInfo*> tablets;
tablets.reserve(nodeTablets.size());
for (TTabletInfo* tablet : nodeTablets) {
if (tablet->IsGoodForBalancer(now) && (!Settings.FilterObjectId || tablet->GetObjectId() == *Settings.FilterObjectId)) {
if (tablet->IsGoodForBalancer(now) &&
(!Settings.FilterObjectId || tablet->GetObjectId() == *Settings.FilterObjectId) &&
tablet->HasAllowedMetric(Settings.ResourceToBalance)) {
tablet->UpdateWeight();
tablets.emplace_back(tablet);
}
Expand Down
4 changes: 2 additions & 2 deletions ydb/core/mind/hive/hive.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ constexpr std::size_t EBalancerTypeSize = static_cast<std::size_t>(EBalancerType
TString EBalancerTypeName(EBalancerType value);

enum class EResourceToBalance {
Dominant,
ComputeResources,
Counter,
CPU,
Memory,
Expand Down Expand Up @@ -296,7 +296,7 @@ struct TBalancerSettings {
bool RecheckOnFinish = false;
ui64 MaxInFlight = 1;
const std::vector<TNodeId> FilterNodeIds = {};
EResourceToBalance ResourceToBalance = EResourceToBalance::Dominant;
EResourceToBalance ResourceToBalance = EResourceToBalance::ComputeResources;
std::optional<TFullObjectId> FilterObjectId;
};

Expand Down
2 changes: 1 addition & 1 deletion ydb/core/mind/hive/hive_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2373,7 +2373,7 @@ void THive::Handle(TEvPrivate::TEvProcessTabletBalancer::TPtr&) {
case EResourceToBalance::Network:
balancerType = EBalancerType::ScatterNetwork;
break;
case EResourceToBalance::Dominant:
case EResourceToBalance::ComputeResources:
balancerType = EBalancerType::Scatter;
break;
}
Expand Down
6 changes: 3 additions & 3 deletions ydb/core/mind/hive/node_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ i32 TNodeInfo::GetPriorityForTablet(const TTabletInfo& tablet) const {

bool TNodeInfo::IsAbleToRunTablet(const TTabletInfo& tablet, TTabletDebugState* debugState) const {
if (tablet.IsAliveOnLocal(Local)) {
return !IsOverloaded();
return !(IsOverloaded() && tablet.HasAllowedMetric(EResourceToBalance::ComputeResources));
}
if (tablet.IsLeader()) {
const TLeaderTabletInfo& leader = tablet.AsLeader();
Expand Down Expand Up @@ -280,7 +280,7 @@ bool TNodeInfo::IsAbleToRunTablet(const TTabletInfo& tablet, TTabletDebugState*
}
}

if (tablet.IsAlive() && IsOverloaded()) {
if (tablet.IsAlive() && IsOverloaded() && tablet.HasAllowedMetric(EResourceToBalance::ComputeResources)) {
// we don't move already running tablet to another overloaded node
if (debugState) {
debugState->NodesWithoutResources++;
Expand Down Expand Up @@ -434,7 +434,7 @@ double TNodeInfo::GetNodeUsageForTablet(const TTabletInfo& tablet) const {

double TNodeInfo::GetNodeUsage(const TResourceNormalizedValues& normValues, EResourceToBalance resource) const {
double usage = TTabletInfo::ExtractResourceUsage(normValues, resource);
if (resource == EResourceToBalance::Dominant && AveragedNodeTotalUsage.IsValueStable()) {
if (resource == EResourceToBalance::ComputeResources && AveragedNodeTotalUsage.IsValueStable()) {
usage = std::max(usage, AveragedNodeTotalUsage.GetValue());
}
return usage;
Expand Down
4 changes: 2 additions & 2 deletions ydb/core/mind/hive/node_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -243,9 +243,9 @@ struct TNodeInfo {
}

double GetNodeUsageForTablet(const TTabletInfo& tablet) const;
double GetNodeUsage(EResourceToBalance resource = EResourceToBalance::Dominant) const;
double GetNodeUsage(EResourceToBalance resource = EResourceToBalance::ComputeResources) const;
double GetNodeUsage(const TResourceNormalizedValues& normValues,
EResourceToBalance resource = EResourceToBalance::Dominant) const;
EResourceToBalance resource = EResourceToBalance::ComputeResources) const;

ui64 GetTabletsRunningByType(TTabletTypes::EType tabletType) const;

Expand Down
49 changes: 37 additions & 12 deletions ydb/core/mind/hive/tablet_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -310,12 +310,37 @@ const TVector<i64>& TTabletInfo::GetTabletAllowedMetricIds() const {
return Hive.GetTabletTypeAllowedMetricIds(GetLeader().Type);
}

bool TTabletInfo::HasAllowedMetric(const TVector<i64>& allowedMetricIds, EResourceToBalance resource) {
switch (resource) {
case EResourceToBalance::ComputeResources: {
auto isComputeMetric = [](i64 metricId) {
return metricId == NKikimrTabletBase::TMetrics::kCPUFieldNumber ||
metricId == NKikimrTabletBase::TMetrics::kMemoryFieldNumber ||
metricId == NKikimrTabletBase::TMetrics::kNetworkFieldNumber;
};
return AnyOf(allowedMetricIds.begin(), allowedMetricIds.end(), isComputeMetric);
}
case EResourceToBalance::Counter:
return true;
case EResourceToBalance::CPU:
return Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kCPUFieldNumber) != allowedMetricIds.end();
case EResourceToBalance::Memory:
return Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kMemoryFieldNumber) != allowedMetricIds.end();
case EResourceToBalance::Network:
return Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kNetworkFieldNumber) != allowedMetricIds.end();
}
}

// Instance-level convenience overload: runs the static check against the
// allowed metric ids returned by GetTabletAllowedMetricIds() for this tablet.
bool TTabletInfo::HasAllowedMetric(EResourceToBalance resource) const {
    const TVector<i64>& tabletAllowedIds = GetTabletAllowedMetricIds();
    return HasAllowedMetric(tabletAllowedIds, resource);
}

void TTabletInfo::UpdateResourceUsage(const NKikimrTabletBase::TMetrics& metrics) {
TInstant now = TActivationContext::Now();
const TVector<i64>& allowedMetricIds(GetTabletAllowedMetricIds());
auto before = ResourceValues;
auto maximum = GetResourceMaximumValues();
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kCPUFieldNumber) != allowedMetricIds.end()) {
if (HasAllowedMetric(allowedMetricIds, EResourceToBalance::CPU)) {
if (metrics.HasCPU()) {
if (metrics.GetCPU() > static_cast<ui64>(std::get<NMetrics::EResource::CPU>(maximum))) {
BLOG_W("Ignoring too high CPU metric (" << metrics.GetCPU() << ") for tablet " << ToString());
Expand All @@ -325,7 +350,7 @@ void TTabletInfo::UpdateResourceUsage(const NKikimrTabletBase::TMetrics& metrics
}
}
}
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kMemoryFieldNumber) != allowedMetricIds.end()) {
if (HasAllowedMetric(allowedMetricIds, EResourceToBalance::Memory)) {
if (metrics.HasMemory()) {
if (metrics.GetMemory() > static_cast<ui64>(std::get<NMetrics::EResource::Memory>(maximum))) {
BLOG_W("Ignoring too high Memory metric (" << metrics.GetMemory() << ") for tablet " << ToString());
Expand All @@ -335,7 +360,7 @@ void TTabletInfo::UpdateResourceUsage(const NKikimrTabletBase::TMetrics& metrics
}
}
}
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kNetworkFieldNumber) != allowedMetricIds.end()) {
if (HasAllowedMetric(allowedMetricIds, EResourceToBalance::Network)) {
if (metrics.HasNetwork()) {
if (metrics.GetNetwork() > static_cast<ui64>(std::get<NMetrics::EResource::Network>(maximum))) {
BLOG_W("Ignoring too high Network metric (" << metrics.GetNetwork() << ") for tablet " << ToString());
Expand Down Expand Up @@ -396,13 +421,13 @@ TResourceRawValues TTabletInfo::GetResourceMaximumValues() const {
}

i64 TTabletInfo::GetCounterValue(const NKikimrTabletBase::TMetrics& metrics, const TVector<i64>& allowedMetricIds) {
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kCPUFieldNumber) != allowedMetricIds.end() && THive::IsValidMetricsCPU(metrics)) {
if (HasAllowedMetric(allowedMetricIds, EResourceToBalance::CPU) && THive::IsValidMetricsCPU(metrics)) {
return 0;
}
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kMemoryFieldNumber) != allowedMetricIds.end() && THive::IsValidMetricsMemory(metrics)) {
if (HasAllowedMetric(allowedMetricIds, EResourceToBalance::Memory) && THive::IsValidMetricsMemory(metrics)) {
return 0;
}
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kNetworkFieldNumber) != allowedMetricIds.end() && THive::IsValidMetricsNetwork(metrics)) {
if (HasAllowedMetric(allowedMetricIds, EResourceToBalance::Network) && THive::IsValidMetricsNetwork(metrics)) {
return 0;
}
return 1;
Expand All @@ -414,13 +439,13 @@ void TTabletInfo::FilterRawValues(TResourceRawValues& values) const {
if (metrics.GetCounter() == 0) {
std::get<NMetrics::EResource::Counter>(values) = 0;
}
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kCPUFieldNumber) == allowedMetricIds.end() || !THive::IsValidMetricsCPU(metrics)) {
if (!HasAllowedMetric(allowedMetricIds, EResourceToBalance::CPU) || !THive::IsValidMetricsCPU(metrics)) {
std::get<NMetrics::EResource::CPU>(values) = 0;
}
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kMemoryFieldNumber) == allowedMetricIds.end() || !THive::IsValidMetricsMemory(metrics)) {
if (!HasAllowedMetric(allowedMetricIds, EResourceToBalance::Memory) || !THive::IsValidMetricsMemory(metrics)) {
std::get<NMetrics::EResource::Memory>(values) = 0;
}
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kNetworkFieldNumber) == allowedMetricIds.end() || !THive::IsValidMetricsNetwork(metrics)) {
if (!HasAllowedMetric(allowedMetricIds, EResourceToBalance::Network) || !THive::IsValidMetricsNetwork(metrics)) {
std::get<NMetrics::EResource::Network>(values) = 0;
}
}
Expand All @@ -431,13 +456,13 @@ void TTabletInfo::FilterRawValues(TResourceNormalizedValues& values) const {
if (metrics.GetCounter() == 0) {
std::get<NMetrics::EResource::Counter>(values) = 0;
}
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kCPUFieldNumber) == allowedMetricIds.end() || !THive::IsValidMetricsCPU(metrics)) {
if (!HasAllowedMetric(allowedMetricIds, EResourceToBalance::CPU) || !THive::IsValidMetricsCPU(metrics)) {
std::get<NMetrics::EResource::CPU>(values) = 0;
}
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kMemoryFieldNumber) == allowedMetricIds.end() || !THive::IsValidMetricsMemory(metrics)) {
if (!HasAllowedMetric(allowedMetricIds, EResourceToBalance::Memory) || !THive::IsValidMetricsMemory(metrics)) {
std::get<NMetrics::EResource::Memory>(values) = 0;
}
if (Find(allowedMetricIds, NKikimrTabletBase::TMetrics::kNetworkFieldNumber) == allowedMetricIds.end() || !THive::IsValidMetricsNetwork(metrics)) {
if (!HasAllowedMetric(allowedMetricIds, EResourceToBalance::Network) || !THive::IsValidMetricsNetwork(metrics)) {
std::get<NMetrics::EResource::Network>(values) = 0;
}
}
Expand Down
8 changes: 5 additions & 3 deletions ydb/core/mind/hive/tablet_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,8 @@ struct TTabletInfo {
void BecomeUnknown(TNodeInfo* node);
bool Kick();
const TVector<i64>& GetTabletAllowedMetricIds() const;
static bool HasAllowedMetric(const TVector<i64>& allowedMetricIds, EResourceToBalance resource);
bool HasAllowedMetric(EResourceToBalance resource) const;

void UpdateResourceUsage(const NKikimrTabletBase::TMetrics& metrics);
TResourceRawValues GetResourceCurrentValues() const;
Expand All @@ -237,18 +239,18 @@ struct TTabletInfo {
void ActualizeCounter();

template <typename ResourcesType>
static double GetUsage(const ResourcesType& current, const ResourcesType& maximum, EResourceToBalance resource = EResourceToBalance::Dominant) {
static double GetUsage(const ResourcesType& current, const ResourcesType& maximum, EResourceToBalance resource = EResourceToBalance::ComputeResources) {
auto normValues = NormalizeRawValues(current, maximum);
return ExtractResourceUsage(normValues, resource);
}

static double ExtractResourceUsage(const TResourceNormalizedValues& normValues, EResourceToBalance resource = EResourceToBalance::Dominant) {
static double ExtractResourceUsage(const TResourceNormalizedValues& normValues, EResourceToBalance resource = EResourceToBalance::ComputeResources) {
switch (resource) {
case EResourceToBalance::CPU: return std::get<NMetrics::EResource::CPU>(normValues);
case EResourceToBalance::Memory: return std::get<NMetrics::EResource::Memory>(normValues);
case EResourceToBalance::Network: return std::get<NMetrics::EResource::Network>(normValues);
case EResourceToBalance::Counter: return std::get<NMetrics::EResource::Counter>(normValues);
case EResourceToBalance::Dominant: return max(normValues);
case EResourceToBalance::ComputeResources: return max(normValues);
}
}

Expand Down

0 comments on commit 8a1ba16

Please sign in to comment.