Skip to content

Commit

Permalink
Reduce severity of some health check issues (ydb-platform#3067)
Browse files: browse the repository at this point in the history
  • Loading branch information
adameat authored Mar 22, 2024
1 parent 83f9944 commit a79b382
Showing 1 changed file with 5 additions and 9 deletions.
14 changes: 5 additions & 9 deletions ydb/core/health_check/health_check.cpp
Original file line number | Diff line number | Diff line change
Expand Up @@ -1106,19 +1106,15 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
static void Check(TSelfCheckContext& context, const NKikimrWhiteboard::TSystemStateInfo::TPoolStats& poolStats) {
if (poolStats.name() == "System" || poolStats.name() == "IC" || poolStats.name() == "IO") {
if (poolStats.usage() >= 0.99) {
context.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "Pool usage over 99%", ETags::OverloadState);
context.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, "Pool usage is over than 99%", ETags::OverloadState);
} else if (poolStats.usage() >= 0.95) {
context.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, "Pool usage over 95%", ETags::OverloadState);
} else if (poolStats.usage() >= 0.90) {
context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Pool usage over 90%", ETags::OverloadState);
context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Pool usage is over than 95%", ETags::OverloadState);
} else {
context.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
}
} else {
if (poolStats.usage() >= 0.99) {
context.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, "Pool usage over 99%", ETags::OverloadState);
} else if (poolStats.usage() >= 0.95) {
context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Pool usage over 95%", ETags::OverloadState);
context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Pool usage is over than 99%", ETags::OverloadState);
} else {
context.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
}
Expand Down Expand Up @@ -1188,7 +1184,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
break;
case TNodeTabletState::ETabletState::RestartsTooOften:
computeTabletStatus.set_state("RESTARTS_TOO_OFTEN");
tabletContext.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "Tablets are restarting too often", ETags::TabletState);
tabletContext.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, "Tablets are restarting too often", ETags::TabletState);
break;
case TNodeTabletState::ETabletState::Dead:
computeTabletStatus.set_state("DEAD");
Expand Down Expand Up @@ -1227,7 +1223,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {

TSelfCheckContext rrContext(&context, "NODE_UPTIME");
if (databaseState.NodeRestartsPerPeriod[nodeId] >= 30) {
rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "Node is restarting too often", ETags::Uptime);
rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, "Node is restarting too often", ETags::Uptime);
} else if (databaseState.NodeRestartsPerPeriod[nodeId] >= 10) {
rrContext.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "The number of node restarts has increased", ETags::Uptime);
} else {
Expand Down

0 comments on commit a79b382

Please sign in to comment.