Skip to content

Commit

Permalink
move time difference issue under database level (#5859)
Browse files Browse the repository at this point in the history
  • Loading branch information
StekPerepolnen committed Jul 3, 2024
1 parent 81d4eb3 commit 1315408
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 37 deletions.
80 changes: 43 additions & 37 deletions ydb/core/health_check/health_check.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1265,7 +1265,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
}
}

void FillComputeNodeStatus(TDatabaseState& databaseState,TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) {
void FillComputeNodeStatus(TDatabaseState& databaseState, TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context, bool reportTimeDifference) {
FillNodeInfo(nodeId, context.Location.mutable_compute()->mutable_node());

TSelfCheckContext rrContext(&context, "NODE_UPTIME");
Expand Down Expand Up @@ -1303,6 +1303,34 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
}
loadAverageStatus.set_overall(laContext.GetOverallStatus());
}

if (nodeSystemState.HasMaxClockSkewPeerId()) {
TNodeId peerId = nodeSystemState.GetMaxClockSkewPeerId();
long timeDifferenceUs = nodeSystemState.GetMaxClockSkewWithPeerUs();
TDuration timeDifferenceDuration = TDuration::MicroSeconds(abs(timeDifferenceUs));
Ydb::Monitoring::StatusFlag::Status status;
if (timeDifferenceDuration > MAX_CLOCKSKEW_ORANGE_ISSUE_TIME) {
status = Ydb::Monitoring::StatusFlag::ORANGE;
} else if (timeDifferenceDuration > MAX_CLOCKSKEW_YELLOW_ISSUE_TIME) {
status = Ydb::Monitoring::StatusFlag::YELLOW;
} else {
status = Ydb::Monitoring::StatusFlag::GREEN;
}

computeNodeStatus.mutable_max_time_difference()->set_peer(ToString(peerId));
computeNodeStatus.mutable_max_time_difference()->set_difference_ms(timeDifferenceDuration.MilliSeconds());
computeNodeStatus.set_overall(status);

if (reportTimeDifference) {
TSelfCheckContext tdContext(&context, "NODES_TIME_DIFFERENCE");
FillNodeInfo(peerId, tdContext.Location.mutable_compute()->mutable_peer());
if (status == Ydb::Monitoring::StatusFlag::GREEN) {
tdContext.ReportStatus(status);
} else {
tdContext.ReportStatus(status, TStringBuilder() << "The nodes have a time difference of " << timeDifferenceDuration.MilliSeconds() << " ms", ETags::SyncState);
}
}
}
} else {
// context.ReportStatus(Ydb::Monitoring::StatusFlag::RED,
// TStringBuilder() << "Compute node is not available",
Expand Down Expand Up @@ -1334,12 +1362,25 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
if (systemStatus != Ydb::Monitoring::StatusFlag::GREEN && systemStatus != Ydb::Monitoring::StatusFlag::GREY) {
context.ReportStatus(systemStatus, "Compute has issues with system tablets", ETags::ComputeState, {ETags::SystemTabletState});
}
long maxClockSkewUs = 0;
TNodeId maxClockSkewNodeId = 0;
for (TNodeId nodeId : *computeNodeIds) {
auto itNodeSystemState = MergedNodeSystemState.find(nodeId);
if (itNodeSystemState != MergedNodeSystemState.end()) {
if (std::count(computeNodeIds->begin(), computeNodeIds->end(), itNodeSystemState->second->GetMaxClockSkewPeerId()) > 0
&& abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs()) > maxClockSkewUs) {
maxClockSkewUs = abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs());
maxClockSkewNodeId = nodeId;
}
}
}
for (TNodeId nodeId : *computeNodeIds) {
auto& computeNode = *computeStatus.add_nodes();
FillComputeNodeStatus(databaseState, nodeId, computeNode, {&context, "COMPUTE_NODE"});
FillComputeNodeStatus(databaseState, nodeId, computeNode, {&context, "COMPUTE_NODE"}, maxClockSkewNodeId == nodeId);
}
context.ReportWithMaxChildStatus("Some nodes are restarting too often", ETags::ComputeState, {ETags::Uptime});
context.ReportWithMaxChildStatus("Compute is overloaded", ETags::ComputeState, {ETags::OverloadState});
context.ReportWithMaxChildStatus("Database has time difference between nodes", ETags::ComputeState, {ETags::SyncState});
Ydb::Monitoring::StatusFlag::Status tabletsStatus = Ydb::Monitoring::StatusFlag::GREEN;
computeNodeIds->push_back(0); // for tablets without node
for (TNodeId nodeId : *computeNodeIds) {
Expand Down Expand Up @@ -2086,40 +2127,6 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
// Clock-skew thresholds for the time-difference check. The absolute skew between
// a node and its peer is compared against these with a strict '>':
// above 25000 us (25 ms) the status is ORANGE, above 5000 us (5 ms) it is YELLOW,
// otherwise GREEN.
const TDuration MAX_CLOCKSKEW_ORANGE_ISSUE_TIME = TDuration::MicroSeconds(25000);
const TDuration MAX_CLOCKSKEW_YELLOW_ISSUE_TIME = TDuration::MicroSeconds(5000);

// Reports the largest clock skew between any checked node and its peer as a
// single NODES_TIME_DIFFERENCE record merged into the overall state.
//
// Only pairs where both the node and its max-skew peer pass
// IsTimeDifferenceCheckNode() are considered. If no pair has a non-zero
// absolute skew, nothing is reported at all.
void FillNodesSyncStatus(TOverallStateContext& context) {
    long worstSkewUs = 0;
    TNodeId worstPeerId = 0;
    TNodeId worstNodeId = 0;

    // Pick the node/peer pair with the greatest absolute skew.
    for (auto& [nodeId, nodeSystemState] : MergedNodeSystemState) {
        if (!IsTimeDifferenceCheckNode(nodeId)) {
            continue;
        }
        if (!IsTimeDifferenceCheckNode(nodeSystemState->GetMaxClockSkewPeerId())) {
            continue;
        }
        const auto skewUs = abs(nodeSystemState->GetMaxClockSkewWithPeerUs());
        if (skewUs > worstSkewUs) {
            worstSkewUs = skewUs;
            worstPeerId = nodeSystemState->GetMaxClockSkewPeerId();
            worstNodeId = nodeId;
        }
    }

    // Node id 0 doubles as the "nothing found" marker.
    if (!worstNodeId) {
        return;
    }

    TSelfCheckResult syncContext;
    syncContext.Type = "NODES_TIME_DIFFERENCE";
    FillNodeInfo(worstNodeId, syncContext.Location.mutable_node());
    FillNodeInfo(worstPeerId, syncContext.Location.mutable_peer());

    // Map the skew onto a status using the ORANGE/YELLOW thresholds.
    const TDuration worstSkew = TDuration::MicroSeconds(worstSkewUs);
    Ydb::Monitoring::StatusFlag::Status status = Ydb::Monitoring::StatusFlag::GREEN;
    if (worstSkew > MAX_CLOCKSKEW_ORANGE_ISSUE_TIME) {
        status = Ydb::Monitoring::StatusFlag::ORANGE;
    } else if (worstSkew > MAX_CLOCKSKEW_YELLOW_ISSUE_TIME) {
        status = Ydb::Monitoring::StatusFlag::YELLOW;
    }

    // GREEN is reported without a message; any other status carries the measured difference.
    if (status == Ydb::Monitoring::StatusFlag::GREEN) {
        syncContext.ReportStatus(status);
    } else {
        syncContext.ReportStatus(status, TStringBuilder() << "The nodes have a time difference of " << worstSkew.MilliSeconds() << " ms", ETags::SyncState);
    }

    context.UpdateMaxStatus(syncContext.GetOverallStatus());
    context.AddIssues(syncContext.IssueRecords);
}

void FillResult(TOverallStateContext context) {
if (IsSpecificDatabaseFilter()) {
FillDatabaseResult(context, FilterDatabase, DatabaseState[FilterDatabase]);
Expand All @@ -2128,7 +2135,6 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
FillDatabaseResult(context, path, state);
}
}
FillNodesSyncStatus(context);
if (DatabaseState.empty()) {
Ydb::Monitoring::DatabaseStatus& databaseStatus(*context.Result->add_database_status());
TSelfCheckResult tabletContext;
Expand Down
8 changes: 8 additions & 0 deletions ydb/public/api/protos/ydb_monitoring.proto
Original file line number Diff line number Diff line change
Expand Up @@ -106,12 +106,19 @@ message LoadAverageStatus {
uint32 cores = 3;
}

// Largest clock difference observed between a compute node and one of its peers.
message TimeDifferenceStatus {
// NOTE(review): the health-check code visible in this change writes the computed
// status to ComputeNodeStatus.overall, not to this field — confirm it is populated.
StatusFlag.Status overall = 1;
// Absolute time difference, in milliseconds.
int64 difference_ms = 2;
// Id of the peer node the difference was measured against, as a string.
string peer = 3;
}

// Health-check status of a single compute node.
message ComputeNodeStatus {
string id = 1;
StatusFlag.Status overall = 2;
repeated ComputeTabletStatus tablets = 3;
repeated ThreadPoolStatus pools = 4;
LoadAverageStatus load = 5;
// Maximum clock difference between this node and a peer; filled only when the
// node's system state reports a max-clock-skew peer.
TimeDifferenceStatus max_time_difference = 6;
}

message ComputeStatus {
Expand Down Expand Up @@ -165,6 +172,7 @@ message LocationCompute {
LocationNode node = 1;
LocationComputePool pool = 2;
LocationComputeTablet tablet = 3;
LocationNode peer = 4;
}

message LocationDatabase {
Expand Down

0 comments on commit 1315408

Please sign in to comment.