Skip to content

Commit

Permalink
Merge 4568f24 into 450c3a2
Browse files Browse the repository at this point in the history
  • Loading branch information
StekPerepolnen authored Jun 23, 2024
2 parents 450c3a2 + 4568f24 commit fa10a51
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 22 deletions.
64 changes: 42 additions & 22 deletions ydb/core/health_check/health_check.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2579,14 +2579,17 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
databaseStatus.set_name(path);
FillCompute(state, *databaseStatus.mutable_compute(), {&dbContext, "COMPUTE"});
FillStorage(state, *databaseStatus.mutable_storage(), {&dbContext, "STORAGE"});
FillTimeDifference(state, *databaseStatus.mutable_time_difference(), {&dbContext, "NODES_TIME_DIFFERENCE"});
if (databaseStatus.compute().overall() != Ydb::Monitoring::StatusFlag::GREEN
&& databaseStatus.storage().overall() != Ydb::Monitoring::StatusFlag::GREEN) {
dbContext.ReportStatus(MaxStatus(databaseStatus.compute().overall(), databaseStatus.storage().overall()),
"Database has multiple issues", ETags::DBState, { ETags::ComputeState, ETags::StorageState});
"Database has multiple issues", ETags::DBState, { ETags::ComputeState, ETags::StorageState, ETags::SyncState });
} else if (databaseStatus.compute().overall() != Ydb::Monitoring::StatusFlag::GREEN) {
dbContext.ReportStatus(databaseStatus.compute().overall(), "Database has compute issues", ETags::DBState, {ETags::ComputeState});
dbContext.ReportStatus(databaseStatus.compute().overall(), "Database has compute issues", ETags::DBState, {ETags::ComputeState, ETags::SyncState});
} else if (databaseStatus.storage().overall() != Ydb::Monitoring::StatusFlag::GREEN) {
dbContext.ReportStatus(databaseStatus.storage().overall(), "Database has storage issues", ETags::DBState, {ETags::StorageState});
dbContext.ReportStatus(databaseStatus.storage().overall(), "Database has storage issues", ETags::DBState, {ETags::StorageState, ETags::SyncState});
} else if (databaseStatus.time_difference().overall() != Ydb::Monitoring::StatusFlag::GREEN) {
dbContext.ReportStatus(databaseStatus.time_difference().overall(), "Database has time difference issues", ETags::DBState, {ETags::SyncState});
}
databaseStatus.set_overall(dbContext.GetOverallStatus());
context.UpdateMaxStatus(dbContext.GetOverallStatus());
Expand All @@ -2596,41 +2599,59 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
}
}

// Clock-skew thresholds used by the time-difference check: the largest node-to-peer skew is
// reported as ORANGE when it exceeds the ORANGE limit, otherwise as YELLOW when it exceeds the
// YELLOW limit.
// NOTE(review): each name is declared twice here (25000/5000 us and 25/5 us). This looks like the
// removed and added lines of a diff rendered without +/- markers - presumably the second pair is
// the post-change one; confirm against the repository which values are intended.
const TDuration MAX_CLOCKSKEW_ORANGE_ISSUE_TIME = TDuration::MicroSeconds(25000);
const TDuration MAX_CLOCKSKEW_YELLOW_ISSUE_TIME = TDuration::MicroSeconds(5000);
const TDuration MAX_CLOCKSKEW_ORANGE_ISSUE_TIME = TDuration::MicroSeconds(25);
const TDuration MAX_CLOCKSKEW_YELLOW_ISSUE_TIME = TDuration::MicroSeconds(5);

// Time-difference (clock skew) check.
//
// NOTE(review): this span is a rendered diff without +/- markers, so superseded and replacement
// lines sit next to each other (two signatures, two scan loops, syncContext vs context reporting).
// Presumably the first line of each adjacent pair is the removed one and the second is the added
// one - confirm against the repository before relying on any single line.
//
// FillTimeDifference(databaseState, timeDifferenceStatus, context):
//   - picks the list of compute node ids to inspect for the database,
//   - finds the node with the largest absolute clock skew towards its peer,
//   - reports a status on `context` and fills `timeDifferenceStatus` with the node id, the peer id,
//     the difference in milliseconds and the overall status.
void FillNodesSyncStatus(TOverallStateContext& context) {
void FillTimeDifference(TDatabaseState& databaseState, Ydb::Monitoring::TimeDifferenceStatus& timeDifferenceStatus, TSelfCheckContext context) {
// Running maximum of the absolute skew (microseconds) and the node/peer pair it belongs to.
// A node id of 0 is used below as the "nothing found" marker.
long maxClockSkewUs = 0;
TNodeId maxClockSkewPeerId = 0;
TNodeId maxClockSkewNodeId = 0;
// Scan over every entry of MergedNodeSystemState (the FillNodesSyncStatus variant of the loop).
for (auto& [nodeId, nodeSystemState] : MergedNodeSystemState) {
if (IsTimeDifferenceCheckNode(nodeId) && IsTimeDifferenceCheckNode(nodeSystemState->GetMaxClockSkewPeerId())
&& abs(nodeSystemState->GetMaxClockSkewWithPeerUs()) > maxClockSkewUs) {
maxClockSkewUs = abs(nodeSystemState->GetMaxClockSkewWithPeerUs());
maxClockSkewPeerId = nodeSystemState->GetMaxClockSkewPeerId();
maxClockSkewNodeId = nodeId;

// Default to the database's own compute nodes. When the database has a ResourcePathId and its
// serverless compute mode is not "exclusive", switch to the compute nodes of the database that
// FilterDomainKey maps that resource path to (if such an entry exists).
TVector<TNodeId>* computeNodeIds = &databaseState.ComputeNodeIds;
if (databaseState.ResourcePathId
&& databaseState.ServerlessComputeResourcesMode != NKikimrSubDomains::EServerlessComputeResourcesModeExclusive)
{
auto itDatabase = FilterDomainKey.find(TSubDomainKey(databaseState.ResourcePathId.OwnerId, databaseState.ResourcePathId.LocalPathId));
if (itDatabase != FilterDomainKey.end()) {
const TString& sharedDatabaseName = itDatabase->second;
TDatabaseState& sharedDatabase = DatabaseState[sharedDatabaseName];
computeNodeIds = &sharedDatabase.ComputeNodeIds;
}
}

// Scan restricted to the selected compute nodes (the FillTimeDifference variant of the loop).
// Nodes without an entry in MergedNodeSystemState are ignored; a node is considered only when both
// it and its reported peer pass IsTimeDifferenceCheckNode.
for (TNodeId nodeId : *computeNodeIds) {
auto itNodeSystemState = MergedNodeSystemState.find(nodeId);
if (itNodeSystemState != MergedNodeSystemState.end()) {
if (IsTimeDifferenceCheckNode(nodeId) && IsTimeDifferenceCheckNode(itNodeSystemState->second->GetMaxClockSkewPeerId())
&& abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs()) > maxClockSkewUs) {
maxClockSkewUs = abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs());
maxClockSkewPeerId = itNodeSystemState->second->GetMaxClockSkewPeerId();
maxClockSkewNodeId = nodeId;
}
}
}

// No node was recorded by the scan: mark the status GREEN and leave the other fields unset.
if (!maxClockSkewNodeId) {
timeDifferenceStatus.set_overall(Ydb::Monitoring::StatusFlag::GREEN);
return;
}

// Attach the node/peer pair as the issue location (syncContext lines belong to the
// FillNodesSyncStatus variant, context lines to the FillTimeDifference variant).
TSelfCheckResult syncContext;
syncContext.Type = "NODES_TIME_DIFFERENCE";
FillNodeInfo(maxClockSkewNodeId, syncContext.Location.mutable_node());
FillNodeInfo(maxClockSkewPeerId, syncContext.Location.mutable_peer());
FillNodeInfo(maxClockSkewNodeId, context.Location.mutable_node());
FillNodeInfo(maxClockSkewPeerId, context.Location.mutable_peer());

// Grade the skew against the two thresholds; the ORANGE threshold is tested first.
// NOTE(review): the issue text prints MilliSeconds() - presumably whole milliseconds - so a skew
// that trips a microsecond-scale threshold may be shown as "0 ms"; confirm thresholds and units.
TDuration maxClockSkewTime = TDuration::MicroSeconds(maxClockSkewUs);
if (maxClockSkewTime > MAX_CLOCKSKEW_ORANGE_ISSUE_TIME) {
syncContext.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, TStringBuilder() << "The nodes have a time difference of " << maxClockSkewTime.MilliSeconds() << " ms", ETags::SyncState);
context.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, TStringBuilder() << "The nodes have a time difference of " << maxClockSkewTime.MilliSeconds() << " ms", ETags::SyncState);
} else if (maxClockSkewTime > MAX_CLOCKSKEW_YELLOW_ISSUE_TIME) {
syncContext.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, TStringBuilder() << "The nodes have a time difference of " << maxClockSkewTime.MilliSeconds() << " ms", ETags::SyncState);
context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, TStringBuilder() << "The nodes have a time difference of " << maxClockSkewTime.MilliSeconds() << " ms", ETags::SyncState);
} else {
syncContext.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
context.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
}

// FillNodesSyncStatus variant: fold the local result into the overall context.
context.UpdateMaxStatus(syncContext.GetOverallStatus());
context.AddIssues(syncContext.IssueRecords);
// FillTimeDifference variant: publish the worst pair and the resulting status in the response.
timeDifferenceStatus.set_node(ToString(maxClockSkewNodeId));
timeDifferenceStatus.set_peer(ToString(maxClockSkewPeerId));
timeDifferenceStatus.set_max_difference_ms(maxClockSkewTime.MilliSeconds());
timeDifferenceStatus.set_overall(context.GetOverallStatus());
}

void FillResult(TOverallStateContext context) {
Expand All @@ -2641,7 +2662,6 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
FillDatabaseResult(context, path, state);
}
}
FillNodesSyncStatus(context);
if (DatabaseState.empty()) {
Ydb::Monitoring::DatabaseStatus& databaseStatus(*context.Result->add_database_status());
TSelfCheckResult tabletContext;
Expand Down
8 changes: 8 additions & 0 deletions ydb/public/api/protos/ydb_monitoring.proto
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,13 @@ message ComputeStatus {
float shards_quota_usage = 5;
}

// Result of the clock-skew (time difference) check between nodes, carried in
// DatabaseStatus.time_difference.
message TimeDifferenceStatus {
// Status flag of the check; the health check sets GREEN when no node pair was selected.
StatusFlag.Status overall = 1;
// Largest time difference found between a node and its peer, in milliseconds.
int64 max_difference_ms = 2;
// Id of the node on which the largest difference was found, formatted as a string.
string node = 3;
// Id of the peer that node has the largest difference with, formatted as a string.
string peer = 4;
}

message LocationNode {
uint32 id = 1;
string host = 2;
Expand Down Expand Up @@ -198,6 +205,7 @@ message DatabaseStatus {
StatusFlag.Status overall = 2;
StorageStatus storage = 3;
ComputeStatus compute = 4;
TimeDifferenceStatus time_difference = 5;
}

message SelfCheckResult {
Expand Down

0 comments on commit fa10a51

Please sign in to comment.