From c1ccdda7b8d83593bc9df402be2a3f159a2cf0d1 Mon Sep 17 00:00:00 2001 From: StekPerepolnen Date: Thu, 20 Jun 2024 13:58:46 +0000 Subject: [PATCH 1/2] move time difference up --- ydb/core/health_check/health_check.cpp | 48 ++++++++++++++++---------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/ydb/core/health_check/health_check.cpp b/ydb/core/health_check/health_check.cpp index 8851c2697b57..be810bd3386f 100644 --- a/ydb/core/health_check/health_check.cpp +++ b/ydb/core/health_check/health_check.cpp @@ -2579,6 +2579,7 @@ class TSelfCheckRequest : public TActorBootstrapped { databaseStatus.set_name(path); FillCompute(state, *databaseStatus.mutable_compute(), {&dbContext, "COMPUTE"}); FillStorage(state, *databaseStatus.mutable_storage(), {&dbContext, "STORAGE"}); + FillTimeDifference(state, {&dbContext, "NODES_TIME_DIFFERENCE"}); if (databaseStatus.compute().overall() != Ydb::Monitoring::StatusFlag::GREEN && databaseStatus.storage().overall() != Ydb::Monitoring::StatusFlag::GREEN) { dbContext.ReportStatus(MaxStatus(databaseStatus.compute().overall(), databaseStatus.storage().overall()), @@ -2599,38 +2600,50 @@ class TSelfCheckRequest : public TActorBootstrapped { const TDuration MAX_CLOCKSKEW_ORANGE_ISSUE_TIME = TDuration::MicroSeconds(25000); const TDuration MAX_CLOCKSKEW_YELLOW_ISSUE_TIME = TDuration::MicroSeconds(5000); - void FillNodesSyncStatus(TOverallStateContext& context) { + void FillTimeDifference(TDatabaseState& databaseState, TSelfCheckContext context) { long maxClockSkewUs = 0; TNodeId maxClockSkewPeerId = 0; TNodeId maxClockSkewNodeId = 0; - for (auto& [nodeId, nodeSystemState] : MergedNodeSystemState) { - if (IsTimeDifferenceCheckNode(nodeId) && IsTimeDifferenceCheckNode(nodeSystemState->GetMaxClockSkewPeerId()) - && abs(nodeSystemState->GetMaxClockSkewWithPeerUs()) > maxClockSkewUs) { - maxClockSkewUs = abs(nodeSystemState->GetMaxClockSkewWithPeerUs()); - maxClockSkewPeerId = nodeSystemState->GetMaxClockSkewPeerId(); - 
maxClockSkewNodeId = nodeId; + + TVector* computeNodeIds = &databaseState.ComputeNodeIds; + if (databaseState.ResourcePathId + && databaseState.ServerlessComputeResourcesMode != NKikimrSubDomains::EServerlessComputeResourcesModeExclusive) + { + auto itDatabase = FilterDomainKey.find(TSubDomainKey(databaseState.ResourcePathId.OwnerId, databaseState.ResourcePathId.LocalPathId)); + if (itDatabase != FilterDomainKey.end()) { + const TString& sharedDatabaseName = itDatabase->second; + TDatabaseState& sharedDatabase = DatabaseState[sharedDatabaseName]; + computeNodeIds = &sharedDatabase.ComputeNodeIds; + } + } + + for (TNodeId nodeId : *computeNodeIds) { + auto itNodeSystemState = MergedNodeSystemState.find(nodeId); + if (itNodeSystemState != MergedNodeSystemState.end()) { + if (IsTimeDifferenceCheckNode(nodeId) && IsTimeDifferenceCheckNode(itNodeSystemState->second->GetMaxClockSkewPeerId()) + && abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs()) > maxClockSkewUs) { + maxClockSkewUs = abs(itNodeSystemState->second->GetMaxClockSkewWithPeerUs()); + maxClockSkewPeerId = itNodeSystemState->second->GetMaxClockSkewPeerId(); + maxClockSkewNodeId = nodeId; + } } } + if (!maxClockSkewNodeId) { return; } - TSelfCheckResult syncContext; - syncContext.Type = "NODES_TIME_DIFFERENCE"; - FillNodeInfo(maxClockSkewNodeId, syncContext.Location.mutable_node()); - FillNodeInfo(maxClockSkewPeerId, syncContext.Location.mutable_peer()); + FillNodeInfo(maxClockSkewNodeId, context.Location.mutable_node()); + FillNodeInfo(maxClockSkewPeerId, context.Location.mutable_peer()); TDuration maxClockSkewTime = TDuration::MicroSeconds(maxClockSkewUs); if (maxClockSkewTime > MAX_CLOCKSKEW_ORANGE_ISSUE_TIME) { - syncContext.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, TStringBuilder() << "The nodes have a time difference of " << maxClockSkewTime.MilliSeconds() << " ms", ETags::SyncState); + context.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, TStringBuilder() << "The nodes have a time 
difference of " << maxClockSkewTime.MilliSeconds() << " ms", ETags::SyncState); } else if (maxClockSkewTime > MAX_CLOCKSKEW_YELLOW_ISSUE_TIME) { - syncContext.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, TStringBuilder() << "The nodes have a time difference of " << maxClockSkewTime.MilliSeconds() << " ms", ETags::SyncState); + context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, TStringBuilder() << "The nodes have a time difference of " << maxClockSkewTime.MilliSeconds() << " ms", ETags::SyncState); } else { - syncContext.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN); + context.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN); } - - context.UpdateMaxStatus(syncContext.GetOverallStatus()); - context.AddIssues(syncContext.IssueRecords); } void FillResult(TOverallStateContext context) { @@ -2641,7 +2654,6 @@ class TSelfCheckRequest : public TActorBootstrapped { FillDatabaseResult(context, path, state); } } - FillNodesSyncStatus(context); if (DatabaseState.empty()) { Ydb::Monitoring::DatabaseStatus& databaseStatus(*context.Result->add_database_status()); TSelfCheckResult tabletContext; From 4568f248e7e62e89e93a4388ce4ffe5d29e4417e Mon Sep 17 00:00:00 2001 From: StekPerepolnen Date: Sun, 23 Jun 2024 12:18:36 +0000 Subject: [PATCH 2/2] move time difference issue --- ydb/core/health_check/health_check.cpp | 22 +++++++++++++++------- ydb/public/api/protos/ydb_monitoring.proto | 8 ++++++++ 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/ydb/core/health_check/health_check.cpp b/ydb/core/health_check/health_check.cpp index be810bd3386f..6ee97f59f42f 100644 --- a/ydb/core/health_check/health_check.cpp +++ b/ydb/core/health_check/health_check.cpp @@ -2579,15 +2579,17 @@ class TSelfCheckRequest : public TActorBootstrapped { databaseStatus.set_name(path); FillCompute(state, *databaseStatus.mutable_compute(), {&dbContext, "COMPUTE"}); FillStorage(state, *databaseStatus.mutable_storage(), {&dbContext, "STORAGE"}); - FillTimeDifference(state, {&dbContext, 
"NODES_TIME_DIFFERENCE"}); + FillTimeDifference(state, *databaseStatus.mutable_time_difference(), {&dbContext, "NODES_TIME_DIFFERENCE"}); if (databaseStatus.compute().overall() != Ydb::Monitoring::StatusFlag::GREEN && databaseStatus.storage().overall() != Ydb::Monitoring::StatusFlag::GREEN) { dbContext.ReportStatus(MaxStatus(databaseStatus.compute().overall(), databaseStatus.storage().overall()), - "Database has multiple issues", ETags::DBState, { ETags::ComputeState, ETags::StorageState}); + "Database has multiple issues", ETags::DBState, { ETags::ComputeState, ETags::StorageState, ETags::SyncState }); } else if (databaseStatus.compute().overall() != Ydb::Monitoring::StatusFlag::GREEN) { - dbContext.ReportStatus(databaseStatus.compute().overall(), "Database has compute issues", ETags::DBState, {ETags::ComputeState}); + dbContext.ReportStatus(databaseStatus.compute().overall(), "Database has compute issues", ETags::DBState, {ETags::ComputeState, ETags::SyncState}); } else if (databaseStatus.storage().overall() != Ydb::Monitoring::StatusFlag::GREEN) { - dbContext.ReportStatus(databaseStatus.storage().overall(), "Database has storage issues", ETags::DBState, {ETags::StorageState}); + dbContext.ReportStatus(databaseStatus.storage().overall(), "Database has storage issues", ETags::DBState, {ETags::StorageState, ETags::SyncState}); + } else if (databaseStatus.time_difference().overall() != Ydb::Monitoring::StatusFlag::GREEN) { + dbContext.ReportStatus(databaseStatus.time_difference().overall(), "Database has time difference issues", ETags::DBState, {ETags::SyncState}); } databaseStatus.set_overall(dbContext.GetOverallStatus()); context.UpdateMaxStatus(dbContext.GetOverallStatus()); @@ -2597,10 +2599,10 @@ class TSelfCheckRequest : public TActorBootstrapped { } } - const TDuration MAX_CLOCKSKEW_ORANGE_ISSUE_TIME = TDuration::MicroSeconds(25000); - const TDuration MAX_CLOCKSKEW_YELLOW_ISSUE_TIME = TDuration::MicroSeconds(5000); + const TDuration 
MAX_CLOCKSKEW_ORANGE_ISSUE_TIME = TDuration::MicroSeconds(25000); /* 25 ms */ + const TDuration MAX_CLOCKSKEW_YELLOW_ISSUE_TIME = TDuration::MicroSeconds(5000); /* 5 ms */ - void FillTimeDifference(TDatabaseState& databaseState, TSelfCheckContext context) { + void FillTimeDifference(TDatabaseState& databaseState, Ydb::Monitoring::TimeDifferenceStatus& timeDifferenceStatus, TSelfCheckContext context) { long maxClockSkewUs = 0; TNodeId maxClockSkewPeerId = 0; TNodeId maxClockSkewNodeId = 0; @@ -2630,6 +2632,7 @@ class TSelfCheckRequest : public TActorBootstrapped { } if (!maxClockSkewNodeId) { + timeDifferenceStatus.set_overall(Ydb::Monitoring::StatusFlag::GREEN); return; } @@ -2644,6 +2647,11 @@ class TSelfCheckRequest : public TActorBootstrapped { } else { context.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN); } + + timeDifferenceStatus.set_node(ToString(maxClockSkewNodeId)); + timeDifferenceStatus.set_peer(ToString(maxClockSkewPeerId)); + timeDifferenceStatus.set_max_difference_ms(maxClockSkewTime.MilliSeconds()); + timeDifferenceStatus.set_overall(context.GetOverallStatus()); } void FillResult(TOverallStateContext context) { diff --git a/ydb/public/api/protos/ydb_monitoring.proto b/ydb/public/api/protos/ydb_monitoring.proto index dd99eb583f29..540a2c100fe6 100644 --- a/ydb/public/api/protos/ydb_monitoring.proto +++ b/ydb/public/api/protos/ydb_monitoring.proto @@ -122,6 +122,13 @@ message ComputeStatus { float shards_quota_usage = 5; } +message TimeDifferenceStatus { + StatusFlag.Status overall = 1; + int64 max_difference_ms = 2; + string node = 3; + string peer = 4; +} + message LocationNode { uint32 id = 1; string host = 2; @@ -198,6 +205,7 @@ message DatabaseStatus { StatusFlag.Status overall = 2; StorageStatus storage = 3; ComputeStatus compute = 4; + TimeDifferenceStatus time_difference = 5; } message SelfCheckResult {