From 2092cd6985bd722ffe9b7ee30a7b325084251546 Mon Sep 17 00:00:00 2001 From: vporyadke Date: Thu, 1 Aug 2024 12:43:08 +0300 Subject: [PATCH 1/2] fix missing static group issue (#7212) --- ydb/core/health_check/health_check.cpp | 7 +++- ydb/core/health_check/health_check_ut.cpp | 47 +++++++++++++++++++---- 2 files changed, 44 insertions(+), 10 deletions(-) diff --git a/ydb/core/health_check/health_check.cpp b/ydb/core/health_check/health_check.cpp index 46460e4de841..aa789833d833 100644 --- a/ydb/core/health_check/health_check.cpp +++ b/ydb/core/health_check/health_check.cpp @@ -56,6 +56,7 @@ struct hash { } #define BLOG_CRIT(stream) LOG_CRIT_S(*TlsActivationContext, NKikimrServices::HEALTH, stream) +#define BLOG_D(stream) LOG_DEBUG_S(*TlsActivationContext, NKikimrServices::HEALTH, stream) namespace NKikimr { @@ -643,7 +644,7 @@ class TSelfCheckRequest : public TActorBootstrapped { } bool NeedWhiteboardInfoForGroup(TGroupId groupId) { - return !HaveAllBSControllerInfo() && IsStaticGroup(groupId); + return UnknownStaticGroups.contains(groupId) || (!HaveAllBSControllerInfo() && IsStaticGroup(groupId)); } void Handle(TEvNodeWardenStorageConfig::TPtr ev) { @@ -678,6 +679,7 @@ class TSelfCheckRequest : public TActorBootstrapped { auto groupId = vDisk.GetVDiskID().GetGroupID(); if (NeedWhiteboardInfoForGroup(groupId)) { + BLOG_D("Requesting whiteboard for group " << groupId); RequestStorageNode(vDisk.GetVDiskLocation().GetNodeID()); } } @@ -1308,6 +1310,7 @@ class TSelfCheckRequest : public TActorBootstrapped { // it should not be trusted Ydb::Monitoring::StorageGroupStatus staticGroupStatus; FillGroupStatus(0, staticGroupStatus, {nullptr}); + BLOG_D("Static group status is " << staticGroupStatus.overall()); if (staticGroupStatus.overall() != Ydb::Monitoring::StatusFlag::GREEN) { UnknownStaticGroups.emplace(0); RequestStorageConfig(); @@ -2169,7 +2172,7 @@ class TSelfCheckRequest : public TActorBootstrapped { context.OverallStatus = MinStatus(context.OverallStatus, Ydb::Monitoring::StatusFlag::YELLOW); checker.ReportStatus(context); - + BLOG_D("Group " << groupId << " has status " << context.GetOverallStatus()); storageGroupStatus.set_overall(context.GetOverallStatus()); } diff --git a/ydb/core/health_check/health_check_ut.cpp b/ydb/core/health_check/health_check_ut.cpp index fa869c265be3..58d7f832eb23 100644 --- a/ydb/core/health_check/health_check_ut.cpp +++ b/ydb/core/health_check/health_check_ut.cpp @@ -163,12 +163,12 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { } void AddVSlotsToSysViewResponse(NSysView::TEvSysView::TEvGetVSlotsResponse::TPtr* ev, size_t groupCount, - const TVector& vdiskStatuses) { + const TVector& vdiskStatuses, ui32 groupStartId = GROUP_START_ID) { auto& record = (*ev)->Get()->Record; auto entrySample = record.entries(0); record.clear_entries(); - auto groupId = GROUP_START_ID; + auto groupId = groupStartId; const auto *descriptor = NKikimrBlobStorage::EVDiskStatus_descriptor(); for (size_t i = 0; i < groupCount; ++i) { auto vslotId = VCARD_START_ID; @@ -252,13 +252,13 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { sPool->set_name(STORAGE_POOL_NAME); }; - void AddVSlotInVDiskStateResponse(TEvWhiteboard::TEvVDiskStateResponse::TPtr* ev, int groupCount, int vslotCount) { + void AddVSlotInVDiskStateResponse(TEvWhiteboard::TEvVDiskStateResponse::TPtr* ev, int groupCount, int vslotCount, ui32 groupStartId = GROUP_START_ID) { auto& pbRecord = (*ev)->Get()->Record; auto sample = pbRecord.vdiskstateinfo(0); pbRecord.clear_vdiskstateinfo(); - auto groupId = GROUP_START_ID; + auto groupId = groupStartId; for (int i = 0; i < groupCount; i++) { auto slotId = VCARD_START_ID; for (int j = 0; j < vslotCount; j++) { @@ -273,6 +273,12 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { } } + void ChangeGroupStateResponse(NNodeWhiteboard::TEvWhiteboard::TEvBSGroupStateResponse::TPtr* ev) { + for (auto& groupInfo : *(*ev)->Get()->Record.mutable_bsgroupstateinfo()) { + groupInfo.set_erasurespecies(NHealthCheck::TSelfCheckRequest::BLOCK_4_2); + } + } + void SetLongHostValue(TEvInterconnect::TEvNodesInfo::TPtr* ev) { TString host(1000000, 'a'); auto& pbRecord = (*ev)->Get()->Nodes; @@ -383,7 +389,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { CheckHcResult(result, groupNumber, vdiscPerGroupNumber, isMergeRecords); } - Ydb::Monitoring::SelfCheckResult RequestHcWithVdisks(const NKikimrBlobStorage::TGroupStatus::E groupStatus, const TVector& vdiskStatuses) { + Ydb::Monitoring::SelfCheckResult RequestHcWithVdisks(const NKikimrBlobStorage::TGroupStatus::E groupStatus, const TVector& vdiskStatuses, bool forStaticGroup = false) { TPortManager tp; ui16 port = tp.GetPort(2134); ui16 grpcPort = tp.GetPort(2135); @@ -418,7 +424,11 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { } case NSysView::TEvSysView::EvGetVSlotsResponse: { auto* x = reinterpret_cast(&ev); - AddVSlotsToSysViewResponse(x, 1, vdiskStatuses); + if (forStaticGroup) { + AddVSlotsToSysViewResponse(x, 1, vdiskStatuses, 0); + } else { + AddVSlotsToSysViewResponse(x, 1, vdiskStatuses); + } break; } case NSysView::TEvSysView::EvGetGroupsResponse: { @@ -431,6 +441,19 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { AddStoragePoolsToSysViewResponse(x); break; } + case NNodeWhiteboard::TEvWhiteboard::EvVDiskStateResponse: { + auto *x = reinterpret_cast(&ev); + if (forStaticGroup) { + AddVSlotInVDiskStateResponse(x, 1, vdiskStatuses.size(), 0); + } else { + AddVSlotInVDiskStateResponse(x, 1, vdiskStatuses.size()); + } + break; + } + case NNodeWhiteboard::TEvWhiteboard::EvBSGroupStateResponse: { + auto* x = reinterpret_cast(&ev); + ChangeGroupStateResponse(x); + } } return TTestActorRuntime::EEventAction::PROCESS; @@ -444,10 +467,12 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { return runtime.GrabEdgeEvent(handle)->Result; } - void CheckHcResultHasIssuesWithStatus(Ydb::Monitoring::SelfCheckResult& result, const TString& type, const Ydb::Monitoring::StatusFlag::Status expectingStatus, ui32 total) { + void CheckHcResultHasIssuesWithStatus(Ydb::Monitoring::SelfCheckResult& result, const TString& type, + const Ydb::Monitoring::StatusFlag::Status expectingStatus, ui32 total, + std::string_view pool = "/Root:test") { int issuesCount = 0; for (const auto& issue_log : result.Getissue_log()) { - if (issue_log.type() == type && issue_log.location().storage().pool().name() == "/Root:test" && issue_log.status() == expectingStatus) { + if (issue_log.type() == type && issue_log.location().storage().pool().name() == pool && issue_log.status() == expectingStatus) { issuesCount++; } } @@ -589,6 +614,12 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::RED, 1); } + Y_UNIT_TEST(StaticGroupIssue) { + auto result = RequestHcWithVdisks(NKikimrBlobStorage::TGroupStatus::PARTIAL, {NKikimrBlobStorage::ERROR}, /*forStatic*/ true); + Cerr << result.ShortDebugString() << Endl; + CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::YELLOW, 1, "static"); + } + /* HC currently infers group status on its own, so it's never unknown Y_UNIT_TEST(RedGroupIssueWhenUnknownGroupStatus) { auto result = RequestHcWithVdisks(NKikimrBlobStorage::TGroupStatus::UNKNOWN, {}); From 39aead88f7231850310d58dced7d02d0924dc9f0 Mon Sep 17 00:00:00 2001 From: Alexander Zalyalov Date: Mon, 5 Aug 2024 13:08:54 +0000 Subject: [PATCH 2/2] do not look at active/inactive pdisk status in healthchceck --- ydb/core/health_check/health_check.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/ydb/core/health_check/health_check.cpp b/ydb/core/health_check/health_check.cpp index aa789833d833..650e4673505d 100644 --- a/ydb/core/health_check/health_check.cpp +++ b/ydb/core/health_check/health_check.cpp @@ -1715,12 +1715,9 @@ class TSelfCheckRequest : public TActorBootstrapped { ETags::PDiskState); } switch (status->number()) { - case NKikimrBlobStorage::ACTIVE: { - context.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN); - break; - } + case NKikimrBlobStorage::ACTIVE: case NKikimrBlobStorage::INACTIVE: { - context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "PDisk is inactive", ETags::PDiskState); + context.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN); break; } case NKikimrBlobStorage::FAULTY: