Skip to content

Commit

Permalink
fix missing static group issue (#7212)
Browse files Browse the repository at this point in the history
  • Loading branch information
vporyadke committed Aug 2, 2024
1 parent bf5f3d7 commit 2092cd6
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 10 deletions.
7 changes: 5 additions & 2 deletions ydb/core/health_check/health_check.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ struct hash<NKikimrBlobStorage::TVSlotId> {
}

#define BLOG_CRIT(stream) LOG_CRIT_S(*TlsActivationContext, NKikimrServices::HEALTH, stream)
#define BLOG_D(stream) LOG_DEBUG_S(*TlsActivationContext, NKikimrServices::HEALTH, stream)

namespace NKikimr {

Expand Down Expand Up @@ -643,7 +644,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
}

bool NeedWhiteboardInfoForGroup(TGroupId groupId) {
return !HaveAllBSControllerInfo() && IsStaticGroup(groupId);
return UnknownStaticGroups.contains(groupId) || (!HaveAllBSControllerInfo() && IsStaticGroup(groupId));
}

void Handle(TEvNodeWardenStorageConfig::TPtr ev) {
Expand Down Expand Up @@ -678,6 +679,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {

auto groupId = vDisk.GetVDiskID().GetGroupID();
if (NeedWhiteboardInfoForGroup(groupId)) {
BLOG_D("Requesting whiteboard for group " << groupId);
RequestStorageNode(vDisk.GetVDiskLocation().GetNodeID());
}
}
Expand Down Expand Up @@ -1308,6 +1310,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
// it should not be trusted
Ydb::Monitoring::StorageGroupStatus staticGroupStatus;
FillGroupStatus(0, staticGroupStatus, {nullptr});
BLOG_D("Static group status is " << staticGroupStatus.overall());
if (staticGroupStatus.overall() != Ydb::Monitoring::StatusFlag::GREEN) {
UnknownStaticGroups.emplace(0);
RequestStorageConfig();
Expand Down Expand Up @@ -2169,7 +2172,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
context.OverallStatus = MinStatus(context.OverallStatus, Ydb::Monitoring::StatusFlag::YELLOW);
checker.ReportStatus(context);


BLOG_D("Group " << groupId << " has status " << context.GetOverallStatus());
storageGroupStatus.set_overall(context.GetOverallStatus());
}

Expand Down
47 changes: 39 additions & 8 deletions ydb/core/health_check/health_check_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,12 +163,12 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
}

void AddVSlotsToSysViewResponse(NSysView::TEvSysView::TEvGetVSlotsResponse::TPtr* ev, size_t groupCount,
const TVector<NKikimrBlobStorage::EVDiskStatus>& vdiskStatuses) {
const TVector<NKikimrBlobStorage::EVDiskStatus>& vdiskStatuses, ui32 groupStartId = GROUP_START_ID) {
auto& record = (*ev)->Get()->Record;
auto entrySample = record.entries(0);
record.clear_entries();

auto groupId = GROUP_START_ID;
auto groupId = groupStartId;
const auto *descriptor = NKikimrBlobStorage::EVDiskStatus_descriptor();
for (size_t i = 0; i < groupCount; ++i) {
auto vslotId = VCARD_START_ID;
Expand Down Expand Up @@ -252,13 +252,13 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
sPool->set_name(STORAGE_POOL_NAME);
};

void AddVSlotInVDiskStateResponse(TEvWhiteboard::TEvVDiskStateResponse::TPtr* ev, int groupCount, int vslotCount) {
void AddVSlotInVDiskStateResponse(TEvWhiteboard::TEvVDiskStateResponse::TPtr* ev, int groupCount, int vslotCount, ui32 groupStartId = GROUP_START_ID) {
auto& pbRecord = (*ev)->Get()->Record;

auto sample = pbRecord.vdiskstateinfo(0);
pbRecord.clear_vdiskstateinfo();

auto groupId = GROUP_START_ID;
auto groupId = groupStartId;
for (int i = 0; i < groupCount; i++) {
auto slotId = VCARD_START_ID;
for (int j = 0; j < vslotCount; j++) {
Expand All @@ -273,6 +273,12 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
}
}

// Test helper: rewrites a whiteboard BSGroupState response in place so that
// every group-state entry in its record reports the BLOCK_4_2 erasure species.
//
// ev - pointer to the event handle to modify; it is dereferenced
//      unconditionally, so it must be non-null and hold a valid event.
void ChangeGroupStateResponse(NNodeWhiteboard::TEvWhiteboard::TEvBSGroupStateResponse::TPtr* ev) {
// Iterate by reference over the mutable repeated field so the change is
// applied to the entries stored in the event itself, not to copies.
for (auto& groupInfo : *(*ev)->Get()->Record.mutable_bsgroupstateinfo()) {
groupInfo.set_erasurespecies(NHealthCheck::TSelfCheckRequest::BLOCK_4_2);
}
}

void SetLongHostValue(TEvInterconnect::TEvNodesInfo::TPtr* ev) {
TString host(1000000, 'a');
auto& pbRecord = (*ev)->Get()->Nodes;
Expand Down Expand Up @@ -383,7 +389,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
CheckHcResult(result, groupNumber, vdiscPerGroupNumber, isMergeRecords);
}

Ydb::Monitoring::SelfCheckResult RequestHcWithVdisks(const NKikimrBlobStorage::TGroupStatus::E groupStatus, const TVector<NKikimrBlobStorage::EVDiskStatus>& vdiskStatuses) {
Ydb::Monitoring::SelfCheckResult RequestHcWithVdisks(const NKikimrBlobStorage::TGroupStatus::E groupStatus, const TVector<NKikimrBlobStorage::EVDiskStatus>& vdiskStatuses, bool forStaticGroup = false) {
TPortManager tp;
ui16 port = tp.GetPort(2134);
ui16 grpcPort = tp.GetPort(2135);
Expand Down Expand Up @@ -418,7 +424,11 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
}
case NSysView::TEvSysView::EvGetVSlotsResponse: {
auto* x = reinterpret_cast<NSysView::TEvSysView::TEvGetVSlotsResponse::TPtr*>(&ev);
AddVSlotsToSysViewResponse(x, 1, vdiskStatuses);
if (forStaticGroup) {
AddVSlotsToSysViewResponse(x, 1, vdiskStatuses, 0);
} else {
AddVSlotsToSysViewResponse(x, 1, vdiskStatuses);
}
break;
}
case NSysView::TEvSysView::EvGetGroupsResponse: {
Expand All @@ -431,6 +441,19 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
AddStoragePoolsToSysViewResponse(x);
break;
}
case NNodeWhiteboard::TEvWhiteboard::EvVDiskStateResponse: {
auto *x = reinterpret_cast<NNodeWhiteboard::TEvWhiteboard::TEvVDiskStateResponse::TPtr*>(&ev);
if (forStaticGroup) {
AddVSlotInVDiskStateResponse(x, 1, vdiskStatuses.size(), 0);
} else {
AddVSlotInVDiskStateResponse(x, 1, vdiskStatuses.size());
}
break;
}
case NNodeWhiteboard::TEvWhiteboard::EvBSGroupStateResponse: {
auto* x = reinterpret_cast<NNodeWhiteboard::TEvWhiteboard::TEvBSGroupStateResponse::TPtr*>(&ev);
ChangeGroupStateResponse(x);
}
}

return TTestActorRuntime::EEventAction::PROCESS;
Expand All @@ -444,10 +467,12 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
return runtime.GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
}

void CheckHcResultHasIssuesWithStatus(Ydb::Monitoring::SelfCheckResult& result, const TString& type, const Ydb::Monitoring::StatusFlag::Status expectingStatus, ui32 total) {
void CheckHcResultHasIssuesWithStatus(Ydb::Monitoring::SelfCheckResult& result, const TString& type,
const Ydb::Monitoring::StatusFlag::Status expectingStatus, ui32 total,
std::string_view pool = "/Root:test") {
int issuesCount = 0;
for (const auto& issue_log : result.Getissue_log()) {
if (issue_log.type() == type && issue_log.location().storage().pool().name() == "/Root:test" && issue_log.status() == expectingStatus) {
if (issue_log.type() == type && issue_log.location().storage().pool().name() == pool && issue_log.status() == expectingStatus) {
issuesCount++;
}
}
Expand Down Expand Up @@ -589,6 +614,12 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::RED, 1);
}

// Checks that a problem in the static group is reported by the health check.
// The request is made with group status PARTIAL and a single VDisk in ERROR
// state; the third argument selects the static-group variant of the mocked
// responses (forStaticGroup = true).
Y_UNIT_TEST(StaticGroupIssue) {
auto result = RequestHcWithVdisks(NKikimrBlobStorage::TGroupStatus::PARTIAL, {NKikimrBlobStorage::ERROR}, /*forStatic*/ true);
// Dump the whole result to stderr to simplify debugging of a failing run.
Cerr << result.ShortDebugString() << Endl;
// Look for STORAGE_GROUP issues with YELLOW status whose pool name is
// "static" (instead of the default pool), passing 1 as the expected total.
CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::YELLOW, 1, "static");
}

/* HC currently infers group status on its own, so it's never unknown
Y_UNIT_TEST(RedGroupIssueWhenUnknownGroupStatus) {
auto result = RequestHcWithVdisks(NKikimrBlobStorage::TGroupStatus::UNKNOWN, {});
Expand Down

0 comments on commit 2092cd6

Please sign in to comment.