Skip to content

Commit

Permalink
Add reason for pending action in maintenance public API (#3289)
Browse files Browse the repository at this point in the history
  • Loading branch information
pixcc authored Oct 2, 2024
1 parent cd74ec0 commit d765169
Show file tree
Hide file tree
Showing 14 changed files with 388 additions and 59 deletions.
24 changes: 23 additions & 1 deletion ydb/core/cms/api_adapters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,33 @@ namespace {
}
}

Ydb::Maintenance::ActionState::ActionReason ConvertReason(NKikimrCms::TAction::TIssue::EType cmsActionIssueType) {
using EIssueType = NKikimrCms::TAction::TIssue;
switch (cmsActionIssueType) {
case EIssueType::UNKNOWN:
return Ydb::Maintenance::ActionState::ACTION_REASON_UNSPECIFIED;
case EIssueType::GENERIC:
return Ydb::Maintenance::ActionState::ACTION_REASON_GENERIC;
case EIssueType::TOO_MANY_UNAVAILABLE_VDISKS:
return Ydb::Maintenance::ActionState::ACTION_REASON_TOO_MANY_UNAVAILABLE_VDISKS;
case EIssueType::TOO_MANY_UNAVAILABLE_STATE_STORAGE_RINGS:
return Ydb::Maintenance::ActionState::ACTION_REASON_TOO_MANY_UNAVAILABLE_STATE_STORAGE_RINGS;
case EIssueType::DISABLED_NODES_LIMIT_REACHED:
return Ydb::Maintenance::ActionState::ACTION_REASON_DISABLED_NODES_LIMIT_REACHED;
case EIssueType::TENANT_DISABLED_NODES_LIMIT_REACHED:
return Ydb::Maintenance::ActionState::ACTION_REASON_TENANT_DISABLED_NODES_LIMIT_REACHED;
case EIssueType::SYS_TABLETS_NODE_LIMIT_REACHED:
return Ydb::Maintenance::ActionState::ACTION_REASON_SYS_TABLETS_NODE_LIMIT_REACHED;
}
return Ydb::Maintenance::ActionState::ACTION_REASON_UNSPECIFIED;
}

void ConvertAction(const NKikimrCms::TAction& cmsAction, Ydb::Maintenance::ActionState& actionState) {
ConvertAction(cmsAction, *actionState.mutable_action()->mutable_lock_action());
// FIXME: specify action_uid
actionState.set_status(Ydb::Maintenance::ActionState::ACTION_STATUS_PENDING);
actionState.set_reason(Ydb::Maintenance::ActionState::ACTION_REASON_UNSPECIFIED); // FIXME: specify
actionState.set_reason(ConvertReason(cmsAction.GetIssue().GetType()));
actionState.set_reason_details(cmsAction.GetIssue().GetMessage());
}

void ConvertActionUid(const TString& taskUid, const TString& permissionId,
Expand Down
7 changes: 0 additions & 7 deletions ydb/core/cms/cluster_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,6 @@ using TClusterInfoPtr = TIntrusivePtr<TClusterInfo>;
struct TCmsState;
using TCmsStatePtr = TIntrusivePtr<TCmsState>;

struct TErrorInfo {
NKikimrCms::TStatus::ECode Code = NKikimrCms::TStatus::ALLOW;
TString Reason;
TInstant Deadline;
ui64 RollbackPoint = 0;
};

/**
* Structure to hold info about issued permission. A set of
* all issued permissions is a part of CMS persistent state.
Expand Down
111 changes: 91 additions & 20 deletions ydb/core/cms/cms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,38 @@ namespace NKikimr::NCms {
using namespace NNodeWhiteboard;
using namespace NKikimrCms;

namespace {

constexpr size_t MAX_ISSUES_TO_STORE = 100;

TAction::TIssue ConvertIssue(const TReason& reason) {
TAction::TIssue issue;
switch (reason.GetType()) {
case TReason::EType::Generic:
issue.SetType(TAction::TIssue::GENERIC);
break;
case TReason::EType::TooManyUnavailableVDisks:
issue.SetType(TAction::TIssue::TOO_MANY_UNAVAILABLE_VDISKS);
break;
case TReason::EType::TooManyUnavailableStateStorageRings:
issue.SetType(TAction::TIssue::TOO_MANY_UNAVAILABLE_STATE_STORAGE_RINGS);
break;
case TReason::EType::DisabledNodesLimitReached:
issue.SetType(TAction::TIssue::DISABLED_NODES_LIMIT_REACHED);
break;
case TReason::EType::TenantDisabledNodesLimitReached:
issue.SetType(TAction::TIssue::TENANT_DISABLED_NODES_LIMIT_REACHED);
break;
case TReason::EType::SysTabletsNodeLimitReached:
issue.SetType(TAction::TIssue::SYS_TABLETS_NODE_LIMIT_REACHED);
break;
}
issue.SetMessage(reason.GetMessage());
return issue;
}

} // anonymous namespace

void TCms::DefaultSignalTabletActive(const TActorContext &)
{
// must be empty
Expand Down Expand Up @@ -326,6 +358,8 @@ bool TCms::CheckPermissionRequest(const TPermissionRequest &request,
};

auto point = ClusterInfo->PushRollbackPoint();
size_t storedIssues = 0;
size_t processedActions = 0;
for (const auto &action : request.GetActions()) {
TDuration permissionDuration = State->Config.DefaultPermissionDuration;
if (request.HasDuration())
Expand All @@ -352,28 +386,40 @@ bool TCms::CheckPermissionRequest(const TPermissionRequest &request,

auto *permission = response.AddPermissions();
permission->MutableAction()->CopyFrom(action);
permission->MutableAction()->ClearIssue();
permission->SetDeadline(error.Deadline.GetValue());
AddPermissionExtensions(action, *permission);

ClusterInfo->AddTempLocks(action, &ctx);
} else {
LOG_DEBUG(ctx, NKikimrServices::CMS, "Result: %s (reason: %s)",
ToString(error.Code).data(), error.Reason.data());
ToString(error.Code).data(), error.Reason.GetMessage().data());

if (CodesRate[response.GetStatus().GetCode()] > CodesRate[error.Code]) {
response.MutableStatus()->SetCode(error.Code);
response.MutableStatus()->SetReason(error.Reason);
response.MutableStatus()->SetReason(error.Reason.GetMessage());
if (error.Code == TStatus::DISALLOW_TEMP
|| error.Code == TStatus::ERROR_TEMP)
response.SetDeadline(error.Deadline.GetValue());
}

if (schedule) {
auto *scheduledAction = scheduled.AddActions();
scheduledAction->CopyFrom(action);

// Limit stored issues to avoid overloading the local database
if (storedIssues < MAX_ISSUES_TO_STORE) {
*scheduledAction->MutableIssue() = ConvertIssue(error.Reason);
++storedIssues;
} else {
scheduledAction->ClearIssue();
}
}

if (!allowPartial)
break;

if (schedule)
scheduled.AddActions()->CopyFrom(action);
}
++processedActions;
}
ClusterInfo->RollbackLocks(point);

Expand All @@ -396,9 +442,21 @@ bool TCms::CheckPermissionRequest(const TPermissionRequest &request,
if (schedule && response.GetStatus().GetCode() != TStatus::ALLOW_PARTIAL) {
if (response.GetStatus().GetCode() == TStatus::DISALLOW_TEMP
|| response.GetStatus().GetCode() == TStatus::ERROR_TEMP)
scheduled.MutableActions()->CopyFrom(request.GetActions());
else
{
if (!allowPartial) {
// Only the first problem action was scheduled during
// the actions check loop. Merge it with rest actions.
Y_ABORT_UNLESS(scheduled.ActionsSize() == 1);
TAction::TIssue issue = std::move(*scheduled.MutableActions()->begin()->MutableIssue());
scheduled.MutableActions()->CopyFrom(request.GetActions());
for (auto &action : *scheduled.MutableActions()) {
action.ClearIssue();
}
*scheduled.MutableActions(processedActions)->MutableIssue() = std::move(issue);
}
} else {
scheduled.ClearActions();
}
}

return response.GetStatus().GetCode() == TStatus::ALLOW
Expand Down Expand Up @@ -707,26 +765,32 @@ bool TCms::TryToLockStateStorageReplica(const TAction& action,
case MODE_MAX_AVAILABILITY:
if (restartRings + lockedRings > 1) {
error.Code = TStatus::DISALLOW_TEMP;
error.Reason = TStringBuilder() << "Too many unavailable state storage rings"
<< ". Restarting rings: "
<< (currentRingState == TStateStorageRingInfo::Restart ? restartRings : restartRings - 1)
<< ". Temporary (for a 2 minutes) locked rings: "
<< (currentRingState == TStateStorageRingInfo::Locked ? lockedRings + 1 : lockedRings)
<< ". Maximum allowed number of unavailable rings for this mode: " << 1;
error.Reason = TReason(
TStringBuilder() << "Too many unavailable state storage rings"
<< ". Restarting rings: "
<< (currentRingState == TStateStorageRingInfo::Restart ? restartRings : restartRings - 1)
<< ". Temporary (for a 2 minutes) locked rings: "
<< (currentRingState == TStateStorageRingInfo::Locked ? lockedRings + 1 : lockedRings)
<< ". Maximum allowed number of unavailable rings for this mode: " << 1,
TReason::EType::TooManyUnavailableStateStorageRings
);
error.Deadline = defaultDeadline;
return false;
}
break;
case MODE_KEEP_AVAILABLE:
if (restartRings + lockedRings + disabledRings > (nToSelect - 1) / 2) {
error.Code = TStatus::DISALLOW_TEMP;
error.Reason = TStringBuilder() << "Too many unavailable state storage rings"
<< ". Restarting rings: "
<< (currentRingState == TStateStorageRingInfo::Restart ? restartRings : restartRings - 1)
<< ". Temporary (for a 2 minutes) locked rings: "
<< (currentRingState == TStateStorageRingInfo::Locked ? lockedRings + 1 : lockedRings)
<< ". Disabled rings: " << disabledRings
<< ". Maximum allowed number of unavailable rings for this mode: " << (nToSelect - 1) / 2;
error.Reason = TReason(
TStringBuilder() << "Too many unavailable state storage rings"
<< ". Restarting rings: "
<< (currentRingState == TStateStorageRingInfo::Restart ? restartRings : restartRings - 1)
<< ". Temporary (for a 2 minutes) locked rings: "
<< (currentRingState == TStateStorageRingInfo::Locked ? lockedRings + 1 : lockedRings)
<< ". Disabled rings: " << disabledRings
<< ". Maximum allowed number of unavailable rings for this mode: " << (nToSelect - 1) / 2,
TReason::EType::TooManyUnavailableStateStorageRings
);
error.Deadline = defaultDeadline;
return false;
}
Expand Down Expand Up @@ -1490,6 +1554,13 @@ void TCms::CheckAndEnqueueRequest(TEvCms::TEvPermissionRequest::TPtr &ev, const
ev, TStatus::WRONG_REQUEST, "Priority value is out of range", ctx);
}

for (const auto &action : rec.GetActions()) {
if (action.HasIssue()) {
return ReplyWithError<TEvCms::TEvPermissionResponse>(
ev, TStatus::WRONG_REQUEST, TStringBuilder() << "Action issue is read-only", ctx);
}
}

EnqueueRequest(ev.Release(), ctx);
}

Expand Down
26 changes: 26 additions & 0 deletions ydb/core/cms/cms_maintenance_api_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,32 @@ Y_UNIT_TEST_SUITE(TMaintenanceApiTest) {
)
);
}

Y_UNIT_TEST(ActionReason) {
TCmsTestEnv env(8);

auto response = env.CheckMaintenanceTaskCreate("task-1", Ydb::StatusIds::SUCCESS,
MakeActionGroup(
MakeLockAction(env.GetNodeId(0), TDuration::Minutes(10))
),
MakeActionGroup(
MakeLockAction(env.GetNodeId(1), TDuration::Minutes(10))
)
);

UNIT_ASSERT_VALUES_EQUAL(response.action_group_states().size(), 2);
UNIT_ASSERT_VALUES_EQUAL(response.action_group_states(0).action_states().size(), 1);
const auto &a1 = response.action_group_states(0).action_states(0);
UNIT_ASSERT_VALUES_EQUAL(a1.status(), ActionState::ACTION_STATUS_PERFORMED);
UNIT_ASSERT_VALUES_EQUAL(a1.reason(), ActionState::ACTION_REASON_OK);
UNIT_ASSERT(a1.reason_details().empty());

UNIT_ASSERT_VALUES_EQUAL(response.action_group_states(1).action_states().size(), 1);
const auto &a2 = response.action_group_states(1).action_states(0);
UNIT_ASSERT_VALUES_EQUAL(a2.status(), ActionState::ACTION_STATUS_PENDING);
UNIT_ASSERT_VALUES_EQUAL(a2.reason(), ActionState::ACTION_REASON_TOO_MANY_UNAVAILABLE_VDISKS);
UNIT_ASSERT(a2.reason_details().Contains("too many unavailable vdisks"));
}
}

} // namespace NKikimr::NCmsTest
98 changes: 98 additions & 0 deletions ydb/core/cms/cms_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -631,6 +631,104 @@ Y_UNIT_TEST_SUITE(TCmsTest) {
env.CheckListRequests("user1", 0);
}

Y_UNIT_TEST(ActionIssue)
{
TCmsTestEnv env(16);

// Acquire lock on one node
auto rec = env.CheckPermissionRequest
("user", false, false, true, true, TStatus::ALLOW,
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(0), 60000000));
UNIT_ASSERT_VALUES_EQUAL(rec.PermissionsSize(), 1);
UNIT_ASSERT(!rec.GetPermissions(0).GetAction().HasIssue());

auto pid = rec.GetPermissions(0).GetId();

// Schedule request
rec = env.CheckPermissionRequest
("user", false, false, true, true, TStatus::DISALLOW_TEMP,
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(9), 60000000),
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(1), 60000000));
UNIT_ASSERT_VALUES_EQUAL(rec.PermissionsSize(), 0);

auto rid = rec.GetRequestId();

// Get scheduled request
auto scheduledRec = env.CheckGetRequest("user", rid);
UNIT_ASSERT_VALUES_EQUAL(scheduledRec.RequestsSize(), 1);
UNIT_ASSERT_VALUES_EQUAL(scheduledRec.GetRequests(0).ActionsSize(), 2);
auto action1 = scheduledRec.GetRequests(0).GetActions(0);
UNIT_ASSERT(!action1.HasIssue());
auto action2 = scheduledRec.GetRequests(0).GetActions(1);
UNIT_ASSERT(action2.HasIssue());
UNIT_ASSERT_VALUES_EQUAL(action2.GetIssue().GetType(), TAction::TIssue::TOO_MANY_UNAVAILABLE_VDISKS);

// Try to check request
env.CheckRequest("user", rid, false, TStatus::DISALLOW_TEMP);

// Get scheduled request
scheduledRec = env.CheckGetRequest("user", rid);
UNIT_ASSERT_VALUES_EQUAL(scheduledRec.RequestsSize(), 1);
UNIT_ASSERT_VALUES_EQUAL(scheduledRec.GetRequests(0).ActionsSize(), 2);
action1 = scheduledRec.GetRequests(0).GetActions(0);
UNIT_ASSERT(!action1.HasIssue());
action2 = scheduledRec.GetRequests(0).GetActions(1);
UNIT_ASSERT(action2.HasIssue());
UNIT_ASSERT_VALUES_EQUAL(action2.GetIssue().GetType(), TAction::TIssue::TOO_MANY_UNAVAILABLE_VDISKS);

// Done with permission
env.CheckDonePermission("user", pid);

// Try to check request
rec = env.CheckRequest("user", rid, false, TStatus::ALLOW, 2);
UNIT_ASSERT(!rec.GetPermissions(0).GetAction().HasIssue());
UNIT_ASSERT(!rec.GetPermissions(1).GetAction().HasIssue());

env.CheckGetRequest("user", rid, false, TStatus::WRONG_REQUEST);
}

Y_UNIT_TEST(ActionIssuePartialPermissions)
{
TCmsTestEnv env(8);

// Schedule request
auto rec = env.CheckPermissionRequest
("user", true, false, true, true, TStatus::ALLOW_PARTIAL,
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(0), 60000000),
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(1), 60000000));
UNIT_ASSERT_VALUES_EQUAL(rec.PermissionsSize(), 1);
UNIT_ASSERT(!rec.GetPermissions(0).GetAction().HasIssue());

auto pid = rec.GetPermissions(0).GetId();
auto rid = rec.GetRequestId();

// Get scheduled request
auto scheduledRec = env.CheckGetRequest("user", rid);
UNIT_ASSERT_VALUES_EQUAL(scheduledRec.RequestsSize(), 1);
UNIT_ASSERT_VALUES_EQUAL(scheduledRec.GetRequests(0).ActionsSize(), 1);
auto action = scheduledRec.GetRequests(0).GetActions(0);
UNIT_ASSERT_VALUES_EQUAL(action.GetIssue().GetType(), TAction::TIssue::TOO_MANY_UNAVAILABLE_VDISKS);

// Try to check request
env.CheckRequest("user", rid, false, TStatus::DISALLOW_TEMP);

// Get scheduled request
scheduledRec = env.CheckGetRequest("user", rid);
UNIT_ASSERT_VALUES_EQUAL(scheduledRec.RequestsSize(), 1);
UNIT_ASSERT_VALUES_EQUAL(scheduledRec.GetRequests(0).ActionsSize(), 1);
action = scheduledRec.GetRequests(0).GetActions(0);
UNIT_ASSERT_VALUES_EQUAL(action.GetIssue().GetType(), TAction::TIssue::TOO_MANY_UNAVAILABLE_VDISKS);

// Done with permission
env.CheckDonePermission("user", pid);

// Try to check request
rec = env.CheckRequest("user", rid, false, TStatus::ALLOW, 1);
UNIT_ASSERT(!rec.GetPermissions(0).GetAction().HasIssue());

env.CheckGetRequest("user", rid, false, TStatus::WRONG_REQUEST);
}

Y_UNIT_TEST(WalleTasks)
{
TCmsTestEnv env(24, 4);
Expand Down
Loading

0 comments on commit d765169

Please sign in to comment.