Skip to content

Commit

Permalink
issue-506: improve tests, report critical event when request info doe… (
Browse files Browse the repository at this point in the history
#633)

* issue-506: improve tests, report critical event when request info does not have cancel routine but CancelRequest is called, make error messages when tablet restarts the same

* align with nbs

* update

* update
  • Loading branch information
yegorskii authored and qkrorlqr committed Jul 14, 2024
1 parent 3ecd795 commit 4d0ebbf
Show file tree
Hide file tree
Showing 17 changed files with 86 additions and 23 deletions.
31 changes: 29 additions & 2 deletions cloud/filestore/libs/diagnostics/critical_events.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,46 @@ void InitCriticalEventsCounter(NMonitoring::TDynamicCountersPtr counters)
// FILESTORE_INIT_CRITICAL_EVENT_COUNTER

FILESTORE_CRITICAL_EVENTS(FILESTORE_INIT_CRITICAL_EVENT_COUNTER)
FILESTORE_IMPOSSIBLE_EVENTS(FILESTORE_INIT_CRITICAL_EVENT_COUNTER)
#undef FILESTORE_INIT_CRITICAL_EVENT_COUNTER

NCloud::InitCriticalEventsCounter(std::move(counters));
}

#define FILESTORE_DEFINE_CRITICAL_EVENT_ROUTINE(name) \
void Report##name() \
TString Report##name(const TString& message) \
{ \
ReportCriticalEvent("AppCriticalEvents/"#name, "", false); \
return ReportCriticalEvent( \
GetCriticalEventFor##name(), \
message, \
false); \
} \
\
const TString GetCriticalEventFor##name() \
{ \
return "AppCriticalEvents/"#name; \
} \
// FILESTORE_DEFINE_CRITICAL_EVENT_ROUTINE

FILESTORE_CRITICAL_EVENTS(FILESTORE_DEFINE_CRITICAL_EVENT_ROUTINE)
#undef FILESTORE_DEFINE_CRITICAL_EVENT_ROUTINE

#define FILESTORE_DEFINE_IMPOSSIBLE_EVENT_ROUTINE(name) \
TString Report##name(const TString& message) \
{ \
return ReportCriticalEvent( \
GetCriticalEventFor##name(), \
message, \
true); \
} \
\
const TString GetCriticalEventFor##name() \
{ \
return "AppCriticalEvents/"#name; \
} \
// FILESTORE_DEFINE_IMPOSSIBLE_EVENT_ROUTINE

FILESTORE_IMPOSSIBLE_EVENTS(FILESTORE_DEFINE_IMPOSSIBLE_EVENT_ROUTINE)
#undef FILESTORE_DEFINE_IMPOSSIBLE_EVENT_ROUTINE

} // namespace NCloud::NFileStore
15 changes: 14 additions & 1 deletion cloud/filestore/libs/diagnostics/critical_events.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,28 @@ namespace NCloud::NFileStore{
xxx(NodeNotFoundInFollower) \
// FILESTORE_CRITICAL_EVENTS

#define FILESTORE_IMPOSSIBLE_EVENTS(xxx) \
xxx(CancelRoutineIsNotSet) \
// FILESTORE_IMPOSSIBLE_EVENTS

////////////////////////////////////////////////////////////////////////////////

void InitCriticalEventsCounter(NMonitoring::TDynamicCountersPtr counters);

#define FILESTORE_DECLARE_CRITICAL_EVENT_ROUTINE(name) \
void Report##name(); \
TString Report##name(const TString& message = ""); \
const TString GetCriticalEventFor##name(); \
// FILESTORE_DECLARE_CRITICAL_EVENT_ROUTINE

FILESTORE_CRITICAL_EVENTS(FILESTORE_DECLARE_CRITICAL_EVENT_ROUTINE)
#undef FILESTORE_DECLARE_CRITICAL_EVENT_ROUTINE

#define FILESTORE_DECLARE_IMPOSSIBLE_EVENT_ROUTINE(name) \
TString Report##name(const TString& message = ""); \
const TString GetCriticalEventFor##name(); \
// FILESTORE_DECLARE_IMPOSSIBLE_EVENT_ROUTINE

FILESTORE_IMPOSSIBLE_EVENTS(FILESTORE_DECLARE_IMPOSSIBLE_EVENT_ROUTINE)
#undef FILESTORE_DECLARE_IMPOSSIBLE_EVENT_ROUTINE

} // namespace NCloud::NFileStore
12 changes: 11 additions & 1 deletion cloud/filestore/libs/storage/core/request_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include "public.h"

#include <cloud/filestore/libs/diagnostics/critical_events.h>
#include <cloud/filestore/libs/service/context.h>
#include <cloud/filestore/libs/storage/api/events.h>

Expand Down Expand Up @@ -48,6 +49,15 @@ struct TRequestInfo
, CallContext(std::move(callContext))
{}

void CancelRequest(const NActors::TActorContext& ctx)
{
if (!CancelRoutine) {
ReportCancelRoutineIsNotSet();
return;
};
CancelRoutine(ctx, *this);
}

void AddExecCycles(ui64 cycles)
{
AtomicAdd(ExecCycles, cycles);
Expand Down Expand Up @@ -121,7 +131,7 @@ TRequestInfoPtr CreateRequestInfo(
TRequestInfo& requestInfo)
{
auto response = std::make_unique<typename TMethod::TResponse>(
MakeError(E_REJECTED, "tablet is dead"));
MakeError(E_REJECTED, "tablet is shutting down"));

NCloud::Reply(ctx, requestInfo, std::move(response));
};
Expand Down
2 changes: 1 addition & 1 deletion cloud/filestore/libs/storage/tablet/tablet_actor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ void TIndexTabletActor::TerminateTransactions(const TActorContext& ctx)
TRequestInfo* requestInfo = ActiveTransactions.PopFront();
TABLET_VERIFY(requestInfo->RefCount() >= 1);

requestInfo->CancelRoutine(ctx, *requestInfo);
requestInfo->CancelRequest(ctx);
requestInfo->UnRef();
}
}
Expand Down
4 changes: 2 additions & 2 deletions cloud/filestore/libs/storage/tablet/tablet_actor.h
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@ class TIndexTabletActor final
TRequestInfo& requestInfo)
{
auto response = std::make_unique<typename TMethod::TResponse>(
MakeError(E_REJECTED, "tablet is dead"));
MakeError(E_REJECTED, "tablet is shutting down"));

NCloud::Reply(ctx, requestInfo, std::move(response));
};
Expand Down Expand Up @@ -321,7 +321,7 @@ class TIndexTabletActor final
TRequestInfo& requestInfo)
{
auto response = std::make_unique<typename TMethod::TResponse>(
MakeError(E_REJECTED, "request cancelled"));
MakeError(E_REJECTED, "tablet is shutting down"));

NCloud::Reply(ctx, requestInfo, std::move(response));
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ void TCleanupSessionsActor::HandlePoisonPill(
const TActorContext& ctx)
{
Y_UNUSED(ev);
ReplyAndDie(ctx, MakeError(E_REJECTED, "request cancelled"));
ReplyAndDie(ctx, MakeError(E_REJECTED, "tablet is shutting down"));
}

void TCleanupSessionsActor::ReplyAndDie(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ void TCollectGarbageActor::HandlePoisonPill(
const TActorContext& ctx)
{
Y_UNUSED(ev);
ReplyAndDie(ctx, MakeError(E_REJECTED, "request cancelled"));
ReplyAndDie(ctx, MakeError(E_REJECTED, "tablet is shutting down"));
}

void TCollectGarbageActor::HandleError(NProto::TError error)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ void TCompactionActor::HandlePoisonPill(
const TActorContext& ctx)
{
Y_UNUSED(ev);
ReplyAndDie(ctx, MakeError(E_REJECTED, "request cancelled"));
ReplyAndDie(ctx, MakeError(E_REJECTED, "tablet is shutting down"));
}

void TCompactionActor::ReplyAndDie(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ void TDestroyCheckpointActor::HandlePoisonPill(
const TActorContext& ctx)
{
Y_UNUSED(ev);
ReplyAndDie(ctx, MakeError(E_REJECTED, "request cancelled"));
ReplyAndDie(ctx, MakeError(E_REJECTED, "tablet is shutting down"));
}

void TDestroyCheckpointActor::ReplyAndDie(
Expand Down
2 changes: 1 addition & 1 deletion cloud/filestore/libs/storage/tablet/tablet_actor_flush.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ void TFlushActor::HandlePoisonPill(
const TActorContext& ctx)
{
Y_UNUSED(ev);
ReplyAndDie(ctx, MakeError(E_REJECTED, "request cancelled"));
ReplyAndDie(ctx, MakeError(E_REJECTED, "tablet is shutting down"));
}

void TFlushActor::ReplyAndDie(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -404,7 +404,7 @@ void TFlushBytesActor::HandlePoisonPill(
const TActorContext& ctx)
{
Y_UNUSED(ev);
ReplyAndDie(ctx, MakeError(E_REJECTED, "request cancelled"));
ReplyAndDie(ctx, MakeError(E_REJECTED, "tablet is shutting down"));
}

void TFlushBytesActor::ReplyAndDie(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -859,7 +859,7 @@ struct TIndexTabletMonitoringActor
const TActorContext& ctx)
{
Y_UNUSED(ev);
ReplyAndDie(ctx, MakeError(E_REJECTED, "request cancelled"), {});
ReplyAndDie(ctx, MakeError(E_REJECTED, "tablet is shutting down"), {});
}

void ReplyAndDie(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ void TReadBlobActor::HandlePoisonPill(
const TActorContext& ctx)
{
Y_UNUSED(ev);
ReplyAndDie(ctx, MakeError(E_REJECTED, "request cancelled"));
ReplyAndDie(ctx, MakeError(E_REJECTED, "tablet is shutting down"));
}

void TReadBlobActor::ReplyError(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,7 @@ void TReadDataActor::HandlePoisonPill(
const TActorContext& ctx)
{
Y_UNUSED(ev);
ReplyAndDie(ctx, MakeError(E_REJECTED, "request cancelled"));
ReplyAndDie(ctx, MakeError(E_REJECTED, "tablet is shutting down"));
}

void TReadDataActor::ReplyAndDie(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ void TWriteBatchActor::HandlePoisonPill(
const TActorContext& ctx)
{
Y_UNUSED(ev);
ReplyAndDie(ctx, MakeError(E_REJECTED, "request cancelled"));
ReplyAndDie(ctx, MakeError(E_REJECTED, "tablet is shutting down"));
}

void TWriteBatchActor::ReplyAndDie(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,7 @@ void TWriteBlobActor::HandlePoisonPill(
const TActorContext& ctx)
{
Y_UNUSED(ev);
ReplyAndDie(ctx, MakeError(E_REJECTED, "request cancelled"));
ReplyAndDie(ctx, MakeError(E_REJECTED, "tablet is shutting down"));
}

void TWriteBlobActor::ReplyError(
Expand Down
23 changes: 18 additions & 5 deletions cloud/filestore/libs/storage/tablet/tablet_ut_data.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4377,9 +4377,11 @@ Y_UNIT_TEST_SUITE(TIndexTabletTest_Data)
return false;
});

env.GetRuntime().DispatchEvents(TDispatchOptions(), TDuration::Seconds(1));

UNIT_ASSERT(putSeen);
TDispatchOptions options;
options.CustomFinalCondition = [&] {
return putSeen;
};
env.GetRuntime().DispatchEvents(options);

tablet.RebootTablet();

Expand All @@ -4388,6 +4390,10 @@ Y_UNIT_TEST_SUITE(TIndexTabletTest_Data)
E_REJECTED,
response->GetStatus(),
response->GetErrorReason());

UNIT_ASSERT_VALUES_EQUAL(
"tablet is shutting down",
response->GetErrorReason());
}

TABLET_TEST(ShouldCancelWriteRequestsIfTabletIsRebooted)
Expand Down Expand Up @@ -4440,9 +4446,12 @@ Y_UNIT_TEST_SUITE(TIndexTabletTest_Data)
return false;
});

env.GetRuntime().DispatchEvents(TDispatchOptions(), TDuration::Seconds(1));
TDispatchOptions options;
options.CustomFinalCondition = [&] {
return getSeen;
};

UNIT_ASSERT(getSeen);
env.GetRuntime().DispatchEvents(options);

failGet = false;
tablet.RebootTablet();
Expand All @@ -4452,6 +4461,10 @@ Y_UNIT_TEST_SUITE(TIndexTabletTest_Data)
E_REJECTED,
response->GetStatus(),
response->GetErrorReason());

UNIT_ASSERT_VALUES_EQUAL(
"tablet is shutting down",
response->GetErrorReason());
}

#define CHECK_GENERATED_BLOB(offset, length, expected) \
Expand Down

0 comments on commit 4d0ebbf

Please sign in to comment.