Skip to content

Commit

Permalink
return up-to-date node stats KIKIMR-20697 (#910)
Browse files Browse the repository at this point in the history
  • Loading branch information
vporyadke authored Jan 11, 2024
1 parent 5c82488 commit d95e3ab
Show file tree
Hide file tree
Showing 6 changed files with 24 additions and 17 deletions.
22 changes: 14 additions & 8 deletions ydb/core/mind/hive/hive_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1782,6 +1782,8 @@ bool THive::IsTabletMoveExpedient(const TTabletInfo& tablet, const TNodeInfo& no

void THive::FillTabletInfo(NKikimrHive::TEvResponseHiveInfo& response, ui64 tabletId, const TLeaderTabletInfo *info, const NKikimrHive::TEvRequestHiveInfo &req) {
if (info) {
TInstant now = TActivationContext::Now();
TInstant restartsBarrierTime = now - GetTabletRestartsPeriod();
auto& tabletInfo = *response.AddTablets();
tabletInfo.SetTabletID(tabletId);
tabletInfo.SetTabletType(info->Type);
Expand All @@ -1800,7 +1802,7 @@ void THive::FillTabletInfo(NKikimrHive::TEvResponseHiveInfo& response, ui64 tabl
if (!info->IsRunning()) {
tabletInfo.SetLastAliveTimestamp(info->Statistics.GetLastAliveTimestamp());
}
tabletInfo.SetRestartsPerPeriod(info->Statistics.RestartTimestampSize());
tabletInfo.SetRestartsPerPeriod(info->GetRestartsPerPeriod(restartsBarrierTime));
if (req.GetReturnMetrics()) {
tabletInfo.MutableMetrics()->CopyFrom(info->GetResourceValues());
}
Expand Down Expand Up @@ -1831,7 +1833,7 @@ void THive::FillTabletInfo(NKikimrHive::TEvResponseHiveInfo& response, ui64 tabl
if (!follower.IsRunning()) {
tabletInfo.SetLastAliveTimestamp(follower.Statistics.GetLastAliveTimestamp());
}
tabletInfo.SetRestartsPerPeriod(follower.Statistics.RestartTimestampSize());
tabletInfo.SetRestartsPerPeriod(follower.GetRestartsPerPeriod(restartsBarrierTime));
if (req.GetReturnMetrics()) {
tabletInfo.MutableMetrics()->CopyFrom(follower.GetResourceValues());
}
Expand All @@ -1843,16 +1845,14 @@ void THive::FillTabletInfo(NKikimrHive::TEvResponseHiveInfo& response, ui64 tabl
void THive::Handle(TEvHive::TEvRequestHiveInfo::TPtr& ev) {
const auto& record = ev->Get()->Record;
TAutoPtr<TEvHive::TEvResponseHiveInfo> response = new TEvHive::TEvResponseHiveInfo();
TInstant now = TlsActivationContext->Now();
if (record.HasTabletID()) {
TTabletId tabletId = record.GetTabletID();
NKikimrHive::TForwardRequest forwardRequest;
if (CheckForForwardTabletRequest(tabletId, forwardRequest)) {
response->Record.MutableForwardRequest()->CopyFrom(forwardRequest);
}
TLeaderTabletInfo* tablet = FindTablet(tabletId);
const TLeaderTabletInfo* tablet = FindTablet(tabletId);
if (tablet) {
tablet->ActualizeTabletStatistics(now);
FillTabletInfo(response->Record, record.GetTabletID(), tablet, record);
} else {
BLOG_W("Can't find the tablet from RequestHiveInfo(TabletID=" << tabletId << ")");
Expand All @@ -1866,7 +1866,6 @@ void THive::Handle(TEvHive::TEvRequestHiveInfo::TPtr& ev) {
if (it->second.IsDeleting()) {
continue;
}
it->second.ActualizeTabletStatistics(now);
FillTabletInfo(response->Record, it->first, &it->second, record);
}
response->Record.set_starttimetimestamp(StartTime().MilliSeconds());
Expand Down Expand Up @@ -1955,13 +1954,15 @@ void THive::Handle(TEvHive::TEvRequestHiveDomainStats::TPtr& ev) {

void THive::Handle(TEvHive::TEvRequestHiveNodeStats::TPtr& ev) {
const auto& request(ev->Get()->Record);
TInstant now = TActivationContext::Now();
TInstant restartsBarrierTime = now - GetNodeRestartWatchPeriod();
THolder<TEvHive::TEvResponseHiveNodeStats> response = MakeHolder<TEvHive::TEvResponseHiveNodeStats>();
auto& record = response->Record;
if (request.GetReturnExtendedTabletInfo()) {
record.SetExtendedTabletInfo(true);
}
for (auto it = Nodes.begin(); it != Nodes.end(); ++it) {
const TNodeInfo& node = it->second;
TNodeInfo& node = it->second;
if (node.IsUnknown()) {
continue;
}
Expand Down Expand Up @@ -2035,7 +2036,7 @@ void THive::Handle(TEvHive::TEvRequestHiveNodeStats::TPtr& ev) {
if (!node.IsAlive()) {
nodeStats.SetLastAliveTimestamp(node.Statistics.GetLastAliveTimestamp());
}
nodeStats.SetRestartsPerPeriod(node.Statistics.RestartTimestampSize());
nodeStats.SetRestartsPerPeriod(node.GetRestartsPerPeriod(restartsBarrierTime));
}
Send(ev->Sender, response.Release(), 0, ev->Cookie);
}
Expand Down Expand Up @@ -3299,6 +3300,11 @@ void THive::ActualizeRestartStatistics(google::protobuf::RepeatedField<google::p
array.erase(begin, it);
}

// Returns how many entries of restartTimestamps are greater than or equal to barrier.
// The search uses std::lower_bound, so the timestamps are expected to be stored in
// ascending order; the counted entries are the ones from the first element that is
// not less than barrier up to the end of the array.
// Nothing is modified or erased here, the array is only read.
ui64 THive::GetRestartsPerPeriod(const google::protobuf::RepeatedField<google::protobuf::uint64>& restartTimestamps, ui64 barrier) {
    const auto periodEnd = restartTimestamps.end();
    const auto periodBegin = std::lower_bound(restartTimestamps.begin(), periodEnd, barrier);
    return periodEnd - periodBegin;
}

bool THive::IsSystemTablet(TTabletTypes::EType type) {
switch (type) {
case TTabletTypes::Coordinator:
Expand Down
3 changes: 2 additions & 1 deletion ydb/core/mind/hive/hive_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -665,7 +665,7 @@ class THive : public TActor<THive>, public TTabletExecutedFlat, public THiveShar
const NKikimrTabletBase::TMetrics& after,
NKikimr::NHive::TResourceRawValues deltaRaw,
NKikimr::NHive::TResourceNormalizedValues deltaNormalized);
static void FillTabletInfo(NKikimrHive::TEvResponseHiveInfo& response, ui64 tabletId, const TLeaderTabletInfo* info, const NKikimrHive::TEvRequestHiveInfo& req);
void FillTabletInfo(NKikimrHive::TEvResponseHiveInfo& response, ui64 tabletId, const TLeaderTabletInfo* info, const NKikimrHive::TEvRequestHiveInfo& req);
void ExecuteStartTablet(TFullTabletId tabletId, const TActorId& local, ui64 cookie, bool external);
ui32 GetDataCenters();
ui32 GetRegisteredDataCenters();
Expand Down Expand Up @@ -914,6 +914,7 @@ class THive : public TActor<THive>, public TTabletExecutedFlat, public THiveShar
}

static void ActualizeRestartStatistics(google::protobuf::RepeatedField<google::protobuf::uint64>& restartTimestamps, ui64 barrier);
static ui64 GetRestartsPerPeriod(const google::protobuf::RepeatedField<google::protobuf::uint64>& restartTimestamps, ui64 barrier);
static bool IsSystemTablet(TTabletTypes::EType type);

protected:
Expand Down
4 changes: 4 additions & 0 deletions ydb/core/mind/hive/node_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,10 @@ void TNodeInfo::ActualizeNodeStatistics(TInstant now) {
Hive.ActualizeRestartStatistics(*Statistics.MutableRestartTimestamp(), barierTime.MilliSeconds());
}

// Counts restarts of this node relative to the given barrier.
// Delegates to Hive.GetRestartsPerPeriod, passing the restart timestamps kept in
// Statistics and the barrier converted to milliseconds.
ui64 TNodeInfo::GetRestartsPerPeriod(TInstant barrier) const {
    const ui64 barrierMs = barrier.MilliSeconds();
    const auto& restartTimestamps = Statistics.GetRestartTimestamp();
    return Hive.GetRestartsPerPeriod(restartTimestamps, barrierMs);
}

TString TNodeInfo::GetLogPrefix() const {
return Hive.GetLogPrefix();
}
Expand Down
1 change: 1 addition & 0 deletions ydb/core/mind/hive/node_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,7 @@ struct TNodeInfo {

void UpdateResourceTotalUsage(const NKikimrHive::TEvTabletMetrics& metrics);
void ActualizeNodeStatistics(TInstant now);
ui64 GetRestartsPerPeriod(TInstant barrier) const;

TDataCenterId GetDataCenter() const {
return Location.GetDataCenterId();
Expand Down
9 changes: 2 additions & 7 deletions ydb/core/mind/hive/tablet_info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -496,13 +496,8 @@ void TTabletInfo::ActualizeTabletStatistics(TInstant now) {
Hive.ActualizeRestartStatistics(*Statistics.MutableRestartTimestamp(), barierTime.MilliSeconds());
}

ui64 TTabletInfo::GetRestartsPerPeriod(TInstant barrier) {
const auto& array(Statistics.GetRestartTimestamp());
ui64 restarts = 0;
for (auto itRestart = array.rbegin(); (itRestart != array.rend()) && (TInstant::MilliSeconds(*itRestart) >= barrier); ++itRestart) {
++restarts;
}
return restarts;
// Counts restarts of this tablet relative to the given barrier.
// Delegates to Hive.GetRestartsPerPeriod, passing the restart timestamps kept in
// Statistics and the barrier converted to milliseconds.
ui64 TTabletInfo::GetRestartsPerPeriod(TInstant barrier) const {
    const ui64 barrierMs = barrier.MilliSeconds();
    const auto& restartTimestamps = Statistics.GetRestartTimestamp();
    return Hive.GetRestartsPerPeriod(restartTimestamps, barrierMs);
}

bool TTabletInfo::RestartsOften() const {
Expand Down
2 changes: 1 addition & 1 deletion ydb/core/mind/hive/tablet_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,7 @@ struct TTabletInfo {
}

void ActualizeTabletStatistics(TInstant now);
ui64 GetRestartsPerPeriod(TInstant barrier);
ui64 GetRestartsPerPeriod(TInstant barrier) const;
bool RestartsOften() const;

bool HasCounter() {
Expand Down

0 comments on commit d95e3ab

Please sign in to comment.