From d95e3abce972f56f6c15a53a0850875672215d05 Mon Sep 17 00:00:00 2001 From: vporyadke Date: Thu, 11 Jan 2024 11:30:13 +0300 Subject: [PATCH] return up-to-date node stats KIKIMR-20697 (#910) --- ydb/core/mind/hive/hive_impl.cpp | 22 ++++++++++++++-------- ydb/core/mind/hive/hive_impl.h | 3 ++- ydb/core/mind/hive/node_info.cpp | 4 ++++ ydb/core/mind/hive/node_info.h | 1 + ydb/core/mind/hive/tablet_info.cpp | 9 ++------- ydb/core/mind/hive/tablet_info.h | 2 +- 6 files changed, 24 insertions(+), 17 deletions(-) diff --git a/ydb/core/mind/hive/hive_impl.cpp b/ydb/core/mind/hive/hive_impl.cpp index 9717f97bc260..c9152a9da64f 100644 --- a/ydb/core/mind/hive/hive_impl.cpp +++ b/ydb/core/mind/hive/hive_impl.cpp @@ -1782,6 +1782,8 @@ bool THive::IsTabletMoveExpedient(const TTabletInfo& tablet, const TNodeInfo& no void THive::FillTabletInfo(NKikimrHive::TEvResponseHiveInfo& response, ui64 tabletId, const TLeaderTabletInfo *info, const NKikimrHive::TEvRequestHiveInfo &req) { if (info) { + TInstant now = TActivationContext::Now(); + TInstant restartsBarrierTime = now - GetTabletRestartsPeriod(); auto& tabletInfo = *response.AddTablets(); tabletInfo.SetTabletID(tabletId); tabletInfo.SetTabletType(info->Type); @@ -1800,7 +1802,7 @@ void THive::FillTabletInfo(NKikimrHive::TEvResponseHiveInfo& response, ui64 tabl if (!info->IsRunning()) { tabletInfo.SetLastAliveTimestamp(info->Statistics.GetLastAliveTimestamp()); } - tabletInfo.SetRestartsPerPeriod(info->Statistics.RestartTimestampSize()); + tabletInfo.SetRestartsPerPeriod(info->GetRestartsPerPeriod(restartsBarrierTime)); if (req.GetReturnMetrics()) { tabletInfo.MutableMetrics()->CopyFrom(info->GetResourceValues()); } @@ -1831,7 +1833,7 @@ void THive::FillTabletInfo(NKikimrHive::TEvResponseHiveInfo& response, ui64 tabl if (!follower.IsRunning()) { tabletInfo.SetLastAliveTimestamp(follower.Statistics.GetLastAliveTimestamp()); } - tabletInfo.SetRestartsPerPeriod(follower.Statistics.RestartTimestampSize()); + tabletInfo.SetRestartsPerPeriod(follower.GetRestartsPerPeriod(restartsBarrierTime)); if (req.GetReturnMetrics()) { tabletInfo.MutableMetrics()->CopyFrom(follower.GetResourceValues()); } @@ -1843,16 +1845,14 @@ void THive::FillTabletInfo(NKikimrHive::TEvResponseHiveInfo& response, ui64 tabl void THive::Handle(TEvHive::TEvRequestHiveInfo::TPtr& ev) { const auto& record = ev->Get()->Record; TAutoPtr response = new TEvHive::TEvResponseHiveInfo(); - TInstant now = TlsActivationContext->Now(); if (record.HasTabletID()) { TTabletId tabletId = record.GetTabletID(); NKikimrHive::TForwardRequest forwardRequest; if (CheckForForwardTabletRequest(tabletId, forwardRequest)) { response->Record.MutableForwardRequest()->CopyFrom(forwardRequest); } - TLeaderTabletInfo* tablet = FindTablet(tabletId); + const TLeaderTabletInfo* tablet = FindTablet(tabletId); if (tablet) { - tablet->ActualizeTabletStatistics(now); FillTabletInfo(response->Record, record.GetTabletID(), tablet, record); } else { BLOG_W("Can't find the tablet from RequestHiveInfo(TabletID=" << tabletId << ")"); @@ -1866,7 +1866,6 @@ void THive::Handle(TEvHive::TEvRequestHiveInfo::TPtr& ev) { if (it->second.IsDeleting()) { continue; } - it->second.ActualizeTabletStatistics(now); FillTabletInfo(response->Record, it->first, &it->second, record); } response->Record.set_starttimetimestamp(StartTime().MilliSeconds()); @@ -1955,13 +1954,15 @@ void THive::Handle(TEvHive::TEvRequestHiveDomainStats::TPtr& ev) { void THive::Handle(TEvHive::TEvRequestHiveNodeStats::TPtr& ev) { const auto& request(ev->Get()->Record); + TInstant now = TActivationContext::Now(); + TInstant restartsBarrierTime = now - GetNodeRestartWatchPeriod(); THolder response = MakeHolder(); auto& record = response->Record; if (request.GetReturnExtendedTabletInfo()) { record.SetExtendedTabletInfo(true); } for (auto it = Nodes.begin(); it != Nodes.end(); ++it) { - const TNodeInfo& node = it->second; + TNodeInfo& node = it->second; if (node.IsUnknown()) { continue; } @@ -2035,7 +2036,7 @@ void THive::Handle(TEvHive::TEvRequestHiveNodeStats::TPtr& ev) { if (!node.IsAlive()) { nodeStats.SetLastAliveTimestamp(node.Statistics.GetLastAliveTimestamp()); } - nodeStats.SetRestartsPerPeriod(node.Statistics.RestartTimestampSize()); + nodeStats.SetRestartsPerPeriod(node.GetRestartsPerPeriod(restartsBarrierTime)); } Send(ev->Sender, response.Release(), 0, ev->Cookie); } @@ -3299,6 +3300,11 @@ void THive::ActualizeRestartStatistics(google::protobuf::RepeatedField& restartTimestamps, ui64 barrier) { + auto it = std::lower_bound(restartTimestamps.begin(), restartTimestamps.end(), barrier); + return restartTimestamps.end() - it; +} + bool THive::IsSystemTablet(TTabletTypes::EType type) { switch (type) { case TTabletTypes::Coordinator: diff --git a/ydb/core/mind/hive/hive_impl.h b/ydb/core/mind/hive/hive_impl.h index e2894adb5f78..020381fc9ac1 100644 --- a/ydb/core/mind/hive/hive_impl.h +++ b/ydb/core/mind/hive/hive_impl.h @@ -665,7 +665,7 @@ class THive : public TActor, public TTabletExecutedFlat, public THiveShar const NKikimrTabletBase::TMetrics& after, NKikimr::NHive::TResourceRawValues deltaRaw, NKikimr::NHive::TResourceNormalizedValues deltaNormalized); - static void FillTabletInfo(NKikimrHive::TEvResponseHiveInfo& response, ui64 tabletId, const TLeaderTabletInfo* info, const NKikimrHive::TEvRequestHiveInfo& req); + void FillTabletInfo(NKikimrHive::TEvResponseHiveInfo& response, ui64 tabletId, const TLeaderTabletInfo* info, const NKikimrHive::TEvRequestHiveInfo& req); void ExecuteStartTablet(TFullTabletId tabletId, const TActorId& local, ui64 cookie, bool external); ui32 GetDataCenters(); ui32 GetRegisteredDataCenters(); @@ -914,6 +914,7 @@ class THive : public TActor, public TTabletExecutedFlat, public THiveShar } static void ActualizeRestartStatistics(google::protobuf::RepeatedField& restartTimestamps, ui64 barrier); + static ui64 GetRestartsPerPeriod(const google::protobuf::RepeatedField& restartTimestamps, ui64 barrier); static bool IsSystemTablet(TTabletTypes::EType type); protected: diff --git a/ydb/core/mind/hive/node_info.cpp b/ydb/core/mind/hive/node_info.cpp index 64b3e96c3f5a..bc06531bfcfd 100644 --- a/ydb/core/mind/hive/node_info.cpp +++ b/ydb/core/mind/hive/node_info.cpp @@ -471,6 +471,10 @@ void TNodeInfo::ActualizeNodeStatistics(TInstant now) { Hive.ActualizeRestartStatistics(*Statistics.MutableRestartTimestamp(), barierTime.MilliSeconds()); } +ui64 TNodeInfo::GetRestartsPerPeriod(TInstant barrier) const { + return Hive.GetRestartsPerPeriod(Statistics.GetRestartTimestamp(), barrier.MilliSeconds()); +} + TString TNodeInfo::GetLogPrefix() const { return Hive.GetLogPrefix(); } diff --git a/ydb/core/mind/hive/node_info.h b/ydb/core/mind/hive/node_info.h index f5baa4828b1f..34571e9cd4bf 100644 --- a/ydb/core/mind/hive/node_info.h +++ b/ydb/core/mind/hive/node_info.h @@ -235,6 +235,7 @@ struct TNodeInfo { void UpdateResourceTotalUsage(const NKikimrHive::TEvTabletMetrics& metrics); void ActualizeNodeStatistics(TInstant now); + ui64 GetRestartsPerPeriod(TInstant barrier) const; TDataCenterId GetDataCenter() const { return Location.GetDataCenterId(); diff --git a/ydb/core/mind/hive/tablet_info.cpp b/ydb/core/mind/hive/tablet_info.cpp index bb998dfe473d..4598bface6ec 100644 --- a/ydb/core/mind/hive/tablet_info.cpp +++ b/ydb/core/mind/hive/tablet_info.cpp @@ -496,13 +496,8 @@ void TTabletInfo::ActualizeTabletStatistics(TInstant now) { Hive.ActualizeRestartStatistics(*Statistics.MutableRestartTimestamp(), barierTime.MilliSeconds()); } -ui64 TTabletInfo::GetRestartsPerPeriod(TInstant barrier) { - const auto& array(Statistics.GetRestartTimestamp()); - ui64 restarts = 0; - for (auto itRestart = array.rbegin(); (itRestart != array.rend()) && (TInstant::MilliSeconds(*itRestart) >= barrier); ++itRestart) { - ++restarts; - } - return restarts; +ui64 TTabletInfo::GetRestartsPerPeriod(TInstant barrier) const { + return Hive.GetRestartsPerPeriod(Statistics.GetRestartTimestamp(), barrier.MilliSeconds()); } bool TTabletInfo::RestartsOften() const { diff --git a/ydb/core/mind/hive/tablet_info.h b/ydb/core/mind/hive/tablet_info.h index 89ed766a0523..2afb5f6010a8 100644 --- a/ydb/core/mind/hive/tablet_info.h +++ b/ydb/core/mind/hive/tablet_info.h @@ -294,7 +294,7 @@ struct TTabletInfo { } void ActualizeTabletStatistics(TInstant now); - ui64 GetRestartsPerPeriod(TInstant barrier); + ui64 GetRestartsPerPeriod(TInstant barrier) const; bool RestartsOften() const; bool HasCounter() {