From 624cae5ac922dee96c20629d1bd35b0cc3b62e2f Mon Sep 17 00:00:00 2001 From: Dan Wang Date: Tue, 7 Feb 2023 15:45:20 +0800 Subject: [PATCH] feat(new_metrics): retire stale metric entities that are not used by any other object (#1304) --- src/utils/metrics.cpp | 191 +++++++++++++++++++++++++--- src/utils/metrics.h | 193 ++++++++++++++++++++-------- src/utils/test/metrics_test.cpp | 215 ++++++++++++++++++++++++++++++++ 3 files changed, 529 insertions(+), 70 deletions(-) diff --git a/src/utils/metrics.cpp b/src/utils/metrics.cpp index 6e83324d60..007fb2198b 100644 --- a/src/utils/metrics.cpp +++ b/src/utils/metrics.cpp @@ -17,7 +17,9 @@ #include "utils/metrics.h" +#include "runtime/api_layer1.h" #include "utils/api_utilities.h" +#include "utils/flags.h" #include "utils/rand.h" #include "utils/shared_io_service.h" #include "utils/string_conv.h" @@ -25,10 +27,15 @@ namespace dsn { +DSN_DEFINE_uint64(metrics, + entity_retirement_delay_ms, + 10 * 60 * 1000, + "The retention internal (milliseconds) for an entity after it becomes stale."); + metric_entity::metric_entity(const metric_entity_prototype *prototype, const std::string &id, const attr_map &attrs) - : _prototype(prototype), _id(id), _attrs(attrs) + : _prototype(prototype), _id(id), _attrs(attrs), _retire_time_ms(0) { } @@ -45,13 +52,10 @@ void metric_entity::close(close_option option) { utils::auto_write_lock l(_lock); - // The reason why each metric is closed in the entity rather than in the destructor of each - // metric is that close() for the metric will return immediately without waiting for any close - // operation to be finished. - // - // Thus, to close all metrics owned by an entity, it's more efficient to firstly issue a close - // request for all metrics; then, just wait for all of the close operations to be finished. - // It's inefficient to wait for each metric to be closed one by one. + // To close all metrics owned by an entity, it's more efficient to firstly issue an asynchronous + // close request to each metric; then, just wait for all of the close operations to be finished. + // It's inefficient to wait for each metric to be closed one by one. Therefore, the metric is + // not closed in its destructor. for (auto &m : _metrics) { if (m.second->prototype()->type() == metric_type::kPercentile) { auto p = down_cast(m.second.get()); @@ -177,6 +181,17 @@ void metric_entity::take_snapshot(metric_json_writer &writer, const metric_filte writer.EndObject(); } +bool metric_entity::is_stale() const +{ + // Since this entity itself is still being accessed, its reference count should be 1 + // at least. + CHECK_GE(get_count(), 1); + + // This entity is considered stale once there is only one reference for it kept in the + // registry. + return get_count() == 1; +} + void metric_filters::extract_entity_metrics(const metric_entity::metric_map &candidates, metric_entity::metric_map &target_metrics) const { @@ -312,10 +327,12 @@ metric_registry::metric_registry() : _http_service(this) { // We should ensure that metric_registry is destructed before shared_io_service is destructed. // Once shared_io_service is destructed before metric_registry is destructed, - // boost::asio::io_service needed by metrics in metric_registry such as percentile_timer will + // boost::asio::io_service needed by metrics in metric_registry such as metric_timer will // be released firstly, then will lead to heap-use-after-free error since percentiles in // metric_registry are still running but the resources they needed have been released. tools::shared_io_service::instance(); + + start_timer(); } metric_registry::~metric_registry() @@ -336,6 +353,39 @@ metric_registry::~metric_registry() for (auto &entity : _entities) { entity.second->close(metric_entity::close_option::kNoWait); } + + stop_timer(); +} + +void metric_registry::on_close() {} + +void metric_registry::start_timer() +{ + if (_timer) { + return; + } + + // Once an entity is considered stale, it will be retired after the retention interval, + // namely FLAGS_entity_retirement_delay_ms milliseconds. Therefore, if the interval of + // the timer is also set to FLAGS_entity_retirement_delay_ms, in the next round, it's + // just about time to retire this entity. + _timer.reset(new metric_timer(FLAGS_entity_retirement_delay_ms, + std::bind(&metric_registry::process_stale_entities, this), + std::bind(&metric_registry::on_close, this))); +} + +void metric_registry::stop_timer() +{ + if (!_timer) { + return; + } + + // Close the timer synchronously. + _timer->close(); + _timer->wait(); + + // Reset the timer to mark that it has been stopped, now it could be started. + _timer.reset(); } metric_registry::entity_map metric_registry::entities() const @@ -383,6 +433,111 @@ metric_entity_ptr metric_registry::find_or_create_entity(const metric_entity_pro return entity; } +metric_registry::collected_entities_info metric_registry::collect_stale_entities() const +{ + collected_entities_info collected_info; + + auto now = dsn_now_ms(); + + utils::auto_read_lock l(_lock); + + for (const auto &entity : _entities) { + if (!entity.second->is_stale()) { + if (entity.second->_retire_time_ms > 0) { + // This entity had been scheduled to be retired. However, it was reemployed + // after that. It has been in use since then, therefore its scheduled time + // for retirement should be reset to 0. + collected_info.collected_entities.insert(entity.first); + } + continue; + } + + if (entity.second->_retire_time_ms > now) { + // This entity has been scheduled to be retired, however it is still within + // the retention interval. Thus do not collect it. + ++collected_info.num_scheduled_entities; + continue; + } + + collected_info.collected_entities.insert(entity.first); + } + + collected_info.num_all_entities = _entities.size(); + return collected_info; +} + +metric_registry::retired_entities_stat +metric_registry::retire_stale_entities(const collected_entity_list &collected_entities) +{ + if (collected_entities.empty()) { + // Do not lock for empty list. + return retired_entities_stat(); + } + + retired_entities_stat retired_stat; + + auto now = dsn_now_ms(); + + utils::auto_write_lock l(_lock); + + for (const auto &collected_entity : collected_entities) { + auto iter = _entities.find(collected_entity); + if (dsn_unlikely(iter == _entities.end())) { + // The entity has been removed from the registry for some unusual reason. + continue; + } + + if (!iter->second->is_stale()) { + if (iter->second->_retire_time_ms > 0) { + // For those entities which are reemployed, their scheduled time for retirement + // should be reset to 0 though previously they could have been scheduled to be + // retired. + iter->second->_retire_time_ms = 0; + ++retired_stat.num_reemployed_entities; + } + continue; + } + + if (dsn_unlikely(iter->second->_retire_time_ms > now)) { + // Since in collect_stale_entities() we've filtered the metrics which have been + // outside the retention interval, this is unlikely to happen. However, we still + // check here. + continue; + } + + if (iter->second->_retire_time_ms == 0) { + // The entity should be marked with a scheduled time for retirement, since it has + // already been considered stale. + iter->second->_retire_time_ms = now + FLAGS_entity_retirement_delay_ms; + ++retired_stat.num_recently_scheduled_entities; + continue; + } + + // Once the entity is outside the retention interval, retire it from the registry. + _entities.erase(iter); + ++retired_stat.num_retired_entities; + } + + return retired_stat; +} + +void metric_registry::process_stale_entities() +{ + LOG_INFO("begin to process stale metric entities"); + + const auto &collected_info = collect_stale_entities(); + const auto &retired_stat = retire_stale_entities(collected_info.collected_entities); + + LOG_INFO("stat for metric entities: total={}, collected={}, retired={}, scheduled={}, " + "recently_scheduled={}, reemployed={}", + collected_info.num_all_entities, + collected_info.collected_entities.size(), + retired_stat.num_retired_entities, + collected_info.num_scheduled_entities, + retired_stat.num_recently_scheduled_entities, + retired_stat.num_reemployed_entities); +} + metric_prototype::metric_prototype(const ctor_args &args) : _args(args) {} metric_prototype::~metric_prototype() {} @@ -391,7 +546,7 @@ metric::metric(const metric_prototype *prototype) : _prototype(prototype) {} closeable_metric::closeable_metric(const metric_prototype *prototype) : metric(prototype) {} -uint64_t percentile_timer::generate_initial_delay_ms(uint64_t interval_ms) +uint64_t metric_timer::generate_initial_delay_ms(uint64_t interval_ms) { CHECK_GT(interval_ms, 0); @@ -403,7 +558,7 @@ uint64_t percentile_timer::generate_initial_delay_ms(uint64_t interval_ms) return (rand::next_u64() % interval_seconds + 1) * 1000 + rand::next_u64() % 1000; } -percentile_timer::percentile_timer(uint64_t interval_ms, on_exec_fn on_exec, on_close_fn on_close) +metric_timer::metric_timer(uint64_t interval_ms, on_exec_fn on_exec, on_close_fn on_close) : _initial_delay_ms(generate_initial_delay_ms(interval_ms)), _interval_ms(interval_ms), _on_exec(on_exec), @@ -413,10 +568,10 @@ percentile_timer::percentile_timer(uint64_t interval_ms, on_exec_fn on_exec, on_ _timer(new boost::asio::deadline_timer(tools::shared_io_service::instance().ios)) { _timer->expires_from_now(boost::posix_time::milliseconds(_initial_delay_ms)); - _timer->async_wait(std::bind(&percentile_timer::on_timer, this, std::placeholders::_1)); + _timer->async_wait(std::bind(&metric_timer::on_timer, this, std::placeholders::_1)); } -void percentile_timer::close() +void metric_timer::close() { // If the timer has already expired when cancel() is called, then the handlers for asynchronous // wait operations will: @@ -433,15 +588,15 @@ void percentile_timer::close() } } -void percentile_timer::wait() { _completed.wait(); } +void metric_timer::wait() { _completed.wait(); } -void percentile_timer::on_close() +void metric_timer::on_close() { _on_close(); _completed.notify(); } -void percentile_timer::on_timer(const boost::system::error_code &ec) +void metric_timer::on_timer(const boost::system::error_code &ec) { // This macro is defined for the case that handlers for asynchronous wait operations are no // longer cancelled. It just checks the internal state atomically (since close() can also be @@ -465,7 +620,7 @@ void percentile_timer::on_timer(const boost::system::error_code &ec) // Cancel can only be launched by close(). auto expected_state = state::kClosing; CHECK(_state.compare_exchange_strong(expected_state, state::kClosed), - "wrong state for percentile_timer: {}, while expecting closing state", + "wrong state for metric_timer: {}, while expecting closing state", static_cast(expected_state)); on_close(); @@ -477,7 +632,7 @@ void percentile_timer::on_timer(const boost::system::error_code &ec) TRY_PROCESS_TIMER_CLOSING(); _timer->expires_from_now(boost::posix_time::milliseconds(_interval_ms)); - _timer->async_wait(std::bind(&percentile_timer::on_timer, this, std::placeholders::_1)); + _timer->async_wait(std::bind(&metric_timer::on_timer, this, std::placeholders::_1)); #undef TRY_PROCESS_TIMER_CLOSING } diff --git a/src/utils/metrics.h b/src/utils/metrics.h index c7c1bda18d..340d2fd9d5 100644 --- a/src/utils/metrics.h +++ b/src/utils/metrics.h @@ -171,6 +171,7 @@ class metric_entity : public ref_counter private: friend class metric_registry; friend class ref_ptr; + friend class scoped_entity; metric_entity(const metric_entity_prototype *prototype, const std::string &id, @@ -197,6 +198,18 @@ class metric_entity : public ref_counter void encode_id(metric_json_writer &writer) const; + // Decide if an entity is stale. An entity becomes stale if it is no longer used by any other + // object. + // + // An entity could be bound to one or multiple objects. Once all of these objects are + // destroyed, this entity will become stale, which means all of the metrics held by this + // entity are also stale. + // + // For example, once a replica is removed, the replica entity (and all metrics it holds) will + // become stale; then, this entity is scheduled to be retired after a configurable retention + // interval; finally, this entity will be removed from the registry with all metrics it holds. + bool is_stale() const; + const metric_entity_prototype *const _prototype; const std::string _id; @@ -204,6 +217,12 @@ class metric_entity : public ref_counter attr_map _attrs; metric_map _metrics; + // The timestamp when this entity should be retired: + // * default value is 0, which means this entity has not been scheduled to be retired; + // * otherwise, non-zero value means this entity has been scheduled to be retired, and will + // be retired at any time once current time has reached or exceeded this timestamp. + uint64_t _retire_time_ms; + DISALLOW_COPY_AND_ASSIGN(metric_entity); }; @@ -354,10 +373,94 @@ class metrics_http_service : public http_server_base DISALLOW_COPY_AND_ASSIGN(metrics_http_service); }; +// `metric_timer` is a timer class that runs metric-related computations periodically, such as +// calculating percentile, checking if there are stale entities. It accepts `on_exec` and +// `on_close` as the callbacks for execution and close. +// +// In case that all metrics (such as percentiles) are computed at the same time and lead to very +// high load, first calculation will be delayed at a random interval. +class metric_timer +{ +public: + enum class state : int + { + kRunning, + kClosing, + kClosed, + }; + + using on_exec_fn = std::function; + using on_close_fn = std::function; + + metric_timer(uint64_t interval_ms, on_exec_fn on_exec, on_close_fn on_close); + ~metric_timer() = default; + + void close(); + void wait(); + + // Get the initial delay that is randomly generated by `generate_initial_delay_ms()`. + uint64_t get_initial_delay_ms() const { return _initial_delay_ms; } + +private: + // Generate an initial delay randomly in case that all percentiles are computed at the + // same time. + static uint64_t generate_initial_delay_ms(uint64_t interval_ms); + + void on_close(); + + void on_timer(const boost::system::error_code &ec); + + const uint64_t _initial_delay_ms; + const uint64_t _interval_ms; + const on_exec_fn _on_exec; + const on_close_fn _on_close; + std::atomic _state; + utils::notify_event _completed; + std::unique_ptr _timer; + + DISALLOW_COPY_AND_ASSIGN(metric_timer); +}; + class metric_registry : public utils::singleton { public: using entity_map = std::unordered_map; + using collected_entity_list = std::unordered_set; + + struct collected_entities_info + { + // The collected entities that will be processed by retire_stale_entities(). Following + // kinds of entities will be collected: + // * entities that should be retired immediately. The entities that are still within + // the retention interval will not be collected. + // * entities that were previously considered stale however have already been reemployed, + // which means its retirement should be cancelled by retire_stale_entities(). + collected_entity_list collected_entities; + + // The number of all entities in the registry. + size_t num_all_entities = 0; + + // The number of the entities that have been scheduled to be retired. + size_t num_scheduled_entities = 0; + + collected_entities_info() = default; + }; + + struct retired_entities_stat + { + // The number of retired entities. + size_t num_retired_entities = 0; + + // The number of entities that were recently considered stale and scheduled to be + // retired. + size_t num_recently_scheduled_entities = 0; + + // The number of the entities that had previously been scheduled to be retired and + // were recently reemployed. + size_t num_reemployed_entities = 0; + + retired_entities_stat() = default; + }; entity_map entities() const; @@ -368,19 +471,51 @@ class metric_registry : public utils::singleton friend class utils::singleton; friend void test_get_metrics_handler(const http_request &req, http_response &resp); + friend class scoped_entity; + friend class MetricsRetirementTest; metric_registry(); ~metric_registry(); + void on_close(); + + void start_timer(); + void stop_timer(); + metric_entity_ptr find_or_create_entity(const metric_entity_prototype *prototype, const std::string &id, const metric_entity::attr_map &attrs); + // These functions are used to retire stale entities. + // + // Since retirement is infrequent, there tend to be no entity that should be retired. + // Therefore, the whole retirement process is divided into two phases: "collect" and + // "retire". + // + // At the first phase "collect", we just check if there are entities that: + // * has become stale, but has not been scheduled to be retired, or + // * should be retired immediately, or + // * previously were scheduled to be retired, now has been reemployed. + // + // All operations in the first phase are read-only, needing just read lock which is more + // lightweight. If some entities were found following above conditions, albeit infrequenly, + // they would be collected to be processed at the next phase. + // + // Collected entities, if any, will be processed at the second phase "retire": + // * stale entities will be schedule to be retired; + // * the expired entities will be retired; + // * reset the retirement timestamp to 0 for reemployed entities. + collected_entities_info collect_stale_entities() const; + retired_entities_stat retire_stale_entities(const collected_entity_list &collected_entities); + void process_stale_entities(); + mutable utils::rw_lock_nr _lock; entity_map _entities; metrics_http_service _http_service; + std::unique_ptr _timer; + DISALLOW_COPY_AND_ASSIGN(metric_registry); }; @@ -593,6 +728,8 @@ class metric : public ref_counter const metric_prototype *const _prototype; private: + friend class metric_entity; + DISALLOW_COPY_AND_ASSIGN(metric); }; @@ -892,56 +1029,6 @@ inline size_t kth_percentile_to_nth_index(size_t size, kth_percentile_type type) return kth_percentile_to_nth_index(size, static_cast(type)); } -// `percentile_timer` is a timer class that encapsulates the details how each percentile is -// computed periodically. -// -// To be instantiated, it requires `interval_ms` at which a percentile is computed and `exec` -// which is used to compute percentile. -// -// In case that all percentiles are computed at the same time and lead to very high load, -// first computation for percentile will be delayed at a random interval. -class percentile_timer -{ -public: - enum class state : int - { - kRunning, - kClosing, - kClosed, - }; - - using on_exec_fn = std::function; - using on_close_fn = std::function; - - percentile_timer(uint64_t interval_ms, on_exec_fn on_exec, on_close_fn on_close); - ~percentile_timer() = default; - - void close(); - void wait(); - - // Get the initial delay that is randomly generated by `generate_initial_delay_ms()`. - uint64_t get_initial_delay_ms() const { return _initial_delay_ms; } - -private: - // Generate an initial delay randomly in case that all percentiles are computed at the - // same time. - static uint64_t generate_initial_delay_ms(uint64_t interval_ms); - - void on_close(); - - void on_timer(const boost::system::error_code &ec); - - const uint64_t _initial_delay_ms; - const uint64_t _interval_ms; - const on_exec_fn _on_exec; - const on_close_fn _on_close; - std::atomic _state; - utils::notify_event _completed; - std::unique_ptr _timer; - - DISALLOW_COPY_AND_ASSIGN(percentile_timer); -}; - // The percentile is a metric type that samples observations. The size of samples has an upper // bound. Once the maximum size is reached, the earliest observations will be overwritten. // @@ -1022,6 +1109,8 @@ class percentile : public closeable_metric // interval_ms is the interval between the computations for percentiles. Its unit is // milliseconds. It's suggested that interval_ms should be near the period between pulls // from or pushes to the monitoring system. + // TODO(wangdan): we can also support constructing percentiles from the parameters in + // the configuration file. percentile(const metric_prototype *prototype, uint64_t interval_ms = 10000, const std::set &kth_percentiles = kAllKthPercentileTypes, @@ -1066,7 +1155,7 @@ class percentile : public closeable_metric // See on_close() for details which is registered in timer and will be called // back once close() is invoked. add_ref(); - _timer.reset(new percentile_timer( + _timer.reset(new metric_timer( interval_ms, std::bind(&percentile::find_nth_elements, this), std::bind(&percentile::on_close, this))); @@ -1159,7 +1248,7 @@ class percentile : public closeable_metric std::vector> _full_nth_elements; NthElementFinder _nth_element_finder; - std::unique_ptr _timer; + std::unique_ptr _timer; DISALLOW_COPY_AND_ASSIGN(percentile); }; diff --git a/src/utils/test/metrics_test.cpp b/src/utils/test/metrics_test.cpp index 9b12a61a6e..20414a2743 100644 --- a/src/utils/test/metrics_test.cpp +++ b/src/utils/test/metrics_test.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -31,6 +32,8 @@ namespace dsn { +DSN_DECLARE_uint64(entity_retirement_delay_ms); + class my_gauge : public metric { public: @@ -2828,4 +2831,216 @@ TEST(metrics_test, http_get_metrics) } } +using surviving_metrics_case = std::tuple; + +class MetricsRetirementTest : public testing::TestWithParam +{ +public: + // For higher version of googletest, use `static void SetUpTestSuite()` instead. + static void SetUpTestCase() + { + // Restart the timer of registry with shorter interval to reduce the test time. + _reserved_entity_retirement_delay_ms = FLAGS_entity_retirement_delay_ms; + restart_metric_registry_timer(kEntityRetirementDelayMsForTest); + } + + // For higher version of googletest, use `static void TearDownTestSuite()` instead. + static void TearDownTestCase() + { + // Recover the timer of registry with the original interval. + restart_metric_registry_timer(_reserved_entity_retirement_delay_ms); + } + + static const uint64_t kEntityRetirementDelayMsForTest; + +private: + static void restart_metric_registry_timer(uint64_t interval_ms) + { + metric_registry::instance().stop_timer(); + FLAGS_entity_retirement_delay_ms = interval_ms; + metric_registry::instance().start_timer(); + + std::cout << "restart the timer of metric registry at interval " << interval_ms << " ms." + << std::endl; + } + + static uint64_t _reserved_entity_retirement_delay_ms; +}; + +const uint64_t MetricsRetirementTest::kEntityRetirementDelayMsForTest = 100; +uint64_t MetricsRetirementTest::_reserved_entity_retirement_delay_ms; + +// This class helps to test retirement of metrics and entities, by creating temporary +// variables or reference them as members of this class to control their lifetime. +class scoped_entity +{ +public: + // Use the raw pointer to hold metric without any reference which may affect the test results. + using surviving_metric_map = std::unordered_map; + + scoped_entity(const std::string &entity_id, + bool is_entity_surviving, + bool is_gauge_surviving, + bool is_counter_surviving, + bool is_percentile_surviving); + + // After a long enough time, check if temporary entity is retired with its own metrics while + // long-life one still survive. + void test_survival_after_retirement() const; + +private: + template + void instantiate_metric(const metric_entity_ptr &my_entity, + bool is_surviving, + const MetricPrototype &prototype, + MetricPtr &m) + { + // Create a temporary variable for the metric. + auto temp_m = prototype.instantiate(my_entity); + _expected_all_metrics.emplace(&prototype, temp_m.get()); + + if (!is_surviving) { + return; + } + + // Extend the lifetime of the metric since it's marked as "surviving". + m = temp_m; + } + + surviving_metric_map get_actual_surviving_metrics(const metric_entity_ptr &my_entity) const; + + // Check if the entity still survive with its own metrics no matter whether they are temporary + // or long-life. + void test_survival_immediately_after_initialization() const; + + std::string _my_entity_id; + metric_entity *_expected_my_entity_raw_ptr; + metric_entity_ptr _my_entity; + + gauge_ptr _my_gauge_int64; + counter_ptr<> _my_counter; + percentile_ptr _my_percentile_int64; + + surviving_metric_map _expected_all_metrics; + surviving_metric_map _expected_surviving_metrics; +}; + +scoped_entity::scoped_entity(const std::string &entity_id, + bool is_entity_surviving, + bool is_gauge_surviving, + bool is_counter_surviving, + bool is_percentile_surviving) + : _my_entity_id(entity_id) +{ + // Create a temporary variabl for the entity. + auto my_entity = METRIC_ENTITY_my_server.instantiate(entity_id); + _expected_my_entity_raw_ptr = my_entity.get(); + + // Create temporary or long-life variables for metrics, depending on what is_*_surviving is. + instantiate_metric( + my_entity, is_gauge_surviving, METRIC_test_server_gauge_int64, _my_gauge_int64); + instantiate_metric(my_entity, is_counter_surviving, METRIC_test_server_counter, _my_counter); + instantiate_metric(my_entity, + is_percentile_surviving, + METRIC_test_server_percentile_int64, + _my_percentile_int64); + + if (is_entity_surviving) { + // Extend the lifetime of the entity since it's marked as "surviving". + _my_entity = my_entity; + _expected_surviving_metrics = _expected_all_metrics; + } + + test_survival_immediately_after_initialization(); +} + +scoped_entity::surviving_metric_map +scoped_entity::get_actual_surviving_metrics(const metric_entity_ptr &my_entity) const +{ + surviving_metric_map actual_surviving_metrics; + + utils::auto_read_lock l(my_entity->_lock); + + // Use internal member directly instead of calling metrics(). We don't want to have + // any reference which may affect the test results. + for (const auto &m : my_entity->_metrics) { + actual_surviving_metrics.emplace(m.first, m.second.get()); + } + + return actual_surviving_metrics; +} + +void scoped_entity::test_survival_immediately_after_initialization() const +{ + utils::auto_read_lock l(metric_registry::instance()._lock); + + // Use internal member directly instead of calling entities(). We don't want to have + // any reference which may affect the test results. + const auto &entities = metric_registry::instance()._entities; + const auto &iter = entities.find(_my_entity_id); + ASSERT_NE(entities.end(), iter); + ASSERT_EQ(_expected_my_entity_raw_ptr, iter->second.get()); + + const auto &actual_surviving_metrics = get_actual_surviving_metrics(iter->second); + ASSERT_EQ(_expected_all_metrics, actual_surviving_metrics); +} + +void scoped_entity::test_survival_after_retirement() const +{ + std::this_thread::sleep_for( + std::chrono::milliseconds(MetricsRetirementTest::kEntityRetirementDelayMsForTest * 2)); + + utils::auto_read_lock l(metric_registry::instance()._lock); + + // Use internal member directly instead of calling entities(). We don't want to have + // any reference which may affect the test results. + const auto &entities = metric_registry::instance()._entities; + const auto &iter = entities.find(_my_entity_id); + if (_my_entity == nullptr) { + // The entity has been retired. + ASSERT_EQ(entities.end(), iter); + ASSERT_TRUE(_expected_surviving_metrics.empty()); + return; + } + + ASSERT_NE(entities.end(), iter); + ASSERT_EQ(_expected_my_entity_raw_ptr, iter->second.get()); + + const auto &actual_surviving_metrics = get_actual_surviving_metrics(iter->second); + ASSERT_EQ(_expected_surviving_metrics, actual_surviving_metrics); +} + +TEST_P(MetricsRetirementTest, RetireOldMetrics) +{ + std::string entity_id; + bool is_entity_surviving; + bool is_gauge_surviving; + bool is_counter_surviving; + bool is_percentile_surviving; + std::tie(entity_id, + is_entity_surviving, + is_gauge_surviving, + is_counter_surviving, + is_percentile_surviving) = GetParam(); + + scoped_entity entity(entity_id, + is_entity_surviving, + is_gauge_surviving, + is_counter_surviving, + is_percentile_surviving); + entity.test_survival_after_retirement(); +} + +const std::vector metrics_retirement_tests = { + {"server_117", true, true, true, true}, + {"server_118", true, true, true, false}, + {"server_119", true, true, false, false}, + {"server_120", true, false, false, false}, + {"server_121", false, false, false, false}, +}; + +INSTANTIATE_TEST_CASE_P(MetricsTest, + MetricsRetirementTest, + testing::ValuesIn(metrics_retirement_tests)); + } // namespace dsn