From f6bdcc61acccadd49ea1f4c6e3ef677692a22f11 Mon Sep 17 00:00:00 2001 From: Rohit Agrawal Date: Thu, 5 Dec 2024 23:14:26 -0800 Subject: [PATCH] health_check: add stats counters to monitor health check behavior (#37409) ## Description This PR adds stats to the health check HTTP filter. These new stats provide visibility into health check behavior including request counts, successful/failed checks, cached responses, and cluster health status. These stats help operators monitor the health checking system and diagnose issues. Here is a list of key stats added: - **request_total** (Counter) : Total number of requests that were served from this health check filter - **failed** (Counter) : Total number of health checks that failed (including failures from cluster status) - **ok** (Counter) : Total number of health checks that passed - **cached_response** (Counter) : Total number of requests that were responded to with cached health check status - **failed_cluster_not_found** (Counter) : Total number of failed health checks due to referenced cluster not being found - **failed_cluster_empty** (Counter) : Total number of failed health checks due to empty cluster membership when checking cluster health - **failed_cluster_unhealthy** (Counter) : Total number of failed health checks due to cluster falling below minimum healthy percentage threshold - **degraded** (Counter) : Total number of health check responses that reported degraded status --- **Commit Message:** health_check: add stats counters to monitor health check behavior **Additional Description:** This change improves observability of the health check filter by exposing key metrics about health check processing and cluster health state. The stats are scoped under the connection manager and follow standard Envoy stats naming conventions. 
**Risk Level: Low** **Testing:** Added unit and integration tests verifying all stats counters **Docs Changes:** Added **Release Notes:** Added --------- Signed-off-by: Rohit Agrawal --- changelogs/current.yaml | 4 + .../http/http_filters/health_check_filter.rst | 20 +++ .../filters/http/health_check/config.cc | 9 +- .../filters/http/health_check/health_check.cc | 15 +- .../filters/http/health_check/health_check.h | 40 +++++- .../health_check_integration_test.cc | 31 +++++ .../http/health_check/health_check_test.cc | 129 ++++++++++++++++-- 7 files changed, 227 insertions(+), 21 deletions(-) diff --git a/changelogs/current.yaml b/changelogs/current.yaml index b889ce7b3b61..65122b6129c6 100644 --- a/changelogs/current.yaml +++ b/changelogs/current.yaml @@ -303,6 +303,10 @@ new_features: change: | Added :ref:`attribute ` ``upstream.cx_pool_ready_duration`` to get the duration from when the upstream request was created to when the upstream connection pool is ready. +- area: health_check + change: | + Added new health check filter stats including total requests, successful/failed checks, cached responses, and + cluster health status counters. These stats help track health check behavior and cluster health state. deprecated: - area: rbac diff --git a/docs/root/configuration/http/http_filters/health_check_filter.rst b/docs/root/configuration/http/http_filters/health_check_filter.rst index a5d366631853..371dde11972f 100644 --- a/docs/root/configuration/http/http_filters/health_check_filter.rst +++ b/docs/root/configuration/http/http_filters/health_check_filter.rst @@ -16,3 +16,23 @@ Health check ` admin endpoint has been called. (The :ref:`/healthcheck/ok ` admin endpoint reverses this behavior). + +Statistics +---------- + +The health check filter outputs statistics in the ``http.<stat_prefix>.health_check.`` namespace. The +:ref:`stat prefix ` +comes from the owning HTTP connection manager. + +..
csv-table:: + :header: Name, Type, Description + :widths: 1, 1, 2 + + request_total, Counter, Total number of requests processed by this health check filter (including responses served from the cache) + failed, Counter, Total number of health checks that failed (including failures due to cluster status and responses served from the cache) + ok, Counter, Total number of health checks that passed + cached_response, Counter, Total number of requests that were responded to with cached health check status + failed_cluster_not_found, Counter, Total number of failed health checks due to referenced cluster not being found + failed_cluster_empty, Counter, Total number of failed health checks due to empty cluster membership when checking cluster health + failed_cluster_unhealthy, Counter, Total number of failed health checks due to cluster falling below minimum healthy percentage threshold + degraded, Counter, Total number of health check responses that reported degraded status diff --git a/source/extensions/filters/http/health_check/config.cc b/source/extensions/filters/http/health_check/config.cc index 937d38856f34..dcf464deb74b 100644 --- a/source/extensions/filters/http/health_check/config.cc +++ b/source/extensions/filters/http/health_check/config.cc @@ -17,9 +17,11 @@ namespace HealthCheck { Http::FilterFactoryCb HealthCheckFilterConfig::createFilterFactoryFromProtoTyped( const envoy::extensions::filters::http::health_check::v3::HealthCheck& proto_config, - const std::string&, Server::Configuration::FactoryContext& context) { + const std::string& stats_prefix, Server::Configuration::FactoryContext& context) { ASSERT(proto_config.has_pass_through_mode()); + auto stats = std::make_shared( + HealthCheckFilterStats::generateStats(stats_prefix, context.scope())); const bool pass_through_mode = proto_config.pass_through_mode().value(); const int64_t cache_time_ms = PROTOBUF_GET_MS_OR_DEFAULT(proto_config, cache_time, 0); @@ -48,10 +50,11 @@ Http::FilterFactoryCb
HealthCheckFilterConfig::createFilterFactoryFromProtoTyped } return [&context, pass_through_mode, cache_manager, header_match_data, - cluster_min_healthy_percentages](Http::FilterChainFactoryCallbacks& callbacks) -> void { + cluster_min_healthy_percentages, + stats](Http::FilterChainFactoryCallbacks& callbacks) -> void { callbacks.addStreamFilter(std::make_shared( context.serverFactoryContext(), pass_through_mode, cache_manager, header_match_data, - cluster_min_healthy_percentages)); + cluster_min_healthy_percentages, stats)); }; } diff --git a/source/extensions/filters/http/health_check/health_check.cc b/source/extensions/filters/http/health_check/health_check.cc index ded52a64eca1..3ba6c0e50c65 100644 --- a/source/extensions/filters/http/health_check/health_check.cc +++ b/source/extensions/filters/http/health_check/health_check.cc @@ -119,16 +119,19 @@ void HealthCheckFilter::onComplete() { Http::Code final_status = Http::Code::OK; const std::string* details = &RcDetails::get().HealthCheckOk; bool degraded = false; + stats_->request_total_.inc(); if (context_.healthCheckFailed()) { callbacks_->streamInfo().setResponseFlag(StreamInfo::CoreResponseFlag::FailedLocalHealthCheck); final_status = Http::Code::ServiceUnavailable; details = &RcDetails::get().HealthCheckFailed; + stats_->failed_.inc(); } else { if (cache_manager_) { const auto status_and_degraded = cache_manager_->getCachedResponse(); final_status = status_and_degraded.first; details = &RcDetails::get().HealthCheckCached; degraded = status_and_degraded.second; + stats_->cached_response_.inc(); } else if (cluster_min_healthy_percentages_ != nullptr && !cluster_min_healthy_percentages_->empty()) { // Check the status of the specified upstream cluster(s) to determine the right response. @@ -142,9 +145,10 @@ void HealthCheckFilter::onComplete() { // If the cluster does not exist at all, consider the service unhealthy. 
final_status = Http::Code::ServiceUnavailable; details = &RcDetails::get().HealthCheckNoCluster; - + stats_->failed_cluster_not_found_.inc(); break; } + const auto& endpoint_stats = cluster->info()->endpointStats(); const uint64_t membership_total = endpoint_stats.membership_total_.value(); if (membership_total == 0) { @@ -155,6 +159,7 @@ void HealthCheckFilter::onComplete() { } else { final_status = Http::Code::ServiceUnavailable; details = &RcDetails::get().HealthCheckClusterEmpty; + stats_->failed_cluster_empty_.inc(); break; } } @@ -165,6 +170,7 @@ void HealthCheckFilter::onComplete() { membership_total * min_healthy_percentage) { final_status = Http::Code::ServiceUnavailable; details = &RcDetails::get().HealthCheckClusterUnhealthy; + stats_->failed_cluster_unhealthy_.inc(); break; } } @@ -173,9 +179,16 @@ void HealthCheckFilter::onComplete() { if (!Http::CodeUtility::is2xx(enumToInt(final_status))) { callbacks_->streamInfo().setResponseFlag( StreamInfo::CoreResponseFlag::FailedLocalHealthCheck); + stats_->failed_.inc(); + } else { + stats_->ok_.inc(); } } + if (degraded) { + stats_->degraded_.inc(); + } + callbacks_->sendLocalReply( final_status, "", [degraded](auto& headers) { diff --git a/source/extensions/filters/http/health_check/health_check.h b/source/extensions/filters/http/health_check/health_check.h index 2779a4d16a52..05abfe6d076b 100644 --- a/source/extensions/filters/http/health_check/health_check.h +++ b/source/extensions/filters/http/health_check/health_check.h @@ -8,7 +8,10 @@ #include "envoy/http/codes.h" #include "envoy/http/filter.h" #include "envoy/server/filter_config.h" +#include "envoy/stats/stats.h" +#include "envoy/stats/stats_macros.h" +#include "source/common/common/assert.h" #include "source/common/http/header_utility.h" namespace Envoy { @@ -16,6 +19,33 @@ namespace Extensions { namespace HttpFilters { namespace HealthCheck { +/** + * All health check filter stats. 
@see stats_macros.h + */ +#define ALL_HEALTH_CHECK_FILTER_STATS(COUNTER) \ + COUNTER(request_total) \ + COUNTER(failed) \ + COUNTER(ok) \ + COUNTER(cached_response) \ + COUNTER(failed_cluster_not_found) \ + COUNTER(failed_cluster_empty) \ + COUNTER(failed_cluster_unhealthy) \ + COUNTER(degraded) + +/** + * Struct definition for all health check stats. @see stats_macros.h + */ +struct HealthCheckFilterStats { + ALL_HEALTH_CHECK_FILTER_STATS(GENERATE_COUNTER_STRUCT) + + static HealthCheckFilterStats generateStats(const std::string& prefix, Stats::Scope& scope) { + const std::string final_prefix = absl::StrCat(prefix, "health_check."); + return {ALL_HEALTH_CHECK_FILTER_STATS(POOL_COUNTER_PREFIX(scope, final_prefix))}; + } +}; + +using HealthCheckFilterStatsSharedPtr = std::shared_ptr; + /** * Shared cache manager used by all instances of a health check filter configuration as well as * all threads. This sets up a timer that will invalidate the cached response code and allow some @@ -48,13 +78,11 @@ class HealthCheckCacheManager { }; using HealthCheckCacheManagerSharedPtr = std::shared_ptr; - +using HeaderDataVectorSharedPtr = std::shared_ptr>; using ClusterMinHealthyPercentages = std::map; using ClusterMinHealthyPercentagesConstSharedPtr = std::shared_ptr; -using HeaderDataVectorSharedPtr = std::shared_ptr>; - /** * Health check responder filter. 
*/ @@ -63,10 +91,11 @@ class HealthCheckFilter : public Http::StreamFilter { HealthCheckFilter(Server::Configuration::ServerFactoryContext& context, bool pass_through_mode, HealthCheckCacheManagerSharedPtr cache_manager, HeaderDataVectorSharedPtr header_match_data, - ClusterMinHealthyPercentagesConstSharedPtr cluster_min_healthy_percentages) + ClusterMinHealthyPercentagesConstSharedPtr cluster_min_healthy_percentages, + HealthCheckFilterStatsSharedPtr stats) : context_(context), pass_through_mode_(pass_through_mode), cache_manager_(cache_manager), header_match_data_(std::move(header_match_data)), - cluster_min_healthy_percentages_(cluster_min_healthy_percentages) {} + cluster_min_healthy_percentages_(cluster_min_healthy_percentages), stats_(stats) {} // Http::StreamFilterBase void onDestroy() override {} @@ -108,6 +137,7 @@ class HealthCheckFilter : public Http::StreamFilter { HealthCheckCacheManagerSharedPtr cache_manager_; const HeaderDataVectorSharedPtr header_match_data_; ClusterMinHealthyPercentagesConstSharedPtr cluster_min_healthy_percentages_; + const HealthCheckFilterStatsSharedPtr stats_; }; } // namespace HealthCheck diff --git a/test/extensions/filters/http/health_check/health_check_integration_test.cc b/test/extensions/filters/http/health_check/health_check_integration_test.cc index ac500eb87b17..f098262dade6 100644 --- a/test/extensions/filters/http/health_check/health_check_integration_test.cc +++ b/test/extensions/filters/http/health_check/health_check_integration_test.cc @@ -179,6 +179,37 @@ TEST_P(HealthCheckIntegrationTest, HealthCheckWithBufferFilter) { EXPECT_EQ("200", request("http", "GET", "/healthcheck", response)); } +TEST_P(HealthCheckIntegrationTest, HealthCheckStats) { + DISABLE_IF_ADMIN_DISABLED; + initialize(); + + // Initial stats should be zero + EXPECT_EQ(0, test_server_->counter("http.config_test.health_check.request_total")->value()); + EXPECT_EQ(0, test_server_->counter("http.config_test.health_check.ok")->value()); + 
EXPECT_EQ(0, test_server_->counter("http.config_test.health_check.failed")->value()); + + // Make a health check request - should result in OK response and increment request/ok counters + BufferingStreamDecoderPtr response; + EXPECT_EQ("200", request("http", "GET", "/healthcheck", response)); + EXPECT_EQ(1, test_server_->counter("http.config_test.health_check.request_total")->value()); + EXPECT_EQ(1, test_server_->counter("http.config_test.health_check.ok")->value()); + EXPECT_EQ(0, test_server_->counter("http.config_test.health_check.failed")->value()); + + // Fail the health check and verify failed counter increments + EXPECT_EQ("200", request("admin", "POST", "/healthcheck/fail", response)); + EXPECT_EQ("503", request("http", "GET", "/healthcheck", response)); + EXPECT_EQ(2, test_server_->counter("http.config_test.health_check.request_total")->value()); + EXPECT_EQ(1, test_server_->counter("http.config_test.health_check.ok")->value()); + EXPECT_EQ(1, test_server_->counter("http.config_test.health_check.failed")->value()); + + // Restore health check and verify ok counter increments + EXPECT_EQ("200", request("admin", "POST", "/healthcheck/ok", response)); + EXPECT_EQ("200", request("http", "GET", "/healthcheck", response)); + EXPECT_EQ(3, test_server_->counter("http.config_test.health_check.request_total")->value()); + EXPECT_EQ(2, test_server_->counter("http.config_test.health_check.ok")->value()); + EXPECT_EQ(1, test_server_->counter("http.config_test.health_check.failed")->value()); +} + INSTANTIATE_TEST_SUITE_P(Protocols, HealthCheckIntegrationTest, testing::ValuesIn(HttpProtocolIntegrationTest::getProtocolTestParams( {Http::CodecType::HTTP1, Http::CodecType::HTTP2}, diff --git a/test/extensions/filters/http/health_check/health_check_test.cc b/test/extensions/filters/http/health_check/health_check_test.cc index 58eecb4a4472..bbdd13df57ab 100644 --- a/test/extensions/filters/http/health_check/health_check_test.cc +++ 
b/test/extensions/filters/http/health_check/health_check_test.cc @@ -28,6 +28,16 @@ namespace HttpFilters { namespace HealthCheck { namespace { +class MockHealthCheckCluster : public NiceMock { +public: + MockHealthCheckCluster(uint64_t membership_total, uint64_t membership_healthy, + uint64_t membership_degraded = 0) { + info()->endpointStats().membership_total_.set(membership_total); + info()->endpointStats().membership_healthy_.set(membership_healthy); + info()->endpointStats().membership_degraded_.set(membership_degraded); + } +}; + class HealthCheckFilterTest : public testing::Test { public: HealthCheckFilterTest(bool pass_through, bool caching) @@ -51,8 +61,10 @@ class HealthCheckFilterTest : public testing::Test { matcher.set_name(":path"); matcher.mutable_string_match()->set_exact("/healthcheck"); header_data_->emplace_back(Http::HeaderUtility::createHeaderData(matcher, context_)); - filter_ = std::make_unique(context_, pass_through, cache_manager_, - header_data_, cluster_min_healthy_percentages); + filter_ = std::make_unique( + context_, pass_through, cache_manager_, header_data_, cluster_min_healthy_percentages, + std::make_shared(HealthCheckFilterStats::generateStats( + "test.", *stats_store_.rootScope()))); // Pass stats instead of config filter_->setDecoderFilterCallbacks(callbacks_); } @@ -65,16 +77,7 @@ class HealthCheckFilterTest : public testing::Test { Http::TestRequestHeaderMapImpl request_headers_; Http::TestRequestHeaderMapImpl request_headers_no_hc_; HeaderDataVectorSharedPtr header_data_; - - class MockHealthCheckCluster : public NiceMock { - public: - MockHealthCheckCluster(uint64_t membership_total, uint64_t membership_healthy, - uint64_t membership_degraded = 0) { - info()->endpointStats().membership_total_.set(membership_total); - info()->endpointStats().membership_healthy_.set(membership_healthy); - info()->endpointStats().membership_degraded_.set(membership_degraded); - } - }; + Stats::TestUtil::TestStore stats_store_; }; class 
HealthCheckFilterNoPassThroughTest : public HealthCheckFilterTest { @@ -409,6 +412,108 @@ TEST_F(HealthCheckFilterCachingTest, NotHcRequest) { filter_->decodeHeaders(request_headers_no_hc_, true)); } +TEST_F(HealthCheckFilterNoPassThroughTest, HealthCheckStats) { + // Test that health check request counter increases. + EXPECT_EQ(0, stats_store_.counter("test.health_check.request_total").value()); + EXPECT_EQ(Http::FilterHeadersStatus::StopIteration, + filter_->decodeHeaders(request_headers_, true)); + EXPECT_EQ(1, stats_store_.counter("test.health_check.request_total").value()); + EXPECT_EQ(1, stats_store_.counter("test.health_check.ok").value()); + + // Test failed health check stats. + EXPECT_CALL(context_, healthCheckFailed()).WillOnce(Return(true)); + Http::TestResponseHeaderMapImpl failed_response{{":status", "503"}}; + EXPECT_CALL(callbacks_, encodeHeaders_(HeaderMapEqualRef(&failed_response), true)); + filter_->decodeHeaders(request_headers_, true); + EXPECT_EQ(1, stats_store_.counter("test.health_check.failed").value()); + + // Test healthy response stats. + EXPECT_CALL(context_, healthCheckFailed()).WillOnce(Return(false)); + Http::TestResponseHeaderMapImpl healthy_response{{":status", "200"}}; + EXPECT_CALL(callbacks_, encodeHeaders_(HeaderMapEqualRef(&healthy_response), true)); + filter_->decodeHeaders(request_headers_, true); + EXPECT_EQ(2, stats_store_.counter("test.health_check.ok").value()); +} + +TEST_F(HealthCheckFilterCachingTest, CachedResponseStats) { + EXPECT_CALL(callbacks_.stream_info_, healthCheck(true)); + EXPECT_CALL(callbacks_.active_span_, setSampled(false)); + + // Set cached response and verify stats. 
+ cache_manager_->setCachedResponse(Http::Code::ServiceUnavailable, false); + EXPECT_EQ(0, stats_store_.counter("test.health_check.cached_response").value()); + + EXPECT_EQ(Http::FilterHeadersStatus::StopIteration, + filter_->decodeHeaders(request_headers_, true)); + EXPECT_EQ(1, stats_store_.counter("test.health_check.cached_response").value()); + + EXPECT_EQ(0, stats_store_.counter("test.health_check.ok").value()); + EXPECT_EQ(1, stats_store_.counter("test.health_check.failed").value()); +} + +TEST_F(HealthCheckFilterNoPassThroughTest, ClusterHealthCheckStats) { + prepareFilter(false, ClusterMinHealthyPercentagesConstSharedPtr( + new ClusterMinHealthyPercentages{{"www1", 50.0}, {"www2", 75.0}})); + + // Test cluster not found stats. + { + EXPECT_CALL(context_, healthCheckFailed()).WillOnce(Return(false)); + EXPECT_CALL(context_.cluster_manager_, getThreadLocalCluster(Eq("www1"))) + .WillRepeatedly(Return(nullptr)); + + EXPECT_EQ(0, stats_store_.counter("test.health_check.failed_cluster_not_found").value()); + filter_->decodeHeaders(request_headers_, true); + EXPECT_EQ(1, stats_store_.counter("test.health_check.failed_cluster_not_found").value()); + } + + // Test empty cluster stats. + { + MockHealthCheckCluster cluster_empty(0, 0); + EXPECT_CALL(context_, healthCheckFailed()).WillOnce(Return(false)); + EXPECT_CALL(context_.cluster_manager_, getThreadLocalCluster(Eq("www1"))) + .WillRepeatedly(Return(&cluster_empty)); + + EXPECT_EQ(0, stats_store_.counter("test.health_check.failed_cluster_empty").value()); + filter_->decodeHeaders(request_headers_, true); + EXPECT_EQ(1, stats_store_.counter("test.health_check.failed_cluster_empty").value()); + } + + // Test unhealthy cluster stats. 
+ { + MockHealthCheckCluster cluster_unhealthy(100, 20); // Only 20% healthy, below 50% threshold + EXPECT_CALL(context_, healthCheckFailed()).WillOnce(Return(false)); + EXPECT_CALL(context_.cluster_manager_, getThreadLocalCluster(Eq("www1"))) + .WillRepeatedly(Return(&cluster_unhealthy)); + + EXPECT_EQ(0, stats_store_.counter("test.health_check.failed_cluster_unhealthy").value()); + filter_->decodeHeaders(request_headers_, true); + EXPECT_EQ(1, stats_store_.counter("test.health_check.failed_cluster_unhealthy").value()); + } +} + +TEST_F(HealthCheckFilterCachingTest, DegradedStats) { + EXPECT_CALL(callbacks_.stream_info_, healthCheck(true)); + EXPECT_CALL(callbacks_.active_span_, setSampled(false)); + + // Set up a degraded cached response. + cache_manager_->setCachedResponse(Http::Code::ServiceUnavailable, true); + + EXPECT_EQ(0, stats_store_.counter("test.health_check.degraded").value()); + + Http::TestResponseHeaderMapImpl health_check_response{{":status", "503"}, + {"x-envoy-degraded", ""}}; + EXPECT_CALL(callbacks_, encodeHeaders_(HeaderMapEqualRef(&health_check_response), true)) + .WillOnce(Invoke([&](Http::ResponseHeaderMap& headers, bool end_stream) { + filter_->encodeHeaders(headers, end_stream); + })); + + EXPECT_CALL(callbacks_.stream_info_, + setResponseFlag(StreamInfo::CoreResponseFlag::FailedLocalHealthCheck)); + + filter_->decodeHeaders(request_headers_, true); + EXPECT_EQ(1, stats_store_.counter("test.health_check.degraded").value()); +} + } // namespace } // namespace HealthCheck } // namespace HttpFilters