From c35f5ab707357310d1e64368a33e125671ee0cd0 Mon Sep 17 00:00:00 2001
From: James Forcier
Date: Thu, 22 Aug 2019 20:47:44 +0000
Subject: [PATCH] upstream: Add ability to disable host selection during panic

Previously, when in a panic state, requests would be routed to all hosts.
In some cases it is instead preferable to not route any requests. Add a
configuration option for zone-aware load balancers which switches the panic
behavior from routing to all hosts to routing to no hosts.

Closes #7550.

Signed-off-by: James Forcier
---
 api/envoy/api/v2/cds.proto                    |  4 ++
 .../load_balancing/panic_threshold.rst        | 12 ++--
 docs/root/intro/version_history.rst           |  1 +
 source/common/upstream/load_balancer_impl.cc  | 23 ++++++--
 source/common/upstream/load_balancer_impl.h   |  7 ++-
 .../upstream/load_balancer_impl_test.cc       | 59 +++++++++++++++++++
 6 files changed, 96 insertions(+), 10 deletions(-)

diff --git a/api/envoy/api/v2/cds.proto b/api/envoy/api/v2/cds.proto
index e7df4a940bc6..d1a3af0af9ba 100644
--- a/api/envoy/api/v2/cds.proto
+++ b/api/envoy/api/v2/cds.proto
@@ -539,6 +539,10 @@ message Cluster {
     // * :ref:`runtime values `.
     // * :ref:`Zone aware routing support `.
     google.protobuf.UInt64Value min_cluster_size = 2;
+
+    // If set to true, Envoy will not consider any hosts when the cluster is in panic mode.
+    // Instead, the cluster will fail all requests as if all hosts are unhealthy.
+    bool disable_cluster_on_panic = 3;
   }

   // Configuration for :ref:`locality weighted load balancing
   // `
diff --git a/docs/root/intro/arch_overview/upstream/load_balancing/panic_threshold.rst b/docs/root/intro/arch_overview/upstream/load_balancing/panic_threshold.rst
index e24022a8f07b..3c71b3a788c4 100644
--- a/docs/root/intro/arch_overview/upstream/load_balancing/panic_threshold.rst
+++ b/docs/root/intro/arch_overview/upstream/load_balancing/panic_threshold.rst
@@ -5,8 +5,8 @@ Panic threshold
 During load balancing, Envoy will generally only consider available (healthy or degraded) hosts in
 an upstream cluster. However, if the percentage of available hosts in the cluster becomes too low,
-Envoy will disregard health status and balance amongst all hosts. This is known as the *panic
-threshold*. The default panic threshold is 50%. This is
+Envoy will disregard health status and either balance amongst all hosts or route to no hosts. This
+is known as the *panic threshold*. The default panic threshold is 50%. This is
 :ref:`configurable ` via runtime as well as in the
 :ref:`cluster configuration `.
 The panic threshold is used to avoid a situation in which host failures cascade throughout the
@@ -20,8 +20,12 @@ disregards panic thresholds and continues to distribute traffic load across prio
 the algorithm described :ref:`here `. However, when normalized total availability
 drops below 100%, Envoy assumes that there are not enough available hosts across all priority
 levels. It continues to distribute traffic load across priorities,
-but if a given priority level's availability is below the panic threshold, traffic will go to all hosts
-in that priority level regardless of their availability.
+but if a given priority level's availability is below the panic threshold, traffic will go to all
+(or no) hosts in that priority level regardless of their availability.
+
+There are two modes Envoy can choose from when in a panic state: traffic is either sent to all
+hosts, or sent to no hosts (and therefore always fails). The mode is selected in the
+:ref:`cluster configuration `.
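+
+For example, a cluster that should fail requests during panic rather than route to unavailable
+hosts might be configured along these lines (a minimal sketch showing only the panic-related part
+of a cluster definition; the new field lives in the cluster's ``common_lb_config``):
+
+.. code-block:: yaml
+
+  common_lb_config:
+    zone_aware_lb_config:
+      disable_cluster_on_panic: true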
 The following examples explain the relationship between normalized total availability and panic
 threshold. It is assumed that the default value of 50% is used for the panic threshold.
diff --git a/docs/root/intro/version_history.rst b/docs/root/intro/version_history.rst
index 609554b4d374..bd89d2fa6fba 100644
--- a/docs/root/intro/version_history.rst
+++ b/docs/root/intro/version_history.rst
@@ -43,6 +43,7 @@ Version history
 * upstream: added network filter chains to upstream connections, see :ref:`filters`.
 * upstream: use p2c to select hosts for least-requests load balancers if all host weights are the same, even in cases where weights are not equal to 1.
 * upstream: added :ref:`an option ` that allows draining HTTP, TCP connection pools on cluster membership change.
+* upstream: added :ref:`disable_cluster_on_panic ` to allow failing all requests to a cluster during panic state.
 * zookeeper: parse responses and emit latency stats.

 1.11.1 (August 13, 2019)
diff --git a/source/common/upstream/load_balancer_impl.cc b/source/common/upstream/load_balancer_impl.cc
index 79bf87b79c44..5bc8781b3a06 100644
--- a/source/common/upstream/load_balancer_impl.cc
+++ b/source/common/upstream/load_balancer_impl.cc
@@ -282,7 +282,8 @@ ZoneAwareLoadBalancerBase::ZoneAwareLoadBalancerBase(
       routing_enabled_(PROTOBUF_PERCENT_TO_ROUNDED_INTEGER_OR_DEFAULT(
           common_config.zone_aware_lb_config(), routing_enabled, 100, 100)),
       min_cluster_size_(PROTOBUF_GET_WRAPPED_OR_DEFAULT(common_config.zone_aware_lb_config(),
-                                                        min_cluster_size, 6U)) {
+                                                        min_cluster_size, 6U)),
+      disable_cluster_on_panic_(common_config.zone_aware_lb_config().disable_cluster_on_panic()) {
   ASSERT(!priority_set.hostSetsPerPriority().empty());
   resizePerPriorityState();
   priority_set_.addPriorityUpdateCb(
@@ -552,7 +553,11 @@ ZoneAwareLoadBalancerBase::hostSourceToUse(LoadBalancerContext* context) {
   // If the selected host set has insufficient healthy hosts, return all hosts.
   if (per_priority_panic_[hosts_source.priority_]) {
     stats_.lb_healthy_panic_.inc();
-    hosts_source.source_type_ = HostsSource::SourceType::AllHosts;
+    if (disable_cluster_on_panic_) {
+      hosts_source.source_type_ = HostsSource::SourceType::NoHosts;
+    } else {
+      hosts_source.source_type_ = HostsSource::SourceType::AllHosts;
+    }
     return hosts_source;
   }

@@ -586,9 +591,13 @@ ZoneAwareLoadBalancerBase::hostSourceToUse(LoadBalancerContext* context) {
   if (isGlobalPanic(localHostSet())) {
     stats_.lb_local_cluster_not_ok_.inc();
-    // If the local Envoy instances are in global panic, do not do locality
-    // based routing.
-    hosts_source.source_type_ = sourceType(host_availability);
+    // If the local Envoy instances are in global panic, do not do locality based routing; if the
+    // cluster is additionally configured to disable itself on panic, fail the request instead.
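+    // NoHosts resolves to a statically empty host vector, so host selection finds no candidates
+    // and chooseHost() returns nullptr, failing the request.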
+    if (disable_cluster_on_panic_) {
+      hosts_source.source_type_ = HostsSource::SourceType::NoHosts;
+    } else {
+      hosts_source.source_type_ = sourceType(host_availability);
+    }
     return hosts_source;
   }

@@ -610,6 +619,8 @@ const HostVector& ZoneAwareLoadBalancerBase::hostSourceToHosts(HostsSource hosts
     return host_set.healthyHostsPerLocality().get()[hosts_source.locality_index_];
   case HostsSource::SourceType::LocalityDegradedHosts:
     return host_set.degradedHostsPerLocality().get()[hosts_source.locality_index_];
+  case HostsSource::SourceType::NoHosts:
+    return dummy_empty_host_vector_;
   default:
     NOT_REACHED_GCOVR_EXCL_LINE;
   }
@@ -696,6 +707,8 @@ void EdfLoadBalancerBase::refresh(uint32_t priority) {
         HostsSource(priority, HostsSource::SourceType::LocalityDegradedHosts, locality_index),
         host_set->degradedHostsPerLocality().get()[locality_index]);
   }
+  add_hosts_source(HostsSource(priority, HostsSource::SourceType::NoHosts),
+                   dummy_empty_host_vector_);
 }

 HostConstSharedPtr EdfLoadBalancerBase::chooseHostOnce(LoadBalancerContext* context) {
diff --git a/source/common/upstream/load_balancer_impl.h b/source/common/upstream/load_balancer_impl.h
index 98fd0c75d7d1..6bc0fdd9074b 100644
--- a/source/common/upstream/load_balancer_impl.h
+++ b/source/common/upstream/load_balancer_impl.h
@@ -180,6 +180,8 @@ class ZoneAwareLoadBalancerBase : public LoadBalancerBase {
     LocalityHealthyHosts,
     // Degraded hosts for locality @ locality_index.
     LocalityDegradedHosts,
+    // No hosts in the host set.
+    NoHosts,
   };

   HostsSource() = default;

   HostsSource(uint32_t priority, SourceType source_type)
       : priority_(priority), source_type_(source_type) {
@@ -187,7 +189,7 @@ class ZoneAwareLoadBalancerBase : public LoadBalancerBase {
     ASSERT(source_type == SourceType::AllHosts || source_type == SourceType::HealthyHosts ||
-           source_type == SourceType::DegradedHosts);
+           source_type == SourceType::DegradedHosts || source_type == SourceType::NoHosts);
   }

   HostsSource(uint32_t priority, SourceType source_type, uint32_t locality_index)
@@ -231,6 +233,8 @@ class ZoneAwareLoadBalancerBase : public LoadBalancerBase {
    */
   const HostVector& hostSourceToHosts(HostsSource hosts_source);

+  const HostVector dummy_empty_host_vector_;
+
 private:
   enum class LocalityRoutingState {
     // Locality based routing is off.
@@ -300,6 +304,7 @@ class ZoneAwareLoadBalancerBase : public LoadBalancerBase {

   const uint32_t routing_enabled_;
   const uint64_t min_cluster_size_;
+  const bool disable_cluster_on_panic_;

   struct PerPriorityState {
     // The percent of requests which can be routed to the local locality.
diff --git a/test/common/upstream/load_balancer_impl_test.cc b/test/common/upstream/load_balancer_impl_test.cc
index e29dd9b0dcd8..b6be73e9c04f 100644
--- a/test/common/upstream/load_balancer_impl_test.cc
+++ b/test/common/upstream/load_balancer_impl_test.cc
@@ -539,6 +539,39 @@ TEST_P(FailoverTest, PriorityUpdatesWithLocalHostSet) {
   EXPECT_EQ(tertiary_host_set_.hosts_[0], lb_->chooseHost(nullptr));
 }

+// Test that extending the priority set with an existing LB causes the correct updates when the
+// cluster is configured to disable on panic.
+TEST_P(FailoverTest, PriorityUpdatesWithLocalHostSetDisableOnPanic) {
+  host_set_.hosts_ = {makeTestHost(info_, "tcp://127.0.0.1:80")};
+  failover_host_set_.hosts_ = {makeTestHost(info_, "tcp://127.0.0.1:81")};
+  common_config_.mutable_zone_aware_lb_config()->set_disable_cluster_on_panic(true);
+
+  init(false);
+  // With both the primary and failover hosts unhealthy, we should select no host.
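+  // (Without disable_cluster_on_panic, panic mode would route to all hosts instead of failing.)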
+  EXPECT_EQ(nullptr, lb_->chooseHost(nullptr));
+
+  // Update the priority set with a new priority level P=2 and ensure the host
+  // is chosen.
+  MockHostSet& tertiary_host_set_ = *priority_set_.getMockHostSet(2);
+  HostVectorSharedPtr hosts(new HostVector({makeTestHost(info_, "tcp://127.0.0.1:82")}));
+  tertiary_host_set_.hosts_ = *hosts;
+  tertiary_host_set_.healthy_hosts_ = tertiary_host_set_.hosts_;
+  HostVector add_hosts;
+  add_hosts.push_back(tertiary_host_set_.hosts_[0]);
+  tertiary_host_set_.runCallbacks(add_hosts, {});
+  EXPECT_EQ(tertiary_host_set_.hosts_[0], lb_->chooseHost(nullptr));
+
+  // Now add a healthy host in P=0 and make sure it is immediately selected.
+  host_set_.healthy_hosts_ = host_set_.hosts_;
+  host_set_.runCallbacks(add_hosts, {});
+  EXPECT_EQ(host_set_.hosts_[0], lb_->chooseHost(nullptr));
+
+  // Remove the healthy host and ensure we fail back over to tertiary_host_set_.
+  host_set_.healthy_hosts_ = {};
+  host_set_.runCallbacks({}, {});
+  EXPECT_EQ(tertiary_host_set_.hosts_[0], lb_->chooseHost(nullptr));
+}
+
 // Test extending the priority set.
 TEST_P(FailoverTest, ExtendPrioritiesUpdatingPrioritySet) {
   host_set_.hosts_ = {makeTestHost(info_, "tcp://127.0.0.1:80")};
@@ -829,6 +862,32 @@ TEST_P(RoundRobinLoadBalancerTest, MaxUnhealthyPanic) {
   EXPECT_EQ(3UL, stats_.lb_healthy_panic_.value());
 }

+// Test that no hosts are selected when disable_cluster_on_panic is enabled.
+TEST_P(RoundRobinLoadBalancerTest, MaxUnhealthyPanicDisableOnPanic) {
+  hostSet().healthy_hosts_ = {makeTestHost(info_, "tcp://127.0.0.1:80"),
+                              makeTestHost(info_, "tcp://127.0.0.1:81")};
+  hostSet().hosts_ = {
+      makeTestHost(info_, "tcp://127.0.0.1:80"), makeTestHost(info_, "tcp://127.0.0.1:81"),
+      makeTestHost(info_, "tcp://127.0.0.1:82"), makeTestHost(info_, "tcp://127.0.0.1:83"),
+      makeTestHost(info_, "tcp://127.0.0.1:84"), makeTestHost(info_, "tcp://127.0.0.1:85")};
+
+  common_config_.mutable_zone_aware_lb_config()->set_disable_cluster_on_panic(true);
+
+  init(false);
+  EXPECT_EQ(nullptr, lb_->chooseHost(nullptr));
+
+  // Bring the cluster back above the panic threshold.
+  hostSet().healthy_hosts_ = {
+      makeTestHost(info_, "tcp://127.0.0.1:80"), makeTestHost(info_, "tcp://127.0.0.1:81"),
+      makeTestHost(info_, "tcp://127.0.0.1:82"), makeTestHost(info_, "tcp://127.0.0.1:83")};
+  hostSet().runCallbacks({}, {});
+
+  EXPECT_EQ(hostSet().healthy_hosts_[0], lb_->chooseHost(nullptr));
+  EXPECT_EQ(hostSet().healthy_hosts_[1], lb_->chooseHost(nullptr));
+
+  EXPECT_EQ(1UL, stats_.lb_healthy_panic_.value());
+}
+
 // Ensure if the panic threshold is 0%, panic mode is disabled.
 TEST_P(RoundRobinLoadBalancerTest, DisablePanicMode) {
   hostSet().healthy_hosts_ = {};