Skip to content

Commit

Permalink
upstream: force a full rebuild on host weight changes (#14569)
Browse files Browse the repository at this point in the history
This will allow us to build load balancers that pre-compute data
structures based on host weights (for example using weighted queues),
to work around some of the deficiencies of EDF scheduling.

This behavior can be temporarily disabled by setting the
envoy.reloadable_features.upstream_host_weight_change_causes_rebuild
feature flag to false.

Fixes #14360

Signed-off-by: Matt Klein <mklein@lyft.com>
  • Loading branch information
mattklein123 authored Jan 12, 2021
1 parent 0e1b388 commit 658ee30
Show file tree
Hide file tree
Showing 6 changed files with 77 additions and 8 deletions.
7 changes: 7 additions & 0 deletions docs/root/version_history/current.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,13 @@ Minor Behavior Changes
----------------------
*Changes that may cause incompatibilities for some users, but should not for most*

* upstream: host weight changes now cause a full load balancer rebuild as opposed to happening
atomically inline. This change has been made to support load balancer pre-computation of data
structures based on host weight, but may have performance implications if host weight changes
are very frequent. This change can be disabled by setting the `envoy.reloadable_features.upstream_host_weight_change_causes_rebuild`
feature flag to false. If setting this flag to false is required in a deployment please open an
issue against the project.

Bug Fixes
---------
*Changes expected to improve the state of the world and are unlikely to have negative effects*
Expand Down
1 change: 1 addition & 0 deletions source/common/runtime/runtime_features.cc
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ constexpr const char* runtime_features[] = {
"envoy.reloadable_features.stop_faking_paths",
"envoy.reloadable_features.strict_1xx_and_204_response_headers",
"envoy.reloadable_features.tls_use_io_handle_bio",
"envoy.reloadable_features.upstream_host_weight_change_causes_rebuild",
"envoy.reloadable_features.vhds_heartbeats",
"envoy.reloadable_features.unify_grpc_handling",
"envoy.restart_features.use_apple_api_for_dns_lookups",
Expand Down
11 changes: 5 additions & 6 deletions source/common/upstream/eds.cc
Original file line number Diff line number Diff line change
Expand Up @@ -247,12 +247,11 @@ bool EdsClusterImpl::updateHostsPerLocality(
// is responsible for both determining whether there was a change and to perform the actual update
// to current_hosts_copy, so it must be called even if we know that we need to update (e.g. if the
// overprovisioning factor changes).
// TODO(htuch): We eagerly update all the host sets here on weight changes, which isn't great,
// since this has the knock on effect that we rebuild the load balancers and locality scheduler.
// We could make this happen lazily, as we do for host-level weight updates, where as things age
// out of the locality scheduler, we discover their new weights. We don't currently have a shared
// object for locality weights that we can update here, we should add something like this to
// improve performance and scalability of locality weight updates.
//
// TODO(htuch): We eagerly update all the host sets here on weight changes, which may have
// performance implications, since this has the knock on effect that we rebuild the load balancers
// and locality scheduler. See the comment in BaseDynamicClusterImpl::updateDynamicHostList
// about this. In the future we may need to do better here.
const bool hosts_updated = updateDynamicHostList(new_hosts, *current_hosts_copy, hosts_added,
hosts_removed, updated_hosts, all_hosts_);
if (hosts_updated || host_set.overprovisioningFactor() != overprovisioning_factor ||
Expand Down
12 changes: 11 additions & 1 deletion source/common/upstream/upstream_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1450,6 +1450,17 @@ bool BaseDynamicClusterImpl::updateDynamicHostList(const HostVector& new_hosts,
if (host->weight() > max_host_weight) {
max_host_weight = host->weight();
}
if (existing_host->second->weight() != host->weight()) {
existing_host->second->weight(host->weight());
if (Runtime::runtimeFeatureEnabled(
"envoy.reloadable_features.upstream_host_weight_change_causes_rebuild")) {
// We do full host set rebuilds so that load balancers can do pre-computation of data
// structures based on host weight. This may become a performance problem in certain
// deployments so it is runtime feature guarded and may also need to be configurable
// and/or dynamic in the future.
hosts_changed = true;
}
}

hosts_changed |=
updateHealthFlag(*host, *existing_host->second, Host::HealthFlag::FAILED_EDS_HEALTH);
Expand Down Expand Up @@ -1485,7 +1496,6 @@ bool BaseDynamicClusterImpl::updateDynamicHostList(const HostVector& new_hosts,
hosts_added_to_current_priority.emplace_back(existing_host->second);
}

existing_host->second->weight(host->weight());
final_hosts.push_back(existing_host->second);
updated_hosts[existing_host->second->address()->asString()] = existing_host->second;
} else {
Expand Down
1 change: 1 addition & 0 deletions test/common/upstream/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ envoy_cc_test(
"//test/mocks/ssl:ssl_mocks",
"//test/mocks/upstream:cluster_manager_mocks",
"//test/mocks/upstream:health_checker_mocks",
"//test/test_common:test_runtime_lib",
"//test/test_common:utility_lib",
"@envoy_api//envoy/config/cluster/v3:pkg_cc_proto",
"@envoy_api//envoy/config/core/v3:pkg_cc_proto",
Expand Down
53 changes: 52 additions & 1 deletion test/common/upstream/eds_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include "test/mocks/ssl/mocks.h"
#include "test/mocks/upstream/cluster_manager.h"
#include "test/mocks/upstream/health_checker.h"
#include "test/test_common/test_runtime.h"
#include "test/test_common/utility.h"

#include "gmock/gmock.h"
Expand All @@ -35,7 +36,7 @@ namespace Upstream {
namespace {

class EdsTest : public testing::Test {
protected:
public:
EdsTest() : api_(Api::createApiForTest(stats_)) { resetCluster(); }

void resetCluster() {
Expand Down Expand Up @@ -324,6 +325,56 @@ TEST_F(EdsTest, EdsClusterFromFileIsPrimaryCluster) {
EXPECT_TRUE(initialized_);
}

namespace {

void endpointWeightChangeCausesRebuildTest(EdsTest& test, bool expect_rebuild) {
envoy::config::endpoint::v3::ClusterLoadAssignment cluster_load_assignment;
cluster_load_assignment.set_cluster_name("fare");
auto* endpoints = cluster_load_assignment.add_endpoints();
auto* endpoint = endpoints->add_lb_endpoints();
endpoint->mutable_endpoint()->mutable_address()->mutable_socket_address()->set_address("1.2.3.4");
endpoint->mutable_endpoint()->mutable_address()->mutable_socket_address()->set_port_value(80);
endpoint->mutable_load_balancing_weight()->set_value(30);

test.initialize();
test.doOnConfigUpdateVerifyNoThrow(cluster_load_assignment);
EXPECT_TRUE(test.initialized_);
EXPECT_EQ(0UL, test.stats_.counter("cluster.name.update_no_rebuild").value());
EXPECT_EQ(30UL,
test.stats_.gauge("cluster.name.max_host_weight", Stats::Gauge::ImportMode::Accumulate)
.value());
auto& hosts = test.cluster_->prioritySet().hostSetsPerPriority()[0]->hosts();
EXPECT_EQ(hosts.size(), 1);
EXPECT_EQ(hosts[0]->weight(), 30);

endpoint->mutable_load_balancing_weight()->set_value(31);
test.doOnConfigUpdateVerifyNoThrow(cluster_load_assignment);
EXPECT_EQ(expect_rebuild ? 0UL : 1UL,
test.stats_.counter("cluster.name.update_no_rebuild").value());
EXPECT_EQ(31UL,
test.stats_.gauge("cluster.name.max_host_weight", Stats::Gauge::ImportMode::Accumulate)
.value());
auto& new_hosts = test.cluster_->prioritySet().hostSetsPerPriority()[0]->hosts();
EXPECT_EQ(new_hosts.size(), 1);
EXPECT_EQ(new_hosts[0]->weight(), 31);
}

} // namespace

// Verify that host weight changes cause a full rebuild.
TEST_F(EdsTest, EndpointWeightChangeCausesRebuild) {
endpointWeightChangeCausesRebuildTest(*this, true);
}

// Verify that host weight changes do not cause a full rebuild when the feature flag is disabled.
TEST_F(EdsTest, EndpointWeightChangeCausesRebuildDisabled) {
TestScopedRuntime scoped_runtime;
Runtime::LoaderSingleton::getExisting()->mergeValues(
{{"envoy.reloadable_features.upstream_host_weight_change_causes_rebuild", "false"}});

endpointWeightChangeCausesRebuildTest(*this, false);
}

// Validate that onConfigUpdate() updates the endpoint metadata.
TEST_F(EdsTest, EndpointMetadata) {
envoy::config::endpoint::v3::ClusterLoadAssignment cluster_load_assignment;
Expand Down

0 comments on commit 658ee30

Please sign in to comment.