http: prefetch for upstreams #14143

Merged
merged 21 commits on Jan 13, 2021
Changes from 2 commits
2 changes: 0 additions & 2 deletions api/envoy/config/cluster/v3/cluster.proto
@@ -589,7 +589,6 @@ message Cluster {
google.protobuf.Duration max_interval = 2 [(validate.rules).duration = {gt {nanos: 1000000}}];
}

// [#not-implemented-hide:]
message PrefetchPolicy {
Contributor Author

So I know for mobile we're going to want fixed prefetch numbers (prefetch 6/8 connections), so I was thinking of making it a oneof (ratio, fixed_prefetch).
But I think the fixed_prefetch will end up being the moral equivalent of min_connections_picked, at which point we should leave this as-is and allow extending it. Sound good, @antoniovicente?
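As an aside, the overlap can be sketched out (hypothetical helper below, not part of this PR or of Envoy's API): the fixed count simply becomes a floor under the ratio-derived target, i.e. a minimum number of connections to keep warm.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Hypothetical sketch only: combining a ratio-based target with a fixed prefetch
// count. The fixed value acts as the moral equivalent of min_connections_picked.
uint32_t desiredWarmConnections(uint32_t active_streams, uint32_t pending_streams,
                                float prefetch_ratio, uint32_t fixed_prefetch) {
  const auto ratio_target = static_cast<uint32_t>(
      std::ceil((active_streams + pending_streams) * prefetch_ratio));
  return std::max(ratio_target, fixed_prefetch);
}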

Contributor Author

And then one question for @mattklein123: I'm assuming we end up with, at minimum:
fixed per-cluster prefetch
ratio per-cluster prefetch
ratio per-upstream prefetch

Let me know if you think it's worth having wrapper messages for per-upstream and per-cluster.

// Indicates how many streams (rounded up) can be anticipated per-upstream for each
// incoming stream. This is useful for high-QPS or latency-sensitive services. Prefetching
@@ -998,7 +997,6 @@ message Cluster {
// Configuration to track optional cluster stats.
TrackClusterStats track_cluster_stats = 49;

// [#not-implemented-hide:]
// Prefetch configuration for this cluster.
PrefetchPolicy prefetch_policy = 50;

2 changes: 0 additions & 2 deletions api/envoy/config/cluster/v4alpha/cluster.proto

Some generated files are not rendered by default.

2 changes: 0 additions & 2 deletions generated_api_shadow/envoy/config/cluster/v3/cluster.proto

Some generated files are not rendered by default.

Some generated files are not rendered by default.

64 changes: 38 additions & 26 deletions source/common/upstream/cluster_manager_impl.cc
@@ -855,25 +855,31 @@ ThreadLocalCluster* ClusterManagerImpl::get(absl::string_view cluster) {

void ClusterManagerImpl::maybePrefetch(
ThreadLocalClusterManagerImpl::ClusterEntryPtr& cluster_entry,
const ClusterConnectivityState& state,
std::function<ConnectionPool::Instance*()> pick_prefetch_pool) {
// TODO(alyssawilk) As currently implemented, this will always just prefetch
// one connection ahead of actually needed connections.
//
// Instead we want to track the following metrics across the entire connection
// pool and use the same algorithm we do for per-upstream prefetch:
// ((pending_streams_ + num_active_streams_) * global_prefetch_ratio >
// (connecting_stream_capacity_ + num_active_streams_)))
// and allow multiple prefetches per pick.
// Also cap prefetches such that
// num_unused_prefetch < num hosts
// since if we have more prefetches than hosts, we should consider kicking into
// per-upstream prefetch.
//
// Once we do this, this should loop capped number of times while shouldPrefetch is true.
if (cluster_entry->cluster_info_->peekaheadRatio() > 1.0) {
auto peekahead_ratio = cluster_entry->cluster_info_->peekaheadRatio();
if (peekahead_ratio <= 1.0) {
return;
}

// 3 here is arbitrary. Just as in ConnPoolImplBase::tryCreateNewConnections
// we want to limit the work which can be done on any given prefetch attempt.
for (int i = 0; i < 3; ++i) {
if ((state.pending_streams_ + 1 + state.active_streams_) * peekahead_ratio <=
Contributor

Why the +1 in the expression above?

I see a similar +1 in ConnPoolImplBase::shouldCreateNewConnection when doing global prefetch, but not when doing per-upstream prefetch.

It seems that this is mimicking the global prefetch logic in shouldCreateNewConnection.

Worth a comment that references shouldCreateNewConnection?
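For reference, a simplified sketch of the two checks being discussed (assumed shapes, based on the expression in this diff and the description of ConnPoolImplBase::shouldCreateNewConnection above; not the actual Envoy implementation):

#include <cstdint>

struct ConnectivityCounts {
  uint32_t pending_streams;
  uint32_t active_streams;
  uint32_t connecting_stream_capacity;
};

// Cluster-wide ("global") prefetch, as in maybePrefetch in this diff. The +1
// presumably accounts for the stream whose pick triggered the check, which is not
// yet reflected in pending_streams.
bool wantsGlobalPrefetch(const ConnectivityCounts& c, float ratio) {
  return (c.pending_streams + 1 + c.active_streams) * ratio >
         (c.connecting_stream_capacity + c.active_streams);
}

// Per-upstream prefetch: per the comment above, the analogous check in the
// connection pool omits the +1.
bool wantsPerUpstreamPrefetch(const ConnectivityCounts& c, float ratio) {
  return (c.pending_streams + c.active_streams) * ratio >
         (c.connecting_stream_capacity + c.active_streams);
}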

Contributor Author

Added a comment - let me know if that doesn't clarify.

Member

FWIW I've read this a few times and I'm still struggling a bit with what we are comparing. When I see <=, my eye expects us not to return, but we do return. Perhaps invert? Up to you.
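One way to read the suggested inversion (a hypothetical refactor sketch, not what the PR does): pull the comparison into a positively-named helper so the early return reads as "no more prefetching is warranted".

#include <cstdint>

// Hypothetical helper: same comparison as the loop above, inverted at the call site.
bool wantsMorePrefetchedConnections(uint32_t pending_streams, uint32_t active_streams,
                                    uint32_t connecting_stream_capacity,
                                    float peekahead_ratio) {
  return (pending_streams + 1 + active_streams) * peekahead_ratio >
         (connecting_stream_capacity + active_streams);
}

// The loop body would then read:
//   if (!wantsMorePrefetchedConnections(state.pending_streams_, state.active_streams_,
//                                       state.connecting_stream_capacity_,
//                                       peekahead_ratio)) {
//     return;
//   }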

(state.connecting_stream_capacity_ + state.active_streams_)) {
return;
}
ConnectionPool::Instance* prefetch_pool = pick_prefetch_pool();
if (prefetch_pool) {
prefetch_pool->maybePrefetch(cluster_entry->cluster_info_->peekaheadRatio());
if (!prefetch_pool->maybePrefetch(cluster_entry->cluster_info_->peekaheadRatio())) {
Contributor

nit: if (!prefetch_pool->maybePrefetch(peekahead_ratio)) {

// Given that the next prefetch pick may be entirely different, we could
// opt to try again even if the first prefetch fails. Err on the side of
// caution and wait for the next attempt.
return;
}
} else {
// If unable to find a prefetch pool, exit early.
return;
}
}
}
@@ -898,9 +904,10 @@ ClusterManagerImpl::httpConnPoolForCluster(const std::string& cluster, ResourceP
// performed here in anticipation of the new stream.
// TODO(alyssawilk) refactor to have one function call and return a pair, so this invariant is
// code-enforced.
maybePrefetch(entry->second, [&entry, &priority, &protocol, &context]() {
return entry->second->connPool(priority, protocol, context, true);
});
maybePrefetch(entry->second, cluster_manager.cluster_manager_state_,
[&entry, &priority, &protocol, &context]() {
return entry->second->connPool(priority, protocol, context, true);
});

return ret;
}
@@ -924,9 +931,10 @@ ClusterManagerImpl::tcpConnPoolForCluster(const std::string& cluster, ResourcePr
// TODO(alyssawilk) refactor to have one function call and return a pair, so this invariant is
// code-enforced.
// Now see if another host should be prefetched.
maybePrefetch(entry->second, [&entry, &priority, &context]() {
return entry->second->tcpConnPool(priority, context, true);
});
maybePrefetch(entry->second, cluster_manager.cluster_manager_state_,
[&entry, &priority, &context]() {
return entry->second->tcpConnPool(priority, context, true);
});

return ret;
}
@@ -1405,8 +1413,10 @@ ClusterManagerImpl::ThreadLocalClusterManagerImpl::ClusterEntry::connPool(
LoadBalancerContext* context, bool peek) {
HostConstSharedPtr host = (peek ? lb_->peekAnotherHost(context) : lb_->chooseHost(context));
if (!host) {
ENVOY_LOG(debug, "no healthy host for HTTP connection pool");
cluster_info_->stats().upstream_cx_none_healthy_.inc();
if (!peek) {
ENVOY_LOG(debug, "no healthy host for HTTP connection pool");
cluster_info_->stats().upstream_cx_none_healthy_.inc();
}
return nullptr;
}

@@ -1466,8 +1476,10 @@ ClusterManagerImpl::ThreadLocalClusterManagerImpl::ClusterEntry::tcpConnPool(
ResourcePriority priority, LoadBalancerContext* context, bool peek) {
HostConstSharedPtr host = (peek ? lb_->peekAnotherHost(context) : lb_->chooseHost(context));
if (!host) {
ENVOY_LOG(debug, "no healthy host for TCP connection pool");
cluster_info_->stats().upstream_cx_none_healthy_.inc();
if (!peek) {
ENVOY_LOG(debug, "no healthy host for TCP connection pool");
cluster_info_->stats().upstream_cx_none_healthy_.inc();
}
return nullptr;
}

1 change: 1 addition & 0 deletions source/common/upstream/cluster_manager_impl.h
@@ -551,6 +551,7 @@ class ClusterManagerImpl : public ClusterManager, Logger::Loggable<Logger::Id::u
void updateClusterCounts();
void clusterWarmingToActive(const std::string& cluster_name);
static void maybePrefetch(ThreadLocalClusterManagerImpl::ClusterEntryPtr& cluster_entry,
const ClusterConnectivityState& cluster_manager_state,
std::function<ConnectionPool::Instance*()> prefetch_pool);

ClusterManagerFactory& factory_;
27 changes: 20 additions & 7 deletions source/common/upstream/load_balancer_impl.cc
@@ -108,16 +108,16 @@ LoadBalancerBase::LoadBalancerBase(
priority_set_(priority_set) {
for (auto& host_set : priority_set_.hostSetsPerPriority()) {
recalculatePerPriorityState(host_set->priority(), priority_set_, per_priority_load_,
per_priority_health_, per_priority_degraded_);
per_priority_health_, per_priority_degraded_, total_healthy_hosts_);
}
// Recalculate panic mode for all levels.
recalculatePerPriorityPanic();

priority_set_.addPriorityUpdateCb(
[this](uint32_t priority, const HostVector&, const HostVector&) -> void {
recalculatePerPriorityState(priority, priority_set_, per_priority_load_,
per_priority_health_, per_priority_degraded_);
});
priority_set_.addPriorityUpdateCb([this](uint32_t priority, const HostVector&,
const HostVector&) -> void {
recalculatePerPriorityState(priority, priority_set_, per_priority_load_, per_priority_health_,
per_priority_degraded_, total_healthy_hosts_);
});

priority_set_.addPriorityUpdateCb(
[this](uint32_t priority, const HostVector&, const HostVector&) -> void {
@@ -146,11 +146,13 @@ void LoadBalancerBase::recalculatePerPriorityState(uint32_t priority,
const PrioritySet& priority_set,
HealthyAndDegradedLoad& per_priority_load,
HealthyAvailability& per_priority_health,
DegradedAvailability& per_priority_degraded) {
DegradedAvailability& per_priority_degraded,
uint32_t& total_healthy_hosts) {
per_priority_load.healthy_priority_load_.get().resize(priority_set.hostSetsPerPriority().size());
per_priority_load.degraded_priority_load_.get().resize(priority_set.hostSetsPerPriority().size());
per_priority_health.get().resize(priority_set.hostSetsPerPriority().size());
per_priority_degraded.get().resize(priority_set.hostSetsPerPriority().size());
total_healthy_hosts = 0;

// Determine the health of the newly modified priority level.
// Health ranges from 0-100, and is the ratio of healthy/degraded hosts to total hosts, modified
@@ -232,6 +234,10 @@ void LoadBalancerBase::recalculatePerPriorityState(uint32_t priority,
per_priority_load.healthy_priority_load_.get().end(), 0) +
std::accumulate(per_priority_load.degraded_priority_load_.get().begin(),
per_priority_load.degraded_priority_load_.get().end(), 0));

for (auto& host_set : priority_set.hostSetsPerPriority()) {
total_healthy_hosts += host_set->healthyHosts().size();
}
}

// Method iterates through priority levels and turns on/off panic mode.
@@ -774,6 +780,10 @@ void EdfLoadBalancerBase::refresh(uint32_t priority) {
}

HostConstSharedPtr EdfLoadBalancerBase::peekAnotherHost(LoadBalancerContext* context) {
if (stashed_random_.size() + 1 > total_healthy_hosts_) {
return nullptr;
}
Contributor

Is 1 the right max ratio of prefetched connections to healthy hosts? I imagine that when the number of endpoints is small it would be beneficial to set this ratio > 1.0, especially if host weights are not all equal.

Contributor Author

This one isn't a ratio thing - we currently cap the number of prefetches at the number of healthy hosts.
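To make the cap concrete, here is a standalone sketch of the check added in this diff (hypothetical names, mirroring stashed_random_.size() + 1 > total_healthy_hosts_):

#include <cstdint>
#include <iostream>

// Sketch of the cap: stop peeking once the stashed peeks would outnumber the
// healthy hosts.
bool canPeekAnotherHost(uint32_t stashed_peeks, uint32_t total_healthy_hosts) {
  return stashed_peeks + 1 <= total_healthy_hosts;
}

int main() {
  // With 4 healthy hosts, a 4th outstanding peek is allowed but a 5th is not.
  std::cout << canPeekAnotherHost(3, 4) << "\n"; // 1: one more host may be peeked
  std::cout << canPeekAnotherHost(4, 4) << "\n"; // 0: peekAnotherHost() returns nullptr
}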

Contributor

Are there plans to relax that restriction? It seems to get in the way of getting to a fixed number of connections in cases where the number of healthy upstreams is less than the desired number of connections.

Contributor Author

As needed. Right now I was aiming at the server side, where for low QPS you'd generally prefetch a few connections and for high QPS you'd want per-upstream prefetch; but as we move towards mobile, where we may only have a single upstream per endpoint, we'd want more than one prefetch.
I don't want it unlimited because that's wasteful, and the cap acts as a useful upper bound.


const absl::optional<HostsSource> hosts_source = hostSourceToUse(context, random(true));
if (!hosts_source) {
return nullptr;
@@ -859,6 +869,9 @@ HostConstSharedPtr LeastRequestLoadBalancer::unweightedHostPick(const HostVector
}

HostConstSharedPtr RandomLoadBalancer::peekAnotherHost(LoadBalancerContext* context) {
if (stashed_random_.size() + 1 > total_healthy_hosts_) {
return nullptr;
}
return peekOrChoose(context, true);
}

5 changes: 4 additions & 1 deletion source/common/upstream/load_balancer_impl.h
@@ -121,7 +121,8 @@ class LoadBalancerBase : public LoadBalancer {
void static recalculatePerPriorityState(uint32_t priority, const PrioritySet& priority_set,
HealthyAndDegradedLoad& priority_load,
HealthyAvailability& per_priority_health,
DegradedAvailability& per_priority_degraded);
DegradedAvailability& per_priority_degraded,
uint32_t& total_healthy_hosts);
void recalculatePerPriorityPanic();

protected:
@@ -154,6 +155,8 @@
DegradedAvailability per_priority_degraded_;
// Levels which are in panic
std::vector<bool> per_priority_panic_;
// The total count of healthy hosts across all priority levels.
uint32_t total_healthy_hosts_;
};

class LoadBalancerContextBase : public LoadBalancerContext {
@@ -29,7 +29,8 @@ class PreviousPrioritiesRetryPriority : public Upstream::RetryPriority {
void recalculatePerPriorityState(uint32_t priority, const Upstream::PrioritySet& priority_set) {
// Recalculate health and priority the same way the load balancer does it.
Upstream::LoadBalancerBase::recalculatePerPriorityState(
priority, priority_set, per_priority_load_, per_priority_health_, per_priority_degraded_);
priority, priority_set, per_priority_load_, per_priority_health_, per_priority_degraded_,
total_healthy_hosts_);
}

uint32_t adjustedAvailability(std::vector<uint32_t>& per_priority_health,
@@ -47,6 +48,7 @@ class PreviousPrioritiesRetryPriority : public Upstream::RetryPriority {
Upstream::HealthyAndDegradedLoad per_priority_load_;
Upstream::HealthyAvailability per_priority_health_;
Upstream::DegradedAvailability per_priority_degraded_;
uint32_t total_healthy_hosts_;
};

} // namespace Priority