-
Notifications
You must be signed in to change notification settings - Fork 4.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
http: prefetch for upstreams #14143
http: prefetch for upstreams #14143
Changes from 5 commits
377a655
a3f873e
03d25da
0de2a56
00d0506
7f58cb6
800fb46
280b655
8bc15af
963803b
a130c58
47b4b7e
f4619e3
84882f2
568ac1c
5f3ef8a
228f441
7f391a9
575636e
ff6aba8
dbb54a7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -589,11 +589,10 @@ message Cluster { | |
google.protobuf.Duration max_interval = 2 [(validate.rules).duration = {gt {nanos: 1000000}}]; | ||
} | ||
|
||
// [#not-implemented-hide:] | ||
message PrefetchPolicy { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. And then one question for @mattklein123: assuming we do end up with this at minimum, lmk if you think it's worth having wrapper messages for per-upstream and per-cluster. |
||
// Indicates how many streams (rounded up) can be anticipated per-upstream for each | ||
// incoming stream. This is useful for high-QPS or latency-sensitive services. Prefetching | ||
// will only be done if the upstream is healthy. | ||
// will only be done if the upstream is healthy and the cluster has traffic. | ||
// | ||
// For example if this is 2, for an incoming HTTP/1.1 stream, 2 connections will be | ||
// established, one for the new incoming stream, and one for a presumed follow-up stream. For | ||
|
@@ -622,24 +621,24 @@ message Cluster { | |
// Indicates how many streams (rounded up) can be anticipated across a cluster for each | ||
// stream, useful for low QPS services. This is currently supported for a subset of | ||
// deterministic non-hash-based load-balancing algorithms (weighted round robin, random). | ||
// Unlike per_upstream_prefetch_ratio this prefetches across the upstream instances in a | ||
// Unlike *per_upstream_prefetch_ratio* this prefetches across the upstream instances in a | ||
// cluster, doing best effort predictions of what upstream would be picked next and | ||
// pre-establishing a connection. | ||
// | ||
// Prefetching will be limited to one prefetch per configured upstream in the cluster and will | ||
// only be done if there are healthy upstreams and the cluster has traffic. | ||
// | ||
// For example if prefetching is set to 2 for a round robin HTTP/2 cluster, on the first | ||
// incoming stream, 2 connections will be prefetched - one to the first upstream for this | ||
// cluster, one to the second on the assumption there will be a follow-up stream. | ||
// | ||
// Prefetching will be limited to one prefetch per configured upstream in the cluster. | ||
// | ||
// If this value is not set, or set explicitly to one, Envoy will fetch as many connections | ||
// as needed to serve streams in flight, so during warm up and in steady state if a connection | ||
// is closed (and per_upstream_prefetch_ratio is not set), there will be a latency hit for | ||
// connection establishment. | ||
// | ||
// If both this and prefetch_ratio are set, Envoy will make sure both predicted needs are met, | ||
// basically prefetching max(predictive-prefetch, per-upstream-prefetch), for each upstream. | ||
// TODO(alyssawilk) per LB docs and LB overview docs when unhiding. | ||
google.protobuf.DoubleValue predictive_prefetch_ratio = 2 | ||
[(validate.rules).double = {lte: 3.0 gte: 1.0}]; | ||
} | ||
|
@@ -998,7 +997,6 @@ message Cluster { | |
// Configuration to track optional cluster stats. | ||
TrackClusterStats track_cluster_stats = 49; | ||
|
||
// [#not-implemented-hide:] | ||
// Prefetch configuration for this cluster. | ||
PrefetchPolicy prefetch_policy = 50; | ||
|
||
|
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -855,25 +855,33 @@ ThreadLocalCluster* ClusterManagerImpl::get(absl::string_view cluster) { | |
|
||
void ClusterManagerImpl::maybePrefetch( | ||
ThreadLocalClusterManagerImpl::ClusterEntryPtr& cluster_entry, | ||
const ClusterConnectivityState& state, | ||
std::function<ConnectionPool::Instance*()> pick_prefetch_pool) { | ||
// TODO(alyssawilk) As currently implemented, this will always just prefetch | ||
// one connection ahead of actually needed connections. | ||
// | ||
// Instead we want to track the following metrics across the entire connection | ||
// pool and use the same algorithm we do for per-upstream prefetch: | ||
// ((pending_streams_ + num_active_streams_) * global_prefetch_ratio > | ||
// (connecting_stream_capacity_ + num_active_streams_)) | ||
// and allow multiple prefetches per pick. | ||
// Also cap prefetches such that | ||
// num_unused_prefetch < num hosts | ||
// since if we have more prefetches than hosts, we should consider kicking into | ||
// per-upstream prefetch. | ||
// | ||
// Once we do this, this should loop capped number of times while shouldPrefetch is true. | ||
if (cluster_entry->cluster_info_->peekaheadRatio() > 1.0) { | ||
auto peekahead_ratio = cluster_entry->cluster_info_->peekaheadRatio(); | ||
if (peekahead_ratio <= 1.0) { | ||
return; | ||
} | ||
|
||
// 3 here is arbitrary. Just as in ConnPoolImplBase::tryCreateNewConnections | ||
// we want to limit the work which can be done on any given prefetch attempt. | ||
for (int i = 0; i < 3; ++i) { | ||
// Just as in ConnPoolImplBase::shouldCreateNewConnection, see if adding this one new connection | ||
// would put the cluster over desired capacity. If so, stop prefetching. | ||
if ((state.pending_streams_ + 1 + state.active_streams_) * peekahead_ratio <= | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why the +1 in the expression above? I see a similar +1 in ConnPoolImplBase::shouldCreateNewConnection when doing global prefetch but not when doing per-upstream prefetch It seems that this is mimicking the global prefetch logic in shouldCreateNewConnection Worth a comment that references shouldCreateNewConnection? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added a comment - lmk if that doesn't clarify. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. FWIW I've read this a few times and I'm still struggling a bit on what we are comparing. When I see <= my eye wants to not return but we do return. Perhaps invert? Up to you. |
||
(state.connecting_stream_capacity_ + state.active_streams_)) { | ||
return; | ||
} | ||
ConnectionPool::Instance* prefetch_pool = pick_prefetch_pool(); | ||
if (prefetch_pool) { | ||
prefetch_pool->maybePrefetch(cluster_entry->cluster_info_->peekaheadRatio()); | ||
if (!prefetch_pool->maybePrefetch(peekahead_ratio)) { | ||
// Given that the next prefetch pick may be entirely different, we could | ||
// opt to try again even if the first prefetch fails. Err on the side of | ||
// caution and wait for the next attempt. | ||
return; | ||
} | ||
} else { | ||
// If unable to find a prefetch pool, exit early. | ||
return; | ||
} | ||
} | ||
} | ||
|
@@ -898,9 +906,10 @@ ClusterManagerImpl::httpConnPoolForCluster(const std::string& cluster, ResourceP | |
// performed here in anticipation of the new stream. | ||
// TODO(alyssawilk) refactor to have one function call and return a pair, so this invariant is | ||
// code-enforced. | ||
maybePrefetch(entry->second, [&entry, &priority, &protocol, &context]() { | ||
return entry->second->connPool(priority, protocol, context, true); | ||
}); | ||
maybePrefetch(entry->second, cluster_manager.cluster_manager_state_, | ||
[&entry, &priority, &protocol, &context]() { | ||
return entry->second->connPool(priority, protocol, context, true); | ||
}); | ||
|
||
return ret; | ||
} | ||
|
@@ -924,9 +933,10 @@ ClusterManagerImpl::tcpConnPoolForCluster(const std::string& cluster, ResourcePr | |
// TODO(alyssawilk) refactor to have one function call and return a pair, so this invariant is | ||
// code-enforced. | ||
// Now see if another host should be prefetched. | ||
maybePrefetch(entry->second, [&entry, &priority, &context]() { | ||
return entry->second->tcpConnPool(priority, context, true); | ||
}); | ||
maybePrefetch(entry->second, cluster_manager.cluster_manager_state_, | ||
[&entry, &priority, &context]() { | ||
return entry->second->tcpConnPool(priority, context, true); | ||
}); | ||
|
||
return ret; | ||
} | ||
|
@@ -1405,8 +1415,10 @@ ClusterManagerImpl::ThreadLocalClusterManagerImpl::ClusterEntry::connPool( | |
LoadBalancerContext* context, bool peek) { | ||
HostConstSharedPtr host = (peek ? lb_->peekAnotherHost(context) : lb_->chooseHost(context)); | ||
if (!host) { | ||
ENVOY_LOG(debug, "no healthy host for HTTP connection pool"); | ||
cluster_info_->stats().upstream_cx_none_healthy_.inc(); | ||
if (!peek) { | ||
ENVOY_LOG(debug, "no healthy host for HTTP connection pool"); | ||
cluster_info_->stats().upstream_cx_none_healthy_.inc(); | ||
} | ||
return nullptr; | ||
} | ||
|
||
|
@@ -1466,8 +1478,10 @@ ClusterManagerImpl::ThreadLocalClusterManagerImpl::ClusterEntry::tcpConnPool( | |
ResourcePriority priority, LoadBalancerContext* context, bool peek) { | ||
HostConstSharedPtr host = (peek ? lb_->peekAnotherHost(context) : lb_->chooseHost(context)); | ||
if (!host) { | ||
ENVOY_LOG(debug, "no healthy host for TCP connection pool"); | ||
cluster_info_->stats().upstream_cx_none_healthy_.inc(); | ||
if (!peek) { | ||
ENVOY_LOG(debug, "no healthy host for TCP connection pool"); | ||
cluster_info_->stats().upstream_cx_none_healthy_.inc(); | ||
} | ||
return nullptr; | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So I know for mobile we're going to want to have fixed prefetch numbers (prefetch 6/8 connections), so I was thinking to make it a oneof (ratio, fixed_prefetch)
But I think the fixed_prefetch will end up being the moral equivalent of min_connections_picked, at which point we should leave this as-is and allow extending it. SG @antoniovicente