# management-server: update metrics and doc #469

Merged · 4 commits · Jan 18, 2022
Changes from 1 commit

**xds/README.md** · 146 additions, 108 deletions
@@ -7,115 +7,153 @@

...upstream endpoints and filter configuration from some external source, and then uses
that information to generate cluster and filter updates that are pushed to any connected
Quilkin proxy.

The project has a binary, **cmd/controller.go**, containing a server implementation that runs in [Kubernetes].

- `/metrics`: Exposes Prometheus metrics.
1. Cluster information is retrieved from [Agones]: the server watches for `Allocated`
[Agones GameServers] and exposes their IP address and port as upstream endpoints to
any connected Quilkin proxies.

> Since an Agones GameServer can have multiple ports exposed, if multiple ports are in
> use, the server looks for the port named `default` and picks that as the endpoint's
> port (otherwise it picks the first port in the port list); see the GameServer sketch after this list.

1. The filter chain is configurable on a per-proxy basis. By default an empty filter chain is
used, and from there the filter chain can be configured using annotations on the proxy's pod.

> **Contributor:** Since we're in here, shall we also document the usage of the
> `quilkin.dev/tokens` annotation?
>
> `tokenStr, found := gs.Annotations["quilkin.dev/tokens"]`
>
> **Collaborator (author):** Added!

The following annotations are currently supported (an example pod manifest follows this list):
- **quilkin.dev/debug-packets**: If set to the value `true`, then a `Debug` filter will be
added to the filter chain, causing all packets to be logged.
- **quilkin.dev/routing-token-suffix-size**: Sets the size (in number of bytes) of routing tokens appended to
packets. Extracted tokens are matched against available endpoints in order to figure out
where to send the associated packet.
Note that the token is stripped off the packet. This annotation cannot be provided together with
`quilkin.dev/routing-token-prefix-size`.
- **quilkin.dev/routing-token-prefix-size**: Works exactly the same as `quilkin.dev/routing-token-suffix-size`,
with the difference that the token is a prefix on the packet rather than a suffix.
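
For illustration, here is a minimal sketch of an Agones GameServer that exposes multiple ports; the names and port numbers are hypothetical. The port named `default` is the one the server would pick as the endpoint's port:

```yaml
apiVersion: agones.dev/v1
kind: GameServer
metadata:
  name: my-game-server          # hypothetical name
spec:
  ports:
    # With several ports exposed, the server picks the one named `default`.
    - name: default
      containerPort: 7654
    - name: metrics
      containerPort: 8080
```

Similarly, a sketch of proxy pod metadata combining the annotations above with the `quilkin.dev/role` annotation described below; all names and values are illustrative assumptions:

```yaml
apiVersion: v1
kind: Pod
metadata:
  name: quilkin-proxy            # hypothetical name
  namespace: quilkin
  annotations:
    quilkin.dev/role: proxy                       # required for the server to detect the pod as a proxy
    quilkin.dev/debug-packets: "true"             # adds a Debug filter that logs all packets
    quilkin.dev/routing-token-suffix-size: "3"    # route on a 3-byte token appended to each packet
```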

As an example, the following runs the server against a cluster (using the default kubeconfig) where Quilkin pods run in the `quilkin` namespace and game-server pods run in the `gameservers` namespace:

```sh
go run controller.go --port=18000 --proxy-namespace=quilkin --game-server-namespace=gameservers
```

> A proxy's pod must have a `quilkin.dev/role` annotation set to the value `proxy` in order
> for the management server to detect the pod as a proxy and push updates to it.

> Note that currently, the server can only discover resources within a single cluster.

##### Admin server

In addition to the gRPC server, an HTTP server (configurable via `--admin-port`) is also started to serve administrative functionality.
The following endpoints are provided:
- `/ready`: Readiness probe that returns a 5xx if communication with the Kubernetes API is problematic.
- `/live`: Liveness probe that always returns a 200 response.
- `/metrics`: Exposes Prometheus metrics.
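
As a usage sketch, Prometheus could scrape the admin server with a config like the one below; the target host and port `18090` are assumptions for illustration (use whatever `--admin-port` is set to):

```yaml
scrape_configs:
  - job_name: quilkin-management-server
    metrics_path: /metrics
    static_configs:
      # Hypothetical address; substitute the actual host and --admin-port value.
      - targets: ['quilkin-management-server:18090']
```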

##### Metrics

The following metrics are exposed by the management server.

- `quilkin_management_server_connected_proxies` (Gauge)

The number of proxies currently connected to the server.
- `quilkin_management_server_discovery_requests_total{resource_type}` (Counter)

The total number of xDS Discovery Requests received across all proxies.
- `resource_type` = `type.googleapis.com/envoy.config.cluster.v3.Cluster` | `type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment` | `type.googleapis.com/envoy.config.listener.v3.Listener`
Type URL of the requested resource.
- `quilkin_management_server_discovery_responses_total{resource_type}` (Counter)

The total number of xDS Discovery Responses sent back across all proxies in response to Discovery Requests.
Each Discovery Response sent corresponds to a configuration update for some proxy.
- `resource_type` = `type.googleapis.com/envoy.config.cluster.v3.Cluster` | `type.googleapis.com/envoy.config.endpoint.v3.ClusterLoadAssignment` | `type.googleapis.com/envoy.config.listener.v3.Listener`
Type URL of the requested resource.
- `quilkin_management_server_endpoints_total` (Gauge)

The number of active endpoints discovered by the server. The number of active endpoints
correlates with the size of the cluster configuration update sent to proxies.
- `quilkin_management_server_snapshot_generation_errors_total` (Counter)

The total number of errors encountered while generating a configuration snapshot update for a proxy.
- `quilkin_management_server_snapshots_generated_total` (Counter)

The total number of configuration snapshots generated across all proxies. A snapshot corresponds
to a point-in-time view of a proxy's configuration. However, it does not necessarily correspond
to a proxy update: a proxy only gets the latest snapshot, so it might miss intermediate
snapshots if it lags behind.
- `quilkin_management_server_snapshot_cache_size` (Gauge)

The current number of snapshots in the in-memory snapshot cache. This corresponds one-to-one
with proxies that connect to the server. However, the number may be slightly higher than the
number of connected proxies, since snapshots for disconnected proxies are only periodically
cleared from the cache.

##### File Server

> The file server binary is primarily an example and mostly suitable for demo purposes.
> As a result, some configuration options and features might be missing.

**cmd/file/file.go** is an implementation that watches a configuration file on disk and
sends updates to proxies whenever that file changes.
It can be started with the following command:
```sh
go run cmd/file/file.go --config=config.yaml --port=18000
```
After running this command, any proxy that connects to port 18000 will receive updates as
configured in the `config.yaml` file.
The configuration file schema is:
```yaml
# clusters contains a list of clusters.
# Each entry represents a cluster configuration.
clusters: [{
# Name of the cluster.
name: string

# List of endpoints belonging to the cluster.
# Each entry represents an upstream endpoint.
endpoints: [{
# The endpoint's IP address.
ip: string
# The endpoint's port.
port: int
# Opaque metadata that will be the endpoint's metadata.
metadata: {}
}]
}]

# filterchain represents the filter chain configuration.
# It contains a list of filter configurations.
filterchain: [{
# Name of the filter
name: string

# typed_config contains the filter's configuration.
typed_config: {
# @type must be equivalent to name - the name of the filter.
# It is an extra, required field.
'@type': string
# ...
# The rest of the body contains filter specific configuration or
# is empty if the filter has no configuration.
}
}]
```
Example:
```yaml
clusters:
- name: cluster-a
endpoints:
- ip: 123.0.0.1
port": 29
metadata:
'quilkin.dev':
tokens:
- "MXg3aWp5Ng=="
filterchain:
- name: quilkin.extensions.filters.debug.v1alpha1.Debug
typed_config:
'@type': quilkin.extensions.filters.debug.v1alpha1.Debug
id: hello
```


[XDS]: https://www.envoyproxy.io/docs/envoy/latest/api-docs/xds_protocol
**xds/pkg/cluster/agones/agones.go** · 4 additions, 0 deletions

```go
// @@ -81,6 +81,7 @@ func runClusterWatch(
	defer ticker.Stop()

	prevEndpoints := map[string]cluster.Endpoint{}
	// Reset the endpoints gauge before the watch loop starts.
	cluster.EndpointsTotal.Set(0)
	for {
		select {
		case <-ticker.C:
// @@ -94,6 +95,9 @@
			for _, ep := range currEndpoints {
				endpoints = append(endpoints, ep)
			}

			// Record the current number of active endpoints.
			cluster.EndpointsTotal.Set(float64(len(endpoints)))

			clusterCh <- []cluster.Cluster{{
				Name:      "default-quilkin-cluster",
				Endpoints: endpoints,
```
**xds/pkg/cluster/cluster.go** · 14 additions, 0 deletions

```go
// @@ -18,6 +18,20 @@ package cluster

import (
	"context"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"

	"quilkin.dev/xds-management-server/pkg/metrics"
)

var (
	// EndpointsTotal tracks the current number of active endpoints.
	EndpointsTotal = promauto.NewGauge(prometheus.GaugeOpts{
		Namespace: metrics.Namespace,
		Subsystem: metrics.Subsystem,
		Name:      "endpoints_total",
		Help:      "Current number of active endpoints",
	})
)

// Endpoint represents an upstream endpoint (e.g. a game-server)
```
**xds/pkg/server/callbacks.go** · 20 additions, 6 deletions

```go
// @@ -35,19 +35,19 @@
var connectedProxiesGauge = promauto.NewGauge(prometheus.GaugeOpts{
	Help: "Number of proxies currently connected to the server",
})

// discoveryRequestsTotal was changed from a plain Counter to a CounterVec,
// labelled by the requested resource's type URL.
var discoveryRequestsTotal = promauto.NewCounterVec(prometheus.CounterOpts{
	Namespace: metrics.Namespace,
	Subsystem: metrics.Subsystem,
	Name:      "discovery_requests_total",
	Help:      "Number of discovery requests received by the server",
}, []string{"resource_type"})

// discoveryResponsesTotal was likewise changed to a CounterVec with a
// resource_type label.
var discoveryResponsesTotal = promauto.NewCounterVec(prometheus.CounterOpts{
	Namespace: metrics.Namespace,
	Subsystem: metrics.Subsystem,
	Name:      "discovery_responses_total",
	Help:      "Number of discovery responses sent by the server",
}, []string{"resource_type"})

// callbacks implements callbacks for the go-control-plane xds server.
type callbacks struct {
// @@ -83,7 +83,14 @@ func (c *callbacks) OnStreamRequest(streamID int64, request *discoveryservice.Di...
		"request_version_info": request.VersionInfo,
		"request_nonce":        request.ResponseNonce,
	}).Debugf("OnStreamRequest")

	// Increment the per-resource-type request counter.
	counter, err := discoveryRequestsTotal.GetMetricWithLabelValues(request.TypeUrl)
	if err != nil {
		c.log.WithField("resource_type", request.TypeUrl).
			Debug("failed to curry discovery_requests_total metric labels")
	} else {
		counter.Inc()
	}

	if c.nodeIDCh != nil {
		c.nodeIDCh <- request.Node.Id
// @@ -103,7 +110,14 @@ func (c *callbacks) OnStreamResponse(
		"response_version_info": response.VersionInfo,
		"response_nonce":        response.Nonce,
	}).Debugf("OnStreamResponse")

	// Increment the per-resource-type response counter.
	counter, err := discoveryResponsesTotal.GetMetricWithLabelValues(request.TypeUrl)
	if err != nil {
		c.log.WithField("resource_type", request.TypeUrl).
			Debug("failed to curry discovery_responses_total metric labels")
	} else {
		counter.Inc()
	}
}

func (c *callbacks) OnFetchRequest(
```
**xds/pkg/snapshot/snapshot.go** · 11 additions, 0 deletions

```go
// @@ -41,6 +41,12 @@
		Name: "snapshot_generation_errors_total",
		Help: "Total number of errors encountered while generating snapshots",
	})
	// snapshotsCacheSize tracks how many snapshots are currently held in the cache.
	snapshotsCacheSize = promauto.NewGauge(prometheus.GaugeOpts{
		Namespace: metrics.Namespace,
		Subsystem: metrics.Subsystem,
		Name:      "snapshot_cache_size",
		Help:      "Current number of snapshots in the snapshot cache",
	})
	snapshotGeneratedTotal = promauto.NewCounter(prometheus.CounterOpts{
		Namespace: metrics.Namespace,
		Subsystem: metrics.Subsystem,
// @@ -205,6 +211,11 @@ type snapshotCleanupView interface {

// cleanup deletes snapshots from the cache for any node that is no longer
// connected to the server.
func (u *Updater) cleanup() {
	defer func() {
		// When we're done, update the gauge to reflect the latest cache size.
		snapshotsCacheSize.Set(float64(len(u.snapshotCleanupView.GetStatusKeys())))
	}()

	// If a node has had no open watches in a while (specified by the
	// configurable snapshot grace period) then let's delete its snapshot.
	nodeIDs := u.snapshotCleanupView.GetStatusKeys()
```