Add etcd metrics, Prometheus scrapes, and Grafana dashboard

* Use etcd v3.3 --listen-metrics-urls to expose only metrics data via http://0.0.0.0:2381 on controllers
* Add Prometheus discovery for etcd peers on controller nodes
* Temporarily drop two noisy Prometheus alerts
poseidon · Apr 4, 2018 · d770393 · d770393
1 parent 642f7ec
commit d770393
Show file tree

Hide file tree

Showing 10 changed files with 58 additions and 22 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -4,6 +4,16 @@ Notable changes between versions.
 
 ## Latest
 
+* Enable etcd v3.3 metrics endpoint ([#175](https://github.com/poseidon/typhoon/pull/175))
+
+#### Addons
+
+* Add Prometheus discovery for etcd peers on controller nodes ([#175](https://github.com/poseidon/typhoon/pull/175))
+  * Scrape etcd v3.3 `--listen-metrics-urls` for metrics
+  * Enable etcd alerts and populate the etcd Grafana dashboard
+
+## v1.10.0
+
 * Kubernetes [v1.10.0](https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG-1.10.md#v1100)
 * Remove unused, unmaintained `pxe-worker` internal module
 

diff --git a/addons/prometheus/config.yaml b/addons/prometheus/config.yaml
@@ -112,6 +112,22 @@ data:
         target_label: __metrics_path__
         replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
     
+    # Scrape etcd metrics from controllers
+    - job_name: 'etcd'
+      kubernetes_sd_configs:
+      - role: node
+      scheme: http
+      relabel_configs:
+        - source_labels: [__meta_kubernetes_node_label_node_role_kubernetes_io_controller]
+          action: keep
+          regex: 'true'
+        - action: labelmap
+          regex: __meta_kubernetes_node_label_(.+)
+        - source_labels: [__meta_kubernetes_node_name]
+          action: replace
+          target_label: __address__
+          replacement: '${1}:2381'
+    
     # Scrape config for service endpoints.
     #
     # The relabeling allows the actual service scrape endpoint to be configured

diff --git a/addons/prometheus/deployment.yaml b/addons/prometheus/deployment.yaml
@@ -15,6 +15,12 @@ spec:
         name: prometheus
         phase: prod
     spec:
+      nodeSelector:
+        node-role.kubernetes.io/master: ""
+      tolerations:
+      - key: node-role.kubernetes.io/master
+        operator: Exists
+        effect: NoSchedule
       serviceAccountName: prometheus
       containers:
       - name: prometheus

diff --git a/addons/prometheus/rules.yaml b/addons/prometheus/rules.yaml
@@ -63,26 +63,6 @@ data:
           description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader
             changes within the last hour
           summary: a high number of leader changes within the etcd cluster are happening
-      - alert: HighNumberOfFailedGRPCRequests
-        expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
-          / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.01
-        for: 10m
-        labels:
-          severity: warning
-        annotations:
-          description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
-            on etcd instance {{ $labels.instance }}'
-          summary: a high number of gRPC requests are failing
-      - alert: HighNumberOfFailedGRPCRequests
-        expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
-          / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.05
-        for: 5m
-        labels:
-          severity: critical
-        annotations:
-          description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
-            on etcd instance {{ $labels.instance }}'
-          summary: a high number of gRPC requests are failing
       - alert: GRPCRequestsSlow
         expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le))
           > 0.15

diff --git a/aws/container-linux/kubernetes/cl/controller.yaml.tmpl b/aws/container-linux/kubernetes/cl/controller.yaml.tmpl
@@ -13,6 +13,7 @@ systemd:
             Environment="ETCD_INITIAL_ADVERTISE_PEER_URLS=https://${etcd_domain}:2380"
             Environment="ETCD_LISTEN_CLIENT_URLS=https://0.0.0.0:2379"
             Environment="ETCD_LISTEN_PEER_URLS=https://0.0.0.0:2380"
+            Environment="ETCD_LISTEN_METRICS_URLS=http://0.0.0.0:2381"
             Environment="ETCD_INITIAL_CLUSTER=${etcd_initial_cluster}"
             Environment="ETCD_STRICT_RECONFIG_CHECK=true"
             Environment="ETCD_SSL_DIR=/etc/ssl/etcd"

diff --git a/aws/container-linux/kubernetes/security.tf b/aws/container-linux/kubernetes/security.tf
@@ -81,6 +81,16 @@ resource "aws_security_group_rule" "controller-node-exporter" {
   source_security_group_id = "${aws_security_group.worker.id}"
 }
 
+resource "aws_security_group_rule" "controller-node-exporter-self" {
+  security_group_id = "${aws_security_group.controller.id}"
+
+  type      = "ingress"
+  protocol  = "tcp"
+  from_port = 9100
+  to_port   = 9100
+  self      = true
+}
+
 resource "aws_security_group_rule" "controller-kubelet-self" {
   security_group_id = "${aws_security_group.controller.id}"
 
@@ -256,6 +266,16 @@ resource "aws_security_group_rule" "worker-flannel-self" {
 resource "aws_security_group_rule" "worker-node-exporter" {
   security_group_id = "${aws_security_group.worker.id}"
 
+  type                     = "ingress"
+  protocol                 = "tcp"
+  from_port                = 9100
+  to_port                  = 9100
+  source_security_group_id = "${aws_security_group.controller.id}"
+}
+
+resource "aws_security_group_rule" "worker-node-exporter-self" {
+  security_group_id = "${aws_security_group.worker.id}"
+
   type      = "ingress"
   protocol  = "tcp"
   from_port = 9100

diff --git a/bare-metal/container-linux/kubernetes/cl/controller.yaml.tmpl b/bare-metal/container-linux/kubernetes/cl/controller.yaml.tmpl
@@ -13,6 +13,7 @@ systemd:
             Environment="ETCD_INITIAL_ADVERTISE_PEER_URLS=https://${domain_name}:2380"
             Environment="ETCD_LISTEN_CLIENT_URLS=https://0.0.0.0:2379"
             Environment="ETCD_LISTEN_PEER_URLS=https://0.0.0.0:2380"
+            Environment="ETCD_LISTEN_METRICS_URLS=http://0.0.0.0:2381"
             Environment="ETCD_INITIAL_CLUSTER=${etcd_initial_cluster}"
             Environment="ETCD_STRICT_RECONFIG_CHECK=true"
             Environment="ETCD_SSL_DIR=/etc/ssl/etcd"

diff --git a/digital-ocean/container-linux/kubernetes/cl/controller.yaml.tmpl b/digital-ocean/container-linux/kubernetes/cl/controller.yaml.tmpl
@@ -13,6 +13,7 @@ systemd:
             Environment="ETCD_INITIAL_ADVERTISE_PEER_URLS=https://${etcd_domain}:2380"
             Environment="ETCD_LISTEN_CLIENT_URLS=https://0.0.0.0:2379"
             Environment="ETCD_LISTEN_PEER_URLS=https://0.0.0.0:2380"
+            Environment="ETCD_LISTEN_METRICS_URLS=http://0.0.0.0:2381"
             Environment="ETCD_INITIAL_CLUSTER=${etcd_initial_cluster}"
             Environment="ETCD_STRICT_RECONFIG_CHECK=true"
             Environment="ETCD_SSL_DIR=/etc/ssl/etcd"

diff --git a/google-cloud/container-linux/kubernetes/controllers/cl/controller.yaml.tmpl b/google-cloud/container-linux/kubernetes/controllers/cl/controller.yaml.tmpl
@@ -13,6 +13,7 @@ systemd:
             Environment="ETCD_INITIAL_ADVERTISE_PEER_URLS=https://${etcd_domain}:2380"
             Environment="ETCD_LISTEN_CLIENT_URLS=https://0.0.0.0:2379"
             Environment="ETCD_LISTEN_PEER_URLS=https://0.0.0.0:2380"
+            Environment="ETCD_LISTEN_METRICS_URLS=http://0.0.0.0:2381"
             Environment="ETCD_INITIAL_CLUSTER=${etcd_initial_cluster}"
             Environment="ETCD_STRICT_RECONFIG_CHECK=true"
             Environment="ETCD_SSL_DIR=/etc/ssl/etcd"

diff --git a/google-cloud/container-linux/kubernetes/network.tf b/google-cloud/container-linux/kubernetes/network.tf
@@ -93,7 +93,7 @@ resource "google_compute_firewall" "internal-flannel" {
   target_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"]
 }
 
-# Allow prometheus (workload) to scrape node-exporter daemonset
+# Allow Prometheus to scrape node-exporter daemonset
 resource "google_compute_firewall" "internal-node-exporter" {
   name    = "${var.cluster_name}-internal-node-exporter"
   network = "${google_compute_network.network.name}"
@@ -103,7 +103,7 @@ resource "google_compute_firewall" "internal-node-exporter" {
     ports    = [9100]
   }
 
-  source_tags = ["${var.cluster_name}-worker"]
+  source_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"]
   target_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"]
 }