Skip to content

Commit

Permalink
Add etcd metrics, Prometheus scrapes, and Grafana dashboard
Browse files Browse the repository at this point in the history
* Use etcd v3.3 --listen-metrics-urls to expose only metrics
data via http://0.0.0.0:2381 on controllers
* Add Prometheus discovery for etcd peers on controller nodes
* Temporarily drop two noisy Prometheus alerts
  • Loading branch information
dghubble committed Apr 4, 2018
1 parent 642f7ec commit d770393
Show file tree
Hide file tree
Showing 10 changed files with 58 additions and 22 deletions.
10 changes: 10 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,16 @@ Notable changes between versions.

## Latest

* Enable etcd v3.3 metrics endpoint ([#175](https://github.com/poseidon/typhoon/pull/175))

#### Addons

* Add Prometheus discovery for etcd peers on controller nodes ([#175](https://github.com/poseidon/typhoon/pull/175))
* Scrape etcd v3.3 `--listen-metrics-urls` for metrics
* Enable etcd alerts and populate the etcd Grafana dashboard

## v1.10.0

* Kubernetes [v1.10.0](https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG-1.10.md#v1100)
* Remove unused, unmaintained `pxe-worker` internal module

Expand Down
16 changes: 16 additions & 0 deletions addons/prometheus/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,22 @@ data:
target_label: __metrics_path__
replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
# Scrape etcd metrics from controllers
- job_name: 'etcd'
# Discover targets from Kubernetes node objects.
kubernetes_sd_configs:
- role: node
# The metrics endpoint scraped here is served over plain HTTP.
scheme: http
relabel_configs:
# Keep only nodes whose controller node-role label has the value 'true';
# all other discovered nodes are dropped from this job.
- source_labels: [__meta_kubernetes_node_label_node_role_kubernetes_io_controller]
action: keep
regex: 'true'
# Copy every discovered Kubernetes node label onto the target, using the
# captured suffix as the label name.
- action: labelmap
regex: __meta_kubernetes_node_label_(.+)
# Rewrite the scrape address to <node name>:2381 (the etcd metrics port).
- source_labels: [__meta_kubernetes_node_name]
action: replace
target_label: __address__
replacement: '${1}:2381'
# Scrape config for service endpoints.
#
# The relabeling allows the actual service scrape endpoint to be configured
Expand Down
6 changes: 6 additions & 0 deletions addons/prometheus/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@ spec:
name: prometheus
phase: prod
spec:
nodeSelector:
node-role.kubernetes.io/master: ""
tolerations:
- key: node-role.kubernetes.io/master
operator: Exists
effect: NoSchedule
serviceAccountName: prometheus
containers:
- name: prometheus
Expand Down
20 changes: 0 additions & 20 deletions addons/prometheus/rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,26 +63,6 @@ data:
description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader
changes within the last hour
summary: a high number of leader changes within the etcd cluster are happening
- alert: HighNumberOfFailedGRPCRequests
expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
/ sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.01
for: 10m
labels:
severity: warning
annotations:
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
on etcd instance {{ $labels.instance }}'
summary: a high number of gRPC requests are failing
- alert: HighNumberOfFailedGRPCRequests
expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
/ sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.05
for: 5m
labels:
severity: critical
annotations:
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
on etcd instance {{ $labels.instance }}'
summary: a high number of gRPC requests are failing
- alert: GRPCRequestsSlow
expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le))
> 0.15
Expand Down
1 change: 1 addition & 0 deletions aws/container-linux/kubernetes/cl/controller.yaml.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ systemd:
Environment="ETCD_INITIAL_ADVERTISE_PEER_URLS=https://${etcd_domain}:2380"
Environment="ETCD_LISTEN_CLIENT_URLS=https://0.0.0.0:2379"
Environment="ETCD_LISTEN_PEER_URLS=https://0.0.0.0:2380"
Environment="ETCD_LISTEN_METRICS_URLS=http://0.0.0.0:2381"
Environment="ETCD_INITIAL_CLUSTER=${etcd_initial_cluster}"
Environment="ETCD_STRICT_RECONFIG_CHECK=true"
Environment="ETCD_SSL_DIR=/etc/ssl/etcd"
Expand Down
20 changes: 20 additions & 0 deletions aws/container-linux/kubernetes/security.tf
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,16 @@ resource "aws_security_group_rule" "controller-node-exporter" {
source_security_group_id = "${aws_security_group.worker.id}"
}

# Allow TCP 9100 ingress to controllers from other members of the controller
# security group (the rule name indicates this port is node-exporter).
resource "aws_security_group_rule" "controller-node-exporter-self" {
security_group_id = "${aws_security_group.controller.id}"

type = "ingress"
protocol = "tcp"
from_port = 9100
to_port = 9100
# self = true makes the controller security group its own traffic source.
self = true
}

resource "aws_security_group_rule" "controller-kubelet-self" {
security_group_id = "${aws_security_group.controller.id}"

Expand Down Expand Up @@ -256,6 +266,16 @@ resource "aws_security_group_rule" "worker-flannel-self" {
# Allow TCP 9100 ingress to workers from the controller security group
# (the rule name indicates this port is node-exporter).
# NOTE(review): presumably needed so a Prometheus pod running on a controller
# can scrape workers — confirm against the Prometheus deployment placement.
resource "aws_security_group_rule" "worker-node-exporter" {
security_group_id = "${aws_security_group.worker.id}"

type = "ingress"
protocol = "tcp"
from_port = 9100
to_port = 9100
# Traffic source is the controller security group, not the worker group itself.
source_security_group_id = "${aws_security_group.controller.id}"
}

resource "aws_security_group_rule" "worker-node-exporter-self" {
security_group_id = "${aws_security_group.worker.id}"

type = "ingress"
protocol = "tcp"
from_port = 9100
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ systemd:
Environment="ETCD_INITIAL_ADVERTISE_PEER_URLS=https://${domain_name}:2380"
Environment="ETCD_LISTEN_CLIENT_URLS=https://0.0.0.0:2379"
Environment="ETCD_LISTEN_PEER_URLS=https://0.0.0.0:2380"
Environment="ETCD_LISTEN_METRICS_URLS=http://0.0.0.0:2381"
Environment="ETCD_INITIAL_CLUSTER=${etcd_initial_cluster}"
Environment="ETCD_STRICT_RECONFIG_CHECK=true"
Environment="ETCD_SSL_DIR=/etc/ssl/etcd"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ systemd:
Environment="ETCD_INITIAL_ADVERTISE_PEER_URLS=https://${etcd_domain}:2380"
Environment="ETCD_LISTEN_CLIENT_URLS=https://0.0.0.0:2379"
Environment="ETCD_LISTEN_PEER_URLS=https://0.0.0.0:2380"
Environment="ETCD_LISTEN_METRICS_URLS=http://0.0.0.0:2381"
Environment="ETCD_INITIAL_CLUSTER=${etcd_initial_cluster}"
Environment="ETCD_STRICT_RECONFIG_CHECK=true"
Environment="ETCD_SSL_DIR=/etc/ssl/etcd"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ systemd:
Environment="ETCD_INITIAL_ADVERTISE_PEER_URLS=https://${etcd_domain}:2380"
Environment="ETCD_LISTEN_CLIENT_URLS=https://0.0.0.0:2379"
Environment="ETCD_LISTEN_PEER_URLS=https://0.0.0.0:2380"
Environment="ETCD_LISTEN_METRICS_URLS=http://0.0.0.0:2381"
Environment="ETCD_INITIAL_CLUSTER=${etcd_initial_cluster}"
Environment="ETCD_STRICT_RECONFIG_CHECK=true"
Environment="ETCD_SSL_DIR=/etc/ssl/etcd"
Expand Down
4 changes: 2 additions & 2 deletions google-cloud/container-linux/kubernetes/network.tf
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ resource "google_compute_firewall" "internal-flannel" {
target_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"]
}

# Allow prometheus (workload) to scrape node-exporter daemonset
# Allow Prometheus to scrape node-exporter daemonset
resource "google_compute_firewall" "internal-node-exporter" {
name = "${var.cluster_name}-internal-node-exporter"
network = "${google_compute_network.network.name}"
Expand All @@ -103,7 +103,7 @@ resource "google_compute_firewall" "internal-node-exporter" {
ports = [9100]
}

source_tags = ["${var.cluster_name}-worker"]
source_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"]
target_tags = ["${var.cluster_name}-controller", "${var.cluster_name}-worker"]
}

Expand Down

0 comments on commit d770393

Please sign in to comment.