Skip to content

Commit

Permalink
Merge pull request #78 from sighupio/sync_kube_prom
Browse files Browse the repository at this point in the history
Sync the Monitoring module to kube-prometheus
  • Loading branch information
nandajavarma authored Jan 28, 2022
2 parents 75b08ad + bcc08e0 commit 7c0c518
Show file tree
Hide file tree
Showing 30 changed files with 242 additions and 98 deletions.
4 changes: 3 additions & 1 deletion katalog/alertmanager-operated/deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,9 @@ spec:
- name: web
port: 9093
targetPort: web
- name: reloader-web
port: 8080
targetPort: reloader-web
selector:
alertmanager: main
app: alertmanager
sessionAffinity: ClientIP
2 changes: 2 additions & 0 deletions katalog/alertmanager-operated/sm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ spec:
endpoints:
- interval: 30s
port: web
- interval: 30s
port: reloader-web
selector:
matchLabels:
alertmanager: main
16 changes: 8 additions & 8 deletions katalog/configs/bases/coredns/dashboards/coredns.json
Original file line number Diff line number Diff line change
Expand Up @@ -849,7 +849,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"description": "This graph gives an overview on the proxy requests volume on a given pod receiving DNS requests on a 2 minutes time window.",
"description": "This graph gives an overview on the forward requests volume on a given pod receiving DNS requests on a 2 minutes time window.",
"fill": 1,
"gridPos": {
"h": 7,
Expand Down Expand Up @@ -881,7 +881,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum by (instance,pod,proto) (irate(coredns_proxy_request_count_total{job=\"coredns\",instance=~\"$instance\"}[2m]))",
"expr": "sum by (instance,pod,proto) (irate(coredns_forward_requests_total{job=\"coredns\",instance=~\"$instance\"}[2m]))",
"format": "time_series",
"hide": false,
"intervalFactor": 2,
Expand All @@ -892,7 +892,7 @@
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Proxy Requests Volume",
"title": "Forward Requests Volume",
"tooltip": {
"shared": true,
"sort": 0,
Expand Down Expand Up @@ -935,7 +935,7 @@
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"description": "This graph gives an overview of the 99th percentile requests latency for a given pod proxying DNS requests.",
"description": "This graph gives an overview of the 99th percentile requests latency for a given pod forwarding DNS requests.",
"fill": 1,
"gridPos": {
"h": 7,
Expand Down Expand Up @@ -967,22 +967,22 @@
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum by (le,instance,pod,proto) (rate(coredns_proxy_request_duration_seconds_bucket{job=\"coredns\",instance=~\"$instance\"}[2m])))",
"expr": "histogram_quantile(0.99, sum by (le,instance,pod,proto) (rate(coredns_forward_request_duration_seconds_bucket{job=\"coredns\",instance=~\"$instance\"}[2m])))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{ pod }}/{{ proto }} p99",
"refId": "A"
},
{
"expr": "histogram_quantile(0.90, sum by (le,instance,pod,proto) (rate(coredns_proxy_request_duration_seconds_bucket{job=\"coredns\",instance=~\"$instance\"}[2m])))",
"expr": "histogram_quantile(0.90, sum by (le,instance,pod,proto) (rate(coredns_forward_request_duration_seconds_bucket{job=\"coredns\",instance=~\"$instance\"}[2m])))",
"format": "time_series",
"hide": true,
"intervalFactor": 2,
"legendFormat": "{{ pod }}/{{ proto }} p90",
"refId": "B"
},
{
"expr": "histogram_quantile(0.50, sum by (le,instance,pod,proto) (rate(coredns_proxy_request_duration_seconds_bucket{job=\"coredns\",instance=~\"$instance\"}[2m])))",
"expr": "histogram_quantile(0.50, sum by (le,instance,pod,proto) (rate(coredns_forward_request_duration_seconds_bucket{job=\"coredns\",instance=~\"$instance\"}[2m])))",
"format": "time_series",
"hide": true,
"intervalFactor": 2,
Expand All @@ -993,7 +993,7 @@
"thresholds": [],
"timeFrom": null,
"timeShift": null,
"title": "Proxy Requests 99th Percentile Requests Duration",
"title": "Forward Requests 99th Percentile Requests Duration",
"tooltip": {
"shared": true,
"sort": 0,
Expand Down
2 changes: 1 addition & 1 deletion katalog/configs/bases/coredns/service-monitors/coredns.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,4 +35,4 @@ spec:
protocol: TCP
targetPort: 9153
selector:
k8s-app: kube-dns
k8s-app: kube-dns
6 changes: 3 additions & 3 deletions katalog/configs/bases/default/dashboards/apiserver.json
Original file line number Diff line number Diff line change
Expand Up @@ -1470,7 +1470,7 @@
"multi": false,
"name": "cluster",
"options": [],
"query": "label_values(apiserver_request_total, cluster)",
"query": "label_values(up{job=\"apiserver\"}, cluster)",
"refresh": 2,
"regex": "",
"sort": 1,
Expand All @@ -1490,7 +1490,7 @@
"multi": false,
"name": "instance",
"options": [],
"query": "label_values(apiserver_request_total{job=\"apiserver\", cluster=\"$cluster\"}, instance)",
"query": "label_values(up{job=\"apiserver\", cluster=\"$cluster\"}, instance)",
"refresh": 2,
"regex": "",
"sort": 1,
Expand Down Expand Up @@ -1535,4 +1535,4 @@
"title": "Kubernetes / API server",
"uid": "09ec8aa1e996d6ffcd6817bbaff4db1b",
"version": 0
}
}
6 changes: 3 additions & 3 deletions katalog/configs/bases/default/dashboards/kubelet.json
Original file line number Diff line number Diff line change
Expand Up @@ -2119,11 +2119,11 @@
"datasource": "$datasource",
"hide": 0,
"includeAll": true,
"label": null,
"label": "instance",
"multi": false,
"name": "instance",
"options": [],
"query": "label_values(kubelet_runtime_operations_total{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\"}, instance)",
"query": "label_values(up{job=\"kubelet\", metrics_path=\"/metrics\",cluster=\"$cluster\"}, instance)",
"refresh": 2,
"regex": "",
"sort": 1,
Expand Down Expand Up @@ -2168,4 +2168,4 @@
"title": "Kubernetes / Kubelet",
"uid": "3138fa155d5915769fbded898ac09fd9",
"version": 0
}
}
6 changes: 3 additions & 3 deletions katalog/configs/bases/default/dashboards/workload-total.json
Original file line number Diff line number Diff line change
Expand Up @@ -1079,7 +1079,7 @@
"multi": false,
"name": "cluster",
"options": [],
"query": "label_values(kube_pod_info, cluster)",
"query": "label_values(up{job=\"kube-state-metrics\"}, cluster)",
"refresh": 2,
"regex": "",
"sort": 0,
Expand All @@ -1102,7 +1102,7 @@
"definition": "label_values(container_network_receive_packets_total{cluster=\"$cluster\"}, namespace)",
"hide": 0,
"includeAll": true,
"label": null,
"label": "namespace",
"multi": false,
"name": "namespace",
"options": [],
Expand Down Expand Up @@ -1284,4 +1284,4 @@
"title": "Kubernetes / Networking / Workload",
"uid": "728bf77cc1166d2f3133bf25846876cc",
"version": 0
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ spec:
sourceLabels:
- __name__
- action: drop
regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs)
regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs|longrunning_gauge|registered_watchers)
sourceLabels:
- __name__
- action: drop
Expand Down
9 changes: 7 additions & 2 deletions katalog/configs/bases/default/service-monitors/kubelet.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ spec:
sourceLabels:
- __name__
- action: drop
regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs)
regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs|longrunning_gauge|registered_watchers)
sourceLabels:
- __name__
- action: drop
Expand Down Expand Up @@ -64,11 +64,16 @@ spec:
sourceLabels:
- __name__
- action: drop
regex: (container_fs_.*|container_spec_.*|container_blkio_device_usage_total|container_file_descriptors|container_sockets|container_threads_max|container_threads|container_start_time_seconds|container_last_seen);;
regex: (container_spec_.*|container_file_descriptors|container_sockets|container_threads_max|container_threads|container_start_time_seconds|container_last_seen);;
sourceLabels:
- __name__
- pod
- namespace
- action: drop
regex: (container_blkio_device_usage_total);.+
sourceLabels:
- __name__
- container
path: /metrics/cadvisor
port: https-metrics
relabelings:
Expand Down
2 changes: 1 addition & 1 deletion katalog/configs/kubeadm/dashboards/scheduler.json
Original file line number Diff line number Diff line change
Expand Up @@ -907,7 +907,7 @@
"multi": false,
"name": "instance",
"options": [],
"query": "label_values(process_cpu_seconds_total{cluster=\"$cluster\", job=\"kube-scheduler\"}, instance)",
"query": "label_values(up{job=\"kube-scheduler\", cluster=\"$cluster\"}, instance)",
"refresh": 2,
"regex": "",
"sort": 1,
Expand Down
12 changes: 6 additions & 6 deletions katalog/configs/kubeadm/rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ spec:
severity: critical
- alert: KubeClientCertificateExpiration
annotations:
description: A client certificate used to authenticate to the apiserver is expiring in less than 30.0 days.
description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 30.0 days.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration
summary: Client certificate is about to expire.
expr: |
Expand All @@ -45,7 +45,7 @@ spec:
severity: warning
- alert: KubeClientCertificateExpiration
annotations:
description: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.
description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 7.0 days.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration
summary: Client certificate is about to expire.
expr: |
Expand Down Expand Up @@ -188,14 +188,14 @@ spec:
for: 10m
labels:
severity: warning
- alert: CoreDNSProxyRequestsLatency
- alert: CoreDNSForwardRequestsLatency
annotations:
message: 'CoreDNS instance {{ $labels.instance }} proxy requests
message: 'CoreDNS instance {{ $labels.instance }} forward requests
latency too high, current latency is {{ $value | printf "%.2f" }}.'
doc: "This alert fires if CoreDNS 99th percentile proxy requests
doc: "This alert fires if CoreDNS 99th percentile forward requests
latency was higher than 500ms in the last 10 minutes."
expr: |
histogram_quantile(0.99, sum by (le,instance,pod,proto) (rate(coredns_proxy_request_duration_seconds_bucket[2m]))) > 0.5
histogram_quantile(0.99, sum by (le,instance,pod,proto) (rate(coredns_forward_request_duration_seconds_bucket[2m]))) > 0.5
for: 10m
labels:
severity: warning
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ spec:
sourceLabels:
- __name__
- action: drop
regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs)
regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs|longrunning_gauge|registered_watchers)
sourceLabels:
- __name__
- action: drop
Expand Down
6 changes: 6 additions & 0 deletions katalog/grafana/deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ spec:
name: grafana-datasources
- mountPath: /etc/grafana/provisioning/dashboards
name: grafana-dashboards
- mountPath: /etc/grafana
name: grafana-config
readOnly: false
nodeSelector:
beta.kubernetes.io/os: linux
securityContext:
Expand All @@ -93,6 +96,9 @@ spec:
- name: grafana-dashboards
configMap:
name: grafana-dashboards
- name: grafana-config
secret:
secretName: grafana-config
---
apiVersion: v1
kind: Service
Expand Down
13 changes: 13 additions & 0 deletions katalog/grafana/grafana-config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Copyright (c) 2021 SIGHUP s.r.l All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.

# Secret holding Grafana's main configuration file (grafana.ini).
# The Grafana deployment references it by secretName `grafana-config`
# and mounts it at /etc/grafana.
apiVersion: v1
kind: Secret
metadata:
  name: grafana-config
stringData:
  # Plain-text form of the file; the key name becomes the file name in the mount.
  grafana.ini: |
    [date_formats]
    default_timezone = UTC
type: Opaque
1 change: 1 addition & 0 deletions katalog/grafana/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ resources:
- dashboards
- deploy.yml
- rbac.yml
- grafana-config.yml

images:
- name: grafana/grafana
Expand Down
8 changes: 4 additions & 4 deletions katalog/kube-proxy-metrics/dashboards/kube-proxy.json
Original file line number Diff line number Diff line change
Expand Up @@ -1002,7 +1002,7 @@
"value": "default"
},
"hide": 0,
"label": null,
"label": "Data Source",
"name": "datasource",
"options": [],
"query": "prometheus",
Expand All @@ -1020,7 +1020,7 @@
"multi": false,
"name": "cluster",
"options": [],
"query": "label_values(kube_pod_info, cluster)",
"query": "label_values(up{job=\"kube-proxy\"}, cluster)",
"refresh": 2,
"regex": "",
"sort": 1,
Expand All @@ -1040,7 +1040,7 @@
"multi": false,
"name": "instance",
"options": [],
"query": "label_values(kubeproxy_network_programming_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-proxy\"}, instance)",
"query": "label_values(up{job=\"kube-proxy\", cluster=\"$cluster\"}, instance)",
"refresh": 2,
"regex": "",
"sort": 1,
Expand Down Expand Up @@ -1085,4 +1085,4 @@
"title": "Kubernetes / Proxy",
"uid": "632e265de029684c40b21cb76bca4f94",
"version": 0
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -741,7 +741,7 @@
],
"targets": [
{
"expr": "sum(kube_pod_owner{cluster=\"$cluster\"}) by (namespace)",
"expr": "sum(kube_pod_owner{job=\"kube-state-metrics\", cluster=\"$cluster\"}) by (namespace)",
"format": "table",
"instant": true,
"intervalFactor": 2,
Expand Down Expand Up @@ -2644,4 +2644,4 @@
"title": "Kubernetes / Compute Resources / Cluster",
"uid": "efa86fd1d0c121a26444b636a3f509a8",
"version": 0
}
}
Loading

0 comments on commit 7c0c518

Please sign in to comment.