diff --git a/.github/workflows/kubeconform.yml b/.github/workflows/kubeconform.yml new file mode 100644 index 000000000..328c1ed4a --- /dev/null +++ b/.github/workflows/kubeconform.yml @@ -0,0 +1,49 @@ +--- +name: kubeconform + +on: + push: + branches: [master] + pull_request: + branches: [master] + +jobs: + apps: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-go@v2 + with: + go-version: '1.15' + - run: go get -u github.com/yannh/kubeconform/cmd/kubeconform + # - run: ./hack/generate-schemas.sh + - run: > + kubeconform + -schema-location 'https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/{{ .NormalizedKubernetesVersion }}-standalone{{ .StrictSuffix }}/{{ .ResourceKind }}.json' + #-schema-location 'crdschemas/{{ .ResourceKind }}.json' + #-skip CustomResourceDefinition,SealedSecret,ConfigMapSecret + -skip CustomResourceDefinition,SealedSecret,ConfigMapSecret,ServiceMonitor,PodMonitor,Probe,Prometheus,Alertmanager + -ignore-filename-pattern vendor/* + -ignore-filename-pattern jsonnet/* + -summary + apps/ + base: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - uses: actions/setup-go@v2 + with: + go-version: '1.15' + - run: go get -u github.com/yannh/kubeconform/cmd/kubeconform + # - run: ./hack/generate-schemas.sh + - run: > + kubeconform + -schema-location 'https://raw.githubusercontent.com/yannh/kubernetes-json-schema/master/{{ .NormalizedKubernetesVersion }}-standalone{{ .StrictSuffix }}/{{ .ResourceKind }}.json' + #-schema-location 'crdschemas/{{ .ResourceKind }}.json' + #-skip CustomResourceDefinition,SealedSecret,ConfigMapSecret,Plan,Application,AppProject,ClusterIssuer + -skip CustomResourceDefinition,SealedSecret,ConfigMapSecret,Plan,Application,AppProject,ClusterIssuer,ServiceMonitor,PodMonitor + -ignore-filename-pattern vendor/* + -ignore-filename-pattern jsonnet/* + -summary + base/ + diff --git a/.github/workflows/kubeval.yml b/.github/workflows/kubeval.yml deleted file mode 100644 index dcb6e3813..000000000 --- a/.github/workflows/kubeval.yml +++ /dev/null @@ -1,28 +0,0 @@ ---- -name: kubeval - -on: - push: - branches: [master] - pull_request: - branches: [master] - -jobs: - apps: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - uses: actions/setup-go@v2 - with: - go-version: '1.15' - - run: go get -u github.com/instrumenta/kubeval - - run: kubeval --skip-kinds Plan,SealedSecret,ConfigMapSecret,CustomResourceDefinition,ServiceMonitor,PodMonitor,Probe,Prometheus,PrometheusRule,Alertmanager,APIService,Ingress --strict --force-color --ignored-filename-patterns .yml -d apps/ - base: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - uses: actions/setup-go@v2 - with: - go-version: '1.15' - - run: go get -u github.com/instrumenta/kubeval - - run: kubeval --skip-kinds Plan,SealedSecret,Application,CustomResourceDefinition,ClusterIssuer,AppProject,ServiceMonitor,PodMonitor,PrometheusRule,Ingress --strict --force-color -d base/ diff --git a/.gitignore b/.gitignore index 850bb19c4..1ff2e2e77 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,5 @@ master.key .kube bin/ templates/ +vendor/ +crdschemas/ diff --git a/apps/monitoring/README.md b/apps/monitoring/README.md new file mode 100644 index 000000000..3120775b7 --- /dev/null +++ b/apps/monitoring/README.md @@ -0,0 +1,17 @@ +# What is this? + +Customized kube-prometheus stack for @paulfantom's personal homelab. This is also one of the few public usages of kube-prometheus. + +## How does this work? + +### Short version + +1. 
`./generate.sh` +2. Commit and push +3. Profit + +### Long version + +`kube-prometheus` is used as a library and installed with `jb`. Next, the customization stored in `jsonnet/main.jsonnet` is +applied. After this, `jsonnet` is used to generate the `manifests/` directory, and ConfigMapSecrets are copied into `manifests/` +from the `configmapsecrets/` directory. diff --git a/apps/monitoring/manifests/alertmanager/config.yaml b/apps/monitoring/configmapsecrets/alertmanager-secret.yaml similarity index 100% rename from apps/monitoring/manifests/alertmanager/config.yaml rename to apps/monitoring/configmapsecrets/alertmanager-secret.yaml diff --git a/apps/monitoring/manifests/prometheus/03_scrapeconfigs.yaml b/apps/monitoring/configmapsecrets/prometheus-additionalScrapeConfigs.yaml similarity index 85% rename from apps/monitoring/manifests/prometheus/03_scrapeconfigs.yaml rename to apps/monitoring/configmapsecrets/prometheus-additionalScrapeConfigs.yaml index 852d72378..c0b165e9a 100644 --- a/apps/monitoring/manifests/prometheus/03_scrapeconfigs.yaml +++ b/apps/monitoring/configmapsecrets/prometheus-additionalScrapeConfigs.yaml @@ -17,21 +17,6 @@ spec: labels: node: 'DESKTOP-ODOR2KB' - # https://github.com/prometheus-pve/prometheus-pve-exporter - #- job_name: 'pve' - # static_configs: - # - targets: - # - 192.168.2.40 # Proxmox VE node. - # metrics_path: /pve - # params: - # module: [default] - # relabel_configs: - # - source_labels: [__address__] - # target_label: __param_target - # - source_labels: [__param_target] - # target_label: instance - # - target_label: __address__ - # replacement: 127.0.0.1:9221 # PVE exporter. - job_name: lancre scrape_interval: 30s scrape_timeout: 30s diff --git a/apps/monitoring/generate.sh b/apps/monitoring/generate.sh new file mode 100755 index 000000000..aa85229a0 --- /dev/null +++ b/apps/monitoring/generate.sh @@ -0,0 +1,29 @@ +#!/bin/bash + +set -euo pipefail + +# Install dependencies +if [ ! 
-d 'jsonnet/vendor' ]; then + cd jsonnet + jb install + cd ../ +fi + +# Remove old manifests +rm -rf manifests || : + +# Generate manifests +jsonnet -J jsonnet/vendor -c -m manifests -S jsonnet/main.jsonnet + +# Next step is just an eye-candy and only beautifies yaml files +for i in $(find manifests/ -name *.yaml); do + mv "$i" "$i.bak" + yamlfmt < "$i.bak" > "$i" + rm "$i.bak" +done + +# Copy ConfigMapSecrets +for i in configmapsecrets/*.yaml; do + f="$(basename "$i" | sed 's/-/\//')" + cp "$i" "manifests/$f" +done diff --git a/apps/monitoring/jsonnet/ext/blackboxExporterConfig.json b/apps/monitoring/jsonnet/ext/blackboxExporterConfig.json new file mode 100644 index 000000000..9ae114146 --- /dev/null +++ b/apps/monitoring/jsonnet/ext/blackboxExporterConfig.json @@ -0,0 +1,70 @@ +{ + "modules": { + "http_2xx": { + "http": { + "preferred_ip_protocol": "ip4" + }, + "prober": "http" + }, + "http_post_2xx": { + "http": { + "method": "POST", + "preferred_ip_protocol": "ip4" + }, + "prober": "http" + }, + "irc_banner": { + "prober": "tcp", + "tcp": { + "preferred_ip_protocol": "ip4", + "query_response": [ + { + "send": "NICK prober" + }, + { + "send": "USER prober prober prober :prober" + }, + { + "expect": "PING :([^ ]+)", + "send": "PONG ${1}" + }, + { + "expect": "^:[^ ]+ 001" + } + ] + } + }, + "pop3s_banner": { + "prober": "tcp", + "tcp": { + "preferred_ip_protocol": "ip4", + "query_response": [ + { + "expect": "^+OK" + } + ], + "tls": true, + "tls_config": { + "insecure_skip_verify": false + } + } + }, + "ssh_banner": { + "prober": "tcp", + "tcp": { + "preferred_ip_protocol": "ip4", + "query_response": [ + { + "expect": "^SSH-2.0-" + } + ] + } + }, + "tcp_connect": { + "prober": "tcp", + "tcp": { + "preferred_ip_protocol": "ip4" + } + } + } +} diff --git a/apps/monitoring/jsonnet/ext/rules/testing.json b/apps/monitoring/jsonnet/ext/rules/testing.json new file mode 100644 index 000000000..b086b88bb --- /dev/null +++ b/apps/monitoring/jsonnet/ext/rules/testing.json @@ -0,0 +1,22 @@ +{ + "groups": [ + { + "name": "testing.rules", + "rules": [ + { + "alert": "CPUStealTimeHigh", + "annotations": { + "description": "CPU Steal Time is very high on {{ $labels.instance }} hypervisor. This can lead to VM being stalled.", + "runbook_url": "https://github.com/thaum-xyz/ankhmorpork/blob/master/docs/runbooks/CPUStealTimeHigh.md", + "summary": "High CPU Steal Time" + }, + "expr": "sum by (instance) (rate(node_cpu_seconds_total{mode=\"steal\"}[3m])) / count by (instance) (node_cpu_seconds_total{mode=\"steal\"}) > 0.1\n", + "for": "20m", + "labels": { + "severity": "warning" + } + } + ] + } + ] +} diff --git a/apps/monitoring/jsonnet/ext/rules/thaum.json b/apps/monitoring/jsonnet/ext/rules/thaum.json new file mode 100644 index 000000000..de1b59a92 --- /dev/null +++ b/apps/monitoring/jsonnet/ext/rules/thaum.json @@ -0,0 +1,100 @@ +{ + "groups": [ + { + "name": "custom node alert rules", + "rules": [ + { + "alert": "PackagesAvailable", + "annotations": { + "description": "{{ $value }} packages are available for upgrade. 
Maybe it is time to upgrade?", + "runbook_url": "https://github.com/thaum-xyz/ankhmorpork/blob/master/docs/runbooks/PackagesAvailable.md", + "summary": "Packages are available for upgrade" + }, + "expr": "sum by (node,instance) (yum_upgrades_pending) > 200\nor\nsum by (node,instance) (apt_upgrades_pending) > 200\n", + "for": "48h", + "labels": { + "severity": "info" + } + }, + { + "alert": "RebootRequired", + "annotations": { + "description": "Instance '{{ $labels.instance }}' was upgraded and now requires a reboot.", + "runbook_url": "https://github.com/thaum-xyz/ankhmorpork/blob/master/docs/runbooks/RebootRequired.md", + "summary": "Reboot is required to finish package upgrade" + }, + "expr": "node_reboot_required > 0", + "for": "4h", + "labels": { + "severity": "info" + } + } + ] + }, + { + "name": "alert rules specific to thaum.xyz", + "rules": [ + { + "alert": "FederatedPrometheusDown", + "annotations": { + "description": "Remote Prometheus server {{ $labels.instance }} has been down for more than 10 minutes.", + "runbook_url": "https://github.com/thaum-xyz/ankhmorpork/blob/master/docs/runbooks/FederatedPrometheusDown.md", + "summary": "Federated prometheus is down" + }, + "expr": "up{job=\"lancre\"} == 0", + "for": "20m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "FilesystemReadOnly", + "annotations": { + "description": "Filesystem went read-only on {{ $labels.instance }}. Check FS for possible corruption.", + "summary": "Filesystem went read-only possibly due to device error." + }, + "expr": "node_filesystem_readonly{fstype=~\"(vfat|ext4|xfs)\"} != 0\n", + "labels": { + "severity": "critical" + } + }, + { + "alert": "TouchscreenNotAvailable", + "annotations": { + "description": "Powercycle device {{ $labels.instance }} to bring touchscreen up", + "summary": "Touchscreen not available" + }, + "expr": "devices_input_touchscreen_up == 0 or absent(devices_input_touchscreen_up)\n", + "for": "10m", + "labels": { + "severity": "warning" + } + }, + { + "alert": "TouchscreenNotAvailable", + "annotations": { + "description": "Powercycle device {{ $labels.instance }}", + "summary": "Touchscreen not available and automatic remediation failed to restore it" + }, + "expr": "devices_input_touchscreen_up == 0 or absent(devices_input_touchscreen_up)\n", + "for": "1h", + "labels": { + "severity": "critical" + } + }, + { + "alert": "TemperaturesNotAvailable", + "annotations": { + "description": "Temperature data is gone. 
Immediatelly switch off all relays and check OW bus.", + "summary": "Cannot obtain temperature data" + }, + "expr": "absent(evok_temperature_celsius)\n", + "for": "15m", + "labels": { + "severity": "critical" + } + } + ] + } + ] +} diff --git a/apps/monitoring/jsonnet/jsonnetfile.json b/apps/monitoring/jsonnet/jsonnetfile.json new file mode 100644 index 000000000..986fdeba1 --- /dev/null +++ b/apps/monitoring/jsonnet/jsonnetfile.json @@ -0,0 +1,42 @@ +{ + "version": 1, + "dependencies": [ + { + "source": { + "git": { + "remote": "https://github.com/prometheus-operator/kube-prometheus", + "subdir": "jsonnet/kube-prometheus" + } + }, + "version": "master" + }, + { + "source": { + "git": { + "remote": "https://github.com/kubernetes/kube-state-metrics", + "subdir": "jsonnet/kube-state-metrics" + } + }, + "version": "release-2.0" + }, + { + "source": { + "git": { + "remote": "https://github.com/kubernetes/kube-state-metrics", + "subdir": "jsonnet/kube-state-metrics-mixin" + } + }, + "version": "release-2.0" + }, + { + + "source": { + "git": { + "remote": "https://github.com/povilasv/coredns-mixin" + } + }, + "version": "master" + } + ], + "legacyImports": true +} diff --git a/apps/monitoring/jsonnet/jsonnetfile.lock.json b/apps/monitoring/jsonnet/jsonnetfile.lock.json new file mode 100644 index 000000000..09be111ad --- /dev/null +++ b/apps/monitoring/jsonnet/jsonnetfile.lock.json @@ -0,0 +1,179 @@ +{ + "version": 1, + "dependencies": [ + { + "source": { + "git": { + "remote": "https://github.com/brancz/kubernetes-grafana.git", + "subdir": "grafana" + } + }, + "version": "8ea4e7bc04b1bf5e9bd99918ca28c6271b42be0e", + "sum": "muenICtKXABk6MZZHCZD2wCbmtiE96GwWRMGa1Rg+wA=" + }, + { + "source": { + "git": { + "remote": "https://github.com/etcd-io/etcd.git", + "subdir": "Documentation/etcd-mixin" + } + }, + "version": "dae29bb719dd69dc119146fc297a0628fcc1ccf8", + "sum": "EgKKzxcW3ttt7gjPMX//DNTqNcn/0o2VAIaWJ/HSLEc=" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet-lib.git", + "subdir": "grafonnet" + } + }, + "version": "92878854ea7ccd7c85dd052ac4994c8b05338068", + "sum": "z0LLJrcnSpB64162KXfqUg8jEmQxWlzOB1CLhTmzNiQ=" + }, + { + "source": { + "git": { + "remote": "https://github.com/grafana/jsonnet-libs.git", + "subdir": "grafana-builder" + } + }, + "version": "a5c8e40749c2bfa6f02dd9029f69f0819385f041", + "sum": "EmHrmBY8PbnV0BKXmVWvAEmax6eglRinKSyZbTmVWuc=" + }, + { + "source": { + "git": { + "remote": "https://github.com/ksonnet/ksonnet-lib.git", + "subdir": "" + } + }, + "version": "0d2f82676817bbf9e4acf6495b2090205f323b9f", + "sum": "h28BXZ7+vczxYJ2sCt8JuR9+yznRtU/iA6DCpQUrtEg=", + "name": "ksonnet" + }, + { + "source": { + "git": { + "remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin.git", + "subdir": "" + } + }, + "version": "81f090c45d67eefe2d7708ad16017ad529f80f81", + "sum": "CwwoLS0xw/QYk0MR97vuFFKO1YXQlt4Z3bYAeskmlEk=" + }, + { + "source": { + "git": { + "remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin.git", + "subdir": "lib/promgrafonnet" + } + }, + "version": "81f090c45d67eefe2d7708ad16017ad529f80f81", + "sum": "zv7hXGui6BfHzE9wPatHI/AGZa4A2WKo6pq7ZdqBsps=" + }, + { + "source": { + "git": { + "remote": "https://github.com/kubernetes/kube-state-metrics.git", + "subdir": "jsonnet/kube-state-metrics" + } + }, + "version": "baacc7bfe03a73c3b54e67866a3224d849f5d846", + "sum": "aE6e4P6NiMf5eQMv0w4hy+oSeLBzwCrjUSkP+DSgrro=" + }, + { + "source": { + "git": { + "remote": "https://github.com/kubernetes/kube-state-metrics.git", + 
"subdir": "jsonnet/kube-state-metrics-mixin" + } + }, + "version": "baacc7bfe03a73c3b54e67866a3224d849f5d846", + "sum": "Yf8mNAHrV1YWzrdV8Ry5dJ8YblepTGw3C0Zp10XIYLo=" + }, + { + "source": { + "git": { + "remote": "https://github.com/povilasv/coredns-mixin.git", + "subdir": "" + } + }, + "version": "fd8418f3e564c0f21506fd65cf51f805ef28c4a0", + "sum": "RJzNXxpUgrGamHRdRjT8c5d8aXjpskm0kvrVpm2//qo=" + }, + { + "source": { + "git": { + "remote": "https://github.com/prometheus-operator/kube-prometheus.git", + "subdir": "jsonnet/kube-prometheus" + } + }, + "version": "2ff04a63a38fd3964c3d10a3a7a12a3d6d78a687", + "sum": "hyoV3FMLQa3qjKpYe+6uitiYSud+IhKPyDYBIoI6k1M=" + }, + { + "source": { + "git": { + "remote": "https://github.com/prometheus-operator/prometheus-operator.git", + "subdir": "jsonnet/mixin" + } + }, + "version": "58ec20eabd7847e6ed249191ccdba9ae98c5fdf2", + "sum": "6reUygVmQrLEWQzTKcH8ceDbvM+2ztK3z2VBR2K2l+U=" + }, + { + "source": { + "git": { + "remote": "https://github.com/prometheus-operator/prometheus-operator.git", + "subdir": "jsonnet/prometheus-operator" + } + }, + "version": "5555f492df250168657b72bb8cb60bec071de71f", + "sum": "quzK9/gITldAfVGBkFUsLjQ3Y2F4NOJ2GQUjPSD8HHQ=" + }, + { + "source": { + "git": { + "remote": "https://github.com/prometheus/alertmanager.git", + "subdir": "doc/alertmanager-mixin" + } + }, + "version": "e6a1bede89403920a9875f43e6981ed46dd2ab05", + "sum": "VP1vn/WTGLZaBgGhGMUO81qNTc/fnp5KtzVjcaxad6Q=", + "name": "alertmanager" + }, + { + "source": { + "git": { + "remote": "https://github.com/prometheus/node_exporter.git", + "subdir": "docs/node-mixin" + } + }, + "version": "0e74fbcd5fe3b98246292829a8e81e3133e17033", + "sum": "cZTNXQMUCLB5FGYpMn845dcqGdkcYt58qCqOFIV/BoQ=" + }, + { + "source": { + "git": { + "remote": "https://github.com/prometheus/prometheus.git", + "subdir": "documentation/prometheus-mixin" + } + }, + "version": "e4487274853c587717006eeda8804e597d120340", + "sum": "6kUzElCBWZ5U/3cxEpHNMmoKKPubG45QxpmLu8PGg08=", + "name": "prometheus" + }, + { + "source": { + "git": { + "remote": "https://github.com/thanos-io/thanos.git", + "subdir": "mixin" + } + }, + "version": "37e6ef61566c7c70793ba6d128f00c4c66cb2402", + "sum": "OptiWUMOHFrRGTZhSfxV1RCeXZ90qsefGNTD4lDYVG0=" + } + ], + "legacyImports": false +} diff --git a/apps/monitoring/jsonnet/lib/additional-scrape-configs.libsonnet b/apps/monitoring/jsonnet/lib/additional-scrape-configs.libsonnet new file mode 100644 index 000000000..bd0bdcd0c --- /dev/null +++ b/apps/monitoring/jsonnet/lib/additional-scrape-configs.libsonnet @@ -0,0 +1,26 @@ +{ + // This addon will configure additionalScrapeConfigs in main prometheus + // It will also load scrape configs from additional-scrape-configs.yaml file + // and wrap it into a correct secret. 
+ prometheus+: { + local p = self, + prometheus+: { + additionalScrapeConfigs: { + name: 'additional-scrape-config', + key: 'additional-scrape-configs.yaml', + }, + }, + additionalScrapeConfig: { + apiVersion: 'v1', + kind: 'Secret', + metadata: { + name: 'additional-scrape-config', + namespace: p.config.namespace, + labels: { prometheus: p.config.name } + p.config.commonLabels, + }, + stringData: { + 'additional-scrape-configs.yaml': importstr 'additional-scrape-configs.yaml', + }, + }, + }, +} \ No newline at end of file diff --git a/apps/monitoring/jsonnet/lib/externalRules.libsonnet b/apps/monitoring/jsonnet/lib/externalRules.libsonnet new file mode 100644 index 000000000..c61c432f7 --- /dev/null +++ b/apps/monitoring/jsonnet/lib/externalRules.libsonnet @@ -0,0 +1,26 @@ +local defaults = { + name: error 'provide name', + namespace: 'monitoring', + labels: { + prometheus: 'k8s', + role: 'alert-rules', + }, + groups: error 'provide alert groups', +}; + +function(params) { + local cfg = defaults + params + { + objName: params.name + 'PrometheusRule', + }, + + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + labels: cfg.labels, + name: cfg.name, + namespace: cfg.namespace, + }, + spec: { + groups: cfg.groups, + } +} diff --git a/apps/monitoring/jsonnet/lib/grafana-overrides.libsonnet b/apps/monitoring/jsonnet/lib/grafana-overrides.libsonnet new file mode 100644 index 000000000..69f3b7fd4 --- /dev/null +++ b/apps/monitoring/jsonnet/lib/grafana-overrides.libsonnet @@ -0,0 +1,163 @@ +{ + service+: { + spec+: { + type: 'ClusterIP', + }, + }, + dashboardSources:: null, + dashboardDefinitions:: null, + deployment: { + apiVersion: 'apps/v1', + kind: 'Deployment', + metadata: { + labels: { + 'app.kubernetes.io/name': 'grafana', + 'app.kubernetes.io/component': 'grafana', + 'app.kubernetes.io/part-of': 'kube-prometheus', + }, + name: 'grafana', + namespace: 'monitoring', + }, + spec: { + replicas: 1, + selector: { + matchLabels: { + 'app.kubernetes.io/name': 'grafana', + 'app.kubernetes.io/component': 'grafana', + 'app.kubernetes.io/part-of': 'kube-prometheus', + }, + }, + template: { + metadata: { + labels: { + 'app.kubernetes.io/name': 'grafana', + 'app.kubernetes.io/component': 'grafana', + 'app.kubernetes.io/part-of': 'kube-prometheus', + }, + }, + spec: { + containers: [ + { + env: [ + { name: 'GF_SERVER_ROOT_URL', value: 'https://grafana.ankhmorpork.thaum.xyz' }, + { name: 'GF_AUTH_ANONYMOUS_ENABLED', value: 'false' }, + { name: 'GF_AUTH_DISABLE_LOGIN_FORM', value: 'true' }, + { name: 'GF_AUTH_SIGNOUT_REDIRECT_URL', value: 'https://auth.ankhmorpork.thaum.xyz/oauth2?logout=true' }, + { name: 'GF_AUTH_BASIC_ENABLED', value: 'false' }, + { name: 'GF_AUTH_PROXY_AUTO_SIGN_UP', value: 'false' }, + { name: 'GF_AUTH_PROXY_ENABLED', value: 'true' }, + { name: 'GF_AUTH_PROXY_HEADER_NAME', value: 'X-Auth-Request-Email' }, + { name: 'GF_AUTH_PROXY_HEADER_PROPERTY', value: 'username' }, + { name: 'GF_AUTH_PROXY_HEADERS', value: 'Email:X-Auth-Request-Email' }, + { name: 'GF_SNAPSHOTS_EXTERNAL_ENABLED', value: 'false' }, + ], + image: 'grafana/grafana:7.3.7', + //image: $.values.grafana.image, + name: 'grafana', + ports: [{ + containerPort: 3000, + name: 'http', + }], + resources: { + limits: { cpu: '400m', memory: '200Mi' }, + requests: { cpu: '100m', memory: '100Mi' }, + }, + volumeMounts: [ + { + mountPath: '/var/lib/grafana', + name: 'grafana-storage', + }, + { + mountPath: '/etc/grafana/provisioning/datasources', + name: 'grafana-datasources', + }, + ], + }, + ], + 
securityContext: { + runAsNonRoot: true, + runAsUser: 472, + }, + nodeSelector: { + 'kubernetes.io/os': 'linux', + }, + serviceAccountName: 'grafana', + volumes: [ + { + name: 'grafana-storage', + persistentVolumeClaim: { + claimName: 'grafana-data', + }, + }, + { + name: 'grafana-datasources', + secret: { + secretName: 'grafana-datasources', + }, + }, + ], + }, + }, + }, + }, + + pvc: { + kind: 'PersistentVolumeClaim', + apiVersion: 'v1', + metadata: { + name: 'grafana-data', + namespace: 'monitoring', + annotations: { + 'volume.beta.kubernetes.io/storage-class': 'longhorn' + }, + }, + spec: { + storageClassName: 'longhorn', + accessModes: ['ReadWriteMany'], + resources: { + requests: { + storage: '60Mi', + }, + }, + }, + }, + + ingress: { + apiVersion: 'networking.k8s.io/v1', + kind: 'Ingress', + metadata: { + name: 'grafana', + namespace: 'monitoring', + annotations: { + 'kubernetes.io/ingress.class': 'nginx', + 'cert-manager.io/cluster-issuer': 'letsencrypt-prod', + 'nginx.ingress.kubernetes.io/auth-url': 'https://auth.ankhmorpork.thaum.xyz/oauth2/auth', + 'nginx.ingress.kubernetes.io/auth-signin': 'https://auth.ankhmorpork.thaum.xyz/oauth2/start?rd=$scheme://$host$escaped_request_uri', + 'nginx.ingress.kubernetes.io/auth-response-headers': 'X-Auth-Request-Email', + }, + }, + spec: { + tls: [{ + hosts: ['grafana.ankhmorpork.thaum.xyz'], + secretName: 'grafana-tls', + }], + rules: [{ + host: 'grafana.ankhmorpork.thaum.xyz', + http: { + paths: [{ + path: '/', + pathType: 'Prefix', + backend: { + service: { + name: 'grafana', + port: { name: 'http' }, + }, + }, + }], + }, + }], + }, + }, + + +} diff --git a/apps/monitoring/jsonnet/lib/ingress.libsonnet b/apps/monitoring/jsonnet/lib/ingress.libsonnet new file mode 100644 index 000000000..12f691272 --- /dev/null +++ b/apps/monitoring/jsonnet/lib/ingress.libsonnet @@ -0,0 +1,109 @@ +// TODO: +// - tls +// - "functional" and parametrized design +// - parametrized ingress annotations + +local ingress(name, namespace, rules) = { + apiVersion: 'networking.k8s.io/v1', + kind: 'Ingress', + metadata: { + name: name, + namespace: namespace, + annotations: { + 'nginx.ingress.kubernetes.io/auth-type': 'basic', + 'nginx.ingress.kubernetes.io/auth-secret': 'basic-auth', + 'nginx.ingress.kubernetes.io/auth-realm': 'Authentication Required', + }, + }, + spec: { rules: rules }, +}; + +{ + // Configure External URL's per application + alertmanager+: { + alertmanager+: { + spec+: { + externalUrl: 'https://alertmanager.' + $.values.common.baseDomain, + }, + }, + ingress: ingress( + 'alertmanager-main', + $.values.common.namespace, + [{ + host: 'alertmanager.' + $.values.common.baseDomain, + http: { + paths: [{ + backend: { + service: { + name: 'alertmanager-main', + port: 'web', + }, + }, + }], + }, + }] + ), + }, + prometheus+: { + prometheus+: { + spec+: { + externalUrl: 'https://prometheus.' + $.values.common.baseDomain, + }, + }, + ingress: ingress( + 'alertmanager-main', + $.values.common.namespace, + [{ + host: 'prometheus.' + $.values.common.baseDomain, + http: { + paths: [{ + backend: { + service: { + name: 'prometheus-k8s', + port: 'web', + }, + }, + }], + }, + }] + ), + }, + grafana+: { + ingress: ingress( + 'grafana', + $.values.common.namespace, + [{ + host: 'grafana.' 
+ $.values.common.baseDomain, + http: { + paths: [{ + path: '/', + pathType: 'Prefix', + backend: { + service: { + name: 'grafana', + port: 'http', + }, + }, + }], + }, + }], + ), + + }, + { + // Create basic auth secret - replace 'auth' file with your own + ingress+:: { + 'basic-auth-secret': { + apiVersion: 'v1', + kind: 'Secret', + metadata: { + name: 'basic-auth', + namespace: $.values.common.namespace, + }, + data: { auth: std.base64(importstr 'auth') }, + type: 'Opaque', + }, + }, + }; + +{ [name + '-ingress']: kp.ingress[name] for name in std.objectFields(kp.ingress) } diff --git a/apps/monitoring/jsonnet/lib/kube-events-exporter.libsonnet b/apps/monitoring/jsonnet/lib/kube-events-exporter.libsonnet new file mode 100644 index 000000000..4ffad7cfc --- /dev/null +++ b/apps/monitoring/jsonnet/lib/kube-events-exporter.libsonnet @@ -0,0 +1,143 @@ +// TODO(paulfantom): consider moving this into https://github.com/rhobs/kube-events-exporter/tree/master/jsonnet/kube-events-exporter +// and adding as addon in kube-prometheus +// ping @dgrisonnet for opinion + +local defaults = { + local defaults = self, + namespace: error 'must provide namespace', + version: error 'must provide version', + image: error 'must provide image', + + eventTypes: [], + involvedObjectAPIGroups: [], + involvedObjectNamespaces: [], + reportingControllers: [], + + commonLabels: { + 'app.kubernetes.io/name': 'kube-events-exporter', + 'app.kubernetes.io/version': defaults.version, + 'app.kubernetes.io/component': 'events-exporter', + }, + + selectorLabels: { + [labelName]: defaults.commonLabels[labelName] + for labelName in std.objectFields(defaults.commonLabels) + if !std.setMember(labelName, ['app.kubernetes.io/version']) + }, + + resources: {}, +}; + +function(params) { + config:: defaults + params, + + serviceAccount: { + apiVersion: 'v1', + kind: 'ServiceAccount', + metadata: { + labels: $.config.commonLabels, + name: 'kube-events-exporter', + namespace: $.config.namespace, + }, + }, + + clusterRole: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRole', + metadata: { + labels: $.config.commonLabels, + name: 'kube-events-exporter', + namespace: $.config.namespace, + }, + rules: [{ + apiGroups: [''], + resources: ['events'], + verbs: ['list', 'watch'], + }], + }, + + clusterRoleBinding: { + apiVersion: 'rbac.authorization.k8s.io/v1', + kind: 'ClusterRoleBinding', + metadata: { + labels: $.config.commonLabels, + name: 'kube-events-exporter', + namespace: $.config.namespace, + }, + roleRef: { + apiGroup: 'rbac.authorization.k8s.io', + kind: 'ClusterRole', + name: $.clusterRole.metadata.name, + }, + subjects: [{ + kind: 'ServiceAccount', + name: $.serviceAccount.metadata.name, + namespace: $.config.namespace, + }], + }, + + local kee = { + args: [] + + ['--event-types=' + evType for evType in $.config.eventTypes] + + ['--involved-object-api-groups=' + apiGroup for apiGroup in $.config.involvedObjectAPIGroups] + + ['--involved-object-namespaces=' + ns for ns in $.config.involvedObjectNamespaces] + + ['--reporting-controllers=' + controller for controller in $.config.reportingControllers], + name: 'kube-events-exporter', + image: $.config.image, + ports: [ + { containerPort: 8080, name: 'event' }, + { containerPort: 8081, name: 'exporter' }, + ], + resources: $.config.resources, + }, + + deployment: { + apiVersion: 'apps/v1', + kind: 'Deployment', + metadata: { + labels: $.config.commonLabels, + name: 'kube-events-exporter', + namespace: $.config.namespace, + }, + spec: { + replicas: 1, + selector: { + 
matchLabels: $.config.selectorLabels, + }, + template: { + metadata: { + labels: $.config.commonLabels, + }, + spec: { + containers: [kee], + securityContext: { + runAsNonRoot: true, + runAsUser: 65534, + }, + serviceAccountName: $.serviceAccount.metadata.name, + }, + }, + }, + }, + + podMonitor: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PodMonitor', + metadata: { + labels: $.config.commonLabels, + name: 'kube-events-exporter', + namespace: $.config.namespace, + }, + spec: { + podMetricsEndpoints: [ + { port: 'event' }, + { port: 'exporter' }, + ], + selector: { + matchLabels: $.config.selectorLabels, + }, + }, + }, + + +} diff --git a/apps/monitoring/jsonnet/lib/kube-linter.libsonnet b/apps/monitoring/jsonnet/lib/kube-linter.libsonnet new file mode 100644 index 000000000..c9df17e04 --- /dev/null +++ b/apps/monitoring/jsonnet/lib/kube-linter.libsonnet @@ -0,0 +1,16 @@ +local ignoreCheck(name, comment) = { + metadata+: { + annotations+: { + ['ignore-check.kube-linter.io/' + name]: comment, + }, + }, +}; + +{ + alertmanager+: { + service+: ignoreCheck('dangling-service', 'Check is incompatible with prometheus-operator CRDs'), + }, + prometheus+: { + service+: ignoreCheck('dangling-service', 'Check is incompatible with prometheus-operator CRDs'), + }, +} \ No newline at end of file diff --git a/apps/monitoring/jsonnet/lib/pushgateway.libsonnet b/apps/monitoring/jsonnet/lib/pushgateway.libsonnet new file mode 100644 index 000000000..5406883cd --- /dev/null +++ b/apps/monitoring/jsonnet/lib/pushgateway.libsonnet @@ -0,0 +1,113 @@ +// TODO(paulfantom): consider moving this as a component into kube-prometheus + +local defaults = { + local defaults = self, + name: 'pushgateway', + namespace: error 'must provide namespace', + version: error 'must provide version', + image: error 'must provide image', + + commonLabels: { + 'app.kubernetes.io/name': defaults.name, + 'app.kubernetes.io/version': defaults.version, + 'app.kubernetes.io/component': 'exporter', + }, + + selectorLabels: { + [labelName]: defaults.commonLabels[labelName] + for labelName in std.objectFields(defaults.commonLabels) + if !std.setMember(labelName, ['app.kubernetes.io/version']) + }, + + resources: {}, +}; + +function(params) { + config:: defaults + params, + + serviceAccount: { + apiVersion: 'v1', + kind: 'ServiceAccount', + metadata: { + labels: $.config.commonLabels, + name: $.config.name, + namespace: $.config.namespace, + }, + }, + + service: { + apiVersion: 'v1', + kind: 'Service', + metadata: { + labels: $.config.commonLabels, + name: $.config.name, + namespace: $.config.namespace, + }, + spec: { + ports: [{ + name: 'http-push', + port: 9091, + protocol: 'TCP', + targetPort: 'http-push', + }], + selector: $.config.selectorLabels, + }, + }, + + local pgw = { + name: $.config.name, + image: $.config.image, + ports: [{ + containerPort: 9091, + name: 'http-push', + }], + resources: $.config.resources, + }, + + deployment: { + apiVersion: 'apps/v1', + kind: 'Deployment', + metadata: { + labels: $.config.commonLabels, + name: $.config.name, + namespace: $.config.namespace, + }, + spec: { + replicas: 1, + selector: { + matchLabels: $.config.selectorLabels, + }, + template: { + metadata: { + labels: $.config.commonLabels, + }, + spec: { + containers: [pgw], + securityContext: { + runAsNonRoot: true, + runAsUser: 65534, + }, + serviceAccountName: $.serviceAccount.metadata.name, + }, + }, + }, + }, + + serviceMonitor: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: $.config.name, 
+ namespace: $.config.namespace, + labels: $.config.commonLabels, + }, + spec: { + selector: { + matchLabels: $.config.selectorLabels, + }, + endpoints: [ + { port: 'http-push', interval: '30s', honorLabels: true }, + ], + }, + }, +} diff --git a/apps/monitoring/jsonnet/lib/smokeping.libsonnet b/apps/monitoring/jsonnet/lib/smokeping.libsonnet new file mode 100644 index 000000000..c2fa43bdb --- /dev/null +++ b/apps/monitoring/jsonnet/lib/smokeping.libsonnet @@ -0,0 +1,129 @@ +// TODO(paulfantom): consider moving this as an addon into kube-prometheus + +local defaults = { + local defaults = self, + name: 'smokeping', + namespace: error 'must provide namespace', + version: error 'must provide version', + image: error 'must provide image', + + commonLabels: { + 'app.kubernetes.io/name': defaults.name, + 'app.kubernetes.io/version': defaults.version, + 'app.kubernetes.io/component': 'exporter', + }, + + selectorLabels: { + [labelName]: defaults.commonLabels[labelName] + for labelName in std.objectFields(defaults.commonLabels) + if !std.setMember(labelName, ['app.kubernetes.io/version']) + }, + + replicas: 1, + resources: {}, + hosts: [], +}; + +function(params) { + config:: defaults + params, + + serviceAccount: { + apiVersion: 'v1', + kind: 'ServiceAccount', + metadata: { + labels: $.config.commonLabels, + name: $.config.name, + namespace: $.config.namespace, + }, + }, + + service: { + apiVersion: 'v1', + kind: 'Service', + metadata: { + labels: $.config.commonLabels, + name: $.config.name, + namespace: $.config.namespace, + }, + spec: { + ports: [{ + name: 'http', + port: 9374, + protocol: 'TCP', + targetPort: 'http', + }], + selector: $.config.selectorLabels, + }, + }, + + local smoke = { + name: $.config.name, + image: $.config.image, + args: $.config.hosts, + ports: [{ + containerPort: 9374, + name: 'http', + }], + readinessProbe: { + tcpSocket: { + port: 'http', + }, + initialDelaySeconds: 1, + failureThreshold: 5, + timeoutSeconds: 10, + }, + securityContext: { + capabilities: { + add: ['NET_RAW'], + }, + }, + resources: $.config.resources, + }, + + deployment: { + apiVersion: 'apps/v1', + kind: 'Deployment', + metadata: { + labels: $.config.commonLabels, + name: $.config.name, + namespace: $.config.namespace, + }, + spec: { + replicas: $.config.replicas, + selector: { + matchLabels: $.config.selectorLabels, + }, + template: { + metadata: { + labels: $.config.commonLabels, + }, + spec: { + containers: [smoke], + securityContext: { + runAsNonRoot: true, + runAsUser: 65534, + }, + serviceAccountName: $.serviceAccount.metadata.name, + }, + }, + }, + }, + + serviceMonitor: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: $.config.name, + namespace: $.config.namespace, + labels: $.config.commonLabels, + }, + spec: { + selector: { + matchLabels: $.config.selectorLabels, + }, + endpoints: [ + { port: 'http', interval: '30s' }, + ], + }, + }, +} diff --git a/apps/monitoring/jsonnet/main.jsonnet b/apps/monitoring/jsonnet/main.jsonnet new file mode 100644 index 000000000..59f100c2c --- /dev/null +++ b/apps/monitoring/jsonnet/main.jsonnet @@ -0,0 +1,515 @@ +// TODO list: +// - compare and test + +// k3s additions: +// - kube-controller-manager-prometheus-discovery service +// - kube-scheduler-prometheus-discovery + +// Things to fix in kube-prometheus +// - better examples for adding custom alerts/rules +// - addon/example for additionalScrapeConfigs? 
+// - prometheus-pvc should be an addon +// - better `examples/` directory schema +// - addon to add 'runbook_url' annotation to every alert +// - non-prometheus ServiceMonitors shouldn't be in prometheus object +// - fix SM label selector for coreDNS in kube-prometheus +// - ... + +// TODO list for later +// - loading dashboards +// from mixins: +// - kubernetes-mixin +// - prometheus +// - node-exporter +// - coredns +// - sealed-secrets +// - go runtime metrics (https://github.com/grafana/jsonnet-libs/tree/master/go-runtime-mixin) +// from json: +// - argocd +// - blackbox-exporter +// - smokeping +// - unifi +// - nginx-controller +// - mysql (x2) +// - redis +// - home dashboard + +local addArgs(args, name, containers) = std.map( + function(c) if c.name == name then + c { + args+: args, + } + else c, + containers, +); + +local probe(name, namespace, labels, module, targets) = { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'Probe', + metadata: { + name: name, + namespace: namespace, + labels: labels, + }, + spec: { + prober: { + // TODO: point to https version at 9115 + url: 'blackbox-exporter.monitoring.svc:19115', + }, + module: module, + targets: targets, + }, +}; + +// convert file to yaml when jsonnet supports yaml imports (https://github.com/google/jsonnet/pull/888) +local blackboxExporterModules = (import 'ext/blackboxExporterConfig.json').modules; + +// TODO: add to kube-prometheus, more info in libsonnet file +local kubeEventsExporter = (import 'lib/kube-events-exporter.libsonnet'); +// TODO: add to kube-prometheus, more info in libsonnet file +local pushgateway = (import 'lib/pushgateway.libsonnet'); +// TODO: consider moving this to some other place (maybe jsonnet-libs repo?) +local smokeping = (import 'lib/smokeping.libsonnet'); + +local kp = + (import 'kube-prometheus/main.libsonnet') + + (import 'kube-prometheus/addons/anti-affinity.libsonnet') + + (import 'kube-prometheus/addons/all-namespaces.libsonnet') + + // (import 'lib/ingress.libsonnet') + + // TODO: Can be enabled after dealing with lancre ENV + // (import 'lib/additional-scrape-configs.libsonnet') + + // (import './lib/k3s.libsonnet') + + // (import './config.json') + + { + // + // Configuration + // + values+:: { + common+: { + namespace: 'monitoring', + ruleLabels: { + role: 'alert-rules', + }, + baseDomain: 'ankhmorpork.thaum.xyz', + }, + kubeEventsExporter: { + namespace: $.values.common.namespace, + version: '0.1.0', + image: 'quay.io/dgrisonnet/kube-events-exporter:v0.1.0', + resources: { + requests: { cpu: '2m', memory: '16Mi' }, + }, + commonLabels+: { + 'app.kubernetes.io/component': 'exporter', + }, + }, + pushgateway: { + namespace: $.values.common.namespace, + version: '1.2.0', + image: 'quay.io/prometheus/pushgateway:v1.2.0', + resources: { + requests: { cpu: '10m', memory: '12Mi' }, + }, + }, + smokeping: { + namespace: $.values.common.namespace, + version: '1.2.0', + image: 'quay.io/superq/smokeping-prober:v0.4.1', + resources: { + requests: { cpu: '40m', memory: '30Mi' }, + limits: { memory: '70Mi' }, + }, + replicas: 2, + hosts: [ + '8.8.8.8', + '1.1.1.1', + 'lancre.thaum.xyz', + 'krupa.net.pl', + 'cloud.krupa.net.pl', + 'pawel.krupa.net.pl', + ], + }, + alertmanager+: { + resources: { + requests: { memory: '30Mi' }, + }, + }, + prometheus+: { + resources: { + requests: { cpu: '140m', memory: '1900Mi' }, + limits: { cpu: '1' }, + }, + }, + prometheusOperator+: { + mixin+: { + _config: { + prometheusOperatorSelector: 'job="prometheus-operator"', + }, + }, + }, + blackboxExporter+: { + 
modules: blackboxExporterModules, + resources: { + requests: { cpu: '21m', memory: '16Mi' }, + limits: { cpu: '21m', memory: '42Mi' }, + }, + replicas: 2, + probes: { + promDemo: { + staticConfig: { + static: [ + 'https://demo.do.prometheus.io', + 'https://prometheus.demo.do.prometheus.io/-/healthy', + 'https://alertmanager.demo.do.prometheus.io/-/healthy', + 'https://node.demo.do.prometheus.io', + 'https://grafana.demo.do.prometheus.io/api/health', + ], + labels: { environment: 'prometheus.io' }, + }, + }, + thaumSites: { + staticConfig: { + static: [ + 'https://weirdo.blog/ghost', + 'https://alchemyof.it/ghost', + 'https://zmc.krupa.net.pl', + ], + labels: { environment: 'thaum.xyz' }, + }, + }, + ingress: { + ingress: { + selector: { + matchLabels: { + probe: 'enabled', + }, + }, + namespaceSelector: { any: true }, + }, + }, + }, + }, + kubeStateMetrics+: { + version: 'v2.0.0-beta', + image: 'k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.0.0-beta', + }, + grafana+: { + version: '7.3.7', + //image: 'grafana/grafana:7.3.7', // This is overridden in grafana-overrides.libsonnet + datasources: [{ + name: 'Prometheus', + type: 'prometheus', + access: 'proxy', + orgId: 1, + isDefault: true, + url: 'http://prometheus-k8s.monitoring.svc:9090', + }], + }, + kubernetesMixin+: { + mixin+: { + _config+: { + // k3s exposes all this data under single endpoint and those can be obtained via "kubelet" Service + kubeSchedulerSelector: 'job="kubelet"', + kubeControllerManagerSelector: 'job="kubelet"', + kubeApiserverSelector: 'job="kubelet"', + }, + }, + }, + }, + + // + // Objects customization + // + kubeEventsExporter: kubeEventsExporter($.values.kubeEventsExporter), + pushgateway: pushgateway($.values.pushgateway), + smokeping: smokeping($.values.smokeping) + { + deployment+: { + spec+: { + template+: { + spec+: { + affinity: (import '../../../lib/podantiaffinity.libsonnet').podantiaffinity('smokeping'), + }, + }, + }, + }, + }, + + local ingressAnnotations = { + 'kubernetes.io/ingress.class': 'nginx', + 'cert-manager.io/cluster-issuer': 'letsencrypt-prod', + 'nginx.ingress.kubernetes.io/auth-url': 'https://auth.ankhmorpork.thaum.xyz/oauth2/auth', + 'nginx.ingress.kubernetes.io/auth-signin': 'https://auth.ankhmorpork.thaum.xyz/oauth2/start?rd=$scheme://$host$escaped_request_uri', + }, + alertmanager+: { + // alertmanager secret is stored as ConfigMapSecret in plain yaml file + secret:: null, + // TODO: move ingress and externalURL to an addon + alertmanager+: { + spec+: { + externalUrl: 'https://alertmanager.' + $.values.common.baseDomain, + }, + }, + ingress: { + apiVersion: 'networking.k8s.io/v1', + kind: 'Ingress', + metadata: { + name: 'alertmanager', + namespace: $.values.common.namespace, + annotations: ingressAnnotations, + }, + spec: { + tls: [{ + hosts: ['alertmanager.ankhmorpork.thaum.xyz'], + secretName: 'alertmanager-tls', + }], + rules: [{ + host: 'alertmanager.ankhmorpork.thaum.xyz', + http: { + paths: [{ + path: '/', + pathType: 'Prefix', + backend: { + service: { + name: 'alertmanager-main', + port: { + name: 'web', + }, + }, + }, + }], + }, + }], + }, + }, + }, + // TODO: Should service expose 2 ports??? 
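+  // Note (editorial, not part of the original patch): the kube-prometheus blackbox-exporter Service
+  // appears to front both a kube-rbac-proxy TLS port (9115) and the plain HTTP port (19115) that the
+  // probe() helper near the top of this file targets, which is what the TODO above is questioning.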
+ blackboxExporter+: { + deployment+: { + spec+: { + template+: { + spec+: { + affinity: (import '../../../lib/podantiaffinity.libsonnet').podantiaffinity('blackbox-exporter'), + }, + }, + }, + }, + promDemoProbe: probe('prometheus-demo', $.values.common.namespace, $.blackboxExporter.config.commonLabels, 'http_2xx', $.values.blackboxExporter.probes.promDemo), + thaumProbe: probe('thaum-sites', $.values.common.namespace, $.blackboxExporter.config.commonLabels, 'http_2xx', $.values.blackboxExporter.probes.thaumSites), + ingressProbe: probe('ankhmorpork', $.values.common.namespace, $.blackboxExporter.config.commonLabels, 'http_2xx', $.values.blackboxExporter.probes.ingress), + }, + prometheusOperator+: { + deployment+: { + spec+: { + template+: { + spec+: { + containers: addArgs(['--config-reloader-cpu=150m', '--log-level=debug'], 'prometheus-operator', super.containers), + }, + }, + }, + }, + }, + prometheus+: { + prometheus+: { + spec+: { + // TODO: move ingress and externalURL to an addon + externalUrl: 'https://prometheus.' + $.values.common.baseDomain, + retention: '7d', + nodeSelector+: { + 'kubernetes.io/arch': 'amd64', + }, + // FIXME: reenable + securityContext:: null, + // TODO: Move this to addon when lancre is dealt with + // additionalScrapeConfigs are stored as ConfigMapSecret in plain yaml file + additionalScrapeConfigs: { + name: 'scrapeconfigs', + key: 'additional.yaml', + }, + // TODO: figure out why this is not added by default + ruleNamespaceSelector: {}, + ruleSelector: {}, + + // TODO: remove after https://github.com/prometheus-operator/kube-prometheus/pull/929 is merged + thanos:: null, + storage: { + volumeClaimTemplate: { + metadata: { + name: 'promdata', + }, + spec: { + storageClassName: 'local-path', // For performance reasons use local disk + accessModes: ['ReadWriteOnce'], + resources: { + requests: { storage: '40Gi' }, + }, + }, + }, + }, + }, + }, + + // k3s exposes all this data under single endpoint and those can be obtained via "kubelet" Service + serviceMonitorApiserver:: null, + serviceMonitorKubeControllerManager:: null, + serviceMonitorKubeScheduler:: null, + ingress: { + apiVersion: 'networking.k8s.io/v1', + kind: 'Ingress', + metadata: { + name: 'prometheus', + namespace: $.values.common.namespace, + annotations: ingressAnnotations, + }, + spec: { + tls: [{ + hosts: ['prometheus.ankhmorpork.thaum.xyz'], + secretName: 'prometheus-tls', + }], + rules: [{ + host: 'prometheus.ankhmorpork.thaum.xyz', + http: { + paths: [{ + path: '/', + pathType: 'Prefix', + backend: { + service: { + name: 'prometheus-k8s', + port: { + name: 'web', + }, + }, + }, + }], + }, + }], + }, + }, + // TODO: check if this addition is necessary + clusterRole+: { + rules+: [{ + apiGroups: ['networking.k8s.io'], + resources: ['ingresses'], + verbs: ['get', 'list', 'watch'], + }], + }, + // TODO: those should be a part of kube-prometheus/addons/all-namespaces.libsonnet + roleBindingSpecificNamespaces:: null, + roleSpecificNamespaces:: null, + // TODO: fix in kube-prometheus + serviceMonitorCoreDNS+: { + metadata+: { + labels+: { + 'k8s-app': 'kube-dns', + }, + }, + spec+: { + jobLabel: 'k8s-app', + selector: { + matchLabels: { + 'k8s-app': 'kube-dns', + }, + }, + }, + }, + }, + kubeStateMetrics+: { + deployment+: { + spec+: { + template+: { + spec+: { + containers: addArgs(['--labels-metric-allow-list=nodes=[kubernetes.io/arch,gpu.infra/intel,network.infra/type]'], 'kube-state-metrics', super.containers), + }, + }, + }, + }, + }, + grafana+: (import 'lib/grafana-overrides.libsonnet'), 
+ other: { + local externalRules = import 'lib/externalRules.libsonnet', + coreDNSMixin:: (import 'github.com/povilasv/coredns-mixin/mixin.libsonnet') + { + _config+:: { + corednsSelector: 'job=~"kube-dns|coredns"', + corednsRunbookURLPattern: 'https://github.com/thaum-xyz/ankhmorpork/tree/master/docs/runbooks/%s', + }, + }, + coreDNSPrometheusRule: externalRules({ + name: 'coredns', + groups: $.other.coreDNSMixin.prometheusAlerts.groups, + }), + thaumPrometheusRule: externalRules({ + name: 'thaum-rules', + groups: (import 'ext/rules/thaum.json').groups, + }), + testingPrometheusRule: externalRules({ + name: 'testing-rules', + groups: (import 'ext/rules/testing.json').groups, + }), + // TODO: move to k3s addon (probably not needed since k3s exposes everything on one endpoint) + /*kubeSchedulerPrometheusDiscovery: { + apiVersion: 'v1', + kind: 'Service', + metadata: { + labels: { + 'k8s-app': 'kube-scheduler', + 'app.kubernetes.io/name': 'kube-scheduler', + 'app.kubernetes.io/part-of': 'kube-prometheus', + }, + name: 'kube-scheduler-prometheus-discovery', + namespace: 'kube-system', + }, + spec: { + ports: [{ + name: 'http-metrics', + port: 10251, + }], + }, + }, + // TODO: move to k3s addon + kubeControllerManagerPrometheusDiscovery: { + apiVersion: 'v1', + kind: 'Service', + metadata: { + labels: { + 'k8s-app': 'kube-controller-manager', + 'app.kubernetes.io/name': 'kube-controller-manager', + 'app.kubernetes.io/part-of': 'kube-prometheus', + }, + name: 'kube-controller-manager-prometheus-discovery', + namespace: 'kube-system', + }, + spec: { + ports: [{ + name: 'http-metrics', + port: 10252, + }], + }, + },*/ + }, + } + + // kube-linter annotations need to be added after all objects are created + (import 'lib/kube-linter.libsonnet'); + +// +// Manifestation +// +{ ['namespace.yaml']: std.manifestYamlDoc(kp.kubePrometheus.namespace) } + +{ ['prometheus-operator/' + name + '.yaml']: std.manifestYamlDoc(kp.prometheusOperator[name]) for name in std.objectFields(kp.prometheusOperator) } + +{ ['kube-state-metrics/' + name + '.yaml']: std.manifestYamlDoc(kp.kubeStateMetrics[name]) for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager/' + name + '.yaml']: std.manifestYamlDoc(kp.alertmanager[name]) for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus/' + name + '.yaml']: std.manifestYamlDoc(kp.prometheus[name]) for name in std.objectFields(kp.prometheus) } + +{ ['prober/' + name + '.yaml']: std.manifestYamlDoc(kp.blackboxExporter[name]) for name in std.objectFields(kp.blackboxExporter) } + +// node_exporter is deployed separately via Ansible +// { ['node-exporter/' + name + '.yaml']: std.manifestYamlDoc(kp.nodeExporter[name]) for name in std.objectFields(kp.nodeExporter) } + +// using metrics-server instead of prometheus-adater +// { ['prometheus-adapter-' + name + '.yaml']: std.manifestYamlDoc(kp.prometheusAdapter[name]) for name in std.objectFields(kp.prometheusAdapter) } + +// TBD +{ ['grafana/' + name + '.yaml']: std.manifestYamlDoc(kp.grafana[name]) for name in std.objectFields(kp.grafana) } + +{ ['pushgateway/' + name + '.yaml']: std.manifestYamlDoc(kp.pushgateway[name]) for name in std.objectFields(kp.pushgateway) } + +{ ['smokeping/' + name + '.yaml']: std.manifestYamlDoc(kp.smokeping[name]) for name in std.objectFields(kp.smokeping) } + +// { ['holiday/' + name + '.yaml']: std.manifestYamlDoc(kp.blackboxExporter[name]) for name in std.objectFields(kp.blackboxExporter) } + +{ ['kube-events-exporter/' + name + '.yaml']: 
std.manifestYamlDoc(kp.kubeEventsExporter[name]) for name in std.objectFields(kp.kubeEventsExporter) } + +{ ['other/' + name + '.yaml']: std.manifestYamlDoc(kp.other[name]) for name in std.objectFields(kp.other) } + +{ ['other/kubePrometheusRule.yaml']: std.manifestYamlDoc(kp.kubePrometheus.prometheusRule) } + +{ ['other/kubernetesPrometheusRule.yaml']: std.manifestYamlDoc(kp.kubernetesMixin.prometheusRule) } + +// { ['other/etcdPrometheusRule.yaml']: std.manifestYamlDoc(kp.other.etcdPrometheusRule) } + +{} diff --git a/apps/monitoring/manifests/alertmanager/alertmanager.yaml b/apps/monitoring/manifests/alertmanager/alertmanager.yaml index 2c552e130..e05443172 100644 --- a/apps/monitoring/manifests/alertmanager/alertmanager.yaml +++ b/apps/monitoring/manifests/alertmanager/alertmanager.yaml @@ -3,6 +3,10 @@ kind: Alertmanager metadata: labels: alertmanager: main + app.kubernetes.io/component: alert-router + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.21.0 name: main namespace: monitoring spec: @@ -24,6 +28,12 @@ spec: image: quay.io/prometheus/alertmanager:v0.21.0 nodeSelector: kubernetes.io/os: linux + podMetadata: + labels: + app.kubernetes.io/component: alert-router + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.21.0 replicas: 3 resources: requests: @@ -33,3 +43,4 @@ spec: runAsNonRoot: true runAsUser: 1000 serviceAccountName: alertmanager-main + version: 0.21.0 diff --git a/apps/monitoring/manifests/alertmanager/06_ingress.yaml b/apps/monitoring/manifests/alertmanager/ingress.yaml similarity index 53% rename from apps/monitoring/manifests/alertmanager/06_ingress.yaml rename to apps/monitoring/manifests/alertmanager/ingress.yaml index b2dd8a3c2..f9f0a7065 100644 --- a/apps/monitoring/manifests/alertmanager/06_ingress.yaml +++ b/apps/monitoring/manifests/alertmanager/ingress.yaml @@ -1,27 +1,26 @@ ---- apiVersion: networking.k8s.io/v1 kind: Ingress metadata: + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + kubernetes.io/ingress.class: nginx + nginx.ingress.kubernetes.io/auth-signin: https://auth.ankhmorpork.thaum.xyz/oauth2/start?rd=$scheme://$host$escaped_request_uri + nginx.ingress.kubernetes.io/auth-url: https://auth.ankhmorpork.thaum.xyz/oauth2/auth name: alertmanager namespace: monitoring - annotations: - kubernetes.io/ingress.class: "nginx" - cert-manager.io/cluster-issuer: "letsencrypt-prod" - nginx.ingress.kubernetes.io/auth-url: "https://auth.ankhmorpork.thaum.xyz/oauth2/auth" - nginx.ingress.kubernetes.io/auth-signin: "https://auth.ankhmorpork.thaum.xyz/oauth2/start?rd=$scheme://$host$escaped_request_uri" spec: - tls: - - hosts: - - alertmanager.ankhmorpork.thaum.xyz - secretName: alertmanager-tls rules: - host: alertmanager.ankhmorpork.thaum.xyz http: paths: - - path: / - pathType: Prefix - backend: + - backend: service: name: alertmanager-main port: name: web + path: / + pathType: Prefix + tls: + - hosts: + - alertmanager.ankhmorpork.thaum.xyz + secretName: alertmanager-tls diff --git a/apps/monitoring/manifests/alertmanager/prometheusRule.yaml b/apps/monitoring/manifests/alertmanager/prometheusRule.yaml new file mode 100644 index 000000000..64805327a --- /dev/null +++ b/apps/monitoring/manifests/alertmanager/prometheusRule.yaml @@ -0,0 +1,147 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: alert-router + app.kubernetes.io/name: alertmanager + 
app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.21.0 + role: alert-rules + name: main-rules + namespace: monitoring +spec: + groups: + - name: alertmanager.rules + rules: + - alert: AlertmanagerFailedReload + annotations: + description: Configuration has failed to load for {{ $labels.namespace }}/{{ + $labels.pod}}. + summary: Reloading an Alertmanager configuration has failed. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"}[5m]) == 0 + for: 10m + labels: + severity: critical + - alert: AlertmanagerMembersInconsistent + annotations: + description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} has only + found {{ $value }} members of the {{$labels.job}} cluster. + summary: A member of an Alertmanager cluster has not found all other cluster + members. + expr: | + # Without max_over_time, failed scrapes could create false negatives, see + # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. + max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}[5m]) + < on (namespace,service) group_left + count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}[5m])) + for: 10m + labels: + severity: critical + - alert: AlertmanagerFailedToSendAlerts + annotations: + description: Alertmanager {{ $labels.namespace }}/{{ $labels.pod}} failed + to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration + }}. + summary: An Alertmanager instance failed to send notifications. + expr: | + ( + rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring"}[5m]) + / + rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring"}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: warning + - alert: AlertmanagerClusterFailedToSendAlerts + annotations: + description: The minimum notification failure rate to {{ $labels.integration + }} sent from any instance in the {{$labels.job}} cluster is {{ $value | + humanizePercentage }}. + summary: All Alertmanager instances in a cluster failed to send notifications + to a critical integration. + expr: | + min by (namespace,service, integration) ( + rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring", integration=~`.*`}[5m]) + / + rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring", integration=~`.*`}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: critical + - alert: AlertmanagerClusterFailedToSendAlerts + annotations: + description: The minimum notification failure rate to {{ $labels.integration + }} sent from any instance in the {{$labels.job}} cluster is {{ $value | + humanizePercentage }}. + summary: All Alertmanager instances in a cluster failed to send notifications + to a non-critical integration. 
+ expr: | + min by (namespace,service, integration) ( + rate(alertmanager_notifications_failed_total{job="alertmanager-main",namespace="monitoring", integration!~`.*`}[5m]) + / + rate(alertmanager_notifications_total{job="alertmanager-main",namespace="monitoring", integration!~`.*`}[5m]) + ) + > 0.01 + for: 5m + labels: + severity: warning + - alert: AlertmanagerConfigInconsistent + annotations: + description: Alertmanager instances within the {{$labels.job}} cluster have + different configurations. + summary: Alertmanager instances within the same cluster have different configurations. + expr: | + count by (namespace,service) ( + count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"}) + ) + != 1 + for: 20m + labels: + severity: critical + - alert: AlertmanagerClusterDown + annotations: + description: '{{ $value | humanizePercentage }} of Alertmanager instances + within the {{$labels.job}} cluster have been up for less than half of the + last 5m.' + summary: Half or more of the Alertmanager instances within the same cluster + are down. + expr: | + ( + count by (namespace,service) ( + avg_over_time(up{job="alertmanager-main",namespace="monitoring"}[5m]) < 0.5 + ) + / + count by (namespace,service) ( + up{job="alertmanager-main",namespace="monitoring"} + ) + ) + >= 0.5 + for: 5m + labels: + severity: critical + - alert: AlertmanagerClusterCrashlooping + annotations: + description: '{{ $value | humanizePercentage }} of Alertmanager instances + within the {{$labels.job}} cluster have restarted at least 5 times in the + last 10m.' + summary: Half or more of the Alertmanager instances within the same cluster + are crashlooping. + expr: | + ( + count by (namespace,service) ( + changes(process_start_time_seconds{job="alertmanager-main",namespace="monitoring"}[10m]) > 4 + ) + / + count by (namespace,service) ( + up{job="alertmanager-main",namespace="monitoring"} + ) + ) + >= 0.5 + for: 5m + labels: + severity: critical diff --git a/apps/monitoring/manifests/alertmanager/secret.yaml b/apps/monitoring/manifests/alertmanager/secret.yaml new file mode 100644 index 000000000..062e62468 --- /dev/null +++ b/apps/monitoring/manifests/alertmanager/secret.yaml @@ -0,0 +1,181 @@ +apiVersion: secrets.mz.com/v1alpha1 +kind: ConfigMapSecret +metadata: + name: alertmanager-main + namespace: monitoring + labels: + app: alertmanager +spec: + template: + metadata: + name: alertmanager-main + labels: + app: alertmanager + data: + alertmanager.yaml: | + global: + resolve_timeout: 5m + slack_api_url: $(SLACK_API_URL) + opsgenie_api_url: 'https://api.eu.opsgenie.com' + opsgenie_api_key: $(OPSGENIE_API_KEY) + receivers: + + - name: 'slack' + slack_configs: + - channel: '#alerts' + send_resolved: true + title: | + [{{ .Status | toUpper -}} + {{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{- end -}} + ] {{ if ne .Status "firing" -}} + :heavy_check_mark: + {{- else if eq .CommonLabels.severity "critical" -}} + :fire: + {{- else if eq .CommonLabels.severity "warning" -}} + :warning: + {{- else if eq .CommonLabels.severity "info" -}} + :information_source: + {{- else -}} + :question: + {{- end -}} + {{ .CommonLabels.alertname }} + text: >- + {{ range .Alerts }} + {{- if .Annotations.message }} + {{ .Annotations.message }} + {{- end }} + {{- if .Annotations.description }} + {{ .Annotations.description }} + {{- end }} + {{- end }} + short_fields: true + fields: + - title: Severity + value: '{{ .CommonLabels.severity }}' + - title: Job + value: '{{ 
.GroupLabels.job }}' + actions: + - type: button + text: 'Runbook :green_book:' + url: '{{ (index .Alerts 0).Annotations.runbook_url }}' + - type: button + text: 'Query :mag:' + url: '{{ (index .Alerts 0).GeneratorURL }}' + - type: button + text: 'Dashboard :grafana:' + url: '{{ (index .Alerts 0).Annotations.dashboard_url }}' + - type: button + text: 'Silence :no_bell:' + url: >- + {{ .ExternalURL }}/#/silences/new?filter=%7B + {{- range .CommonLabels.SortedPairs -}} + {{- if ne .Name "alertname" -}} + {{- .Name }}%3D%22{{- reReplaceAll " +" "%20" .Value -}}%22%2C%20 + {{- end -}} + {{- end -}} + alertname%3D%22{{ reReplaceAll " +" "%20" .CommonLabels.alertname }}%22%7D + - name: 'opsgenie' + opsgenie_configs: + - message: "{{ .GroupLabels.alertname }}" + priority: >- + {{- if ne .CommonLabels.priority "" -}} + {{- .CommonLabels.priority }} + {{- else -}} + {{- if eq .CommonLabels.severity "critical" -}} + P2 + {{- else -}} + P4 + {{- end -}} + {{- end -}} + responders: + - name: 'Main' + type: team + - name: 'healthchecks.io' + webhook_configs: + - send_resolved: false + url: $(HEALTHCHECKS_URL) + route: + group_by: ['alertname', 'instance', 'job'] + #group_by: ['instance', 'job'] + group_wait: 30s + group_interval: 5m + repeat_interval: 12h + receiver: 'slack' + routes: + - match: + alertname: 'Watchdog' + receiver: 'healthchecks.io' + repeat_interval: 10m + - match: + severity: 'critical' + receiver: 'opsgenie' + continue: true + inhibit_rules: + - source_match: + severity: "critical" + target_match_re: + severity: "warning|info" + equal: ['namespace'] + - source_match: + severity: "warning" + target_match_re: + severity: "info" + equal: ['namespace'] + - source_match: + alertname: 'ProbeFailed' + target_match: + alertname: 'StatusCode' + equal: ['job', 'instance'] + - source_match: + alertname: 'NodeDown' + target_match: + alertname: 'TargetDown' + equal: ['job', 'instance'] + - source_match: + alertname: 'FederatedPrometheusDown' + instance: 'lancre.thaum.xyz:443' + target_match_re: + alertname: "TargetDown" + instance: 'lancre.thaum.xyz:443|zmc.krupa.net.pl:443' + - source_match: + alertname: 'KubeNodeUnreachable' + target_match_re: + alertname: "TargetDown" + job: "kubelet|node-exporter" + - source_match: + alertname: 'KubeNodeUnreachable' + target_match: + alertname: "KubeNodeNotReady" + vars: + - name: OPSGENIE_API_KEY + secretValue: + name: alertmanager-keys + key: opsgenie_api_key + - name: SLACK_API_URL + secretValue: + name: alertmanager-keys + key: slack_api_url + - name: HEALTHCHECKS_URL + secretValue: + name: alertmanager-keys + key: healthchecks_url +--- +apiVersion: bitnami.com/v1alpha1 +kind: SealedSecret +metadata: + creationTimestamp: null + name: alertmanager-keys + namespace: monitoring +spec: + encryptedData: + healthchecks_url: 
AgBm6muAr5Upz3bGEu3ykXy2IRWM8sh0n22vy91T9HX1mBpG0NsREaXejjlWmovwariUf8LtCsKU3fPozPpOKudUFoqDWJxGlHhlrJlUZZY+oLR+oX56aJOYlWnCw6aIgH9zZv8NoZzDrsJb7MssSb/JKgVmh37gyYNGceKCV3YeozQeLvP2ANyPPqSJJICL9kk+aMy5os0wSQdWFvYY0t7zXc9WDO1SlQatA6J6zUfV5kl/5Wfkk39PSj72sQlkqwPPY/xwcH3OVfuEbHH0ajkQAk72KvkhNhSsdD8p9BHlR8/aJdgBVK6jR8tLNZqsYHhsD0+X7FL1Lxq90z78e+g6/CGtfFrb59qMExJ/ZcA9f1JHooQNjvsxN3xUyIj4Tzk2nEchEmdehzdcKSmwNvLgUgboTk7n7rbulzcuTDU0BYJkjbpe8W/V+92zaJF7+iwmDxpwO61tRkO0N4ONVEN5jlnuRS2SVOTN0zvtiIn3ImQH4Oy9mX9t6xo9XdmIYsta6rMIIA0PJr1ws2Tj/kCg/ClNdHYU6EN46O0v+jsQDNkaRVrgie1wyqOKYsO1+0jg7QQxGm7lU0dK0dkQ3tuFcW9zE6wpcoaRFFAWQxgTzJ8hByWn/t2ElvWNCGw1D/lKf75hG1QLXZ5oziEnksT3IV/kzgJ0SsWfnzkxL8emQ/vLBpf4jooUa2FT8rjZltVi4eb3DY/Nj0cDkfrDVFgEYho34VFhqCaQktDIGbrJuHTqau/zXHsdL+wHn/8Nuyi2Mkk/THeCJQ== + opsgenie_api_key: AgCS2LpCTZpWfVM4S/4cJflYVwSHZSBSkQSqjIQhZMIuEp9flvkKmx9rmDRz2pdG/ph598M4SBtAM24H8vDGF3ffZuspUIx0NRMIrsp0mBi4mfizW4A5mavz9GicgrEjkQaR90qzGhMXWKhdlZ/lO8F720uoTohx+Ox5MfEKlJKVlZqxcPUzHN+S2ZEqLKYXXt5U91uI7jjkIHxOhg0EgAHUGH3nl9c37W7ipkfAVFC9M8PSBuR27YSQ1z98sq8/O9IPNNbMSWrQtj7kANzbU8K2lVhzxxWKU+u9P+XwJzWGkB5t2seQFxz55afpMoY+rRjpNYFANIGZWjMxVfmgZWJcq1YqnCnmGJLI3BKk1jkqC7FynVQH6MCnxMchUReL0u7W98V7UBUnHEFxTSyhy6peSlW7sX/7Ckh1Z01bKfWwHJrugmNBiWGIWr12s2ay7ER/jN18vy3gQauhCeas07Psld1aQe+pQMxMenUbSD8Tu6xb0jVXrih/U8+rEyXjkQPgcJsxfYl6SJo7pn38J/N8DyQY8qS/nla81ZKD/5CqZyoXqYIMgnvfHw05zuXlch3KawFl0BLVwCAru9JxBoDB5RnPoGc2qpjVRiU+LhFaCC5OjwMzeNVN6fPkC0tKin0Dfn40lg+PizWoLARgsThsjAs5/A2f5fp65xe0Zr8COh/VEKDWSsxmQ53es1lknOvYKg12nKWJpc9bl8UgosykYDxSg8eXnO4txNrqlBrRNqM8+QY= + slack_api_url: AgAaaOZ8BhPOqKeQFQ8bX6Wqr1c+vGTf8R0Bl8KuA60MHHBvSBA5HPqiedXWaRIU7WxYztkHvMsrtZDJVZgdhGbpeuIZsxhGGw2AAz/oJcvsnZvIq47YewQgvuRD6rze9Dxf5VvVP9zydXLwMgXjGEFW97X0UWsmSHyFc3eORuo3ZE01yJSLezpJ0i1COMbz/lo6AyKJwrqkOGH9s9Pk6oh5s6Cc3amPCupEKU7tP8duWTUZp7SnIk58R0dqa6d5pF5HYUIhODEl1HZA7EltUicYQKnG82LT6CkmpRmpXK8PtmUhvAjq1SuqNgsN2z61jR06611G7rah6LN6t1NvCnrQu1yPuyNNd4fjWq+9uTI0CXyHl8jCnx8mk6kO2swJnwjw4gZy9kOWwFpDfkR+rpPISpmamTAY4WbO9YrTMVN0qosNOuAQ9U2nhOhN8lgUFx/Yw2i4kfFYvACHTTK2XjJGfTRlrDy34MVYjTTWQ3ZwsXa7afVU3gzxvEqIgkzOVdwyBwMk2Wns7khWVQcxvK9TkAKsTOmPhnX12niLjnftWfGWzBq3D0Ruv4Yr+kboHDRtJg9GmoxeKCOplBVs69s5d6sqxAyRwVTIGwCp29syJCFZ+NAhMQyUbT2bCjGoKNCn8/wDF/CaXHYfKnU3fDYwiC6sxXlrUrFloXu7FgXc2c3zPaHXE6Yq/JAj7OfSblGEL0SbREDyzuoUAV2xLJfhi8Xjs8Inm10g9BgHzNo1QZKMIagYh+GUfxOgfUhQt1VOXjcVyOcBi8GoHMHRfFonDlgkaNw3LRc7q48u7A== + template: + metadata: + annotations: + sealedsecrets.bitnami.com/managed: "true" + creationTimestamp: null + name: alertmanager-keys + namespace: monitoring + type: Opaque diff --git a/apps/monitoring/manifests/alertmanager/service.yaml b/apps/monitoring/manifests/alertmanager/service.yaml index 76aac52bc..e750bdd18 100644 --- a/apps/monitoring/manifests/alertmanager/service.yaml +++ b/apps/monitoring/manifests/alertmanager/service.yaml @@ -1,11 +1,15 @@ ---- apiVersion: v1 kind: Service metadata: annotations: - ignore-check.kube-linter.io/dangling-service: "Check is incompatible with prometheus-operator CRDs" + ignore-check.kube-linter.io/dangling-service: Check is incompatible with prometheus-operator + CRDs labels: alertmanager: main + app.kubernetes.io/component: alert-router + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.21.0 name: alertmanager-main namespace: monitoring spec: @@ -13,10 +17,10 @@ spec: - name: web port: 9093 targetPort: web - - name: reloader - port: 8080 - targetPort: 8080 selector: alertmanager: main app: alertmanager + app.kubernetes.io/component: 
alert-router + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: kube-prometheus sessionAffinity: ClientIP diff --git a/apps/monitoring/manifests/alertmanager/serviceAccount.yaml b/apps/monitoring/manifests/alertmanager/serviceAccount.yaml index 5c06d5e40..5a645515f 100644 --- a/apps/monitoring/manifests/alertmanager/serviceAccount.yaml +++ b/apps/monitoring/manifests/alertmanager/serviceAccount.yaml @@ -1,5 +1,11 @@ apiVersion: v1 kind: ServiceAccount metadata: + labels: + alertmanager: main + app.kubernetes.io/component: alert-router + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.21.0 name: alertmanager-main namespace: monitoring diff --git a/apps/monitoring/manifests/alertmanager/serviceMonitor.yaml b/apps/monitoring/manifests/alertmanager/serviceMonitor.yaml index 43ab49631..6ff457054 100644 --- a/apps/monitoring/manifests/alertmanager/serviceMonitor.yaml +++ b/apps/monitoring/manifests/alertmanager/serviceMonitor.yaml @@ -2,15 +2,19 @@ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: labels: - k8s-app: alertmanager + app.kubernetes.io/component: alert-router + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.21.0 name: alertmanager namespace: monitoring spec: endpoints: - interval: 30s port: web - - interval: 30s - port: reloader selector: matchLabels: alertmanager: main + app.kubernetes.io/component: alert-router + app.kubernetes.io/name: alertmanager + app.kubernetes.io/part-of: kube-prometheus diff --git a/apps/monitoring/manifests/grafana/03_datasources.yaml b/apps/monitoring/manifests/grafana/03_datasources.yaml deleted file mode 100644 index 45d5b72a5..000000000 --- a/apps/monitoring/manifests/grafana/03_datasources.yaml +++ /dev/null @@ -1,16 +0,0 @@ ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: grafana-datasources - namespace: monitoring -data: - datasources.yaml: | - apiVersion: 1 - datasources: - - name: Prometheus - type: prometheus - access: proxy - orgId: 1 - isDefault: true - url: http://prometheus-k8s.monitoring.svc:9090 diff --git a/apps/monitoring/manifests/grafana/05_service.yaml b/apps/monitoring/manifests/grafana/05_service.yaml deleted file mode 100644 index 825b2c573..000000000 --- a/apps/monitoring/manifests/grafana/05_service.yaml +++ /dev/null @@ -1,15 +0,0 @@ ---- -apiVersion: v1 -kind: Service -metadata: - labels: - app.kubernetes.io/name: grafana - name: grafana - namespace: monitoring -spec: - ports: - - name: http-grafana - port: 3000 - targetPort: http-grafana - selector: - app.kubernetes.io/name: grafana diff --git a/apps/monitoring/manifests/grafana/07_ingress.yaml b/apps/monitoring/manifests/grafana/07_ingress.yaml deleted file mode 100644 index 4c19d05ee..000000000 --- a/apps/monitoring/manifests/grafana/07_ingress.yaml +++ /dev/null @@ -1,28 +0,0 @@ ---- -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: grafana - namespace: monitoring - annotations: - kubernetes.io/ingress.class: "nginx" - cert-manager.io/cluster-issuer: "letsencrypt-prod" - nginx.ingress.kubernetes.io/auth-url: "https://auth.ankhmorpork.thaum.xyz/oauth2/auth" - nginx.ingress.kubernetes.io/auth-signin: "https://auth.ankhmorpork.thaum.xyz/oauth2/start?rd=$scheme://$host$escaped_request_uri" - nginx.ingress.kubernetes.io/auth-response-headers: "X-Auth-Request-Email" -spec: - tls: - - hosts: - - grafana.ankhmorpork.thaum.xyz - secretName: grafana-tls - rules: - - host: 
grafana.ankhmorpork.thaum.xyz - http: - paths: - - path: / - pathType: Prefix - backend: - service: - name: grafana - port: - name: http-grafana diff --git a/apps/monitoring/manifests/grafana/08_servicemonitor.yaml b/apps/monitoring/manifests/grafana/08_servicemonitor.yaml deleted file mode 100644 index 712b555d7..000000000 --- a/apps/monitoring/manifests/grafana/08_servicemonitor.yaml +++ /dev/null @@ -1,13 +0,0 @@ ---- -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: grafana - namespace: monitoring -spec: - endpoints: - - interval: 30s - port: http-grafana - selector: - matchLabels: - app.kubernetes.io/name: grafana diff --git a/apps/monitoring/manifests/grafana/dashboardDatasources.yaml b/apps/monitoring/manifests/grafana/dashboardDatasources.yaml new file mode 100644 index 000000000..afda0e957 --- /dev/null +++ b/apps/monitoring/manifests/grafana/dashboardDatasources.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +data: + datasources.yaml: ewogICAgImFwaVZlcnNpb24iOiAxLAogICAgImRhdGFzb3VyY2VzIjogWwogICAgICAgIHsKICAgICAgICAgICAgImFjY2VzcyI6ICJwcm94eSIsCiAgICAgICAgICAgICJpc0RlZmF1bHQiOiB0cnVlLAogICAgICAgICAgICAibmFtZSI6ICJQcm9tZXRoZXVzIiwKICAgICAgICAgICAgIm9yZ0lkIjogMSwKICAgICAgICAgICAgInR5cGUiOiAicHJvbWV0aGV1cyIsCiAgICAgICAgICAgICJ1cmwiOiAiaHR0cDovL3Byb21ldGhldXMtazhzLm1vbml0b3Jpbmcuc3ZjOjkwOTAiCiAgICAgICAgfQogICAgXQp9 +kind: Secret +metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 7.3.7 + name: grafana-datasources + namespace: monitoring +type: Opaque diff --git a/apps/monitoring/manifests/grafana/06_deployment.yaml b/apps/monitoring/manifests/grafana/deployment.yaml similarity index 72% rename from apps/monitoring/manifests/grafana/06_deployment.yaml rename to apps/monitoring/manifests/grafana/deployment.yaml index 21a87781f..ce8d2a701 100644 --- a/apps/monitoring/manifests/grafana/06_deployment.yaml +++ b/apps/monitoring/manifests/grafana/deployment.yaml @@ -1,33 +1,36 @@ ---- apiVersion: apps/v1 kind: Deployment metadata: labels: + app.kubernetes.io/component: grafana app.kubernetes.io/name: grafana - app.kubernetes.io/version: 7.3.6 + app.kubernetes.io/part-of: kube-prometheus name: grafana namespace: monitoring spec: replicas: 1 selector: matchLabels: + app.kubernetes.io/component: grafana app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus template: metadata: labels: + app.kubernetes.io/component: grafana app.kubernetes.io/name: grafana - app.kubernetes.io/version: 7.3.6 + app.kubernetes.io/part-of: kube-prometheus spec: containers: - env: - name: GF_SERVER_ROOT_URL - value: "https://grafana.ankhmorpork.thaum.xyz" + value: https://grafana.ankhmorpork.thaum.xyz - name: GF_AUTH_ANONYMOUS_ENABLED value: "false" - name: GF_AUTH_DISABLE_LOGIN_FORM value: "true" - name: GF_AUTH_SIGNOUT_REDIRECT_URL - value: "https://auth.ankhmorpork.thaum.xyz/oauth2?logout=true" + value: https://auth.ankhmorpork.thaum.xyz/oauth2?logout=true - name: GF_AUTH_BASIC_ENABLED value: "false" - name: GF_AUTH_PROXY_AUTO_SIGN_UP @@ -35,22 +38,18 @@ spec: - name: GF_AUTH_PROXY_ENABLED value: "true" - name: GF_AUTH_PROXY_HEADER_NAME - value: "X-Auth-Request-Email" + value: X-Auth-Request-Email - name: GF_AUTH_PROXY_HEADER_PROPERTY - value: "username" + value: username - name: GF_AUTH_PROXY_HEADERS - value: "Email:X-Auth-Request-Email" + value: Email:X-Auth-Request-Email - name: GF_SNAPSHOTS_EXTERNAL_ENABLED value: "false" - image: grafana/grafana:7.3.6 + 
image: grafana/grafana:7.3.7 name: grafana ports: - containerPort: 3000 - name: http-grafana - # readinessProbe: - # httpGet: - # path: /api/health - # port: http-grafana + name: http resources: limits: cpu: 400m @@ -63,16 +62,16 @@ spec: name: grafana-storage - mountPath: /etc/grafana/provisioning/datasources name: grafana-datasources + nodeSelector: + kubernetes.io/os: linux securityContext: runAsNonRoot: true runAsUser: 472 - nodeSelector: - kubernetes.io/os: linux serviceAccountName: grafana volumes: - name: grafana-storage persistentVolumeClaim: claimName: grafana-data - name: grafana-datasources - configMap: - name: grafana-datasources + secret: + secretName: grafana-datasources diff --git a/apps/monitoring/manifests/grafana/ingress.yaml b/apps/monitoring/manifests/grafana/ingress.yaml new file mode 100644 index 000000000..c5e68f778 --- /dev/null +++ b/apps/monitoring/manifests/grafana/ingress.yaml @@ -0,0 +1,27 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + cert-manager.io/cluster-issuer: letsencrypt-prod + kubernetes.io/ingress.class: nginx + nginx.ingress.kubernetes.io/auth-response-headers: X-Auth-Request-Email + nginx.ingress.kubernetes.io/auth-signin: https://auth.ankhmorpork.thaum.xyz/oauth2/start?rd=$scheme://$host$escaped_request_uri + nginx.ingress.kubernetes.io/auth-url: https://auth.ankhmorpork.thaum.xyz/oauth2/auth + name: grafana + namespace: monitoring +spec: + rules: + - host: grafana.ankhmorpork.thaum.xyz + http: + paths: + - backend: + service: + name: grafana + port: + name: http + path: / + pathType: Prefix + tls: + - hosts: + - grafana.ankhmorpork.thaum.xyz + secretName: grafana-tls diff --git a/apps/monitoring/manifests/grafana/02_pvc.yaml b/apps/monitoring/manifests/grafana/pvc.yaml similarity index 61% rename from apps/monitoring/manifests/grafana/02_pvc.yaml rename to apps/monitoring/manifests/grafana/pvc.yaml index 76b736d65..eae65b6e1 100644 --- a/apps/monitoring/manifests/grafana/02_pvc.yaml +++ b/apps/monitoring/manifests/grafana/pvc.yaml @@ -1,15 +1,14 @@ ---- -kind: PersistentVolumeClaim apiVersion: v1 +kind: PersistentVolumeClaim metadata: + annotations: + volume.beta.kubernetes.io/storage-class: longhorn name: grafana-data namespace: monitoring - annotations: - volume.beta.kubernetes.io/storage-class: "longhorn" spec: - storageClassName: "longhorn" accessModes: - - ReadWriteMany + - ReadWriteMany resources: requests: storage: 60Mi + storageClassName: longhorn diff --git a/apps/monitoring/manifests/grafana/service.yaml b/apps/monitoring/manifests/grafana/service.yaml new file mode 100644 index 000000000..c5b6b2840 --- /dev/null +++ b/apps/monitoring/manifests/grafana/service.yaml @@ -0,0 +1,20 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 7.3.7 + name: grafana + namespace: monitoring +spec: + ports: + - name: http + port: 3000 + targetPort: http + selector: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + type: ClusterIP diff --git a/apps/monitoring/manifests/grafana/04_serviceaccount.yaml b/apps/monitoring/manifests/grafana/serviceAccount.yaml similarity index 95% rename from apps/monitoring/manifests/grafana/04_serviceaccount.yaml rename to apps/monitoring/manifests/grafana/serviceAccount.yaml index 220a33fab..3ed3e031e 100644 --- a/apps/monitoring/manifests/grafana/04_serviceaccount.yaml +++ 
b/apps/monitoring/manifests/grafana/serviceAccount.yaml @@ -1,4 +1,3 @@ ---- apiVersion: v1 kind: ServiceAccount metadata: diff --git a/apps/monitoring/manifests/grafana/serviceMonitor.yaml b/apps/monitoring/manifests/grafana/serviceMonitor.yaml new file mode 100644 index 000000000..e34ee23b7 --- /dev/null +++ b/apps/monitoring/manifests/grafana/serviceMonitor.yaml @@ -0,0 +1,17 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: grafana + app.kubernetes.io/name: grafana + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 7.3.7 + name: grafana + namespace: monitoring +spec: + endpoints: + - interval: 15s + port: http + selector: + matchLabels: + app.kubernetes.io/name: grafana diff --git a/apps/monitoring/manifests/holiday/02_config.yaml b/apps/monitoring/manifests/holiday/02_config.yaml deleted file mode 100644 index c9cfa70d8..000000000 --- a/apps/monitoring/manifests/holiday/02_config.yaml +++ /dev/null @@ -1,17 +0,0 @@ ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: holiday-exporter-config - namespace: monitoring - labels: - app.kubernetes.io/name: holiday-exporter -data: - holiday_exporter.yaml: | - main: - port: 9137 - holidays: - - country: "DE" - province: "BE" - - country: "PL" - custom_holidays: [] diff --git a/apps/monitoring/manifests/holiday/03_service.yaml b/apps/monitoring/manifests/holiday/03_service.yaml deleted file mode 100644 index 0dfdee2df..000000000 --- a/apps/monitoring/manifests/holiday/03_service.yaml +++ /dev/null @@ -1,17 +0,0 @@ ---- -apiVersion: v1 -kind: Service -metadata: - labels: - app.kubernetes.io/name: holiday-exporter - app.kubernetes.io/component: exporter - name: holiday - namespace: monitoring -spec: - ports: - - name: http-holiday - port: 9137 - protocol: TCP - targetPort: http-holiday - selector: - app.kubernetes.io/name: holiday-exporter diff --git a/apps/monitoring/manifests/holiday/04_deployment.yaml b/apps/monitoring/manifests/holiday/04_deployment.yaml deleted file mode 100644 index 2129bd045..000000000 --- a/apps/monitoring/manifests/holiday/04_deployment.yaml +++ /dev/null @@ -1,51 +0,0 @@ ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: holiday-exporter - namespace: monitoring - labels: - app.kubernetes.io/name: holiday-exporter - app.kubernetes.io/component: exporter -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: holiday-exporter - template: - metadata: - labels: - app.kubernetes.io/name: holiday-exporter - spec: - containers: - - env: - - name: TZ - value: Europe/Berlin - name: holiday-exporter - image: allangood/holiday_exporter - imagePullPolicy: IfNotPresent - ports: - - containerPort: 9137 - name: http-holiday - readinessProbe: - tcpSocket: - port: http-holiday - initialDelaySeconds: 1 - failureThreshold: 5 - timeoutSeconds: 10 - volumeMounts: - - mountPath: /etc/holiday_exporter.yaml - name: config - subPath: holiday_exporter.yaml - readOnly: true - resources: - requests: - memory: 18Mi - restartPolicy: Always - volumes: - - configMap: - defaultMode: 420 - name: holiday-exporter-config - name: config - nodeSelector: - kubernetes.io/arch: amd64 diff --git a/apps/monitoring/manifests/holiday/05_servicemonitor.yaml b/apps/monitoring/manifests/holiday/05_servicemonitor.yaml deleted file mode 100644 index fc218fb18..000000000 --- a/apps/monitoring/manifests/holiday/05_servicemonitor.yaml +++ /dev/null @@ -1,13 +0,0 @@ ---- -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - name: 
holiday-exporter - namespace: monitoring -spec: - endpoints: - - interval: 30s - port: http-holiday - selector: - matchLabels: - app.kubernetes.io/name: holiday-exporter diff --git a/apps/monitoring/manifests/ksm/04_statefulset.yaml b/apps/monitoring/manifests/ksm/04_statefulset.yaml deleted file mode 100644 index 1eb8ef331..000000000 --- a/apps/monitoring/manifests/ksm/04_statefulset.yaml +++ /dev/null @@ -1,98 +0,0 @@ -apiVersion: apps/v1 -kind: StatefulSet -metadata: - labels: - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/version: 2.0.0-alpha - name: kube-state-metrics - namespace: monitoring -spec: - replicas: 2 - serviceName: kube-state-metrics - selector: - matchLabels: - app.kubernetes.io/name: kube-state-metrics - template: - metadata: - labels: - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/version: 2.0.0-alpha - spec: - affinity: - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 100 - podAffinityTerm: - labelSelector: - matchExpressions: - - key: app.kubernetes.io/name - operator: In - values: ["kube-state-metrics"] - topologyKey: kubernetes.io/hostname - containers: - - args: - - --host=127.0.0.1 - - --port=8080 - - --telemetry-host=127.0.0.1 - - --telemetry-port=8081 - - --pod=$(POD_NAME) - - --pod-namespace=$(POD_NAMESPACE) - #- '--labels-metric-allow-list="nodes=["kubernetes.io/arch","gpu.infra/intel","network.infra/fast"]"' - #- --labels-metric-allow-list=nodes=["kubernetes.io/arch","gpu.infra/intel","network.infra/fast"] - #- --labels-metric-allow-list="nodes=["kubernetes.io/arch","gpu.infra/intel","network.infra/fast"]" - #- --labels-metric-allow-list="nodes=[kubernetes.io/arch,gpu.infra/intel,network.infra/fast]" - - --labels-metric-allow-list=nodes=[kubernetes.io/arch,gpu.infra/intel,network.infra/type] - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - # image: k8s.gcr.io/kube-state-metrics/kube-state-metrics-arm64:2.0.0-alpha.2 - image: gcr.io/k8s-staging-kube-state-metrics/kube-state-metrics:v2.0.0-alpha.3 - name: kube-state-metrics - securityContext: - runAsUser: 65534 - resources: - requests: - cpu: 10m - memory: 26Mi - - args: - - --logtostderr - - --secure-listen-address=:8443 - - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256 - - --upstream=http://127.0.0.1:8080/ - image: quay.io/brancz/kube-rbac-proxy:v0.8.0 - name: kube-rbac-proxy-main - ports: - - containerPort: 8443 - name: https-main - securityContext: - runAsUser: 65532 - resources: - requests: - cpu: 2m - memory: 12Mi - - args: - - --logtostderr - - --secure-listen-address=:9443 - - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256 - - --upstream=http://127.0.0.1:8081/ - image: quay.io/brancz/kube-rbac-proxy:v0.8.0 - name: kube-rbac-proxy-self - ports: - - containerPort: 9443 - name: https-self - securityContext: - runAsUser: 65532 - resources: - requests: - cpu: 2m - memory: 9Mi - nodeSelector: - kubernetes.io/os: linux - # kubernetes.io/arch: arm64 - serviceAccountName: kube-state-metrics diff --git a/apps/monitoring/manifests/kube-events-exporter/02_rbac.yaml 
b/apps/monitoring/manifests/kube-events-exporter/02_rbac.yaml deleted file mode 100644 index a40c4dd82..000000000 --- a/apps/monitoring/manifests/kube-events-exporter/02_rbac.yaml +++ /dev/null @@ -1,45 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - labels: - app.kubernetes.io/component: exporter - app.kubernetes.io/name: kube-events-exporter - app.kubernetes.io/version: 0.1.0 - name: kube-events-exporter - namespace: monitoring -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: kube-events-exporter -subjects: -- kind: ServiceAccount - name: kube-events-exporter - namespace: monitoring ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - labels: - app.kubernetes.io/component: exporter - app.kubernetes.io/name: kube-events-exporter - app.kubernetes.io/version: 0.1.0 - name: kube-events-exporter - namespace: monitoring -rules: -- apiGroups: - - "" - resources: - - events - verbs: - - list - - watch ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - labels: - app.kubernetes.io/component: exporter - app.kubernetes.io/name: kube-events-exporter - app.kubernetes.io/version: 0.1.0 - name: kube-events-exporter - namespace: monitoring diff --git a/apps/monitoring/manifests/kube-events-exporter/clusterRole.yaml b/apps/monitoring/manifests/kube-events-exporter/clusterRole.yaml new file mode 100644 index 000000000..8b89b93b6 --- /dev/null +++ b/apps/monitoring/manifests/kube-events-exporter/clusterRole.yaml @@ -0,0 +1,17 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-events-exporter + app.kubernetes.io/version: 0.1.0 + name: kube-events-exporter + namespace: monitoring +rules: +- apiGroups: + - "" + resources: + - events + verbs: + - list + - watch diff --git a/apps/monitoring/manifests/kube-events-exporter/clusterRoleBinding.yaml b/apps/monitoring/manifests/kube-events-exporter/clusterRoleBinding.yaml new file mode 100644 index 000000000..3c95b0cfd --- /dev/null +++ b/apps/monitoring/manifests/kube-events-exporter/clusterRoleBinding.yaml @@ -0,0 +1,17 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-events-exporter + app.kubernetes.io/version: 0.1.0 + name: kube-events-exporter + namespace: monitoring +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kube-events-exporter +subjects: +- kind: ServiceAccount + name: kube-events-exporter + namespace: monitoring diff --git a/apps/monitoring/manifests/kube-events-exporter/03_deployment.yaml b/apps/monitoring/manifests/kube-events-exporter/deployment.yaml similarity index 92% rename from apps/monitoring/manifests/kube-events-exporter/03_deployment.yaml rename to apps/monitoring/manifests/kube-events-exporter/deployment.yaml index ccb06c713..aa34bdb83 100644 --- a/apps/monitoring/manifests/kube-events-exporter/03_deployment.yaml +++ b/apps/monitoring/manifests/kube-events-exporter/deployment.yaml @@ -21,7 +21,8 @@ spec: app.kubernetes.io/version: 0.1.0 spec: containers: - - image: quay.io/dgrisonnet/kube-events-exporter:v0.1.0 + - args: [] + image: quay.io/dgrisonnet/kube-events-exporter:v0.1.0 name: kube-events-exporter ports: - containerPort: 8080 diff --git a/apps/monitoring/manifests/kube-events-exporter/04_podMonitor.yaml b/apps/monitoring/manifests/kube-events-exporter/podMonitor.yaml similarity index 100% 
rename from apps/monitoring/manifests/kube-events-exporter/04_podMonitor.yaml rename to apps/monitoring/manifests/kube-events-exporter/podMonitor.yaml diff --git a/apps/monitoring/manifests/kube-events-exporter/serviceAccount.yaml b/apps/monitoring/manifests/kube-events-exporter/serviceAccount.yaml new file mode 100644 index 000000000..c39e8a95c --- /dev/null +++ b/apps/monitoring/manifests/kube-events-exporter/serviceAccount.yaml @@ -0,0 +1,9 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-events-exporter + app.kubernetes.io/version: 0.1.0 + name: kube-events-exporter + namespace: monitoring diff --git a/apps/monitoring/manifests/ksm/02_rbac.yaml b/apps/monitoring/manifests/kube-state-metrics/clusterRole.yaml similarity index 55% rename from apps/monitoring/manifests/ksm/02_rbac.yaml rename to apps/monitoring/manifests/kube-state-metrics/clusterRole.yaml index 739a8ce3d..39b44489e 100644 --- a/apps/monitoring/manifests/ksm/02_rbac.yaml +++ b/apps/monitoring/manifests/kube-state-metrics/clusterRole.yaml @@ -1,19 +1,11 @@ ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - labels: - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/version: 1.9.5 - name: kube-state-metrics - namespace: monitoring ---- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/version: 1.9.5 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: v2.0.0-beta name: kube-state-metrics rules: - apiGroups: @@ -40,7 +32,6 @@ rules: - daemonsets - deployments - replicasets - - ingresses verbs: - list - watch @@ -126,59 +117,3 @@ rules: verbs: - list - watch ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - labels: - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/version: 1.9.5 - name: kube-state-metrics -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: kube-state-metrics -subjects: -- kind: ServiceAccount - name: kube-state-metrics - namespace: monitoring ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - labels: - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/version: 2.0.0-alpha - name: kube-state-metrics - namespace: monitoring -rules: -- apiGroups: - - "" - resources: - - pods - verbs: - - get -- apiGroups: - - apps - resourceNames: - - kube-state-metrics - resources: - - statefulsets - verbs: - - get ---- - -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - labels: - app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/version: 2.0.0-alpha - name: kube-state-metrics -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: kube-state-metrics -subjects: -- kind: ServiceAccount - name: kube-state-metrics diff --git a/apps/monitoring/manifests/kube-state-metrics/clusterRoleBinding.yaml b/apps/monitoring/manifests/kube-state-metrics/clusterRoleBinding.yaml new file mode 100644 index 000000000..31fadd581 --- /dev/null +++ b/apps/monitoring/manifests/kube-state-metrics/clusterRoleBinding.yaml @@ -0,0 +1,17 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: v2.0.0-beta + name: kube-state-metrics +roleRef: + apiGroup: 
rbac.authorization.k8s.io + kind: ClusterRole + name: kube-state-metrics +subjects: +- kind: ServiceAccount + name: kube-state-metrics + namespace: monitoring diff --git a/apps/monitoring/manifests/kube-state-metrics/deployment.yaml b/apps/monitoring/manifests/kube-state-metrics/deployment.yaml new file mode 100644 index 000000000..7c07494d4 --- /dev/null +++ b/apps/monitoring/manifests/kube-state-metrics/deployment.yaml @@ -0,0 +1,88 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: v2.0.0-beta + name: kube-state-metrics + namespace: monitoring +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: kube-prometheus + template: + metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: v2.0.0-beta + spec: + containers: + - args: + - --host=127.0.0.1 + - --port=8081 + - --telemetry-host=127.0.0.1 + - --telemetry-port=8082 + - --labels-metric-allow-list=nodes=[kubernetes.io/arch,gpu.infra/intel,network.infra/type] + image: k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.0.0-beta + name: kube-state-metrics + resources: + limits: + cpu: 100m + memory: 250Mi + requests: + cpu: 10m + memory: 190Mi + securityContext: + runAsUser: 65534 + - args: + - --logtostderr + - --secure-listen-address=:8443 + - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 + - --upstream=http://127.0.0.1:8081/ + image: quay.io/brancz/kube-rbac-proxy:v0.8.0 + name: kube-rbac-proxy-main + ports: + - containerPort: 8443 + name: https-main + resources: + limits: + cpu: 20m + memory: 40Mi + requests: + cpu: 10m + memory: 20Mi + securityContext: + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 + - args: + - --logtostderr + - --secure-listen-address=:9443 + - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 + - --upstream=http://127.0.0.1:8082/ + image: quay.io/brancz/kube-rbac-proxy:v0.8.0 + name: kube-rbac-proxy-self + ports: + - containerPort: 9443 + name: https-self + resources: + limits: + cpu: 20m + memory: 40Mi + requests: + cpu: 10m + memory: 20Mi + securityContext: + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 + nodeSelector: + kubernetes.io/os: linux + serviceAccountName: kube-state-metrics diff --git a/apps/monitoring/manifests/prometheus/rules/kube-state-metrics.yaml b/apps/monitoring/manifests/kube-state-metrics/prometheusRule.yaml similarity index 52% rename from apps/monitoring/manifests/prometheus/rules/kube-state-metrics.yaml rename to apps/monitoring/manifests/kube-state-metrics/prometheusRule.yaml index b9fdf77c9..33c68efe2 100644 --- a/apps/monitoring/manifests/prometheus/rules/kube-state-metrics.yaml +++ b/apps/monitoring/manifests/kube-state-metrics/prometheusRule.yaml @@ -2,7 +2,10 @@ apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: labels: - prometheus: k8s + 
app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: v2.0.0-beta role: alert-rules name: kube-state-metrics-rules namespace: monitoring @@ -12,8 +15,10 @@ spec: rules: - alert: KubeStateMetricsListErrors annotations: - message: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricslisterrors + description: kube-state-metrics is experiencing errors at an elevated rate + in list operations. This is likely causing it to not be able to expose metrics + about Kubernetes objects correctly or at all. + summary: kube-state-metrics is experiencing errors in list operations. expr: | (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) / @@ -24,8 +29,10 @@ spec: severity: critical - alert: KubeStateMetricsWatchErrors annotations: - message: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricswatcherrors + description: kube-state-metrics is experiencing errors at an elevated rate + in watch operations. This is likely causing it to not be able to expose + metrics about Kubernetes objects correctly or at all. + summary: kube-state-metrics is experiencing errors in watch operations. expr: | (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) / diff --git a/apps/monitoring/manifests/ksm/03_service.yaml b/apps/monitoring/manifests/kube-state-metrics/service.yaml similarity index 62% rename from apps/monitoring/manifests/ksm/03_service.yaml rename to apps/monitoring/manifests/kube-state-metrics/service.yaml index 3b551182f..96b4145c6 100644 --- a/apps/monitoring/manifests/ksm/03_service.yaml +++ b/apps/monitoring/manifests/kube-state-metrics/service.yaml @@ -1,10 +1,11 @@ ---- apiVersion: v1 kind: Service metadata: labels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/version: 1.9.5 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: v2.0.0-beta name: kube-state-metrics namespace: monitoring spec: @@ -17,4 +18,6 @@ spec: port: 9443 targetPort: https-self selector: + app.kubernetes.io/component: exporter app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: kube-prometheus diff --git a/apps/monitoring/manifests/kube-state-metrics/serviceAccount.yaml b/apps/monitoring/manifests/kube-state-metrics/serviceAccount.yaml new file mode 100644 index 000000000..fbeaccb85 --- /dev/null +++ b/apps/monitoring/manifests/kube-state-metrics/serviceAccount.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: v2.0.0-beta + name: kube-state-metrics + namespace: monitoring diff --git a/apps/monitoring/manifests/ksm/05_serviceMonitor.yaml b/apps/monitoring/manifests/kube-state-metrics/serviceMonitor.yaml similarity index 77% rename from 
apps/monitoring/manifests/ksm/05_serviceMonitor.yaml rename to apps/monitoring/manifests/kube-state-metrics/serviceMonitor.yaml index ad7a6438f..be42255c0 100644 --- a/apps/monitoring/manifests/ksm/05_serviceMonitor.yaml +++ b/apps/monitoring/manifests/kube-state-metrics/serviceMonitor.yaml @@ -2,8 +2,10 @@ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: labels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: kube-state-metrics - app.kubernetes.io/version: 1.9.6 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: v2.0.0-beta name: kube-state-metrics namespace: monitoring spec: @@ -28,4 +30,6 @@ spec: jobLabel: app.kubernetes.io/name selector: matchLabels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: kube-state-metrics + app.kubernetes.io/part-of: kube-prometheus diff --git a/apps/monitoring/manifests/01_namespace.yaml b/apps/monitoring/manifests/namespace.yaml similarity index 93% rename from apps/monitoring/manifests/01_namespace.yaml rename to apps/monitoring/manifests/namespace.yaml index ff7ae1b93..d32523606 100644 --- a/apps/monitoring/manifests/01_namespace.yaml +++ b/apps/monitoring/manifests/namespace.yaml @@ -1,4 +1,3 @@ ---- apiVersion: v1 kind: Namespace metadata: diff --git a/apps/monitoring/manifests/other/coreDNSPrometheusRule.yaml b/apps/monitoring/manifests/other/coreDNSPrometheusRule.yaml new file mode 100644 index 000000000..0ab296f7a --- /dev/null +++ b/apps/monitoring/manifests/other/coreDNSPrometheusRule.yaml @@ -0,0 +1,91 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + prometheus: k8s + role: alert-rules + name: coredns + namespace: monitoring +spec: + groups: + - name: coredns + rules: + - alert: CoreDNSDown + annotations: + message: CoreDNS has disappeared from Prometheus target discovery. + runbook_url: https://github.com/thaum-xyz/ankhmorpork/tree/master/docs/runbooks/corednsdown + expr: | + absent(up{job=~"kube-dns|coredns"} == 1) + for: 15m + labels: + severity: critical + - alert: CoreDNSLatencyHigh + annotations: + message: CoreDNS has 99th percentile latency of {{ $value }} seconds for server + {{ $labels.server }} zone {{ $labels.zone }} . + runbook_url: https://github.com/thaum-xyz/ankhmorpork/tree/master/docs/runbooks/corednslatencyhigh + expr: | + histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket{job=~"kube-dns|coredns"}[5m])) by(server, zone, le)) > 4 + for: 10m + labels: + severity: critical + - alert: CoreDNSErrorsHigh + annotations: + message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage + }} of requests. + runbook_url: https://github.com/thaum-xyz/ankhmorpork/tree/master/docs/runbooks/corednserrorshigh + expr: | + sum(rate(coredns_dns_response_rcode_count_total{job=~"kube-dns|coredns",rcode="SERVFAIL"}[5m])) + / + sum(rate(coredns_dns_response_rcode_count_total{job=~"kube-dns|coredns"}[5m])) > 0.03 + for: 10m + labels: + severity: critical + - alert: CoreDNSErrorsHigh + annotations: + message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage + }} of requests. 
+ runbook_url: https://github.com/thaum-xyz/ankhmorpork/tree/master/docs/runbooks/corednserrorshigh + expr: | + sum(rate(coredns_dns_response_rcode_count_total{job=~"kube-dns|coredns",rcode="SERVFAIL"}[5m])) + / + sum(rate(coredns_dns_response_rcode_count_total{job=~"kube-dns|coredns"}[5m])) > 0.01 + for: 10m + labels: + severity: warning + - name: coredns_forward + rules: + - alert: CoreDNSForwardLatencyHigh + annotations: + message: CoreDNS has 99th percentile latency of {{ $value }} seconds forwarding + requests to {{ $labels.to }}. + runbook_url: https://github.com/thaum-xyz/ankhmorpork/tree/master/docs/runbooks/corednsforwardlatencyhigh + expr: | + histogram_quantile(0.99, sum(rate(coredns_forward_request_duration_seconds_bucket{job=~"kube-dns|coredns"}[5m])) by(to, le)) > 4 + for: 10m + labels: + severity: critical + - alert: CoreDNSForwardErrorsHigh + annotations: + message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage + }} of forward requests to {{ $labels.to }}. + runbook_url: https://github.com/thaum-xyz/ankhmorpork/tree/master/docs/runbooks/corednsforwarderrorshigh + expr: | + sum(rate(coredns_forward_response_rcode_count_total{job=~"kube-dns|coredns",rcode="SERVFAIL"}[5m])) + / + sum(rate(coredns_forward_response_rcode_count_total{job=~"kube-dns|coredns"}[5m])) > 0.03 + for: 10m + labels: + severity: critical + - alert: CoreDNSForwardErrorsHigh + annotations: + message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage + }} of forward requests to {{ $labels.to }}. + runbook_url: https://github.com/thaum-xyz/ankhmorpork/tree/master/docs/runbooks/corednsforwarderrorshigh + expr: | + sum(rate(coredns_dns_response_rcode_count_total{job=~"kube-dns|coredns",rcode="SERVFAIL"}[5m])) + / + sum(rate(coredns_dns_response_rcode_count_total{job=~"kube-dns|coredns"}[5m])) > 0.01 + for: 10m + labels: + severity: warning diff --git a/apps/monitoring/manifests/other/kubePrometheusRule.yaml b/apps/monitoring/manifests/other/kubePrometheusRule.yaml new file mode 100644 index 000000000..44ccf37bf --- /dev/null +++ b/apps/monitoring/manifests/other/kubePrometheusRule.yaml @@ -0,0 +1,69 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-prometheus + app.kubernetes.io/part-of: kube-prometheus + role: alert-rules + name: kube-prometheus-rules + namespace: monitoring +spec: + groups: + - name: general.rules + rules: + - alert: TargetDown + annotations: + message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service + }} targets in {{ $labels.namespace }} namespace are down.' + expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, + namespace, service)) > 10 + for: 10m + labels: + severity: warning + - alert: Watchdog + annotations: + message: | + This is an alert meant to ensure that the entire alerting pipeline is functional. + This alert is always firing, therefore it should always be firing in Alertmanager + and always fire against a receiver. There are integrations with various notification + mechanisms that send a notification when this alert is not firing. For example the + "DeadMansSnitch" integration in PagerDuty. 
+ expr: vector(1) + labels: + severity: none + - name: node-network + rules: + - alert: NodeNetworkInterfaceFlapping + annotations: + message: Network interface "{{ $labels.device }}" changing it's up status + often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }} + expr: | + changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2 + for: 2m + labels: + severity: warning + - name: kube-prometheus-node-recording.rules + rules: + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) + BY (instance) + record: instance:node_cpu:rate:sum + - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance) + record: instance:node_network_receive_bytes:rate:sum + - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance) + record: instance:node_network_transmit_bytes:rate:sum + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) + WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) + BY (instance, cpu)) BY (instance) + record: instance:node_cpu:ratio + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) + record: cluster:node_cpu:sum_rate5m + - expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) + BY (instance, cpu)) + record: cluster:node_cpu:ratio + - name: kube-prometheus-general.rules + rules: + - expr: count without(instance, pod, node) (up == 1) + record: count:up1 + - expr: count without(instance, pod, node) (up == 0) + record: count:up0 diff --git a/apps/monitoring/manifests/prometheus/rules/kubernetes.yaml b/apps/monitoring/manifests/other/kubernetesPrometheusRule.yaml similarity index 64% rename from apps/monitoring/manifests/prometheus/rules/kubernetes.yaml rename to apps/monitoring/manifests/other/kubernetesPrometheusRule.yaml index 2dff9b137..412279546 100644 --- a/apps/monitoring/manifests/prometheus/rules/kubernetes.yaml +++ b/apps/monitoring/manifests/other/kubernetesPrometheusRule.yaml @@ -2,1365 +2,1429 @@ apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: labels: - prometheus: k8s + app.kubernetes.io/component: exporter + app.kubernetes.io/name: kube-prometheus + app.kubernetes.io/part-of: kube-prometheus role: alert-rules name: kubernetes-rules namespace: monitoring spec: groups: - - name: kube-apiserver.rules + - name: kubernetes-apps rules: - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1d])) - - - ( - ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1d])) - or - vector(0) - ) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d])) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1d])) - ) - ) - + - # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d])) + - alert: KubePodCrashLooping + annotations: + description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container + }}) is restarting {{ printf "%.2f" $value }} times / 10 minutes. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepodcrashlooping + summary: Pod is crash looping. 
+ expr: | + rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[10m]) * 60 * 5 > 0 + for: 15m labels: - verb: read - record: apiserver_request:burnrate1d - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[1h])) - - - ( - ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1h])) - or - vector(0) - ) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h])) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[1h])) - ) + severity: warning + - alert: KubePodNotReady + annotations: + description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready + state for longer than 15 minutes. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepodnotready + summary: Pod has been in a non-ready state for more than 15 minutes. + expr: | + sum by (namespace, pod) ( + max by(namespace, pod) ( + kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"} + ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) ( + 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"}) ) - + - # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h])) + ) > 0 + for: 15m labels: - verb: read - record: apiserver_request:burnrate1h - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[2h])) - - - ( - ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[2h])) - or - vector(0) - ) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h])) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[2h])) - ) - ) - + - # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h])) + severity: warning + - alert: KubeDeploymentGenerationMismatch + annotations: + description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment + }} does not match, this indicates that the Deployment has failed but has + not been rolled back. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedeploymentgenerationmismatch + summary: Deployment generation mismatch due to possible roll-back + expr: | + kube_deployment_status_observed_generation{job="kube-state-metrics"} + != + kube_deployment_metadata_generation{job="kube-state-metrics"} + for: 15m labels: - verb: read - record: apiserver_request:burnrate2h - - expr: | + severity: warning + - alert: KubeDeploymentReplicasMismatch + annotations: + description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has + not matched the expected number of replicas for longer than 15 minutes. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedeploymentreplicasmismatch + summary: Deployment has not matched the expected number of replicas. 
+ expr: | ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30m])) - - - ( - ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30m])) - or - vector(0) - ) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m])) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30m])) - ) - ) - + - # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m])) + kube_deployment_spec_replicas{job="kube-state-metrics"} + != + kube_deployment_status_replicas_available{job="kube-state-metrics"} + ) and ( + changes(kube_deployment_status_replicas_updated{job="kube-state-metrics"}[10m]) + == + 0 ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m])) + for: 15m labels: - verb: read - record: apiserver_request:burnrate30m - - expr: | + severity: warning + - alert: KubeStatefulSetReplicasMismatch + annotations: + description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} + has not matched the expected number of replicas for longer than 15 minutes. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatefulsetreplicasmismatch + summary: Deployment has not matched the expected number of replicas. + expr: | ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[3d])) - - - ( - ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[3d])) - or - vector(0) - ) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d])) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[3d])) - ) - ) - + - # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d])) + kube_statefulset_status_replicas_ready{job="kube-state-metrics"} + != + kube_statefulset_status_replicas{job="kube-state-metrics"} + ) and ( + changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[10m]) + == + 0 ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d])) + for: 15m labels: - verb: read - record: apiserver_request:burnrate3d - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[5m])) - - - ( - ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[5m])) - or - vector(0) - ) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m])) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[5m])) - ) - ) - + - # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) + severity: warning + - alert: KubeStatefulSetGenerationMismatch + annotations: + description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset + }} does not match, this indicates that the StatefulSet has failed but has + not been rolled back. 
+ runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatefulsetgenerationmismatch + summary: StatefulSet generation mismatch due to possible roll-back + expr: | + kube_statefulset_status_observed_generation{job="kube-state-metrics"} + != + kube_statefulset_metadata_generation{job="kube-state-metrics"} + for: 15m labels: - verb: read - record: apiserver_request:burnrate5m - - expr: | + severity: warning + - alert: KubeStatefulSetUpdateNotRolledOut + annotations: + description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} + update has not been rolled out. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubestatefulsetupdatenotrolledout + summary: StatefulSet update has not been rolled out. + expr: | ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[6h])) - - - ( - ( - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[6h])) - or - vector(0) - ) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h])) - + - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[6h])) - ) + max without (revision) ( + kube_statefulset_status_current_revision{job="kube-state-metrics"} + unless + kube_statefulset_status_update_revision{job="kube-state-metrics"} ) - + - # errors - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h])) - labels: - verb: read - record: apiserver_request:burnrate6h - - expr: | - ( + * ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) - - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d])) + kube_statefulset_replicas{job="kube-state-metrics"} + != + kube_statefulset_status_replicas_updated{job="kube-state-metrics"} ) - + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d])) + ) and ( + changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m]) + == + 0 ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) + for: 15m labels: - verb: write - record: apiserver_request:burnrate1d - - expr: | + severity: warning + - alert: KubeDaemonSetRolloutStuck + annotations: + description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has + not finished or progressed for at least 15 minutes. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedaemonsetrolloutstuck + summary: DaemonSet rollout is stuck. 
+ expr: | ( ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) - - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h])) + kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} + ) or ( + kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} + != + 0 + ) or ( + kube_daemonset_updated_number_scheduled{job="kube-state-metrics"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} + ) or ( + kube_daemonset_status_number_available{job="kube-state-metrics"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} ) - + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) + ) and ( + changes(kube_daemonset_updated_number_scheduled{job="kube-state-metrics"}[5m]) + == + 0 ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) + for: 15m labels: - verb: write - record: apiserver_request:burnrate1h - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) - - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h])) - ) - + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) - labels: - verb: write - record: apiserver_request:burnrate2h - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) - - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m])) - ) - + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) - labels: - verb: write - record: apiserver_request:burnrate30m - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) - - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d])) - ) - + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) - labels: - verb: write - record: apiserver_request:burnrate3d - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) - - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m])) - ) - + - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) - labels: - verb: write - record: apiserver_request:burnrate5m - - expr: | - ( - ( - # too slow - sum(rate(apiserver_request_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) - - - sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h])) - ) - + - 
sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h])) - ) - / - sum(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) - labels: - verb: write - record: apiserver_request:burnrate6h - - expr: | - sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) - labels: - verb: read - record: code_resource:apiserver_request_total:rate5m - - expr: | - sum by (code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) - labels: - verb: write - record: code_resource:apiserver_request_total:rate5m - - expr: | - histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET"}[5m]))) > 0 - labels: - quantile: "0.99" - verb: read - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0 - labels: - quantile: "0.99" - verb: write - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) - labels: - quantile: "0.99" - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) - labels: - quantile: "0.9" - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) - labels: - quantile: "0.5" - record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile - - interval: 3m - name: kube-apiserver-availability.rules - rules: - - expr: | - 1 - ( - ( - # write too slow - sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d])) - - - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d])) - ) + - ( - # read too slow - sum(increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d])) - - - ( - ( - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) - or - vector(0) - ) - + - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) - + - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="5"}[30d])) - ) - ) + - # errors - sum(code:apiserver_request_total:increase30d{code=~"5.."} or vector(0)) - ) - / - sum(code:apiserver_request_total:increase30d) + severity: warning + - alert: KubeContainerWaiting + annotations: + description: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} + has been in waiting state for longer than 1 hour. 
+ runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubecontainerwaiting + summary: Pod container waiting longer than 1 hour + expr: | + sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0 + for: 1h labels: - verb: all - record: apiserver_request:availability30d - - expr: | - 1 - ( - sum(increase(apiserver_request_duration_seconds_count{job="apiserver",verb=~"LIST|GET"}[30d])) + severity: warning + - alert: KubeDaemonSetNotScheduled + annotations: + description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset + }} are not scheduled.' + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedaemonsetnotscheduled + summary: DaemonSet pods are not scheduled. + expr: | + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - - ( - # too slow - ( - sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) - or - vector(0) - ) - + - sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) - + - sum(increase(apiserver_request_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",scope="cluster",le="5"}[30d])) - ) - + - # errors - sum(code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0)) - ) - / - sum(code:apiserver_request_total:increase30d{verb="read"}) - labels: - verb: read - record: apiserver_request:availability30d - - expr: | - 1 - ( - ( - # too slow - sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d])) - - - sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d])) - ) - + - # errors - sum(code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0)) - ) - / - sum(code:apiserver_request_total:increase30d{verb="write"}) + kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0 + for: 10m labels: - verb: write - record: apiserver_request:availability30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"2.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"2.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"2.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"2.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"2.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"2.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"3.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"3.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) 
(increase(apiserver_request_total{job="apiserver",verb="POST",code=~"3.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"3.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"3.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"3.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"4.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"4.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"4.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"4.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"4.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"4.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="LIST",code=~"5.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="GET",code=~"5.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="POST",code=~"5.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PUT",code=~"5.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="PATCH",code=~"5.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code, verb) (increase(apiserver_request_total{job="apiserver",verb="DELETE",code=~"5.."}[30d])) - record: code_verb:apiserver_request_total:increase30d - - expr: | - sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) + severity: warning + - alert: KubeDaemonSetMisScheduled + annotations: + description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset + }} are running where they are not supposed to run.' + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubedaemonsetmisscheduled + summary: DaemonSet pods are misscheduled. 
+ expr: | + kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0 + for: 15m labels: - verb: read - record: code:apiserver_request_total:increase30d - - expr: | - sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + severity: warning + - alert: KubeJobCompletion + annotations: + description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking + more than 12 hours to complete. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubejobcompletion + summary: Job did not complete in time + expr: | + kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0 + for: 12h labels: - verb: write - record: code:apiserver_request_total:increase30d - - name: k8s.rules - rules: - - expr: | - sum by (cluster, namespace, pod, container) ( - rate(container_cpu_usage_seconds_total{job="kubelet",metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m]) - ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( - 1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate - - expr: | - container_memory_working_set_bytes{job="kubelet",metrics_path="/metrics/cadvisor", image!=""} - * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, - max by(namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_memory_working_set_bytes - - expr: | - container_memory_rss{job="kubelet",metrics_path="/metrics/cadvisor", image!=""} - * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, - max by(namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_memory_rss - - expr: | - container_memory_cache{job="kubelet",metrics_path="/metrics/cadvisor", image!=""} - * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, - max by(namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_memory_cache - - expr: | - container_memory_swap{job="kubelet",metrics_path="/metrics/cadvisor", image!=""} - * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, - max by(namespace, pod, node) (kube_pod_info{node!=""}) - ) - record: node_namespace_pod_container:container_memory_swap - - expr: | - sum by (namespace) ( - sum by (namespace, pod) ( - max by (namespace, pod, container) ( - kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"} OR kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} - ) * on(namespace, pod) group_left() max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Pending|Running"} == 1 - ) - ) - ) - record: namespace_memory:kube_pod_container_resource_requests:sum - - expr: | - sum by (namespace) ( - sum by (namespace, pod) ( - max by (namespace, pod, container) ( - kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} OR kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} - ) * on(namespace, pod) group_left() max by (namespace, pod) ( - kube_pod_status_phase{phase=~"Pending|Running"} == 1 - ) - ) - ) - record: namespace_cpu:kube_pod_container_resource_requests:sum - - expr: | - max by (cluster, namespace, workload, pod) ( - label_replace( - label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}, - "replicaset", "$1", "owner_name", "(.*)" - ) * on(replicaset, 
namespace) group_left(owner_name) topk by(replicaset, namespace) ( - 1, max by (replicaset, namespace, owner_name) ( - kube_replicaset_owner{job="kube-state-metrics"} - ) - ), - "workload", "$1", "owner_name", "(.*)" - ) - ) + severity: warning + - alert: KubeJobFailed + annotations: + description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to + complete. Removing failed job after investigation should clear this alert. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubejobfailed + summary: Job failed to complete. + expr: | + kube_job_failed{job="kube-state-metrics"} > 0 + for: 15m labels: - workload_type: deployment - record: namespace_workload_pod:kube_pod_owner:relabel - - expr: | - max by (cluster, namespace, workload, pod) ( - label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"}, - "workload", "$1", "owner_name", "(.*)" - ) - ) + severity: warning + - alert: KubeHpaReplicasMismatch + annotations: + description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched + the desired number of replicas for longer than 15 minutes. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubehpareplicasmismatch + summary: HPA has not matched descired number of replicas. + expr: | + (kube_hpa_status_desired_replicas{job="kube-state-metrics"} + != + kube_hpa_status_current_replicas{job="kube-state-metrics"}) + and + (kube_hpa_status_current_replicas{job="kube-state-metrics"} + > + kube_hpa_spec_min_replicas{job="kube-state-metrics"}) + and + (kube_hpa_status_current_replicas{job="kube-state-metrics"} + < + kube_hpa_spec_max_replicas{job="kube-state-metrics"}) + and + changes(kube_hpa_status_current_replicas[15m]) == 0 + for: 15m labels: - workload_type: daemonset - record: namespace_workload_pod:kube_pod_owner:relabel - - expr: | - max by (cluster, namespace, workload, pod) ( - label_replace( - kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"}, - "workload", "$1", "owner_name", "(.*)" - ) - ) + severity: warning + - alert: KubeHpaMaxedOut + annotations: + description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running + at max replicas for longer than 15 minutes. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubehpamaxedout + summary: HPA is running at max replicas + expr: | + kube_hpa_status_current_replicas{job="kube-state-metrics"} + == + kube_hpa_spec_max_replicas{job="kube-state-metrics"} + for: 15m labels: - workload_type: statefulset - record: namespace_workload_pod:kube_pod_owner:relabel - - name: kube-scheduler.rules + severity: warning + - name: kubernetes-resources rules: - - expr: | - histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) - labels: - quantile: "0.99" - record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + - alert: KubeCPUOvercommit + annotations: + description: Cluster has overcommitted CPU resource requests for Pods and + cannot tolerate node failure. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubecpuovercommit + summary: Cluster has overcommitted CPU resource requests. 
+ expr: | + sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum{}) + / + sum(kube_node_status_allocatable_cpu_cores) + > + (count(kube_node_status_allocatable_cpu_cores)-1) / count(kube_node_status_allocatable_cpu_cores) + for: 5m labels: - quantile: "0.99" - record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + severity: warning + - alert: KubeMemoryOvercommit + annotations: + description: Cluster has overcommitted memory resource requests for Pods and + cannot tolerate node failure. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubememoryovercommit + summary: Cluster has overcommitted memory resource requests. + expr: | + sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{}) + / + sum(kube_node_status_allocatable_memory_bytes) + > + (count(kube_node_status_allocatable_memory_bytes)-1) + / + count(kube_node_status_allocatable_memory_bytes) + for: 5m labels: - quantile: "0.99" - record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + severity: warning + - alert: KubeCPUQuotaOvercommit + annotations: + description: Cluster has overcommitted CPU resource requests for Namespaces. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubecpuquotaovercommit + summary: Cluster has overcommitted CPU resource requests. + expr: | + sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"}) + / + sum(kube_node_status_allocatable_cpu_cores) + > 1.5 + for: 5m labels: - quantile: "0.9" - record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + severity: warning + - alert: KubeMemoryQuotaOvercommit + annotations: + description: Cluster has overcommitted memory resource requests for Namespaces. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubememoryquotaovercommit + summary: Cluster has overcommitted memory resource requests. + expr: | + sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"}) + / + sum(kube_node_status_allocatable_memory_bytes{job="kube-state-metrics"}) + > 1.5 + for: 5m labels: - quantile: "0.9" - record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + severity: warning + - alert: KubeQuotaAlmostFull + annotations: + description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage + }} of its {{ $labels.resource }} quota. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubequotaalmostfull + summary: Namespace quota is going to be full. 
+ expr: | + kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + > 0.9 < 1 + for: 15m labels: - quantile: "0.9" - record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + severity: info + - alert: KubeQuotaFullyUsed + annotations: + description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage + }} of its {{ $labels.resource }} quota. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubequotafullyused + summary: Namespace quota is fully used. + expr: | + kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + == 1 + for: 15m labels: - quantile: "0.5" - record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + severity: info + - alert: KubeQuotaExceeded + annotations: + description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage + }} of its {{ $labels.resource }} quota. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubequotaexceeded + summary: Namespace quota has exceeded the limits. + expr: | + kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + > 1 + for: 15m labels: - quantile: "0.5" - record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) + severity: warning + - alert: CPUThrottlingHigh + annotations: + description: '{{ $value | humanizePercentage }} throttling of CPU in namespace + {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ + $labels.pod }}.' + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/cputhrottlinghigh + summary: Processes experience elevated CPU throttling. 
+ expr: | + sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace) + / + sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace) + > ( 25 / 100 ) + for: 15m labels: - quantile: "0.5" - record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile - - name: node.rules - rules: - - expr: | - topk by(namespace, pod) (1, - max by (node, namespace, pod) ( - label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)") - )) - record: 'node_namespace_pod:kube_pod_info:' - - expr: | - count by (cluster, node) (sum by (node, cpu) ( - node_cpu_seconds_total{job="node-exporter",service="kubelet"} - * on (namespace, pod) group_left(node) - node_namespace_pod:kube_pod_info: - )) - record: node:node_num_cpu:sum - - expr: | - sum( - node_memory_MemAvailable_bytes{job="node-exporter",service="kubelet"} or - ( - node_memory_Buffers_bytes{job="node-exporter",service="kubelet"} + - node_memory_Cached_bytes{job="node-exporter",service="kubelet"} + - node_memory_MemFree_bytes{job="node-exporter",service="kubelet"} + - node_memory_Slab_bytes{job="node-exporter",service="kubelet"} - ) - ) by (cluster) - record: :node_memory_MemAvailable_bytes:sum - - name: kubelet.rules + severity: info + - name: kubernetes-storage rules: - - expr: | - histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet"}) + - alert: KubePersistentVolumeFillingUp + annotations: + description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim + }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage + }} free. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. + expr: | + kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"} + < 0.03 + for: 1m labels: - quantile: "0.99" - record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet"}) + severity: critical + - alert: KubePersistentVolumeFillingUp + annotations: + description: Based on recent sampling, the PersistentVolume claimed by {{ + $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is + expected to fill up within four days. Currently {{ $value | humanizePercentage + }} is available. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. 
+ expr: | + ( + kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"} + ) < 0.15 + and + predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 + for: 1h labels: - quantile: "0.9" - record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile - - expr: | - histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet"}) + severity: warning + - alert: KubePersistentVolumeErrors + annotations: + description: The persistent volume {{ $labels.persistentvolume }} has status + {{ $labels.phase }}. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubepersistentvolumeerrors + summary: PersistentVolume is having issues with provisioning. + expr: | + kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 + for: 5m labels: - quantile: "0.5" - record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile - - name: kubernetes-apps + severity: critical + - name: kubernetes-system rules: - - alert: KubePodCrashLooping + - alert: KubeVersionMismatch annotations: - description: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping - summary: Pod is crash looping. + description: There are {{ $value }} different semantic versions of Kubernetes + components running. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeversionmismatch + summary: Different semantic versions of Kubernetes components running. expr: | - rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[5m]) * 60 * 5 > 0 + count(count by (git_version) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1 for: 15m labels: severity: warning - - alert: KubePodNotReady + - alert: KubeClientErrors + annotations: + description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance + }}' is experiencing {{ $value | humanizePercentage }} errors.' + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeclienterrors + summary: Kubernetes API server client is experiencing errors. + expr: | + (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job) + / + sum(rate(rest_client_requests_total[5m])) by (instance, job)) + > 0.01 + for: 15m + labels: + severity: warning + - name: kube-apiserver-slos + rules: + - alert: KubeAPIErrorBudgetBurn + annotations: + description: The API server is burning too much error budget. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn + summary: The API server is burning too much error budget. + expr: | + sum(apiserver_request:burnrate1h) > (14.40 * 0.01000) + and + sum(apiserver_request:burnrate5m) > (14.40 * 0.01000) + for: 2m + labels: + long: 1h + severity: critical + short: 5m + - alert: KubeAPIErrorBudgetBurn + annotations: + description: The API server is burning too much error budget. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn + summary: The API server is burning too much error budget. 
+ expr: | + sum(apiserver_request:burnrate6h) > (6.00 * 0.01000) + and + sum(apiserver_request:burnrate30m) > (6.00 * 0.01000) + for: 15m + labels: + long: 6h + severity: critical + short: 30m + - alert: KubeAPIErrorBudgetBurn annotations: - description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready - summary: Pod has been in a non-ready state for more than 15 minutes. + description: The API server is burning too much error budget. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn + summary: The API server is burning too much error budget. expr: | - sum by (namespace, pod) ( - max by(namespace, pod) ( - kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"} - ) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) ( - 1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"}) - ) - ) > 0 - for: 15m + sum(apiserver_request:burnrate1d) > (3.00 * 0.01000) + and + sum(apiserver_request:burnrate2h) > (3.00 * 0.01000) + for: 1h labels: + long: 1d severity: warning - - alert: KubeDeploymentGenerationMismatch + short: 2h + - alert: KubeAPIErrorBudgetBurn annotations: - description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch - summary: Deployment generation mismatch due to possible roll-back + description: The API server is burning too much error budget. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapierrorbudgetburn + summary: The API server is burning too much error budget. expr: | - kube_deployment_status_observed_generation{job="kube-state-metrics"} - != - kube_deployment_metadata_generation{job="kube-state-metrics"} - for: 15m + sum(apiserver_request:burnrate3d) > (1.00 * 0.01000) + and + sum(apiserver_request:burnrate6h) > (1.00 * 0.01000) + for: 3h labels: + long: 3d severity: warning - - alert: KubeDeploymentReplicasMismatch + short: 6h + - name: kubernetes-system-apiserver + rules: + - alert: KubeClientCertificateExpiration annotations: - description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch - summary: Deployment has not matched the expected number of replicas. + description: A client certificate used to authenticate to the apiserver is + expiring in less than 7.0 days. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeclientcertificateexpiration + summary: Client certificate is about to expire. 
expr: | - ( - kube_deployment_spec_replicas{job="kube-state-metrics"} - != - kube_deployment_status_replicas_available{job="kube-state-metrics"} - ) and ( - changes(kube_deployment_status_replicas_updated{job="kube-state-metrics"}[5m]) - == - 0 - ) - for: 15m + apiserver_client_certificate_expiration_seconds_count{job="kubelet"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kubelet"}[5m]))) < 604800 labels: severity: warning - - alert: KubeStatefulSetReplicasMismatch + - alert: KubeClientCertificateExpiration annotations: - description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch - summary: Deployment has not matched the expected number of replicas. + description: A client certificate used to authenticate to the apiserver is + expiring in less than 24.0 hours. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeclientcertificateexpiration + summary: Client certificate is about to expire. expr: | - ( - kube_statefulset_status_replicas_ready{job="kube-state-metrics"} - != - kube_statefulset_status_replicas{job="kube-state-metrics"} - ) and ( - changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m]) - == - 0 - ) - for: 15m + apiserver_client_certificate_expiration_seconds_count{job="kubelet"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kubelet"}[5m]))) < 86400 labels: - severity: warning - - alert: KubeStatefulSetGenerationMismatch + severity: critical + - alert: AggregatedAPIErrors annotations: - description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch - summary: StatefulSet generation mismatch due to possible roll-back + description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} + has reported errors. It has appeared unavailable {{ $value | humanize }} + times averaged over the past 10m. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/aggregatedapierrors + summary: An aggregated API has reported errors. expr: | - kube_statefulset_status_observed_generation{job="kube-state-metrics"} - != - kube_statefulset_metadata_generation{job="kube-state-metrics"} - for: 15m + sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[10m])) > 4 labels: severity: warning - - alert: KubeStatefulSetUpdateNotRolledOut + - alert: AggregatedAPIDown annotations: - description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout - summary: StatefulSet update has not been rolled out. + description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} + has been only {{ $value | humanize }}% available over the last 10m. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/aggregatedapidown + summary: An aggregated API is down. 
expr: | - ( - max without (revision) ( - kube_statefulset_status_current_revision{job="kube-state-metrics"} - unless - kube_statefulset_status_update_revision{job="kube-state-metrics"} - ) - * - ( - kube_statefulset_replicas{job="kube-state-metrics"} - != - kube_statefulset_status_replicas_updated{job="kube-state-metrics"} - ) - ) and ( - changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m]) - == - 0 - ) - for: 15m + (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85 + for: 5m labels: severity: warning - - alert: KubeDaemonSetRolloutStuck + - alert: KubeAPIDown annotations: - description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck - summary: DaemonSet rollout is stuck. + description: KubeAPI has disappeared from Prometheus target discovery. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapidown + summary: Target disappeared from Prometheus target discovery. expr: | - ( - ( - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} - != - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - ) or ( - kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} - != - 0 - ) or ( - kube_daemonset_updated_number_scheduled{job="kube-state-metrics"} - != - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - ) or ( - kube_daemonset_status_number_available{job="kube-state-metrics"} - != - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - ) - ) and ( - changes(kube_daemonset_updated_number_scheduled{job="kube-state-metrics"}[5m]) - == - 0 - ) + absent(up{job="kubelet"} == 1) for: 15m labels: - severity: warning - - alert: KubeContainerWaiting + severity: critical + - alert: KubeAPITerminatedRequests annotations: - description: Pod {{ $labels.namespace }}/{{ $labels.pod }} container {{ $labels.container}} has been in waiting state for longer than 1 hour. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting - summary: Pod container waiting longer than 1 hour + description: The apiserver has terminated {{ $value | humanizePercentage }} + of its incoming requests. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeapiterminatedrequests + summary: The apiserver has terminated {{ $value | humanizePercentage }} of + its incoming requests. expr: | - sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics"}) > 0 - for: 1h + sum(rate(apiserver_request_terminations_total{job="kubelet"}[10m])) / ( sum(rate(apiserver_request_total{job="kubelet"}[10m])) + sum(rate(apiserver_request_terminations_total{job="kubelet"}[10m])) ) > 0.20 + for: 5m labels: severity: warning - - alert: KubeDaemonSetNotScheduled + - name: kubernetes-system-kubelet + rules: + - alert: KubeNodeNotReady annotations: - description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled - summary: DaemonSet pods are not scheduled. + description: '{{ $labels.node }} has been unready for more than 15 minutes.' 
+ runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubenodenotready + summary: Node is not ready. expr: | - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - - - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0 - for: 10m + kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 + for: 15m labels: severity: warning - - alert: KubeDaemonSetMisScheduled + - alert: KubeNodeUnreachable annotations: - description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled - summary: DaemonSet pods are misscheduled. + description: '{{ $labels.node }} is unreachable and some workloads may be + rescheduled.' + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubenodeunreachable + summary: Node is unreachable. expr: | - kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0 + (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1 for: 15m labels: severity: warning - - alert: KubeJobCompletion + - alert: KubeletTooManyPods annotations: - description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than 12 hours to complete. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion - summary: Job did not complete in time + description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage + }} of its Pod capacity. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubelettoomanypods + summary: Kubelet is running at capacity. expr: | - kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0 - for: 12h + count by(node) ( + (kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"}) + ) + / + max by(node) ( + kube_node_status_capacity_pods{job="kube-state-metrics"} != 1 + ) > 0.95 + for: 15m labels: severity: warning - - alert: KubeJobFailed + - alert: KubeNodeReadinessFlapping annotations: - description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed - summary: Job failed to complete. + description: The readiness status of node {{ $labels.node }} has changed {{ + $value }} times in the last 15 minutes. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubenodereadinessflapping + summary: Node readiness status is flapping. 
expr: | - kube_job_failed{job="kube-state-metrics"} > 0 + sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2 for: 15m labels: severity: warning - - alert: KubeHpaReplicasMismatch + - alert: KubeletPlegDurationHigh annotations: - description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the desired number of replicas for longer than 15 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch - summary: HPA has not matched descired number of replicas. + description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile + duration of {{ $value }} seconds on node {{ $labels.node }}. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletplegdurationhigh + summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist. expr: | - (kube_hpa_status_desired_replicas{job="kube-state-metrics"} - != - kube_hpa_status_current_replicas{job="kube-state-metrics"}) - and - (kube_hpa_status_current_replicas{job="kube-state-metrics"} - > - kube_hpa_spec_min_replicas{job="kube-state-metrics"}) - and - (kube_hpa_status_current_replicas{job="kube-state-metrics"} - < - kube_hpa_spec_max_replicas{job="kube-state-metrics"}) - and - changes(kube_hpa_status_current_replicas[15m]) == 0 - for: 15m + node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 + for: 5m labels: severity: warning - - alert: KubeHpaMaxedOut + - alert: KubeletPodStartUpLatencyHigh annotations: - description: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at max replicas for longer than 15 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout - summary: HPA is running at max replicas + description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds + on node {{ $labels.node }}. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletpodstartuplatencyhigh + summary: Kubelet Pod startup latency is too high. expr: | - kube_hpa_status_current_replicas{job="kube-state-metrics"} - == - kube_hpa_spec_max_replicas{job="kube-state-metrics"} + histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 for: 15m labels: severity: warning - - name: kubernetes-resources - rules: - - alert: KubeCPUOvercommit + - alert: KubeletClientCertificateExpiration annotations: - description: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit - summary: Cluster has overcommitted CPU resource requests. + description: Client certificate for Kubelet on node {{ $labels.node }} expires + in {{ $value | humanizeDuration }}. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletclientcertificateexpiration + summary: Kubelet client certificate is about to expire. 
expr: | - sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) - / - sum(kube_node_status_allocatable_cpu_cores OR kube_node_status_allocatable{resource="cpu"}) - > - (count(kube_node_status_allocatable_cpu_cores OR kube_node_status_allocatable{resource="cpu"}) -1) / count(kube_node_status_allocatable_cpu_cores OR kube_node_status_allocatable{resource="cpu"}) - for: 5m + kubelet_certificate_manager_client_ttl_seconds < 604800 labels: severity: warning - - alert: KubeMemoryOvercommit + - alert: KubeletClientCertificateExpiration annotations: - description: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit - summary: Cluster has overcommitted memory resource requests. + description: Client certificate for Kubelet on node {{ $labels.node }} expires + in {{ $value | humanizeDuration }}. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletclientcertificateexpiration + summary: Kubelet client certificate is about to expire. expr: | - sum(namespace_memory:kube_pod_container_resource_requests_bytes:sum{}) - / - sum(kube_node_status_allocatable_memory_bytes OR kube_node_status_allocatable{resource="memory"}) - > - (count(kube_node_status_allocatable_memory_bytes OR kube_node_status_allocatable{resource="memory"})-1) - / - count(kube_node_status_allocatable_memory_bytes OR kube_node_status_allocatable{resource="memory"}) - for: 5m + kubelet_certificate_manager_client_ttl_seconds < 86400 labels: - severity: warning - - alert: KubeCPUQuotaOvercommit + severity: critical + - alert: KubeletServerCertificateExpiration annotations: - description: Cluster has overcommitted CPU resource requests for Namespaces. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuquotaovercommit - summary: Cluster has overcommitted CPU resource requests. + description: Server certificate for Kubelet on node {{ $labels.node }} expires + in {{ $value | humanizeDuration }}. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletservercertificateexpiration + summary: Kubelet server certificate is about to expire. expr: | - sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"}) - / - sum(kube_node_status_allocatable_cpu_cores OR kube_node_status_allocatable{resource="cpu"}) - > 1.5 - for: 5m + kubelet_certificate_manager_server_ttl_seconds < 604800 labels: severity: warning - - alert: KubeMemoryQuotaOvercommit + - alert: KubeletServerCertificateExpiration annotations: - description: Cluster has overcommitted memory resource requests for Namespaces. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryquotaovercommit - summary: Cluster has overcommitted memory resource requests. + description: Server certificate for Kubelet on node {{ $labels.node }} expires + in {{ $value | humanizeDuration }}. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletservercertificateexpiration + summary: Kubelet server certificate is about to expire. 
expr: | - sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"}) - / - sum(kube_node_status_allocatable_memory_bytes{job="kube-state-metrics"} OR kube_node_status_allocatable{resource="memory",job="kube-state-metrics"}) - > 1.5 - for: 5m + kubelet_certificate_manager_server_ttl_seconds < 86400 labels: - severity: warning - - alert: KubeQuotaAlmostFull + severity: critical + - alert: KubeletClientCertificateRenewalErrors annotations: - description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaalmostfull - summary: Namespace quota is going to be full. + description: Kubelet on node {{ $labels.node }} has failed to renew its client + certificate ({{ $value | humanize }} errors in the last 5 minutes). + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletclientcertificaterenewalerrors + summary: Kubelet has failed to renew its client certificate. expr: | - kube_resourcequota{job="kube-state-metrics", type="used"} - / ignoring(instance, job, type) - (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) - > 0.9 < 1 + increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0 for: 15m labels: - severity: info - - alert: KubeQuotaFullyUsed + severity: warning + - alert: KubeletServerCertificateRenewalErrors annotations: - description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused - summary: Namespace quota is fully used. + description: Kubelet on node {{ $labels.node }} has failed to renew its server + certificate ({{ $value | humanize }} errors in the last 5 minutes). + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletservercertificaterenewalerrors + summary: Kubelet has failed to renew its server certificate. expr: | - kube_resourcequota{job="kube-state-metrics", type="used"} - / ignoring(instance, job, type) - (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) - == 1 + increase(kubelet_server_expiration_renew_errors[5m]) > 0 for: 15m labels: - severity: info - - alert: KubeQuotaExceeded + severity: warning + - alert: KubeletDown annotations: - description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded - summary: Namespace quota has exceeded the limits. + description: Kubelet has disappeared from Prometheus target discovery. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeletdown + summary: Target disappeared from Prometheus target discovery. 
expr: | - kube_resourcequota{job="kube-state-metrics", type="used"} - / ignoring(instance, job, type) - (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) - > 1 + absent(up{job="kubelet", metrics_path="/metrics"} == 1) for: 15m labels: - severity: warning - - alert: CPUThrottlingHigh + severity: critical + - name: kubernetes-system-scheduler + rules: + - alert: KubeSchedulerDown annotations: - description: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh - summary: Processes experience elevated CPU throttling. + description: KubeScheduler has disappeared from Prometheus target discovery. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubeschedulerdown + summary: Target disappeared from Prometheus target discovery. expr: | - sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace) - / - sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace) - > ( 75 / 100 ) + absent(up{job="kubelet"} == 1) for: 15m labels: - severity: info - - name: kubernetes-storage + severity: critical + - name: kubernetes-system-controller-manager rules: - - alert: KubePersistentVolumeFillingUp + - alert: KubeControllerManagerDown annotations: - description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage }} free. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup - summary: PersistentVolume is filling up. + description: KubeControllerManager has disappeared from Prometheus target + discovery. + runbook_url: https://github.com/prometheus-operator/kube-prometheus/wiki/kubecontrollermanagerdown + summary: Target disappeared from Prometheus target discovery. 
expr: | - kubelet_volume_stats_available_bytes{job="kubelet"} - / - kubelet_volume_stats_capacity_bytes{job="kubelet"} - < 0.03 - for: 1m + absent(up{job="kubelet"} == 1) + for: 15m + labels: + severity: critical + - name: kube-apiserver.rules + rules: + - expr: | + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="kubelet",verb=~"LIST|GET"}[1d])) + - + ( + ( + sum(rate(apiserver_request_duration_seconds_bucket{job="kubelet",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1d])) + or + vector(0) + ) + + + sum(rate(apiserver_request_duration_seconds_bucket{job="kubelet",verb=~"LIST|GET",scope="namespace",le="0.5"}[1d])) + + + sum(rate(apiserver_request_duration_seconds_bucket{job="kubelet",verb=~"LIST|GET",scope="cluster",le="5"}[1d])) + ) + ) + + + # errors + sum(rate(apiserver_request_total{job="kubelet",verb=~"LIST|GET",code=~"5.."}[1d])) + ) + / + sum(rate(apiserver_request_total{job="kubelet",verb=~"LIST|GET"}[1d])) + labels: + verb: read + record: apiserver_request:burnrate1d + - expr: | + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="kubelet",verb=~"LIST|GET"}[1h])) + - + ( + ( + sum(rate(apiserver_request_duration_seconds_bucket{job="kubelet",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[1h])) + or + vector(0) + ) + + + sum(rate(apiserver_request_duration_seconds_bucket{job="kubelet",verb=~"LIST|GET",scope="namespace",le="0.5"}[1h])) + + + sum(rate(apiserver_request_duration_seconds_bucket{job="kubelet",verb=~"LIST|GET",scope="cluster",le="5"}[1h])) + ) + ) + + + # errors + sum(rate(apiserver_request_total{job="kubelet",verb=~"LIST|GET",code=~"5.."}[1h])) + ) + / + sum(rate(apiserver_request_total{job="kubelet",verb=~"LIST|GET"}[1h])) + labels: + verb: read + record: apiserver_request:burnrate1h + - expr: | + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="kubelet",verb=~"LIST|GET"}[2h])) + - + ( + ( + sum(rate(apiserver_request_duration_seconds_bucket{job="kubelet",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[2h])) + or + vector(0) + ) + + + sum(rate(apiserver_request_duration_seconds_bucket{job="kubelet",verb=~"LIST|GET",scope="namespace",le="0.5"}[2h])) + + + sum(rate(apiserver_request_duration_seconds_bucket{job="kubelet",verb=~"LIST|GET",scope="cluster",le="5"}[2h])) + ) + ) + + + # errors + sum(rate(apiserver_request_total{job="kubelet",verb=~"LIST|GET",code=~"5.."}[2h])) + ) + / + sum(rate(apiserver_request_total{job="kubelet",verb=~"LIST|GET"}[2h])) + labels: + verb: read + record: apiserver_request:burnrate2h + - expr: | + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="kubelet",verb=~"LIST|GET"}[30m])) + - + ( + ( + sum(rate(apiserver_request_duration_seconds_bucket{job="kubelet",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30m])) + or + vector(0) + ) + + + sum(rate(apiserver_request_duration_seconds_bucket{job="kubelet",verb=~"LIST|GET",scope="namespace",le="0.5"}[30m])) + + + sum(rate(apiserver_request_duration_seconds_bucket{job="kubelet",verb=~"LIST|GET",scope="cluster",le="5"}[30m])) + ) + ) + + + # errors + sum(rate(apiserver_request_total{job="kubelet",verb=~"LIST|GET",code=~"5.."}[30m])) + ) + / + sum(rate(apiserver_request_total{job="kubelet",verb=~"LIST|GET"}[30m])) + labels: + verb: read + record: apiserver_request:burnrate30m + - expr: | + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="kubelet",verb=~"LIST|GET"}[3d])) + - + ( + ( + 
sum(rate(apiserver_request_duration_seconds_bucket{job="kubelet",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[3d])) + or + vector(0) + ) + + + sum(rate(apiserver_request_duration_seconds_bucket{job="kubelet",verb=~"LIST|GET",scope="namespace",le="0.5"}[3d])) + + + sum(rate(apiserver_request_duration_seconds_bucket{job="kubelet",verb=~"LIST|GET",scope="cluster",le="5"}[3d])) + ) + ) + + + # errors + sum(rate(apiserver_request_total{job="kubelet",verb=~"LIST|GET",code=~"5.."}[3d])) + ) + / + sum(rate(apiserver_request_total{job="kubelet",verb=~"LIST|GET"}[3d])) + labels: + verb: read + record: apiserver_request:burnrate3d + - expr: | + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="kubelet",verb=~"LIST|GET"}[5m])) + - + ( + ( + sum(rate(apiserver_request_duration_seconds_bucket{job="kubelet",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[5m])) + or + vector(0) + ) + + + sum(rate(apiserver_request_duration_seconds_bucket{job="kubelet",verb=~"LIST|GET",scope="namespace",le="0.5"}[5m])) + + + sum(rate(apiserver_request_duration_seconds_bucket{job="kubelet",verb=~"LIST|GET",scope="cluster",le="5"}[5m])) + ) + ) + + + # errors + sum(rate(apiserver_request_total{job="kubelet",verb=~"LIST|GET",code=~"5.."}[5m])) + ) + / + sum(rate(apiserver_request_total{job="kubelet",verb=~"LIST|GET"}[5m])) + labels: + verb: read + record: apiserver_request:burnrate5m + - expr: | + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="kubelet",verb=~"LIST|GET"}[6h])) + - + ( + ( + sum(rate(apiserver_request_duration_seconds_bucket{job="kubelet",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[6h])) + or + vector(0) + ) + + + sum(rate(apiserver_request_duration_seconds_bucket{job="kubelet",verb=~"LIST|GET",scope="namespace",le="0.5"}[6h])) + + + sum(rate(apiserver_request_duration_seconds_bucket{job="kubelet",verb=~"LIST|GET",scope="cluster",le="5"}[6h])) + ) + ) + + + # errors + sum(rate(apiserver_request_total{job="kubelet",verb=~"LIST|GET",code=~"5.."}[6h])) + ) + / + sum(rate(apiserver_request_total{job="kubelet",verb=~"LIST|GET"}[6h])) + labels: + verb: read + record: apiserver_request:burnrate6h + - expr: | + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="kubelet",verb=~"POST|PUT|PATCH|DELETE"}[1d])) + - + sum(rate(apiserver_request_duration_seconds_bucket{job="kubelet",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1d])) + ) + + + sum(rate(apiserver_request_total{job="kubelet",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d])) + ) + / + sum(rate(apiserver_request_total{job="kubelet",verb=~"POST|PUT|PATCH|DELETE"}[1d])) + labels: + verb: write + record: apiserver_request:burnrate1d + - expr: | + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="kubelet",verb=~"POST|PUT|PATCH|DELETE"}[1h])) + - + sum(rate(apiserver_request_duration_seconds_bucket{job="kubelet",verb=~"POST|PUT|PATCH|DELETE",le="1"}[1h])) + ) + + + sum(rate(apiserver_request_total{job="kubelet",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) + ) + / + sum(rate(apiserver_request_total{job="kubelet",verb=~"POST|PUT|PATCH|DELETE"}[1h])) + labels: + verb: write + record: apiserver_request:burnrate1h + - expr: | + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="kubelet",verb=~"POST|PUT|PATCH|DELETE"}[2h])) + - + sum(rate(apiserver_request_duration_seconds_bucket{job="kubelet",verb=~"POST|PUT|PATCH|DELETE",le="1"}[2h])) + ) + + + sum(rate(apiserver_request_total{job="kubelet",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h])) + ) + / 
+ sum(rate(apiserver_request_total{job="kubelet",verb=~"POST|PUT|PATCH|DELETE"}[2h])) + labels: + verb: write + record: apiserver_request:burnrate2h + - expr: | + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="kubelet",verb=~"POST|PUT|PATCH|DELETE"}[30m])) + - + sum(rate(apiserver_request_duration_seconds_bucket{job="kubelet",verb=~"POST|PUT|PATCH|DELETE",le="1"}[30m])) + ) + + + sum(rate(apiserver_request_total{job="kubelet",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m])) + ) + / + sum(rate(apiserver_request_total{job="kubelet",verb=~"POST|PUT|PATCH|DELETE"}[30m])) + labels: + verb: write + record: apiserver_request:burnrate30m + - expr: | + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="kubelet",verb=~"POST|PUT|PATCH|DELETE"}[3d])) + - + sum(rate(apiserver_request_duration_seconds_bucket{job="kubelet",verb=~"POST|PUT|PATCH|DELETE",le="1"}[3d])) + ) + + + sum(rate(apiserver_request_total{job="kubelet",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d])) + ) + / + sum(rate(apiserver_request_total{job="kubelet",verb=~"POST|PUT|PATCH|DELETE"}[3d])) + labels: + verb: write + record: apiserver_request:burnrate3d + - expr: | + ( + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="kubelet",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + - + sum(rate(apiserver_request_duration_seconds_bucket{job="kubelet",verb=~"POST|PUT|PATCH|DELETE",le="1"}[5m])) + ) + + + sum(rate(apiserver_request_total{job="kubelet",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m])) + ) + / + sum(rate(apiserver_request_total{job="kubelet",verb=~"POST|PUT|PATCH|DELETE"}[5m])) labels: - severity: critical - - alert: KubePersistentVolumeFillingUp - annotations: - description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup - summary: PersistentVolume is filling up. - expr: | + verb: write + record: apiserver_request:burnrate5m + - expr: | ( - kubelet_volume_stats_available_bytes{job="kubelet"} - / - kubelet_volume_stats_capacity_bytes{job="kubelet"} - ) < 0.15 - and - predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0 - for: 1h + ( + # too slow + sum(rate(apiserver_request_duration_seconds_count{job="kubelet",verb=~"POST|PUT|PATCH|DELETE"}[6h])) + - + sum(rate(apiserver_request_duration_seconds_bucket{job="kubelet",verb=~"POST|PUT|PATCH|DELETE",le="1"}[6h])) + ) + + + sum(rate(apiserver_request_total{job="kubelet",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h])) + ) + / + sum(rate(apiserver_request_total{job="kubelet",verb=~"POST|PUT|PATCH|DELETE"}[6h])) labels: - severity: warning - - alert: KubePersistentVolumeErrors - annotations: - description: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors - summary: PersistentVolume is having issues with provisioning. 
- expr: | - kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 - for: 5m + verb: write + record: apiserver_request:burnrate6h + - expr: | + sum by (code,resource) (rate(apiserver_request_total{job="kubelet",verb=~"LIST|GET"}[5m])) labels: - severity: critical - - name: kubernetes-system - rules: - - alert: KubeVersionMismatch - annotations: - description: There are {{ $value }} different semantic versions of Kubernetes components running. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch - summary: Different semantic versions of Kubernetes components running. - expr: | - count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*).*"))) > 1 - for: 15m + verb: read + record: code_resource:apiserver_request_total:rate5m + - expr: | + sum by (code,resource) (rate(apiserver_request_total{job="kubelet",verb=~"POST|PUT|PATCH|DELETE"}[5m])) labels: - severity: warning - - alert: KubeClientErrors - annotations: - description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors - summary: Kubernetes API server client is experiencing errors. - expr: | - (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job) - / - sum(rate(rest_client_requests_total[5m])) by (instance, job)) - > 0.01 - for: 15m + verb: write + record: code_resource:apiserver_request_total:rate5m + - expr: | + histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="kubelet",verb=~"LIST|GET"}[5m]))) > 0 labels: - severity: warning - - name: kube-apiserver-slos - rules: - - alert: KubeAPIErrorBudgetBurn - annotations: - description: The API server is burning too much error budget. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn - summary: The API server is burning too much error budget. - expr: | - sum(apiserver_request:burnrate1h) > (14.40 * 0.01000) - and - sum(apiserver_request:burnrate5m) > (14.40 * 0.01000) - for: 2m + quantile: "0.99" + verb: read + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.99, sum by (le, resource) (rate(apiserver_request_duration_seconds_bucket{job="kubelet",verb=~"POST|PUT|PATCH|DELETE"}[5m]))) > 0 labels: - long: 1h - severity: critical - short: 5m - - alert: KubeAPIErrorBudgetBurn - annotations: - description: The API server is burning too much error budget. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn - summary: The API server is burning too much error budget. 
- expr: | - sum(apiserver_request:burnrate6h) > (6.00 * 0.01000) - and - sum(apiserver_request:burnrate30m) > (6.00 * 0.01000) - for: 15m + quantile: "0.99" + verb: write + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="kubelet",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) labels: - long: 6h - severity: critical - short: 30m - - alert: KubeAPIErrorBudgetBurn - annotations: - description: The API server is burning too much error budget. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn - summary: The API server is burning too much error budget. - expr: | - sum(apiserver_request:burnrate1d) > (3.00 * 0.01000) - and - sum(apiserver_request:burnrate2h) > (3.00 * 0.01000) - for: 1h + quantile: "0.99" + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="kubelet",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) labels: - long: 1d - severity: warning - short: 2h - - alert: KubeAPIErrorBudgetBurn - annotations: - description: The API server is burning too much error budget. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn - summary: The API server is burning too much error budget. - expr: | - sum(apiserver_request:burnrate3d) > (1.00 * 0.01000) - and - sum(apiserver_request:burnrate6h) > (1.00 * 0.01000) - for: 3h + quantile: "0.9" + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="kubelet",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)) labels: - long: 3d - severity: warning - short: 6h - - name: kubernetes-system-apiserver + quantile: "0.5" + record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile + - interval: 3m + name: kube-apiserver-availability.rules rules: - - alert: KubeClientCertificateExpiration - annotations: - description: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration - summary: Client certificate is about to expire. 
- expr: | - apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 + - expr: | + 1 - ( + ( + # write too slow + sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d])) + - + sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d])) + ) + + ( + # read too slow + sum(increase(apiserver_request_duration_seconds_count{verb=~"LIST|GET"}[30d])) + - + ( + ( + sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) + or + vector(0) + ) + + + sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) + + + sum(increase(apiserver_request_duration_seconds_bucket{verb=~"LIST|GET",scope="cluster",le="5"}[30d])) + ) + ) + + # errors + sum(code:apiserver_request_total:increase30d{code=~"5.."} or vector(0)) + ) + / + sum(code:apiserver_request_total:increase30d) labels: - severity: warning - - alert: KubeClientCertificateExpiration - annotations: - description: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration - summary: Client certificate is about to expire. - expr: | - apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 + verb: all + record: apiserver_request:availability30d + - expr: | + 1 - ( + sum(increase(apiserver_request_duration_seconds_count{job="kubelet",verb=~"LIST|GET"}[30d])) + - + ( + # too slow + ( + sum(increase(apiserver_request_duration_seconds_bucket{job="kubelet",verb=~"LIST|GET",scope=~"resource|",le="0.1"}[30d])) + or + vector(0) + ) + + + sum(increase(apiserver_request_duration_seconds_bucket{job="kubelet",verb=~"LIST|GET",scope="namespace",le="0.5"}[30d])) + + + sum(increase(apiserver_request_duration_seconds_bucket{job="kubelet",verb=~"LIST|GET",scope="cluster",le="5"}[30d])) + ) + + + # errors + sum(code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0)) + ) + / + sum(code:apiserver_request_total:increase30d{verb="read"}) labels: - severity: critical - - alert: AggregatedAPIErrors - annotations: - description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. The number of errors have increased for it in the past five minutes. High values indicate that the availability of the service changes too often. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors - summary: An aggregated API has reported errors. 
- expr: | - sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2 + verb: read + record: apiserver_request:availability30d + - expr: | + 1 - ( + ( + # too slow + sum(increase(apiserver_request_duration_seconds_count{verb=~"POST|PUT|PATCH|DELETE"}[30d])) + - + sum(increase(apiserver_request_duration_seconds_bucket{verb=~"POST|PUT|PATCH|DELETE",le="1"}[30d])) + ) + + + # errors + sum(code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0)) + ) + / + sum(code:apiserver_request_total:increase30d{verb="write"}) labels: - severity: warning - - alert: AggregatedAPIDown - annotations: - description: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown - summary: An aggregated API is down. - expr: | - (1 - max by(name, namespace)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85 - for: 5m + verb: write + record: apiserver_request:availability30d + - expr: | + avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30 + record: code_verb:apiserver_request_total:increase30d + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="kubelet",verb="LIST",code=~"2.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="kubelet",verb="GET",code=~"2.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="kubelet",verb="POST",code=~"2.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="kubelet",verb="PUT",code=~"2.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="kubelet",verb="PATCH",code=~"2.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="kubelet",verb="DELETE",code=~"2.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="kubelet",verb="LIST",code=~"3.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="kubelet",verb="GET",code=~"3.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="kubelet",verb="POST",code=~"3.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="kubelet",verb="PUT",code=~"3.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="kubelet",verb="PATCH",code=~"3.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="kubelet",verb="DELETE",code=~"3.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="kubelet",verb="LIST",code=~"4.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (code, verb) 
(increase(apiserver_request_total{job="kubelet",verb="GET",code=~"4.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="kubelet",verb="POST",code=~"4.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="kubelet",verb="PUT",code=~"4.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="kubelet",verb="PATCH",code=~"4.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="kubelet",verb="DELETE",code=~"4.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="kubelet",verb="LIST",code=~"5.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="kubelet",verb="GET",code=~"5.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="kubelet",verb="POST",code=~"5.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="kubelet",verb="PUT",code=~"5.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="kubelet",verb="PATCH",code=~"5.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (code, verb) (increase(apiserver_request_total{job="kubelet",verb="DELETE",code=~"5.."}[1h])) + record: code_verb:apiserver_request_total:increase1h + - expr: | + sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) labels: - severity: warning - - alert: KubeAPIDown - annotations: - description: KubeAPI has disappeared from Prometheus target discovery. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown - summary: Target disappeared from Prometheus target discovery. - expr: | - absent(up{job="apiserver"} == 1) - for: 15m + verb: read + record: code:apiserver_request_total:increase30d + - expr: | + sum by (code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) labels: - severity: critical - - name: kubernetes-system-kubelet + verb: write + record: code:apiserver_request_total:increase30d + - name: k8s.rules rules: - - alert: KubeNodeNotReady - annotations: - description: '{{ $labels.node }} has been unready for more than 15 minutes.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready - summary: Node is not ready. - expr: | - kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 - for: 15m - labels: - severity: warning - - alert: KubeNodeUnreachable - annotations: - description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.' - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable - summary: Node is unreachable. 
- expr: | - (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1 - for: 15m + - expr: | + sum by (cluster, namespace, pod, container) ( + rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m]) + ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( + 1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate + - expr: | + container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} + * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, + max by(namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_memory_working_set_bytes + - expr: | + container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} + * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, + max by(namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_memory_rss + - expr: | + container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} + * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, + max by(namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_memory_cache + - expr: | + container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} + * on (namespace, pod) group_left(node) topk by(namespace, pod) (1, + max by(namespace, pod, node) (kube_pod_info{node!=""}) + ) + record: node_namespace_pod_container:container_memory_swap + - expr: | + sum by (namespace) ( + sum by (namespace, pod) ( + max by (namespace, pod, container) ( + kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"} + ) * on(namespace, pod) group_left() max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + record: namespace:kube_pod_container_resource_requests_memory_bytes:sum + - expr: | + sum by (namespace) ( + sum by (namespace, pod) ( + max by (namespace, pod, container) ( + kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} + ) * on(namespace, pod) group_left() max by (namespace, pod) ( + kube_pod_status_phase{phase=~"Pending|Running"} == 1 + ) + ) + ) + record: namespace:kube_pod_container_resource_requests_cpu_cores:sum + - expr: | + max by (cluster, namespace, workload, pod) ( + label_replace( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}, + "replicaset", "$1", "owner_name", "(.*)" + ) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) ( + 1, max by (replicaset, namespace, owner_name) ( + kube_replicaset_owner{job="kube-state-metrics"} + ) + ), + "workload", "$1", "owner_name", "(.*)" + ) + ) labels: - severity: warning - - alert: KubeletTooManyPods - annotations: - description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods - summary: Kubelet is running at capacity. 
- expr: | - count by(node) ( - (kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"}) + workload_type: deployment + record: namespace_workload_pod:kube_pod_owner:relabel + - expr: | + max by (cluster, namespace, workload, pod) ( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"}, + "workload", "$1", "owner_name", "(.*)" + ) ) - / - max by(node) ( - kube_node_status_capacity_pods{job="kube-state-metrics"} != 1 - OR - kube_node_status_capacity{job="kube-state-metrics",resource="pods"} != 1 - ) > 0.95 - for: 15m labels: - severity: warning - - alert: KubeNodeReadinessFlapping - annotations: - description: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping - summary: Node readiness status is flapping. - expr: | - sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2 - for: 15m + workload_type: daemonset + record: namespace_workload_pod:kube_pod_owner:relabel + - expr: | + max by (cluster, namespace, workload, pod) ( + label_replace( + kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"}, + "workload", "$1", "owner_name", "(.*)" + ) + ) labels: - severity: warning - - alert: KubeletPlegDurationHigh - annotations: - description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh - summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist. - expr: | - node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 - for: 5m + workload_type: statefulset + record: namespace_workload_pod:kube_pod_owner:relabel + - name: kube-scheduler.rules + rules: + - expr: | + histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kubelet"}[5m])) without(instance, pod)) labels: - severity: warning - - alert: KubeletPodStartUpLatencyHigh - annotations: - description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh - summary: Kubelet Pod startup latency is too high. - expr: | - histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name{job="kubelet"} > 60 - for: 15m + quantile: "0.99" + record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kubelet"}[5m])) without(instance, pod)) labels: - severity: warning - - alert: KubeletClientCertificateExpiration - annotations: - description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration - summary: Kubelet client certificate is about to expire. 
- expr: | - kubelet_certificate_manager_client_ttl_seconds < 604800 + quantile: "0.99" + record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kubelet"}[5m])) without(instance, pod)) labels: - severity: warning - - alert: KubeletClientCertificateExpiration - annotations: - description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration - summary: Kubelet client certificate is about to expire. - expr: | - kubelet_certificate_manager_client_ttl_seconds < 86400 + quantile: "0.99" + record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kubelet"}[5m])) without(instance, pod)) labels: - severity: critical - - alert: KubeletServerCertificateExpiration - annotations: - description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration - summary: Kubelet server certificate is about to expire. - expr: | - kubelet_certificate_manager_server_ttl_seconds < 604800 + quantile: "0.9" + record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kubelet"}[5m])) without(instance, pod)) labels: - severity: warning - - alert: KubeletServerCertificateExpiration - annotations: - description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration - summary: Kubelet server certificate is about to expire. - expr: | - kubelet_certificate_manager_server_ttl_seconds < 86400 + quantile: "0.9" + record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kubelet"}[5m])) without(instance, pod)) labels: - severity: critical - - alert: KubeletClientCertificateRenewalErrors - annotations: - description: Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes). - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificaterenewalerrors - summary: Kubelet has failed to renew its client certificate. - expr: | - increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0 - for: 15m + quantile: "0.9" + record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kubelet"}[5m])) without(instance, pod)) labels: - severity: warning - - alert: KubeletServerCertificateRenewalErrors - annotations: - description: Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes). 
- runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificaterenewalerrors - summary: Kubelet has failed to renew its server certificate. - expr: | - increase(kubelet_server_expiration_renew_errors[5m]) > 0 - for: 15m + quantile: "0.5" + record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kubelet"}[5m])) without(instance, pod)) labels: - severity: warning - - alert: KubeletDown - annotations: - description: Kubelet has disappeared from Prometheus target discovery. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown - summary: Target disappeared from Prometheus target discovery. - expr: | - absent(up{job="kubelet"} == 1) - for: 15m + quantile: "0.5" + record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kubelet"}[5m])) without(instance, pod)) labels: - severity: critical - - name: kubernetes-system-scheduler + quantile: "0.5" + record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile + - name: node.rules rules: - - alert: KubeSchedulerDown - annotations: - description: KubeScheduler has disappeared from Prometheus target discovery. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown - summary: Target disappeared from Prometheus target discovery. - expr: | - absent(up{job="kube-scheduler"} == 1) - for: 15m - labels: - severity: critical - - name: kubernetes-system-controller-manager + - expr: | + topk by(namespace, pod) (1, + max by (node, namespace, pod) ( + label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)") + )) + record: 'node_namespace_pod:kube_pod_info:' + - expr: | + count by (cluster, node) (sum by (node, cpu) ( + node_cpu_seconds_total{job="node-exporter"} + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + )) + record: node:node_num_cpu:sum + - expr: | + sum( + node_memory_MemAvailable_bytes{job="node-exporter"} or + ( + node_memory_Buffers_bytes{job="node-exporter"} + + node_memory_Cached_bytes{job="node-exporter"} + + node_memory_MemFree_bytes{job="node-exporter"} + + node_memory_Slab_bytes{job="node-exporter"} + ) + ) by (cluster) + record: :node_memory_MemAvailable_bytes:sum + - name: kubelet.rules rules: - - alert: KubeControllerManagerDown - annotations: - description: KubeControllerManager has disappeared from Prometheus target discovery. - runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown - summary: Target disappeared from Prometheus target discovery. 
- expr: | - absent(up{job="kube-controller-manager"} == 1) - for: 15m + - expr: | + histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) labels: - severity: critical + quantile: "0.99" + record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) + labels: + quantile: "0.9" + record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) + labels: + quantile: "0.5" + record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile diff --git a/apps/monitoring/manifests/other/testingPrometheusRule.yaml b/apps/monitoring/manifests/other/testingPrometheusRule.yaml new file mode 100644 index 000000000..b4efbb51d --- /dev/null +++ b/apps/monitoring/manifests/other/testingPrometheusRule.yaml @@ -0,0 +1,23 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + prometheus: k8s + role: alert-rules + name: testing-rules + namespace: monitoring +spec: + groups: + - name: testing.rules + rules: + - alert: CPUStealTimeHigh + annotations: + description: CPU Steal Time is very high on {{ $labels.instance }} hypervisor. + This can lead to VM being stalled. + runbook_url: https://github.com/thaum-xyz/ankhmorpork/blob/master/docs/runbooks/CPUStealTimeHigh.md + summary: High CPU Steal Time + expr: | + sum by (instance) (rate(node_cpu_seconds_total{mode="steal"}[3m])) / count by (instance) (node_cpu_seconds_total{mode="steal"}) > 0.1 + for: 20m + labels: + severity: warning diff --git a/apps/monitoring/manifests/other/thaumPrometheusRule.yaml b/apps/monitoring/manifests/other/thaumPrometheusRule.yaml new file mode 100644 index 000000000..21f95abb6 --- /dev/null +++ b/apps/monitoring/manifests/other/thaumPrometheusRule.yaml @@ -0,0 +1,86 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + prometheus: k8s + role: alert-rules + name: thaum-rules + namespace: monitoring +spec: + groups: + - name: custom node alert rules + rules: + - alert: PackagesAvailable + annotations: + description: '{{ $value }} packages are available for upgrade. Maybe it is + time to upgrade?' + runbook_url: https://github.com/thaum-xyz/ankhmorpork/blob/master/docs/runbooks/PackagesAvailable.md + summary: Packages are available for upgrade + expr: | + sum by (node,instance) (yum_upgrades_pending) > 200 + or + sum by (node,instance) (apt_upgrades_pending) > 200 + for: 48h + labels: + severity: info + - alert: RebootRequired + annotations: + description: Instance '{{ $labels.instance }}' was upgraded and now requires + a reboot. + runbook_url: https://github.com/thaum-xyz/ankhmorpork/blob/master/docs/runbooks/RebootRequired.md + summary: Reboot is required to finish package upgrade + expr: node_reboot_required > 0 + for: 4h + labels: + severity: info + - name: alert rules specific to thaum.xyz + rules: + - alert: FederatedPrometheusDown + annotations: + description: Remote Prometheus server {{ $labels.instance }} has been down + for more than 10 minutes. 
+ runbook_url: https://github.com/thaum-xyz/ankhmorpork/blob/master/docs/runbooks/FederatedPrometheusDown.md + summary: Federated Prometheus is down + expr: up{job="lancre"} == 0 + for: 20m + labels: + severity: warning + - alert: FilesystemReadOnly + annotations: + description: Filesystem went read-only on {{ $labels.instance }}. Check FS + for possible corruption. + summary: Filesystem went read-only possibly due to device error. + expr: | + node_filesystem_readonly{fstype=~"(vfat|ext4|xfs)"} != 0 + labels: + severity: critical + - alert: TouchscreenNotAvailable + annotations: + description: Powercycle device {{ $labels.instance }} to bring touchscreen + up + summary: Touchscreen not available + expr: | + devices_input_touchscreen_up == 0 or absent(devices_input_touchscreen_up) + for: 10m + labels: + severity: warning + - alert: TouchscreenNotAvailable + annotations: + description: Powercycle device {{ $labels.instance }} + summary: Touchscreen not available and automatic remediation failed to restore + it + expr: | + devices_input_touchscreen_up == 0 or absent(devices_input_touchscreen_up) + for: 1h + labels: + severity: critical + - alert: TemperaturesNotAvailable + annotations: + description: Temperature data is gone. Immediately switch off all relays + and check OW bus. + summary: Cannot obtain temperature data + expr: | + absent(evok_temperature_celsius) + for: 15m + labels: + severity: critical diff --git a/apps/monitoring/manifests/prober/02_config.yaml b/apps/monitoring/manifests/prober/02_config.yaml deleted file mode 100644 index 9fbc24c85..000000000 --- a/apps/monitoring/manifests/prober/02_config.yaml +++ /dev/null @@ -1,37 +0,0 @@ ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: prober-config - namespace: monitoring - labels: - app.kubernetes.io/name: prober -data: - config.yml: | - modules: - http_2xx: - prober: http - timeout: 5s - http: - valid_http_versions: ["HTTP/1.1", "HTTP/2.0", "HTTP/2"] - valid_status_codes: [] # Defaults to 2xx - method: GET - headers: - Content-Type: text/html - Content-Encoding: gzip - no_follow_redirects: false - fail_if_ssl: false - fail_if_not_ssl: false - # fail_if_header_not_matches: - # - header: X-mac-address - # regexp: '^([0-9A-Fa-f]{2}[:-]){5}([0-9A-Fa-f]{2})$' - tls_config: - insecure_skip_verify: false - preferred_ip_protocol: "ip4" # defaults to "ip6" - ip_protocol_fallback: false # no fallback to "ip6" - icmp: - prober: icmp - timeout: 15s - icmp: - preferred_ip_protocol: "ip4" - source_ip_address: "127.0.0.1" diff --git a/apps/monitoring/manifests/prober/03_service.yaml b/apps/monitoring/manifests/prober/03_service.yaml deleted file mode 100644 index a2aa8b78e..000000000 --- a/apps/monitoring/manifests/prober/03_service.yaml +++ /dev/null @@ -1,17 +0,0 @@ ---- -apiVersion: v1 -kind: Service -metadata: - labels: - app.kubernetes.io/name: prober - app.kubernetes.io/component: exporter - name: prober - namespace: monitoring -spec: - ports: - - name: http - port: 9115 - protocol: TCP - targetPort: http - selector: - app.kubernetes.io/name: prober diff --git a/apps/monitoring/manifests/prober/04_deployment.yaml b/apps/monitoring/manifests/prober/04_deployment.yaml deleted file mode 100644 index 13c135486..000000000 --- a/apps/monitoring/manifests/prober/04_deployment.yaml +++ /dev/null @@ -1,65 +0,0 @@ ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: prober - namespace: monitoring - labels: - app.kubernetes.io/name: prober - app.kubernetes.io/version: 0.18.0 - app.kubernetes.io/component: exporter -spec: -
replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: prober - template: - metadata: - labels: - app.kubernetes.io/name: prober - spec: - affinity: - podAntiAffinity: - preferredDuringSchedulingIgnoredDuringExecution: - - weight: 10 - podAffinityTerm: - labelSelector: - matchExpressions: - - key: app.kubernetes.io/name - operator: In - values: - - prober - topologyKey: kubernetes.io/hostname - containers: - - name: prober - image: quay.io/prometheus/blackbox-exporter:v0.18.0 - imagePullPolicy: IfNotPresent - args: - - --config.file=/etc/blackbox_exporter/config.yml - - --log.level=debug - ports: - - containerPort: 9115 - name: http - readinessProbe: - tcpSocket: - port: http - initialDelaySeconds: 1 - failureThreshold: 5 - timeoutSeconds: 10 - volumeMounts: - - mountPath: /etc/blackbox_exporter - name: config - readOnly: true - securityContext: - capabilities: - add: ['NET_RAW'] - resources: - requests: - cpu: 21m - memory: 16Mi - restartPolicy: Always - volumes: - - configMap: - defaultMode: 420 - name: prober-config - name: config diff --git a/apps/monitoring/manifests/prober/05_probers.yaml b/apps/monitoring/manifests/prober/05_probers.yaml deleted file mode 100644 index 1180ae870..000000000 --- a/apps/monitoring/manifests/prober/05_probers.yaml +++ /dev/null @@ -1,55 +0,0 @@ ---- -apiVersion: monitoring.coreos.com/v1 -kind: Probe -metadata: - name: prometheus-demo - namespace: monitoring -spec: - prober: - url: prober.monitoring.svc:9115 - module: http_2xx - targets: - staticConfig: - static: - - 'https://demo.do.prometheus.io' - - 'https://prometheus.demo.do.prometheus.io/-/healthy' - - 'https://alertmanager.demo.do.prometheus.io/-/healthy' - - 'https://node.demo.do.prometheus.io' - - 'https://grafana.demo.do.prometheus.io/api/health' - labels: - environment: prometheus.io ---- -apiVersion: monitoring.coreos.com/v1 -kind: Probe -metadata: - name: thaum-sites - namespace: monitoring -spec: - prober: - url: prober.monitoring.svc:9115 - module: http_2xx - targets: - staticConfig: - static: - - 'https://weirdo.blog/ghost' - - 'https://alchemyof.it/ghost' - - 'https://zmc.krupa.net.pl' - labels: - environment: thaum ---- -apiVersion: monitoring.coreos.com/v1 -kind: Probe -metadata: - name: ankhmorpork - namespace: monitoring -spec: - prober: - url: prober.monitoring.svc:9115 - module: http_2xx - targets: - ingress: - selector: - matchLabels: - probe: enabled - namespaceSelector: - any: true diff --git a/apps/monitoring/manifests/prober/clusterRole.yaml b/apps/monitoring/manifests/prober/clusterRole.yaml new file mode 100644 index 000000000..c7824058e --- /dev/null +++ b/apps/monitoring/manifests/prober/clusterRole.yaml @@ -0,0 +1,17 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: blackbox-exporter +rules: +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create diff --git a/apps/monitoring/manifests/prober/clusterRoleBinding.yaml b/apps/monitoring/manifests/prober/clusterRoleBinding.yaml new file mode 100644 index 000000000..7b3ae3209 --- /dev/null +++ b/apps/monitoring/manifests/prober/clusterRoleBinding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: blackbox-exporter +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: blackbox-exporter +subjects: +- kind: ServiceAccount + name: blackbox-exporter + namespace: monitoring diff --git 
a/apps/monitoring/manifests/prober/configuration.yaml b/apps/monitoring/manifests/prober/configuration.yaml new file mode 100644 index 000000000..0f5b03ebf --- /dev/null +++ b/apps/monitoring/manifests/prober/configuration.yaml @@ -0,0 +1,51 @@ +apiVersion: v1 +data: + config.yml: |- + "modules": + "http_2xx": + "http": + "preferred_ip_protocol": "ip4" + "prober": "http" + "http_post_2xx": + "http": + "method": "POST" + "preferred_ip_protocol": "ip4" + "prober": "http" + "irc_banner": + "prober": "tcp" + "tcp": + "preferred_ip_protocol": "ip4" + "query_response": + - "send": "NICK prober" + - "send": "USER prober prober prober :prober" + - "expect": "PING :([^ ]+)" + "send": "PONG ${1}" + - "expect": "^:[^ ]+ 001" + "pop3s_banner": + "prober": "tcp" + "tcp": + "preferred_ip_protocol": "ip4" + "query_response": + - "expect": "^+OK" + "tls": true + "tls_config": + "insecure_skip_verify": false + "ssh_banner": + "prober": "tcp" + "tcp": + "preferred_ip_protocol": "ip4" + "query_response": + - "expect": "^SSH-2.0-" + "tcp_connect": + "prober": "tcp" + "tcp": + "preferred_ip_protocol": "ip4" +kind: ConfigMap +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: blackbox-exporter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.18.0 + name: blackbox-exporter-configuration + namespace: monitoring diff --git a/apps/monitoring/manifests/prober/deployment.yaml b/apps/monitoring/manifests/prober/deployment.yaml new file mode 100644 index 000000000..4377add54 --- /dev/null +++ b/apps/monitoring/manifests/prober/deployment.yaml @@ -0,0 +1,109 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: blackbox-exporter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.18.0 + name: blackbox-exporter + namespace: monitoring +spec: + replicas: 2 + selector: + matchLabels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: blackbox-exporter + app.kubernetes.io/part-of: kube-prometheus + template: + metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: blackbox-exporter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.18.0 + spec: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchExpressions: + - key: app.kubernetes.io/name + operator: In + values: + - blackbox-exporter + topologyKey: kubernetes.io/hostname + weight: 100 + containers: + - args: + - --config.file=/etc/blackbox_exporter/config.yml + - --web.listen-address=:19115 + image: quay.io/prometheus/blackbox-exporter:v0.18.0 + name: blackbox-exporter + ports: + - containerPort: 19115 + name: http + resources: + limits: + cpu: 21m + memory: 42Mi + requests: + cpu: 21m + memory: 16Mi + securityContext: + runAsNonRoot: true + runAsUser: 65534 + volumeMounts: + - mountPath: /etc/blackbox_exporter/ + name: config + readOnly: true + - args: + - --webhook-url=http://localhost:19115/-/reload + - --volume-dir=/etc/blackbox_exporter/ + image: jimmidyson/configmap-reload:v0.5.0 + name: module-configmap-reloader + resources: + limits: + cpu: 21m + memory: 42Mi + requests: + cpu: 21m + memory: 16Mi + securityContext: + runAsNonRoot: true + runAsUser: 65534 + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: FallbackToLogsOnError + volumeMounts: + - mountPath: /etc/blackbox_exporter/ + name: config + readOnly: true + - args: + - --logtostderr 
+ - --secure-listen-address=:9115 + - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 + - --upstream=http://127.0.0.1:19115/ + image: quay.io/brancz/kube-rbac-proxy:v0.8.0 + name: kube-rbac-proxy + ports: + - containerPort: 9115 + name: https + resources: + limits: + cpu: 20m + memory: 40Mi + requests: + cpu: 10m + memory: 20Mi + securityContext: + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 + nodeSelector: + kubernetes.io/os: linux + serviceAccountName: blackbox-exporter + volumes: + - configMap: + name: blackbox-exporter-configuration + name: config diff --git a/apps/monitoring/manifests/prober/ingressProbe.yaml b/apps/monitoring/manifests/prober/ingressProbe.yaml new file mode 100644 index 000000000..db278c3a0 --- /dev/null +++ b/apps/monitoring/manifests/prober/ingressProbe.yaml @@ -0,0 +1,21 @@ +apiVersion: monitoring.coreos.com/v1 +kind: Probe +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: blackbox-exporter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.18.0 + name: ankhmorpork + namespace: monitoring +spec: + module: http_2xx + prober: + url: blackbox-exporter.monitoring.svc:19115 + targets: + ingress: + namespaceSelector: + any: true + selector: + matchLabels: + probe: enabled diff --git a/apps/monitoring/manifests/prober/promDemoProbe.yaml b/apps/monitoring/manifests/prober/promDemoProbe.yaml new file mode 100644 index 000000000..7889f79a8 --- /dev/null +++ b/apps/monitoring/manifests/prober/promDemoProbe.yaml @@ -0,0 +1,24 @@ +apiVersion: monitoring.coreos.com/v1 +kind: Probe +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: blackbox-exporter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.18.0 + name: prometheus-demo + namespace: monitoring +spec: + module: http_2xx + prober: + url: blackbox-exporter.monitoring.svc:19115 + targets: + staticConfig: + labels: + environment: prometheus.io + static: + - https://demo.do.prometheus.io + - https://prometheus.demo.do.prometheus.io/-/healthy + - https://alertmanager.demo.do.prometheus.io/-/healthy + - https://node.demo.do.prometheus.io + - https://grafana.demo.do.prometheus.io/api/health diff --git a/apps/monitoring/manifests/prober/service.yaml b/apps/monitoring/manifests/prober/service.yaml new file mode 100644 index 000000000..8b568e274 --- /dev/null +++ b/apps/monitoring/manifests/prober/service.yaml @@ -0,0 +1,22 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: blackbox-exporter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.18.0 + name: blackbox-exporter + namespace: monitoring +spec: + ports: + - name: https + port: 9115 + targetPort: https + - name: probe + port: 19115 + targetPort: http + selector: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: blackbox-exporter + app.kubernetes.io/part-of: kube-prometheus diff --git a/apps/monitoring/manifests/prober/serviceAccount.yaml b/apps/monitoring/manifests/prober/serviceAccount.yaml new file mode 100644 index 000000000..ac2acefb2 --- /dev/null +++ b/apps/monitoring/manifests/prober/serviceAccount.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: blackbox-exporter + namespace: monitoring diff 
--git a/apps/monitoring/manifests/prober/serviceMonitor.yaml b/apps/monitoring/manifests/prober/serviceMonitor.yaml new file mode 100644 index 000000000..ab7b50386 --- /dev/null +++ b/apps/monitoring/manifests/prober/serviceMonitor.yaml @@ -0,0 +1,24 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: blackbox-exporter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.18.0 + name: blackbox-exporter + namespace: monitoring +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 30s + path: /metrics + port: https + scheme: https + tlsConfig: + insecureSkipVerify: true + selector: + matchLabels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: blackbox-exporter + app.kubernetes.io/part-of: kube-prometheus diff --git a/apps/monitoring/manifests/prober/thaumProbe.yaml b/apps/monitoring/manifests/prober/thaumProbe.yaml new file mode 100644 index 000000000..fa30da47a --- /dev/null +++ b/apps/monitoring/manifests/prober/thaumProbe.yaml @@ -0,0 +1,22 @@ +apiVersion: monitoring.coreos.com/v1 +kind: Probe +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: blackbox-exporter + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.18.0 + name: thaum-sites + namespace: monitoring +spec: + module: http_2xx + prober: + url: blackbox-exporter.monitoring.svc:19115 + targets: + staticConfig: + labels: + environment: thaum.xyz + static: + - https://weirdo.blog/ghost + - https://alchemyof.it/ghost + - https://zmc.krupa.net.pl diff --git a/apps/monitoring/manifests/prometheus-operator/0alertmanagerConfigCustomResourceDefinition.yaml b/apps/monitoring/manifests/prometheus-operator/0alertmanagerConfigCustomResourceDefinition.yaml new file mode 100644 index 000000000..6375123a1 --- /dev/null +++ b/apps/monitoring/manifests/prometheus-operator/0alertmanagerConfigCustomResourceDefinition.yaml @@ -0,0 +1,2436 @@ +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.4.1 + creationTimestamp: null + name: alertmanagerconfigs.monitoring.coreos.com +spec: + group: monitoring.coreos.com + names: + kind: AlertmanagerConfig + listKind: AlertmanagerConfigList + plural: alertmanagerconfigs + singular: alertmanagerconfig + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: AlertmanagerConfig defines a namespaced AlertmanagerConfig to + be aggregated across multiple namespaces configuring one Alertmanager cluster. + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' + type: string + metadata: + type: object + spec: + description: AlertmanagerConfigSpec is a specification of the desired + behavior of the Alertmanager configuration. 
By definition, the Alertmanager + configuration only applies to alerts for which the `namespace` label + is equal to the namespace of the AlertmanagerConfig resource. + properties: + inhibitRules: + description: List of inhibition rules. The rules will only apply to + alerts matching the resource’s namespace. + items: + description: InhibitRule defines an inhibition rule that allows + to mute alerts when other alerts are already firing. See https://prometheus.io/docs/alerting/latest/configuration/#inhibit_rule + properties: + equal: + description: Labels that must have an equal value in the source + and target alert for the inhibition to take effect. + items: + type: string + type: array + sourceMatch: + description: Matchers for which one or more alerts have to exist + for the inhibition to take effect. The operator enforces that + the alert matches the resource’s namespace. + items: + description: Matcher defines how to match on alert's labels. + properties: + name: + description: Label to match. + minLength: 1 + type: string + regex: + description: Whether to match on equality (false) or regular-expression + (true). + type: boolean + value: + description: Label value to match. + type: string + required: + - name + type: object + type: array + targetMatch: + description: Matchers that have to be fulfilled in the alerts + to be muted. The operator enforces that the alert matches + the resource’s namespace. + items: + description: Matcher defines how to match on alert's labels. + properties: + name: + description: Label to match. + minLength: 1 + type: string + regex: + description: Whether to match on equality (false) or regular-expression + (true). + type: boolean + value: + description: Label value to match. + type: string + required: + - name + type: object + type: array + type: object + type: array + receivers: + description: List of receivers. + items: + description: Receiver defines one or more notification integrations. + properties: + emailConfigs: + description: List of Email configurations. + items: + description: EmailConfig configures notifications via Email. + properties: + authIdentity: + description: The identity to use for authentication. + type: string + authPassword: + description: The secret's key that contains the password + to use for authentication. The secret needs to be in + the same namespace as the AlertmanagerConfig object + and accessible by the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, + uid?' + type: string + optional: + description: Specify whether the Secret or its key + must be defined + type: boolean + required: + - key + type: object + authSecret: + description: The secret's key that contains the CRAM-MD5 + secret. The secret needs to be in the same namespace + as the AlertmanagerConfig object and accessible by the + Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, + uid?' 
+ type: string + optional: + description: Specify whether the Secret or its key + must be defined + type: boolean + required: + - key + type: object + authUsername: + description: The username to use for authentication. + type: string + from: + description: The sender address. + type: string + headers: + description: Further headers email header key/value pairs. + Overrides any headers previously set by the notification + implementation. + items: + description: KeyValue defines a (key, value) tuple. + properties: + key: + description: Key of the tuple. + minLength: 1 + type: string + value: + description: Value of the tuple. + type: string + required: + - key + - value + type: object + type: array + hello: + description: The hostname to identify to the SMTP server. + type: string + html: + description: The HTML body of the email notification. + type: string + requireTLS: + description: The SMTP TLS requirement. Note that Go does + not support unencrypted connections to remote SMTP endpoints. + type: boolean + sendResolved: + description: Whether or not to notify about resolved alerts. + type: boolean + smarthost: + description: The SMTP host through which emails are sent. + type: string + text: + description: The text body of the email notification. + type: string + tlsConfig: + description: TLS configuration + properties: + ca: + description: Struct containing the CA cert to use + for the targets. + properties: + configMap: + description: ConfigMap containing data to use + for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap + or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for + the targets. + properties: + key: + description: The key of the secret to select + from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret or + its key must be defined + type: boolean + required: + - key + type: object + type: object + cert: + description: Struct containing the client cert file + for the targets. + properties: + configMap: + description: ConfigMap containing data to use + for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap + or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use for + the targets. + properties: + key: + description: The key of the secret to select + from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' 
+ type: string + optional: + description: Specify whether the Secret or + its key must be defined + type: boolean + required: + - key + type: object + type: object + insecureSkipVerify: + description: Disable target certificate validation. + type: boolean + keySecret: + description: Secret containing the client key file + for the targets. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, + uid?' + type: string + optional: + description: Specify whether the Secret or its + key must be defined + type: boolean + required: + - key + type: object + serverName: + description: Used to verify the hostname for the targets. + type: string + type: object + to: + description: The email address to send notifications to. + type: string + type: object + type: array + name: + description: Name of the receiver. Must be unique across all + items from the list. + minLength: 1 + type: string + opsgenieConfigs: + description: List of OpsGenie configurations. + items: + description: OpsGenieConfig configures notifications via OpsGenie. + See https://prometheus.io/docs/alerting/latest/configuration/#opsgenie_config + properties: + apiKey: + description: The secret's key that contains the OpsGenie + API key. The secret needs to be in the same namespace + as the AlertmanagerConfig object and accessible by the + Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, + uid?' + type: string + optional: + description: Specify whether the Secret or its key + must be defined + type: boolean + required: + - key + type: object + apiURL: + description: The URL to send OpsGenie API requests to. + type: string + description: + description: Description of the incident. + type: string + details: + description: A set of arbitrary key/value pairs that provide + further detail about the incident. + items: + description: KeyValue defines a (key, value) tuple. + properties: + key: + description: Key of the tuple. + minLength: 1 + type: string + value: + description: Value of the tuple. + type: string + required: + - key + - value + type: object + type: array + httpConfig: + description: HTTP client configuration. + properties: + basicAuth: + description: BasicAuth for the client. + properties: + password: + description: The secret in the service monitor + namespace that contains the password for authentication. + properties: + key: + description: The key of the secret to select + from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret or + its key must be defined + type: boolean + required: + - key + type: object + username: + description: The secret in the service monitor + namespace that contains the username for authentication. + properties: + key: + description: The key of the secret to select + from. Must be a valid secret key. 
+ type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret or + its key must be defined + type: boolean + required: + - key + type: object + type: object + bearerTokenSecret: + description: The secret's key that contains the bearer + token to be used by the client for authentication. + The secret needs to be in the same namespace as + the AlertmanagerConfig object and accessible by + the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, + uid?' + type: string + optional: + description: Specify whether the Secret or its + key must be defined + type: boolean + required: + - key + type: object + proxyURL: + description: Optional proxy URL. + type: string + tlsConfig: + description: TLS configuration for the client. + properties: + ca: + description: Struct containing the CA cert to + use for the targets. + properties: + configMap: + description: ConfigMap containing data to + use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap + or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use + for the targets. + properties: + key: + description: The key of the secret to + select from. Must be a valid secret + key. + type: string + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret + or its key must be defined + type: boolean + required: + - key + type: object + type: object + cert: + description: Struct containing the client cert + file for the targets. + properties: + configMap: + description: ConfigMap containing data to + use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap + or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use + for the targets. + properties: + key: + description: The key of the secret to + select from. Must be a valid secret + key. + type: string + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' 
+ type: string + optional: + description: Specify whether the Secret + or its key must be defined + type: boolean + required: + - key + type: object + type: object + insecureSkipVerify: + description: Disable target certificate validation. + type: boolean + keySecret: + description: Secret containing the client key + file for the targets. + properties: + key: + description: The key of the secret to select + from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret or + its key must be defined + type: boolean + required: + - key + type: object + serverName: + description: Used to verify the hostname for the + targets. + type: string + type: object + type: object + message: + description: Alert text limited to 130 characters. + type: string + note: + description: Additional alert note. + type: string + priority: + description: Priority level of alert. Possible values + are P1, P2, P3, P4, and P5. + type: string + responders: + description: List of responders responsible for notifications. + items: + description: OpsGenieConfigResponder defines a responder + to an incident. One of `id`, `name` or `username` + has to be defined. + properties: + id: + description: ID of the responder. + type: string + name: + description: Name of the responder. + type: string + type: + description: Type of responder. + minLength: 1 + type: string + username: + description: Username of the responder. + type: string + required: + - type + type: object + type: array + sendResolved: + description: Whether or not to notify about resolved alerts. + type: boolean + source: + description: Backlink to the sender of the notification. + type: string + tags: + description: Comma separated list of tags attached to + the notifications. + type: string + type: object + type: array + pagerdutyConfigs: + description: List of PagerDuty configurations. + items: + description: PagerDutyConfig configures notifications via + PagerDuty. See https://prometheus.io/docs/alerting/latest/configuration/#pagerduty_config + properties: + class: + description: The class/type of the event. + type: string + client: + description: Client identification. + type: string + clientURL: + description: Backlink to the sender of notification. + type: string + component: + description: The part or component of the affected system + that is broken. + type: string + description: + description: Description of the incident. + type: string + details: + description: Arbitrary key/value pairs that provide further + detail about the incident. + items: + description: KeyValue defines a (key, value) tuple. + properties: + key: + description: Key of the tuple. + minLength: 1 + type: string + value: + description: Value of the tuple. + type: string + required: + - key + - value + type: object + type: array + group: + description: A cluster or grouping of sources. + type: string + httpConfig: + description: HTTP client configuration. + properties: + basicAuth: + description: BasicAuth for the client. + properties: + password: + description: The secret in the service monitor + namespace that contains the password for authentication. + properties: + key: + description: The key of the secret to select + from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. 
More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret or + its key must be defined + type: boolean + required: + - key + type: object + username: + description: The secret in the service monitor + namespace that contains the username for authentication. + properties: + key: + description: The key of the secret to select + from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret or + its key must be defined + type: boolean + required: + - key + type: object + type: object + bearerTokenSecret: + description: The secret's key that contains the bearer + token to be used by the client for authentication. + The secret needs to be in the same namespace as + the AlertmanagerConfig object and accessible by + the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, + uid?' + type: string + optional: + description: Specify whether the Secret or its + key must be defined + type: boolean + required: + - key + type: object + proxyURL: + description: Optional proxy URL. + type: string + tlsConfig: + description: TLS configuration for the client. + properties: + ca: + description: Struct containing the CA cert to + use for the targets. + properties: + configMap: + description: ConfigMap containing data to + use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap + or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use + for the targets. + properties: + key: + description: The key of the secret to + select from. Must be a valid secret + key. + type: string + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret + or its key must be defined + type: boolean + required: + - key + type: object + type: object + cert: + description: Struct containing the client cert + file for the targets. + properties: + configMap: + description: ConfigMap containing data to + use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' 
+ type: string + optional: + description: Specify whether the ConfigMap + or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use + for the targets. + properties: + key: + description: The key of the secret to + select from. Must be a valid secret + key. + type: string + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret + or its key must be defined + type: boolean + required: + - key + type: object + type: object + insecureSkipVerify: + description: Disable target certificate validation. + type: boolean + keySecret: + description: Secret containing the client key + file for the targets. + properties: + key: + description: The key of the secret to select + from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret or + its key must be defined + type: boolean + required: + - key + type: object + serverName: + description: Used to verify the hostname for the + targets. + type: string + type: object + type: object + routingKey: + description: The secret's key that contains the PagerDuty + integration key (when using Events API v2). Either this + field or `serviceKey` needs to be defined. The secret + needs to be in the same namespace as the AlertmanagerConfig + object and accessible by the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, + uid?' + type: string + optional: + description: Specify whether the Secret or its key + must be defined + type: boolean + required: + - key + type: object + sendResolved: + description: Whether or not to notify about resolved alerts. + type: boolean + serviceKey: + description: The secret's key that contains the PagerDuty + service key (when using integration type "Prometheus"). + Either this field or `routingKey` needs to be defined. + The secret needs to be in the same namespace as the + AlertmanagerConfig object and accessible by the Prometheus + Operator. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, + uid?' + type: string + optional: + description: Specify whether the Secret or its key + must be defined + type: boolean + required: + - key + type: object + severity: + description: Severity of the incident. + type: string + url: + description: The URL to send requests to. + type: string + type: object + type: array + pushoverConfigs: + description: List of Pushover configurations. + items: + description: PushoverConfig configures notifications via Pushover. 
+ See https://prometheus.io/docs/alerting/latest/configuration/#pushover_config + properties: + expire: + description: How long your notification will continue + to be retried for, unless the user acknowledges the + notification. + type: string + html: + description: Whether notification message is HTML or plain + text. + type: boolean + httpConfig: + description: HTTP client configuration. + properties: + basicAuth: + description: BasicAuth for the client. + properties: + password: + description: The secret in the service monitor + namespace that contains the password for authentication. + properties: + key: + description: The key of the secret to select + from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret or + its key must be defined + type: boolean + required: + - key + type: object + username: + description: The secret in the service monitor + namespace that contains the username for authentication. + properties: + key: + description: The key of the secret to select + from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret or + its key must be defined + type: boolean + required: + - key + type: object + type: object + bearerTokenSecret: + description: The secret's key that contains the bearer + token to be used by the client for authentication. + The secret needs to be in the same namespace as + the AlertmanagerConfig object and accessible by + the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, + uid?' + type: string + optional: + description: Specify whether the Secret or its + key must be defined + type: boolean + required: + - key + type: object + proxyURL: + description: Optional proxy URL. + type: string + tlsConfig: + description: TLS configuration for the client. + properties: + ca: + description: Struct containing the CA cert to + use for the targets. + properties: + configMap: + description: ConfigMap containing data to + use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap + or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use + for the targets. + properties: + key: + description: The key of the secret to + select from. Must be a valid secret + key. + type: string + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' 
+ type: string + optional: + description: Specify whether the Secret + or its key must be defined + type: boolean + required: + - key + type: object + type: object + cert: + description: Struct containing the client cert + file for the targets. + properties: + configMap: + description: ConfigMap containing data to + use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap + or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use + for the targets. + properties: + key: + description: The key of the secret to + select from. Must be a valid secret + key. + type: string + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret + or its key must be defined + type: boolean + required: + - key + type: object + type: object + insecureSkipVerify: + description: Disable target certificate validation. + type: boolean + keySecret: + description: Secret containing the client key + file for the targets. + properties: + key: + description: The key of the secret to select + from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret or + its key must be defined + type: boolean + required: + - key + type: object + serverName: + description: Used to verify the hostname for the + targets. + type: string + type: object + type: object + message: + description: Notification message. + type: string + priority: + description: Priority, see https://pushover.net/api#priority + type: string + retry: + description: How often the Pushover servers will send + the same notification to the user. Must be at least + 30 seconds. + type: string + sendResolved: + description: Whether or not to notify about resolved alerts. + type: boolean + sound: + description: The name of one of the sounds supported by + device clients to override the user's default sound + choice + type: string + title: + description: Notification title. + type: string + token: + description: The secret's key that contains the registered + application’s API token, see https://pushover.net/apps. + The secret needs to be in the same namespace as the + AlertmanagerConfig object and accessible by the Prometheus + Operator. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, + uid?' + type: string + optional: + description: Specify whether the Secret or its key + must be defined + type: boolean + required: + - key + type: object + url: + description: A supplementary URL shown alongside the message. 
+ type: string + urlTitle: + description: A title for supplementary URL, otherwise + just the URL is shown + type: string + userKey: + description: The secret's key that contains the recipient + user’s user key. The secret needs to be in the same + namespace as the AlertmanagerConfig object and accessible + by the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, + uid?' + type: string + optional: + description: Specify whether the Secret or its key + must be defined + type: boolean + required: + - key + type: object + type: object + type: array + slackConfigs: + description: List of Slack configurations. + items: + description: SlackConfig configures notifications via Slack. + See https://prometheus.io/docs/alerting/latest/configuration/#slack_config + properties: + actions: + description: A list of Slack actions that are sent with + each notification. + items: + description: SlackAction configures a single Slack action + that is sent with each notification. See https://api.slack.com/docs/message-attachments#action_fields + and https://api.slack.com/docs/message-buttons for + more information. + properties: + confirm: + description: SlackConfirmationField protect users + from destructive actions or particularly distinguished + decisions by asking them to confirm their button + click one more time. See https://api.slack.com/docs/interactive-message-field-guide#confirmation_fields + for more information. + properties: + dismissText: + type: string + okText: + type: string + text: + minLength: 1 + type: string + title: + type: string + required: + - text + type: object + name: + type: string + style: + type: string + text: + minLength: 1 + type: string + type: + minLength: 1 + type: string + url: + type: string + value: + type: string + required: + - text + - type + type: object + type: array + apiURL: + description: The secret's key that contains the Slack + webhook URL. The secret needs to be in the same namespace + as the AlertmanagerConfig object and accessible by the + Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, + uid?' + type: string + optional: + description: Specify whether the Secret or its key + must be defined + type: boolean + required: + - key + type: object + callbackId: + type: string + channel: + description: The channel or user to send notifications + to. + type: string + color: + type: string + fallback: + type: string + fields: + description: A list of Slack fields that are sent with + each notification. + items: + description: SlackField configures a single Slack field + that is sent with each notification. Each field must + contain a title, value, and optionally, a boolean + value to indicate if the field is short enough to + be displayed next to other fields designated as short. + See https://api.slack.com/docs/message-attachments#fields + for more information. 
+ properties: + short: + type: boolean + title: + minLength: 1 + type: string + value: + minLength: 1 + type: string + required: + - title + - value + type: object + type: array + footer: + type: string + httpConfig: + description: HTTP client configuration. + properties: + basicAuth: + description: BasicAuth for the client. + properties: + password: + description: The secret in the service monitor + namespace that contains the password for authentication. + properties: + key: + description: The key of the secret to select + from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret or + its key must be defined + type: boolean + required: + - key + type: object + username: + description: The secret in the service monitor + namespace that contains the username for authentication. + properties: + key: + description: The key of the secret to select + from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret or + its key must be defined + type: boolean + required: + - key + type: object + type: object + bearerTokenSecret: + description: The secret's key that contains the bearer + token to be used by the client for authentication. + The secret needs to be in the same namespace as + the AlertmanagerConfig object and accessible by + the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, + uid?' + type: string + optional: + description: Specify whether the Secret or its + key must be defined + type: boolean + required: + - key + type: object + proxyURL: + description: Optional proxy URL. + type: string + tlsConfig: + description: TLS configuration for the client. + properties: + ca: + description: Struct containing the CA cert to + use for the targets. + properties: + configMap: + description: ConfigMap containing data to + use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap + or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use + for the targets. + properties: + key: + description: The key of the secret to + select from. Must be a valid secret + key. + type: string + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' 
+ type: string + optional: + description: Specify whether the Secret + or its key must be defined + type: boolean + required: + - key + type: object + type: object + cert: + description: Struct containing the client cert + file for the targets. + properties: + configMap: + description: ConfigMap containing data to + use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap + or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use + for the targets. + properties: + key: + description: The key of the secret to + select from. Must be a valid secret + key. + type: string + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret + or its key must be defined + type: boolean + required: + - key + type: object + type: object + insecureSkipVerify: + description: Disable target certificate validation. + type: boolean + keySecret: + description: Secret containing the client key + file for the targets. + properties: + key: + description: The key of the secret to select + from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret or + its key must be defined + type: boolean + required: + - key + type: object + serverName: + description: Used to verify the hostname for the + targets. + type: string + type: object + type: object + iconEmoji: + type: string + iconURL: + type: string + imageURL: + type: string + linkNames: + type: boolean + mrkdwnIn: + items: + type: string + type: array + pretext: + type: string + sendResolved: + description: Whether or not to notify about resolved alerts. + type: boolean + shortFields: + type: boolean + text: + type: string + thumbURL: + type: string + title: + type: string + titleLink: + type: string + username: + type: string + type: object + type: array + victoropsConfigs: + description: List of VictorOps configurations. + items: + description: VictorOpsConfig configures notifications via + VictorOps. See https://prometheus.io/docs/alerting/latest/configuration/#victorops_config + properties: + apiKey: + description: The secret's key that contains the API key + to use when talking to the VictorOps API. The secret + needs to be in the same namespace as the AlertmanagerConfig + object and accessible by the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, + uid?' + type: string + optional: + description: Specify whether the Secret or its key + must be defined + type: boolean + required: + - key + type: object + apiUrl: + description: The VictorOps API URL. 
+ type: string + customFields: + description: Additional custom fields for notification. + items: + description: KeyValue defines a (key, value) tuple. + properties: + key: + description: Key of the tuple. + minLength: 1 + type: string + value: + description: Value of the tuple. + type: string + required: + - key + - value + type: object + type: array + entityDisplayName: + description: Contains summary of the alerted problem. + type: string + httpConfig: + description: The HTTP client's configuration. + properties: + basicAuth: + description: BasicAuth for the client. + properties: + password: + description: The secret in the service monitor + namespace that contains the password for authentication. + properties: + key: + description: The key of the secret to select + from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret or + its key must be defined + type: boolean + required: + - key + type: object + username: + description: The secret in the service monitor + namespace that contains the username for authentication. + properties: + key: + description: The key of the secret to select + from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret or + its key must be defined + type: boolean + required: + - key + type: object + type: object + bearerTokenSecret: + description: The secret's key that contains the bearer + token to be used by the client for authentication. + The secret needs to be in the same namespace as + the AlertmanagerConfig object and accessible by + the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, + uid?' + type: string + optional: + description: Specify whether the Secret or its + key must be defined + type: boolean + required: + - key + type: object + proxyURL: + description: Optional proxy URL. + type: string + tlsConfig: + description: TLS configuration for the client. + properties: + ca: + description: Struct containing the CA cert to + use for the targets. + properties: + configMap: + description: ConfigMap containing data to + use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap + or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use + for the targets. + properties: + key: + description: The key of the secret to + select from. Must be a valid secret + key. + type: string + name: + description: 'Name of the referent. 
More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret + or its key must be defined + type: boolean + required: + - key + type: object + type: object + cert: + description: Struct containing the client cert + file for the targets. + properties: + configMap: + description: ConfigMap containing data to + use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap + or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use + for the targets. + properties: + key: + description: The key of the secret to + select from. Must be a valid secret + key. + type: string + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret + or its key must be defined + type: boolean + required: + - key + type: object + type: object + insecureSkipVerify: + description: Disable target certificate validation. + type: boolean + keySecret: + description: Secret containing the client key + file for the targets. + properties: + key: + description: The key of the secret to select + from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret or + its key must be defined + type: boolean + required: + - key + type: object + serverName: + description: Used to verify the hostname for the + targets. + type: string + type: object + type: object + messageType: + description: Describes the behavior of the alert (CRITICAL, + WARNING, INFO). + type: string + monitoringTool: + description: The monitoring tool the state message is + from. + type: string + routingKey: + description: A key used to map the alert to a team. + type: string + sendResolved: + description: Whether or not to notify about resolved alerts. + type: boolean + stateMessage: + description: Contains long explanation of the alerted + problem. + type: string + type: object + type: array + webhookConfigs: + description: List of webhook configurations. + items: + description: WebhookConfig configures notifications via a + generic receiver supporting the webhook payload. See https://prometheus.io/docs/alerting/latest/configuration/#webhook_config + properties: + httpConfig: + description: HTTP client configuration. + properties: + basicAuth: + description: BasicAuth for the client. + properties: + password: + description: The secret in the service monitor + namespace that contains the password for authentication. + properties: + key: + description: The key of the secret to select + from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. 
More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret or + its key must be defined + type: boolean + required: + - key + type: object + username: + description: The secret in the service monitor + namespace that contains the username for authentication. + properties: + key: + description: The key of the secret to select + from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret or + its key must be defined + type: boolean + required: + - key + type: object + type: object + bearerTokenSecret: + description: The secret's key that contains the bearer + token to be used by the client for authentication. + The secret needs to be in the same namespace as + the AlertmanagerConfig object and accessible by + the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, + uid?' + type: string + optional: + description: Specify whether the Secret or its + key must be defined + type: boolean + required: + - key + type: object + proxyURL: + description: Optional proxy URL. + type: string + tlsConfig: + description: TLS configuration for the client. + properties: + ca: + description: Struct containing the CA cert to + use for the targets. + properties: + configMap: + description: ConfigMap containing data to + use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap + or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use + for the targets. + properties: + key: + description: The key of the secret to + select from. Must be a valid secret + key. + type: string + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret + or its key must be defined + type: boolean + required: + - key + type: object + type: object + cert: + description: Struct containing the client cert + file for the targets. + properties: + configMap: + description: ConfigMap containing data to + use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' 
+ type: string + optional: + description: Specify whether the ConfigMap + or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use + for the targets. + properties: + key: + description: The key of the secret to + select from. Must be a valid secret + key. + type: string + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret + or its key must be defined + type: boolean + required: + - key + type: object + type: object + insecureSkipVerify: + description: Disable target certificate validation. + type: boolean + keySecret: + description: Secret containing the client key + file for the targets. + properties: + key: + description: The key of the secret to select + from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret or + its key must be defined + type: boolean + required: + - key + type: object + serverName: + description: Used to verify the hostname for the + targets. + type: string + type: object + type: object + maxAlerts: + description: Maximum number of alerts to be sent per webhook + message. When 0, all alerts are included. + format: int32 + minimum: 0 + type: integer + sendResolved: + description: Whether or not to notify about resolved alerts. + type: boolean + url: + description: The URL to send HTTP POST requests to. `urlSecret` + takes precedence over `url`. One of `urlSecret` and + `url` should be defined. + type: string + urlSecret: + description: The secret's key that contains the webhook + URL to send HTTP requests to. `urlSecret` takes precedence + over `url`. One of `urlSecret` and `url` should be defined. + The secret needs to be in the same namespace as the + AlertmanagerConfig object and accessible by the Prometheus + Operator. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, + uid?' + type: string + optional: + description: Specify whether the Secret or its key + must be defined + type: boolean + required: + - key + type: object + type: object + type: array + wechatConfigs: + description: List of WeChat configurations. + items: + description: WeChatConfig configures notifications via WeChat. + See https://prometheus.io/docs/alerting/latest/configuration/#wechat_config + properties: + agentID: + type: string + apiSecret: + description: The secret's key that contains the WeChat + API key. The secret needs to be in the same namespace + as the AlertmanagerConfig object and accessible by the + Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, + uid?' 
+ type: string + optional: + description: Specify whether the Secret or its key + must be defined + type: boolean + required: + - key + type: object + apiURL: + description: The WeChat API URL. + type: string + corpID: + description: The corp id for authentication. + type: string + httpConfig: + description: HTTP client configuration. + properties: + basicAuth: + description: BasicAuth for the client. + properties: + password: + description: The secret in the service monitor + namespace that contains the password for authentication. + properties: + key: + description: The key of the secret to select + from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret or + its key must be defined + type: boolean + required: + - key + type: object + username: + description: The secret in the service monitor + namespace that contains the username for authentication. + properties: + key: + description: The key of the secret to select + from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret or + its key must be defined + type: boolean + required: + - key + type: object + type: object + bearerTokenSecret: + description: The secret's key that contains the bearer + token to be used by the client for authentication. + The secret needs to be in the same namespace as + the AlertmanagerConfig object and accessible by + the Prometheus Operator. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, kind, + uid?' + type: string + optional: + description: Specify whether the Secret or its + key must be defined + type: boolean + required: + - key + type: object + proxyURL: + description: Optional proxy URL. + type: string + tlsConfig: + description: TLS configuration for the client. + properties: + ca: + description: Struct containing the CA cert to + use for the targets. + properties: + configMap: + description: ConfigMap containing data to + use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap + or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use + for the targets. + properties: + key: + description: The key of the secret to + select from. Must be a valid secret + key. + type: string + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' 
+ type: string + optional: + description: Specify whether the Secret + or its key must be defined + type: boolean + required: + - key + type: object + type: object + cert: + description: Struct containing the client cert + file for the targets. + properties: + configMap: + description: ConfigMap containing data to + use for the targets. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the ConfigMap + or its key must be defined + type: boolean + required: + - key + type: object + secret: + description: Secret containing data to use + for the targets. + properties: + key: + description: The key of the secret to + select from. Must be a valid secret + key. + type: string + name: + description: 'Name of the referent. More + info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret + or its key must be defined + type: boolean + required: + - key + type: object + type: object + insecureSkipVerify: + description: Disable target certificate validation. + type: boolean + keySecret: + description: Secret containing the client key + file for the targets. + properties: + key: + description: The key of the secret to select + from. Must be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: + https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + TODO: Add other useful fields. apiVersion, + kind, uid?' + type: string + optional: + description: Specify whether the Secret or + its key must be defined + type: boolean + required: + - key + type: object + serverName: + description: Used to verify the hostname for the + targets. + type: string + type: object + type: object + message: + description: API request data as defined by the WeChat + API. + type: string + messageType: + type: string + sendResolved: + description: Whether or not to notify about resolved alerts. + type: boolean + toParty: + type: string + toTag: + type: string + toUser: + type: string + type: object + type: array + required: + - name + type: object + type: array + route: + description: The Alertmanager route definition for alerts matching + the resource’s namespace. If present, it will be added to the generated + Alertmanager configuration as a first-level route. + properties: + continue: + description: Boolean indicating whether an alert should continue + matching subsequent sibling nodes. It will always be overridden + to true for the first-level route by the Prometheus operator. + type: boolean + groupBy: + description: List of labels to group by. + items: + type: string + type: array + groupInterval: + description: How long to wait before sending an updated notification. + Must match the regular expression `[0-9]+(ms|s|m|h)` (milliseconds + seconds minutes hours). + type: string + groupWait: + description: How long to wait before sending the initial notification. + Must match the regular expression `[0-9]+(ms|s|m|h)` (milliseconds + seconds minutes hours). + type: string + matchers: + description: 'List of matchers that the alert’s labels should + match. 
For the first level route, the operator removes any existing + equality and regexp matcher on the `namespace` label and adds + a `namespace: ` matcher.' + items: + description: Matcher defines how to match on alert's labels. + properties: + name: + description: Label to match. + minLength: 1 + type: string + regex: + description: Whether to match on equality (false) or regular-expression + (true). + type: boolean + value: + description: Label value to match. + type: string + required: + - name + type: object + type: array + receiver: + description: Name of the receiver for this route. If not empty, + it should be listed in the `receivers` field. + type: string + repeatInterval: + description: How long to wait before repeating the last notification. + Must match the regular expression `[0-9]+(ms|s|m|h)` (milliseconds + seconds minutes hours). + type: string + routes: + description: Child routes. + items: + x-kubernetes-preserve-unknown-fields: true + type: array + type: object + type: object + required: + - spec + type: object + served: true + storage: true +status: + acceptedNames: + kind: "" + plural: "" + conditions: [] + storedVersions: [] diff --git a/apps/monitoring/manifests/prometheus-operator/crds/0alertmanagerCustomResourceDefinition.yaml b/apps/monitoring/manifests/prometheus-operator/0alertmanagerCustomResourceDefinition.yaml similarity index 96% rename from apps/monitoring/manifests/prometheus-operator/crds/0alertmanagerCustomResourceDefinition.yaml rename to apps/monitoring/manifests/prometheus-operator/0alertmanagerCustomResourceDefinition.yaml index c913a5a53..675e3e62f 100644 --- a/apps/monitoring/manifests/prometheus-operator/crds/0alertmanagerCustomResourceDefinition.yaml +++ b/apps/monitoring/manifests/prometheus-operator/0alertmanagerCustomResourceDefinition.yaml @@ -1,10 +1,8 @@ - ---- apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.2.4 + controller-gen.kubebuilder.io/version: v0.4.1 creationTimestamp: null name: alertmanagers.monitoring.coreos.com spec: @@ -745,6 +743,15 @@ spec: in cluster. Needs to be provided for non RFC1918 [1] (public) addresses. [1] RFC1918: https://tools.ietf.org/html/rfc1918' type: string + clusterGossipInterval: + description: Interval between gossip attempts. + type: string + clusterPeerTimeout: + description: Timeout for cluster peering. + type: string + clusterPushpullInterval: + description: Interval between pushpull attempts. + type: string configMaps: description: ConfigMaps is a list of ConfigMaps in the same namespace as the Alertmanager object, which shall be mounted into the Alertmanager @@ -868,9 +875,13 @@ spec: optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string @@ -1305,6 +1316,7 @@ spec: be referred to by services. type: string protocol: + default: TCP description: Protocol for port. Must be UDP, TCP, or SCTP. Defaults to "TCP". type: string @@ -1312,6 +1324,10 @@ spec: - containerPort type: object type: array + x-kubernetes-list-map-keys: + - containerPort + - protocol + x-kubernetes-list-type: map readinessProbe: description: 'Periodic probe of container service readiness. 
Container will be removed from service endpoints if the probe @@ -1435,13 +1451,21 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, @@ -1951,9 +1975,13 @@ spec: optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string @@ -2388,6 +2416,7 @@ spec: be referred to by services. type: string protocol: + default: TCP description: Protocol for port. Must be UDP, TCP, or SCTP. Defaults to "TCP". type: string @@ -2395,6 +2424,10 @@ spec: - containerPort type: object type: array + x-kubernetes-list-map-keys: + - containerPort + - protocol + x-kubernetes-list-type: map readinessProbe: description: 'Periodic probe of container service readiness. Container will be removed from service endpoints if the probe @@ -2518,13 +2551,21 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, @@ -2909,7 +2950,7 @@ spec: description: Define which Nodes the Pods are scheduled on. type: object paused: - description: If set to true all actions on the underlaying managed + description: If set to true all actions on the underlying managed objects are not goint to be performed, except for delete actions. type: boolean podMetadata: @@ -2958,13 +2999,21 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. 
More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise @@ -3145,6 +3194,9 @@ spec: More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir' type: string sizeLimit: + anyOf: + - type: integer + - type: string description: 'Total amount of local storage required for this EmptyDir volume. The size limit is also applicable for memory medium. The maximum usage on memory medium EmptyDir would @@ -3152,7 +3204,8 @@ spec: and the sum of memory limits of all containers in a pod. The default is nil which means that the limit is undefined. More info: http://kubernetes.io/docs/user-guide/volumes#emptydir' - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true type: object volumeClaimTemplate: description: A PVC spec to be used by the Prometheus StatefulSets. @@ -3248,13 +3301,21 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is @@ -3334,7 +3395,11 @@ spec: type: array capacity: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: Represents the actual resources of the underlying volume. type: object @@ -3429,6 +3494,100 @@ spec: type: string type: object type: array + topologySpreadConstraints: + description: If specified, the pod's topology spread constraints. + items: + description: TopologySpreadConstraint specifies how to spread matching + pods among the given topology. + properties: + labelSelector: + description: LabelSelector is used to find matching pods. Pods + that match this label selector are counted to determine the + number of pods in their corresponding topology domain. + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector + that contains values, a key, and an operator that relates + the key and values. + properties: + key: + description: key is the label key that the selector + applies to. 
+ type: string + operator: + description: operator represents a key's relationship + to a set of values. Valid operators are In, NotIn, + Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. + If the operator is In or NotIn, the values array + must be non-empty. If the operator is Exists or + DoesNotExist, the values array must be empty. This + array is replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} pairs. + A single {key,value} in the matchLabels map is equivalent + to an element of matchExpressions, whose key field is + "key", the operator is "In", and the values array contains + only "value". The requirements are ANDed. + type: object + type: object + maxSkew: + description: 'MaxSkew describes the degree to which pods may + be unevenly distributed. It''s the maximum permitted difference + between the number of matching pods in any two topology domains + of a given topology type. For example, in a 3-zone cluster, + MaxSkew is set to 1, and pods with the same labelSelector + spread as 1/1/0: | zone1 | zone2 | zone3 | | P | P | | + - if MaxSkew is 1, incoming pod can only be scheduled to zone3 + to become 1/1/1; scheduling it onto zone1(zone2) would make + the ActualSkew(2-0) on zone1(zone2) violate MaxSkew(1). - + if MaxSkew is 2, incoming pod can be scheduled onto any zone. + It''s a required field. Default value is 1 and 0 is not allowed.' + format: int32 + type: integer + topologyKey: + description: TopologyKey is the key of node labels. Nodes that + have a label with this key and identical values are considered + to be in the same topology. We consider each + as a "bucket", and try to put balanced number of pods into + each bucket. It's a required field. + type: string + whenUnsatisfiable: + description: 'WhenUnsatisfiable indicates how to deal with a + pod if it doesn''t satisfy the spread constraint. - DoNotSchedule + (default) tells the scheduler not to schedule it - ScheduleAnyway + tells the scheduler to still schedule it It''s considered + as "Unsatisfiable" if and only if placing incoming pod on + any topology violates "MaxSkew". For example, in a 3-zone + cluster, MaxSkew is set to 1, and pods with the same labelSelector + spread as 3/1/1: | zone1 | zone2 | zone3 | | P P P | P | P | + If WhenUnsatisfiable is set to DoNotSchedule, incoming pod + can only be scheduled to zone2(zone3) to become 3/2/1(3/1/2) + as ActualSkew(2-1) on zone2(zone3) satisfies MaxSkew(1). In + other words, the cluster can still be imbalanced, but scheduler + won''t make it *more* imbalanced. It''s a required field.' + type: string + required: + - maxSkew + - topologyKey + - whenUnsatisfiable + type: object + type: array version: description: Version the cluster should be on. 
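The `topologySpreadConstraints` field added to the Alertmanager schema above takes the standard Kubernetes constraint shape (`maxSkew`, `topologyKey`, `whenUnsatisfiable`, `labelSelector`). A minimal sketch of how it might be set on an Alertmanager resource, assuming the usual `topology.kubernetes.io/zone` node label and an operator-applied `alertmanager: main` pod label:

```yaml
# Illustrative only: spread Alertmanager pods evenly across zones.
apiVersion: monitoring.coreos.com/v1
kind: Alertmanager
metadata:
  name: main
  namespace: monitoring
spec:
  replicas: 3
  topologySpreadConstraints:
  - maxSkew: 1                         # at most 1 pod of difference between any two zones
    topologyKey: topology.kubernetes.io/zone
    whenUnsatisfiable: DoNotSchedule   # ScheduleAnyway would only soft-prefer balance
    labelSelector:
      matchLabels:
        alertmanager: main             # assumed label; adjust to the actual pod labels
```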
type: string @@ -3801,9 +3960,13 @@ spec: optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string @@ -3826,6 +3989,9 @@ spec: More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir' type: string sizeLimit: + anyOf: + - type: integer + - type: string description: 'Total amount of local storage required for this EmptyDir volume. The size limit is also applicable for memory medium. The maximum usage on memory medium @@ -3833,7 +3999,8 @@ spec: specified here and the sum of memory limits of all containers in a pod. The default is nil which means that the limit is undefined. More info: http://kubernetes.io/docs/user-guide/volumes#emptydir' - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true type: object fc: description: FC represents a Fibre Channel resource that is @@ -4296,10 +4463,14 @@ spec: for volumes, optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' @@ -4684,7 +4855,7 @@ spec: format: int32 type: integer paused: - description: Represents whether any actions on the underlaying managed + description: Represents whether any actions on the underlying managed objects are being performed. Only delete actions will be performed. 
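For the `AlertmanagerConfig` schema earlier in this diff, the first-level `route`, its `matchers`, the `receivers` list, and the `url`/`urlSecret` precedence rule for webhooks could be exercised roughly as follows; the receiver name, Secret, and API version are assumptions rather than anything defined in this repository:

```yaml
# Hypothetical AlertmanagerConfig with one first-level route and a webhook
# receiver. urlSecret takes precedence over url, so only urlSecret is set.
apiVersion: monitoring.coreos.com/v1alpha1   # assumed version served by this operator release
kind: AlertmanagerConfig
metadata:
  name: homelab-webhook
  namespace: monitoring
spec:
  route:
    receiver: webhook            # must be listed under receivers
    groupBy: ['alertname']
    groupWait: 30s               # must match [0-9]+(ms|s|m|h)
    groupInterval: 5m
    repeatInterval: 12h
    matchers:
    - name: severity
      value: critical            # equality match; regex: true would switch to regexp matching
  receivers:
  - name: webhook
    webhookConfigs:
    - sendResolved: true
      maxAlerts: 0               # 0 = include all alerts in one message
      urlSecret:                 # hypothetical Secret in the same namespace
        name: webhook-url
        key: url
```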
type: boolean replicas: diff --git a/apps/monitoring/manifests/prometheus-operator/crds/0podmonitorCustomResourceDefinition.yaml b/apps/monitoring/manifests/prometheus-operator/0podmonitorCustomResourceDefinition.yaml similarity index 99% rename from apps/monitoring/manifests/prometheus-operator/crds/0podmonitorCustomResourceDefinition.yaml rename to apps/monitoring/manifests/prometheus-operator/0podmonitorCustomResourceDefinition.yaml index 756b64c41..102e7dc9b 100644 --- a/apps/monitoring/manifests/prometheus-operator/crds/0podmonitorCustomResourceDefinition.yaml +++ b/apps/monitoring/manifests/prometheus-operator/0podmonitorCustomResourceDefinition.yaml @@ -1,10 +1,8 @@ - ---- apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.2.4 + controller-gen.kubebuilder.io/version: v0.4.1 creationTimestamp: null name: podmonitors.monitoring.coreos.com spec: diff --git a/apps/monitoring/manifests/prometheus-operator/crds/0probeCustomResourceDefinition.yaml b/apps/monitoring/manifests/prometheus-operator/0probeCustomResourceDefinition.yaml similarity index 99% rename from apps/monitoring/manifests/prometheus-operator/crds/0probeCustomResourceDefinition.yaml rename to apps/monitoring/manifests/prometheus-operator/0probeCustomResourceDefinition.yaml index ae875d0e8..ec0210c28 100644 --- a/apps/monitoring/manifests/prometheus-operator/crds/0probeCustomResourceDefinition.yaml +++ b/apps/monitoring/manifests/prometheus-operator/0probeCustomResourceDefinition.yaml @@ -1,10 +1,8 @@ - ---- apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.2.4 + controller-gen.kubebuilder.io/version: v0.4.1 creationTimestamp: null name: probes.monitoring.coreos.com spec: diff --git a/apps/monitoring/manifests/prometheus-operator/crds/0prometheusCustomResourceDefinition.yaml b/apps/monitoring/manifests/prometheus-operator/0prometheusCustomResourceDefinition.yaml similarity index 96% rename from apps/monitoring/manifests/prometheus-operator/crds/0prometheusCustomResourceDefinition.yaml rename to apps/monitoring/manifests/prometheus-operator/0prometheusCustomResourceDefinition.yaml index 2e790141d..dd46ab991 100644 --- a/apps/monitoring/manifests/prometheus-operator/crds/0prometheusCustomResourceDefinition.yaml +++ b/apps/monitoring/manifests/prometheus-operator/0prometheusCustomResourceDefinition.yaml @@ -1,10 +1,8 @@ - ---- apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.2.4 + controller-gen.kubebuilder.io/version: v0.4.1 creationTimestamp: null name: prometheuses.monitoring.coreos.com spec: @@ -1231,9 +1229,13 @@ spec: optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string @@ -1668,6 +1670,7 @@ spec: be referred to by services. type: string protocol: + default: TCP description: Protocol for port. Must be UDP, TCP, or SCTP. Defaults to "TCP". 
type: string @@ -1675,6 +1678,10 @@ spec: - containerPort type: object type: array + x-kubernetes-list-map-keys: + - containerPort + - protocol + x-kubernetes-list-type: map readinessProbe: description: 'Periodic probe of container service readiness. Container will be removed from service endpoints if the probe @@ -1798,13 +1805,21 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, @@ -2358,9 +2373,13 @@ spec: optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string @@ -2795,6 +2814,7 @@ spec: be referred to by services. type: string protocol: + default: TCP description: Protocol for port. Must be UDP, TCP, or SCTP. Defaults to "TCP". type: string @@ -2802,6 +2822,10 @@ spec: - containerPort type: object type: array + x-kubernetes-list-map-keys: + - containerPort + - protocol + x-kubernetes-list-type: map readinessProbe: description: 'Periodic probe of container service readiness. Container will be removed from service endpoints if the probe @@ -2925,13 +2949,21 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, @@ -3356,8 +3388,8 @@ spec: type: string type: object podMonitorNamespaceSelector: - description: Namespaces to be selected for PodMonitor discovery. If - nil, only check own namespace. + description: Namespace's labels to match for PodMonitor discovery. + If nil, only check own namespace. properties: matchExpressions: description: matchExpressions is a list of label selector requirements. @@ -4099,7 +4131,9 @@ spec: will _not_ be added when value is set to empty string (`""`). 
type: string replicas: - description: Number of instances to deploy for a Prometheus deployment. + description: Number of replicas of each shard to deploy for a Prometheus + deployment. Number of replicas multiplied by shards is the total + number of Pods created. format: int32 type: integer resources: @@ -4107,13 +4141,21 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise @@ -4126,7 +4168,8 @@ spec: (milliseconds seconds minutes hours days weeks years). type: string retentionSize: - description: Maximum amount of disk space used by blocks. + description: 'Maximum amount of disk space used by blocks. Supported + units: B, KB, MB, GB, TB, PB, EB. Ex: `512MB`.' type: string routePrefix: description: The route prefix Prometheus registers HTTP handlers for. @@ -4393,7 +4436,7 @@ spec: to use to run the Prometheus Pods. type: string serviceMonitorNamespaceSelector: - description: Namespaces to be selected for ServiceMonitor discovery. + description: Namespace's labels to match for ServiceMonitor discovery. If nil, only check own namespace. properties: matchExpressions: @@ -4490,6 +4533,17 @@ spec: if SHA is set. Deprecated: use ''image'' instead. The image digest can be specified as part of the image URL.' type: string + shards: + description: 'EXPERIMENTAL: Number of shards to distribute targets + onto. Number of replicas multiplied by shards is the total number + of Pods created. Note that scaling down shards will not reshard + data onto remaining instances, it must be manually moved. Increasing + shards will not reshard data either but it will continue to be available + from the same instances. To query globally use Thanos sidecar and + Thanos querier or remote write data to a central location. Sharding + is done on the content of the `__address__` target meta-label.' + format: int32 + type: integer storage: description: Storage spec to specify how storage shall be used. properties: @@ -4510,6 +4564,9 @@ spec: More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir' type: string sizeLimit: + anyOf: + - type: integer + - type: string description: 'Total amount of local storage required for this EmptyDir volume. The size limit is also applicable for memory medium. The maximum usage on memory medium EmptyDir would @@ -4517,7 +4574,8 @@ spec: and the sum of memory limits of all containers in a pod. The default is nil which means that the limit is undefined. 
More info: http://kubernetes.io/docs/user-guide/volumes#emptydir' - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true type: object volumeClaimTemplate: description: A PVC spec to be used by the Prometheus StatefulSets. @@ -4613,13 +4671,21 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is @@ -4699,7 +4765,11 @@ spec: type: array capacity: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: Represents the actual resources of the underlying volume. type: object @@ -4919,7 +4989,8 @@ spec: type: string objectStorageConfig: description: ObjectStorageConfig configures object storage in - Thanos. + Thanos. Alternative to ObjectStorageConfigFile, and lower order + priority. properties: key: description: The key of the secret to select from. Must be @@ -4936,6 +5007,11 @@ spec: required: - key type: object + objectStorageConfigFile: + description: ObjectStorageConfigFile specifies the path of the + object storage configuration file. When used alongside with + ObjectStorageConfig, ObjectStorageConfigFile takes precedence. + type: string resources: description: Resources defines the resource requirements for the Thanos sidecar. If not provided, no requests/limits will be @@ -4943,13 +5019,21 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise @@ -4989,6 +5073,11 @@ spec: required: - key type: object + tracingConfigFile: + description: TracingConfig specifies the path of the tracing configuration + file. When used alongside with TracingConfig, TracingConfigFile + takes precedence. 
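The Prometheus CRD hunks above add `shards`, tighten the `retentionSize` description to explicit units, and add file-based alternatives for the Thanos object-storage and tracing configuration. A rough sketch of how these could be combined, with illustrative values and a hypothetical config path:

```yaml
# Illustrative Prometheus spec touching the fields changed above. With
# replicas: 2 and shards: 2 the operator creates 4 pods in total; per the
# schema note, scaling shards down does not reshard existing data.
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  name: k8s
  namespace: monitoring
spec:
  replicas: 2
  shards: 2                      # EXPERIMENTAL per the schema description
  retention: 7d
  retentionSize: 512MB           # supported units: B, KB, MB, GB, TB, PB, EB
  thanos:
    # The *File variant takes precedence over the Secret-based field.
    objectStorageConfigFile: /etc/thanos/objstore.yaml   # hypothetical mount path
```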
+ type: string version: description: Version describes the version of Thanos to use. type: string @@ -5033,6 +5122,100 @@ spec: type: string type: object type: array + topologySpreadConstraints: + description: If specified, the pod's topology spread constraints. + items: + description: TopologySpreadConstraint specifies how to spread matching + pods among the given topology. + properties: + labelSelector: + description: LabelSelector is used to find matching pods. Pods + that match this label selector are counted to determine the + number of pods in their corresponding topology domain. + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector + that contains values, a key, and an operator that relates + the key and values. + properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: operator represents a key's relationship + to a set of values. Valid operators are In, NotIn, + Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. + If the operator is In or NotIn, the values array + must be non-empty. If the operator is Exists or + DoesNotExist, the values array must be empty. This + array is replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} pairs. + A single {key,value} in the matchLabels map is equivalent + to an element of matchExpressions, whose key field is + "key", the operator is "In", and the values array contains + only "value". The requirements are ANDed. + type: object + type: object + maxSkew: + description: 'MaxSkew describes the degree to which pods may + be unevenly distributed. It''s the maximum permitted difference + between the number of matching pods in any two topology domains + of a given topology type. For example, in a 3-zone cluster, + MaxSkew is set to 1, and pods with the same labelSelector + spread as 1/1/0: | zone1 | zone2 | zone3 | | P | P | | + - if MaxSkew is 1, incoming pod can only be scheduled to zone3 + to become 1/1/1; scheduling it onto zone1(zone2) would make + the ActualSkew(2-0) on zone1(zone2) violate MaxSkew(1). - + if MaxSkew is 2, incoming pod can be scheduled onto any zone. + It''s a required field. Default value is 1 and 0 is not allowed.' + format: int32 + type: integer + topologyKey: + description: TopologyKey is the key of node labels. Nodes that + have a label with this key and identical values are considered + to be in the same topology. We consider each + as a "bucket", and try to put balanced number of pods into + each bucket. It's a required field. + type: string + whenUnsatisfiable: + description: 'WhenUnsatisfiable indicates how to deal with a + pod if it doesn''t satisfy the spread constraint. - DoNotSchedule + (default) tells the scheduler not to schedule it - ScheduleAnyway + tells the scheduler to still schedule it It''s considered + as "Unsatisfiable" if and only if placing incoming pod on + any topology violates "MaxSkew". 
For example, in a 3-zone + cluster, MaxSkew is set to 1, and pods with the same labelSelector + spread as 3/1/1: | zone1 | zone2 | zone3 | | P P P | P | P | + If WhenUnsatisfiable is set to DoNotSchedule, incoming pod + can only be scheduled to zone2(zone3) to become 3/2/1(3/1/2) + as ActualSkew(2-1) on zone2(zone3) satisfies MaxSkew(1). In + other words, the cluster can still be imbalanced, but scheduler + won''t make it *more* imbalanced. It''s a required field.' + type: string + required: + - maxSkew + - topologyKey + - whenUnsatisfiable + type: object + type: array version: description: Version of Prometheus to be deployed. type: string @@ -5405,9 +5588,13 @@ spec: optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string @@ -5430,6 +5617,9 @@ spec: More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir' type: string sizeLimit: + anyOf: + - type: integer + - type: string description: 'Total amount of local storage required for this EmptyDir volume. The size limit is also applicable for memory medium. The maximum usage on memory medium @@ -5437,7 +5627,8 @@ spec: specified here and the sum of memory limits of all containers in a pod. The default is nil which means that the limit is undefined. More info: http://kubernetes.io/docs/user-guide/volumes#emptydir' - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true type: object fc: description: FC represents a Fibre Channel resource that is @@ -5900,10 +6091,14 @@ spec: for volumes, optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' @@ -6300,7 +6495,7 @@ spec: format: int32 type: integer paused: - description: Represents whether any actions on the underlaying managed + description: Represents whether any actions on the underlying managed objects are being performed. Only delete actions will be performed. 
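The change repeated throughout these CRDs replaces plain-string resource quantities with `anyOf` integer/string plus the quantity regex and `x-kubernetes-int-or-string: true`, so both bare integers and suffixed strings now validate. A small sketch of a `resources` block that passes the new schema:

```yaml
# Both spellings satisfy the new int-or-string quantity schema.
resources:
  requests:
    cpu: 1              # bare integer, allowed by the integer branch of anyOf
    memory: 400Mi       # string with a binary suffix, matches the quantity pattern
  limits:
    cpu: "2"
    memory: 1Gi
```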
type: boolean replicas: diff --git a/apps/monitoring/manifests/prometheus-operator/crds/0prometheusruleCustomResourceDefinition.yaml b/apps/monitoring/manifests/prometheus-operator/0prometheusruleCustomResourceDefinition.yaml similarity index 95% rename from apps/monitoring/manifests/prometheus-operator/crds/0prometheusruleCustomResourceDefinition.yaml rename to apps/monitoring/manifests/prometheus-operator/0prometheusruleCustomResourceDefinition.yaml index 06dbdaaa0..a91bfc736 100644 --- a/apps/monitoring/manifests/prometheus-operator/crds/0prometheusruleCustomResourceDefinition.yaml +++ b/apps/monitoring/manifests/prometheus-operator/0prometheusruleCustomResourceDefinition.yaml @@ -1,10 +1,8 @@ - ---- apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.2.4 + controller-gen.kubebuilder.io/version: v0.4.1 creationTimestamp: null name: prometheusrules.monitoring.coreos.com spec: @@ -19,7 +17,8 @@ spec: - name: v1 schema: openAPIV3Schema: - description: PrometheusRule defines alerting rules for a Prometheus instance + description: PrometheusRule defines recording and alerting rules for a Prometheus + instance properties: apiVersion: description: 'APIVersion defines the versioned schema of this representation diff --git a/apps/monitoring/manifests/prometheus-operator/crds/0servicemonitorCustomResourceDefinition.yaml b/apps/monitoring/manifests/prometheus-operator/0servicemonitorCustomResourceDefinition.yaml similarity index 99% rename from apps/monitoring/manifests/prometheus-operator/crds/0servicemonitorCustomResourceDefinition.yaml rename to apps/monitoring/manifests/prometheus-operator/0servicemonitorCustomResourceDefinition.yaml index 8acd19f42..f80801375 100644 --- a/apps/monitoring/manifests/prometheus-operator/crds/0servicemonitorCustomResourceDefinition.yaml +++ b/apps/monitoring/manifests/prometheus-operator/0servicemonitorCustomResourceDefinition.yaml @@ -1,10 +1,8 @@ - ---- apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.2.4 + controller-gen.kubebuilder.io/version: v0.4.1 creationTimestamp: null name: servicemonitors.monitoring.coreos.com spec: diff --git a/apps/monitoring/manifests/prometheus-operator/crds/0thanosrulerCustomResourceDefinition.yaml b/apps/monitoring/manifests/prometheus-operator/0thanosrulerCustomResourceDefinition.yaml similarity index 96% rename from apps/monitoring/manifests/prometheus-operator/crds/0thanosrulerCustomResourceDefinition.yaml rename to apps/monitoring/manifests/prometheus-operator/0thanosrulerCustomResourceDefinition.yaml index a904c666e..fab0e399a 100644 --- a/apps/monitoring/manifests/prometheus-operator/crds/0thanosrulerCustomResourceDefinition.yaml +++ b/apps/monitoring/manifests/prometheus-operator/0thanosrulerCustomResourceDefinition.yaml @@ -1,10 +1,8 @@ - ---- apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.2.4 + controller-gen.kubebuilder.io/version: v0.4.1 creationTimestamp: null name: thanosrulers.monitoring.coreos.com spec: @@ -779,9 +777,13 @@ spec: optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: 
true resource: description: 'Required: resource to select' type: string @@ -1216,6 +1218,7 @@ spec: be referred to by services. type: string protocol: + default: TCP description: Protocol for port. Must be UDP, TCP, or SCTP. Defaults to "TCP". type: string @@ -1223,6 +1226,10 @@ spec: - containerPort type: object type: array + x-kubernetes-list-map-keys: + - containerPort + - protocol + x-kubernetes-list-type: map readinessProbe: description: 'Periodic probe of container service readiness. Container will be removed from service endpoints if the probe @@ -1346,13 +1353,21 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, @@ -1981,9 +1996,13 @@ spec: optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string @@ -2418,6 +2437,7 @@ spec: be referred to by services. type: string protocol: + default: TCP description: Protocol for port. Must be UDP, TCP, or SCTP. Defaults to "TCP". type: string @@ -2425,6 +2445,10 @@ spec: - containerPort type: object type: array + x-kubernetes-list-map-keys: + - containerPort + - protocol + x-kubernetes-list-type: map readinessProbe: description: 'Periodic probe of container service readiness. Container will be removed from service endpoints if the probe @@ -2548,13 +2572,21 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, @@ -2946,6 +2978,7 @@ spec: type: object objectStorageConfig: description: ObjectStorageConfig configures object storage in Thanos. + Alternative to ObjectStorageConfigFile, and lower order priority. 
properties: key: description: The key of the secret to select from. Must be a @@ -2961,6 +2994,11 @@ spec: required: - key type: object + objectStorageConfigFile: + description: ObjectStorageConfigFile specifies the path of the object + storage configuration file. When used alongside with ObjectStorageConfig, + ObjectStorageConfigFile takes precedence. + type: string paused: description: When a ThanosRuler deployment is paused, no actions except for deletion will be performed on the underlying objects. @@ -3057,13 +3095,21 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, otherwise @@ -3316,6 +3362,9 @@ spec: More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir' type: string sizeLimit: + anyOf: + - type: integer + - type: string description: 'Total amount of local storage required for this EmptyDir volume. The size limit is also applicable for memory medium. The maximum usage on memory medium EmptyDir would @@ -3323,7 +3372,8 @@ spec: and the sum of memory limits of all containers in a pod. The default is nil which means that the limit is undefined. More info: http://kubernetes.io/docs/user-guide/volumes#emptydir' - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true type: object volumeClaimTemplate: description: A PVC spec to be used by the Prometheus StatefulSets. @@ -3419,13 +3469,21 @@ spec: properties: limits: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Limits describes the maximum amount of compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object requests: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: 'Requests describes the minimum amount of compute resources required. If Requests is omitted for a container, it defaults to Limits if that is @@ -3505,7 +3563,11 @@ spec: type: array capacity: additionalProperties: - type: string + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true description: Represents the actual resources of the underlying volume. 
type: object @@ -3594,6 +3656,100 @@ spec: type: string type: object type: array + topologySpreadConstraints: + description: If specified, the pod's topology spread constraints. + items: + description: TopologySpreadConstraint specifies how to spread matching + pods among the given topology. + properties: + labelSelector: + description: LabelSelector is used to find matching pods. Pods + that match this label selector are counted to determine the + number of pods in their corresponding topology domain. + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector + that contains values, a key, and an operator that relates + the key and values. + properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: operator represents a key's relationship + to a set of values. Valid operators are In, NotIn, + Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. + If the operator is In or NotIn, the values array + must be non-empty. If the operator is Exists or + DoesNotExist, the values array must be empty. This + array is replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} pairs. + A single {key,value} in the matchLabels map is equivalent + to an element of matchExpressions, whose key field is + "key", the operator is "In", and the values array contains + only "value". The requirements are ANDed. + type: object + type: object + maxSkew: + description: 'MaxSkew describes the degree to which pods may + be unevenly distributed. It''s the maximum permitted difference + between the number of matching pods in any two topology domains + of a given topology type. For example, in a 3-zone cluster, + MaxSkew is set to 1, and pods with the same labelSelector + spread as 1/1/0: | zone1 | zone2 | zone3 | | P | P | | + - if MaxSkew is 1, incoming pod can only be scheduled to zone3 + to become 1/1/1; scheduling it onto zone1(zone2) would make + the ActualSkew(2-0) on zone1(zone2) violate MaxSkew(1). - + if MaxSkew is 2, incoming pod can be scheduled onto any zone. + It''s a required field. Default value is 1 and 0 is not allowed.' + format: int32 + type: integer + topologyKey: + description: TopologyKey is the key of node labels. Nodes that + have a label with this key and identical values are considered + to be in the same topology. We consider each + as a "bucket", and try to put balanced number of pods into + each bucket. It's a required field. + type: string + whenUnsatisfiable: + description: 'WhenUnsatisfiable indicates how to deal with a + pod if it doesn''t satisfy the spread constraint. - DoNotSchedule + (default) tells the scheduler not to schedule it - ScheduleAnyway + tells the scheduler to still schedule it It''s considered + as "Unsatisfiable" if and only if placing incoming pod on + any topology violates "MaxSkew". For example, in a 3-zone + cluster, MaxSkew is set to 1, and pods with the same labelSelector + spread as 3/1/1: | zone1 | zone2 | zone3 | | P P P | P | P | + If WhenUnsatisfiable is set to DoNotSchedule, incoming pod + can only be scheduled to zone2(zone3) to become 3/2/1(3/1/2) + as ActualSkew(2-1) on zone2(zone3) satisfies MaxSkew(1). 
In + other words, the cluster can still be imbalanced, but scheduler + won''t make it *more* imbalanced. It''s a required field.' + type: string + required: + - maxSkew + - topologyKey + - whenUnsatisfiable + type: object + type: array tracingConfig: description: TracingConfig configures tracing in Thanos. This is an experimental feature, it may change in any upcoming release in a @@ -3940,9 +4096,13 @@ spec: optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' type: string @@ -3965,6 +4125,9 @@ spec: More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir' type: string sizeLimit: + anyOf: + - type: integer + - type: string description: 'Total amount of local storage required for this EmptyDir volume. The size limit is also applicable for memory medium. The maximum usage on memory medium @@ -3972,7 +4135,8 @@ spec: specified here and the sum of memory limits of all containers in a pod. The default is nil which means that the limit is undefined. More info: http://kubernetes.io/docs/user-guide/volumes#emptydir' - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true type: object fc: description: FC represents a Fibre Channel resource that is @@ -4435,10 +4599,14 @@ spec: for volumes, optional for env vars' type: string divisor: + anyOf: + - type: integer + - type: string description: Specifies the output format of the exposed resources, defaults to "1" - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true resource: description: 'Required: resource to select' diff --git a/apps/monitoring/manifests/prometheus-operator/02_rbac.yaml b/apps/monitoring/manifests/prometheus-operator/clusterRole.yaml similarity index 62% rename from apps/monitoring/manifests/prometheus-operator/02_rbac.yaml rename to apps/monitoring/manifests/prometheus-operator/clusterRole.yaml index db4448a16..2821c35da 100644 --- a/apps/monitoring/manifests/prometheus-operator/02_rbac.yaml +++ b/apps/monitoring/manifests/prometheus-operator/clusterRole.yaml @@ -1,21 +1,11 @@ ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - labels: - app.kubernetes.io/component: controller - app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.40.0 - name: prometheus-operator - namespace: monitoring ---- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.40.0 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.45.0 name: prometheus-operator rules: - apiGroups: @@ -80,6 +70,14 @@ rules: - get - list - watch +- apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch - apiGroups: - authentication.k8s.io resources: @@ -92,25 +90,3 @@ rules: - subjectaccessreviews verbs: - create -- apiGroups: - - networking.k8s.io - resources: - - ingresses - verbs: ["get", "list", "watch"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: 
ClusterRoleBinding -metadata: - labels: - app.kubernetes.io/component: controller - app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.40.0 - name: prometheus-operator -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: prometheus-operator -subjects: -- kind: ServiceAccount - name: prometheus-operator - namespace: monitoring diff --git a/apps/monitoring/manifests/prometheus-operator/clusterRoleBinding.yaml b/apps/monitoring/manifests/prometheus-operator/clusterRoleBinding.yaml new file mode 100644 index 000000000..9c5b8dfa7 --- /dev/null +++ b/apps/monitoring/manifests/prometheus-operator/clusterRoleBinding.yaml @@ -0,0 +1,17 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: prometheus-operator + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.45.0 + name: prometheus-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus-operator +subjects: +- kind: ServiceAccount + name: prometheus-operator + namespace: monitoring diff --git a/apps/monitoring/manifests/prometheus-operator/control-plane-components/prometheusRuleCoreDNS.yaml b/apps/monitoring/manifests/prometheus-operator/control-plane-components/prometheusRuleCoreDNS.yaml deleted file mode 100644 index 479d5381e..000000000 --- a/apps/monitoring/manifests/prometheus-operator/control-plane-components/prometheusRuleCoreDNS.yaml +++ /dev/null @@ -1,85 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - labels: - prometheus: k8s - role: alert-rules - name: kube-dns-rules - namespace: monitoring -spec: - groups: - - name: kube-dns - rules: - - alert: CoreDNSDown - annotations: - message: CoreDNS has disappeared from Prometheus target discovery. - runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsdown - expr: | - absent(up{job="kube-dns"} == 1) - for: 15m - labels: - severity: critical - - alert: CoreDNSLatencyHigh - annotations: - message: CoreDNS has 99th percentile latency of {{ $value }} seconds for server {{ $labels.server }} zone {{ $labels.zone }} . - runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednslatencyhigh - expr: | - histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_seconds_bucket{job="kube-dns"}[5m])) by(server, zone, le)) > 4 - for: 10m - labels: - severity: critical - - alert: CoreDNSErrorsHigh - annotations: - message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of requests. - runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednserrorshigh - expr: | - sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m])) - / - sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns"}[5m])) > 0.03 - for: 10m - labels: - severity: critical - - alert: CoreDNSErrorsHigh - annotations: - message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of requests. 
- runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednserrorshigh - expr: | - sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m])) - / - sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns"}[5m])) > 0.01 - for: 10m - labels: - severity: warning - - name: coredns_forward - rules: - - alert: CoreDNSForwardLatencyHigh - annotations: - message: CoreDNS has 99th percentile latency of {{ $value }} seconds forwarding requests to {{ $labels.to }}. - runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwardlatencyhigh - expr: | - histogram_quantile(0.99, sum(rate(coredns_forward_request_duration_seconds_bucket{job="kube-dns"}[5m])) by(to, le)) > 4 - for: 10m - labels: - severity: critical - - alert: CoreDNSForwardErrorsHigh - annotations: - message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of forward requests to {{ $labels.to }}. - runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwarderrorshigh - expr: | - sum(rate(coredns_forward_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m])) - / - sum(rate(coredns_forward_response_rcode_count_total{job="kube-dns"}[5m])) > 0.03 - for: 10m - labels: - severity: critical - - alert: CoreDNSForwardErrorsHigh - annotations: - message: CoreDNS is returning SERVFAIL for {{ $value | humanizePercentage }} of forward requests to {{ $labels.to }}. - runbook_url: https://github.com/povilasv/coredns-mixin/tree/master/runbook.md#alert-name-corednsforwarderrorshigh - expr: | - sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns",rcode="SERVFAIL"}[5m])) - / - sum(rate(coredns_dns_response_rcode_count_total{job="kube-dns"}[5m])) > 0.01 - for: 10m - labels: - severity: warning diff --git a/apps/monitoring/manifests/prometheus-operator/control-plane-components/serviceMonitorApiserver.yaml b/apps/monitoring/manifests/prometheus-operator/control-plane-components/serviceMonitorApiserver.yaml deleted file mode 100644 index 500c0d3e7..000000000 --- a/apps/monitoring/manifests/prometheus-operator/control-plane-components/serviceMonitorApiserver.yaml +++ /dev/null @@ -1,74 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - labels: - k8s-app: apiserver - name: kube-apiserver - namespace: monitoring -spec: - endpoints: - - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - interval: 30s - metricRelabelings: - - action: drop - regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds) - sourceLabels: - - __name__ - - action: drop - regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds) - sourceLabels: - - __name__ - - action: drop - regex: 
apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs) - sourceLabels: - - __name__ - - action: drop - regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout) - sourceLabels: - - __name__ - - action: drop - regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total) - sourceLabels: - - __name__ - - action: drop - regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary) - sourceLabels: - - __name__ - - action: drop - regex: transformation_(transformation_latencies_microseconds|failures_total) - sourceLabels: - - __name__ - - action: drop - regex: (admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retr
ies|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries) - sourceLabels: - - __name__ - - action: drop - regex: etcd_(debugging|disk|request|server).* - sourceLabels: - - __name__ - - action: drop - regex: apiserver_admission_controller_admission_latencies_seconds_.* - sourceLabels: - - __name__ - - action: drop - regex: apiserver_admission_step_admission_latencies_seconds_.* - sourceLabels: - - __name__ - - action: drop - regex: apiserver_request_duration_seconds_bucket;(0.15|0.25|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2.5|3|3.5|4.5|6|7|8|9|15|25|30|50) - sourceLabels: - - __name__ - - le - port: https - scheme: https - tlsConfig: - caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt - serverName: kubernetes - jobLabel: component - namespaceSelector: - matchNames: - - default - selector: - matchLabels: - component: apiserver - provider: kubernetes diff --git a/apps/monitoring/manifests/prometheus-operator/control-plane-components/serviceMonitorKubeControllerManager.yaml b/apps/monitoring/manifests/prometheus-operator/control-plane-components/serviceMonitorKubeControllerManager.yaml deleted file mode 100644 index 224dbedb6..000000000 --- a/apps/monitoring/manifests/prometheus-operator/control-plane-components/serviceMonitorKubeControllerManager.yaml +++ /dev/null @@ -1,56 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - labels: - k8s-app: kube-controller-manager - name: kube-controller-manager - namespace: monitoring -spec: - endpoints: - - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - interval: 30s - metricRelabelings: - - action: drop - regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds) - sourceLabels: - - __name__ - - action: drop - regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds) - sourceLabels: - - __name__ - - action: drop - regex: 
apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs) - sourceLabels: - - __name__ - - action: drop - regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout) - sourceLabels: - - __name__ - - action: drop - regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total) - sourceLabels: - - __name__ - - action: drop - regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary) - sourceLabels: - - __name__ - - action: drop - regex: transformation_(transformation_latencies_microseconds|failures_total) - sourceLabels: - - __name__ - - action: drop - regex: (admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retr
ies|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries) - sourceLabels: - - __name__ - - action: drop - regex: etcd_(debugging|disk|request|server).* - sourceLabels: - - __name__ - port: http-metrics - jobLabel: k8s-app - namespaceSelector: - matchNames: - - kube-system - selector: - matchLabels: - k8s-app: kube-controller-manager diff --git a/apps/monitoring/manifests/prometheus-operator/control-plane-components/serviceMonitorKubeScheduler.yaml b/apps/monitoring/manifests/prometheus-operator/control-plane-components/serviceMonitorKubeScheduler.yaml deleted file mode 100644 index f00db0e47..000000000 --- a/apps/monitoring/manifests/prometheus-operator/control-plane-components/serviceMonitorKubeScheduler.yaml +++ /dev/null @@ -1,18 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - labels: - k8s-app: kube-scheduler - name: kube-scheduler - namespace: monitoring -spec: - endpoints: - - interval: 30s - port: http-metrics - jobLabel: k8s-app - namespaceSelector: - matchNames: - - kube-system - selector: - matchLabels: - k8s-app: kube-scheduler diff --git a/apps/monitoring/manifests/prometheus-operator/crds/0alertmanagerConfigCustomResourceDefinition.yaml b/apps/monitoring/manifests/prometheus-operator/crds/0alertmanagerConfigCustomResourceDefinition.yaml deleted file mode 100644 index 806febfef..000000000 --- a/apps/monitoring/manifests/prometheus-operator/crds/0alertmanagerConfigCustomResourceDefinition.yaml +++ /dev/null @@ -1,390 +0,0 @@ - ---- -apiVersion: apiextensions.k8s.io/v1 -kind: CustomResourceDefinition -metadata: - annotations: - controller-gen.kubebuilder.io/version: v0.2.4 - creationTimestamp: null - name: alertmanagerconfigs.monitoring.coreos.com -spec: - group: monitoring.coreos.com - names: - kind: AlertmanagerConfig - listKind: AlertmanagerConfigList - plural: alertmanagerconfigs - singular: alertmanagerconfig - scope: Namespaced - versions: - - name: v1alpha1 - schema: - openAPIV3Schema: - description: AlertmanagerConfig defines a namespaced AlertmanagerConfig to - be aggregated across multiple namespaces configuring one Alertmanager. - properties: - apiVersion: - description: 'APIVersion defines the versioned schema of this representation - of an object. Servers should convert recognized schemas to the latest - internal value, and may reject unrecognized values. 
More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources' - type: string - kind: - description: 'Kind is a string value representing the REST resource this - object represents. Servers may infer this from the endpoint the client - submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' - type: string - metadata: - type: object - spec: - properties: - inhibitRules: - items: - properties: - equal: - items: - type: string - type: array - matcher: - items: - properties: - name: - type: string - regex: - type: boolean - value: - type: string - required: - - name - - value - type: object - type: array - type: object - type: array - receivers: - items: - properties: - name: - type: string - pagerDutyConfigs: - items: - properties: - class: - type: string - client: - type: string - clientURL: - type: string - component: - type: string - description: - type: string - details: - items: - properties: - key: - type: string - value: - type: string - required: - - key - - value - type: object - type: array - group: - type: string - httpConfig: - properties: - basicAuth: - description: 'BasicAuth allow an endpoint to authenticate - over basic authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints' - properties: - password: - description: The secret in the service monitor - namespace that contains the password for authentication. - properties: - key: - description: The key of the secret to select - from. Must be a valid secret key. - type: string - name: - description: 'Name of the referent. More info: - https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' - type: string - optional: - description: Specify whether the Secret or - its key must be defined - type: boolean - required: - - key - type: object - username: - description: The secret in the service monitor - namespace that contains the username for authentication. - properties: - key: - description: The key of the secret to select - from. Must be a valid secret key. - type: string - name: - description: 'Name of the referent. More info: - https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' - type: string - optional: - description: Specify whether the Secret or - its key must be defined - type: boolean - required: - - key - type: object - type: object - bearerTokenSecret: - description: SecretKeySelector selects a key of a - Secret. - properties: - key: - description: The key of the secret to select from. Must - be a valid secret key. - type: string - name: - description: 'Name of the referent. More info: - https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, kind, - uid?' - type: string - optional: - description: Specify whether the Secret or its - key must be defined - type: boolean - required: - - key - type: object - proxyURL: - type: string - tlsConfig: - description: SafeTLSConfig specifies safe TLS configuration - parameters. - properties: - ca: - description: Struct containing the CA cert to - use for the targets. - properties: - configMap: - description: ConfigMap containing data to - use for the targets. - properties: - key: - description: The key to select. - type: string - name: - description: 'Name of the referent. 
More - info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' - type: string - optional: - description: Specify whether the ConfigMap - or its key must be defined - type: boolean - required: - - key - type: object - secret: - description: Secret containing data to use - for the targets. - properties: - key: - description: The key of the secret to - select from. Must be a valid secret - key. - type: string - name: - description: 'Name of the referent. More - info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' - type: string - optional: - description: Specify whether the Secret - or its key must be defined - type: boolean - required: - - key - type: object - type: object - cert: - description: Struct containing the client cert - file for the targets. - properties: - configMap: - description: ConfigMap containing data to - use for the targets. - properties: - key: - description: The key to select. - type: string - name: - description: 'Name of the referent. More - info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' - type: string - optional: - description: Specify whether the ConfigMap - or its key must be defined - type: boolean - required: - - key - type: object - secret: - description: Secret containing data to use - for the targets. - properties: - key: - description: The key of the secret to - select from. Must be a valid secret - key. - type: string - name: - description: 'Name of the referent. More - info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' - type: string - optional: - description: Specify whether the Secret - or its key must be defined - type: boolean - required: - - key - type: object - type: object - insecureSkipVerify: - description: Disable target certificate validation. - type: boolean - keySecret: - description: Secret containing the client key - file for the targets. - properties: - key: - description: The key of the secret to select - from. Must be a valid secret key. - type: string - name: - description: 'Name of the referent. More info: - https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, - kind, uid?' - type: string - optional: - description: Specify whether the Secret or - its key must be defined - type: boolean - required: - - key - type: object - serverName: - description: Used to verify the hostname for the - targets. - type: string - type: object - type: object - routingKey: - description: SecretKeySelector selects a key of a Secret. - properties: - key: - description: The key of the secret to select from. Must - be a valid secret key. - type: string - name: - description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, kind, - uid?' - type: string - optional: - description: Specify whether the Secret or its key - must be defined - type: boolean - required: - - key - type: object - sendResolved: - type: boolean - serviceKey: - description: SecretKeySelector selects a key of a Secret. - properties: - key: - description: The key of the secret to select from. Must - be a valid secret key. 
- type: string - name: - description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names - TODO: Add other useful fields. apiVersion, kind, - uid?' - type: string - optional: - description: Specify whether the Secret or its key - must be defined - type: boolean - required: - - key - type: object - severity: - type: string - url: - type: string - type: object - type: array - required: - - name - type: object - type: array - route: - properties: - continue: - type: boolean - groupBy: - items: - type: string - type: array - groupInterval: - type: string - groupWait: - type: string - matchers: - items: - properties: - name: - type: string - regex: - type: boolean - value: - type: string - required: - - name - - value - type: object - type: array - receiver: - type: string - repeatInterval: - type: string - routes: - items: - type: object - type: array - type: object - type: object - required: - - spec - type: object - served: true - storage: true -status: - acceptedNames: - kind: "" - plural: "" - conditions: [] - storedVersions: [] diff --git a/apps/monitoring/manifests/prometheus-operator/04_deployment.yaml b/apps/monitoring/manifests/prometheus-operator/deployment.yaml similarity index 67% rename from apps/monitoring/manifests/prometheus-operator/04_deployment.yaml rename to apps/monitoring/manifests/prometheus-operator/deployment.yaml index af7b8714e..e3a37bd72 100644 --- a/apps/monitoring/manifests/prometheus-operator/04_deployment.yaml +++ b/apps/monitoring/manifests/prometheus-operator/deployment.yaml @@ -4,7 +4,8 @@ metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: 0.44.1 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.45.0 name: prometheus-operator namespace: monitoring spec: @@ -13,20 +14,22 @@ spec: matchLabels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator + app.kubernetes.io/part-of: kube-prometheus template: metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: 0.44.1 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.45.0 spec: containers: - args: - --kubelet-service=kube-system/kubelet - - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.44.1 + - --prometheus-config-reloader=quay.io/prometheus-operator/prometheus-config-reloader:v0.45.0 - --config-reloader-cpu=150m - --log-level=debug - image: quay.io/prometheus-operator/prometheus-operator:v0.44.1 + image: quay.io/prometheus-operator/prometheus-operator:v0.45.0 name: prometheus-operator ports: - containerPort: 8080 @@ -41,18 +44,28 @@ spec: securityContext: allowPrivilegeEscalation: false - args: + - --logtostderr - --secure-listen-address=:8443 - - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256 + - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384,TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305,TLS_ECDHE_ECDSA_WITH_CHACHA20_POLY1305 - --upstream=http://127.0.0.1:8080/ image: quay.io/brancz/kube-rbac-proxy:v0.8.0 name: kube-rbac-proxy ports: - containerPort: 8443 name: https + resources: + limits: + cpu: 20m + 
memory: 40Mi + requests: + cpu: 10m + memory: 20Mi securityContext: - runAsUser: 65534 + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 nodeSelector: - beta.kubernetes.io/os: linux + kubernetes.io/os: linux securityContext: runAsNonRoot: true runAsUser: 65534 diff --git a/apps/monitoring/manifests/prometheus/rules/prometheus-operator.yaml b/apps/monitoring/manifests/prometheus-operator/prometheusRule.yaml similarity index 91% rename from apps/monitoring/manifests/prometheus/rules/prometheus-operator.yaml rename to apps/monitoring/manifests/prometheus-operator/prometheusRule.yaml index 834c77335..e7f141138 100644 --- a/apps/monitoring/manifests/prometheus/rules/prometheus-operator.yaml +++ b/apps/monitoring/manifests/prometheus-operator/prometheusRule.yaml @@ -1,9 +1,11 @@ ---- apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: labels: - prometheus: k8s + app.kubernetes.io/component: controller + app.kubernetes.io/name: prometheus-operator + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.45.0 role: alert-rules name: prometheus-operator-rules namespace: monitoring @@ -43,8 +45,9 @@ spec: severity: warning - alert: PrometheusOperatorReconcileErrors annotations: - description: '{{ $value | humanizePercentage }} of reconciling operations failed - for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.' + description: '{{ $value | humanizePercentage }} of reconciling operations + failed for {{ $labels.controller }} controller in {{ $labels.namespace }} + namespace.' summary: Errors while reconciling controller. expr: | (sum by (controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]))) / (sum by (controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="prometheus-operator"}[5m]))) > 0.1 @@ -74,8 +77,8 @@ spec: - alert: PrometheusOperatorRejectedResources annotations: description: Prometheus operator in {{ $labels.namespace }} namespace rejected - {{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource }} - resources. + {{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource + }} resources. 
summary: Resources rejected by Prometheus operator expr: | min_over_time(prometheus_operator_managed_resources{state="rejected",job="prometheus-operator"}[5m]) > 0 diff --git a/apps/monitoring/manifests/prometheus-operator/03_service.yaml b/apps/monitoring/manifests/prometheus-operator/service.yaml similarity index 74% rename from apps/monitoring/manifests/prometheus-operator/03_service.yaml rename to apps/monitoring/manifests/prometheus-operator/service.yaml index 3f8dd96e0..0a667c839 100644 --- a/apps/monitoring/manifests/prometheus-operator/03_service.yaml +++ b/apps/monitoring/manifests/prometheus-operator/service.yaml @@ -4,7 +4,8 @@ metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.40.0 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.45.0 name: prometheus-operator namespace: monitoring spec: @@ -16,3 +17,4 @@ spec: selector: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator + app.kubernetes.io/part-of: kube-prometheus diff --git a/apps/monitoring/manifests/prometheus-operator/serviceAccount.yaml b/apps/monitoring/manifests/prometheus-operator/serviceAccount.yaml new file mode 100644 index 000000000..7b8cd2896 --- /dev/null +++ b/apps/monitoring/manifests/prometheus-operator/serviceAccount.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/component: controller + app.kubernetes.io/name: prometheus-operator + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.45.0 + name: prometheus-operator + namespace: monitoring diff --git a/apps/monitoring/manifests/prometheus-operator/05_serviceMonitor.yaml b/apps/monitoring/manifests/prometheus-operator/serviceMonitor.yaml similarity index 75% rename from apps/monitoring/manifests/prometheus-operator/05_serviceMonitor.yaml rename to apps/monitoring/manifests/prometheus-operator/serviceMonitor.yaml index a775f4a67..8cf19eec5 100644 --- a/apps/monitoring/manifests/prometheus-operator/05_serviceMonitor.yaml +++ b/apps/monitoring/manifests/prometheus-operator/serviceMonitor.yaml @@ -4,7 +4,8 @@ metadata: labels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator - app.kubernetes.io/version: v0.40.0 + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.45.0 name: prometheus-operator namespace: monitoring spec: @@ -19,3 +20,5 @@ spec: matchLabels: app.kubernetes.io/component: controller app.kubernetes.io/name: prometheus-operator + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 0.45.0 diff --git a/apps/monitoring/manifests/prometheus/01_rbac.yaml b/apps/monitoring/manifests/prometheus/01_rbac.yaml deleted file mode 100644 index 0748279d7..000000000 --- a/apps/monitoring/manifests/prometheus/01_rbac.yaml +++ /dev/null @@ -1,71 +0,0 @@ ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: prometheus-k8s - namespace: monitoring ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRole -metadata: - name: prometheus-k8s -rules: -- apiGroups: [""] - resources: - - nodes - - nodes/metrics - - services - - endpoints - - pods - verbs: ["get", "list", "watch"] -- apiGroups: [""] - resources: - - configmaps - verbs: ["get"] -- nonResourceURLs: ["/metrics"] - verbs: ["get"] -- apiGroups: - - networking.k8s.io - resources: - - ingresses - verbs: ["get", "list", "watch"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - 
name: prometheus-k8s -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: prometheus-k8s -subjects: -- kind: ServiceAccount - name: prometheus-k8s - namespace: monitoring ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: prometheus-k8s-config - namespace: monitoring -rules: -- apiGroups: - - "" - resources: - - configmaps - verbs: - - get ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: prometheus-k8s-config - namespace: monitoring -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: prometheus-k8s-config -subjects: -- kind: ServiceAccount - name: prometheus-k8s - namespace: monitoring diff --git a/apps/monitoring/manifests/prometheus/02_service.yaml b/apps/monitoring/manifests/prometheus/02_service.yaml deleted file mode 100644 index 3e72d346e..000000000 --- a/apps/monitoring/manifests/prometheus/02_service.yaml +++ /dev/null @@ -1,22 +0,0 @@ ---- -apiVersion: v1 -kind: Service -metadata: - annotations: - ignore-check.kube-linter.io/dangling-service: "Check is incompatible with prometheus-operator CRDs" - labels: - prometheus: k8s - name: prometheus-k8s - namespace: monitoring -spec: - ports: - - name: web - port: 9090 - targetPort: web - - name: reloader - port: 8080 - targetPort: 8080 - selector: - app: prometheus - prometheus: k8s - sessionAffinity: ClientIP diff --git a/apps/monitoring/manifests/prometheus/04_serviceMonitor.yaml b/apps/monitoring/manifests/prometheus/04_serviceMonitor.yaml deleted file mode 100644 index 2ff833d2c..000000000 --- a/apps/monitoring/manifests/prometheus/04_serviceMonitor.yaml +++ /dev/null @@ -1,16 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: ServiceMonitor -metadata: - labels: - k8s-app: prometheus - name: prometheus - namespace: monitoring -spec: - endpoints: - - interval: 30s - port: web - - interval: 30s - port: reloader - selector: - matchLabels: - prometheus: k8s diff --git a/apps/monitoring/manifests/prometheus/additionalScrapeConfigs.yaml b/apps/monitoring/manifests/prometheus/additionalScrapeConfigs.yaml new file mode 100644 index 000000000..c0b165e9a --- /dev/null +++ b/apps/monitoring/manifests/prometheus/additionalScrapeConfigs.yaml @@ -0,0 +1,75 @@ +--- +apiVersion: secrets.mz.com/v1alpha1 +kind: ConfigMapSecret +metadata: + name: scrapeconfigs + namespace: monitoring +spec: + template: + metadata: + name: scrapeconfigs + data: + additional.yaml: | + - job_name: windows + static_configs: + - targets: + - '192.168.2.50:9182' + labels: + node: 'DESKTOP-ODOR2KB' + + - job_name: lancre + scrape_interval: 30s + scrape_timeout: 30s + scheme: 'https' + honor_labels: true + metrics_path: '/federate' + basic_auth: + username: $(USERNAME) + password: $(PASSWORD) + params: + 'match[]': + - '{job=~".+"}' + static_configs: + - targets: ['lancre.thaum.xyz'] + metric_relabel_configs: + - source_labels: [__name__] + regex: "prometheus_notifications_alertmanagers_discovered" + action: drop + - source_labels: ['instance', 'environment'] + regex: "(.*):(.*);(.*)" + replacement: '$1.$3:$2' + target_label: 'instance' + - source_labels: [state] + regex: '(activating|deactivating)' + action: drop + - source_labels: [type] + regex: '(oneshot|dbus|idle|notify)' + action: drop + vars: + - name: USERNAME + secretValue: + name: federation-creds + key: lancre_username + - name: PASSWORD + secretValue: + name: federation-creds + key: lancre_password +--- +apiVersion: bitnami.com/v1alpha1 +kind: SealedSecret +metadata: + creationTimestamp: null + name: 
federation-creds + namespace: monitoring +spec: + encryptedData: + lancre_password: AgDE3yAeoD/hj3t0sn8yhYYCK6Ca4xLH9O9S5s7zKP0vyCaP2BbvAbl9HYuYeXBZWNtHgPu8F0V9QNeDK04KRB1GJH600Ut0ZaYJXY0LjJCKC9wv4ax61UCOTGXj1MIEVtwckcV/1ob9wAQTgAMTTbcO8bKE5qhsNzV8R5H/A6YNEezo66PG1oM4RrF0H+GALX0ZOVXczS4uGBWHZNsqtRZD2PojeRZ/Yr3wunfVjfBNsBrK1y8qRSQsYUXDSw0lV+D8fwo25NAQ2nf+fHDFG04fdLr0gyGeIqczrakHcD8UECZzy3beX/dNUJLF7Xkepd/3hhAmcMcfcCJ8OpW56SX0wVoyAHuW9yFrqRpSiX3QHg9fJZws2jhqSZ1GoWu84h3+DX9f+G065fHqGV09m63ldE+OJREtsiapbZh3AhGtHbDUR5YJ7wFosLjpaSdiKZL+DxifOolv9gxP4uRtjQYsiV4C4onnwy9toZl+KC3wkLS+B8cXLzPeIYLQEMpCTUHpMr8iEDxJyWikyidQ3vlQybt47dsv9liQGYSh7NtB8SDuxWMSvgZRNEJaXBMVzYcncRcNkiqrC0kLE6cy3x9HbolzDYb+Iq8uKweFQ1ViSXvgHze2k0ZUgRQe8GD9Dw/IghHEEA+EBOh1/InotRjQ8lbVa8VdIv0GrERBOwJn/1cUlPcev6sg6S+AsghVkvjmqlU4ZscWoRJ3xTIkRZwIW7fjNLvUvQ5pSVuLQ2g= + lancre_username: AgB1Hr/k+79xhoZlsJ+YxKn2KGsbcB/lG5LW7OopcL/msF+dRy9Pb/kwoyPQEX8G60eLe3TEsWDV14MQTHJjl8QkT5FDrBTaRVaD9ysEojSq6f6G7LV/hoccDHVaC13bKcH8QthYkcpuM4pj1eje5B7aphO/2rSkx+fW3YjC3YP4JNFb0ay+O6RWd9RGo64hqlLG9TQcBgydzfjgTDYbQJExtTAYuh7FBbfJqnHbK2KGI19hGPFGNgybF5UN8lfw5YmPpxq9Essf1Zht3vhxh2bg8s9KZHtEPOh1GRhbtvdaEzviS44bbc5SepFs5XDOH3styBb3Z2nIcYVkJQoV8GnXKp9uaeRocleP13pxK5Atg6XwEVt0H85TtuLRLrroDKxaUqEPtnfksW/7dwZqdSQ7Ehfy1UeIvqMMDi9pYzcr6RpUeLcdLH+kErqEJMqz4lFGT+GSrcKPga0KbwLTJ/jpmTFVZ8POb2eMx1eNDcKlZuPXVihE41+bH6tYTb5XuWbGoExSlgL8opppxJ6JEo0/V8llFZ82v1H+R9J6spmlRXdD+bScob70EtC8wHpVkaAyqmtJmQD7h3IZj/70GUGbAcFErYR3+jVXufZskBCF4ex7A+GaaYzNmXpKkdQgum8p0wB1nCaz5EKHFmLa7PCTQp0Nqm4Fl0mSZ7LOnxN9SzqE/t5nbQCa4DLCyV6B1aTCwCFr3CANkyFi + template: + metadata: + annotations: + sealedsecrets.bitnami.com/managed: "true" + creationTimestamp: null + name: federation-creds + namespace: monitoring + type: Opaque diff --git a/apps/monitoring/manifests/prometheus/clusterRole.yaml b/apps/monitoring/manifests/prometheus/clusterRole.yaml new file mode 100644 index 000000000..efdf49339 --- /dev/null +++ b/apps/monitoring/manifests/prometheus/clusterRole.yaml @@ -0,0 +1,38 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.24.0 + name: prometheus-k8s +rules: +- apiGroups: + - "" + resources: + - nodes/metrics + verbs: + - get +- nonResourceURLs: + - /metrics + verbs: + - get +- apiGroups: + - "" + resources: + - services + - endpoints + - pods + verbs: + - get + - list + - watch +- apiGroups: + - networking.k8s.io + resources: + - ingresses + verbs: + - get + - list + - watch diff --git a/apps/monitoring/manifests/prometheus/clusterRoleBinding.yaml b/apps/monitoring/manifests/prometheus/clusterRoleBinding.yaml new file mode 100644 index 000000000..edb326984 --- /dev/null +++ b/apps/monitoring/manifests/prometheus/clusterRoleBinding.yaml @@ -0,0 +1,17 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.24.0 + name: prometheus-k8s +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus-k8s +subjects: +- kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring diff --git a/apps/monitoring/manifests/prometheus/06_ingress.yaml b/apps/monitoring/manifests/prometheus/ingress.yaml similarity index 52% rename from apps/monitoring/manifests/prometheus/06_ingress.yaml rename to 
apps/monitoring/manifests/prometheus/ingress.yaml
index 9b61c118b..cde675fb6 100644
--- a/apps/monitoring/manifests/prometheus/06_ingress.yaml
+++ b/apps/monitoring/manifests/prometheus/ingress.yaml
@@ -1,27 +1,26 @@
----
 apiVersion: networking.k8s.io/v1
 kind: Ingress
 metadata:
+  annotations:
+    cert-manager.io/cluster-issuer: letsencrypt-prod
+    kubernetes.io/ingress.class: nginx
+    nginx.ingress.kubernetes.io/auth-signin: https://auth.ankhmorpork.thaum.xyz/oauth2/start?rd=$scheme://$host$escaped_request_uri
+    nginx.ingress.kubernetes.io/auth-url: https://auth.ankhmorpork.thaum.xyz/oauth2/auth
   name: prometheus
   namespace: monitoring
-  annotations:
-    kubernetes.io/ingress.class: "nginx"
-    cert-manager.io/cluster-issuer: "letsencrypt-prod"
-    nginx.ingress.kubernetes.io/auth-url: "https://auth.ankhmorpork.thaum.xyz/oauth2/auth"
-    nginx.ingress.kubernetes.io/auth-signin: "https://auth.ankhmorpork.thaum.xyz/oauth2/start?rd=$scheme://$host$escaped_request_uri"
 spec:
-  tls:
-  - hosts:
-    - prometheus.ankhmorpork.thaum.xyz
-    secretName: prometheus-tls
   rules:
   - host: prometheus.ankhmorpork.thaum.xyz
     http:
       paths:
-      - path: /
-        pathType: Prefix
-        backend:
+      - backend:
           service:
             name: prometheus-k8s
             port:
               name: web
+        path: /
+        pathType: Prefix
+  tls:
+  - hosts:
+    - prometheus.ankhmorpork.thaum.xyz
+    secretName: prometheus-tls
diff --git a/apps/monitoring/manifests/prometheus/05_prometheus.yaml b/apps/monitoring/manifests/prometheus/prometheus.yaml
similarity index 77%
rename from apps/monitoring/manifests/prometheus/05_prometheus.yaml
rename to apps/monitoring/manifests/prometheus/prometheus.yaml
index ea5334d07..517698539 100644
--- a/apps/monitoring/manifests/prometheus/05_prometheus.yaml
+++ b/apps/monitoring/manifests/prometheus/prometheus.yaml
@@ -1,17 +1,18 @@
----
 apiVersion: monitoring.coreos.com/v1
 kind: Prometheus
 metadata:
   labels:
-    prometheus: k8s
-    app.kubernetes.io/version: 2.24.0
+    app.kubernetes.io/component: prometheus
     app.kubernetes.io/name: prometheus
+    app.kubernetes.io/part-of: kube-prometheus
+    app.kubernetes.io/version: 2.24.0
+    prometheus: k8s
   name: k8s
   namespace: monitoring
 spec:
   additionalScrapeConfigs:
-    name: scrapeconfigs
     key: additional.yaml
+    name: scrapeconfigs
   affinity:
     podAntiAffinity:
       preferredDuringSchedulingIgnoredDuringExecution:
@@ -28,38 +29,35 @@ spec:
         weight: 100
   alerting:
     alertmanagers:
-    - name: alertmanager-main
+    - apiVersion: v2
+      name: alertmanager-main
       namespace: monitoring
       port: web
   externalUrl: https://prometheus.ankhmorpork.thaum.xyz
   image: quay.io/prometheus/prometheus:v2.24.0
   nodeSelector:
-    kubernetes.io/os: linux
     kubernetes.io/arch: amd64
-    storage.infra/replicated: "true"
+    kubernetes.io/os: linux
+  podMetadata:
+    labels:
+      app.kubernetes.io/component: prometheus
+      app.kubernetes.io/name: prometheus
+      app.kubernetes.io/part-of: kube-prometheus
+      app.kubernetes.io/version: 2.24.0
   podMonitorNamespaceSelector: {}
   podMonitorSelector: {}
   probeNamespaceSelector: {}
   probeSelector: {}
-  podMetadata:
-    labels:
-      app.kubernetes.io/name: "prometheus"
   replicas: 2
-  retention: 7d
   resources:
+    limits:
+      cpu: "1"
     requests:
       cpu: 140m
       memory: 1900Mi
-    limits:
-      cpu: 1
+  retention: 7d
   ruleNamespaceSelector: {}
   ruleSelector: {}
-  # TODO: Figure out why longhorn doesn't like those and why PV needs manual chmod.
- # securityContext: - # fsGroup: 2000 - # runAsNonRoot: true - # runAsUser: 1000 - # RunAsGroup: 2000 serviceAccountName: prometheus-k8s serviceMonitorNamespaceSelector: {} serviceMonitorSelector: {} @@ -68,9 +66,10 @@ spec: metadata: name: promdata spec: - storageClassName: "local-path" # For performance reasons use local disk accessModes: - - ReadWriteOnce + - ReadWriteOnce resources: requests: storage: 40Gi + storageClassName: local-path + version: 2.24.0 diff --git a/apps/monitoring/manifests/prometheus/rules/prometheus.yaml b/apps/monitoring/manifests/prometheus/prometheusRule.yaml similarity index 61% rename from apps/monitoring/manifests/prometheus/rules/prometheus.yaml rename to apps/monitoring/manifests/prometheus/prometheusRule.yaml index 686cc4cf0..196fe2505 100644 --- a/apps/monitoring/manifests/prometheus/rules/prometheus.yaml +++ b/apps/monitoring/manifests/prometheus/prometheusRule.yaml @@ -2,9 +2,12 @@ apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: labels: - prometheus: k8s + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.24.0 role: alert-rules - name: prometheus-rules + name: k8s-rules namespace: monitoring spec: groups: @@ -12,128 +15,135 @@ spec: rules: - alert: PrometheusBadConfig annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration. + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to + reload its configuration. summary: Failed Prometheus configuration reload. expr: | # Without max_over_time, failed scrapes could create false negatives, see # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - max_over_time(prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="test"}[5m]) == 0 + max_over_time(prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring"}[5m]) == 0 for: 10m labels: severity: critical - alert: PrometheusNotificationQueueRunningFull annotations: - description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full. - summary: Prometheus alert notification queue predicted to run full in less than 30m. + description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} + is running full. + summary: Prometheus alert notification queue predicted to run full in less + than 30m. expr: | # Without min_over_time, failed scrapes could create false negatives, see # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. ( - predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="test"}[5m], 60 * 30) + predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="monitoring"}[5m], 60 * 30) > - min_over_time(prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="test"}[5m]) + min_over_time(prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="monitoring"}[5m]) ) for: 15m labels: severity: warning - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers annotations: - description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.' - summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager. 
+ description: '{{ printf "%.1f" $value }}% errors while sending alerts from + Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.' + summary: Prometheus has encountered more than 1% errors sending alerts to + a specific Alertmanager. expr: | ( - rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="test"}[5m]) + rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) / - rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="test"}[5m]) + rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) ) * 100 > 1 for: 15m labels: severity: warning - - alert: PrometheusErrorSendingAlertsToAnyAlertmanager - annotations: - description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.' - summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. - expr: | - min without(alertmanager) ( - rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="test"}[5m]) - / - rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="test"}[5m]) - ) - * 100 - > 3 - for: 15m - labels: - severity: critical - alert: PrometheusNotConnectedToAlertmanagers annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers. + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected + to any Alertmanagers. summary: Prometheus is not connected to any Alertmanagers. expr: | # Without max_over_time, failed scrapes could create false negatives, see # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. - max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="test"}[5m]) < 1 + max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="monitoring"}[5m]) < 1 for: 10m labels: severity: warning - alert: PrometheusTSDBReloadsFailing annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h. + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected + {{$value | humanize}} reload failures over the last 3h. summary: Prometheus has issues reloading blocks from disk. expr: | - increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="test"}[3h]) > 0 + increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0 for: 4h labels: severity: warning - alert: PrometheusTSDBCompactionsFailing annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h. + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected + {{$value | humanize}} compaction failures over the last 3h. summary: Prometheus has issues compacting blocks. expr: | - increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="test"}[3h]) > 0 + increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0 for: 4h labels: severity: warning - alert: PrometheusNotIngestingSamples annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples. + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting + samples. 
summary: Prometheus is not ingesting samples. expr: | - rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="test"}[5m]) <= 0 + ( + rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring"}[5m]) <= 0 + and + ( + sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="prometheus-k8s",namespace="monitoring"}) > 0 + or + sum without(rule_group) (prometheus_rule_group_rules{job="prometheus-k8s",namespace="monitoring"}) > 0 + ) + ) for: 10m labels: severity: warning - alert: PrometheusDuplicateTimestamps annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp. + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping + {{ printf "%.4g" $value }} samples/s with different values but duplicated + timestamp. summary: Prometheus is dropping samples with duplicate timestamps. expr: | - rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="test"}[5m]) > 0 + rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 for: 10m labels: severity: warning - alert: PrometheusOutOfOrderTimestamps annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order. + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping + {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order. summary: Prometheus drops samples with out-of-order timestamps. expr: | - rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="test"}[5m]) > 0 + rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 for: 10m labels: severity: warning - alert: PrometheusRemoteStorageFailures annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }} + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send + {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ + $labels.url }} summary: Prometheus fails to send samples to remote storage. expr: | ( - rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="test"}[5m]) + rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) / ( - rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="test"}[5m]) + rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) + - rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-k8s",namespace="test"}[5m]) + rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m]) ) ) * 100 @@ -143,15 +153,17 @@ spec: severity: critical - alert: PrometheusRemoteWriteBehind annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}. + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write + is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url + }}. summary: Prometheus remote write is behind. 
expr: | # Without max_over_time, failed scrapes could create false negatives, see # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. ( - max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-k8s",namespace="test"}[5m]) - - on(job, instance) group_right - max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-k8s",namespace="test"}[5m]) + max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-k8s",namespace="monitoring"}[5m]) + - ignoring(remote_name, url) group_right + max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-k8s",namespace="monitoring"}[5m]) ) > 120 for: 15m @@ -159,34 +171,69 @@ spec: severity: critical - alert: PrometheusRemoteWriteDesiredShards annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="test"}` $labels.instance | query | first | value }}. - summary: Prometheus remote write desired shards calculation wants to run more than configured max shards. + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write + desired shards calculation wants to run {{ $value }} shards for queue {{ + $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ + printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}` + $labels.instance | query | first | value }}. + summary: Prometheus remote write desired shards calculation wants to run more + than configured max shards. expr: | # Without max_over_time, failed scrapes could create false negatives, see # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. ( - max_over_time(prometheus_remote_storage_shards_desired{job="prometheus-k8s",namespace="test"}[5m]) + max_over_time(prometheus_remote_storage_shards_desired{job="prometheus-k8s",namespace="monitoring"}[5m]) > - max_over_time(prometheus_remote_storage_shards_max{job="prometheus-k8s",namespace="test"}[5m]) + max_over_time(prometheus_remote_storage_shards_max{job="prometheus-k8s",namespace="monitoring"}[5m]) ) for: 15m labels: severity: warning - alert: PrometheusRuleFailures annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m. + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to + evaluate {{ printf "%.0f" $value }} rules in the last 5m. summary: Prometheus is failing rule evaluations. expr: | - increase(prometheus_rule_evaluation_failures_total{job="prometheus-k8s",namespace="test"}[5m]) > 0 + increase(prometheus_rule_evaluation_failures_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 for: 15m labels: severity: critical - alert: PrometheusMissingRuleEvaluations annotations: - description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m. + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ + printf "%.0f" $value }} rule group evaluations in the last 5m. summary: Prometheus is missing rule evaluations due to slow rule group evaluation. 
expr: | - increase(prometheus_rule_group_iterations_missed_total{job="prometheus-k8s",namespace="test"}[5m]) > 0 + increase(prometheus_rule_group_iterations_missed_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 for: 15m labels: severity: warning + - alert: PrometheusTargetLimitHit + annotations: + description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped + {{ printf "%.0f" $value }} targets because the number of targets exceeded + the configured target_limit. + summary: Prometheus has dropped targets because some scrape configs have exceeded + the targets limit. + expr: | + increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 + for: 15m + labels: + severity: warning + - alert: PrometheusErrorSendingAlertsToAnyAlertmanager + annotations: + description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts + from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.' + summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. + expr: | + min without (alertmanager) ( + rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring",alertmanager!~``}[5m]) + / + rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring",alertmanager!~``}[5m]) + ) + * 100 + > 3 + for: 15m + labels: + severity: critical diff --git a/apps/monitoring/manifests/prometheus/roleBindingConfig.yaml b/apps/monitoring/manifests/prometheus/roleBindingConfig.yaml new file mode 100644 index 000000000..f0a88d7dd --- /dev/null +++ b/apps/monitoring/manifests/prometheus/roleBindingConfig.yaml @@ -0,0 +1,18 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.24.0 + name: prometheus-k8s-config + namespace: monitoring +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s-config +subjects: +- kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring diff --git a/apps/monitoring/manifests/prometheus/roleConfig.yaml b/apps/monitoring/manifests/prometheus/roleConfig.yaml new file mode 100644 index 000000000..2a7494633 --- /dev/null +++ b/apps/monitoring/manifests/prometheus/roleConfig.yaml @@ -0,0 +1,17 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.24.0 + name: prometheus-k8s-config + namespace: monitoring +rules: +- apiGroups: + - "" + resources: + - configmaps + verbs: + - get diff --git a/apps/monitoring/manifests/prometheus/rules/alertmanager.yaml b/apps/monitoring/manifests/prometheus/rules/alertmanager.yaml deleted file mode 100644 index 9c83b873a..000000000 --- a/apps/monitoring/manifests/prometheus/rules/alertmanager.yaml +++ /dev/null @@ -1,41 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - labels: - prometheus: k8s - role: alert-rules - name: alertmanager-rules - namespace: monitoring -spec: - groups: - - name: alertmanager.rules - rules: - - alert: AlertmanagerConfigInconsistent - annotations: - summary: Alertmanager configuration is out of sync - description: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync. 
- expr: | - count_values("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="test"}) BY (service) / ON(service) GROUP_LEFT() label_replace(max(prometheus_operator_spec_replicas{job="prometheus-operator",namespace="test",controller="alertmanager"}) by (name, job, namespace, controller), "service", "alertmanager-$1", "name", "(.*)") != 1 - for: 5m - labels: - severity: critical - - alert: AlertmanagerFailedReload - annotations: - summary: Reloading Alertmanager configuration failed - description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}. - expr: | - alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="test"} == 0 - for: 10m - labels: - severity: warning - - alert: AlertmanagerMembersInconsistent - annotations: - summary: Alertmanager other members of the cluster. - description: Alertmanager has not found all other members of the cluster. - expr: | - alertmanager_cluster_members{job="alertmanager-main",namespace="test"} - != on (service) GROUP_LEFT() - count by (service) (alertmanager_cluster_members{job="alertmanager-main",namespace="test"}) - for: 5m - labels: - severity: critical diff --git a/apps/monitoring/manifests/prometheus/rules/blackbox.yaml b/apps/monitoring/manifests/prometheus/rules/blackbox.yaml deleted file mode 100644 index 0f57ca485..000000000 --- a/apps/monitoring/manifests/prometheus/rules/blackbox.yaml +++ /dev/null @@ -1,60 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - labels: - prometheus: k8s - role: alert-rules - name: blackbox-rules - namespace: monitoring -spec: - groups: - - name: blackbox alert rules - rules: - - alert: ProbeFailed - expr: probe_success == 0 - for: 30m - labels: - severity: warning - annotations: - summary: "Blackbox probe failed" - description: "Probe against {{ $labels.instance }} failed" - - alert: StatusCode - expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400 - for: 30m - labels: - severity: warning - annotations: - summary: "HTTP status code is not 200-399" - description: "Status Code (instance {{ $labels.instance }})" - - alert: SslCertificateWillExpireSoon - expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 7 - for: 5m - labels: - severity: warning - annotations: - summary: "SSL certificate expires in 7 days" - description: "SSL certificate for '{{ $labels.instance }}' will expire soon" - - alert: SslCertificateHasExpired - expr: probe_ssl_earliest_cert_expiry - time() <= 0 - for: 5m - labels: - severity: critical - annotations: - summary: "SSL certificate has expired already" - description: "SSL certificate for '{{ $labels.instance }}' has expired" - - alert: BlackboxSlowRequests - expr: probe_http_duration_seconds > 2 - for: 10m - labels: - severity: warning - annotations: - summary: "Blackbox request took more than 2s" - description: "Blackbox slow requests against '{{ $labels.instance }}'" - - alert: BlackboxSlowPing - expr: probe_icmp_duration_seconds > 2 - for: 10m - labels: - severity: warning - annotations: - summary: "Ping took more than 2s" - description: "Blackbox slow ping against '{{ $labels.instance }}'" diff --git a/apps/monitoring/manifests/prometheus/rules/general.yaml b/apps/monitoring/manifests/prometheus/rules/general.yaml deleted file mode 100644 index bef16169b..000000000 --- a/apps/monitoring/manifests/prometheus/rules/general.yaml +++ /dev/null @@ -1,68 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - labels: - prometheus: k8s 
- role: alert-rules - name: general-rules - namespace: monitoring -spec: - groups: - - name: general.rules - rules: - - alert: Watchdog - annotations: - summary: "Alerting pipeline test alert" - description: | - This is an alert meant to ensure that the entire alerting pipeline is functional. - This alert is always firing, therefore it should always be firing in Alertmanager - and always fire against a receiver. There are integrations with various notification - mechanisms that send a notification when this alert is not firing. For example the - "DeadMansSnitch" integration in PagerDuty. - expr: vector(1) - labels: - severity: none - - alert: TargetsDown - annotations: - summary: "Prometheus targets went down" - description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.' - expr: 100 * (count(up{job!="windows"} == 0) BY (job, namespace, service) / count(up{job!="windows"}) BY (job, namespace, service)) > 10 - for: 10m - labels: - severity: warning - - alert: JobNotCompleted - expr: | - ((time() - (job_success_timestamp_seconds > 0)) > job_max_age_seconds) - or - (time() - job_start_timestamp_seconds > job_max_age_seconds and job_success_timestamp_seconds == 0) - for: 1m - labels: - severity: warning - annotations: - summary: "Cron job did not complete" - description: "Cron job {{ $labels.job }} has not started/completed in {{ $value | humanizeDuration }}" - runbook_url: "https://github.com/thaum-xyz/ankhmorpork/blob/master/docs/runbooks/JobCompletion.md" - - name: custom node alert rules - rules: - - alert: PackagesAvailable - expr: | - sum by (node,instance) (yum_upgrades_pending) > 200 - or - sum by (node,instance) (apt_upgrades_pending) > 200 - for: 48h - labels: - severity: info - annotations: - summary: "Packages are available for upgrade" - description: "{{ $value }} packages are available for upgrade. Maybe it is time to upgrade?" - runbook_url: "https://github.com/thaum-xyz/ankhmorpork/blob/master/docs/runbooks/PackagesAvailable.md" - - alert: RebootRequired - expr: "node_reboot_required > 0" - for: 4h - labels: - severity: info - annotations: - summary: "Reboot is required to finish package upgrade" - description: "Instance '{{ $labels.instance }}' was upgraded and now requires a reboot." 
- runbook_url: "https://github.com/thaum-xyz/ankhmorpork/blob/master/docs/runbooks/RebootRequired.md" - diff --git a/apps/monitoring/manifests/prometheus/rules/kube-prometheus.yaml b/apps/monitoring/manifests/prometheus/rules/kube-prometheus.yaml deleted file mode 100644 index 051ab52c7..000000000 --- a/apps/monitoring/manifests/prometheus/rules/kube-prometheus.yaml +++ /dev/null @@ -1,30 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - labels: - prometheus: k8s - role: alert-rules - name: kube-prometheus-rules - namespace: monitoring -spec: - groups: - - name: kube-prometheus-node-recording.rules - rules: - - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY (instance) - record: instance:node_cpu:rate:sum - - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance) - record: instance:node_network_receive_bytes:rate:sum - - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance) - record: instance:node_network_transmit_bytes:rate:sum - - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance) - record: instance:node_cpu:ratio - - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) - record: cluster:node_cpu:sum_rate5m - - expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu)) - record: cluster:node_cpu:ratio - - name: kube-prometheus-general.rules - rules: - - expr: count without(instance, pod, node) (up == 1) - record: count:up1 - - expr: count without(instance, pod, node) (up == 0) - record: count:up0 diff --git a/apps/monitoring/manifests/prometheus/rules/node-exporter.yaml b/apps/monitoring/manifests/prometheus/rules/node-exporter.yaml deleted file mode 100644 index 4e97b766e..000000000 --- a/apps/monitoring/manifests/prometheus/rules/node-exporter.yaml +++ /dev/null @@ -1,277 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - labels: - prometheus: k8s - role: alert-rules - name: node-exporter-rules - namespace: monitoring -spec: - groups: - - name: node-exporter.rules - rules: - - expr: | - count without (cpu) ( - count without (mode) ( - node_cpu_seconds_total{job=~"node|node-exporter"} - ) - ) - record: instance:node_num_cpu:sum - - expr: | - 1 - avg without (cpu, mode) ( - rate(node_cpu_seconds_total{job=~"node|node-exporter", mode="idle"}[1m]) - ) - record: instance:node_cpu_utilisation:rate1m - - expr: | - ( - node_load1{job=~"node|node-exporter"} - / - instance:node_num_cpu:sum{job=~"node|node-exporter"} - ) - record: instance:node_load1_per_cpu:ratio - - expr: | - 1 - ( - node_memory_MemAvailable_bytes{job=~"node|node-exporter"} - / - node_memory_MemTotal_bytes{job=~"node|node-exporter"} - ) - record: instance:node_memory_utilisation:ratio - - expr: | - rate(node_vmstat_pgmajfault{job=~"node|node-exporter"}[1m]) - record: instance:node_vmstat_pgmajfault:rate1m - - expr: | - rate(node_disk_io_time_seconds_total{job=~"node|node-exporter", device="nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m]) - record: instance_device:node_disk_io_time_seconds:rate1m - - expr: | - rate(node_disk_io_time_weighted_seconds_total{job=~"node|node-exporter", device="nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m]) - record: instance_device:node_disk_io_time_weighted_seconds:rate1m - - expr: | - sum without (device) ( - rate(node_network_receive_bytes_total{job=~"node|node-exporter", 
device!="lo"}[1m]) - ) - record: instance:node_network_receive_bytes_excluding_lo:rate1m - - expr: | - sum without (device) ( - rate(node_network_transmit_bytes_total{job=~"node|node-exporter", device!="lo"}[1m]) - ) - record: instance:node_network_transmit_bytes_excluding_lo:rate1m - - expr: | - sum without (device) ( - rate(node_network_receive_drop_total{job=~"node|node-exporter", device!="lo"}[1m]) - ) - record: instance:node_network_receive_drop_excluding_lo:rate1m - - expr: | - sum without (device) ( - rate(node_network_transmit_drop_total{job=~"node|node-exporter", device!="lo"}[1m]) - ) - record: instance:node_network_transmit_drop_excluding_lo:rate1m - - name: node-exporter - rules: - - alert: NodeFilesystemSpaceFillingUp - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - only {{ printf "%.2f" $value }}% available space left and is filling up. - summary: Filesystem is predicted to run out of space within the next 24 hours. - expr: | - ( - node_filesystem_avail_bytes{job=~"node|node-exporter",fstype!~"overlay|nsfs"} / node_filesystem_size_bytes{job=~"node|node-exporter",fstype!~"overlay|nsfs"} * 100 < 40 - and - predict_linear(node_filesystem_avail_bytes{job=~"node|node-exporter",fstype!~"overlay|nsfs"}[6h], 24*60*60) < 0 - and - node_filesystem_readonly{job=~"node|node-exporter",fstype!~"overlay|nsfs"} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemSpaceFillingUp - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - only {{ printf "%.2f" $value }}% available space left and is filling up fast. - summary: Filesystem is predicted to run out of space within the next 4 hours. - expr: | - ( - node_filesystem_avail_bytes{job=~"node|node-exporter",fstype!~"overlay|nsfs"} / node_filesystem_size_bytes{job=~"node|node-exporter",fstype!~"overlay|nsfs"} * 100 < 20 - and - predict_linear(node_filesystem_avail_bytes{job=~"node|node-exporter",fstype!~"overlay|nsfs"}[6h], 4*60*60) < 0 - and - node_filesystem_readonly{job=~"node|node-exporter",fstype!~"overlay|nsfs"} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeFilesystemAlmostOutOfSpace - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - only {{ printf "%.2f" $value }}% available space left. - summary: Filesystem has less than 5% space left. - expr: | - ( - node_filesystem_avail_bytes{job=~"node|node-exporter",fstype!~"overlay|nsfs"} / node_filesystem_size_bytes{job=~"node|node-exporter",fstype!~"overlay|nsfs"} * 100 < 5 - and - node_filesystem_readonly{job=~"node|node-exporter",fstype!~"overlay|nsfs"} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemAlmostOutOfSpace - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - only {{ printf "%.2f" $value }}% available space left. - summary: Filesystem has less than 3% space left. - expr: | - ( - node_filesystem_avail_bytes{job=~"node|node-exporter",fstype!~"overlay|nsfs"} / node_filesystem_size_bytes{job=~"node|node-exporter",fstype!~"overlay|nsfs"} * 100 < 3 - and - node_filesystem_readonly{job=~"node|node-exporter",fstype!~"overlay|nsfs"} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeFilesystemFilesFillingUp - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - only {{ printf "%.2f" $value }}% available inodes left and is filling up. 
- summary: Filesystem is predicted to run out of inodes within the next 24 hours. - expr: | - ( - node_filesystem_files_free{job=~"node|node-exporter",fstype!~"overlay|nsfs"} / node_filesystem_files{job=~"node|node-exporter",fstype!~"overlay|nsfs"} * 100 < 40 - and - predict_linear(node_filesystem_files_free{job=~"node|node-exporter",fstype!~"overlay|nsfs"}[6h], 24*60*60) < 0 - and - node_filesystem_readonly{job=~"node|node-exporter",fstype!~"overlay|nsfs"} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemFilesFillingUp - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - only {{ printf "%.2f" $value }}% available inodes left and is filling up fast. - summary: Filesystem is predicted to run out of inodes within the next 4 hours. - expr: | - ( - node_filesystem_files_free{job=~"node|node-exporter",fstype!~"overlay|nsfs"} / node_filesystem_files{job=~"node|node-exporter",fstype!~"overlay|nsfs"} * 100 < 20 - and - predict_linear(node_filesystem_files_free{job=~"node|node-exporter",fstype!~"overlay|nsfs"}[6h], 4*60*60) < 0 - and - node_filesystem_readonly{job=~"node|node-exporter",fstype!~"overlay|nsfs"} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeFilesystemAlmostOutOfFiles - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - only {{ printf "%.2f" $value }}% available inodes left. - summary: Filesystem has less than 5% inodes left. - expr: | - ( - node_filesystem_files_free{job=~"node|node-exporter",fstype!~"overlay|nsfs"} / node_filesystem_files{job=~"node|node-exporter",fstype!~"overlay|nsfs"} * 100 < 5 - and - node_filesystem_readonly{job=~"node|node-exporter",fstype!~"overlay|nsfs"} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemAlmostOutOfFiles - annotations: - description: Filesystem on {{ $labels.device }} at {{ $labels.instance }} has - only {{ printf "%.2f" $value }}% available inodes left. - summary: Filesystem has less than 3% inodes left. - expr: | - ( - node_filesystem_files_free{job=~"node|node-exporter",fstype!~"overlay|nsfs"} / node_filesystem_files{job=~"node|node-exporter",fstype!~"overlay|nsfs"} * 100 < 3 - and - node_filesystem_readonly{job=~"node|node-exporter",fstype!~"overlay|nsfs"} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeNetworkReceiveErrs - annotations: - description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered - {{ printf "%.0f" $value }} receive errors in the last two minutes.' - summary: Network interface is reporting many receive errors. - expr: | - increase(node_network_receive_errs_total[2m]) > 10 - for: 1h - labels: - severity: warning - - alert: NodeNetworkTransmitErrs - annotations: - description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered - {{ printf "%.0f" $value }} transmit errors in the last two minutes.' - summary: Network interface is reporting many transmit errors. - expr: | - increase(node_network_transmit_errs_total[2m]) > 10 - for: 1h - labels: - severity: warning - - alert: NodeHighNumberConntrackEntriesUsed - annotations: - description: '{{ $value | humanizePercentage }} of conntrack entries are used.' - summary: Number of conntrack are getting close to the limit. - expr: | - (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75 - labels: - severity: warning - - alert: NodeTextFileCollectorScrapeError - annotations: - description: Node Exporter text file collector failed to scrape. 
- summary: Node Exporter text file collector failed to scrape. - expr: | - node_textfile_scrape_error{job=~"node|node-exporter"} == 1 - labels: - severity: warning - - alert: NodeClockSkewDetected - annotations: - message: Clock on {{ $labels.instance }} is out of sync by more than 300s. Ensure - NTP is configured correctly on this host. - summary: Clock skew detected. - expr: | - ( - node_timex_offset_seconds > 0.05 - and - deriv(node_timex_offset_seconds[5m]) >= 0 - ) - or - ( - node_timex_offset_seconds < -0.05 - and - deriv(node_timex_offset_seconds[5m]) <= 0 - ) - for: 10m - labels: - severity: warning - - alert: NodeClockNotSynchronising - annotations: - message: Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is - configured on this host. - summary: Clock not synchronising. - expr: | - min_over_time(node_timex_sync_status[5m]) == 0 - and - node_timex_maxerror_seconds >= 16 - for: 10m - labels: - severity: warning - - alert: NodeRAIDDegraded - annotations: - description: RAID array '{{ $labels.device }}' on {{ $labels.instance }} is - in degraded state due to one or more disks failures. Number of spare drives - is insufficient to fix issue automatically. - summary: RAID Array is degraded - expr: | - node_md_disks_required - ignoring (state) (node_md_disks{state="active"}) > 0 - for: 15m - labels: - severity: critical - - alert: NodeRAIDDiskFailure - annotations: - description: At least one device in RAID array on {{ $labels.instance }} failed. - Array '{{ $labels.device }}' needs attention and possibly a disk swap. - summary: Failed device in RAID array - expr: | - node_md_disks{state="fail"} > 0 - labels: - severity: warning diff --git a/apps/monitoring/manifests/prometheus/rules/testing.yaml b/apps/monitoring/manifests/prometheus/rules/testing.yaml deleted file mode 100644 index 65659d2a5..000000000 --- a/apps/monitoring/manifests/prometheus/rules/testing.yaml +++ /dev/null @@ -1,24 +0,0 @@ ---- -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - labels: - prometheus: k8s - role: alert-rules - name: testing-rules - namespace: monitoring -spec: - groups: - # Rules which are tested and should be promoted upstream when ready - - name: testing.rules - rules: - - alert: CPUStealTimeHigh - expr: | - sum by (instance) (rate(node_cpu_seconds_total{mode="steal"}[3m])) / count by (instance) (node_cpu_seconds_total{mode="steal"}) > 0.1 - for: 20m - labels: - severity: warning - annotations: - summary: "High CPU Steal Time" - description: "CPU Steal Time is very high on {{ $labels.instance }} hypervisor. This can lead to VM being stalled." - runbook_url: "https://github.com/thaum-xyz/ankhmorpork/blob/master/docs/runbooks/CPUStealTimeHigh.md" diff --git a/apps/monitoring/manifests/prometheus/rules/thaum.yaml b/apps/monitoring/manifests/prometheus/rules/thaum.yaml deleted file mode 100644 index 2649dc8bc..000000000 --- a/apps/monitoring/manifests/prometheus/rules/thaum.yaml +++ /dev/null @@ -1,56 +0,0 @@ -apiVersion: monitoring.coreos.com/v1 -kind: PrometheusRule -metadata: - labels: - prometheus: k8s - role: alert-rules - name: thaum-rules - namespace: monitoring -spec: - groups: - - name: alert rules specific to thaum.xyz - rules: - - alert: FederatedPrometheusDown - expr: 'up{job="lancre"} == 0' - for: 20m - labels: - severity: warning - annotations: - summary: "Federated prometheus is down" - description: "Remote Prometheus server {{ $labels.instance }} has been down for more than 10 minutes." 
- runbook_url: "https://github.com/thaum-xyz/ankhmorpork/blob/master/docs/runbooks/FederatedPrometheusDown.md" - - alert: FilesystemReadOnly - expr: | - node_filesystem_readonly{fstype=~"(vfat|ext4|xfs)"} != 0 - labels: - severity: critical - annotations: - summary: "Filesystem went read-only possibly due to device error." - description: "Filesystem went read-only on {{ $labels.instance }}. Check FS for possible corruption." - - alert: TouchscreenNotAvailable - expr: | - devices_input_touchscreen_up == 0 or absent(devices_input_touchscreen_up) - for: 10m - labels: - severity: warning - annotations: - summary: "Touchscreen not available" - description: "Powercycle device {{ $labels.instance }} to bring touchscreen up" - - alert: TouchscreenNotAvailable - expr: | - devices_input_touchscreen_up == 0 or absent(devices_input_touchscreen_up) - for: 1h - labels: - severity: critical - annotations: - summary: "Touchscreen not available and automatic remediation failed to restore it" - description: "Powercycle device {{ $labels.instance }}" - - alert: TemperaturesNotAvailable - expr: | - absent(evok_temperature_celsius) - for: 15m - labels: - severity: critical - annotations: - summary: "Cannot obtain temperature data" - description: "Temperature data is gone. Immediatelly switch off all relays and check OW bus." diff --git a/apps/monitoring/manifests/prometheus/service.yaml b/apps/monitoring/manifests/prometheus/service.yaml new file mode 100644 index 000000000..9b07a7cb4 --- /dev/null +++ b/apps/monitoring/manifests/prometheus/service.yaml @@ -0,0 +1,26 @@ +apiVersion: v1 +kind: Service +metadata: + annotations: + ignore-check.kube-linter.io/dangling-service: Check is incompatible with prometheus-operator + CRDs + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.24.0 + prometheus: k8s + name: prometheus-k8s + namespace: monitoring +spec: + ports: + - name: web + port: 9090 + targetPort: web + selector: + app: prometheus + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + prometheus: k8s + sessionAffinity: ClientIP diff --git a/apps/monitoring/manifests/prometheus/serviceAccount.yaml b/apps/monitoring/manifests/prometheus/serviceAccount.yaml new file mode 100644 index 000000000..283b0821c --- /dev/null +++ b/apps/monitoring/manifests/prometheus/serviceAccount.yaml @@ -0,0 +1,10 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.24.0 + name: prometheus-k8s + namespace: monitoring diff --git a/apps/monitoring/manifests/prometheus/serviceMonitor.yaml b/apps/monitoring/manifests/prometheus/serviceMonitor.yaml new file mode 100644 index 000000000..a36e39417 --- /dev/null +++ b/apps/monitoring/manifests/prometheus/serviceMonitor.yaml @@ -0,0 +1,20 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + app.kubernetes.io/version: 2.24.0 + name: prometheus + namespace: monitoring +spec: + endpoints: + - interval: 30s + port: web + selector: + matchLabels: + app.kubernetes.io/component: prometheus + app.kubernetes.io/name: prometheus + app.kubernetes.io/part-of: kube-prometheus + prometheus: k8s diff --git 
a/apps/monitoring/manifests/prometheus-operator/control-plane-components/serviceMonitorCoreDNS.yaml b/apps/monitoring/manifests/prometheus/serviceMonitorCoreDNS.yaml similarity index 86% rename from apps/monitoring/manifests/prometheus-operator/control-plane-components/serviceMonitorCoreDNS.yaml rename to apps/monitoring/manifests/prometheus/serviceMonitorCoreDNS.yaml index 633aa18cf..f6403e8b5 100644 --- a/apps/monitoring/manifests/prometheus-operator/control-plane-components/serviceMonitorCoreDNS.yaml +++ b/apps/monitoring/manifests/prometheus/serviceMonitorCoreDNS.yaml @@ -2,7 +2,8 @@ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: labels: - k8s-app: coredns + app.kubernetes.io/name: coredns + k8s-app: kube-dns name: coredns namespace: monitoring spec: diff --git a/apps/monitoring/manifests/prometheus-operator/control-plane-components/serviceMonitorKubelet.yaml b/apps/monitoring/manifests/prometheus/serviceMonitorKubelet.yaml similarity index 84% rename from apps/monitoring/manifests/prometheus-operator/control-plane-components/serviceMonitorKubelet.yaml rename to apps/monitoring/manifests/prometheus/serviceMonitorKubelet.yaml index 8459a7634..72dff3fb9 100644 --- a/apps/monitoring/manifests/prometheus-operator/control-plane-components/serviceMonitorKubelet.yaml +++ b/apps/monitoring/manifests/prometheus/serviceMonitorKubelet.yaml @@ -2,7 +2,7 @@ apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: labels: - k8s-app: kubelet + app.kubernetes.io/name: kubelet name: kubelet namespace: monitoring spec: @@ -53,6 +53,7 @@ spec: insecureSkipVerify: true - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token honorLabels: true + honorTimestamps: false interval: 30s metricRelabelings: - action: drop @@ -80,58 +81,6 @@ spec: scheme: https tlsConfig: insecureSkipVerify: true - - interval: 30s - port: https-metrics - relabelings: - - action: replace - regex: (.+)(?::\d+) - replacement: $1:9100 - sourceLabels: - - __address__ - targetLabel: __address__ - - action: replace - replacement: node-exporter - sourceLabels: - - endpoint - targetLabel: endpoint - - action: replace - replacement: node-exporter - targetLabel: job - - interval: 30s - port: https-metrics - relabelings: - - action: replace - regex: (.+)(?::\d+) - replacement: $1:9558 - sourceLabels: - - __address__ - targetLabel: __address__ - - action: replace - replacement: systemd-exporter - sourceLabels: - - endpoint - targetLabel: endpoint - - action: replace - replacement: systemd-exporter - targetLabel: job - - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - honorLabels: true - interval: 30s - metricRelabelings: - - action: drop - regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s) - sourceLabels: - - __name__ - path: /metrics/resource - port: https-metrics - relabelings: - - sourceLabels: - - __metrics_path__ - targetLabel: metrics_path - scheme: https - tlsConfig: - insecureSkipVerify: true - jobLabel: k8s-app namespaceSelector: matchNames: diff --git a/apps/monitoring/manifests/pushgateway/02_deployment.yaml b/apps/monitoring/manifests/pushgateway/deployment.yaml similarity index 61% rename from apps/monitoring/manifests/pushgateway/02_deployment.yaml rename to apps/monitoring/manifests/pushgateway/deployment.yaml index 7d12d874d..ad36a31d2 100644 --- a/apps/monitoring/manifests/pushgateway/02_deployment.yaml +++ b/apps/monitoring/manifests/pushgateway/deployment.yaml @@ -1,31 +1,36 @@ ---- apiVersion: apps/v1 kind: 
Deployment metadata: - name: pushgateway - namespace: monitoring labels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: pushgateway app.kubernetes.io/version: 1.2.0 - app.kubernetes.io/component: exporter - app.kubernetes.io/managed-by: argocd + name: pushgateway + namespace: monitoring spec: + replicas: 1 selector: matchLabels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: pushgateway template: metadata: labels: - app.kubernetes.io/name: pushgateway app.kubernetes.io/component: exporter + app.kubernetes.io/name: pushgateway + app.kubernetes.io/version: 1.2.0 spec: containers: - - name: pushgateway - image: quay.io/prometheus/pushgateway:v1.2.0 + - image: quay.io/prometheus/pushgateway:v1.2.0 + name: pushgateway ports: - - name: http-push - containerPort: 9091 + - containerPort: 9091 + name: http-push resources: requests: cpu: 10m memory: 12Mi + securityContext: + runAsNonRoot: true + runAsUser: 65534 + serviceAccountName: pushgateway diff --git a/apps/monitoring/manifests/pushgateway/01_service.yaml b/apps/monitoring/manifests/pushgateway/service.yaml similarity index 85% rename from apps/monitoring/manifests/pushgateway/01_service.yaml rename to apps/monitoring/manifests/pushgateway/service.yaml index 006ad2e51..db139d018 100644 --- a/apps/monitoring/manifests/pushgateway/01_service.yaml +++ b/apps/monitoring/manifests/pushgateway/service.yaml @@ -1,11 +1,11 @@ ---- apiVersion: v1 kind: Service metadata: labels: - app.kubernetes.io/name: pushgateway app.kubernetes.io/component: exporter - name: push + app.kubernetes.io/name: pushgateway + app.kubernetes.io/version: 1.2.0 + name: pushgateway namespace: monitoring spec: ports: @@ -14,5 +14,5 @@ spec: protocol: TCP targetPort: http-push selector: - app.kubernetes.io/name: pushgateway app.kubernetes.io/component: exporter + app.kubernetes.io/name: pushgateway diff --git a/apps/monitoring/manifests/pushgateway/serviceAccount.yaml b/apps/monitoring/manifests/pushgateway/serviceAccount.yaml new file mode 100644 index 000000000..48bca8fc8 --- /dev/null +++ b/apps/monitoring/manifests/pushgateway/serviceAccount.yaml @@ -0,0 +1,9 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: pushgateway + app.kubernetes.io/version: 1.2.0 + name: pushgateway + namespace: monitoring diff --git a/apps/monitoring/manifests/pushgateway/03_servicemonitor.yaml b/apps/monitoring/manifests/pushgateway/serviceMonitor.yaml similarity index 51% rename from apps/monitoring/manifests/pushgateway/03_servicemonitor.yaml rename to apps/monitoring/manifests/pushgateway/serviceMonitor.yaml index 5acf5b13e..72caeec64 100644 --- a/apps/monitoring/manifests/pushgateway/03_servicemonitor.yaml +++ b/apps/monitoring/manifests/pushgateway/serviceMonitor.yaml @@ -1,14 +1,18 @@ ---- apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: pushgateway + app.kubernetes.io/version: 1.2.0 name: pushgateway namespace: monitoring spec: endpoints: - - interval: 30s + - honorLabels: true + interval: 30s port: http-push - honorLabels: true selector: matchLabels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: pushgateway diff --git a/apps/monitoring/manifests/smokeping/03_deployment.yaml b/apps/monitoring/manifests/smokeping/deployment.yaml similarity index 62% rename from apps/monitoring/manifests/smokeping/03_deployment.yaml rename to 
apps/monitoring/manifests/smokeping/deployment.yaml index 9ad2421bc..572fbbefd 100644 --- a/apps/monitoring/manifests/smokeping/03_deployment.yaml +++ b/apps/monitoring/manifests/smokeping/deployment.yaml @@ -1,28 +1,29 @@ ---- apiVersion: apps/v1 kind: Deployment metadata: labels: - app.kubernetes.io/name: smokeping - app.kubernetes.io/version: latest app.kubernetes.io/component: exporter + app.kubernetes.io/name: smokeping + app.kubernetes.io/version: 1.2.0 name: smokeping namespace: monitoring spec: replicas: 2 selector: matchLabels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: smokeping template: metadata: labels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: smokeping + app.kubernetes.io/version: 1.2.0 spec: affinity: podAntiAffinity: preferredDuringSchedulingIgnoredDuringExecution: - - weight: 10 - podAffinityTerm: + - podAffinityTerm: labelSelector: matchExpressions: - key: app.kubernetes.io/name @@ -30,36 +31,37 @@ spec: values: - smokeping topologyKey: kubernetes.io/hostname + weight: 100 containers: - - image: quay.io/superq/smokeping-prober-linux-arm64:master - imagePullPolicy: IfNotPresent + - args: + - 8.8.8.8 + - 1.1.1.1 + - lancre.thaum.xyz + - krupa.net.pl + - cloud.krupa.net.pl + - pawel.krupa.net.pl + image: quay.io/superq/smokeping-prober:v0.4.1 name: smokeping - args: - - "8.8.8.8" - - "1.1.1.1" - - "lancre.thaum.xyz" - - "rim.thaum.xyz" - - "krupa.net.pl" - - "cloud.krupa.net.pl" - - "pawel.krupa.net.pl" ports: - - containerPort: 9374 - name: http-smokeping + - containerPort: 9374 + name: http readinessProbe: - tcpSocket: - port: http-smokeping - initialDelaySeconds: 1 failureThreshold: 5 + initialDelaySeconds: 1 + tcpSocket: + port: http timeoutSeconds: 10 - securityContext: - capabilities: - add: ["NET_RAW"] resources: + limits: + memory: 70Mi requests: cpu: 40m memory: 30Mi - limits: - memory: 70Mi - restartPolicy: Always - nodeSelector: - kubernetes.io/arch: arm64 + securityContext: + capabilities: + add: + - NET_RAW + securityContext: + runAsNonRoot: true + runAsUser: 65534 + serviceAccountName: smokeping diff --git a/apps/monitoring/manifests/smokeping/02_service.yaml b/apps/monitoring/manifests/smokeping/service.yaml similarity index 70% rename from apps/monitoring/manifests/smokeping/02_service.yaml rename to apps/monitoring/manifests/smokeping/service.yaml index 8b932c12a..054b4cd8f 100644 --- a/apps/monitoring/manifests/smokeping/02_service.yaml +++ b/apps/monitoring/manifests/smokeping/service.yaml @@ -1,17 +1,18 @@ ---- apiVersion: v1 kind: Service metadata: labels: - app.kubernetes.io/name: smokeping app.kubernetes.io/component: exporter + app.kubernetes.io/name: smokeping + app.kubernetes.io/version: 1.2.0 name: smokeping namespace: monitoring spec: ports: - - name: http-smokeping + - name: http port: 9374 protocol: TCP - targetPort: http-smokeping + targetPort: http selector: + app.kubernetes.io/component: exporter app.kubernetes.io/name: smokeping diff --git a/apps/monitoring/manifests/smokeping/serviceAccount.yaml b/apps/monitoring/manifests/smokeping/serviceAccount.yaml new file mode 100644 index 000000000..078b06066 --- /dev/null +++ b/apps/monitoring/manifests/smokeping/serviceAccount.yaml @@ -0,0 +1,9 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: smokeping + app.kubernetes.io/version: 1.2.0 + name: smokeping + namespace: monitoring diff --git a/apps/monitoring/manifests/smokeping/05_servicemonitor.yaml 
b/apps/monitoring/manifests/smokeping/serviceMonitor.yaml similarity index 53% rename from apps/monitoring/manifests/smokeping/05_servicemonitor.yaml rename to apps/monitoring/manifests/smokeping/serviceMonitor.yaml index 7eb7d885e..51b58c105 100644 --- a/apps/monitoring/manifests/smokeping/05_servicemonitor.yaml +++ b/apps/monitoring/manifests/smokeping/serviceMonitor.yaml @@ -1,13 +1,17 @@ ---- apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor metadata: + labels: + app.kubernetes.io/component: exporter + app.kubernetes.io/name: smokeping + app.kubernetes.io/version: 1.2.0 name: smokeping namespace: monitoring spec: endpoints: - interval: 30s - port: http-smokeping + port: http selector: matchLabels: + app.kubernetes.io/component: exporter app.kubernetes.io/name: smokeping diff --git a/base/argocd/apps/monitoring.yaml b/base/argocd/apps/monitoring.yaml index d9fbe8dde..2d5bc994a 100644 --- a/base/argocd/apps/monitoring.yaml +++ b/base/argocd/apps/monitoring.yaml @@ -71,24 +71,24 @@ spec: prune: false selfHeal: false ---- -apiVersion: argoproj.io/v1alpha1 -kind: Application -metadata: - name: holiday -spec: - destination: - namespace: monitoring - server: 'https://kubernetes.default.svc' - source: - path: apps/monitoring/manifests/holiday - repoURL: 'https://github.com/thaum-xyz/ankhmorpork.git' - targetRevision: HEAD - project: monitoring - syncPolicy: - automated: - prune: false - selfHeal: false +#--- +#apiVersion: argoproj.io/v1alpha1 +#kind: Application +#metadata: +# name: holiday +#spec: +# destination: +# namespace: monitoring +# server: 'https://kubernetes.default.svc' +# source: +# path: apps/monitoring/manifests/holiday +# repoURL: 'https://github.com/thaum-xyz/ankhmorpork.git' +# targetRevision: HEAD +# project: monitoring +# syncPolicy: +# automated: +# prune: false +# selfHeal: false --- apiVersion: argoproj.io/v1alpha1 @@ -100,7 +100,7 @@ spec: namespace: monitoring server: 'https://kubernetes.default.svc' source: - path: apps/monitoring/manifests/ksm + path: apps/monitoring/manifests/kube-state-metrics repoURL: 'https://github.com/thaum-xyz/ankhmorpork.git' targetRevision: HEAD project: monitoring @@ -208,4 +208,23 @@ spec: prune: true selfHeal: true +--- +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: other-prometheus-rules +spec: + destination: + namespace: monitoring + server: 'https://kubernetes.default.svc' + source: + path: apps/monitoring/manifests/other + repoURL: 'https://github.com/thaum-xyz/ankhmorpork.git' + targetRevision: HEAD + project: monitoring + syncPolicy: + automated: + prune: false + selfHeal: false + diff --git a/hack/checksecrets.sh b/hack/checksecrets.sh index 070cf00e9..bf21a9dff 100755 --- a/hack/checksecrets.sh +++ b/hack/checksecrets.sh @@ -6,6 +6,7 @@ cd "$(git rev-parse --show-toplevel)" EXCLUDES=$( cat < "$DIR/$(basename "$crd" | sed 's/-crd.libsonnet/.json/')" +done diff --git a/hack/rebootstorage.sh b/hack/rebootstorage.sh new file mode 100644 index 000000000..3268c25d2 --- /dev/null +++ b/hack/rebootstorage.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# Main purpose of this script is to allow rebooting node which is used as +# a backend for external-nfs storage class. This operation is disruptive +# to some workloads hosted in cluster. +# Following actions are performed by the script: +# 1. Cordoning and draining node +# 2. Scaling argocd application server to 0 (prevents auto recovery) +# 3. Detecting workloads using external-nfs storage class +# 4. Scaling workloads from #3 to 0 +# 5. 
SSH into node and executing reboot +# 6. Uncordoning node +# 7. Scaling workloads from #4 to 1 +# 8. Scaling argocd to 1 to perform further recovery + +NODE="hyper01" + +kubectl cordon "${NODE}" +kubectl drain "${NODE}" --delete-emptydir-data --ignore-daemonsets + +sleep 180 # TODO: convert into proper check to see if all resources were moved + +#kubectl scale --replicas=0 deployment -n argocd argocd-application-controller + + + +ssh "${NODE}" reboot + +sleep 180 # TODO: convert into proper check to see if node is up and available +# kubectl get nodes | grep hyper01 | grep -v NotReady + +kubectl uncordon "${NODE}" + +#kubectl scale --replicas=1 deployment -n argocd argocd-application-controller
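The two "sleep 180" placeholders above are exactly where the script's TODOs ask for a proper check. A minimal sketch of what such checks could look like, assuming the script's own NODE variable and only standard kubectl subcommands; this is an illustration, not part of the commit:

# Sketch only: possible replacements for the two "sleep 180" TODOs in hack/rebootstorage.sh.

# First TODO: wait until nothing except DaemonSet-managed pods is left on the drained node
# (kubectl drain deliberately skips DaemonSet pods, so those are expected to remain).
while kubectl get pods --all-namespaces --field-selector "spec.nodeName=${NODE}" \
        -o custom-columns=OWNER:.metadata.ownerReferences[0].kind --no-headers \
        | grep -qv DaemonSet; do
  sleep 10
done

# Second TODO: after `ssh "${NODE}" reboot`, wait until the kubelet re-registers and the
# node reports Ready (same intent as the commented-out "kubectl get nodes | grep" check).
# A short initial pause is still useful so the check does not pass before the node
# has actually gone NotReady.
sleep 30
until kubectl wait --for=condition=Ready "node/${NODE}" --timeout=15s >/dev/null 2>&1; do
  echo "waiting for ${NODE} to become Ready..."
  sleep 15
done

Polling with kubectl wait lets the script resume as soon as the node is back while still bounding each attempt, instead of always blocking for a fixed three minutes.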