postgres-operator: add grafana dashboard and prometheus rules
Signed-off-by: Victor Login <batazor@evrone.com>
Showing 2 changed files with 344 additions and 2 deletions.
ops/Helm/addons/store/postgres-operator/templates/prometheus_rule.yaml (333 additions, 0 deletions)
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: postgres-operator-rules
spec:
  groups:
  - name: postgres-operator.rules
    rules:
    ########## EXPORTER RULES ##########
    - alert: PGExporterScrapeError
      expr: pg_exporter_last_scrape_error > 0
      for: 60s
      labels:
        service: postgresql
        severity: critical
        severity_num: 300
      annotations:
        summary: 'Postgres Exporter running on {{ $labels.job }} (instance: {{ $labels.instance }}) is encountering scrape errors processing queries. Error count: ( {{ $value }} )'
    ########## POSTGRESQL RULES ##########
    - alert: PGIsUp
      expr: pg_up < 1
      for: 60s
      labels:
        service: postgresql
        severity: critical
        severity_num: 300
      annotations:
        summary: 'postgres_exporter running on {{ $labels.job }} is unable to communicate with the configured database'
    ## Monitor for data block checksum failures. Only works in PG12+
    - alert: PGDataChecksum
      expr: ccp_data_checksum_failure_count > 0
      for: 60s
      labels:
        service: postgresql
        severity: critical
        severity_num: 300
      annotations:
        description: '{{ $labels.job }} has at least one data checksum failure in database {{ $labels.dbname }}. See pg_stat_database system catalog for more information.'
        summary: 'PGSQL Data Checksum failure'
    - alert: PGIdleTxn
      expr: ccp_connection_stats_max_idle_in_txn_time > 300
      for: 60s
      labels:
        service: postgresql
        severity: warning
        severity_num: 200
      annotations:
        description: '{{ $labels.job }} has at least one session idle in transaction for over 5 minutes.'
        summary: 'PGSQL Instance idle transactions'
    - alert: PGIdleTxn
      expr: ccp_connection_stats_max_idle_in_txn_time > 900
      for: 60s
      labels:
        service: postgresql
        severity: critical
        severity_num: 300
      annotations:
        description: '{{ $labels.job }} has at least one session idle in transaction for over 15 minutes.'
        summary: 'PGSQL Instance idle transactions'
    - alert: PGQueryTime
      expr: ccp_connection_stats_max_query_time > 43200
      for: 60s
      labels:
        service: postgresql
        severity: warning
        severity_num: 200
      annotations:
        description: '{{ $labels.job }} has at least one query running for over 12 hours.'
        summary: 'PGSQL Max Query Runtime'
    - alert: PGQueryTime
      expr: ccp_connection_stats_max_query_time > 86400
      for: 60s
      labels:
        service: postgresql
        severity: critical
        severity_num: 300
      annotations:
        description: '{{ $labels.job }} has at least one query running for over 1 day.'
        summary: 'PGSQL Max Query Runtime'
    - alert: PGConnPerc
      expr: 100 * (ccp_connection_stats_total / ccp_connection_stats_max_connections) > 75
      for: 60s
      labels:
        service: postgresql
        severity: warning
        severity_num: 200
      annotations:
        description: '{{ $labels.job }} is using 75% or more of available connections ({{ $value }}%)'
        summary: 'PGSQL Instance connections'
    - alert: PGConnPerc
      expr: 100 * (ccp_connection_stats_total / ccp_connection_stats_max_connections) > 90
      for: 60s
      labels:
        service: postgresql
        severity: critical
        severity_num: 300
      annotations:
        description: '{{ $labels.job }} is using 90% or more of available connections ({{ $value }}%)'
        summary: 'PGSQL Instance connections'
    - alert: PGDBSize
      expr: ccp_database_size_bytes > 1.073741824e+11
      for: 60s
      labels:
        service: postgresql
        severity: warning
        severity_num: 200
      annotations:
        description: 'PGSQL Instance {{ $labels.job }} over 100GB in size: {{ $value }} bytes'
        summary: 'PGSQL Instance size warning'
    - alert: PGDBSize
      expr: ccp_database_size_bytes > 2.68435456e+11
      for: 60s
      labels:
        service: postgresql
        severity: critical
        severity_num: 300
      annotations:
        description: 'PGSQL Instance {{ $labels.job }} over 250GB in size: {{ $value }} bytes'
        summary: 'PGSQL Instance size critical'
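    ## For reference, the byte thresholds above are binary units:
    ## 1.073741824e+11 bytes = 100 * 2^30 (the 100GB above) and 2.68435456e+11 bytes = 250 * 2^30 (250GB).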
    - alert: PGReplicationByteLag
      expr: ccp_replication_lag_size_bytes > 5.24288e+07
      for: 60s
      labels:
        service: postgresql
        severity: warning
        severity_num: 200
      annotations:
        description: 'PGSQL Instance {{ $labels.job }} has at least one replica lagging over 50MB behind.'
        summary: 'PGSQL Instance replica lag warning'
    - alert: PGReplicationByteLag
      expr: ccp_replication_lag_size_bytes > 1.048576e+08
      for: 60s
      labels:
        service: postgresql
        severity: critical
        severity_num: 300
      annotations:
        description: 'PGSQL Instance {{ $labels.job }} has at least one replica lagging over 100MB behind.'
        summary: 'PGSQL Instance replica lag critical'
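    ## For reference, the lag thresholds above are binary units:
    ## 5.24288e+07 bytes = 50 * 2^20 (the 50MB above) and 1.048576e+08 bytes = 100 * 2^20 (100MB).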
    - alert: PGReplicationSlotsInactive
      expr: ccp_replication_slots_active == 0
      for: 60s
      labels:
        service: postgresql
        severity: critical
        severity_num: 300
      annotations:
        description: 'PGSQL Instance {{ $labels.job }} has one or more inactive replication slots'
        summary: 'PGSQL Instance inactive replication slot'
    - alert: PGXIDWraparound
      expr: ccp_transaction_wraparound_percent_towards_wraparound > 50
      for: 60s
      labels:
        service: postgresql
        severity: warning
        severity_num: 200
      annotations:
        description: 'PGSQL Instance {{ $labels.job }} is over 50% towards transaction id wraparound.'
        summary: 'PGSQL Instance {{ $labels.job }} transaction id wraparound imminent'
    - alert: PGXIDWraparound
      expr: ccp_transaction_wraparound_percent_towards_wraparound > 75
      for: 60s
      labels:
        service: postgresql
        severity: critical
        severity_num: 300
      annotations:
        description: 'PGSQL Instance {{ $labels.job }} is over 75% towards transaction id wraparound.'
        summary: 'PGSQL Instance transaction id wraparound imminent'
    - alert: PGEmergencyVacuum
      expr: ccp_transaction_wraparound_percent_towards_emergency_autovac > 110
      for: 60s
      labels:
        service: postgresql
        severity: warning
        severity_num: 200
      annotations:
        description: 'PGSQL Instance {{ $labels.job }} is over 110% beyond autovacuum_freeze_max_age value. Autovacuum may need tuning to better keep up.'
        summary: 'PGSQL Instance emergency vacuum imminent'
    - alert: PGEmergencyVacuum
      expr: ccp_transaction_wraparound_percent_towards_emergency_autovac > 125
      for: 60s
      labels:
        service: postgresql
        severity: critical
        severity_num: 300
      annotations:
        description: 'PGSQL Instance {{ $labels.job }} is over 125% beyond autovacuum_freeze_max_age value. Autovacuum needs tuning to better keep up.'
        summary: 'PGSQL Instance emergency vacuum imminent'
    - alert: PGArchiveCommandStatus
      expr: ccp_archive_command_status_seconds_since_last_fail > 300
      for: 60s
      labels:
        service: postgresql
        severity: critical
        severity_num: 300
      annotations:
        description: 'PGSQL Instance {{ $labels.job }} has a recent failing archive command'
        summary: 'Seconds since the last recorded failure of the archive_command'
    - alert: PGSequenceExhaustion
      expr: ccp_sequence_exhaustion_count > 0
      for: 60s
      labels:
        service: postgresql
        severity: critical
        severity_num: 300
      annotations:
        description: 'Count of sequences on instance {{ $labels.job }} at over 75% usage: {{ $value }}. Run following query to see full sequence status: SELECT * FROM monitor.sequence_status() WHERE percent >= 75'
    - alert: PGSettingsPendingRestart
      expr: ccp_settings_pending_restart_count > 0
      for: 60s
      labels:
        service: postgresql
        severity: critical
        severity_num: 300
      annotations:
        description: 'One or more settings in the pg_settings system catalog on system {{ $labels.job }} are in a pending_restart state. Check the system catalog for which settings are pending and review postgresql.conf for changes.'
    ########## PGBACKREST RULES ##########
    ##
    ## Uncomment and customize one or more of these rules to monitor your pgBackRest backups.
    ## A full backup counts as the equivalent of both a differential and an incremental, since both are based
    ## on the last full, and a differential counts as an incremental, since incrementals are based on the last
    ## diff if one exists. This avoids false alerts, for example when you don't run diff/incr backups on the
    ## days that you run a full.
    ## The stanza label should also be set if different intervals are expected for each stanza.
    ## Otherwise the rule will be applied to all stanzas returned on the target system.
    ##
    ## Relevant metric names are:
    ##   ccp_backrest_last_full_backup_time_since_completion_seconds
    ##   ccp_backrest_last_incr_backup_time_since_completion_seconds
    ##   ccp_backrest_last_diff_backup_time_since_completion_seconds
    ##
    ## To avoid false positives on backup time alerts, 12 hours are added onto each threshold to allow a buffer
    ## if the backup runtime varies from day to day. Further adjustment may be needed depending on your backup
    ## runtimes/schedule.
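    ## For example, the commented thresholds below work out as:
    ##   648000s = 7 days (604800s) + a 12 hour buffer (43200s) for the weekly full backup
    ##   129600s = 24 hours (86400s) + a 12 hour buffer (43200s) for the daily incremental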
    #
    # - alert: PGBackRestLastCompletedFull_main
    #   expr: ccp_backrest_last_full_backup_time_since_completion_seconds{stanza="main"} > 648000
    #   for: 60s
    #   labels:
    #     service: postgresql
    #     severity: critical
    #     severity_num: 300
    #   annotations:
    #     summary: 'Full backup for stanza [main] on system {{ $labels.job }} has not completed in the last week.'
    #
    # - alert: PGBackRestLastCompletedIncr_main
    #   expr: ccp_backrest_last_incr_backup_time_since_completion_seconds{stanza="main"} > 129600
    #   for: 60s
    #   labels:
    #     service: postgresql
    #     severity: critical
    #     severity_num: 300
    #   annotations:
    #     summary: 'Incremental backup for stanza [main] on system {{ $labels.job }} has not completed in the last 24 hours.'
    #
    ## Runtime monitoring is handled with a single metric:
    ##
    ##   ccp_backrest_last_info_backup_runtime_seconds
    ##
    ## Runtime monitoring should have the "backup_type" label set.
    ## Otherwise the rule will apply to the last run of all backup types returned (full, diff, incr).
    ## The stanza label should also be set if runtimes per stanza have different expected times.
    #
    # - alert: PGBackRestLastRuntimeFull_main
    #   expr: ccp_backrest_last_info_backup_runtime_seconds{backup_type="full", stanza="main"} > 14400
    #   for: 60s
    #   labels:
    #     service: postgresql
    #     severity: critical
    #     severity_num: 300
    #   annotations:
    #     summary: 'Expected runtime of full backup for stanza [main] has exceeded 4 hours'
    #
    # - alert: PGBackRestLastRuntimeDiff_main
    #   expr: ccp_backrest_last_info_backup_runtime_seconds{backup_type="diff", stanza="main"} > 3600
    #   for: 60s
    #   labels:
    #     service: postgresql
    #     severity: critical
    #     severity_num: 300
    #   annotations:
    #     summary: 'Expected runtime of diff backup for stanza [main] has exceeded 1 hour'
    #
    ## As of pgBackRest version 2.36, errors encountered during a completed backup run (checksum failure,
    ## file truncation, invalid header, etc.) can be detected and are reported in the info output.
    #
    # - alert: PGBackRestErrorDuringBackup
    #   expr: ccp_backrest_last_info_backup_error > 0
    #   for: 60s
    #   labels:
    #     service: postgresql
    #     severity: critical
    #     severity_num: 300
    #   annotations:
    #     summary: 'Error encountered during pgBackRest backup operation. See logs for more information.'
    #
    ## If the pgbackrest command fails to run, the metric disappears from the exporter output and the alert never fires.
    ## An absence alert must be configured explicitly for each target (job) whose backups are being monitored.
    ## Checking for absence of just the full backup type should be sufficient (no need for diff/incr).
    ## Note that while a failing backrest check command will likely also cause a scrape error alert, adding this
    ## check gives a clearer answer as to what is causing it: something is wrong with the backups.
    #
    # - alert: PGBackrestAbsentFull_Prod
    #   expr: absent(ccp_backrest_last_full_backup_time_since_completion_seconds{job="Prod"})
    #   for: 10s
    #   labels:
    #     service: postgresql
    #     severity: critical
    #     severity_num: 300
    #   annotations:
    #     description: 'Backup Full status missing for Prod. Check that pgbackrest info command is working on target system.'
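One caveat worth flagging since this manifest lives under templates/: Helm itself parses {{ ... }} expressions at render time, so Prometheus template fragments such as {{ $labels.job }} and {{ $value }} in the annotations above will be consumed by the chart renderer unless they are escaped. A minimal sketch of the usual escaping idiom, shown against one annotation (whether this chart's pipeline needs it is an assumption, not something visible in this commit):

      annotations:
        # Helm renders the quoted string literally, so Prometheus receives {{ $labels.job }} intact.
        summary: 'postgres_exporter running on {{ "{{ $labels.job }}" }} is unable to communicate with the configured database'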
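Separately, whether Prometheus Operator actually loads this PrometheusRule depends on the ruleSelector of the Prometheus resource; a kube-prometheus-stack installation, for instance, by default only selects rules carrying its Helm release label. A hedged sketch of metadata that would satisfy such a selector (the label key and value here are assumptions, not part of this commit):

      metadata:
        name: postgres-operator-rules
        labels:
          # Assumed label; must match the ruleSelector on your Prometheus resource.
          release: prometheus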