Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for PG15. Disable JIT for monitoring user #297

Merged
merged 15 commits into from
Oct 24, 2022
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions changelogs/fragments/297.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
major_changes:
- pg15 - Update to support PostgreSQL 15 (https://github.com/CrunchyData/pgmonitor/issues/296)

minor_changes:
- jit - Disable JIT for the ccp_monitoring user to avoid memory leak issues (https://github.com/CrunchyData/pgmonitor/issues/295)
keithf4 marked this conversation as resolved.
Show resolved Hide resolved
1 change: 1 addition & 0 deletions postgres_exporter/common/pg11/setup.sql
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ GRANT pg_monitor to ccp_monitoring;
-- Membership in pg_execute_server_program lets ccp_monitoring run programs on
-- the database server (e.g. through COPY ... PROGRAM).
GRANT pg_execute_server_program TO ccp_monitoring;

-- Per-role session defaults for ccp_monitoring.
ALTER ROLE ccp_monitoring SET lock_timeout = '2min';
-- JIT is disabled for the monitoring user to avoid memory leak issues
-- (https://github.com/CrunchyData/pgmonitor/issues/295).
ALTER ROLE ccp_monitoring SET jit = 'off';

-- Schema owned by the monitoring role; creation is skipped if it already exists.
CREATE SCHEMA IF NOT EXISTS monitor AUTHORIZATION ccp_monitoring;

Expand Down
1 change: 1 addition & 0 deletions postgres_exporter/common/pg12/setup.sql
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ GRANT pg_monitor to ccp_monitoring;
-- Membership in pg_execute_server_program lets ccp_monitoring run programs on
-- the database server (e.g. through COPY ... PROGRAM).
GRANT pg_execute_server_program TO ccp_monitoring;

-- Per-role session defaults for ccp_monitoring.
ALTER ROLE ccp_monitoring SET lock_timeout = '2min';
-- JIT is disabled for the monitoring user to avoid memory leak issues
-- (https://github.com/CrunchyData/pgmonitor/issues/295).
ALTER ROLE ccp_monitoring SET jit = 'off';

-- Schema owned by the monitoring role; creation is skipped if it already exists.
CREATE SCHEMA IF NOT EXISTS monitor AUTHORIZATION ccp_monitoring;

Expand Down
1 change: 1 addition & 0 deletions postgres_exporter/common/pg13/setup.sql
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ GRANT pg_monitor to ccp_monitoring;
-- Membership in pg_execute_server_program lets ccp_monitoring run programs on
-- the database server (e.g. through COPY ... PROGRAM).
GRANT pg_execute_server_program TO ccp_monitoring;

-- Per-role session defaults for ccp_monitoring.
ALTER ROLE ccp_monitoring SET lock_timeout = '2min';
-- JIT is disabled for the monitoring user to avoid memory leak issues
-- (https://github.com/CrunchyData/pgmonitor/issues/295).
ALTER ROLE ccp_monitoring SET jit = 'off';

-- Schema owned by the monitoring role; creation is skipped if it already exists.
CREATE SCHEMA IF NOT EXISTS monitor AUTHORIZATION ccp_monitoring;

Expand Down
1 change: 1 addition & 0 deletions postgres_exporter/common/pg14/setup.sql
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ GRANT pg_monitor to ccp_monitoring;
-- Membership in pg_execute_server_program lets ccp_monitoring run programs on
-- the database server (e.g. through COPY ... PROGRAM).
GRANT pg_execute_server_program TO ccp_monitoring;

-- Per-role session defaults for ccp_monitoring.
ALTER ROLE ccp_monitoring SET lock_timeout = '2min';
-- JIT is disabled for the monitoring user to avoid memory leak issues
-- (https://github.com/CrunchyData/pgmonitor/issues/295).
ALTER ROLE ccp_monitoring SET jit = 'off';

-- Schema owned by the monitoring role; creation is skipped if it already exists.
CREATE SCHEMA IF NOT EXISTS monitor AUTHORIZATION ccp_monitoring;

Expand Down
145 changes: 145 additions & 0 deletions postgres_exporter/common/pg15/queries_general.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
###
#
# Begin File: PG15 queries_general.yml
#
# Copyright © 2017-2022 Crunchy Data Solutions, Inc. All Rights Reserved.
#
###

# ccp_connection_stats: single-row summary of session counts and longest durations,
# read from pg_catalog.pg_stat_activity and joined to max_connections from pg_settings.
#   - total counts every row in pg_stat_activity (no backend_type filter);
#     active is derived as total - idle - idle_in_txn.
#   - max_query_time and max_blocked_query_time only consider 'client backend' rows.
#     max_query_time filters on state <> 'idle', so idle-in-transaction sessions are included;
#     max_blocked_query_time filters on wait_event_type = 'Lock'.
#   - The three max_* durations are in seconds (extract(epoch ...)) and coalesce to 0
#     when no session matches.
ccp_connection_stats:
query: "select ((total - idle) - idle_in_txn) as active
, total
, idle
, idle_in_txn
, (select coalesce(extract(epoch from (max(clock_timestamp() - state_change))),0) from pg_catalog.pg_stat_activity where state = 'idle in transaction') as max_idle_in_txn_time
, (select coalesce(extract(epoch from (max(clock_timestamp() - query_start))),0) from pg_catalog.pg_stat_activity where backend_type = 'client backend' and state <> 'idle' ) as max_query_time
, (select coalesce(extract(epoch from (max(clock_timestamp() - query_start))),0) from pg_catalog.pg_stat_activity where backend_type = 'client backend' and wait_event_type = 'Lock' ) as max_blocked_query_time
, max_connections
from (
select count(*) as total
, coalesce(sum(case when state = 'idle' then 1 else 0 end),0) as idle
, coalesce(sum(case when state = 'idle in transaction' then 1 else 0 end),0) as idle_in_txn from pg_catalog.pg_stat_activity) x
join (select setting::float AS max_connections FROM pg_settings WHERE name = 'max_connections') xx ON (true);"
metrics:
- active:
usage: "GAUGE"
description: "Total non-idle connections"
- total:
usage: "GAUGE"
description: "Total idle and non-idle connections"
- idle:
usage: "GAUGE"
description: "Total idle connections"
- idle_in_txn:
usage: "GAUGE"
description: "Total idle in transaction connections"
- max_idle_in_txn_time:
usage: "GAUGE"
description: "Length of time in seconds of the longest idle in transaction session"
- max_query_time:
usage: "GAUGE"
description: "Length of time in seconds of the longest running query"
- max_blocked_query_time:
usage: "GAUGE"
description: "Length of time in seconds of the longest running query that has been blocked by a heavyweight lock"
- max_connections:
usage: "GAUGE"
description: "Value of max_connections for the monitored database"


# ccp_replication_lag: time-based replay lag, in whole seconds (cast to INTEGER).
#   - Both columns are computed as clock_timestamp() - pg_last_xact_replay_timestamp().
#   - replay_time is forced to 0 when pg_last_wal_receive_lsn() equals
#     pg_last_wal_replay_lsn(); received_time has no such guard and keeps growing.
ccp_replication_lag:
query: "SELECT
CASE
WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0
ELSE EXTRACT (EPOCH FROM clock_timestamp() - pg_last_xact_replay_timestamp())::INTEGER
END
AS replay_time
, EXTRACT (EPOCH FROM clock_timestamp() - pg_last_xact_replay_timestamp())::INTEGER
AS received_time"
metrics:
- replay_time:
usage: "GAUGE"
description: "Length of time since the last transaction was replayed on replica. Returns zero if last WAL received equals last WAL replayed. Avoids false positives when primary stops writing. Monitors for replicas that cannot keep up with primary WAL generation."
- received_time:
usage: "GAUGE"
description: "Length of time since the last WAL file was received and replayed on replica. Always increases, possibly causing false positives if the primary stops writing. Monitors for replicas that stop receiving WAL altogether."


# ccp_replication_lag_size: one row per entry in pg_catalog.pg_stat_replication.
#   - bytes is pg_wal_lsn_diff(sent_lsn, replay_lsn), i.e. WAL sent to the replica
#     but not yet replayed by it.
#   - client_addr, client_hostname and client_port are exposed as labels.
ccp_replication_lag_size:
query: "SELECT client_addr as replica
, client_hostname as replica_hostname
, client_port as replica_port
, pg_wal_lsn_diff(sent_lsn, replay_lsn) as bytes
FROM pg_catalog.pg_stat_replication"
metrics:
- replica:
usage: "LABEL"
description: "Replica address"
- replica_hostname:
usage: "LABEL"
description: "Replica hostname"
- replica_port:
usage: "LABEL"
description: "Replica port"
- bytes:
usage: "GAUGE"
description: "Replication lag in bytes"


# ccp_replication_slots: one row per slot in pg_catalog.pg_replication_slots.
#   - retained_bytes is the distance from the slot's restart_lsn to the current WAL
#     position: pg_last_wal_replay_lsn() when pg_is_in_recovery() is true,
#     pg_current_wal_insert_lsn() otherwise.
#   - active is the boolean column cast to int so it can be exported as a gauge.
ccp_replication_slots:
query: "SELECT slot_name, active::int, pg_wal_lsn_diff(CASE WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn() ELSE pg_current_wal_insert_lsn() END, restart_lsn) AS retained_bytes FROM pg_catalog.pg_replication_slots"
metrics:
- slot_name:
usage: "LABEL"
description: "Name of replication slot"
- active:
usage: "GAUGE"
description: "Active state of slot. 1 = true. 0 = false."
- retained_bytes:
usage: "GAUGE"
description: "The amount of WAL (in bytes) being retained for this slot"


# ccp_wal_activity: sizes summed over the files returned by pg_catalog.pg_ls_waldir().
#   - last_5_min_size_bytes only sums files whose modification time is newer than
#     CURRENT_TIMESTAMP - 5 minutes.
#   - total_size_bytes sums every file in the WAL directory.
#   - Both sums coalesce to 0 when no file matches.
ccp_wal_activity:
query: "SELECT last_5_min_size_bytes,
(SELECT COALESCE(sum(size),0) FROM pg_catalog.pg_ls_waldir()) AS total_size_bytes
FROM (SELECT COALESCE(sum(size),0) AS last_5_min_size_bytes FROM pg_catalog.pg_ls_waldir() WHERE modification > CURRENT_TIMESTAMP - '5 minutes'::interval) x;"
metrics:
- last_5_min_size_bytes:
usage: "GAUGE"
description: "Current size in bytes of the last 5 minutes of WAL generation. Includes recycled WALs."
- total_size_bytes:
usage: "GAUGE"
description: "Current size in bytes of the WAL directory"


# ccp_data_checksum_failure: one row per entry in pg_catalog.pg_stat_database.
#   - count is the checksum_failures column as-is.
#   - time_since_last_failure_seconds is clock_timestamp() - checksum_last_failure in
#     seconds, and coalesces to 0 when checksum_last_failure is NULL, so 0 does not
#     by itself mean "a failure just happened".
ccp_data_checksum_failure:
query: "SELECT datname AS dbname
, checksum_failures AS count
, coalesce(extract(epoch from (clock_timestamp() - checksum_last_failure)), 0) AS time_since_last_failure_seconds
FROM pg_catalog.pg_stat_database;"
metrics:
- dbname:
usage: "LABEL"
description: "Database name"
- count:
usage: "GAUGE"
description: "Total number of checksum failures on this database"
- time_since_last_failure_seconds:
usage: "GAUGE"
description: "Time interval in seconds since the last checksum failure was encountered"


# ccp_pg_hba_checksum: exports the value returned by monitor.pg_hba_checksum() as a
# single gauge named status. The function is defined outside this file.
ccp_pg_hba_checksum:
query: "SELECT monitor.pg_hba_checksum() AS status"
metrics:
- status:
usage: "GAUGE"
description: "Value of checksum monitoring status for pg_catalog.pg_hba_file_rules (pg_hba.conf). 0 = valid config. 1 = settings changed. To reset current config to valid after alert, run monitor.pg_hba_checksum_set_valid()."


###
#
# End File: PG15 queries_general.yml
#
###
135 changes: 135 additions & 0 deletions postgres_exporter/common/pg15/queries_pg_stat_statements.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
###
#
# Begin File: PG15 queries_pg_stat_statements.yml
#
# Copyright © 2017-2022 Crunchy Data Solutions, Inc. All Rights Reserved.
#
###

# ccp_pg_stat_statements_total: public.pg_stat_statements aggregated per role and
# database (GROUP BY 1,2), joined to pg_catalog.pg_database on dbid for the name.
#   - calls_count, exec_time_ms and row_count are sums over all statements in the group.
#   - mean_exec_time_ms is avg() of the per-statement mean_exec_time values, i.e. an
#     unweighted average of means, not total time divided by total calls.
#   - The view is referenced as public.pg_stat_statements, so the extension is
#     expected to be installed in the public schema.
ccp_pg_stat_statements_total:
query: "SELECT pg_get_userbyid(s.userid) as role,
d.datname AS dbname,
sum(s.calls) AS calls_count,
sum(s.total_exec_time) AS exec_time_ms,
avg(s.mean_exec_time) AS mean_exec_time_ms,
sum(s.rows) AS row_count
FROM public.pg_stat_statements s
JOIN pg_catalog.pg_database d
ON d.oid = s.dbid
GROUP BY 1,2"
metrics:
- role:
usage: "LABEL"
description: "Role that executed the statement"
- dbname:
usage: "LABEL"
description: "Database in which the statement was executed"
- calls_count:
usage: "GAUGE"
description: "Total number of queries run per user/database"
- exec_time_ms:
usage: "GAUGE"
description: "Total runtime of all queries per user/database"
- mean_exec_time_ms:
usage: "GAUGE"
description: "Mean runtime of all queries per user/database"
- row_count:
usage: "GAUGE"
description: "Total rows returned from all queries per user/database"


# ccp_pg_stat_statements_top_mean: statements from public.pg_stat_statements ranked by
# mean execution time, highest first (ORDER BY 5 DESC).
#   - Rows are grouped by role, database, queryid and the shortened query text;
#     exec_time_ms is max(mean_exec_time) within each group.
#   - The query label is the first 40 characters of the statement with newlines
#     removed and surrounding whitespace trimmed.
#   - #PG_STAT_STATEMENTS_LIMIT# is a placeholder token, not valid SQL as written;
#     presumably it is replaced with a row count before this file is loaded — TODO confirm.
ccp_pg_stat_statements_top_mean:
query: "SELECT pg_get_userbyid(s.userid) as role,
d.datname AS dbname,
s.queryid,
btrim(replace(left(s.query, 40), '\n', '')) AS query,
max(s.mean_exec_time) exec_time_ms
FROM public.pg_stat_statements s
JOIN pg_catalog.pg_database d
ON d.oid = s.dbid
GROUP BY 1,2,3,4
ORDER BY 5 DESC
LIMIT #PG_STAT_STATEMENTS_LIMIT#"
metrics:
- role:
usage: "LABEL"
description: "Role that executed the statement"
- dbname:
usage: "LABEL"
description: "Database in which the statement was executed"
- queryid:
usage: "LABEL"
description: "Internal hash code, computed from the statement's parse tree"
- query:
usage: "LABEL"
description: "First 40 characters of query text"
- exec_time_ms:
usage: "GAUGE"
description: "Average query runtime in milliseconds"


# ccp_pg_stat_statements_top_total: statements from public.pg_stat_statements ranked by
# total_exec_time, highest first (ORDER BY 5 DESC). No grouping is applied, so each
# row of the view is reported individually.
#   - The query label is the first 40 characters of the statement with newlines
#     removed and surrounding whitespace trimmed.
#   - #PG_STAT_STATEMENTS_LIMIT# is a placeholder token, not valid SQL as written;
#     presumably it is replaced with a row count before this file is loaded — TODO confirm.
# Note that individual query stats can only be reset in PG12 or later
ccp_pg_stat_statements_top_total:
query: "SELECT pg_get_userbyid(s.userid) as role,
d.datname AS dbname,
s.queryid,
btrim(replace(left(s.query, 40), '\n', '')) AS query,
s.total_exec_time exec_time_ms
FROM public.pg_stat_statements s
JOIN pg_catalog.pg_database d
ON d.oid = s.dbid
ORDER BY 5 DESC
LIMIT #PG_STAT_STATEMENTS_LIMIT#"
metrics:
- role:
usage: "LABEL"
description: "Role that executed the statement"
- dbname:
usage: "LABEL"
description: "Database in which the statement was executed"
- queryid:
usage: "LABEL"
description: "Internal hash code, computed from the statement's parse tree"
- query:
usage: "LABEL"
description: "First 40 characters of query text"
- exec_time_ms:
usage: "GAUGE"
description: "Total time spent in the statement in milliseconds"


# ccp_pg_stat_statements_top_max: statements from public.pg_stat_statements ranked by
# max_exec_time, highest first (ORDER BY 5 DESC). No grouping is applied, so each
# row of the view is reported individually.
#   - The query label is the first 40 characters of the statement with newlines
#     removed and surrounding whitespace trimmed.
#   - #PG_STAT_STATEMENTS_LIMIT# is a placeholder token, not valid SQL as written;
#     presumably it is replaced with a row count before this file is loaded — TODO confirm.
# Note that individual query stats can only be reset in PG12 or later
ccp_pg_stat_statements_top_max:
query: "SELECT pg_get_userbyid(s.userid) as role,
d.datname AS dbname,
s.queryid,
btrim(replace(left(s.query, 40), '\n', '')) AS query,
s.max_exec_time AS exec_time_ms
FROM public.pg_stat_statements s
JOIN pg_catalog.pg_database d
ON d.oid = s.dbid
ORDER BY 5 DESC
LIMIT #PG_STAT_STATEMENTS_LIMIT#"
metrics:
- role:
usage: "LABEL"
description: "Role that executed the statement"
- dbname:
usage: "LABEL"
description: "Database in which the statement was executed"
- queryid:
usage: "LABEL"
description: "Internal hash code, computed from the statement's parse tree"
- query:
usage: "LABEL"
description: "First 40 characters of query text"
- exec_time_ms:
usage: "GAUGE"
description: "Maximum time spent in the statement in milliseconds"


###
#
# End File: PG15 queries_pg_stat_statements.yml
#
###
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
###
#
# Begin File: pg_stat_statements_reset_info.yml
#
# Copyright © 2017-2022 Crunchy Data Solutions, Inc. All Rights Reserved.
#
###
# ccp_pg_stat_statements_reset: exports the value returned by
# monitor.pg_stat_statements_reset_info(...) as a gauge named time.
#   - The function is defined outside this file.
#     NOTE(review): its name suggests it may also perform the reset, not just report it — confirm.
#   - #PG_STAT_STATEMENTS_THROTTLE_MINUTES# is a placeholder token, not valid SQL as
#     written; presumably it is replaced with a number of minutes before this file
#     is loaded — TODO confirm.
ccp_pg_stat_statements_reset:
query: "select monitor.pg_stat_statements_reset_info(#PG_STAT_STATEMENTS_THROTTLE_MINUTES#) as time"
metrics:
- time:
usage: "GAUGE"
description: "Epoch time when stats were reset"

###
#
# End File: pg_stat_statements_reset_info.yml
#
###

Loading