diff --git a/ansible/roles/collectd-oracle-db-connected/templates/collectd_oracle_db_connected.sh.j2 b/ansible/roles/collectd-oracle-db-connected/templates/collectd_oracle_db_connected.sh.j2 index 99a228135..d051c945d 100644 --- a/ansible/roles/collectd-oracle-db-connected/templates/collectd_oracle_db_connected.sh.j2 +++ b/ansible/roles/collectd-oracle-db-connected/templates/collectd_oracle_db_connected.sh.j2 @@ -59,7 +59,7 @@ while sleep "$INTERVAL"; do if [[ "$SIDS" != "None" ]]; then for SID in $(get_sids); do db_connected $SID >/dev/null 2>&1 - echo "PUTVAL $HOSTNAME/exec-db_connected/bool-$SID interval=$INTERVAL N:$?" + echo "PUTVAL $HOSTNAME/oracle_db_connected/exitcode-$SID interval=$INTERVAL N:$?" done fi done diff --git a/ansible/roles/collectd-service-metrics/README.md b/ansible/roles/collectd-service-metrics/README.md index 39c3cfae7..c57d100b7 100644 --- a/ansible/roles/collectd-service-metrics/README.md +++ b/ansible/roles/collectd-service-metrics/README.md @@ -18,6 +18,12 @@ Intro to Collectd networking [here](https://collectd.org/wiki/index.php/Networki ## Finding metrics in Cloudwatch -Metrics collected by the Cloudwatch agent will appear in the 'metrics' panel under the CWAgent namespace as __value e.g. collectd_cpu_value, collectd_wlsadminserver_value, collectd_amazonssmagent_value etc. +Metrics collected by the Cloudwatch agent will appear in the 'metrics' panel under the CWAgent namespace + +``` +metric: collectd_service_status_value +type: exitcode +type_instance: Name of service, e.g. amazonssmagent +``` Cloudwatch metrics are easily filtered by instance_id so you can see all the metrics for a particular instance. 
diff --git a/ansible/roles/collectd-service-metrics/templates/collectd_service_metrics.sh.j2 b/ansible/roles/collectd-service-metrics/templates/collectd_service_metrics.sh.j2 index a71202a67..52bad613f 100644 --- a/ansible/roles/collectd-service-metrics/templates/collectd_service_metrics.sh.j2 +++ b/ansible/roles/collectd-service-metrics/templates/collectd_service_metrics.sh.j2 @@ -9,10 +9,10 @@ INTERVAL="${INTERVAL:-{{ collectd_script_interval }}}" while sleep "$INTERVAL"; do {% for item in collectd_monitored_services_role %} ({{ item.shell_cmd }}) >/dev/null 2>&1 - echo "PUTVAL $HOSTNAME/{{ item.metric_name }}/bool interval=$INTERVAL N:$?" + echo "PUTVAL $HOSTNAME/service_status/exitcode-{{ item.metric_name }} interval=$INTERVAL N:$?" {% endfor %} {% for item in collectd_monitored_services_servertype %} ({{ item.shell_cmd }}) >/dev/null 2>&1 - echo "PUTVAL $HOSTNAME/{{ item.metric_name }}/bool interval=$INTERVAL N:$?" + echo "PUTVAL $HOSTNAME/service_status/exitcode-{{ item.metric_name }} interval=$INTERVAL N:$?" {% endfor %} done diff --git a/ansible/roles/collectd-textfile-monitoring/README.md b/ansible/roles/collectd-textfile-monitoring/README.md new file mode 100644 index 000000000..c5306a6e6 --- /dev/null +++ b/ansible/roles/collectd-textfile-monitoring/README.md @@ -0,0 +1,30 @@ +# Role to import metrics from text files via collectd + +This is similar to the Prometheus solution, where values are imported from a text file +populated by another process. By default, the same directory is used: + +``` +/opt/textfile_monitoring/ +``` + +This role does not create the directory; it is assumed another role +will create this with the correct permissions. It needs to be readable by +`ec2-user`. + +Files should contain a field and a value, e.g. 
+ +``` +$ cat /opt/textfile_monitoring/nomis_batch_monitoring.prom + +nomis_batch_failure_status 0 +``` + +This will create 2 metrics + +``` +Metric type type_instance +collectd_textfile_monitoring_seconds duration nomis_batch_failure_status +collectd_textfile_monitoring_value gauge nomis_batch_failure_status +``` + +The `seconds` metric is the number of seconds since the file was last modified. diff --git a/ansible/roles/collectd-textfile-monitoring/defaults/main.yml b/ansible/roles/collectd-textfile-monitoring/defaults/main.yml new file mode 100644 index 000000000..7cc0dd198 --- /dev/null +++ b/ansible/roles/collectd-textfile-monitoring/defaults/main.yml @@ -0,0 +1,6 @@ +--- +collectd_script_path: /usr/local/bin +collectd_script_name: collectd_textfile_monitoring +collectd_script_user: ec2-user +collectd_script_interval: 60 +collectd_textfile_monitoring_paths: /opt/textfile_monitoring/* diff --git a/ansible/roles/collectd-textfile-monitoring/handlers/main.yml b/ansible/roles/collectd-textfile-monitoring/handlers/main.yml new file mode 100644 index 000000000..a9c35d30b --- /dev/null +++ b/ansible/roles/collectd-textfile-monitoring/handlers/main.yml @@ -0,0 +1,10 @@ +--- +- name: restart collectd + ansible.builtin.service: + name: collectd + state: restarted + +- name: restart plugin script + ansible.builtin.shell: | + pkill -u {{ collectd_script_user }} -f {{ collectd_script_path }}/{{ collectd_script_name }}.sh + failed_when: false diff --git a/ansible/roles/collectd-textfile-monitoring/meta/main.yml b/ansible/roles/collectd-textfile-monitoring/meta/main.yml new file mode 100644 index 000000000..fb5e24a73 --- /dev/null +++ b/ansible/roles/collectd-textfile-monitoring/meta/main.yml @@ -0,0 +1,4 @@ +--- +dependencies: + - role: get-ec2-facts + - role: amazon-cloudwatch-agent-collectd diff --git a/ansible/roles/collectd-textfile-monitoring/tasks/configure_collectd.yml b/ansible/roles/collectd-textfile-monitoring/tasks/configure_collectd.yml new file mode 100644 
index 000000000..27be4676d --- /dev/null +++ b/ansible/roles/collectd-textfile-monitoring/tasks/configure_collectd.yml @@ -0,0 +1,18 @@ +--- +- name: copy collectd config + ansible.builtin.template: + src: "{{ collectd_script_name }}.conf.j2" + dest: "/etc/collectd.d/{{ collectd_script_name }}.conf" + owner: root + mode: 0644 + notify: + - restart collectd + +- name: copy collectd plugin script + ansible.builtin.template: + src: "{{ collectd_script_name }}.sh.j2" + dest: "{{ collectd_script_path }}/{{ collectd_script_name }}.sh" + owner: root + mode: 0755 + notify: + - restart plugin script diff --git a/ansible/roles/collectd-textfile-monitoring/tasks/main.yml b/ansible/roles/collectd-textfile-monitoring/tasks/main.yml new file mode 100644 index 000000000..e701061c0 --- /dev/null +++ b/ansible/roles/collectd-textfile-monitoring/tasks/main.yml @@ -0,0 +1,6 @@ +--- +- import_tasks: configure_collectd.yml + tags: + - ec2provision + - ec2patch + when: ansible_distribution in ['RedHat', 'OracleLinux'] diff --git a/ansible/roles/collectd-textfile-monitoring/templates/collectd_textfile_monitoring.conf.j2 b/ansible/roles/collectd-textfile-monitoring/templates/collectd_textfile_monitoring.conf.j2 new file mode 100644 index 000000000..f259faeb8 --- /dev/null +++ b/ansible/roles/collectd-textfile-monitoring/templates/collectd_textfile_monitoring.conf.j2 @@ -0,0 +1,4 @@ +LoadPlugin exec + + Exec "{{ collectd_script_user }}" "{{ collectd_script_path }}/{{ collectd_script_name }}.sh" + diff --git a/ansible/roles/collectd-textfile-monitoring/templates/collectd_textfile_monitoring.sh.j2 b/ansible/roles/collectd-textfile-monitoring/templates/collectd_textfile_monitoring.sh.j2 new file mode 100644 index 000000000..5a98310e9 --- /dev/null +++ b/ansible/roles/collectd-textfile-monitoring/templates/collectd_textfile_monitoring.sh.j2 @@ -0,0 +1,27 @@ +#!/bin/bash +# Managed by collectd-textfile-monitoring ansible role +# If manually editing, just kill script and collectd will respawn +# 
e.g. pkill -u {{ collectd_script_user }} -f {{ collectd_script_path }}/{{ collectd_script_name }}.sh + +HOSTNAME="${HOSTNAME:-localhost}" +INTERVAL="${INTERVAL:-{{ collectd_script_interval }}}" + +while sleep "$INTERVAL"; do + now=$(date +%s) + for file in {{ collectd_textfile_monitoring_paths }}; do +{% raw %} + IFS=$'\n' + metrics=($(grep -E "^[[:alnum:]_]+[[:space:]]+[[:digit:]]+" $file)) + unset IFS + file_last_modified=$(date -r $file +%s) + secs_since_last_modified=$((now - file_last_modified)) + + num_metrics=${#metrics[@]} + for ((i=0; i