Nomis: DSOS-2359: further tweaking of cloudwatch metrics (#401)

* Allow collectd-service-metrics to specify the metric name * allow different metric names for collect_textfile_monitoring * update oracle-db-backup to use textfile monitoring * remove ImageId as dimension * fix * update monitoring * typo
ministryofjustice · Nov 17, 2023 · c185a5f · c185a5f
1 parent 080bbe9
commit c185a5f
Show file tree

Hide file tree

Showing 19 changed files with 109 additions and 49 deletions.
diff --git a/ansible/group_vars/server_type_base_ol85.yml b/ansible/group_vars/server_type_base_ol85.yml
@@ -12,7 +12,8 @@ server_type_roles_list:
   - autoscale-group-hooks-state
 
 collectd_monitored_services_servertype:
-  - metric_name: chronyd
+  - metric_name: service_status_os
+    metric_dimension: chronyd
     shell_cmd: "systemctl is-active chronyd"
 
 roles_list: "{{ (ami_roles_list | default([]) | difference(server_type_roles_list | default([]))) + (server_type_roles_list | default([])) }}"
diff --git a/ansible/group_vars/server_type_base_rhel610.yml b/ansible/group_vars/server_type_base_rhel610.yml
@@ -10,7 +10,8 @@ server_type_roles_list:
   - time
 
 collectd_monitored_services_servertype:
-  - metric_name: chronyd
+  - metric_name: service_status_os
+    metric_dimension: chronyd
     shell_cmd: "service chronyd status"
 
 roles_list: "{{ (ami_roles_list | default([]) | difference(server_type_roles_list | default([]))) + (server_type_roles_list | default([])) }}"
diff --git a/ansible/group_vars/server_type_base_rhel79.yml b/ansible/group_vars/server_type_base_rhel79.yml
@@ -10,7 +10,8 @@ server_type_roles_list:
   - autoscale-group-hooks-state
 
 collectd_monitored_services_servertype:
-  - metric_name: chronyd
+  - metric_name: service_status_os
+    metric_dimension: chronyd
     shell_cmd: "systemctl is-active chronyd"
 
 roles_list: "{{ (ami_roles_list | default([]) | difference(server_type_roles_list | default([]))) + (server_type_roles_list | default([])) }}"
diff --git a/ansible/group_vars/server_type_base_rhel85.yml b/ansible/group_vars/server_type_base_rhel85.yml
@@ -11,7 +11,8 @@ server_type_roles_list:
   - autoscale-group-hooks-state
 
 collectd_monitored_services_servertype:
-  - metric_name: chronyd
+  - metric_name: service_status_os
+    metric_dimension: chronyd
     shell_cmd: "systemctl is-active chronyd"
 
 roles_list: "{{ (ami_roles_list | default([]) | difference(server_type_roles_list | default([]))) + (server_type_roles_list | default([])) }}"
diff --git a/ansible/group_vars/server_type_hmpps_oem.yml b/ansible/group_vars/server_type_hmpps_oem.yml
@@ -59,11 +59,14 @@ server_type_roles_list:
   - collectd-service-metrics
 
 collectd_monitored_services_servertype:
-  - metric_name: chronyd
+  - metric_name: service_status_os
+    metric_dimension: chronyd
     shell_cmd: "systemctl is-active chronyd"
-  - metric_name: oracleohasd
+  - metric_name: service_status_app
+    metric_dimension: oracle-ohasd
     shell_cmd: "systemctl is-active oracle-ohasd"
-  - metric_name: oracleasm
+  - metric_name: service_status_app
+    metric_dimension: oracleasm
     shell_cmd: "systemctl is-active oracleasm"
 
 # the below vars are defined in multiple groups.  Keep the values the same to avoid unexpected behaviour

diff --git a/ansible/group_vars/server_type_nomis_db.yml b/ansible/group_vars/server_type_nomis_db.yml
@@ -30,11 +30,14 @@ database_home: /u01/app/oracle/product/11.2.0.4/db_1
 grid_home: /u01/app/oracle/product/11.2.0.4/gridhome_1
 
 collectd_monitored_services_servertype:
-  - metric_name: chronyd
+  - metric_name: service_status_os
+    metric_dimension: chronyd
     shell_cmd: "systemctl is-active chronyd"
-  - metric_name: oracleohasd
+  - metric_name: service_status_app
+    metric_dimension: oracle-ohasd
     shell_cmd: "systemctl is-active oracle-ohasd"
-  - metric_name: oracleasm
+  - metric_name: service_status_app
+    metric_dimension: oracleasm
     shell_cmd: "systemctl is-active oracleasm"
 
 use_ssm_params: true
diff --git a/ansible/group_vars/server_type_nomis_web.yml b/ansible/group_vars/server_type_nomis_web.yml
@@ -18,9 +18,11 @@ server_type_roles_list:
 roles_list: "{{ (ami_roles_list | default([]) | difference(server_type_roles_list | default([]))) + (server_type_roles_list | default([])) }}"
 
 collectd_monitored_services_servertype:
-  - metric_name: chronyd
+  - metric_name: service_status_os
+    metric_dimension: chronyd
     shell_cmd: "service chronyd status"
-  - metric_name: weblogichealthcheck
+  - metric_name: service_status_app
+    metric_dimension: weblogic-healthcheck
     shell_cmd: "service weblogic-healthcheck status"
 
 use_ssm_params: true
diff --git a/ansible/group_vars/server_type_nomis_xtag.yml b/ansible/group_vars/server_type_nomis_xtag.yml
@@ -17,22 +17,18 @@ server_type_roles_list:
 
 roles_list: "{{ (ami_roles_list | default([]) | difference(server_type_roles_list | default([]))) + (server_type_roles_list | default([])) }}"
 
-collectd_monitored_services:
-  - chronyd
-  - amazon-ssm-agent
-  - amazon-cloudwatch-agent
-  - wls_nodemanager
-  - wls_adminserver
-  - wls_managedserver
-
 collectd_monitored_services_servertype:
-  - metric_name: chronyd
+  - metric_name: service_status_os
+    metric_dimension: chronyd
     shell_cmd: "systemctl is-active chronyd"
-  - metric_name: wlsnodemanager
+  - metric_name: service_status_app
+    metric_dimension: wls_nodemanager
     shell_cmd: "systemctl is-active wls_nodemanager"
-  - metric_name: wlsadminserver
+  - metric_name: service_status_app
+    metric_dimension: wls_adminserver
     shell_cmd: "systemctl is-active wls_adminserver"
-  - metric_name: wlsmanagedserver
+  - metric_name: service_status_app
+    metric_dimension: wls_managedserver
     shell_cmd: "systemctl is-active wls_managedserver"
 
-use_ssm_params: true
+use_ssm_params: true
diff --git a/ansible/roles/amazon-cloudwatch-agent/templates/linux.json.j2 b/ansible/roles/amazon-cloudwatch-agent/templates/linux.json.j2
@@ -5,7 +5,6 @@
   },
   "metrics": {
     "append_dimensions": {
-      "ImageId": "${aws:ImageId}",
       "InstanceId": "${aws:InstanceId}",
       "AutoScalingGroupName": "${aws:AutoScalingGroupName}"
     },

diff --git a/ansible/roles/collectd-service-metrics/README.md b/ansible/roles/collectd-service-metrics/README.md
@@ -1,8 +1,32 @@
 # Role to configure monitoring for service state using collectd
 
-The collectd 'exec' plugin is being used here to run a monitored_services.sh script which checks the status of each service in the `collectd_monitored_services` list and returns a 0 (running) or 1 (not running) value for each. The script is in the templates directory and is copied to the host by the role. The script is then called by the exec plugin and the output is sent to the local collectd port.
+Monitor the status of services via collectd and cloudwatch.
 
-Anything we want to monitor should be set up as a service (see weblogic-healthcheck for exampe) and then added to the `collectd_monitored_services` list in group_vars. This will over-ride the default list in defaults/main.yml which is only amazon-cloudwatch-agent and amazon-ssm-agent.
+The role installs a collectd configuration file for using an exec plugin,
+and a script for polling the status of the services.
+
+Two variables are used to define which services are monitored:
+
+- `collectd_monitored_services_role` for services common to all servers
+- `collectd_monitored_services_servertype` for services specific to the given server type.
+
+The idea is that `collectd_monitored_services_servertype` is defined in the
+group vars for a server type.
+
+Example configuration is
+
+```
+collectd_monitored_services_role:
+  - metric_name: service_status_os
+    metric_dimension: amazon-ssm-agent
+    shell_cmd: "(status amazon-ssm-agent|grep running) || (systemctl is-active amazon-ssm-agent)"
+```
+
+The metric name, dimension and command to retrieve the status must all be defined.
+
+Typically we segregate OS level and application level monitoring into different metric
+names as different teams may be responsible for these, e.g. `service_status_os` and
+`service_status_app`
 
 ### Debugging
 
@@ -12,18 +36,18 @@ Definitely add `debug: true` to the cloudwatch agent config file to see what's g
 
 Unless you specify otherwise cloudwatch agent logs go to `/opt/aws/amazon-cloudwatch-agent/logs/amazon-cloudwatch-agent.log` which is also worth checking to make sure the messages it's trying to pick up from the collectd port are making sense.
 
-Collectd relies on plugins, the most important one related to Cloudwatch is the 'network' plugin which posts the metrics data to a UDP endpoint. Cloudwatch picks metrics up from there and sends them on to cloudwatch. 
+Collectd relies on plugins, the most important one related to Cloudwatch is the 'network' plugin which posts the metrics data to a UDP endpoint. Cloudwatch picks metrics up from there and sends them on to cloudwatch.
 
 Intro to Collectd networking [here](https://collectd.org/wiki/index.php/Networking_introduction)
 
 ## Finding metrics in Cloudwatch
 
-Metrics collected by the Cloudwatch agent will appear in the 'metrics' panel under the CWAgent namespace 
+Metrics collected by the Cloudwatch agent will appear in the 'metrics' panel under the CWAgent namespace
 
 ```
-metric:        collectd_service_status_value
-type:          exitcode
-type_instance: Name of service, e.g. amazonssmagent
+metric:        collectd_<metric_name>_value, e.g. collectd_service_status_os_value
+type:          exitcode (fixed, 0 = ok, non-zero = error)
+type_instance: the metric_dimension, e.g. amazon-ssm-agent
 ```
 
 Cloudwatch metrics are easily filtered by instance_id so you can see all the metrics for a particular instance.
diff --git a/ansible/roles/collectd-service-metrics/defaults/main.yml b/ansible/roles/collectd-service-metrics/defaults/main.yml
@@ -6,9 +6,11 @@ collectd_script_interval: 60
 
 # Shell cmds work for both RHEL6 and RHEL7+
 collectd_monitored_services_role:
-  - metric_name: amazonssmagent
+  - metric_name: service_status_os
+    metric_dimension: amazon-ssm-agent
     shell_cmd: "(status amazon-ssm-agent|grep running) || (systemctl is-active amazon-ssm-agent)"
-  - metric_name: amazoncloudwatchagent
+  - metric_name: service_status_os
+    metric_dimension: amazon-cloudwatch-agent
     shell_cmd: "(status amazon-cloudwatch-agent|grep running) || (systemctl is-active amazon-cloudwatch-agent)"
 
 # add additional services using this variable in servertype group vars

diff --git a/ansible/roles/collectd-service-metrics/templates/collectd_service_metrics.sh.j2 b/ansible/roles/collectd-service-metrics/templates/collectd_service_metrics.sh.j2
@@ -9,10 +9,10 @@ INTERVAL="${INTERVAL:-{{ collectd_script_interval }}}"
 while sleep "$INTERVAL"; do
 {% for item in collectd_monitored_services_role %}
   ({{ item.shell_cmd }}) >/dev/null 2>&1
-  echo "PUTVAL $HOSTNAME/service_status/exitcode-{{ item.metric_name }} interval=$INTERVAL N:$?"
+  echo "PUTVAL $HOSTNAME/{{ item.metric_name }}/exitcode-{{ item.metric_dimension }} interval=$INTERVAL N:$?"
 {% endfor %}
 {% for item in collectd_monitored_services_servertype %}
   ({{ item.shell_cmd }}) >/dev/null 2>&1
-  echo "PUTVAL $HOSTNAME/service_status/exitcode-{{ item.metric_name }} interval=$INTERVAL N:$?"
+  echo "PUTVAL $HOSTNAME/{{ item.metric_name }}/exitcode-{{ item.metric_dimension }} interval=$INTERVAL N:$?"
 {% endfor %}
 done
diff --git a/ansible/roles/collectd-textfile-monitoring/README.md b/ansible/roles/collectd-textfile-monitoring/README.md
@@ -11,7 +11,7 @@ This role does not create the directory, it is assumed another role
 will create this with the correct permissions. It needs to be readable by
 `ec2-user`.
 
-Files should contain a field and a value, e.g.
+Files should contain a field and a value and use a `.prom` or `.metric` extension, e.g.
 
 ```
 $ cat /opt/textfile_monitoring/nomis_batch_monitoring.prom
@@ -28,3 +28,17 @@ collectd_textfile_monitoring_value    gauge    nomis_batch_failure_status
 ```
 
 The `seconds` metric is the number of seconds since the file was last modified.
+
+You can use different metric names by using a subdirectory, for example
+
+```
+$ cat /opt/textfile_monitoring/rman_backup/CNOMP.metric
+CNOMP 1
+```
+This will create the following metrics:
+
+```
+Metric                                type     type_instance
+collectd_textfile_monitoring_rman_backup_seconds  duration CNOMP
+collectd_textfile_monitoring_rman_backup_value    gauge    CNOMP
+```
diff --git a/ansible/roles/collectd-textfile-monitoring/defaults/main.yml b/ansible/roles/collectd-textfile-monitoring/defaults/main.yml
@@ -3,4 +3,3 @@ collectd_script_path: /usr/local/bin
 collectd_script_name: collectd_textfile_monitoring
 collectd_script_user: ec2-user
 collectd_script_interval: 60
-collectd_textfile_monitoring_paths: /opt/textfile_monitoring/*
diff --git a/ansible/roles/collectd-textfile-monitoring/templates/collectd_textfile_monitoring.sh.j2 b/ansible/roles/collectd-textfile-monitoring/templates/collectd_textfile_monitoring.sh.j2
@@ -8,20 +8,21 @@ INTERVAL="${INTERVAL:-{{ collectd_script_interval }}}"
 
 while sleep "$INTERVAL"; do
   now=$(date +%s)
-  for file in {{ collectd_textfile_monitoring_paths }}; do
+  for file in /opt/textfile_monitoring/*.prom /opt/textfile_monitoring/*.metric /opt/textfile_monitoring/*/*.prom /opt/textfile_monitoring/*/*.metric; do
 {% raw %}
     if [[ -e "$file" ]]; then
       IFS=$'\n'
-      metrics=($(grep -E "^[[:alnum:]_]+[[:space:]]+[[:digit:]]+" "$file"))
+      metrics=($(grep -E "^[[:alnum:]_:-]+[[:space:]]+[[:digit:]]+" "$file"))  # '-' must be last in the bracket expression to be literal; '_-:' is an invalid (reversed) range
+      metric_name=$(dirname "$file" | sed 's|^/opt/||g' | sed 's|/|_|g')
       unset IFS
       file_last_modified=$(date -r "$file" +%s)
       secs_since_last_modified=$((now - file_last_modified))
 
       num_metrics=${#metrics[@]}
       for ((i=0; i<num_metrics; i++)); do
         metric=(${metrics[i]})
-        echo "PUTVAL $HOSTNAME/textfile_monitoring/gauge-${metric[0]} interval=$INTERVAL N:${metric[1]}"
-        echo "PUTVAL $HOSTNAME/textfile_monitoring/duration-${metric[0]} interval=$INTERVAL N:${secs_since_last_modified}"
+        echo "PUTVAL $HOSTNAME/${metric_name}/gauge-${metric[0]} interval=$INTERVAL N:${metric[1]}"
+        echo "PUTVAL $HOSTNAME/${metric_name}/duration-${metric[0]} interval=$INTERVAL N:${secs_since_last_modified}"
       done
     fi
 {% endraw %}

diff --git a/ansible/roles/nomis-misload/templates/trigger_mis_load.sh.j2 b/ansible/roles/nomis-misload/templates/trigger_mis_load.sh.j2
@@ -5,16 +5,24 @@ export PATH=$PATH:/usr/local/bin
 target=$(aws ssm get-parameter --name "{{ misload_ssm_parameter }}" --query Parameter.Value --with-decryption --output text | jq -r .target)
 username=$(aws ssm get-parameter --name "{{ misload_ssm_parameter }}" --query Parameter.Value --with-decryption --output text | jq -r .username)
 password=$(aws ssm get-parameter --name "{{ misload_ssm_parameter }}" --query Parameter.Value --with-decryption --output text | jq -r .password)
+
+if [[ -z $target || $target == "null" || $target == "None" ||
+      -z $username || $username == "null" || $username == "None" ||
+      -z $password || $password == "null" || $password == "None" ]]; then
+  echo "Could not retrieve config from {{ misload_ssm_parameter }}"
+  echo "misload_status 1" > /opt/textfile_monitoring/misload_status.prom
+  exit 1
+fi
+
+echo "misload_running 0" > /opt/textfile_monitoring/misload_triggered.prom
+
 misload_batch_file_path="{{ misload_batch_file_path }}"
 
 cd {{ oracle_admin_script_dir }}
-export ORACLE_SID=T1MIS
+export ORACLE_SID={{ misload_dbname }}
 export ORAENV_ASK=NO
 . oraenv
-utc_time=$(date -u +"%Y-%m-%d %H:%M:%S")
-echo "misload_triggered $utc_time" > /opt/textfile_monitoring/misload_triggered.prom
 
 {{ ansible_python_interpreter }} /usr/local/share/trigger_mis_load.py -u "$username" -p "$password" -t "$target" -b "$misload_batch_file_path"
-
-# echo "misload_running 0" > /opt/textfile_monitoring/misload_running.prom
-# echo "misload_start_time " `date '+%s'` > /opt/textfile_monitoring/misload_start_time.prom
+echo "misload_status $?" > /opt/textfile_monitoring/misload_status.prom
+rm -f /opt/textfile_monitoring/misload_triggered.prom
diff --git a/ansible/roles/oracle-db-backup/README.md b/ansible/roles/oracle-db-backup/README.md
@@ -1,6 +1,8 @@
 # Overview
 
 Role for configuring scheduled oracle DB backups, or taking adhoc backups
+Status of backups is stored in /opt/textfile_monitoring/rman_backup for monitoring;
+see the collectd-textfile-monitoring role.
 
 # Pre-requisite for scheduled backup  
 

diff --git a/ansible/roles/oracle-db-backup/tasks/rman-backup-setup.yml b/ansible/roles/oracle-db-backup/tasks/rman-backup-setup.yml
@@ -21,6 +21,7 @@
     state: directory
     recurse: yes
   loop:
+    - /opt/textfile_monitoring/rman_backup
     - /home/oracle/admin/rman_scripts/logs
     - /home/oracle/admin/rman_scripts/status
 

diff --git a/ansible/roles/oracle-db-backup/templates/rman_backup.sh.j2 b/ansible/roles/oracle-db-backup/templates/rman_backup.sh.j2
@@ -414,9 +414,11 @@ do
     if [[ $? -eq 0 ]]; then
       info "Rman reported errors for $TARGET_DB"
       echo "$TARGET_DB,$(date +%s),0,$(date),Errors" > /home/oracle/admin/rman_scripts/status/status.${TARGET_DB}
+      echo "${TARGET_DB} 1" > /opt/textfile_monitoring/rman_backup/${TARGET_DB}.prom  # 1 = errors; directory name must match the rman_backup dir created by rman-backup-setup.yml
     else
       info "Backup of $TARGET_DB completed successfully"
       echo "$TARGET_DB,$(date +%s),1,$(date),Success" > /home/oracle/admin/rman_scripts/status/status.${TARGET_DB}
+      echo "${TARGET_DB} 0" > /opt/textfile_monitoring/rman_backup/${TARGET_DB}.prom  # 0 = success
       find /home/oracle/admin/rman_scripts/logs -name "*cmd" -mtime +15 -exec rm {} \;
       find /home/oracle/admin/rman_scripts/logs -name "*log" -mtime +15 -exec rm {} \;
     fi