Skip to content

Commit

Permalink
Nomis: DSOS-2359: further tweaking of cloudwatch metrics (#401)
Browse files Browse the repository at this point in the history
* Allow collectd-service-metrics to specify the metric name

* allow different metric names for collect_textfile_monitoring

* update oracle-db-backup to use textfile monitoring

* remove ImageId as dimension

* fix

* update monitoring

* typo
  • Loading branch information
drobinson-moj authored Nov 17, 2023
1 parent 080bbe9 commit c185a5f
Show file tree
Hide file tree
Showing 19 changed files with 109 additions and 49 deletions.
3 changes: 2 additions & 1 deletion ansible/group_vars/server_type_base_ol85.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ server_type_roles_list:
- autoscale-group-hooks-state

collectd_monitored_services_servertype:
- metric_name: chronyd
- metric_name: service_status_os
metric_dimension: chronyd
shell_cmd: "systemctl is-active chronyd"

roles_list: "{{ (ami_roles_list | default([]) | difference(server_type_roles_list | default([]))) + (server_type_roles_list | default([])) }}"
3 changes: 2 additions & 1 deletion ansible/group_vars/server_type_base_rhel610.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ server_type_roles_list:
- time

collectd_monitored_services_servertype:
- metric_name: chronyd
- metric_name: service_status_os
metric_dimension: chronyd
shell_cmd: "service chronyd status"

roles_list: "{{ (ami_roles_list | default([]) | difference(server_type_roles_list | default([]))) + (server_type_roles_list | default([])) }}"
3 changes: 2 additions & 1 deletion ansible/group_vars/server_type_base_rhel79.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ server_type_roles_list:
- autoscale-group-hooks-state

collectd_monitored_services_servertype:
- metric_name: chronyd
- metric_name: service_status_os
metric_dimension: chronyd
shell_cmd: "systemctl is-active chronyd"

roles_list: "{{ (ami_roles_list | default([]) | difference(server_type_roles_list | default([]))) + (server_type_roles_list | default([])) }}"
3 changes: 2 additions & 1 deletion ansible/group_vars/server_type_base_rhel85.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ server_type_roles_list:
- autoscale-group-hooks-state

collectd_monitored_services_servertype:
- metric_name: chronyd
- metric_name: service_status_os
metric_dimension: chronyd
shell_cmd: "systemctl is-active chronyd"

roles_list: "{{ (ami_roles_list | default([]) | difference(server_type_roles_list | default([]))) + (server_type_roles_list | default([])) }}"
9 changes: 6 additions & 3 deletions ansible/group_vars/server_type_hmpps_oem.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,14 @@ server_type_roles_list:
- collectd-service-metrics

collectd_monitored_services_servertype:
- metric_name: chronyd
- metric_name: service_status_os
metric_dimension: chronyd
shell_cmd: "systemctl is-active chronyd"
- metric_name: oracleohasd
- metric_name: service_status_app
metric_dimension: oracle-ohasd
shell_cmd: "systemctl is-active oracle-ohasd"
- metric_name: oracleasm
- metric_name: service_status_app
metric_dimension: oracleasm
shell_cmd: "systemctl is-active oracleasm"

# the below vars are defined in multiple groups. Keep the values the same to avoid unexpected behaviour
Expand Down
9 changes: 6 additions & 3 deletions ansible/group_vars/server_type_nomis_db.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,14 @@ database_home: /u01/app/oracle/product/11.2.0.4/db_1
grid_home: /u01/app/oracle/product/11.2.0.4/gridhome_1

collectd_monitored_services_servertype:
- metric_name: chronyd
- metric_name: service_status_os
metric_dimension: chronyd
shell_cmd: "systemctl is-active chronyd"
- metric_name: oracleohasd
- metric_name: service_status_app
metric_dimension: oracle-ohasd
shell_cmd: "systemctl is-active oracle-ohasd"
- metric_name: oracleasm
- metric_name: service_status_app
metric_dimension: oracleasm
shell_cmd: "systemctl is-active oracleasm"

use_ssm_params: true
6 changes: 4 additions & 2 deletions ansible/group_vars/server_type_nomis_web.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,11 @@ server_type_roles_list:
roles_list: "{{ (ami_roles_list | default([]) | difference(server_type_roles_list | default([]))) + (server_type_roles_list | default([])) }}"

collectd_monitored_services_servertype:
- metric_name: chronyd
- metric_name: service_status_os
metric_dimension: chronyd
shell_cmd: "service chronyd status"
- metric_name: weblogichealthcheck
- metric_name: service_status_app
metric_dimension: weblogic-healthcheck
shell_cmd: "service weblogic-healthcheck status"

use_ssm_params: true
22 changes: 9 additions & 13 deletions ansible/group_vars/server_type_nomis_xtag.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,22 +17,18 @@ server_type_roles_list:

roles_list: "{{ (ami_roles_list | default([]) | difference(server_type_roles_list | default([]))) + (server_type_roles_list | default([])) }}"

collectd_monitored_services:
- chronyd
- amazon-ssm-agent
- amazon-cloudwatch-agent
- wls_nodemanager
- wls_adminserver
- wls_managedserver

collectd_monitored_services_servertype:
- metric_name: chronyd
- metric_name: service_status_os
metric_dimension: chronyd
shell_cmd: "systemctl is-active chronyd"
- metric_name: wlsnodemanager
- metric_name: service_status_app
metric_dimension: wls_nodemanager
shell_cmd: "systemctl is-active wls_nodemanager"
- metric_name: wlsadminserver
- metric_name: service_status_app
metric_dimension: wls_adminserver
shell_cmd: "systemctl is-active wls_adminserver"
- metric_name: wlsmanagedserver
- metric_name: service_status_app
metric_dimension: wls_managedserver
shell_cmd: "systemctl is-active wls_managedserver"

use_ssm_params: true
use_ssm_params: true
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
},
"metrics": {
"append_dimensions": {
"ImageId": "${aws:ImageId}",
"InstanceId": "${aws:InstanceId}",
"AutoScalingGroupName": "${aws:AutoScalingGroupName}"
},
Expand Down
38 changes: 31 additions & 7 deletions ansible/roles/collectd-service-metrics/README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,32 @@
# Role to configure monitoring for service state using collectd

The collectd 'exec' plugin is being used here to run a monitored_services.sh script which checks the status of each service in the `collectd_monitored_services` list and returns a 0 (running) or 1 (not running) value for each. The script is in the templates directory and is copied to the host by the role. The script is then called by the exec plugin and the output is sent to the local collectd port.
Monitor the status of services via collectd and cloudwatch.

Anything we want to monitor should be set up as a service (see weblogic-healthcheck for example) and then added to the `collectd_monitored_services` list in group_vars. This will override the default list in defaults/main.yml which is only amazon-cloudwatch-agent and amazon-ssm-agent.
The role installs a collectd configuration file for using an exec plugin,
and a script for polling the status of the services.

Two variables are used to define which services are monitored:

- `collectd_monitored_services_role` for services common to all servers
- `collectd_monitored_services_servertype` for services specific to the given server type.

The idea is that `collectd_monitored_services_servertype` is defined in the
group vars for a server type.

Example configuration is

```
collectd_monitored_services_role:
- metric_name: service_status_os
metric_dimension: amazon-ssm-agent
shell_cmd: "(status amazon-ssm-agent|grep running) || (systemctl is-active amazon-ssm-agent)"
```

The metric name, dimension and command to retrieve the status must all be defined.

Typically we segregate OS level and application level monitoring into different metric
names as different teams may be responsible for these, e.g. `service_status_os` and
`service_status_app`.

### Debugging

Expand All @@ -12,18 +36,18 @@ Definitely add `debug: true` to the cloudwatch agent config file to see what's g

Unless you specify otherwise cloudwatch agent logs go to `/opt/aws/amazon-cloudwatch-agent/logs/amazon-cloudwatch-agent.log` which is also worth checking to make sure the messages it's trying to pick up from the collectd port are making sense.

Collectd relies on plugins, the most important one related to Cloudwatch is the 'network' plugin which posts the metrics data to a UDP endpoint. Cloudwatch picks metrics up from there and sends them on to cloudwatch.
Collectd relies on plugins, the most important one related to Cloudwatch is the 'network' plugin which posts the metrics data to a UDP endpoint. Cloudwatch picks metrics up from there and sends them on to cloudwatch.

Intro to Collectd networking [here](https://collectd.org/wiki/index.php/Networking_introduction)

## Finding metrics in Cloudwatch

Metrics collected by the Cloudwatch agent will appear in the 'metrics' panel under the CWAgent namespace
Metrics collected by the Cloudwatch agent will appear in the 'metrics' panel under the CWAgent namespace

```
metric: collectd_service_status_value
type: exitcode
type_instance: Name of service, e.g. amazonssmagent
metric: collectd_<metric_name>_value, e.g. collectd_service_status_os_value
type: exitcode (fixed, 0 = ok, non-zero = error)
type_instance: Name of service, e.g. amazon-ssm-agent (the metric_dimension)
```

Cloudwatch metrics are easily filtered by instance_id so you can see all the metrics for a particular instance.
6 changes: 4 additions & 2 deletions ansible/roles/collectd-service-metrics/defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,11 @@ collectd_script_interval: 60

# Shell cmds work for both RHEL6 and RHEL7+
collectd_monitored_services_role:
- metric_name: amazonssmagent
- metric_name: service_status_os
metric_dimension: amazon-ssm-agent
shell_cmd: "(status amazon-ssm-agent|grep running) || (systemctl is-active amazon-ssm-agent)"
- metric_name: amazoncloudwatchagent
- metric_name: service_status_os
metric_dimension: amazon-cloudwatch-agent
shell_cmd: "(status amazon-cloudwatch-agent|grep running) || (systemctl is-active amazon-cloudwatch-agent)"

# add additional services using this variable in servertype group vars
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@ INTERVAL="${INTERVAL:-{{ collectd_script_interval }}}"
while sleep "$INTERVAL"; do
{% for item in collectd_monitored_services_role %}
({{ item.shell_cmd }}) >/dev/null 2>&1
echo "PUTVAL $HOSTNAME/service_status/exitcode-{{ item.metric_name }} interval=$INTERVAL N:$?"
echo "PUTVAL $HOSTNAME/{{ item.metric_name }}/exitcode-{{ item.metric_dimension }} interval=$INTERVAL N:$?"
{% endfor %}
{% for item in collectd_monitored_services_servertype %}
({{ item.shell_cmd }}) >/dev/null 2>&1
echo "PUTVAL $HOSTNAME/service_status/exitcode-{{ item.metric_name }} interval=$INTERVAL N:$?"
echo "PUTVAL $HOSTNAME/{{ item.metric_name }}/exitcode-{{ item.metric_dimension }} interval=$INTERVAL N:$?"
{% endfor %}
done
16 changes: 15 additions & 1 deletion ansible/roles/collectd-textfile-monitoring/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ This role does not create the directory, it is assumed another role
will create this with the correct permissions. It needs to be readable by
`ec2-user`.

Files should contain a field and a value, e.g.
Files should contain a field and a value and use a `.prom` or `.metric` extension, e.g.

```
$ cat /opt/textfile_monitoring/nomis_batch_monitoring.prom
Expand All @@ -28,3 +28,17 @@ collectd_textfile_monitoring_value gauge nomis_batch_failure_status
```

The `seconds` metric is the number of seconds since the file was last modified.

You can use different metric names by using a subdirectory, for example

```
$ cat /opt/textfile_monitoring/rman_backup/CNOMP.metric
CNOMP 1
```
This will create the following metrics:

```
Metric type type_instance
collectd_textfile_monitoring_rman_backup_seconds duration CNOMP
collectd_textfile_monitoring_rman_backup_value gauge CNOMP
```
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,3 @@ collectd_script_path: /usr/local/bin
collectd_script_name: collectd_textfile_monitoring
collectd_script_user: ec2-user
collectd_script_interval: 60
collectd_textfile_monitoring_paths: /opt/textfile_monitoring/*
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,21 @@ INTERVAL="${INTERVAL:-{{ collectd_script_interval }}}"

while sleep "$INTERVAL"; do
now=$(date +%s)
for file in {{ collectd_textfile_monitoring_paths }}; do
for file in /opt/textfile_monitoring/*.prom /opt/textfile_monitoring/*.metric /opt/textfile_monitoring/*/*.prom /opt/textfile_monitoring/*/*.metric; do
{% raw %}
# Emit collectd PUTVAL lines for every "<name> <integer>" line found in $file.
# Uses $file, $now, $HOSTNAME and $INTERVAL, which are set outside this block.
if [[ -e "$file" ]]; then
  # Split grep output on newlines only, so each matching line is one array element.
  IFS=$'\n'
  metrics=($(grep -E "^[[:alnum:]_]+[[:space:]]+[[:digit:]]+" "$file"))
  # The hyphen must be the last character of the bracket expression. Written as
  # "_-:" it is read as a range whose end sorts before its start, which grep
  # rejects with "Invalid range end", so no metric line would ever match.
  metrics=($(grep -E "^[[:alnum:]_:-]+[[:space:]]+[[:digit:]]+" "$file"))
  # Metric name is the file's directory with the leading /opt/ removed and the
  # remaining slashes replaced by underscores, e.g.
  # /opt/textfile_monitoring/rman_backup -> textfile_monitoring_rman_backup.
  # "$file" is quoted so the path is never glob-expanded or split.
  metric_name=$(dirname "$file" | sed 's|^/opt/||g' | sed 's|/|_|g')
  unset IFS
  # Age of the file in seconds, reported alongside each value as a "duration".
  file_last_modified=$(date -r "$file" +%s)
  secs_since_last_modified=$((now - file_last_modified))

  num_metrics=${#metrics[@]}
  for ((i=0; i<num_metrics; i++)); do
    # Default IFS is restored here, so the line splits into (name, value).
    metric=(${metrics[i]})
    echo "PUTVAL $HOSTNAME/textfile_monitoring/gauge-${metric[0]} interval=$INTERVAL N:${metric[1]}"
    echo "PUTVAL $HOSTNAME/textfile_monitoring/duration-${metric[0]} interval=$INTERVAL N:${secs_since_last_modified}"
    echo "PUTVAL $HOSTNAME/${metric_name}/gauge-${metric[0]} interval=$INTERVAL N:${metric[1]}"
    echo "PUTVAL $HOSTNAME/${metric_name}/duration-${metric[0]} interval=$INTERVAL N:${secs_since_last_modified}"
  done
fi
{% endraw %}
Expand Down
20 changes: 14 additions & 6 deletions ansible/roles/nomis-misload/templates/trigger_mis_load.sh.j2
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,24 @@ export PATH=$PATH:/usr/local/bin
target=$(aws ssm get-parameter --name "{{ misload_ssm_parameter }}" --query Parameter.Value --with-decryption --output text | jq -r .target)
username=$(aws ssm get-parameter --name "{{ misload_ssm_parameter }}" --query Parameter.Value --with-decryption --output text | jq -r .username)
password=$(aws ssm get-parameter --name "{{ misload_ssm_parameter }}" --query Parameter.Value --with-decryption --output text | jq -r .password)

if [[ -z $target || $target == "null" || $target == "None" ||
-z $username || $username == "null" || $username == "None" ||
-z $password || $password == "null" || $password == "None" ]]; then
echo "Could not retrieve config from {{ misload_ssm_parameter }}"
echo "misload_status 1" > /opt/textfile_monitoring/misload_status.prom
exit 1
fi

echo "misload_running 0" > /opt/textfile_monitoring/misload_triggered.prom

misload_batch_file_path="{{ misload_batch_file_path }}"

cd {{ oracle_admin_script_dir }}
export ORACLE_SID=T1MIS
export ORACLE_SID={{ misload_dbname }}
export ORAENV_ASK=NO
. oraenv
utc_time=$(date -u +"%Y-%m-%d %H:%M:%S")
echo "misload_triggered $utc_time" > /opt/textfile_monitoring/misload_triggered.prom

{{ ansible_python_interpreter }} /usr/local/share/trigger_mis_load.py -u "$username" -p "$password" -t "$target" -b "$misload_batch_file_path"

# echo "misload_running 0" > /opt/textfile_monitoring/misload_running.prom
# echo "misload_start_time " `date '+%s'` > /opt/textfile_monitoring/misload_start_time.prom
echo "misload_status $?" > /opt/textfile_monitoring/misload_status.prom
rm -f /opt/textfile_monitoring/misload_triggered.prom
2 changes: 2 additions & 0 deletions ansible/roles/oracle-db-backup/README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Overview

Role for configuring scheduled Oracle DB backups, or taking ad hoc backups.
The status of each backup is stored under /opt/textfile_monitoring so that it
can be monitored; see the collectd-textfile-monitoring role.

# Pre-requisite for scheduled backup

Expand Down
1 change: 1 addition & 0 deletions ansible/roles/oracle-db-backup/tasks/rman-backup-setup.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
state: directory
recurse: yes
loop:
- /opt/textfile_monitoring/rman_backup
- /home/oracle/admin/rman_scripts/logs
- /home/oracle/admin/rman_scripts/status

Expand Down
2 changes: 2 additions & 0 deletions ansible/roles/oracle-db-backup/templates/rman_backup.sh.j2
Original file line number Diff line number Diff line change
Expand Up @@ -414,9 +414,11 @@ do
# Record the outcome of the backup for $TARGET_DB.
# $? is the exit status of the preceding command, which is outside this view;
# presumably a search for RMAN errors in the log, so 0 means errors were found
# -- TODO confirm.
if [[ $? -eq 0 ]]; then
  info "Rman reported errors for $TARGET_DB"
  echo "$TARGET_DB,$(date +%s),0,$(date),Errors" > "/home/oracle/admin/rman_scripts/status/status.${TARGET_DB}"
  # Write to the rman_backup directory (underscore): that is the directory the
  # role creates, whereas rman-backup does not exist and the redirect would fail.
  echo "${TARGET_DB} 1" > "/opt/textfile_monitoring/rman_backup/${TARGET_DB}.prom"
else
  info "Backup of $TARGET_DB completed successfully"
  echo "$TARGET_DB,$(date +%s),1,$(date),Success" > "/home/oracle/admin/rman_scripts/status/status.${TARGET_DB}"
  echo "${TARGET_DB} 0" > "/opt/textfile_monitoring/rman_backup/${TARGET_DB}.prom"
  # Remove RMAN command and log files last modified more than 15 days ago.
  find /home/oracle/admin/rman_scripts/logs -name "*cmd" -mtime +15 -exec rm {} \;
  find /home/oracle/admin/rman_scripts/logs -name "*log" -mtime +15 -exec rm {} \;
fi
Expand Down

0 comments on commit c185a5f

Please sign in to comment.