Skip to content

Commit

Permalink
Feature/cluster dashboard (#1721)
Browse files Browse the repository at this point in the history
* adding automation to capture new information in influx

* adding new monitoring dashboards

* removing vmss as a key and updating dashboards

* allowing setting the default thresholds for the grafana dashboards

* updating dashboards to move networking panels into the compute servers dashboard

* final changes to cluster stats dashboard
  • Loading branch information
egmsft authored Nov 10, 2023
1 parent ba9beb7 commit 92127ce
Show file tree
Hide file tree
Showing 8 changed files with 5,188 additions and 6 deletions.
5 changes: 4 additions & 1 deletion config.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,10 @@
"install_agent": {"type": "boolean", "default": true, "deprecated": true },
"azure_monitor_agent": {"type": "boolean", "default": false },
"telegraf": {"type": "boolean", "default": true },
"grafana": {"type": "boolean", "default": true }
"grafana": {"type": "boolean", "default": true },
"idle_threshold": {"type": "string", "pattern": "^(10|20|30|40|50|60|70|80|90|100)$", "default": "70"},
"iowait_threshold": {"type": "string", "pattern": "^(10|20|30|40|50|60|70|80|90|100)$", "default": "40"},
"mem_threshold": {"type": "string", "pattern": "^(10|20|30|40|50|60|70|80|90|100)$", "default": "30"}
},
"title": "monitoring"
},
Expand Down
3 changes: 3 additions & 0 deletions config.tpl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ monitoring:
# Optional settings to deploy Grafana and install Telegraf
telegraf: true # Install telegraf on static infra VMs and dynamic compute nodes. Default: true
grafana: true # Deploy a Grafana instance with pre-defined dashboards. Default: true
idle_threshold: '70' # Default threshold used to highlight idle VMs in the Grafana cluster stats dashboard. Quoted string, one of '10', '20', ... '100' (steps of 10). Default: '70'
mem_threshold: '30' # Default threshold used to highlight VMs running out of memory in the Grafana cluster stats dashboard. Quoted string, one of '10', '20', ... '100' (steps of 10). Default: '30'
iowait_threshold: '40' # Default threshold used to highlight VMs waiting on IO in the Grafana cluster stats dashboard. Quoted string, one of '10', '20', ... '100' (steps of 10). Default: '40'

# If set to true, alert rules associated with az-hop are created. Enabling alerting requires specifying an admin email address to send alerts to.
alerting:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Global tags for the Telegraf agent.
# Every __NAME__ value below is a placeholder: the node configuration script
# substitutes the real value with sed before the file is copied into place.
[global_tags]
# Azure VM size of the node (lower-cased by the configuration script)
sku = "__SKU__"
# Name of the physical host, read from the Hyper-V KVP pool
physical_hostname = "__PHYS_HOST__"
# VM scale set name reported by the instance metadata service
vmss = "__VMSS__"

# Value of the cyclecloud.node.template setting reported by jetpack
node_array = "__ARRAY__"
[agent]
interval = "10s"
round_interval = true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@ echo "Configuring global tags"
# Collect the values that are injected into telegraf.conf as global tags.

# VM size from the Azure Instance Metadata Service (IMDS), lower-cased.
# --noproxy "*" makes curl ignore any proxy environment variables so the
# link-local IMDS endpoint is contacted directly.
AZHPC_VMSIZE=$(curl -s --noproxy "*" -H Metadata:true "http://169.254.169.254/metadata/instance/compute?api-version=2019-08-15" | jq -r '.vmSize' | tr '[:upper:]' '[:lower:]')
# Physical host name: the line that follows "PhysicalHostName" in the KVP pool file.
PHYSICAL_HOST=$(strings /var/lib/hyperv/.kvp_pool_3 | grep -A1 PhysicalHostName | head -n 2 | tail -1)
# Scale set name from IMDS. Uses --noproxy "*" like the VM size query above;
# without it the request is sent through the proxy whenever one is configured.
VMSS=$(curl -s --noproxy "*" -H Metadata:true "http://169.254.169.254/metadata/instance?api-version=2018-10-01" | jq -r '.compute.vmScaleSetName')
# Value of the cyclecloud.node.template setting, read through jetpack.
ARRAY=$(jetpack config cyclecloud.node.template)

# Replace each placeholder in the Telegraf configuration with its collected value.
sed -i "s/__SKU__/${AZHPC_VMSIZE}/g" ../files/telegraf.conf
sed -i "s/__PHYS_HOST__/${PHYSICAL_HOST}/g" ../files/telegraf.conf
sed -i "s/__VMSS__/${VMSS}/g" ../files/telegraf.conf
sed -i "s/__ARRAY__/${ARRAY}/g" ../files/telegraf.conf

echo "Copy configuration file to use"
TELEGRAF_CONF_DIR=/etc/telegraf
Expand Down
16 changes: 15 additions & 1 deletion playbooks/roles/grafana/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,20 @@
src: '{{role_path}}/dashboards'
dest: '{{ grafana_paths_provisioning }}/dashboards'

# Render the templated Grafana dashboards into the provisioning directory.
# Each loop item pairs a Jinja2 template with the path of the rendered file.
- name: Copy template dashboards
  template:
    src: "{{ item.dashboard }}"
    dest: "{{ item.destination }}"
  with_items:
    - dashboard: cluster-stats.json.j2
      destination: "{{ grafana_paths_provisioning }}/dashboards/dashboards/cluster-stats.json"
    - dashboard: azhop_compute_servers.json.j2
      destination: "{{ grafana_paths_provisioning }}/dashboards/dashboards/azhop_compute_servers.json"
  vars:
    # Task-scoped variables made available to the dashboard templates.
    scheduler_name: "{{ scheduler.name | default('scheduler') }}"
    # Threshold choices; kept as quoted strings so they stay strings, not ints.
    thresholds: ["10", "20", "30", "40", "50", "60", "70", "80", "90", "100"]
    # Fallbacks match the defaults declared for these keys in config.schema.json.
    idle_threshold: "{{ monitoring.idle_threshold | default('70') }}"
    mem_threshold: "{{ monitoring.mem_threshold | default('30') }}"
    iowait_threshold: "{{ monitoring.iowait_threshold | default('40') }}"

- name: Copy infra dashboard file
template:
src: azhop_infra_servers.json.j2
Expand All @@ -43,4 +57,4 @@
service:
name: grafana-server
state: started
enabled: true
enabled: true
Loading

0 comments on commit 92127ce

Please sign in to comment.