Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/cluster dashboard #1721

Merged
merged 7 commits into from
Nov 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion config.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,10 @@
"install_agent": {"type": "boolean", "default": true, "deprecated": true },
"azure_monitor_agent": {"type": "boolean", "default": false },
"telegraf": {"type": "boolean", "default": true },
"grafana": {"type": "boolean", "default": true }
"grafana": {"type": "boolean", "default": true },
"idle_threshold": {"type": "string", "pattern": "^(10|20|30|40|50|60|70|80|90|100)$", "default": "70"},
"iowait_threshold": {"type": "string", "pattern": "^(10|20|30|40|50|60|70|80|90|100)$", "default": "40"},
"mem_threshold": {"type": "string", "pattern": "^(10|20|30|40|50|60|70|80|90|100)$", "default": "30"}
},
"title": "monitoring"
},
Expand Down
3 changes: 3 additions & 0 deletions config.tpl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ monitoring:
# Optional settings to deploy Grafana and install Telegraf
telegraf: true # Install telegraf on static infra VMs and dynamic compute nodes. Default: true
grafana: true # Deploy a Grafana instance with pre-defined dashboards. Default: true
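idle_threshold: '70' # Threshold used to highlight idle VMs in the Grafana cluster stats dashboard. Allowed values: '10' to '100' in steps of 10. Default: '70'
mem_threshold: '30' # Threshold used to highlight VMs running out of memory in the Grafana cluster stats dashboard. Allowed values: '10' to '100' in steps of 10. Default: '30'
iowait_threshold: '40' # Threshold used to highlight VMs waiting on IO in the Grafana cluster stats dashboard. Allowed values: '10' to '100' in steps of 10. Default: '40'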

# If set to true, alert rules associated with az-hop are created. Enabling alerting requires specifying an admin email address to send alerts to.
alerting:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Global tags for this telegraf agent.
# Every "__NAME__" value below is a placeholder: the node setup script
# substitutes it with sed before this file is put to use.
[global_tags]
# VM size reported by the instance metadata service, lower-cased.
sku = "__SKU__"
# PhysicalHostName entry read from the Hyper-V KVP pool file.
physical_hostname = "__PHYS_HOST__"
# vmScaleSetName reported by the instance metadata service.
vmss = "__VMSS__"

# Value of `jetpack config cyclecloud.node.template` on the node.
node_array = "__ARRAY__"
[agent]
interval = "10s"
round_interval = true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@ echo "Configuring global tags"
# Collect the values that are written into the telegraf global tags.
# Both instance metadata queries (169.254.169.254) use --noproxy "*" so that
# they are never routed through a configured HTTP proxy.

# VM size, lower-cased.
AZHPC_VMSIZE=$(curl -s --noproxy "*" -H Metadata:true "http://169.254.169.254/metadata/instance/compute?api-version=2019-08-15" | jq -r '.vmSize' | tr '[:upper:]' '[:lower:]')
# Physical host name: the line following "PhysicalHostName" in the Hyper-V KVP pool file.
PHYSICAL_HOST=$(strings /var/lib/hyperv/.kvp_pool_3 | grep -A1 PhysicalHostName | head -n 2 | tail -1)
# Name of the VM scale set this instance belongs to.
VMSS=$(curl -s --noproxy "*" -H Metadata:true "http://169.254.169.254/metadata/instance?api-version=2018-10-01" | jq -r '.compute.vmScaleSetName')
# CycleCloud node template name, used as the node array tag.
ARRAY=$(jetpack config cyclecloud.node.template)

# Replace every placeholder of the telegraf configuration in a single pass.
sed -i \
    -e "s/__SKU__/${AZHPC_VMSIZE}/g" \
    -e "s/__PHYS_HOST__/${PHYSICAL_HOST}/g" \
    -e "s/__VMSS__/${VMSS}/g" \
    -e "s/__ARRAY__/${ARRAY}/g" \
    ../files/telegraf.conf

echo "Copy configuration file to use"
TELEGRAF_CONF_DIR=/etc/telegraf
Expand Down
16 changes: 15 additions & 1 deletion playbooks/roles/grafana/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,20 @@
src: '{{role_path}}/dashboards'
dest: '{{ grafana_paths_provisioning }}/dashboards'

# Render the templated Grafana dashboards into the provisioning directory.
# The task-level vars are what the dashboard templates are rendered with; the
# three threshold values fall back to their defaults when the corresponding
# monitoring setting is not defined.
- name: Copy template dashboards
  vars:
    scheduler_name: "{{scheduler.name | default('scheduler')}}"
    thresholds:
      - "10"
      - "20"
      - "30"
      - "40"
      - "50"
      - "60"
      - "70"
      - "80"
      - "90"
      - "100"
    idle_threshold: "{{monitoring.idle_threshold | default('70')}}"
    mem_threshold: "{{monitoring.mem_threshold | default('30')}}"
    iowait_threshold: "{{monitoring.iowait_threshold | default('40')}}"
  template:
    src: "{{ item.dashboard }}"
    dest: "{{ item.destination }}"
  with_items:
    - dashboard: cluster-stats.json.j2
      destination: '{{ grafana_paths_provisioning }}/dashboards/dashboards/cluster-stats.json'
    - dashboard: azhop_compute_servers.json.j2
      destination: '{{ grafana_paths_provisioning }}/dashboards/dashboards/azhop_compute_servers.json'

- name: Copy infra dashboard file
template:
src: azhop_infra_servers.json.j2
Expand All @@ -43,4 +57,4 @@
service:
name: grafana-server
state: started
enabled: true
enabled: true
Loading