Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

send regular GPU utilization report with CronJob #5281

Merged
merged 29 commits into from
Feb 7, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 22 additions & 1 deletion .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ jobs:
rm -rf ./src/watchdog/GOPATH/src/github.com/microsoft/watchdog/vendor/
misspell -error .

pylint:
pylint-deployment:
name: pylint of deployment scripts
runs-on: ubuntu-16.04

Expand All @@ -64,6 +64,27 @@ jobs:
- name: Lint
run: |
pylint contrib/kubespray/script --rcfile=contrib/kubespray/script/pylintrc

# Lints the alert-manager cluster-utilization scripts with pylint, using the
# pylintrc that lives next to them.
pylint-alert-manager:
  name: pylint of alert-manager
  # NOTE(review): kept identical to the sibling lint jobs; confirm this runner
  # image is still offered before relying on this workflow.
  runs-on: ubuntu-16.04

  steps:
  - name: Checkout
    uses: actions/checkout@v1
  - name: Use Python 3.7
    uses: actions/setup-python@v2
    with:
      # Quoted so the version stays a string: an unquoted 3.10 would be
      # parsed as the float 3.1.
      python-version: '3.7'
      architecture: x64
  - name: Install dependencies
    run: |
      python -m pip install --upgrade pip
      python -m pip install -r src/alert-manager/src/cluster-utilization/requirements.txt
      python -m pip install pylint
  - name: Lint
    run: |
      pylint src/alert-manager/src/cluster-utilization/ --rcfile=src/alert-manager/src/cluster-utilization/pylintrc

swagger-validate:
name: Validate swagger
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,9 @@ authentication:
# smtp-from: alert-sender@example.com
# smtp-auth-username: alert-sender@example.com
# smtp-auth-password: password-for-alert-sender
# cluster-utilization: # cluster-utilization is a k8s CronJob which reports the GPU utilization of the cluster
# # for schedule syntax, refer to https://kubernetes.io/docs/concepts/workloads/controllers/cron-jobs/#cron-schedule-syntax
# schedule: "0 0 * * *" # daily report at UTC 00:00
# customized-routes:
# routes:
# - receiver: pai-email-admin-user-and-stop-job
Expand Down
3 changes: 3 additions & 0 deletions deployment/quick-start/services-configuration.yaml.template
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,9 @@ rest-server:
# smtp-from: alert-sender@example.com
# smtp-auth-username: alert-sender@example.com
# smtp-auth-password: password-for-alert-sender
# cluster-utilization: # cluster-utilization is a k8s CronJob which reports the GPU utilization of the cluster
# # for schedule syntax, refer to https://kubernetes.io/docs/concepts/workloads/controllers/cron-jobs/#cron-schedule-syntax
# schedule: "0 0 * * *" # daily report at UTC 00:00
# customized-routes:
# routes:
# - receiver: pai-email-admin-user-and-stop-job
Expand Down
2 changes: 1 addition & 1 deletion docs/manual/cluster-admin/how-to-use-alert-system.md
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ Remember to re-build and push the docker image, and restart the `alert-manager`

```bash
./build/pai_build.py build -c /cluster-configuration/ -s alert-manager
./build/pai_build.py push -c /cluster-configuration/ -i alert-handler
./build/pai_build.py push -c /cluster-configuration/ -i alert-handler cluster-utilization
./paictl.py service stop -n alert-manager
./paictl.py config push -p /cluster-configuration -m service
./paictl.py service start -n alert-manager
Expand Down
3 changes: 3 additions & 0 deletions examples/cluster-configuration/services-configuration.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,9 @@ rest-server:
# smtp-from: alert-sender@example.com
# smtp-auth-username: alert-sender@example.com
# smtp-auth-password: password-for-alert-sender
# cluster-utilization: # cluster-utilization is a k8s CronJob which reports the GPU utilization of the cluster
# # for schedule syntax, refer to https://kubernetes.io/docs/concepts/workloads/controllers/cron-jobs/#cron-schedule-syntax
# schedule: "0 0 * * *" # daily report at UTC 00:00
# customized-routes:
# routes:
# - receiver: pai-email-admin-user-and-stop-job
Expand Down
24 changes: 24 additions & 0 deletions src/alert-manager/build/cluster-utilization.common.dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# Image for the cluster-utilization report job: installs the Python
# dependencies and runs send_alert.py when the container starts.
FROM python:3.7

# Use a dedicated working directory so the sources are not copied into the
# image root ("/"); the relative COPY destination and the relative
# ENTRYPOINT script path below both resolve against it.
WORKDIR /cluster-utilization

COPY ./src/cluster-utilization .

# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip3 install --no-cache-dir -r requirements.txt

ENTRYPOINT ["python3", "send_alert.py"]
4 changes: 3 additions & 1 deletion src/alert-manager/config/alert-manager.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,7 @@ alert-handler:
log-level: 'info'
port: 9095
configured: False
cluster-utilization:
configured: False
use-pylon: False
repeat-interval: '24h'
repeat-interval: '24h'
4 changes: 4 additions & 0 deletions src/alert-manager/config/alert_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,10 @@ def run(self):
else:
result["alert-handler"]["configured"] = False

if result.get("cluster-utilization") is not None and \
result["cluster-utilization"].get("schedule") is not None:
result["cluster-utilization"]["configured"] = True

result["host"] = self.get_master_ip()
result["url"] = "http://{0}:{1}".format(self.get_master_ip(), result["port"])

Expand Down
11 changes: 11 additions & 0 deletions src/alert-manager/deploy/alert-manager-configmap.yaml.template
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ data:
- receiver: pai-cordon-nodes
match:
alertname: NvidiaSmiDoubleEccError

- receiver: pai-cluster-usage
match:
report_type: cluster-usage

{% if 'routes' in cluster_cfg["alert-manager"]["customized-routes"] %}
{% for route in cluster_cfg["alert-manager"]["customized-routes"]["routes"] %}
Expand All @@ -62,6 +66,13 @@ data:
- url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/send-email-to-admin'
send_resolved: true
{% endif %}

- name: pai-cluster-usage
webhook_configs:
{% if 'email-admin' in cluster_cfg["alert-manager"]["actions-available"] %}
- url: 'http://localhost:{{ cluster_cfg["alert-manager"]["alert-handler"]["port"] }}/alert-handler/send-email-to-admin/?template=cluster-usage'
send_resolved: false
{% endif %}

- name: pai-cordon-nodes
webhook_configs:
Expand Down
44 changes: 44 additions & 0 deletions src/alert-manager/deploy/alert-manager-cronjob.yaml.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


# CronJob that runs the cluster-utilization image on the schedule taken from
# alert-manager.cluster-utilization.schedule in the cluster configuration.
# This file is a Jinja template: every templated scalar is quoted so that an
# empty, numeric-looking or boolean-looking expansion is still rendered as a
# YAML string.
apiVersion: batch/v1beta1
kind: CronJob
metadata:
  name: cluster-utilization
spec:
  schedule: "{{ cluster_cfg["alert-manager"]["cluster-utilization"]["schedule"] }}"
  jobTemplate:
    spec:
      template:
        spec:
          containers:
          - name: cluster-utilization
            image: "{{ cluster_cfg['cluster']['docker-registry']['prefix'] }}cluster-utilization:{{ cluster_cfg['cluster']['docker-registry']['tag'] }}"
            imagePullPolicy: Always
            env:
            # PAI_URI is the https pylon URI when pylon ssl is configured,
            # otherwise the plain pylon URI.
            - name: PAI_URI
            {%- if "ssl" in cluster_cfg["pylon"] and cluster_cfg["pylon"]["ssl"] %}
              value: "{{ cluster_cfg['pylon']['uri-https']}}"
            {%- else %}
              value: "{{ cluster_cfg['pylon']['uri']}}"
            {%- endif %}
            # Container env values must be strings, hence the quotes.
            # NOTE(review): the token is rendered in plain text into the job
            # spec; consider sourcing it from a Secret.
            - name: PAI_BEARER_TOKEN
              value: "{{ cluster_cfg["alert-manager"]["alert-handler"]["pai-bearer-token"] }}"
          imagePullSecrets:
          - name: "{{ cluster_cfg["cluster"]["docker-registry"]["secret-name"] }}"
          restartPolicy: OnFailure
84 changes: 84 additions & 0 deletions src/alert-manager/deploy/alert-templates/cluster-usage/html.ejs
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
<!DOCTYPE html
  PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">

<head>
  <meta name="viewport" content="width=device-width" />
  <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
  <title>
    <%= cluster_id %>: Cluster GPU utilization for One Week
  </title>
</head>

<body itemscope="" itemtype="http://schema.org/EmailMessage"
  style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 16px; -webkit-font-smoothing: antialiased; -webkit-text-size-adjust: none; height: 100%; line-height: 1.6em; width: 100% !important; background-color: #f6f6f6; margin: 0; padding: 0;">
  <%# Email body for the cluster-usage report. Template inputs: `cluster_id` and `alerts`; each table below renders only the alerts whose labels carry the fields for that table. %>
  <h3 style="text-align:center">Cluster GPU utilization for One Week</h3>
  <%# Cluster table: one row per alert that has a `cluster_usage` label. %>
  <table style="font-size: 16px; width: 100%; margin: 0;">
    <% alerts.filter( element=> typeof element.labels.cluster_usage !== 'undefined').forEach(function(alert){ %>
    <tr>
      <th>Cluster GPU utilization</th>
      <td>
        <%= alert.labels.cluster_usage %>
      </td>
    </tr>
    <% }); %>
  </table>
  <br />
  <br />
  <h3 style="text-align:center">User GPU Utilization for One Week</h3>
  <%# User table: one row per alert that has both `user_name` and `user_usage` labels. %>
  <table style="font-size: 16px; width: 100%;margin: 0;text-align:center;">
    <tr>
      <th>User name</th>
      <th>GPU utilization</th>
    </tr>
    <% alerts.filter( element=> typeof element.labels.user_name !== 'undefined' && typeof element.labels.user_usage !== 'undefined').forEach(function(alert){ %>
    <tr>
      <td>
        <%= alert.labels.user_name %>
      </td>
      <td>
        <%= alert.labels.user_usage %>
      </td>
    </tr>
    <% }); %>
  </table>
  <br />
  <br />
  <h3 style="text-align:center">Job GPU Utilization for One Week</h3>
  <%# Job table: one row per alert that has both `job_name` and `job_usage` labels; the remaining job_* labels are printed as-is. %>
  <table style="font-size: 16px; width: 100%; margin: 0; text-align:center;">
    <tr>
      <th>Job name</th>
      <th>GPU utilization</th>
      <th>Job duration</th>
      <th>Job start time</th>
      <th>Job status</th>
      <th>GPU number</th>
    </tr>
    <% alerts.filter( element=> typeof element.labels.job_name !== 'undefined' && typeof element.labels.job_usage !== 'undefined').forEach(function(alert){ %>
    <tr>
      <td>
        <%= alert.labels.job_name %>
      </td>
      <td>
        <%= alert.labels.job_usage %>
      </td>
      <td>
        <%= alert.labels.job_duration %>
      </td>
      <td>
        <%= alert.labels.job_start_time %>
      </td>
      <td>
        <%= alert.labels.job_status %>
      </td>
      <td>
        <%= alert.labels.job_gpu_number %>
      </td>
    </tr>
    <% }); %>
  </table>
</body>

</html>
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
<%= cluster_id %>: Cluster GPU Utilization for One Week
1 change: 1 addition & 0 deletions src/alert-manager/deploy/service.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ prerequisite:
template-list:
- alert-manager-deployment.yaml
- alert-manager-configmap.yaml
- alert-manager-cronjob.yaml
- start.sh

start-script: start.sh
Expand Down
3 changes: 3 additions & 0 deletions src/alert-manager/deploy/start.sh.template
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ kubectl create configmap alert-templates \
kubectl apply --overwrite=true -f rbac.yaml || exit $?
kubectl apply --overwrite=true -f alert-manager-configmap.yaml || exit $?
kubectl apply --overwrite=true -f alert-manager-deployment.yaml || exit $?
{% if cluster_cfg["alert-manager"]["cluster-utilization"]["configured"] -%}
kubectl apply --overwrite=true -f alert-manager-cronjob.yaml || exit $?
{% endif -%}

sleep 10
# wait until the service is ready.
Expand Down
1 change: 1 addition & 0 deletions src/alert-manager/deploy/stop.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
kubectl delete --ignore-not-found --now configmap/alert-templates
kubectl delete --ignore-not-found --now configmap/alertmanager
kubectl delete --ignore-not-found --now deployment/alertmanager
kubectl delete --ignore-not-found --now cronjob/cluster-utilization

if kubectl get clusterrolebinding | grep -q "alert-manager-role-binding"; then
kubectl delete clusterrolebinding alert-manager-role-binding || exit $?
Expand Down
Empty file.
9 changes: 9 additions & 0 deletions src/alert-manager/src/cluster-utilization/pylintrc
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[SETTINGS]

max-line-length=140

disable =
missing-docstring,
invalid-name,
cell-var-from-loop,
undefined-loop-variable,
1 change: 1 addition & 0 deletions src/alert-manager/src/cluster-utilization/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
requests==2.23.0
Loading