Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

[cluster utilization] get the full list of jobs in 7 days #5376

Merged
merged 3 commits into from
Mar 19, 2021
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,6 @@ authentication:
# summary: "{% raw %}{{$labels.job_name}}{% endraw %} has a job gpu percent lower than 30% for 1 hour"
# description: Monitor job level gpu utilization in certain virtual clusters.


# uncomment following if you want to change customize grafana
# grafana:
# port: 3000
Expand Down
38 changes: 31 additions & 7 deletions src/alert-manager/src/cluster-utilization/send_alert.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from datetime import timezone, datetime
from datetime import timezone, datetime, timedelta
import logging
import os
import requests
Expand All @@ -14,7 +14,7 @@
ALERT_PREFIX = "/alert-manager/api/v1/alerts"
# only the jobs that are running or completed within 7d should be included
# currently, we just set the limit to max
REST_JOB_API_PREFIX = "/rest-server/api/v2/jobs?limit=50000"
REST_JOB_API_PREFIX = "/rest-server/api/v2/jobs?order=completionTime,DESC"

TOKEN = os.environ.get('PAI_BEARER_TOKEN')
PROMETHEUS_SCRAPE_INTERVAL = int(os.environ.get('PROMETHEUS_SCRAPE_INTERVAL'))
Expand Down Expand Up @@ -49,15 +49,39 @@ def datetime_to_hours(dt):
return dt.days * 24 + dt.seconds / 3600


def check_timestamp_within_7d(timestamp):
"""
check if a timestamp is within 7 days
"""
return datetime.fromtimestamp(int(timestamp/1000), timezone.utc) > datetime.now(timezone.utc) - timedelta(days=7)


def get_jobs_in_7d(rest_url):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please change to another name. This function name is not correct, since it may return job before 7 days and contain dup job.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please change to another name. This function name is not correct, since it may return job before 7 days and contain dup job.

Fixed, thx

"""
Returns all jobs within 7 days
"""
jobs_in_7d = []

offset = 0
limit = 5000
headers = {'Authorization': "Bearer {}".format(TOKEN)}
while True:
resp = requests.get(rest_url+"limit={}&offset={}".format(limit, offset), headers=headers)
resp.raise_for_status()
jobs = resp.json()
jobs_in_7d += jobs
if not jobs or (jobs[-1]["completedTime"] is not None and not check_timestamp_within_7d(jobs[-1]["completedTime"])) :
break
offset += limit

return jobs_in_7d


@enable_request_debug_log
def get_usage_info(job_gpu_percent, job_gpu_hours, user_usage_result, rest_url):
job_infos = {}
user_infos = {}
# get all jobs
headers = {'Authorization': "Bearer {}".format(TOKEN)}
resp = requests.get(rest_url, headers=headers)
resp.raise_for_status()
job_list = resp.json()
job_list = get_jobs_in_7d(rest_url)

for v in user_usage_result["data"]["result"]:
user_infos[v["metric"]["username"]] = {
Expand Down