This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

[cluster utilization] get the full list of jobs in 7 days #5376

Merged: 3 commits, Mar 19, 2021
48 changes: 40 additions & 8 deletions src/alert-manager/src/cluster-utilization/send_alert.py
@@ -1,4 +1,4 @@
-from datetime import timezone, datetime
+from datetime import timezone, datetime, timedelta
 import logging
 import os
 import requests
@@ -14,7 +14,7 @@
 ALERT_PREFIX = "/alert-manager/api/v1/alerts"
 # only the jobs that are running or completed within 7d should be included
 # currently, we just set the limit to max
-REST_JOB_API_PREFIX = "/rest-server/api/v2/jobs?limit=50000"
+REST_JOB_API_PREFIX = "/rest-server/api/v2/jobs?order=completionTime,DESC"
 
 TOKEN = os.environ.get('PAI_BEARER_TOKEN')
 PROMETHEUS_SCRAPE_INTERVAL = int(os.environ.get('PROMETHEUS_SCRAPE_INTERVAL'))
@@ -49,15 +49,47 @@ def datetime_to_hours(dt):
     return dt.days * 24 + dt.seconds / 3600
 
 
+def check_timestamp_within_7d(timestamp):
+    """
+    check if a timestamp is within 7 days
+    """
+    return datetime.fromtimestamp(int(timestamp/1000), timezone.utc) > datetime.now(timezone.utc) - timedelta(days=7)
+
+
+def get_related_jobs(rest_url):
+    """
+    Returns all related jobs
+
+    Returns:
+    --------
+    list
+        All the jobs completed within 7 days will be included in the list.
+        Jobs completed before 7 days may also be included.
+        The list may contain duplicated jobs.
+    """
+    jobs_related = []
+
+    offset = 0
+    limit = 5000
+    headers = {'Authorization': "Bearer {}".format(TOKEN)}
+    while True:
+        resp = requests.get(rest_url+"limit={}&offset={}".format(limit, offset), headers=headers)
+        resp.raise_for_status()
+        jobs = resp.json()
+        jobs_related += jobs
+        # no more jobs or the last job in the list completed before 7 days
+        if not jobs or (jobs[-1]["completedTime"] is not None and not check_timestamp_within_7d(jobs[-1]["completedTime"])):
+            break
+        offset += limit
+
+    return jobs_related
+
+
 @enable_request_debug_log
 def get_usage_info(job_gpu_percent, job_gpu_hours, user_usage_result, rest_url):
     job_infos = {}
     user_infos = {}
-    # get all jobs
-    headers = {'Authorization': "Bearer {}".format(TOKEN)}
-    resp = requests.get(rest_url, headers=headers)
-    resp.raise_for_status()
-    job_list = resp.json()
+    jobs_related = get_related_jobs(rest_url)
 
     for v in user_usage_result["data"]["result"]:
         user_infos[v["metric"]["username"]] = {
@@ -68,7 +100,7 @@ def get_usage_info(job_gpu_percent, job_gpu_hours, user_usage_result, rest_url):
         job_name = v["metric"]["job_name"]
         matched_job = list(
             filter(lambda job: "{}~{}".format(job["username"], job["name"]) == job_name,
-                   job_list))
+                   jobs_related))
         # ignore jobs that are not found
         if not matched_job:
             logging.warning("Job %s not found.", job_name)
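
Note on the new pagination logic: the docstring of get_related_jobs is explicit that the function over-fetches. The loop only stops once the last job on a page completed more than 7 days ago, so the combined list can include jobs outside the window as well as duplicates (a job can shift between pages while the loop is paginating). get_usage_info only uses the list for lookups by the "{username}~{name}" key, so this is harmless there, but a caller that aggregated the list directly would want to tighten it first. Below is a minimal post-processing sketch; the dedup_and_filter_jobs helper is illustrative and not part of this pull request, and it assumes each job dict carries the "username", "name", and "completedTime" fields used elsewhere in send_alert.py, with completedTime set to None for jobs that have not completed.

# Illustrative sketch only; not part of this pull request.
def dedup_and_filter_jobs(jobs_related):
    """Drop duplicated entries and jobs that completed more than 7 days ago."""
    unique_jobs = {}
    for job in jobs_related:
        # the same "{username}~{name}" key that get_usage_info matches on
        key = "{}~{}".format(job["username"], job["name"])
        unique_jobs[key] = job
    return [
        job for job in unique_jobs.values()
        # keep running jobs (no completedTime) and jobs completed within 7 days
        if job["completedTime"] is None or check_timestamp_within_7d(job["completedTime"])
    ]

With this in place, dedup_and_filter_jobs(get_related_jobs(rest_url)) would yield exactly one entry per job that is running or completed within the 7-day window, reusing check_timestamp_within_7d from this change.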