diff --git a/pkg/scheduler/cache/cache.go b/pkg/scheduler/cache/cache.go index 5af0c6eaf3..60a660cd74 100644 --- a/pkg/scheduler/cache/cache.go +++ b/pkg/scheduler/cache/cache.go @@ -853,7 +853,7 @@ func (sc *SchedulerCache) processCleanupJob() { if schedulingapi.JobTerminated(job) { delete(sc.Jobs, job.UID) - metrics.DeleteJobShare(job.Namespace, job.Name) + metrics.DeleteJobMetrics(job.Name, string(job.Queue), job.Namespace) klog.V(3).Infof("Job <%v:%v/%v> was deleted.", job.UID, job.Namespace, job.Name) } else { // Retry diff --git a/pkg/scheduler/cache/event_handlers.go b/pkg/scheduler/cache/event_handlers.go index fc0c39ee4e..f29355d6d2 100644 --- a/pkg/scheduler/cache/event_handlers.go +++ b/pkg/scheduler/cache/event_handlers.go @@ -36,6 +36,7 @@ import ( schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1" "volcano.sh/apis/pkg/apis/utils" schedulingapi "volcano.sh/volcano/pkg/scheduler/api" + "volcano.sh/volcano/pkg/scheduler/metrics" ) func isTerminated(status schedulingapi.TaskStatus) bool { @@ -613,7 +614,10 @@ func (sc *SchedulerCache) updateQueue(queue *scheduling.Queue) { } func (sc *SchedulerCache) deleteQueue(id schedulingapi.QueueID) { - delete(sc.Queues, id) + if queue, ok := sc.Queues[id]; ok { + delete(sc.Queues, id) + metrics.DeleteQueueMetrics(queue.Name) + } } //DeletePriorityClass delete priorityclass from the scheduler cache diff --git a/pkg/scheduler/metrics/job.go b/pkg/scheduler/metrics/job.go index 8f9a5453bb..ace0008280 100644 --- a/pkg/scheduler/metrics/job.go +++ b/pkg/scheduler/metrics/job.go @@ -44,12 +44,15 @@ func UpdateJobShare(jobNs, jobID string, share float64) { jobShare.WithLabelValues(jobNs, jobID).Set(share) } -// DeleteJobShare delete jobShare for one job -func DeleteJobShare(jobNs, jobID string) { - jobShare.DeleteLabelValues(jobNs, jobID) -} - // RegisterJobRetries total number of job retries. 
func RegisterJobRetries(jobID string) { jobRetryCount.WithLabelValues(jobID).Inc() } + +// DeleteJobMetrics removes the job's label values from e2eJobSchedulingDuration, unscheduleTaskCount, jobShare and jobRetryCount. +func DeleteJobMetrics(jobName, queue, namespace string) { + e2eJobSchedulingDuration.DeleteLabelValues(jobName, queue, namespace) + unscheduleTaskCount.DeleteLabelValues(jobName) + jobShare.DeleteLabelValues(namespace, jobName) + jobRetryCount.DeleteLabelValues(jobName) +} diff --git a/pkg/scheduler/metrics/queue.go b/pkg/scheduler/metrics/queue.go index 3af5607134..ce2504d0f5 100644 --- a/pkg/scheduler/metrics/queue.go +++ b/pkg/scheduler/metrics/queue.go @@ -185,3 +185,20 @@ func UpdateQueuePodGroupRunningCount(queueName string, count int32) { func UpdateQueuePodGroupUnknownCount(queueName string, count int32) { queuePodGroupUnknown.WithLabelValues(queueName).Set(float64(count)) } + +// DeleteQueueMetrics calls DeleteLabelValues(queueName) on each of the queue* collectors listed below. +func DeleteQueueMetrics(queueName string) { + queueAllocatedMilliCPU.DeleteLabelValues(queueName) + queueAllocatedMemory.DeleteLabelValues(queueName) + queueRequestMilliCPU.DeleteLabelValues(queueName) + queueRequestMemory.DeleteLabelValues(queueName) + queueDeservedMilliCPU.DeleteLabelValues(queueName) + queueDeservedMemory.DeleteLabelValues(queueName) + queueShare.DeleteLabelValues(queueName) + queueWeight.DeleteLabelValues(queueName) + queueOverused.DeleteLabelValues(queueName) + queuePodGroupInqueue.DeleteLabelValues(queueName) + queuePodGroupPending.DeleteLabelValues(queueName) + queuePodGroupRunning.DeleteLabelValues(queueName) + queuePodGroupUnknown.DeleteLabelValues(queueName) +}