
fix: proportion metrics accuracy #2297

Merged

Conversation


@LY-today LY-today commented Jun 13, 2022

Ref: #2296
What happened:
When there is only one vcjob under a queue, deleting that vcjob leaves the metric data reported by the proportion plugin inaccurate.
What you expected to happen:
The metric data should stay accurate at all times.
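In other words, the per-queue allocated gauges (such as volcano_queue_allocated_milli_cpu, which the test below scrapes) should drop back to zero once the queue's last vcjob is deleted. A standalone sketch of that expectation (this is not volcano's actual metrics code; the gauge variable, label and helper names are made up for illustration):

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

// Illustrative only: volcano defines its real metrics in its own metrics package.
// This gauge mimics volcano_queue_allocated_milli_cpu, keyed by queue name.
var queueAllocatedMilliCPU = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: "volcano_queue_allocated_milli_cpu",
		Help: "Allocated CPU (milli-cores) per queue.",
	},
	[]string{"queue_name"},
)

// updateQueueAllocated is a hypothetical helper: after each scheduling session the gauge
// should equal the CPU currently allocated to the queue's running workloads, and it must
// return to 0 once the last vcjob in the queue is deleted, which is what this PR's test asserts.
func updateQueueAllocated(queue string, milliCPU float64) {
	queueAllocatedMilliCPU.WithLabelValues(queue).Set(milliCPU)
}

func main() {
	prometheus.MustRegister(queueAllocatedMilliCPU)
	updateQueueAllocated("q1", 12000) // pg1 (3+5 CPU) and pg2 (4 CPU) running under q1
	updateQueueAllocated("q1", 0)     // last vcjob deleted: the gauge must drop back to 0
	fmt.Println("expose the registry via promhttp to scrape these values")
}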
How to reproduce it (as minimally and precisely as possible):

/*
Copyright 2017 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package proportion

import (
	"io/ioutil"
	"net/http"
	"reflect"
	"strconv"
	"strings"
	"testing"
	"time"

	"github.com/agiledragon/gomonkey/v2"
	"github.com/prometheus/client_golang/prometheus/promhttp"
	apiv1 "k8s.io/api/core/v1"
	schedulingv1 "k8s.io/api/scheduling/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/tools/record"
	"k8s.io/client-go/util/workqueue"

	schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
	"volcano.sh/volcano/cmd/scheduler/app/options"
	"volcano.sh/volcano/pkg/scheduler/actions/allocate"
	"volcano.sh/volcano/pkg/scheduler/api"
	"volcano.sh/volcano/pkg/scheduler/cache"
	"volcano.sh/volcano/pkg/scheduler/conf"
	"volcano.sh/volcano/pkg/scheduler/framework"
	"volcano.sh/volcano/pkg/scheduler/plugins/gang"
	"volcano.sh/volcano/pkg/scheduler/plugins/priority"
	"volcano.sh/volcano/pkg/scheduler/util"
)

func getWorkerAffinity() *apiv1.Affinity {
	return &apiv1.Affinity{
		PodAntiAffinity: &apiv1.PodAntiAffinity{
			RequiredDuringSchedulingIgnoredDuringExecution: []apiv1.PodAffinityTerm{
				{
					LabelSelector: &metav1.LabelSelector{
						MatchExpressions: []metav1.LabelSelectorRequirement{
							{
								Key:      "role",
								Operator: "In",
								Values:   []string{"worker"},
							},
						},
					},
					TopologyKey: "kubernetes.io/hostname",
				},
			},
		},
	}
}

func getLocalMetrics() int {
	var data int

	url := "http://127.0.0.1:8081/metrics"
	method := "GET"

	client := &http.Client{}
	req, err := http.NewRequest(method, url, nil)

	if err != nil {
		return data
	}
	req.Header.Add("Authorization", "8cbdb37a-b880-4f2e-844c-e420858ea7eb")

	res, err := client.Do(req)
	if err != nil {
		return data
	}
	defer res.Body.Close()

	body, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return data
	}

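	// Parse the plain-text exposition: skip "#" comment lines and keep the value of the
	// last volcano_queue_allocated_memory_bytes / volcano_queue_allocated_milli_cpu sample seen.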
	split := strings.Split(string(body), "\n")
	for _, v := range split {
		if !strings.Contains(v, "#") && (strings.Contains(v, "volcano_queue_allocated_memory_bytes") || strings.Contains(v, "volcano_queue_allocated_milli_cpu")) {
			data, _ = strconv.Atoi(strings.Split(v, " ")[1])
		}
	}

	return data
}

func TestProportion(t *testing.T) {
	c := make(chan bool, 1)
	var tmp *cache.SchedulerCache
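	// Monkey-patch SchedulerCache.AddBindTask so that binding a task goes straight to the
	// fake binder instead of the cache's real asynchronous bind workflow.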
	patches := gomonkey.ApplyMethod(reflect.TypeOf(tmp), "AddBindTask", func(scCache *cache.SchedulerCache, task *api.TaskInfo) error {
		scCache.Binder.Bind(nil, []*api.TaskInfo{task})
		return nil
	})
	defer patches.Reset()

	framework.RegisterPluginBuilder(PluginName, New)
	framework.RegisterPluginBuilder(gang.PluginName, gang.New)
	framework.RegisterPluginBuilder(priority.PluginName, priority.New)
	options.ServerOpts = options.NewServerOption()
	defer framework.CleanupPluginBuilders()

	// Running pods
	w1 := util.BuildPod("ns1", "worker-1", "", apiv1.PodRunning, util.BuildResourceList("3", "3k"), "pg1", map[string]string{"role": "worker"}, map[string]string{"selector": "worker"})
	w2 := util.BuildPod("ns1", "worker-2", "", apiv1.PodRunning, util.BuildResourceList("5", "5k"), "pg1", map[string]string{"role": "worker"}, map[string]string{})
	w3 := util.BuildPod("ns1", "worker-3", "", apiv1.PodRunning, util.BuildResourceList("4", "4k"), "pg2", map[string]string{"role": "worker"}, map[string]string{})
	w1.Spec.Affinity = getWorkerAffinity()
	w2.Spec.Affinity = getWorkerAffinity()
	w3.Spec.Affinity = getWorkerAffinity()

	// nodes
	n1 := util.BuildNode("node1", util.BuildResourceList("4", "4k"), map[string]string{"selector": "worker"})
	n2 := util.BuildNode("node2", util.BuildResourceList("3", "3k"), map[string]string{})
	n1.Status.Allocatable["pods"] = resource.MustParse("15")
	n2.Status.Allocatable["pods"] = resource.MustParse("15")
	n1.Labels["kubernetes.io/hostname"] = "node1"
	n2.Labels["kubernetes.io/hostname"] = "node2"

	// priority
	p1 := &schedulingv1.PriorityClass{ObjectMeta: metav1.ObjectMeta{Name: "p1"}, Value: 1}
	p2 := &schedulingv1.PriorityClass{ObjectMeta: metav1.ObjectMeta{Name: "p2"}, Value: 2}
	// podgroup
	pg1 := &schedulingv1beta1.PodGroup{
		ObjectMeta: metav1.ObjectMeta{
			Namespace: "ns1",
			Name:      "pg1",
		},
		Spec: schedulingv1beta1.PodGroupSpec{
			Queue:             "q1",
			MinMember:         int32(2),
			PriorityClassName: p2.Name,
		},
	}
	pg2 := &schedulingv1beta1.PodGroup{
		ObjectMeta: metav1.ObjectMeta{
			Namespace: "ns1",
			Name:      "pg2",
		},
		Spec: schedulingv1beta1.PodGroupSpec{
			Queue:             "q1",
			MinMember:         int32(1),
			PriorityClassName: p1.Name,
		},
	}
	// queue
	queue1 := &schedulingv1beta1.Queue{
		ObjectMeta: metav1.ObjectMeta{
			Name: "q1",
		},
	}

	// tests
	tests := []struct {
		name     string
		pods     []*apiv1.Pod
		nodes    []*apiv1.Node
		pcs      []*schedulingv1.PriorityClass
		pgs      []*schedulingv1beta1.PodGroup
		expected map[string]string
	}{
		{
			name:  "pod-deallocate",
			pods:  []*apiv1.Pod{w1, w2, w3},
			nodes: []*apiv1.Node{n1, n2},
			pcs:   []*schedulingv1.PriorityClass{p1, p2},
			pgs:   []*schedulingv1beta1.PodGroup{pg1, pg2},
			expected: map[string]string{ // podKey -> node
				"ns1/worker-3": "node1",
			},
		},
	}

	for _, test := range tests {
		// initialize schedulerCache
		binder := &util.FakeBinder{
			Binds:   map[string]string{},
			Channel: make(chan string),
		}
		recorder := record.NewFakeRecorder(100)
		go func() {
			for {
				event := <-recorder.Events
				t.Logf("%s: [Event] %s", test.name, event)
			}
		}()
		schedulerCache := &cache.SchedulerCache{
			Nodes:           make(map[string]*api.NodeInfo),
			Jobs:            make(map[api.JobID]*api.JobInfo),
			PriorityClasses: make(map[string]*schedulingv1.PriorityClass),
			Queues:          make(map[api.QueueID]*api.QueueInfo),
			Binder:          binder,
			StatusUpdater:   &util.FakeStatusUpdater{},
			VolumeBinder:    &util.FakeVolumeBinder{},
			Recorder:        recorder,
		}

		// If DeletedJobs is not exported, this deleted-job workqueue cannot be initialized here,
		// the podgroup deletions below are never processed, and the bug cannot be reproduced.
		// Note: alternatives that avoid exporting deletedJobs were tried, e.g. moving the test into
		// the cache package (which causes import conflicts) or building the cache via cache.New
		// (which has no fake mechanism and fails to create the default queue), so exporting the
		// field was the only workable option for this test.
		schedulerCache.DeletedJobs = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())

		for _, node := range test.nodes {
			schedulerCache.AddNode(node)
		}
		for _, pod := range test.pods {
			schedulerCache.AddPod(pod)
		}
		for _, pc := range test.pcs {
			schedulerCache.PriorityClasses[pc.Name] = pc
		}
		for _, pg := range test.pgs {
			pg.Status = schedulingv1beta1.PodGroupStatus{
				Phase: schedulingv1beta1.PodGroupInqueue,
			}
			schedulerCache.AddPodGroupV1beta1(pg)
		}
		schedulerCache.AddQueueV1beta1(queue1)
		// session
		trueValue := true

		num := 1
		// proportion
		go func() {
			for {
				select {
				default:
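					// Each iteration opens a session, runs the allocate action, then checks the
					// queue_allocated metrics after deleting the podgroups one by one:
					// 12000m with pg1+pg2 running, 4000m after deleting pg1, and 0 after deleting pg2.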
					ssn := framework.OpenSession(schedulerCache, []conf.Tier{
						{
							Plugins: []conf.PluginOption{
								{
									Name:             PluginName,
									EnabledPredicate: &trueValue,
								},
								{
									Name:                gang.PluginName,
									EnabledJobReady:     &trueValue,
									EnabledJobPipelined: &trueValue,
								},
								{
									Name:            priority.PluginName,
									EnabledJobOrder: &trueValue,
								},
							},
						},
					}, nil)

					allocator := allocate.New()
					allocator.Execute(ssn)
					framework.CloseSession(ssn)
					time.Sleep(time.Second * 3)
					if num == 1 {
						metrics := getLocalMetrics()
						if metrics == 12000 {
							t.Logf("init queue_allocated metrics is ok,%v", metrics)
						}
						schedulerCache.DeletePodGroupV1beta1(pg1)
					} else if num == 2 {
						metrics := getLocalMetrics()
						if metrics == 4000 {
							t.Logf("after delete vcjob pg1, queue_allocated metrics is ok,%v", metrics)
						}
						schedulerCache.DeletePodGroupV1beta1(pg2)
					} else {
						metrics := getLocalMetrics()
						if metrics != 0 {
							t.Errorf("after delete vcjob pg2, queue_allocated metrics is fail,%v", metrics)
							c <- false
							return
						} else {
							t.Logf("after delete vcjob pg2, queue_allocated metrics is ok,%v", metrics)
							c <- true
						}
					}
					num++
				}
			}
		}()

		go func() {
			http.Handle("/metrics", promhttp.Handler())
			err := http.ListenAndServe(":8081", nil)
			if err != nil {
				t.Errorf("ListenAndServe() err = %v", err.Error())
			}
		}()

		for {
			select {
			case res := <-c:
				if !res {
					t.Error("TestProportion failed")
				} else {
					t.Log("TestProportion successful")
				}
				return
			}

		}
	}
}

Anything else we need to know?:
If DeletedJobs is not exported as a public field, subsequent jobs cannot be deleted in the test and the problem cannot be reproduced.
Note: I also tried approaches that do not modify deletedJobs, such as moving the test into the cache package, but that introduces package conflicts. Using cache.New does not work either, because there is no fake mechanism and creating the default queue fails. In the end, exporting the field was the only workable way to write this test.
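Concretely, the key line in the test above is the initialization of the exported field (a minimal sketch of just that step; the workqueue constructor is the standard client-go one already imported by the test):

// Presumably this workqueue is created by the cache's own constructor in production; because
// the test builds the SchedulerCache struct literal directly, it must initialize the (now
// exported) DeletedJobs field itself, otherwise the podgroup deletions are never fully
// processed and the queue_allocated metrics are never corrected back down.
schedulerCache.DeletedJobs = workqueue.NewRateLimitingQueue(workqueue.DefaultControllerRateLimiter())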
Environment:

Volcano version: master (commit 5cfe62d01d27a5b6482ae6a9458b9451b7e5bc3e)
Kubernetes version (kubectl version): v1.18.2

@volcano-sh-bot
Contributor

Welcome @LY-today!

It looks like this is your first PR to volcano-sh/volcano 🎉.

Thank you, and welcome to Volcano. 😃

@volcano-sh-bot volcano-sh-bot added the size/L Denotes a PR that changes 100-499 lines, ignoring generated files. label Jun 13, 2022
@LY-today LY-today force-pushed the proportion-metric-accuracy branch 2 times, most recently from 88f6ccb to 33c6d5e Compare June 13, 2022 09:47
@LY-today
Contributor Author

/assign @shinytang6

@mangk

mangk commented Jun 13, 2022

I also ran into this problem. The cache's deletedJobs in Volcano does not seem very test-friendly; I hope to see changes in the future. Seeing your PR, I finally understand why my Prometheus data was inaccurate. I've adopted the fix locally.

@Thor-wl
Contributor

Thor-wl commented Jun 14, 2022

@LY-today Hey, thanks for your contribution! I've reviewed the description and the fix, and I have a question about this part:

If DeletedJobs is not exported as a public field, subsequent jobs cannot be deleted in the test and the problem cannot be reproduced.

May I know why it is hard to reproduce if the field is not made public?

@Thor-wl
Contributor

Thor-wl commented Jun 14, 2022

Besides, please rebase onto the latest master code to resolve the CI Code Verify failure.

@LY-today
Contributor Author

@LY-today Hey, thanks for your contribution! I've reviewed the description and the fix, and I have a question about this part:

If DeletedJobs is not exported as a public field, subsequent jobs cannot be deleted in the test and the problem cannot be reproduced.

May I know why it is hard to reproduce if the field is not made public?

@Thor-wl
To keep the reproduction minimal and precise, I tried not to modify deletedJobs at first. For example: 1. testing from inside the cache package, which results in a package conflict; 2. creating the schedulerCache via cache.New, which fails because there is no fake mechanism and the default queue cannot be created. All of these problems come from reproducing the scenario: the bug only appears once the vcjobs under the queue have been deleted down to zero, and that delete operation depends on deletedJobs being initialized. Since initializing deletedJobs from a test is not straightforward and the approaches above all ran into problems, I finally made DeletedJobs public. I hope that explains it clearly.

@LY-today
Contributor Author

Besides, please rebase onto the latest master code to resolve the CI Code Verify failure.

Thanks for the tip, done.

@volcano-sh-bot volcano-sh-bot added size/XXL Denotes a PR that changes 1000+ lines, ignoring generated files. size/L Denotes a PR that changes 100-499 lines, ignoring generated files. and removed size/L Denotes a PR that changes 100-499 lines, ignoring generated files. size/XXL Denotes a PR that changes 1000+ lines, ignoring generated files. labels Jun 14, 2022
@LY-today LY-today force-pushed the proportion-metric-accuracy branch 2 times, most recently from d369118 to 70ba674 Compare June 14, 2022 09:54
@LY-today
Contributor Author

LY-today commented Jun 16, 2022

@Thor-wl Hi, the tide process seems to be stuck. What information do I need to add to move things forward?

@LY-today
Contributor Author

/assign @shinytang6

@hwdef
Member

hwdef commented Jun 21, 2022

Please squash your commit.

@Thor-wl
Contributor

Thor-wl commented Jun 21, 2022

@Thor-wl Hi, the tide process seems to be stuck. What information do I need to add to move things forward?

Sorry for the late reply. I'll finish the review ASAP.

@Thor-wl
Contributor

Thor-wl commented Jun 21, 2022

/lgtm

@volcano-sh-bot volcano-sh-bot added the lgtm Indicates that a PR is ready to be merged. label Jun 21, 2022
@LY-today
Contributor Author

@Thor-wl Hi, the tide process seems to be stuck. What information do I need to add to move things forward?

Sorry for the late reply. I'll finish the review ASAP.

That's fine, but I've run into an unexpected problem: I want to squash the previous commits with a rebase, but some of your commits cause conflicts in my local rebase.

@Thor-wl
Contributor

Thor-wl commented Jun 21, 2022

@Thor-wl Hi, the tide process seems to be stuck. What information do I need to add to move things forward?

Sorry for the late reply. I'll finish the review ASAP.

That's fine, but I've run into an unexpected problem: I want to squash the previous commits with a rebase, but some of your commits cause conflicts in my local rebase.

Perhaps you can rebase all your commits first, and then run git pull --rebase origin master to merge with the latest code and resolve the conflicts.

Signed-off-by: LY-today <724102053@qq.com>
@volcano-sh-bot volcano-sh-bot removed the lgtm Indicates that a PR is ready to be merged. label Jun 21, 2022
@LY-today
Contributor Author

@Thor-wl Hi, the tide process seems to be stuck. What information do I need to add to move things forward?

Sorry for the late reply. I'll finish the review ASAP.

That's fine, but I've run into an unexpected problem: I want to squash the previous commits with a rebase, but some of your commits cause conflicts in my local rebase.

Perhaps you can rebase all your commits first, and then run git pull --rebase origin master to merge with the latest code and resolve the conflicts.

Thanks for the tip. I have rebased all the commits and pushed them again. Once the tests pass, you can re-add /lgtm.

@LY-today
Contributor Author

@Thor-wl Hi, the tide process seems to be stuck. What information do I need to add to move things forward?

Sorry for the late reply. I'll finish the review ASAP.

That's fine, but I've run into an unexpected problem: I want to squash the previous commits with a rebase, but some of your commits cause conflicts in my local rebase.

Perhaps you can rebase all your commits first, and then run git pull --rebase origin master to merge with the latest code and resolve the conflicts.

Thanks for the tip. I have rebased all the commits and pushed them again. Once the tests pass, you can re-add /lgtm.

@Thor-wl Looks like the tests are done.
@hwdef Done.

@Thor-wl
Contributor

Thor-wl commented Jun 22, 2022

/lgtm

@volcano-sh-bot volcano-sh-bot added the lgtm Indicates that a PR is ready to be merged. label Jun 22, 2022
@LY-today
Contributor Author

LY-today commented Jun 23, 2022

@Thor-wl Hi, I would like to know whether I still need to make any changes.

@Thor-wl
Contributor

Thor-wl commented Jun 24, 2022

@Thor-wl Hi, I would like to know whether I still need to make any changes.

It generally looks OK to me, and I've marked it lgtm. Looking forward to other reviewers' advice.

@LY-today
Contributor Author

@Thor-wl Hi, I would like to know whether I still need to make any changes.

It generally looks OK to me, and I've marked it lgtm. Looking forward to other reviewers' advice.

Understood, thanks.

@LY-today
Contributor Author

LY-today commented Jun 24, 2022

@alcorj-mizar @zen-xu @hwdef If you have time, please take a look at this PR. If there are any deficiencies I will improve them as soon as possible; if there are no problems, can we proceed to the next step? This issue is affecting our resource monitoring system. Thank you, reviewers.

@Thor-wl Thor-wl requested review from shinytang6, qiankunli, Thor-wl and william-wang and removed request for zen-xu and alcorj-mizar June 24, 2022 02:12
Member

@william-wang william-wang left a comment


/approve

@volcano-sh-bot
Contributor

[APPROVALNOTIFIER] This PR is APPROVED

This pull-request has been approved by: william-wang

The full list of commands accepted by this bot can be found here.

The pull request process is described here

Needs approval from an approver in each of these files:

Approvers can indicate their approval by writing /approve in a comment
Approvers can cancel approval by writing /approve cancel in a comment

@volcano-sh-bot volcano-sh-bot added the approved Indicates a PR has been approved by an approver from all required OWNERS files. label Jun 24, 2022
@volcano-sh-bot volcano-sh-bot merged commit 40b7fa7 into volcano-sh:master Jun 24, 2022