fix possible panic when 'SetNode' is called #1952

Status: Merged (1 commit, Jan 18, 2022)
pkg/scheduler/api/node_info.go (25 additions, 8 deletions)

@@ -295,17 +295,34 @@ func (ni *NodeInfo) setNodeGPUInfo(node *v1.Node) {

 // SetNode sets kubernetes node object to nodeInfo object
 func (ni *NodeInfo) SetNode(node *v1.Node) {
-	ni.setOversubscription(node)
 	ni.setNodeState(node)
-	ni.setNodeGPUInfo(node)
-	ni.setRevocableZone(node)
-
 	if !ni.Ready() {
-		klog.Warningf("Failed to set node info, phase: %s, reason: %s",
-			ni.State.Phase, ni.State.Reason)
+		klog.Warningf("Failed to set node info for %s, phase: %s, reason: %s",
+			ni.Name, ni.State.Phase, ni.State.Reason)
 		return
 	}

+	// Dry run: make sure all fields other than `State` stay in their original state.
+	copy := ni.Clone()
+	copy.setNode(node)
+	copy.setNodeState(node)
+	if !copy.Ready() {
+		klog.Warningf("SetNode makes node %s not ready, phase: %s, reason: %s",
+			copy.Name, copy.State.Phase, copy.State.Reason)
+		// Set state of node to !Ready, leave other fields untouched
+		ni.State = copy.State
+		return
+	}
+
+	ni.setNode(node)
+}
+
+// setNode sets kubernetes node object to nodeInfo object without assertion
+func (ni *NodeInfo) setNode(node *v1.Node) {
+	ni.setOversubscription(node)
+	ni.setNodeGPUInfo(node)
+	ni.setRevocableZone(node)
+
 	ni.Name = node.Name
 	ni.Node = node

@@ -319,14 +336,14 @@ func (ni *NodeInfo) SetNode(node *v1.Node) {
 	for _, ti := range ni.Tasks {
 		switch ti.Status {
 		case Releasing:
-			ni.Idle.Sub(ti.Resreq)
+			ni.Idle.sub(ti.Resreq) // sub without assertion
 			ni.Releasing.Add(ti.Resreq)
 			ni.Used.Add(ti.Resreq)
 			ni.AddGPUResource(ti.Pod)
 		case Pipelined:
 			ni.Pipelined.Add(ti.Resreq)
 		default:
-			ni.Idle.Sub(ti.Resreq)
+			ni.Idle.sub(ti.Resreq) // sub without assertion
 			ni.Used.Add(ti.Resreq)
 			ni.AddGPUResource(ti.Pod)
 		}
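The heart of the fix is a dry run on a clone: apply the update to a deep copy, re-derive its state, and commit only if the copy is still `Ready`; otherwise copy back just the `State`, so the scheduler marks the node not ready instead of panicking mid-update. A minimal, self-contained sketch of that pattern follows; the `nodeInfo` type here, with a single scalar resource, is a simplified stand-in for Volcano's real `api.NodeInfo`, not the actual implementation:

```go
package main

import "fmt"

// nodeInfo is a toy stand-in for Volcano's api.NodeInfo with one scalar resource.
type nodeInfo struct {
	Name  string
	Idle  int // allocatable minus used
	Used  int
	Ready bool
}

// clone returns a copy deep enough for a dry run.
func (ni *nodeInfo) clone() *nodeInfo { c := *ni; return &c }

// setNode recomputes Idle from the new allocatable without any validation,
// mirroring the unexported setNode added in this PR.
func (ni *nodeInfo) setNode(allocatable int) { ni.Idle = allocatable - ni.Used }

// SetNode mirrors the PR's pattern: dry run on a clone, commit only when Ready.
func (ni *nodeInfo) SetNode(allocatable int) {
	copy := ni.clone()
	copy.setNode(allocatable)
	copy.Ready = copy.Idle >= 0 // stand-in for setNodeState + Ready()
	if !copy.Ready {
		ni.Ready = false // only the state is copied back; other fields stay put
		return
	}
	ni.setNode(allocatable)
	ni.Ready = true
}

func main() {
	ni := &nodeInfo{Name: "n1", Idle: 1, Used: 9, Ready: true}

	ni.SetNode(8) // allocatable shrank below Used: node goes not ready, Idle stays 1
	fmt.Printf("after shrink: %+v\n", *ni)

	ni.SetNode(10) // allocatable recovered: node becomes usable again
	fmt.Printf("after recover: %+v\n", *ni)
}
```

Note that the dry run only works because the clone's recomputation uses the non-asserting subtraction: the whole point is that `Idle` may legitimately go negative during the check.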
pkg/scheduler/api/node_info_test.go (66 additions, 0 deletions)

@@ -167,3 +167,69 @@ func TestNodeInfo_RemovePod(t *testing.T) {
 		}
 	}
 }
+
+func TestNodeInfo_SetNode(t *testing.T) {
+	// case 1
+	case01Node1 := buildNode("n1", buildResourceList("10", "10G"))
+	case01Node2 := buildNode("n1", buildResourceList("8", "8G"))
+	case01Pod1 := buildPod("c1", "p1", "n1", v1.PodRunning, buildResourceList("1", "1G"), []metav1.OwnerReference{}, make(map[string]string))
+	case01Pod2 := buildPod("c1", "p2", "n1", v1.PodRunning, buildResourceList("2", "2G"), []metav1.OwnerReference{}, make(map[string]string))
+	case01Pod3 := buildPod("c1", "p3", "n1", v1.PodRunning, buildResourceList("6", "6G"), []metav1.OwnerReference{}, make(map[string]string))
+
+	tests := []struct {
+		name     string
+		node     *v1.Node
+		updated  *v1.Node
+		pods     []*v1.Pod
+		expected *NodeInfo
+	}{
+		{
+			name:    "add 3 running non-owner pods",
+			node:    case01Node1,
+			updated: case01Node2,
+			pods:    []*v1.Pod{case01Pod1, case01Pod2, case01Pod3},
+			expected: &NodeInfo{
+				Name:                     "n1",
+				Node:                     case01Node1,
+				Idle:                     buildResource("1", "1G"),
+				Used:                     buildResource("9", "9G"),
+				OversubscriptionResource: EmptyResource(),
+				Releasing:                EmptyResource(),
+				Pipelined:                EmptyResource(),
+				Allocatable:              buildResource("10", "10G"),
+				Capability:               buildResource("10", "10G"),
+				State:                    NodeState{Phase: NotReady, Reason: "OutOfSync"},
+				Tasks: map[TaskID]*TaskInfo{
+					"c1/p1": NewTaskInfo(case01Pod1),
+					"c1/p2": NewTaskInfo(case01Pod2),
+					"c1/p3": NewTaskInfo(case01Pod3),
+				},
+				GPUDevices: make(map[int]*GPUDevice),
+			},
+		},
+	}
+
+	for i, test := range tests {
+		ni := NewNodeInfo(test.node)
+		for _, pod := range test.pods {
+			pi := NewTaskInfo(pod)
+			ni.AddTask(pi)
+			ni.Name = pod.Spec.NodeName
+		}
+
+		// OutOfSync, e.g. the nvidia-device-plugin goes down and GPUs drop from 8 to 0 (node.status.allocatable."nvidia.com/gpu": 0)
+		ni.SetNode(test.updated)
+		if !nodeInfoEqual(ni, test.expected) {
+			t.Errorf("node info %d: \n expected\t%v, \n got\t\t%v \n",
+				i, test.expected, ni)
+		}
+
+		// Recover, e.g. the nvidia-device-plugin is restarted successfully
+		ni.SetNode(test.node)
+		test.expected.State = NodeState{Phase: Ready}
+		if !nodeInfoEqual(ni, test.expected) {
+			t.Errorf("recovered %d: \n expected\t%v, \n got\t\t%v \n",
+				i, test.expected, ni)
+		}
+	}
+}
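Assuming a standard checkout, the new case can be run on its own with `go test ./pkg/scheduler/api/ -run TestNodeInfo_SetNode`. Note the design of the loop: it reuses one expected fixture and mutates `test.expected.State` between the two `SetNode` calls, so the same fixture verifies both the OutOfSync transition and the recovery back to Ready.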
pkg/scheduler/api/resource_info.go (5 additions, 1 deletion)

@@ -197,10 +197,14 @@ func (r *Resource) Add(rr *Resource) *Resource {
 	return r
 }

-//Sub subtracts two Resource objects.
+// Sub subtracts two Resource objects with assertion.
 func (r *Resource) Sub(rr *Resource) *Resource {
 	assert.Assertf(rr.LessEqual(r, Zero), "resource is not sufficient to do operation: <%v> sub <%v>", r, rr)
+	return r.sub(rr)
+}
+
+// sub subtracts two Resource objects without assertion.
+func (r *Resource) sub(rr *Resource) *Resource {
 	r.MilliCPU -= rr.MilliCPU
 	r.Memory -= rr.Memory

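To see why the split matters, here is a sketch of the two subtraction flavors outside Volcano. The `Resource` type is a cut-down stand-in with a hand-rolled check in place of the real `assert.Assertf`: the exported `Sub` keeps guarding external callers, while the unexported `sub` lets `SetNode`'s recomputation drive `Idle` negative so the node can be flagged OutOfSync instead of crashing the scheduler:

```go
package main

import "fmt"

// Resource is a cut-down stand-in for Volcano's api.Resource.
type Resource struct{ MilliCPU, Memory float64 }

// Sub keeps the asserting behavior: panic if rr exceeds r, as before the PR.
func (r *Resource) Sub(rr *Resource) *Resource {
	if rr.MilliCPU > r.MilliCPU || rr.Memory > r.Memory {
		panic(fmt.Sprintf("resource is not sufficient to do operation: <%+v> sub <%+v>", *r, *rr))
	}
	return r.sub(rr)
}

// sub subtracts unconditionally; callers that can tolerate a negative
// result (like SetNode's Idle recomputation) use this variant.
func (r *Resource) sub(rr *Resource) *Resource {
	r.MilliCPU -= rr.MilliCPU
	r.Memory -= rr.Memory
	return r
}

func main() {
	used := &Resource{MilliCPU: 9000, Memory: 9e9}

	idle := &Resource{MilliCPU: 8000, Memory: 8e9}
	idle.sub(used) // node shrank from 10 to 8 CPUs while 9 are used: Idle goes negative
	fmt.Printf("idle after sub: %+v\n", *idle)

	defer func() { fmt.Println("recovered from:", recover()) }()
	(&Resource{MilliCPU: 8000, Memory: 8e9}).Sub(used) // would have panicked the scheduler
}
```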