Skip to content

Commit

Permalink
Merge pull request kubernetes#4926 from DataDog/deleteCreatedNodesWit…
Browse files Browse the repository at this point in the history
…hErrors-no-segfault

- Don't deref nil nodegroup in deleteCreatedNodesWithErrors
- Bugfix: Expander Priority warns misleading log
  • Loading branch information
k8s-ci-robot authored and Bruno Batista committed Sep 27, 2022
1 parent 1585bbd commit b3f5a4b
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 6 deletions.
17 changes: 15 additions & 2 deletions cluster-autoscaler/core/static_autoscaler.go
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,15 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
return nil
}

a.deleteCreatedNodesWithErrors()
danglingNodes, err := a.deleteCreatedNodesWithErrors()
if err != nil {
klog.Warningf("Failed to remove nodes that were created with errors, skipping iteration: %v", err)
return nil
}
if danglingNodes {
klog.V(0).Infof("Some nodes that failed to create were removed, skipping iteration")
return nil
}

// Check if there has been a constant difference between the number of nodes in k8s and
// the number of nodes on the cloud provider side.
Expand Down Expand Up @@ -624,7 +632,7 @@ func removeOldUnregisteredNodes(unregisteredNodes []clusterstate.UnregisteredNod
return removedAny, nil
}

func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() {
func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() (bool, error) {
// We always schedule deleting of incoming errornous nodes
// TODO[lukaszos] Consider adding logic to not retry delete every loop iteration
nodes := a.clusterStateRegistry.GetCreatedNodesWithErrors()
Expand All @@ -642,6 +650,9 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() {
klog.Warningf("Cannot determine nodeGroup for node %v; %v", id, err)
continue
}
if nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil() {
return false, fmt.Errorf("node %s has no known nodegroup", node.GetName())
}
nodesToBeDeletedByNodeGroupId[nodeGroup.Id()] = append(nodesToBeDeletedByNodeGroupId[nodeGroup.Id()], node)
}

Expand All @@ -662,6 +673,8 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() {

a.clusterStateRegistry.InvalidateNodeInstancesCacheEntry(nodeGroup)
}

return deletedAny, nil
}

func (a *StaticAutoscaler) nodeGroupsById() map[string]cloudprovider.NodeGroup {
Expand Down
50 changes: 46 additions & 4 deletions cluster-autoscaler/core/static_autoscaler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -905,7 +905,7 @@ func TestStaticAutoscalerRunOnceWithFilteringOnUpcomingNodesEnabledNoScaleUp(t *
podDisruptionBudgetListerMock, daemonSetListerMock, onScaleUpMock, onScaleDownMock)
}

func TestStaticAutoscalerInstaceCreationErrors(t *testing.T) {
func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {

// setup
provider := &mockprovider.CloudProvider{}
Expand Down Expand Up @@ -1038,7 +1038,9 @@ func TestStaticAutoscalerInstaceCreationErrors(t *testing.T) {
clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)

// delete nodes with create errors
autoscaler.deleteCreatedNodesWithErrors()
removedNodes, err := autoscaler.deleteCreatedNodesWithErrors()
assert.True(t, removedNodes)
assert.NoError(t, err)

// check delete was called on correct nodes
nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy(
Expand All @@ -1062,7 +1064,9 @@ func TestStaticAutoscalerInstaceCreationErrors(t *testing.T) {
clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)

// delete nodes with create errors
autoscaler.deleteCreatedNodesWithErrors()
removedNodes, err = autoscaler.deleteCreatedNodesWithErrors()
assert.True(t, removedNodes)
assert.NoError(t, err)

// nodes should be deleted again
nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy(
Expand Down Expand Up @@ -1125,10 +1129,48 @@ func TestStaticAutoscalerInstaceCreationErrors(t *testing.T) {
clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)

// delete nodes with create errors
autoscaler.deleteCreatedNodesWithErrors()
removedNodes, err = autoscaler.deleteCreatedNodesWithErrors()
assert.False(t, removedNodes)
assert.NoError(t, err)

// we expect no more Delete Nodes
nodeGroupA.AssertNumberOfCalls(t, "DeleteNodes", 2)

// failed node not included by NodeGroupForNode
nodeGroupC := &mockprovider.NodeGroup{}
nodeGroupC.On("Exist").Return(true)
nodeGroupC.On("Autoprovisioned").Return(false)
nodeGroupC.On("TargetSize").Return(1, nil)
nodeGroupC.On("Id").Return("C")
nodeGroupC.On("DeleteNodes", mock.Anything).Return(nil)
nodeGroupC.On("Nodes").Return([]cloudprovider.Instance{
{
Id: "C1",
Status: &cloudprovider.InstanceStatus{
State: cloudprovider.InstanceCreating,
ErrorInfo: &cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OutOfResourcesErrorClass,
ErrorCode: "QUOTA",
},
},
},
}, nil)
provider = &mockprovider.CloudProvider{}
provider.On("NodeGroups").Return([]cloudprovider.NodeGroup{nodeGroupC})
provider.On("NodeGroupForNode", mock.Anything).Return(nil, nil)

clusterState = clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff())
clusterState.RefreshCloudProviderNodeInstancesCache()
autoscaler.clusterStateRegistry = clusterState

// update cluster state
clusterState.UpdateNodes([]*apiv1.Node{}, nil, time.Now())

// return early on failed nodes without matching nodegroups
removedNodes, err = autoscaler.deleteCreatedNodesWithErrors()
assert.False(t, removedNodes)
assert.Error(t, err)
nodeGroupC.AssertNumberOfCalls(t, "DeleteNodes", 0)
}

func TestStaticAutoscalerProcessorCallbacks(t *testing.T) {
Expand Down

0 comments on commit b3f5a4b

Please sign in to comment.