Skip to content

Commit

Permalink
Merge pull request kubernetes#4926 from DataDog/deleteCreatedNodesWit…
Browse files Browse the repository at this point in the history
…hErrors-no-segfault

Don't deref nil nodegroup in deleteCreatedNodesWithErrors
  • Loading branch information
k8s-ci-robot authored May 30, 2022
2 parents 3ce5dc4 + b2383ee commit c0232bc
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 7 deletions.
14 changes: 11 additions & 3 deletions cluster-autoscaler/core/static_autoscaler.go
Original file line number Diff line number Diff line change
Expand Up @@ -365,7 +365,12 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
return nil
}

if a.deleteCreatedNodesWithErrors() {
danglingNodes, err := a.deleteCreatedNodesWithErrors()
if err != nil {
klog.Warningf("Failed to remove nodes that were created with errors, skipping iteration: %v", err)
return nil
}
if danglingNodes {
klog.V(0).Infof("Some nodes that failed to create were removed, skipping iteration")
return nil
}
Expand Down Expand Up @@ -658,7 +663,7 @@ func removeOldUnregisteredNodes(unregisteredNodes []clusterstate.UnregisteredNod
return removedAny, nil
}

func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool {
func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() (bool, error) {
// We always schedule deleting of incoming errornous nodes
// TODO[lukaszos] Consider adding logic to not retry delete every loop iteration
nodes := a.clusterStateRegistry.GetCreatedNodesWithErrors()
Expand All @@ -676,6 +681,9 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool {
klog.Warningf("Cannot determine nodeGroup for node %v; %v", id, err)
continue
}
if nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil() {
return false, fmt.Errorf("node %s has no known nodegroup", node.GetName())
}
nodesToBeDeletedByNodeGroupId[nodeGroup.Id()] = append(nodesToBeDeletedByNodeGroupId[nodeGroup.Id()], node)
}

Expand All @@ -700,7 +708,7 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool {
a.clusterStateRegistry.InvalidateNodeInstancesCacheEntry(nodeGroup)
}

return deletedAny
return deletedAny, nil
}

func (a *StaticAutoscaler) nodeGroupsById() map[string]cloudprovider.NodeGroup {
Expand Down
50 changes: 46 additions & 4 deletions cluster-autoscaler/core/static_autoscaler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -951,7 +951,7 @@ func TestStaticAutoscalerRunOnceWithFilteringOnUpcomingNodesEnabledNoScaleUp(t *
podDisruptionBudgetListerMock, daemonSetListerMock, onScaleUpMock, onScaleDownMock)
}

func TestStaticAutoscalerInstaceCreationErrors(t *testing.T) {
func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {

// setup
provider := &mockprovider.CloudProvider{}
Expand Down Expand Up @@ -1086,7 +1086,9 @@ func TestStaticAutoscalerInstaceCreationErrors(t *testing.T) {
clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)

// delete nodes with create errors
assert.True(t, autoscaler.deleteCreatedNodesWithErrors())
removedNodes, err := autoscaler.deleteCreatedNodesWithErrors()
assert.True(t, removedNodes)
assert.NoError(t, err)

// check delete was called on correct nodes
nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy(
Expand All @@ -1110,7 +1112,9 @@ func TestStaticAutoscalerInstaceCreationErrors(t *testing.T) {
clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)

// delete nodes with create errors
assert.True(t, autoscaler.deleteCreatedNodesWithErrors())
removedNodes, err = autoscaler.deleteCreatedNodesWithErrors()
assert.True(t, removedNodes)
assert.NoError(t, err)

// nodes should be deleted again
nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy(
Expand Down Expand Up @@ -1173,10 +1177,48 @@ func TestStaticAutoscalerInstaceCreationErrors(t *testing.T) {
clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)

// delete nodes with create errors
assert.False(t, autoscaler.deleteCreatedNodesWithErrors())
removedNodes, err = autoscaler.deleteCreatedNodesWithErrors()
assert.False(t, removedNodes)
assert.NoError(t, err)

// we expect no more Delete Nodes
nodeGroupA.AssertNumberOfCalls(t, "DeleteNodes", 2)

// failed node not included by NodeGroupForNode
nodeGroupC := &mockprovider.NodeGroup{}
nodeGroupC.On("Exist").Return(true)
nodeGroupC.On("Autoprovisioned").Return(false)
nodeGroupC.On("TargetSize").Return(1, nil)
nodeGroupC.On("Id").Return("C")
nodeGroupC.On("DeleteNodes", mock.Anything).Return(nil)
nodeGroupC.On("Nodes").Return([]cloudprovider.Instance{
{
Id: "C1",
Status: &cloudprovider.InstanceStatus{
State: cloudprovider.InstanceCreating,
ErrorInfo: &cloudprovider.InstanceErrorInfo{
ErrorClass: cloudprovider.OutOfResourcesErrorClass,
ErrorCode: "QUOTA",
},
},
},
}, nil)
provider = &mockprovider.CloudProvider{}
provider.On("NodeGroups").Return([]cloudprovider.NodeGroup{nodeGroupC})
provider.On("NodeGroupForNode", mock.Anything).Return(nil, nil)

clusterState = clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff())
clusterState.RefreshCloudProviderNodeInstancesCache()
autoscaler.clusterStateRegistry = clusterState

// update cluster state
clusterState.UpdateNodes([]*apiv1.Node{}, nil, time.Now())

// return early on failed nodes without matching nodegroups
removedNodes, err = autoscaler.deleteCreatedNodesWithErrors()
assert.False(t, removedNodes)
assert.Error(t, err)
nodeGroupC.AssertNumberOfCalls(t, "DeleteNodes", 0)
}

func TestStaticAutoscalerProcessorCallbacks(t *testing.T) {
Expand Down

0 comments on commit c0232bc

Please sign in to comment.