diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go
index b196df2ad752..84b09b65739f 100644
--- a/cluster-autoscaler/core/static_autoscaler.go
+++ b/cluster-autoscaler/core/static_autoscaler.go
@@ -365,7 +365,12 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
 		return nil
 	}
 
-	if a.deleteCreatedNodesWithErrors() {
+	danglingNodes, err := a.deleteCreatedNodesWithErrors()
+	if err != nil {
+		klog.Warningf("Failed to remove nodes that were created with errors, skipping iteration: %v", err)
+		return nil
+	}
+	if danglingNodes {
 		klog.V(0).Infof("Some nodes that failed to create were removed, skipping iteration")
 		return nil
 	}
@@ -658,7 +663,7 @@ func removeOldUnregisteredNodes(unregisteredNodes []clusterstate.UnregisteredNode
 	return removedAny, nil
 }
 
-func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool {
+func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() (bool, error) {
 	// We always schedule deleting of incoming errornous nodes
 	// TODO[lukaszos] Consider adding logic to not retry delete every loop iteration
 	nodes := a.clusterStateRegistry.GetCreatedNodesWithErrors()
@@ -676,6 +681,9 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool {
 			klog.Warningf("Cannot determine nodeGroup for node %v; %v", id, err)
 			continue
 		}
+		if nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil() {
+			return false, fmt.Errorf("node %s has no known nodegroup", node.GetName())
+		}
 		nodesToBeDeletedByNodeGroupId[nodeGroup.Id()] = append(nodesToBeDeletedByNodeGroupId[nodeGroup.Id()], node)
 	}
 
@@ -700,7 +708,7 @@ func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool {
 		a.clusterStateRegistry.InvalidateNodeInstancesCacheEntry(nodeGroup)
 	}
 
-	return deletedAny
+	return deletedAny, nil
 }
 
 func (a *StaticAutoscaler) nodeGroupsById() map[string]cloudprovider.NodeGroup {
diff --git a/cluster-autoscaler/core/static_autoscaler_test.go b/cluster-autoscaler/core/static_autoscaler_test.go
index 7dc079bf0ffd..cfc8e749660d 100644
--- a/cluster-autoscaler/core/static_autoscaler_test.go
+++ b/cluster-autoscaler/core/static_autoscaler_test.go
@@ -951,7 +951,7 @@ func TestStaticAutoscalerRunOnceWithFilteringOnUpcomingNodesEnabledNoScaleUp(t *
 		podDisruptionBudgetListerMock, daemonSetListerMock, onScaleUpMock, onScaleDownMock)
 }
 
-func TestStaticAutoscalerInstaceCreationErrors(t *testing.T) {
+func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {
 	// setup
 	provider := &mockprovider.CloudProvider{}
 
@@ -1086,7 +1086,9 @@ func TestStaticAutoscalerInstaceCreationErrors(t *testing.T) {
 	clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)
 
 	// delete nodes with create errors
-	assert.True(t, autoscaler.deleteCreatedNodesWithErrors())
+	removedNodes, err := autoscaler.deleteCreatedNodesWithErrors()
+	assert.True(t, removedNodes)
+	assert.NoError(t, err)
 
 	// check delete was called on correct nodes
 	nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy(
@@ -1110,7 +1112,9 @@ func TestStaticAutoscalerInstaceCreationErrors(t *testing.T) {
 	clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)
 
 	// delete nodes with create errors
-	assert.True(t, autoscaler.deleteCreatedNodesWithErrors())
+	removedNodes, err = autoscaler.deleteCreatedNodesWithErrors()
+	assert.True(t, removedNodes)
+	assert.NoError(t, err)
 
 	// nodes should be deleted again
 	nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy(
@@ -1173,10 +1177,48 @@ func TestStaticAutoscalerInstaceCreationErrors(t *testing.T) {
 	clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)
 
 	// delete nodes with create errors
-	assert.False(t, autoscaler.deleteCreatedNodesWithErrors())
+	removedNodes, err = autoscaler.deleteCreatedNodesWithErrors()
+	assert.False(t, removedNodes)
+	assert.NoError(t, err)
 
 	// we expect no more Delete Nodes
 	nodeGroupA.AssertNumberOfCalls(t, "DeleteNodes", 2)
+
+	// failed node not included by NodeGroupForNode
+	nodeGroupC := &mockprovider.NodeGroup{}
+	nodeGroupC.On("Exist").Return(true)
+	nodeGroupC.On("Autoprovisioned").Return(false)
+	nodeGroupC.On("TargetSize").Return(1, nil)
+	nodeGroupC.On("Id").Return("C")
+	nodeGroupC.On("DeleteNodes", mock.Anything).Return(nil)
+	nodeGroupC.On("Nodes").Return([]cloudprovider.Instance{
+		{
+			Id: "C1",
+			Status: &cloudprovider.InstanceStatus{
+				State: cloudprovider.InstanceCreating,
+				ErrorInfo: &cloudprovider.InstanceErrorInfo{
+					ErrorClass: cloudprovider.OutOfResourcesErrorClass,
+					ErrorCode:  "QUOTA",
+				},
+			},
+		},
+	}, nil)
+	provider = &mockprovider.CloudProvider{}
+	provider.On("NodeGroups").Return([]cloudprovider.NodeGroup{nodeGroupC})
+	provider.On("NodeGroupForNode", mock.Anything).Return(nil, nil)
+
+	clusterState = clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff())
+	clusterState.RefreshCloudProviderNodeInstancesCache()
+	autoscaler.clusterStateRegistry = clusterState
+
+	// update cluster state
+	clusterState.UpdateNodes([]*apiv1.Node{}, nil, time.Now())
+
+	// return early on failed nodes without matching nodegroups
+	removedNodes, err = autoscaler.deleteCreatedNodesWithErrors()
+	assert.False(t, removedNodes)
+	assert.Error(t, err)
+	nodeGroupC.AssertNumberOfCalls(t, "DeleteNodes", 0)
 }
 
 func TestStaticAutoscalerProcessorCallbacks(t *testing.T) {
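Note on the guard added in deleteCreatedNodesWithErrors: checking both `nodeGroup == nil` and `reflect.ValueOf(nodeGroup).IsNil()` is needed because a provider's NodeGroupForNode can return an interface value that wraps a typed nil pointer, and such an interface does not compare equal to nil. A minimal standalone sketch of that gotcha, using a hypothetical one-method NodeGroup interface rather than the real cloudprovider.NodeGroup:

```go
package main

import (
	"fmt"
	"reflect"
)

// NodeGroup is a hypothetical stand-in for cloudprovider.NodeGroup,
// trimmed to one method for illustration.
type NodeGroup interface {
	Id() string
}

type fakeNodeGroup struct{}

func (f *fakeNodeGroup) Id() string { return "fake" }

func main() {
	var ptr *fakeNodeGroup // typed nil pointer
	var ng NodeGroup = ptr // interface now holds (type=*fakeNodeGroup, value=nil)

	fmt.Println(ng == nil)                   // false: the interface itself is non-nil
	fmt.Println(reflect.ValueOf(ng).IsNil()) // true: the wrapped pointer is nil

	// The patch combines both checks; the == nil short-circuit also
	// protects IsNil, which panics when called on a zero reflect.Value
	// (i.e. when the interface is genuinely nil).
	if ng == nil || reflect.ValueOf(ng).IsNil() {
		fmt.Println("no known nodegroup")
	}
}
```

The new nodeGroupC test case exercises exactly this path: its mock provider answers NodeGroupForNode with (nil, nil), so without the guard the subsequent `nodeGroup.Id()` call would panic instead of surfacing an error.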