From 6d78fae7d5b88ce9176b25d777a89e177f63ae9d Mon Sep 17 00:00:00 2001
From: Benjamin Pineau
Date: Mon, 30 May 2022 18:11:06 +0200
Subject: [PATCH 1/2] Don't deref nil nodegroup in deleteCreatedNodesWithErrors

Various cloud providers' `NodeGroupForNode()` implementations (including
aws, azure, and gce) can return a `nil` error _and_ a `nil` nodegroup.
E.g. we're seeing AWS return that on failed upscales on live clusters.

In `deleteCreatedNodesWithErrors`, checking that `NodeGroupForNode()`
didn't return an error is not enough to safely dereference the nodegroup
it returns by calling nodegroup.Id(): the nodegroup itself must also be
checked for nil.

In that situation, logging and returning early seems the safest option,
as it gives the various caches (e.g. the clusterstateregistry's and the
cloud provider's) the opportunity to eventually converge.
---
 cluster-autoscaler/core/static_autoscaler.go      | 14 ++++--
 .../core/static_autoscaler_test.go                | 50 +++++++++++++++++--
 2 files changed, 57 insertions(+), 7 deletions(-)

diff --git a/cluster-autoscaler/core/static_autoscaler.go b/cluster-autoscaler/core/static_autoscaler.go
index c06088f6711d..0ce60478427f 100644
--- a/cluster-autoscaler/core/static_autoscaler.go
+++ b/cluster-autoscaler/core/static_autoscaler.go
@@ -357,7 +357,12 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
 		return nil
 	}
 
-	if a.deleteCreatedNodesWithErrors() {
+	danglingNodes, err := a.deleteCreatedNodesWithErrors()
+	if err != nil {
+		klog.Warningf("Failed to remove nodes that were created with errors, skipping iteration: %v", err)
+		return nil
+	}
+	if danglingNodes {
 		klog.V(0).Infof("Some nodes that failed to create were removed, skipping iteration")
 		return nil
 	}
@@ -656,7 +661,7 @@ func removeOldUnregisteredNodes(unregisteredNodes []clusterstate.UnregisteredNod
 	return removedAny, nil
 }
 
-func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() bool {
+func (a *StaticAutoscaler) deleteCreatedNodesWithErrors() (bool, error) {
 	// We always schedule deleting of incoming errornous nodes
 	// TODO[lukaszos] Consider adding logic to not retry delete every loop iteration
 	nodes := a.clusterStateRegistry.GetCreatedNodesWithErrors()
@@ -674,6 +679,9 @@
 			klog.Warningf("Cannot determine nodeGroup for node %v; %v", id, err)
 			continue
 		}
+		if nodeGroup == nil || reflect.ValueOf(nodeGroup).IsNil() {
+			return false, fmt.Errorf("node %s has no known nodegroup", node.GetName())
+		}
 		nodesToBeDeletedByNodeGroupId[nodeGroup.Id()] = append(nodesToBeDeletedByNodeGroupId[nodeGroup.Id()], node)
 	}
 
@@ -698,7 +706,7 @@
 		a.clusterStateRegistry.InvalidateNodeInstancesCacheEntry(nodeGroup)
 	}
 
-	return deletedAny
+	return deletedAny, nil
 }
 
 func (a *StaticAutoscaler) nodeGroupsById() map[string]cloudprovider.NodeGroup {
diff --git a/cluster-autoscaler/core/static_autoscaler_test.go b/cluster-autoscaler/core/static_autoscaler_test.go
index 8be932c50792..fb56a5943572 100644
--- a/cluster-autoscaler/core/static_autoscaler_test.go
+++ b/cluster-autoscaler/core/static_autoscaler_test.go
@@ -927,7 +927,7 @@ func TestStaticAutoscalerRunOnceWithFilteringOnUpcomingNodesEnabledNoScaleUp(t *
 		podDisruptionBudgetListerMock, daemonSetListerMock, onScaleUpMock, onScaleDownMock)
 }
 
-func TestStaticAutoscalerInstaceCreationErrors(t *testing.T) {
+func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {
 	// setup
 	provider := &mockprovider.CloudProvider{}
 
@@ -1062,7 +1062,9 @@ func TestStaticAutoscalerInstaceCreationErrors(t *testing.T) {
 	clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)
 
 	// delete nodes with create errors
-	assert.True(t, autoscaler.deleteCreatedNodesWithErrors())
+	removedNodes, err := autoscaler.deleteCreatedNodesWithErrors()
+	assert.True(t, removedNodes)
+	assert.NoError(t, err)
 
 	// check delete was called on correct nodes
 	nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy(
@@ -1086,7 +1088,9 @@ func TestStaticAutoscalerInstaceCreationErrors(t *testing.T) {
 	clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)
 
 	// delete nodes with create errors
-	assert.True(t, autoscaler.deleteCreatedNodesWithErrors())
+	removedNodes, err = autoscaler.deleteCreatedNodesWithErrors()
+	assert.True(t, removedNodes)
+	assert.NoError(t, err)
 
 	// nodes should be deleted again
 	nodeGroupA.AssertCalled(t, "DeleteNodes", mock.MatchedBy(
@@ -1149,10 +1153,48 @@ func TestStaticAutoscalerInstaceCreationErrors(t *testing.T) {
 	clusterState.UpdateNodes([]*apiv1.Node{}, nil, now)
 
 	// delete nodes with create errors
-	assert.False(t, autoscaler.deleteCreatedNodesWithErrors())
+	removedNodes, err = autoscaler.deleteCreatedNodesWithErrors()
+	assert.False(t, removedNodes)
+	assert.NoError(t, err)
 
 	// we expect no more Delete Nodes
 	nodeGroupA.AssertNumberOfCalls(t, "DeleteNodes", 2)
+
+	// failed node not included by NodeGroupForNode
+	nodeGroupC := &mockprovider.NodeGroup{}
+	nodeGroupC.On("Exist").Return(true)
+	nodeGroupC.On("Autoprovisioned").Return(false)
+	nodeGroupC.On("TargetSize").Return(1, nil)
+	nodeGroupC.On("Id").Return("C")
+	nodeGroupC.On("DeleteNodes", mock.Anything).Return(nil)
+	nodeGroupC.On("Nodes").Return([]cloudprovider.Instance{
+		{
+			Id: "C1",
+			Status: &cloudprovider.InstanceStatus{
+				State: cloudprovider.InstanceCreating,
+				ErrorInfo: &cloudprovider.InstanceErrorInfo{
+					ErrorClass: cloudprovider.OutOfResourcesErrorClass,
+					ErrorCode:  "QUOTA",
+				},
+			},
+		},
+	}, nil)
+	provider = &mockprovider.CloudProvider{}
+	provider.On("NodeGroups").Return([]cloudprovider.NodeGroup{nodeGroupC})
+	provider.On("NodeGroupForNode", mock.Anything).Return(nil, nil)
+
+	clusterState = clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff())
+	clusterState.RefreshCloudProviderNodeInstancesCache()
+	autoscaler.clusterStateRegistry = clusterState
+
+	// update cluster state
+	clusterState.UpdateNodes([]*apiv1.Node{}, nil, time.Now())
+
+	// return early on failed nodes without matching nodegroups
+	removedNodes, err = autoscaler.deleteCreatedNodesWithErrors()
+	assert.False(t, removedNodes)
+	assert.Error(t, err)
+	nodeGroupC.AssertNumberOfCalls(t, "DeleteNodes", 0)
 }
 
 func TestStaticAutoscalerProcessorCallbacks(t *testing.T) {

From b8f79f8cf75efad471f35d14b956fe11883a9fd0 Mon Sep 17 00:00:00 2001
From: "qianlei.qianl"
Date: Wed, 24 May 2023 17:39:20 +0800
Subject: [PATCH 2/2] test(core): fix undefined NewBackoff()

---
 cluster-autoscaler/core/static_autoscaler_test.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cluster-autoscaler/core/static_autoscaler_test.go b/cluster-autoscaler/core/static_autoscaler_test.go
index fb56a5943572..602fa7e70cae 100644
--- a/cluster-autoscaler/core/static_autoscaler_test.go
+++ b/cluster-autoscaler/core/static_autoscaler_test.go
@@ -1183,7 +1183,7 @@ func TestStaticAutoscalerInstanceCreationErrors(t *testing.T) {
 	provider.On("NodeGroups").Return([]cloudprovider.NodeGroup{nodeGroupC})
 	provider.On("NodeGroupForNode", mock.Anything).Return(nil, nil)
 
-	clusterState = clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, NewBackoff())
+	clusterState = clusterstate.NewClusterStateRegistry(provider, clusterStateConfig, context.LogRecorder, newBackoff())
 	clusterState.RefreshCloudProviderNodeInstancesCache()
 	autoscaler.clusterStateRegistry = clusterState
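
A note on the nil check introduced in patch 1: in Go, an interface value that
wraps a typed-nil pointer is itself non-nil, so `nodeGroup == nil` alone would
not catch a provider handing back a nil concrete nodegroup alongside a nil
error, which is why the patch also calls `reflect.ValueOf(nodeGroup).IsNil()`.
Below is a minimal standalone sketch of that gotcha; the `NodeGroup` interface,
`awsNodeGroup` type, and `nodeGroupForNode` function are hypothetical stand-ins
for illustration, not code from this repository.

```go
package main

import (
	"fmt"
	"reflect"
)

// NodeGroup loosely mirrors the shape of cloudprovider.NodeGroup.
type NodeGroup interface {
	Id() string
}

// awsNodeGroup is a hypothetical concrete implementation.
type awsNodeGroup struct{ id string }

// Id dereferences its receiver, so calling it on a nil *awsNodeGroup panics.
func (g *awsNodeGroup) Id() string { return g.id }

// nodeGroupForNode mimics a provider returning a typed-nil nodegroup
// together with a nil error, as described in the commit message.
func nodeGroupForNode() (NodeGroup, error) {
	var g *awsNodeGroup // nil pointer
	return g, nil       // interface now holds (type=*awsNodeGroup, value=nil)
}

func main() {
	ng, err := nodeGroupForNode()
	fmt.Println(err == nil) // true: the error check alone passes
	fmt.Println(ng == nil)  // false: the interface is non-nil despite the nil pointer
	// Order matters: reflect.ValueOf(ng).IsNil() panics on a truly nil
	// interface value, so the plain == nil comparison must come first.
	if ng == nil || reflect.ValueOf(ng).IsNil() {
		fmt.Println("no known nodegroup") // printed: the typed nil is caught here
		return
	}
	_ = ng.Id() // would panic if the checks above were skipped
}
```

Returning an error here, rather than silently skipping the node, matches the
patch's stated intent: bail out of the iteration and give the cluster state
and cloud provider caches a chance to converge before retrying the deletion.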