attach errors to scale-up request and add comments
wllbo committed Oct 18, 2023
1 parent cb0465d commit e85eeae
Showing 5 changed files with 159 additions and 50 deletions.
31 changes: 25 additions & 6 deletions cluster-autoscaler/clusterstate/clusterstate.go
@@ -61,6 +61,8 @@ type ScaleUpRequest struct {
ExpectedAddTime time.Time
// How much the node group is increased.
Increase int
// ErrorClasses is a set of the classes of errors encountered during a scale-up, if any.
ErrorClasses map[cloudprovider.InstanceErrorClass]struct{}
}

// ScaleDownRequest contains information about the requested node deletion.
@@ -82,8 +84,11 @@ type ClusterStateRegistryConfig struct {
// Minimum number of nodes that must be unready for MaxTotalUnreadyPercentage to apply.
// This is to ensure that in very small clusters (e.g. 2 nodes) a single node's failure doesn't disable autoscaling.
OkTotalUnreadyCount int
// NodeGroupKeepBackoffOutOfResources is whether a backoff can be removed before expiration when a scale-up fails due to the cloud provider being out of resources.
NodeGroupKeepBackoffOutOfResources bool
// NodeGroupRemovePersistentErrorBackoffEarly is whether a backoff can be removed before expiration when
// a scale-up partially fails due to a likely persistent error.
// If true (default), the backoff will be removed early regardless of the error.
// If false and the backoff is due to a likely persistent error, e.g. OutOfResourcesError, it will not be removed early.
NodeGroupRemovePersistentErrorBackoffEarly bool
}

// IncorrectNodeGroupSize contains information about how much the current size of the node group
@@ -216,6 +221,7 @@ func (csr *ClusterStateRegistry) registerOrUpdateScaleUpNoLock(nodeGroup cloudpr
Increase: delta,
Time: currentTime,
ExpectedAddTime: currentTime.Add(maxNodeProvisionTime),
ErrorClasses: make(map[cloudprovider.InstanceErrorClass]struct{}),
}
csr.scaleUpRequests[nodeGroup.Id()] = scaleUpRequest
return
@@ -258,9 +264,16 @@ func (csr *ClusterStateRegistry) updateScaleRequests(currentTime time.Time) {
// scale-out finished successfully
// remove it and reset node group backoff
delete(csr.scaleUpRequests, nodeGroupName)
shouldKeepBackoff := csr.config.NodeGroupKeepBackoffOutOfResources && csr.backoff.IsNodeGroupOutOfResources(scaleUpRequest.NodeGroup)
if !shouldKeepBackoff {
klog.V(4).Infof("Removing backoff for node group %v", scaleUpRequest.NodeGroup.Id())
// If a node group is backed off during a scale-up due to instance creation errors but partially succeeds,
// the backoff could be removed early, allowing the CA to retry scaling the same node group.
// Optionally, the backoff can be retained for persistent errors given the risk of recurrence.
// The backoff will be removed early if either of the following conditions is true:
// 1. NodeGroupRemovePersistentErrorBackoffEarly is enabled (default)
// 2. There is no persistent error class attached to the scale-up request
_, persistentError := scaleUpRequest.ErrorClasses[cloudprovider.OutOfResourcesErrorClass]
shouldRemoveBackoffEarly := csr.config.NodeGroupRemovePersistentErrorBackoffEarly || !persistentError
if shouldRemoveBackoffEarly {
klog.V(4).Infof("Removing backoff early for node group %v", scaleUpRequest.NodeGroup.Id())
csr.backoff.RemoveBackoff(scaleUpRequest.NodeGroup, csr.nodeInfosForGroups[scaleUpRequest.NodeGroup.Id()])
}
klog.V(4).Infof("Scale up in group %v finished successfully in %v",
@@ -315,6 +328,12 @@ func (csr *ClusterStateRegistry) RegisterFailedScaleUp(nodeGroup cloudprovider.N
func (csr *ClusterStateRegistry) registerFailedScaleUpNoLock(nodeGroup cloudprovider.NodeGroup, reason metrics.FailedScaleUpReason, errorClass cloudprovider.InstanceErrorClass, errorCode string, gpuResourceName, gpuType string, currentTime time.Time) {
csr.scaleUpFailures[nodeGroup.Id()] = append(csr.scaleUpFailures[nodeGroup.Id()], ScaleUpFailure{NodeGroup: nodeGroup, Reason: reason, Time: currentTime})
metrics.RegisterFailedScaleUp(reason, gpuResourceName, gpuType)
// attach the error class to the scale-up request if it exists
// it will be used to determine whether to remove the backoff early when updating scale-up requests
scaleUpRequest, found := csr.scaleUpRequests[nodeGroup.Id()]
if found {
scaleUpRequest.ErrorClasses[errorClass] = struct{}{}
}
csr.backoffNodeGroup(nodeGroup, errorClass, errorCode, currentTime)
}

@@ -1097,7 +1116,7 @@ func (csr *ClusterStateRegistry) handleInstanceCreationErrorsForNodeGroup(
// If node group is scaling up and there are new node-create requests which cannot be satisfied because of
// out-of-resources errors we:
// - emit event
// - alter the scale-up
// - alter the scale-up and attach the error class
// - increase scale-up failure metric
// - backoff the node group
for errorCode, instances := range currentErrorCodeToInstance {
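
The decision the new code makes can be restated in a small standalone sketch. This is a simplified illustration, not the autoscaler's actual code: the local errorClass type and constants stand in for cloudprovider.InstanceErrorClass, and only the fields relevant to the early-backoff-removal decision are modeled.

package main

import "fmt"

// errorClass is a local stand-in for cloudprovider.InstanceErrorClass,
// so this sketch compiles on its own.
type errorClass int

const (
	otherErrorClass          errorClass = iota
	outOfResourcesErrorClass            // treated as "likely persistent"
)

// scaleUpRequest mirrors the relevant part of clusterstate.ScaleUpRequest:
// a set of error classes observed while the scale-up was in flight.
type scaleUpRequest struct {
	errorClasses map[errorClass]struct{}
}

// recordFailure attaches an error class to the request, as
// registerFailedScaleUpNoLock does in this patch.
func (r *scaleUpRequest) recordFailure(c errorClass) {
	r.errorClasses[c] = struct{}{}
}

// shouldRemoveBackoffEarly reproduces the check added to updateScaleRequests:
// drop the backoff early unless a persistent error was recorded and the
// operator opted out of early removal for persistent errors.
func shouldRemoveBackoffEarly(removePersistentErrorBackoffEarly bool, r *scaleUpRequest) bool {
	_, persistent := r.errorClasses[outOfResourcesErrorClass]
	return removePersistentErrorBackoffEarly || !persistent
}

func main() {
	req := &scaleUpRequest{errorClasses: make(map[errorClass]struct{})}
	req.recordFailure(outOfResourcesErrorClass)
	req.recordFailure(otherErrorClass)

	fmt.Println(shouldRemoveBackoffEarly(true, req))  // true: default, backoff removed early regardless of error
	fmt.Println(shouldRemoveBackoffEarly(false, req)) // false: recorded out-of-resources error keeps the backoff
}

With the default setting the backoff is dropped as soon as the scale-up completes; opting out keeps the backoff in place whenever an out-of-resources failure was recorded during that scale-up.
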
89 changes: 89 additions & 0 deletions cluster-autoscaler/clusterstate/clusterstate_test.go
@@ -508,6 +508,95 @@ func TestRegisterScaleDown(t *testing.T) {
assert.Empty(t, clusterstate.GetScaleUpFailures())
}

func TestRemovePersistentErrorBackoffEarlyEnabled(t *testing.T) {
ng1_1 := BuildTestNode("ng1-1", 1000, 1000)
provider := testprovider.NewTestCloudProvider(nil, nil)
provider.AddNodeGroup("ng1", 1, 10, 1)
provider.AddNode("ng1", ng1_1)
assert.NotNil(t, provider)

fakeClient := &fake.Clientset{}
fakeLogRecorder, _ := utils.NewStatusMapRecorder(fakeClient, "kube-system", kube_record.NewFakeRecorder(5), false, "my-cool-configmap")
clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{
MaxTotalUnreadyPercentage: 10,
OkTotalUnreadyCount: 1,
NodeGroupRemovePersistentErrorBackoffEarly: true,
}, fakeLogRecorder, newBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute}))

now := time.Now()

provider.GetNodeGroup("ng1").(*testprovider.TestNodeGroup).SetTargetSize(4)
clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now)
clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 3, now)
assert.True(t, clusterstate.IsNodeGroupScalingUp("ng1"))

// Fail two nodes with a persistent and a non-persistent error
clusterstate.registerFailedScaleUpNoLock(provider.GetNodeGroup("ng1"), metrics.CloudProviderError, cloudprovider.OutOfResourcesErrorClass, string(metrics.CloudProviderError), "", "", now)
clusterstate.registerFailedScaleUpNoLock(provider.GetNodeGroup("ng1"), metrics.CloudProviderError, cloudprovider.OtherErrorClass, string(metrics.CloudProviderError), "", "", now)
clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), -2, now)
assert.Equal(t, 2, len(clusterstate.scaleUpRequests["ng1"].ErrorClasses))
assert.True(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now))

// Reduce the target size to original value to complete the scale-up and trigger the backoff early removal
provider.GetNodeGroup("ng1").(*testprovider.TestNodeGroup).SetTargetSize(1)
clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now)
assert.False(t, clusterstate.IsNodeGroupScalingUp("ng1"))
assert.False(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now))
}

func TestRemovePersistentErrorBackoffEarlyDisabled(t *testing.T) {
ng1_1 := BuildTestNode("ng1-1", 1000, 1000)
provider := testprovider.NewTestCloudProvider(nil, nil)
provider.AddNodeGroup("ng1", 1, 10, 1)
provider.AddNode("ng1", ng1_1)
assert.NotNil(t, provider)

fakeClient := &fake.Clientset{}
fakeLogRecorder, _ := utils.NewStatusMapRecorder(fakeClient, "kube-system", kube_record.NewFakeRecorder(5), false, "my-cool-configmap")
clusterstate := NewClusterStateRegistry(provider, ClusterStateRegistryConfig{
MaxTotalUnreadyPercentage: 10,
OkTotalUnreadyCount: 1,
NodeGroupRemovePersistentErrorBackoffEarly: false,
}, fakeLogRecorder, newBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute}))

now := time.Now()

provider.GetNodeGroup("ng1").(*testprovider.TestNodeGroup).SetTargetSize(3)
clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now)
clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 2, now)
assert.True(t, clusterstate.IsNodeGroupScalingUp("ng1"))

// Fail one node with a persistent error
clusterstate.registerFailedScaleUpNoLock(provider.GetNodeGroup("ng1"), metrics.CloudProviderError, cloudprovider.OutOfResourcesErrorClass, string(metrics.CloudProviderError), "", "", now)
clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), -1, now)
assert.True(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now))

// Confirm the persistent error backoff is not removed early
provider.GetNodeGroup("ng1").(*testprovider.TestNodeGroup).SetTargetSize(1)
clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now)
assert.False(t, clusterstate.IsNodeGroupScalingUp("ng1"))
assert.True(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now))

// Remove the backoff and scale up again
clusterstate.backoff.RemoveBackoff(provider.GetNodeGroup("ng1"), nil)
provider.GetNodeGroup("ng1").(*testprovider.TestNodeGroup).SetTargetSize(3)
clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now)
clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), 2, now)
assert.False(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now))
assert.True(t, clusterstate.IsNodeGroupScalingUp("ng1"))

// Fail one node with a non-persistent error
clusterstate.registerFailedScaleUpNoLock(provider.GetNodeGroup("ng1"), metrics.CloudProviderError, cloudprovider.OtherErrorClass, string(metrics.CloudProviderError), "", "", now)
clusterstate.RegisterOrUpdateScaleUp(provider.GetNodeGroup("ng1"), -1, now)
assert.True(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now))

// Complete the scale-up and confirm the backoff is removed early
provider.GetNodeGroup("ng1").(*testprovider.TestNodeGroup).SetTargetSize(1)
clusterstate.UpdateNodes([]*apiv1.Node{ng1_1}, nil, now)
assert.False(t, clusterstate.IsNodeGroupScalingUp("ng1"))
assert.False(t, clusterstate.backoff.IsBackedOff(provider.GetNodeGroup("ng1"), nil, now))
}

func TestUpcomingNodes(t *testing.T) {
provider := testprovider.NewTestCloudProvider(nil, nil)
now := time.Now()
4 changes: 2 additions & 2 deletions cluster-autoscaler/config/autoscaling_options.go
@@ -244,8 +244,8 @@ type AutoscalingOptions struct {
MaxNodeGroupBackoffDuration time.Duration
// NodeGroupBackoffResetTimeout is the time after last failed scale-up when the backoff duration is reset.
NodeGroupBackoffResetTimeout time.Duration
// NodeGroupKeepBackoffOutOfResources is whether a backoff can be removed before expiration when a scale-up fails due to the cloud provider being out of resources.
NodeGroupKeepBackoffOutOfResources bool
// NodeGroupRemovePersistentErrorBackoffEarly is whether a backoff can be removed before expiration when a scale-up partially fails due to a likely persistent error.
NodeGroupRemovePersistentErrorBackoffEarly bool
// MaxScaleDownParallelism is the maximum number of nodes (both empty and needing drain) that can be deleted in parallel.
MaxScaleDownParallelism int
// MaxDrainParallelism is the maximum number of nodes needing drain, that can be drained and deleted in parallel.
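
The renamed option flows from the flag-level AutoscalingOptions into the cluster state registry's config (see the static_autoscaler.go hunk below). A minimal sketch of that hand-off, using local stand-in structs rather than the real config and clusterstate packages:

package main

import "fmt"

// Local stand-ins for config.AutoscalingOptions and
// clusterstate.ClusterStateRegistryConfig, reduced to the field
// introduced by this change.
type autoscalingOptions struct {
	NodeGroupRemovePersistentErrorBackoffEarly bool
}

type clusterStateRegistryConfig struct {
	NodeGroupRemovePersistentErrorBackoffEarly bool
}

func main() {
	// Per the field comment in clusterstate.go, true is the default and
	// preserves the previous behaviour of always removing the backoff early.
	opts := autoscalingOptions{NodeGroupRemovePersistentErrorBackoffEarly: true}

	// Mirrors the wiring added to NewStaticAutoscaler: the option is copied
	// into the cluster state registry's config.
	csConfig := clusterStateRegistryConfig{
		NodeGroupRemovePersistentErrorBackoffEarly: opts.NodeGroupRemovePersistentErrorBackoffEarly,
	}
	fmt.Println(csConfig.NodeGroupRemovePersistentErrorBackoffEarly)
}
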
5 changes: 3 additions & 2 deletions cluster-autoscaler/core/static_autoscaler.go
@@ -148,8 +148,9 @@ func NewStaticAutoscaler(
drainabilityRules rules.Rules) *StaticAutoscaler {

clusterStateConfig := clusterstate.ClusterStateRegistryConfig{
MaxTotalUnreadyPercentage: opts.MaxTotalUnreadyPercentage,
OkTotalUnreadyCount: opts.OkTotalUnreadyCount,
MaxTotalUnreadyPercentage: opts.MaxTotalUnreadyPercentage,
OkTotalUnreadyCount: opts.OkTotalUnreadyCount,
NodeGroupRemovePersistentErrorBackoffEarly: opts.NodeGroupRemovePersistentErrorBackoffEarly,
}
clusterStateRegistry := clusterstate.NewClusterStateRegistry(cloudProvider, clusterStateConfig, autoscalingKubeClients.LogRecorder, backoff, processors.NodeGroupConfigProcessor)
processorCallbacks := newStaticAutoscalerProcessorCallbacks()