diff --git a/cluster-autoscaler/cloudprovider/aws/README.md b/cluster-autoscaler/cloudprovider/aws/README.md index 7f7f777e7496..3b0872ae3bb6 100644 --- a/cluster-autoscaler/cloudprovider/aws/README.md +++ b/cluster-autoscaler/cloudprovider/aws/README.md @@ -48,10 +48,41 @@ discover](#auto-discovery-setup) EC2 Auto Scaling Groups **(recommended)**, add Launch Configuration) and/or `ec2:DescribeLaunchTemplateVersions` (if you created your ASG using a Launch Template) to the `Action` list. -If you prefer, you can restrict the target resources for the autoscaling actions -by specifying Auto Scaling Group ARNs in the `Resource` list of the policy. More -information can be found -[here](https://docs.aws.amazon.com/autoscaling/latest/userguide/control-access-using-iam.html#policy-auto-scaling-resources). +*NOTE:* The below policies/arguments to the Cluster Autoscaler need to be modified as appropriate +for the names of your ASGs, as well as account ID and AWS region before being used. + +The following policy provides the minimum privileges necessary for Cluster Autoscaler to run. +When using this policy, you cannot use autodiscovery of ASGs. In addition, it restricts the +IAM permissions to the node groups the Cluster Autoscaler is configured to scale. + +This in turn means that you must pass the following arguments to the Cluster Autoscaler +binary, replacing min and max node counts and the ASG: + +```bash +--aws-use-static-instance-list=false +--nodes=1:100:exampleASG1 +--nodes=1:100:exampleASG2 +``` + +```json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": [ + "autoscaling:DescribeAutoScalingGroups", + "autoscaling:DescribeAutoScalingInstances", + "autoscaling:DescribeLaunchConfigurations", + "autoscaling:DescribeScalingActivities", + "autoscaling:SetDesiredCapacity", + "autoscaling:TerminateInstanceInAutoScalingGroup" + ], + "Resource": ["arn:aws:autoscaling:${YOUR_CLUSTER_AWS_REGION}:${YOUR_AWS_ACCOUNT_ID}:autoScalingGroup:*:autoScalingGroupName/${YOUR_ASG_NAME}"] + } + ] +} +``` ### Using OIDC Federated Authentication diff --git a/cluster-autoscaler/cloudprovider/aws/auto_scaling_groups.go b/cluster-autoscaler/cloudprovider/aws/auto_scaling_groups.go index 5f66e808dbdc..cf719e23532f 100644 --- a/cluster-autoscaler/cloudprovider/aws/auto_scaling_groups.go +++ b/cluster-autoscaler/cloudprovider/aws/auto_scaling_groups.go @@ -67,7 +67,7 @@ type asg struct { minSize int maxSize int curSize int - lastUpdateTime *time.Time + lastUpdateTime time.Time AvailabilityZones []string LaunchConfigurationName string @@ -254,7 +254,7 @@ func (m *asgCache) setAsgSizeNoLock(asg *asg, size int) error { } // Proactively set the ASG size so autoscaler makes better decisions - asg.lastUpdateTime = &start + asg.lastUpdateTime = start asg.curSize = size return nil @@ -479,7 +479,6 @@ func (m *asgCache) createPlaceholdersForDesiredNonStartedInstances(groups []*aut func (m *asgCache) isNodeGroupAvailable(group *autoscaling.Group) (bool, error) { input := &autoscaling.DescribeScalingActivitiesInput{ AutoScalingGroupName: group.AutoScalingGroupName, - MaxRecords: aws.Int64(1), // We only care about the most recent event } start := time.Now() @@ -489,12 +488,14 @@ func (m *asgCache) isNodeGroupAvailable(group *autoscaling.Group) (bool, error) return true, err // If we can't describe the scaling activities we assume the node group is available } - if len(response.Activities) > 0 { - activity := response.Activities[0] + for _, activity := range response.Activities { asgRef := AwsRef{Name: *group.AutoScalingGroupName} if a, ok := m.registeredAsgs[asgRef]; ok { lut := a.lastUpdateTime - if lut != nil && activity.StartTime.After(*lut) && *activity.StatusCode == "Failed" { + if activity.StartTime.Before(lut) { + break + } else if *activity.StatusCode == "Failed" { + klog.Warningf("ASG %s scaling failed with %s", asgRef.Name, *activity) return false, nil } } else { diff --git a/cluster-autoscaler/cloudprovider/aws/auto_scaling_groups_test.go b/cluster-autoscaler/cloudprovider/aws/auto_scaling_groups_test.go index d28519f29c0c..c5078b03b5b7 100644 --- a/cluster-autoscaler/cloudprovider/aws/auto_scaling_groups_test.go +++ b/cluster-autoscaler/cloudprovider/aws/auto_scaling_groups_test.go @@ -59,7 +59,7 @@ func TestCreatePlaceholders(t *testing.T) { name string desiredCapacity *int64 activities []*autoscaling.Activity - groupLastUpdateTime *time.Time + groupLastUpdateTime time.Time describeErr error asgToCheck *string }{ @@ -85,7 +85,7 @@ func TestCreatePlaceholders(t *testing.T) { StartTime: aws.Time(time.Unix(10, 0)), }, }, - groupLastUpdateTime: aws.Time(time.Unix(9, 0)), + groupLastUpdateTime: time.Unix(9, 0), }, { name: "AWS scaling failed event before CA scale_up", @@ -96,7 +96,7 @@ func TestCreatePlaceholders(t *testing.T) { StartTime: aws.Time(time.Unix(9, 0)), }, }, - groupLastUpdateTime: aws.Time(time.Unix(10, 0)), + groupLastUpdateTime: time.Unix(10, 0), }, { name: "asg not registered", @@ -107,7 +107,7 @@ func TestCreatePlaceholders(t *testing.T) { StartTime: aws.Time(time.Unix(10, 0)), }, }, - groupLastUpdateTime: aws.Time(time.Unix(9, 0)), + groupLastUpdateTime: time.Unix(9, 0), asgToCheck: aws.String("unregisteredAsgName"), }, } @@ -128,7 +128,6 @@ func TestCreatePlaceholders(t *testing.T) { if shouldCallDescribeScalingActivities { a.On("DescribeScalingActivities", &autoscaling.DescribeScalingActivitiesInput{ AutoScalingGroupName: asgName, - MaxRecords: aws.Int64(1), }).Return( &autoscaling.DescribeScalingActivitiesOutput{Activities: tc.activities}, tc.describeErr, @@ -158,7 +157,7 @@ func TestCreatePlaceholders(t *testing.T) { } asgCache.createPlaceholdersForDesiredNonStartedInstances(groups) assert.Equal(t, int64(len(groups[0].Instances)), *tc.desiredCapacity) - if tc.activities != nil && *tc.activities[0].StatusCode == "Failed" && tc.activities[0].StartTime.After(*tc.groupLastUpdateTime) && asgName == registeredAsgName { + if tc.activities != nil && *tc.activities[0].StatusCode == "Failed" && tc.activities[0].StartTime.After(tc.groupLastUpdateTime) && asgName == registeredAsgName { assert.Equal(t, *groups[0].Instances[0].HealthStatus, placeholderUnfulfillableStatus) } else if len(groups[0].Instances) > 0 { assert.Equal(t, *groups[0].Instances[0].HealthStatus, "") diff --git a/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider_test.go b/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider_test.go index 2753daabd2dc..e1d6a92a1ef9 100644 --- a/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider_test.go +++ b/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider_test.go @@ -466,7 +466,6 @@ func TestDeleteNodesWithPlaceholder(t *testing.T) { a.On("DescribeScalingActivities", &autoscaling.DescribeScalingActivitiesInput{ AutoScalingGroupName: aws.String("test-asg"), - MaxRecords: aws.Int64(1), }, ).Return(&autoscaling.DescribeScalingActivitiesOutput{}, nil) diff --git a/cluster-autoscaler/cloudprovider/aws/aws_manager_test.go b/cluster-autoscaler/cloudprovider/aws/aws_manager_test.go index 02c298201b32..295bbf0e92d2 100644 --- a/cluster-autoscaler/cloudprovider/aws/aws_manager_test.go +++ b/cluster-autoscaler/cloudprovider/aws/aws_manager_test.go @@ -394,7 +394,6 @@ func TestFetchExplicitAsgs(t *testing.T) { a.On("DescribeScalingActivities", &autoscaling.DescribeScalingActivitiesInput{ AutoScalingGroupName: aws.String("coolasg"), - MaxRecords: aws.Int64(1), }, ).Return(&autoscaling.DescribeScalingActivitiesOutput{}, nil) @@ -559,7 +558,6 @@ func TestFetchAutoAsgs(t *testing.T) { a.On("DescribeScalingActivities", &autoscaling.DescribeScalingActivitiesInput{ AutoScalingGroupName: aws.String("coolasg"), - MaxRecords: aws.Int64(1), }, ).Return(&autoscaling.DescribeScalingActivitiesOutput{}, nil)