From 09ab1ea20dd0367fbd71f65e42dbadc55c7bd12d Mon Sep 17 00:00:00 2001 From: Benjamin Pineau Date: Wed, 6 Jan 2021 12:02:19 +0100 Subject: [PATCH] Don't pile up successive full refreshes during AWS scaledowns Force refreshing everything at every DeleteNodes calls causes slow down and throttling on large clusters with many ASGs (and lot of activity). That function might be called several times in a row during scale-down (once for each ASG having a node to be removed). Each time the forced refresh will re-discover all ASGs, all LaunchConfigurations, then re-list all instances from discovered ASGs. That immediate refresh isn't required anyway, as the cache's DeleteInstances concrete implementation will decrement the nodegroup size, and we can schedule a grouped refresh for the next loop iteration. --- .../cloudprovider/aws/aws_cloud_provider_test.go | 4 ++-- cluster-autoscaler/cloudprovider/aws/aws_manager.go | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider_test.go b/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider_test.go index 587833779335..4ae024c437d4 100644 --- a/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider_test.go +++ b/cluster-autoscaler/cloudprovider/aws/aws_cloud_provider_test.go @@ -469,7 +469,7 @@ func TestDeleteNodes(t *testing.T) { err = asgs[0].DeleteNodes([]*apiv1.Node{node}) assert.NoError(t, err) service.AssertNumberOfCalls(t, "TerminateInstanceInAutoScalingGroup", 1) - service.AssertNumberOfCalls(t, "DescribeAutoScalingGroupsPages", 2) + service.AssertNumberOfCalls(t, "DescribeAutoScalingGroupsPages", 1) newSize, err := asgs[0].TargetSize() assert.NoError(t, err) @@ -516,7 +516,7 @@ func TestDeleteNodesWithPlaceholder(t *testing.T) { err = asgs[0].DeleteNodes([]*apiv1.Node{node}) assert.NoError(t, err) service.AssertNumberOfCalls(t, "SetDesiredCapacity", 1) - service.AssertNumberOfCalls(t, "DescribeAutoScalingGroupsPages", 2) + service.AssertNumberOfCalls(t, "DescribeAutoScalingGroupsPages", 1) newSize, err := asgs[0].TargetSize() assert.NoError(t, err) diff --git a/cluster-autoscaler/cloudprovider/aws/aws_manager.go b/cluster-autoscaler/cloudprovider/aws/aws_manager.go index d9f71bec8281..f47c9f17c211 100644 --- a/cluster-autoscaler/cloudprovider/aws/aws_manager.go +++ b/cluster-autoscaler/cloudprovider/aws/aws_manager.go @@ -294,8 +294,9 @@ func (m *AwsManager) DeleteInstances(instances []*AwsInstanceRef) error { if err := m.asgCache.DeleteInstances(instances); err != nil { return err } - klog.V(2).Infof("Some ASG instances might have been deleted, forcing ASG list refresh") - return m.forceRefresh() + klog.V(2).Infof("Some ASG instances might have been deleted, scheduling an ASG list refresh") + m.lastRefresh = time.Now().Add(-refreshInterval) + return nil } // GetAsgNodes returns Asg nodes.