diff --git a/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go b/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go index dd7e0b0276e4..76940e1143c5 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_manager_test.go @@ -697,6 +697,7 @@ func TestGetFilteredAutoscalingGroupsVmss(t *testing.T) { maxSize: maxVal, manager: manager, curSize: 3, + sizeRefreshPeriod: manager.azureCache.refreshInterval, instancesRefreshPeriod: defaultVmssInstancesRefreshPeriod, }} assert.True(t, assert.ObjectsAreEqualValues(expectedAsgs, asgs), "expected %#v, but found: %#v", expectedAsgs, asgs) diff --git a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go index de4619d29e7f..103d0b36237b 100644 --- a/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go +++ b/cluster-autoscaler/cloudprovider/azure/azure_scale_set.go @@ -33,7 +33,6 @@ import ( "github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2020-12-01/compute" "github.com/Azure/go-autorest/autorest/azure" - "github.com/Azure/go-autorest/autorest/to" ) var ( @@ -53,6 +52,9 @@ type ScaleSet struct { sizeMutex sync.Mutex curSize int64 + lastSizeRefresh time.Time + sizeRefreshPeriod time.Duration + instancesRefreshPeriod time.Duration instancesRefreshJitter int @@ -67,11 +69,11 @@ func NewScaleSet(spec *dynamic.NodeGroupSpec, az *AzureManager, curSize int64) ( azureRef: azureRef{ Name: spec.Name, }, - minSize: spec.MinSize, - maxSize: spec.MaxSize, - manager: az, - curSize: curSize, - + minSize: spec.MinSize, + maxSize: spec.MaxSize, + manager: az, + curSize: curSize, + sizeRefreshPeriod: az.azureCache.refreshInterval, instancesRefreshJitter: az.config.VmssVmsCacheJitter, } @@ -140,18 +142,17 @@ func (scaleSet *ScaleSet) getCurSize() (int64, error) { scaleSet.sizeMutex.Lock() defer scaleSet.sizeMutex.Unlock() + if scaleSet.lastSizeRefresh.Add(scaleSet.sizeRefreshPeriod).After(time.Now()) { + klog.V(3).Infof("VMSS: %s, returning in-memory size: %d", scaleSet.Name, scaleSet.curSize) + return scaleSet.curSize, nil + } + set, err := scaleSet.getVMSSFromCache() if err != nil { klog.Errorf("failed to get information for VMSS: %s, error: %v", scaleSet.Name, err) return -1, err } - // If VMSS state is updating, return the currentSize which would've been proactively incremented or decremented by CA - if set.VirtualMachineScaleSetProperties != nil && strings.EqualFold(to.String(set.VirtualMachineScaleSetProperties.ProvisioningState), string(compute.ProvisioningStateUpdating)) { - klog.V(3).Infof("VMSS %q is in updating state, returning cached size: %d", scaleSet.Name, scaleSet.curSize) - return scaleSet.curSize, nil - } - vmssSizeMutex.Lock() curSize := *set.Sku.Capacity vmssSizeMutex.Unlock() @@ -161,9 +162,10 @@ func (scaleSet *ScaleSet) getCurSize() (int64, error) { klog.V(5).Infof("VMSS %q size changed from: %d to %d, invalidating instance cache", scaleSet.Name, scaleSet.curSize, curSize) scaleSet.invalidateInstanceCache() } - klog.V(3).Infof("VMSS: %s, previous size: %d, new size: %d", scaleSet.Name, scaleSet.curSize, curSize) + klog.V(3).Infof("VMSS: %s, in-memory size: %d, new size: %d", scaleSet.Name, scaleSet.curSize, curSize) scaleSet.curSize = curSize + scaleSet.lastSizeRefresh = time.Now() return scaleSet.curSize, nil } @@ -194,6 +196,7 @@ func (scaleSet *ScaleSet) updateVMSSCapacity(future *azure.Future) { if err != nil { klog.Errorf("Failed to update the capacity for vmss %s with error %v, invalidate the cache so as to get the real size from API", scaleSet.Name, err) // Invalidate the VMSS size cache in order to fetch the size from the API. + scaleSet.invalidateLastSizeRefreshWithLock() scaleSet.manager.invalidateCache() } }() @@ -247,6 +250,7 @@ func (scaleSet *ScaleSet) SetScaleSetSize(size int64) error { // Proactively set the VMSS size so autoscaler makes better decisions. scaleSet.curSize = size + scaleSet.lastSizeRefresh = time.Now() go scaleSet.updateVMSSCapacity(future) return nil @@ -405,6 +409,7 @@ func (scaleSet *ScaleSet) DeleteInstances(instances []*azureRef, hasUnregistered if !hasUnregisteredNodes { scaleSet.sizeMutex.Lock() scaleSet.curSize -= int64(len(instanceIDs)) + scaleSet.lastSizeRefresh = time.Now() scaleSet.sizeMutex.Unlock() } @@ -567,6 +572,7 @@ func (scaleSet *ScaleSet) setInstanceStatusByProviderID(providerID string, statu scaleSet.instanceCache[k].Status = &status } } + scaleSet.lastInstanceRefresh = time.Now() } // instanceStatusFromVM converts the VM provisioning state to cloudprovider.InstanceStatus @@ -594,3 +600,9 @@ func (scaleSet *ScaleSet) invalidateInstanceCache() { scaleSet.lastInstanceRefresh = time.Now().Add(-1 * scaleSet.instancesRefreshPeriod) scaleSet.instanceMutex.Unlock() } + +func (scaleSet *ScaleSet) invalidateLastSizeRefreshWithLock() { + scaleSet.sizeMutex.Lock() + scaleSet.lastSizeRefresh = time.Now().Add(-1 * scaleSet.sizeRefreshPeriod) + scaleSet.sizeMutex.Unlock() +}