Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cherry-pick #4685, #47874 - Azure vmss cache improvements #4794

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -697,6 +697,7 @@ func TestGetFilteredAutoscalingGroupsVmss(t *testing.T) {
maxSize: maxVal,
manager: manager,
curSize: 3,
sizeRefreshPeriod: manager.azureCache.refreshInterval,
instancesRefreshPeriod: defaultVmssInstancesRefreshPeriod,
}}
assert.True(t, assert.ObjectsAreEqualValues(expectedAsgs, asgs), "expected %#v, but found: %#v", expectedAsgs, asgs)
Expand Down
38 changes: 25 additions & 13 deletions cluster-autoscaler/cloudprovider/azure/azure_scale_set.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@ import (

"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2020-12-01/compute"
"github.com/Azure/go-autorest/autorest/azure"
"github.com/Azure/go-autorest/autorest/to"
)

var (
Expand All @@ -53,6 +52,9 @@ type ScaleSet struct {
sizeMutex sync.Mutex
curSize int64

lastSizeRefresh time.Time
sizeRefreshPeriod time.Duration

instancesRefreshPeriod time.Duration
instancesRefreshJitter int

Expand All @@ -67,11 +69,11 @@ func NewScaleSet(spec *dynamic.NodeGroupSpec, az *AzureManager, curSize int64) (
azureRef: azureRef{
Name: spec.Name,
},
minSize: spec.MinSize,
maxSize: spec.MaxSize,
manager: az,
curSize: curSize,

minSize: spec.MinSize,
maxSize: spec.MaxSize,
manager: az,
curSize: curSize,
sizeRefreshPeriod: az.azureCache.refreshInterval,
instancesRefreshJitter: az.config.VmssVmsCacheJitter,
}

Expand Down Expand Up @@ -140,18 +142,17 @@ func (scaleSet *ScaleSet) getCurSize() (int64, error) {
scaleSet.sizeMutex.Lock()
defer scaleSet.sizeMutex.Unlock()

if scaleSet.lastSizeRefresh.Add(scaleSet.sizeRefreshPeriod).After(time.Now()) {
klog.V(3).Infof("VMSS: %s, returning in-memory size: %d", scaleSet.Name, scaleSet.curSize)
return scaleSet.curSize, nil
}

set, err := scaleSet.getVMSSFromCache()
if err != nil {
klog.Errorf("failed to get information for VMSS: %s, error: %v", scaleSet.Name, err)
return -1, err
}

// If VMSS state is updating, return the currentSize which would've been proactively incremented or decremented by CA
if set.VirtualMachineScaleSetProperties != nil && strings.EqualFold(to.String(set.VirtualMachineScaleSetProperties.ProvisioningState), string(compute.ProvisioningStateUpdating)) {
klog.V(3).Infof("VMSS %q is in updating state, returning cached size: %d", scaleSet.Name, scaleSet.curSize)
return scaleSet.curSize, nil
}

vmssSizeMutex.Lock()
curSize := *set.Sku.Capacity
vmssSizeMutex.Unlock()
Expand All @@ -161,9 +162,10 @@ func (scaleSet *ScaleSet) getCurSize() (int64, error) {
klog.V(5).Infof("VMSS %q size changed from: %d to %d, invalidating instance cache", scaleSet.Name, scaleSet.curSize, curSize)
scaleSet.invalidateInstanceCache()
}
klog.V(3).Infof("VMSS: %s, previous size: %d, new size: %d", scaleSet.Name, scaleSet.curSize, curSize)
klog.V(3).Infof("VMSS: %s, in-memory size: %d, new size: %d", scaleSet.Name, scaleSet.curSize, curSize)

scaleSet.curSize = curSize
scaleSet.lastSizeRefresh = time.Now()
return scaleSet.curSize, nil
}

Expand Down Expand Up @@ -194,6 +196,7 @@ func (scaleSet *ScaleSet) updateVMSSCapacity(future *azure.Future) {
if err != nil {
klog.Errorf("Failed to update the capacity for vmss %s with error %v, invalidate the cache so as to get the real size from API", scaleSet.Name, err)
// Invalidate the VMSS size cache in order to fetch the size from the API.
scaleSet.invalidateLastSizeRefreshWithLock()
scaleSet.manager.invalidateCache()
}
}()
Expand Down Expand Up @@ -247,6 +250,7 @@ func (scaleSet *ScaleSet) SetScaleSetSize(size int64) error {

// Proactively set the VMSS size so autoscaler makes better decisions.
scaleSet.curSize = size
scaleSet.lastSizeRefresh = time.Now()

go scaleSet.updateVMSSCapacity(future)
return nil
Expand Down Expand Up @@ -405,6 +409,7 @@ func (scaleSet *ScaleSet) DeleteInstances(instances []*azureRef, hasUnregistered
if !hasUnregisteredNodes {
scaleSet.sizeMutex.Lock()
scaleSet.curSize -= int64(len(instanceIDs))
scaleSet.lastSizeRefresh = time.Now()
scaleSet.sizeMutex.Unlock()
}

Expand Down Expand Up @@ -567,6 +572,7 @@ func (scaleSet *ScaleSet) setInstanceStatusByProviderID(providerID string, statu
scaleSet.instanceCache[k].Status = &status
}
}
scaleSet.lastInstanceRefresh = time.Now()
}

// instanceStatusFromVM converts the VM provisioning state to cloudprovider.InstanceStatus
Expand Down Expand Up @@ -594,3 +600,9 @@ func (scaleSet *ScaleSet) invalidateInstanceCache() {
scaleSet.lastInstanceRefresh = time.Now().Add(-1 * scaleSet.instancesRefreshPeriod)
scaleSet.instanceMutex.Unlock()
}

func (scaleSet *ScaleSet) invalidateLastSizeRefreshWithLock() {
scaleSet.sizeMutex.Lock()
scaleSet.lastSizeRefresh = time.Now().Add(-1 * scaleSet.sizeRefreshPeriod)
scaleSet.sizeMutex.Unlock()
}