Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improve logging for scale set size changes #4541

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 19 additions & 27 deletions cluster-autoscaler/cloudprovider/azure/azure_scale_set.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,8 @@ type ScaleSet struct {
minSize int
maxSize int

sizeMutex sync.Mutex
curSize int64
lastSizeRefresh time.Time
sizeRefreshPeriod time.Duration
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The sizeRefreshPeriod was always 0 after #3717. The TTL for VMSS cache refreshes is in honoured there where we depend on the CA core cloudprovider.Refresh() calls to update the caches.

https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/azure/azure_manager.go#L166-L168

sizeMutex sync.Mutex
curSize int64

instancesRefreshPeriod time.Duration
instancesRefreshJitter int
Expand Down Expand Up @@ -118,7 +116,7 @@ func (scaleSet *ScaleSet) Autoprovisioned() bool {
func (scaleSet *ScaleSet) GetOptions(defaults config.NodeGroupAutoscalingOptions) (*config.NodeGroupAutoscalingOptions, error) {
template, err := scaleSet.getVMSSFromCache()
if err != nil {
return nil, err.Error()
return nil, err
}
return scaleSet.manager.GetScaleSetOptions(*template.Name, defaults), nil
}
Expand All @@ -128,11 +126,11 @@ func (scaleSet *ScaleSet) MaxSize() int {
return scaleSet.maxSize
}

func (scaleSet *ScaleSet) getVMSSFromCache() (compute.VirtualMachineScaleSet, *retry.Error) {
func (scaleSet *ScaleSet) getVMSSFromCache() (compute.VirtualMachineScaleSet, error) {
Copy link
Member Author

@marwanad marwanad Dec 20, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

no reason to return the ARM error type here. The call to scaleSet.manager.azureCache.getScaleSets() is always a cache read call.

allVMSS := scaleSet.manager.azureCache.getScaleSets()

if _, exists := allVMSS[scaleSet.Name]; !exists {
return compute.VirtualMachineScaleSet{}, &retry.Error{RawError: fmt.Errorf("could not find vmss: %s", scaleSet.Name)}
return compute.VirtualMachineScaleSet{}, fmt.Errorf("could not find vmss: %s", scaleSet.Name)
}

return allVMSS[scaleSet.Name], nil
Expand All @@ -142,35 +140,30 @@ func (scaleSet *ScaleSet) getCurSize() (int64, error) {
scaleSet.sizeMutex.Lock()
defer scaleSet.sizeMutex.Unlock()

if scaleSet.lastSizeRefresh.Add(scaleSet.sizeRefreshPeriod).After(time.Now()) {
Copy link
Member Author

@marwanad marwanad Dec 20, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this was always no-op after #3717 as lastSizeRefresh + 0 will always be in the past so this conditional was skipped.

return scaleSet.curSize, nil
}

klog.V(5).Infof("Get scale set size for %q", scaleSet.Name)
set, rerr := scaleSet.getVMSSFromCache()
if rerr != nil {
return -1, rerr.Error()
set, err := scaleSet.getVMSSFromCache()
if err != nil {
klog.Errorf("failed to get information for VMSS: %s, error: %v", scaleSet.Name, err)
return -1, err
}

// If VMSS state is updating, return the currentSize which would've been proactively incremented or decremented by CA
if set.VirtualMachineScaleSetProperties != nil && strings.EqualFold(to.String(set.VirtualMachineScaleSetProperties.ProvisioningState), string(compute.ProvisioningStateUpdating)) {
klog.V(5).Infof("VMSS %q is in updating state, returning cached size: %d", scaleSet.Name, scaleSet.curSize)
klog.V(3).Infof("VMSS % is in updating state, returning cached size: %d", scaleSet.Name, scaleSet.curSize)
return scaleSet.curSize, nil
}

vmssSizeMutex.Lock()
curSize := *set.Sku.Capacity
vmssSizeMutex.Unlock()

klog.V(5).Infof("Getting scale set (%q) capacity: %d\n", scaleSet.Name, curSize)

if scaleSet.curSize != curSize {
// Invalidate the instance cache if the capacity has changed.
klog.V(5).Infof("VMSS %q size changed from: %d to %d, invalidating instance cache", scaleSet.Name, scaleSet.curSize, curSize)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

keeping it as v=5 intentionally because it can be inferred from L64.

scaleSet.invalidateInstanceCache()
}
klog.V(3).Infof("VMSS: %s, previous size: %d, new size: %d", scaleSet.Name, scaleSet.curSize, curSize)

scaleSet.curSize = curSize
scaleSet.lastSizeRefresh = time.Now()
return scaleSet.curSize, nil
}

Expand Down Expand Up @@ -226,10 +219,10 @@ func (scaleSet *ScaleSet) SetScaleSetSize(size int64) error {
scaleSet.sizeMutex.Lock()
defer scaleSet.sizeMutex.Unlock()

vmssInfo, rerr := scaleSet.getVMSSFromCache()
if rerr != nil {
klog.Errorf("Failed to get information for VMSS (%q): %v", scaleSet.Name, rerr)
return rerr.Error()
vmssInfo, err := scaleSet.getVMSSFromCache()
if err != nil {
klog.Errorf("Failed to get information for VMSS (%q): %v", scaleSet.Name, err)
return err
}

// Update the new capacity to cache.
Expand All @@ -254,7 +247,6 @@ func (scaleSet *ScaleSet) SetScaleSetSize(size int64) error {

// Proactively set the VMSS size so autoscaler makes better decisions.
scaleSet.curSize = size
scaleSet.lastSizeRefresh = time.Now()

go scaleSet.updateVMSSCapacity(future)
return nil
Expand Down Expand Up @@ -474,9 +466,9 @@ func (scaleSet *ScaleSet) Debug() string {

// TemplateNodeInfo returns a node template for this scale set.
func (scaleSet *ScaleSet) TemplateNodeInfo() (*schedulerframework.NodeInfo, error) {
template, rerr := scaleSet.getVMSSFromCache()
if rerr != nil {
return nil, rerr.Error()
template, err := scaleSet.getVMSSFromCache()
if err != nil {
return nil, err
}

node, err := buildNodeFromTemplate(scaleSet.Name, template)
Expand Down
15 changes: 5 additions & 10 deletions cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,13 @@ package azure

import (
"fmt"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"net/http"
"testing"
"time"

apiv1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
"net/http"
"sigs.k8s.io/cloud-provider-azure/pkg/azureclients/vmssclient/mockvmssclient"
"sigs.k8s.io/cloud-provider-azure/pkg/azureclients/vmssvmclient/mockvmssvmclient"
"testing"

"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2020-12-01/compute"
"github.com/Azure/go-autorest/autorest/to"
Expand Down Expand Up @@ -138,12 +136,9 @@ func TestIncreaseSize(t *testing.T) {
err := provider.azureManager.forceRefresh()
assert.NoError(t, err)

ss := newTestScaleSet(provider.azureManager, "test-asg")
ss.lastSizeRefresh = time.Now()
ss.sizeRefreshPeriod = 1 * time.Minute
ss.curSize = -1
ss := newTestScaleSet(provider.azureManager, "test-asg-doesnt-exist")
err = ss.IncreaseSize(100)
expectedErr := fmt.Errorf("the scale set test-asg is under initialization, skipping IncreaseSize")
expectedErr := fmt.Errorf("could not find vmss: test-asg-doesnt-exist")
Copy link
Member Author

@marwanad marwanad Dec 20, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

technically with the changes in #3717, we'll never end with the "-1" initialization case because the cache wont have the VMSS and we exit earlier. (keeping it there for now though)

assert.Equal(t, expectedErr, err)

registered := provider.azureManager.RegisterNodeGroup(
Expand Down