improve logging for scale set size changes #4541
cluster-autoscaler/cloudprovider/azure/azure_scale_set.go
@@ -50,10 +50,8 @@ type ScaleSet struct {
 	minSize int
 	maxSize int

-	sizeMutex         sync.Mutex
-	curSize           int64
-	lastSizeRefresh   time.Time
-	sizeRefreshPeriod time.Duration
+	sizeMutex sync.Mutex
+	curSize   int64

 	instancesRefreshPeriod time.Duration
 	instancesRefreshJitter int
@@ -118,7 +116,7 @@ func (scaleSet *ScaleSet) Autoprovisioned() bool {
 func (scaleSet *ScaleSet) GetOptions(defaults config.NodeGroupAutoscalingOptions) (*config.NodeGroupAutoscalingOptions, error) {
 	template, err := scaleSet.getVMSSFromCache()
 	if err != nil {
-		return nil, err.Error()
+		return nil, err
 	}
 	return scaleSet.manager.GetScaleSetOptions(*template.Name, defaults), nil
 }
@@ -128,11 +126,11 @@ func (scaleSet *ScaleSet) MaxSize() int {
 	return scaleSet.maxSize
 }

-func (scaleSet *ScaleSet) getVMSSFromCache() (compute.VirtualMachineScaleSet, *retry.Error) {
+func (scaleSet *ScaleSet) getVMSSFromCache() (compute.VirtualMachineScaleSet, error) {
 	allVMSS := scaleSet.manager.azureCache.getScaleSets()

 	if _, exists := allVMSS[scaleSet.Name]; !exists {
-		return compute.VirtualMachineScaleSet{}, &retry.Error{RawError: fmt.Errorf("could not find vmss: %s", scaleSet.Name)}
+		return compute.VirtualMachineScaleSet{}, fmt.Errorf("could not find vmss: %s", scaleSet.Name)
 	}

 	return allVMSS[scaleSet.Name], nil

> Review comment: no reason to return the ARM error type here. The call to …
@@ -142,35 +140,30 @@ func (scaleSet *ScaleSet) getCurSize() (int64, error) {
 	scaleSet.sizeMutex.Lock()
 	defer scaleSet.sizeMutex.Unlock()

-	if scaleSet.lastSizeRefresh.Add(scaleSet.sizeRefreshPeriod).After(time.Now()) {
-		return scaleSet.curSize, nil
-	}
-
 	klog.V(5).Infof("Get scale set size for %q", scaleSet.Name)
-	set, rerr := scaleSet.getVMSSFromCache()
-	if rerr != nil {
-		return -1, rerr.Error()
+	set, err := scaleSet.getVMSSFromCache()
+	if err != nil {
+		klog.Errorf("failed to get information for VMSS: %s, error: %v", scaleSet.Name, err)
+		return -1, err
 	}

 	// If VMSS state is updating, return the currentSize which would've been proactively incremented or decremented by CA
 	if set.VirtualMachineScaleSetProperties != nil && strings.EqualFold(to.String(set.VirtualMachineScaleSetProperties.ProvisioningState), string(compute.ProvisioningStateUpdating)) {
-		klog.V(5).Infof("VMSS %q is in updating state, returning cached size: %d", scaleSet.Name, scaleSet.curSize)
+		klog.V(3).Infof("VMSS %q is in updating state, returning cached size: %d", scaleSet.Name, scaleSet.curSize)
 		return scaleSet.curSize, nil
 	}

 	vmssSizeMutex.Lock()
 	curSize := *set.Sku.Capacity
 	vmssSizeMutex.Unlock()

-	klog.V(5).Infof("Getting scale set (%q) capacity: %d\n", scaleSet.Name, curSize)
-
 	if scaleSet.curSize != curSize {
 		// Invalidate the instance cache if the capacity has changed.
 		klog.V(5).Infof("VMSS %q size changed from: %d to %d, invalidating instance cache", scaleSet.Name, scaleSet.curSize, curSize)
 		scaleSet.invalidateInstanceCache()
 	}
+	klog.V(3).Infof("VMSS: %s, previous size: %d, new size: %d", scaleSet.Name, scaleSet.curSize, curSize)

 	scaleSet.curSize = curSize
-	scaleSet.lastSizeRefresh = time.Now()
 	return scaleSet.curSize, nil
 }

> Review comment (on the removed `lastSizeRefresh` check): this was always a no-op after #3717, as lastSizeRefresh + 0 will always be in the past, so this conditional was skipped.

> Review comment (on the V(5) "size changed" log line): keeping it as v=5 intentionally because it can be inferred from L64.
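A runnable sketch of the no-op the first comment describes, under the assumption stated in the review thread that `sizeRefreshPeriod` stayed at its zero value after #3717:

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	lastSizeRefresh := time.Now()       // stamped by the previous refresh
	var sizeRefreshPeriod time.Duration // zero value; never set after #3717

	// By the time the next getCurSize call runs, lastSizeRefresh+0 is
	// already in the past, so the early-return branch can never fire.
	fresh := lastSizeRefresh.Add(sizeRefreshPeriod).After(time.Now())
	fmt.Println(fresh) // false
}
```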
@@ -226,10 +219,10 @@ func (scaleSet *ScaleSet) SetScaleSetSize(size int64) error {
 	scaleSet.sizeMutex.Lock()
 	defer scaleSet.sizeMutex.Unlock()

-	vmssInfo, rerr := scaleSet.getVMSSFromCache()
-	if rerr != nil {
-		klog.Errorf("Failed to get information for VMSS (%q): %v", scaleSet.Name, rerr)
-		return rerr.Error()
+	vmssInfo, err := scaleSet.getVMSSFromCache()
+	if err != nil {
+		klog.Errorf("Failed to get information for VMSS (%q): %v", scaleSet.Name, err)
+		return err
 	}

 	// Update the new capacity to cache.
@@ -254,7 +247,6 @@ func (scaleSet *ScaleSet) SetScaleSetSize(size int64) error {

 	// Proactively set the VMSS size so autoscaler makes better decisions.
 	scaleSet.curSize = size
-	scaleSet.lastSizeRefresh = time.Now()

 	go scaleSet.updateVMSSCapacity(future)
 	return nil
@@ -474,9 +466,9 @@ func (scaleSet *ScaleSet) Debug() string {

 // TemplateNodeInfo returns a node template for this scale set.
 func (scaleSet *ScaleSet) TemplateNodeInfo() (*schedulerframework.NodeInfo, error) {
-	template, rerr := scaleSet.getVMSSFromCache()
-	if rerr != nil {
-		return nil, rerr.Error()
+	template, err := scaleSet.getVMSSFromCache()
+	if err != nil {
+		return nil, err
 	}

 	node, err := buildNodeFromTemplate(scaleSet.Name, template)
cluster-autoscaler/cloudprovider/azure/azure_scale_set_test.go
@@ -18,15 +18,13 @@ package azure

 import (
 	"fmt"
-	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"net/http"
+	"testing"
 	"time"

 	apiv1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
-	"net/http"
 	"sigs.k8s.io/cloud-provider-azure/pkg/azureclients/vmssclient/mockvmssclient"
 	"sigs.k8s.io/cloud-provider-azure/pkg/azureclients/vmssvmclient/mockvmssvmclient"
-	"testing"

 	"github.com/Azure/azure-sdk-for-go/services/compute/mgmt/2020-12-01/compute"
 	"github.com/Azure/go-autorest/autorest/to"
@@ -138,12 +136,9 @@ func TestIncreaseSize(t *testing.T) {
 	err := provider.azureManager.forceRefresh()
 	assert.NoError(t, err)

-	ss := newTestScaleSet(provider.azureManager, "test-asg")
-	ss.lastSizeRefresh = time.Now()
-	ss.sizeRefreshPeriod = 1 * time.Minute
-	ss.curSize = -1
+	ss := newTestScaleSet(provider.azureManager, "test-asg-doesnt-exist")
 	err = ss.IncreaseSize(100)
-	expectedErr := fmt.Errorf("the scale set test-asg is under initialization, skipping IncreaseSize")
+	expectedErr := fmt.Errorf("could not find vmss: test-asg-doesnt-exist")
 	assert.Equal(t, expectedErr, err)

 	registered := provider.azureManager.RegisterNodeGroup(

> Review comment: technically with the changes in #3717, we'll never end up in the "-1" initialization case, because the cache won't have the VMSS and we exit earlier. (keeping it there for now though)
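A side note on the assertion: `assert.Equal` works here because two `fmt.Errorf` values with the same message compare equal under `reflect.DeepEqual`. A variant that compares only the message, using testify's `EqualError` (a sketch, not part of the PR), would be:

```go
// asserts on the error message rather than the error value
assert.EqualError(t, err, "could not find vmss: test-asg-doesnt-exist")
```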
> Review comment: The `sizeRefreshPeriod` was always 0 after #3717. The TTL for VMSS cache refreshes is honoured there, where we depend on the CA core `cloudprovider.Refresh()` calls to update the caches. https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/azure/azure_manager.go#L166-L168
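For context, a compilable paraphrase of the TTL gate that comment links to. The type and field names and the one-minute `refreshInterval` are assumptions based on the linked `azure_manager.go`, not a verbatim copy:

```go
package main

import (
	"fmt"
	"time"
)

// Assumed to mirror azure_manager.go, where the refresh interval is one minute.
const refreshInterval = 1 * time.Minute

type azureManager struct{ lastRefresh time.Time }

func (m *azureManager) forceRefresh() error {
	// Rebuild the VMSS caches, then stamp the refresh time.
	m.lastRefresh = time.Now()
	return nil
}

// Refresh is invoked by the cluster-autoscaler core on each loop; because
// the cache TTL is enforced here, a per-ScaleSet sizeRefreshPeriod is redundant.
func (m *azureManager) Refresh() error {
	if m.lastRefresh.Add(refreshInterval).After(time.Now()) {
		return nil // caches are still fresh
	}
	return m.forceRefresh()
}

func main() {
	m := &azureManager{}
	fmt.Println(m.Refresh()) // <nil>: first call forces a refresh
	fmt.Println(m.Refresh()) // <nil>: second call hits the TTL fast path
}
```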