Skip to content

Commit

Permalink
Merge pull request #4022 from amrmahdi/amrh/nodegroupminmaxmetrics
Browse files Browse the repository at this point in the history
[cluster-autoscaler] Publish node group min/max metrics
  • Loading branch information
k8s-ci-robot authored Jul 5, 2021
2 parents 7858da6 + 8b2aee0 commit 9f84d39
Show file tree
Hide file tree
Showing 6 changed files with 85 additions and 3 deletions.
1 change: 1 addition & 0 deletions cluster-autoscaler/FAQ.md
Original file line number Diff line number Diff line change
Expand Up @@ -704,6 +704,7 @@ The following startup parameters are supported for cluster autoscaler:
| `max-node-provision-time` | Maximum time CA waits for node to be provisioned | 15 minutes
| `nodes` | sets min,max size and other configuration data for a node group in a format accepted by cloud provider. Can be used multiple times. Format: <min>:<max>:<other...> | ""
| `node-group-auto-discovery` | One or more definition(s) of node group auto-discovery.<br>A definition is expressed `<name of discoverer>:[<key>[=<value>]]`<br>The `aws`, `gce`, and `azure` cloud providers are currently supported. AWS matches by ASG tags, e.g. `asg:tag=tagKey,anotherTagKey`<br>GCE matches by IG name prefix, and requires you to specify min and max nodes per IG, e.g. `mig:namePrefix=pfx,min=0,max=10`<br> Azure matches by tags on VMSS, e.g. `label:foo=bar`, and will auto-detect `min` and `max` tags on the VMSS to set scaling limits.<br>Can be used multiple times | ""
| `emit-per-nodegroup-metrics` | If true, emit per node group metrics. | false
| `estimator` | Type of resource estimator to be used in scale up | binpacking
| `expander` | Type of node group expander to be used in scale up. | random
| `write-status-configmap` | Should CA write status information to a configmap | true
Expand Down
2 changes: 1 addition & 1 deletion cluster-autoscaler/core/scale_up_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -978,7 +978,7 @@ func TestCheckScaleUpDeltaWithinLimits(t *testing.T) {
}

func TestAuthError(t *testing.T) {
metrics.RegisterAll()
metrics.RegisterAll(false)
context, err := NewScaleTestAutoscalingContext(config.AutoscalingOptions{}, &fake.Clientset{}, nil, nil, nil)
assert.NoError(t, err)

Expand Down
6 changes: 6 additions & 0 deletions cluster-autoscaler/core/static_autoscaler.go
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,12 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) errors.AutoscalerError
return errors.ToAutoscalerError(errors.CloudProviderError, err)
}

// Update node groups min/max after cloud provider refresh
for _, nodeGroup := range a.AutoscalingContext.CloudProvider.NodeGroups() {
metrics.UpdateNodeGroupMin(nodeGroup.Id(), nodeGroup.MinSize())
metrics.UpdateNodeGroupMax(nodeGroup.Id(), nodeGroup.MaxSize())
}

nonExpendableScheduledPods := core_utils.FilterOutExpendablePods(originalScheduledPods, a.ExpendablePodsPriorityCutoff)
// Initialize cluster state to ClusterSnapshot
if typedErr := a.initializeClusterSnapshot(allNodes, nonExpendableScheduledPods); typedErr != nil {
Expand Down
4 changes: 3 additions & 1 deletion cluster-autoscaler/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,8 @@ var (
daemonSetEvictionForEmptyNodes = flag.Bool("daemonset-eviction-for-empty-nodes", false, "DaemonSet pods will be gracefully terminated from empty nodes")
daemonSetEvictionForOccupiedNodes = flag.Bool("daemonset-eviction-for-occupied-nodes", true, "DaemonSet pods will be gracefully terminated from non-empty nodes")
userAgent = flag.String("user-agent", "cluster-autoscaler", "User agent used for HTTP calls.")

emitPerNodeGroupMetrics = flag.Bool("emit-per-nodegroup-metrics", false, "If true, emit per node group metrics.")
)

func createAutoscalingOptions() config.AutoscalingOptions {
Expand Down Expand Up @@ -342,7 +344,7 @@ func buildAutoscaler() (core.Autoscaler, error) {
}

func run(healthCheck *metrics.HealthCheck) {
metrics.RegisterAll()
metrics.RegisterAll(*emitPerNodeGroupMetrics)

autoscaler, err := buildAutoscaler()
if err != nil {
Expand Down
33 changes: 32 additions & 1 deletion cluster-autoscaler/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,22 @@ var (
}, []string{"direction"},
)

// nodesGroupMinNodes is a gauge vector, keyed by the "node_group" label, that
// holds the minimum node count of each node group. It is written through
// UpdateNodeGroupMin and is only registered by RegisterAll when per node
// group metrics are enabled.
nodesGroupMinNodes = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "node_group_min_count",
Help: "Minimum number of nodes in the node group",
}, []string{"node_group"},
)

// nodesGroupMaxNodes is a gauge vector, keyed by the "node_group" label, that
// holds the maximum node count of each node group. It is written through
// UpdateNodeGroupMax and is only registered by RegisterAll when per node
// group metrics are enabled.
nodesGroupMaxNodes = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Namespace: caNamespace,
Name: "node_group_max_count",
Help: "Maximum number of nodes in the node group",
}, []string{"node_group"},
)

/**** Metrics related to autoscaler execution ****/
lastActivity = k8smetrics.NewGaugeVec(
&k8smetrics.GaugeOpts{
Expand Down Expand Up @@ -315,7 +331,7 @@ var (
)

// RegisterAll registers all metrics.
func RegisterAll() {
func RegisterAll(emitPerNodeGroupMetrics bool) {
legacyregistry.MustRegister(clusterSafeToAutoscale)
legacyregistry.MustRegister(nodesCount)
legacyregistry.MustRegister(nodeGroupsCount)
Expand All @@ -342,6 +358,11 @@ func RegisterAll() {
legacyregistry.MustRegister(napEnabled)
legacyregistry.MustRegister(nodeGroupCreationCount)
legacyregistry.MustRegister(nodeGroupDeletionCount)

if emitPerNodeGroupMetrics {
legacyregistry.MustRegister(nodesGroupMinNodes)
legacyregistry.MustRegister(nodesGroupMaxNodes)
}
}

// UpdateDurationFromStart records the duration of the step identified by the
Expand Down Expand Up @@ -423,6 +444,16 @@ func UpdateMemoryLimitsBytes(minMemoryCount int64, maxMemoryCount int64) {
memoryLimitsBytes.WithLabelValues("maximum").Set(float64(maxMemoryCount))
}

// UpdateNodeGroupMin publishes minNodes as the current value of the
// node_group_min_count gauge for the series labeled with nodeGroup.
func UpdateNodeGroupMin(nodeGroup string, minNodes int) {
	gauge := nodesGroupMinNodes.WithLabelValues(nodeGroup)
	gauge.Set(float64(minNodes))
}

// UpdateNodeGroupMax publishes maxNodes as the current value of the
// node_group_max_count gauge for the series labeled with nodeGroup.
func UpdateNodeGroupMax(nodeGroup string, maxNodes int) {
	gauge := nodesGroupMaxNodes.WithLabelValues(nodeGroup)
	gauge.Set(float64(maxNodes))
}

// RegisterError records any errors preventing Cluster Autoscaler from working.
// No more than one error should be recorded per loop.
func RegisterError(err errors.AutoscalerError) {
Expand Down
42 changes: 42 additions & 0 deletions cluster-autoscaler/metrics/metrics_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package metrics

import (
"testing"

"github.com/prometheus/client_golang/prometheus/testutil"
"github.com/stretchr/testify/assert"
)

// TestDisabledPerNodeGroupMetrics checks that calling RegisterAll with the
// per node group flag turned off leaves the min/max node group gauges
// uncreated.
//
// NOTE(review): the gauges are package-level variables, so this test appears
// to rely on running before any test that calls RegisterAll(true) in the same
// process — confirm if test ordering or shuffling is ever changed.
func TestDisabledPerNodeGroupMetrics(t *testing.T) {
RegisterAll(false)
assert.False(t, nodesGroupMinNodes.IsCreated())
assert.False(t, nodesGroupMaxNodes.IsCreated())
}

// TestEnabledPerNodeGroupMetrics checks that calling RegisterAll with the per
// node group flag turned on creates the min/max node group gauges, and that
// values written through UpdateNodeGroupMin / UpdateNodeGroupMax can be read
// back from the series carrying the matching node group label.
func TestEnabledPerNodeGroupMetrics(t *testing.T) {
	const (
		group   = "foo"
		wantMin = 2
		wantMax = 100
	)

	RegisterAll(true)
	assert.True(t, nodesGroupMinNodes.IsCreated())
	assert.True(t, nodesGroupMaxNodes.IsCreated())

	UpdateNodeGroupMin(group, wantMin)
	UpdateNodeGroupMax(group, wantMax)

	// Read the raw gauge values back and compare them as integers.
	gotMin := int(testutil.ToFloat64(nodesGroupMinNodes.GaugeVec.WithLabelValues(group)))
	assert.Equal(t, wantMin, gotMin)

	gotMax := int(testutil.ToFloat64(nodesGroupMaxNodes.GaugeVec.WithLabelValues(group)))
	assert.Equal(t, wantMax, gotMax)
}

0 comments on commit 9f84d39

Please sign in to comment.