Add consecutive count check for Auto-scaling (#1703)
* Add consecutive count check for Auto-scaling
Yisaer authored Feb 17, 2020
1 parent b4cda09 commit 56daf5e
Showing 12 changed files with 839 additions and 28 deletions.
36 changes: 36 additions & 0 deletions manifests/crd.yaml
@@ -7060,6 +7060,10 @@ spec:
tidb:
description: TidbAutoScalerSpec describes the spec for tidb auto-scaling
properties:
MetricsTimeDuration:
description: MetricsTimeDuration describes the time duration to be
queried in Prometheus
type: string
maxReplicas:
description: maxReplicas is the upper limit for the number of replicas
to which the autoscaler can scale out. It cannot be less than
@@ -7090,18 +7094,36 @@ spec:
will be set to 500
format: int32
type: integer
scaleInThreshold:
description: ScaleInThreshold describes the consecutive count threshold
for auto-scaling; if the scale-in recommendation is produced this
many consecutive times, the scale-in is performed. If not set,
the default value is 5.
format: int32
type: integer
scaleOutIntervalSeconds:
description: ScaleOutIntervalSeconds represents the interval, in seconds,
between consecutive scale-out operations. If not set, the default
ScaleOutIntervalSeconds is 300.
format: int32
type: integer
scaleOutThreshold:
description: ScaleOutThreshold describes the consecutive count threshold
for auto-scaling; if the scale-out recommendation is produced this
many consecutive times, the scale-out is performed. If not set,
the default value is 3.
format: int32
type: integer
required:
- maxReplicas
type: object
tikv:
description: TikvAutoScalerSpec describes the spec for tikv auto-scaling
properties:
MetricsTimeDuration:
description: MetricsTimeDuration describes the time duration to be
queried in Prometheus
type: string
maxReplicas:
description: maxReplicas is the upper limit for the number of replicas
to which the autoscaler can scale out. It cannot be less than
@@ -7132,12 +7154,26 @@ spec:
will be set to 500
format: int32
type: integer
scaleInThreshold:
description: ScaleInThreshold describes the consecutive count threshold
for auto-scaling; if the scale-in recommendation is produced this
many consecutive times, the scale-in is performed. If not set,
the default value is 5.
format: int32
type: integer
scaleOutIntervalSeconds:
description: ScaleOutIntervalSeconds represents the interval, in seconds,
between consecutive scale-out operations. If not set, the default
ScaleOutIntervalSeconds is 300.
format: int32
type: integer
scaleOutThreshold:
description: ScaleOutThreshold describes the consecutive count threshold
for auto-scaling; if the scale-out recommendation is produced this
many consecutive times, the scale-out is performed. If not set,
the default value is 3.
format: int32
type: integer
required:
- maxReplicas
type: object
63 changes: 63 additions & 0 deletions pkg/apis/pingcap/v1alpha1/openapi_generated.go

(Generated file; diff not rendered.)

18 changes: 18 additions & 0 deletions pkg/apis/pingcap/v1alpha1/tidbclusterautoscaler_types.go
@@ -112,6 +112,24 @@ type BasicAutoScalerSpec struct {
// If not set, the default metric will be set to 80% average CPU utilization.
// +optional
Metrics []v2beta2.MetricSpec `json:"metrics,omitempty"`

// MetricsTimeDuration describes the time duration to be queried in Prometheus
// +optional
MetricsTimeDuration *string `json:"MetricsTimeDuration,omitempty"`

// ScaleOutThreshold describes the consecutive count threshold for auto-scaling;
// if the scale-out recommendation is produced this many consecutive times,
// the scale-out is performed.
// If not set, the default value is 3.
// +optional
ScaleOutThreshold *int32 `json:"scaleOutThreshold,omitempty"`

// ScaleInThreshold describes the consecutive count threshold for auto-scaling;
// if the scale-in recommendation is produced this many consecutive times,
// the scale-in is performed.
// If not set, the default value is 5.
// +optional
ScaleInThreshold *int32 `json:"scaleInThreshold,omitempty"`
}

// TODO: sync status
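For orientation, here is a minimal sketch (not part of this commit) of how the three new BasicAutoScalerSpec fields could be populated from Go. The import path and MaxReplicas follow the existing v1alpha1 API; the pointer helpers and concrete values are illustrative assumptions.

package main

import (
	"fmt"

	"github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1"
)

// Local pointer helpers; stand-ins for whatever the operator uses internally.
func int32Ptr(i int32) *int32 { return &i }
func strPtr(s string) *string { return &s }

func main() {
	// The three fields added in this commit, alongside the required maxReplicas.
	spec := v1alpha1.BasicAutoScalerSpec{
		MaxReplicas:         5,
		MetricsTimeDuration: strPtr("3m"), // window queried from Prometheus
		ScaleOutThreshold:   int32Ptr(3),  // consecutive scale-out results required
		ScaleInThreshold:    int32Ptr(5),  // consecutive scale-in results required
	}
	fmt.Printf("%+v\n", spec)
}

Leaving ScaleOutThreshold and ScaleInThreshold unset falls back to the documented defaults of 3 and 5 respectively.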
15 changes: 15 additions & 0 deletions pkg/apis/pingcap/v1alpha1/zz_generated.deepcopy.go

(Generated file; diff not rendered.)

9 changes: 8 additions & 1 deletion pkg/autoscaler/autoscaler/autoscaler_manager.go
@@ -20,6 +20,7 @@ import (
informers "github.com/pingcap/tidb-operator/pkg/client/informers/externalversions"
v1alpha1listers "github.com/pingcap/tidb-operator/pkg/client/listers/pingcap/v1alpha1"
promClient "github.com/prometheus/client_golang/api"
"k8s.io/apimachinery/pkg/api/errors"
kubeinformers "k8s.io/client-go/informers"
appslisters "k8s.io/client-go/listers/apps/v1"
"k8s.io/client-go/tools/record"
@@ -57,8 +58,14 @@ func (am *autoScalerManager) Sync(tac *v1alpha1.TidbClusterAutoScaler) error {
tcNamespace := tac.Spec.Cluster.Namespace
tc, err := am.tcLister.TidbClusters(tcNamespace).Get(tcName)
if err != nil {
if errors.IsNotFound(err) {
// The target TidbCluster ref has been deleted; empty the auto-scaling status
emptyAutoScalingCountAnn(tac, v1alpha1.TiDBMemberType)
emptyAutoScalingCountAnn(tac, v1alpha1.TiKVMemberType)
}
return err
}
checkAndUpdateTacAnn(tac)
oldTCSpec := tc.Spec.DeepCopy()
if err := am.syncAutoScaling(tc, tac); err != nil {
return err
@@ -94,7 +101,7 @@ func (am *autoScalerManager) syncTidbClusterReplicas(tc *v1alpha1.TidbCluster, o
}

//TODO: sync tac status
func (am *autoScalerManager) syncAutoScalingStatus(tc *v1alpha1.TidbCluster, oldTCSpec *v1alpha1.TidbClusterSpec,
func (am *autoScalerManager) syncAutoScalingStatus(tc *v1alpha1.TidbCluster, oldTc *v1alpha1.TidbClusterSpec,
tac *v1alpha1.TidbClusterAutoScaler) error {
return nil
}
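The emptyAutoScalingCountAnn and checkAndUpdateTacAnn helpers called above are defined in a file that is not rendered on this page. Below is a rough sketch of the behavior the call sites imply, assuming the consecutive count is kept in an annotation on the TidbClusterAutoScaler; the function body and the annotation key are assumptions, not the commit's code.

// Illustrative only: the real helper and its annotation keys live elsewhere in this PR.
func emptyAutoScalingCountAnn(tac *v1alpha1.TidbClusterAutoScaler, memberType v1alpha1.MemberType) {
	if tac.Annotations == nil {
		return
	}
	// Hypothetical key, e.g. "tidb.pingcap.com/tidb-consecutive-count".
	key := "tidb.pingcap.com/" + string(memberType) + "-consecutive-count"
	delete(tac.Annotations, key)
}

Clearing the counter when the target TidbCluster disappears (or when auto-scaling is skipped) prevents a stale consecutive count from triggering a scale operation later.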
43 changes: 32 additions & 11 deletions pkg/autoscaler/autoscaler/tidb_autoscaler.go
@@ -24,38 +24,59 @@ import (

func (am *autoScalerManager) syncTiDB(tc *v1alpha1.TidbCluster, tac *v1alpha1.TidbClusterAutoScaler, client promClient.Client) error {
if tac.Spec.TiDB == nil {
emptyAutoScalingCountAnn(tac, v1alpha1.TiDBMemberType)
return nil
}
sts, err := am.stsLister.StatefulSets(tc.Namespace).Get(operatorUtils.GetStatefulSetName(tc, v1alpha1.TiDBMemberType))
if err != nil {
return err
}
if !checkAutoScalingPrerequisites(tc, sts, v1alpha1.TiDBMemberType) {
emptyAutoScalingCountAnn(tac, v1alpha1.TiDBMemberType)
return nil
}
targetReplicas := tc.Spec.TiDB.Replicas

// TODO: sync tidb.metrics from prometheus
// rate(process_cpu_seconds_total{cluster="tidb",job="tidb"}[threshold Minute])
//for _, _ = range tac.Spec.TiDB.Metrics {
// // revive:disable:empty-block
//}
currentReplicas := tc.Spec.TiDB.Replicas
targetReplicas := calculateRecommendedReplicas(tac, v1alpha1.TiDBMemberType, client)
targetReplicas = limitTargetReplicas(targetReplicas, tac, v1alpha1.TiDBMemberType)
if targetReplicas == tc.Spec.TiDB.Replicas {
emptyAutoScalingCountAnn(tac, v1alpha1.TiDBMemberType)
return nil
}
return syncTiDBAfterCalculated(tc, tac, currentReplicas, targetReplicas)
}

// syncTiDBAfterCalculated checks the consecutive count to avoid jitter, and also checks the interval
// duration between auto-scaling operations. If either condition is not met, the auto-scaling is rejected.
// If the auto-scaling is permitted, the timestamp is recorded and the consecutive count is reset to zero.
func syncTiDBAfterCalculated(tc *v1alpha1.TidbCluster, tac *v1alpha1.TidbClusterAutoScaler, currentReplicas, recommendedReplicas int32) error {
if err := updateConsecutiveCount(tac, v1alpha1.TiDBMemberType, currentReplicas, recommendedReplicas); err != nil {
return err
}

ableToScale, err := checkConsecutiveCount(tac, v1alpha1.TiDBMemberType, currentReplicas, recommendedReplicas)
if err != nil {
return err
}
if !ableToScale {
return nil
}
intervalSeconds := tac.Spec.TiDB.ScaleInIntervalSeconds
if targetReplicas > tc.Spec.TiDB.Replicas {
if recommendedReplicas > currentReplicas {
intervalSeconds = tac.Spec.TiDB.ScaleOutIntervalSeconds
}
ableToScale, err := checkStsAutoScalingInterval(tc, *intervalSeconds, v1alpha1.TiDBMemberType)
ableToScale, err = checkStsAutoScalingInterval(tc, *intervalSeconds, v1alpha1.TiDBMemberType)
if err != nil {
return err
}
if !ableToScale {
return nil
}
tc.Spec.Annotations[label.AnnTiDBLastAutoScalingTimestamp] = time.Now().String()
tc.Spec.TiDB.Replicas = targetReplicas
updateTcTiDBAnnIfScale(tac)
tc.Spec.TiDB.Replicas = recommendedReplicas
return nil
}

func updateTcTiDBAnnIfScale(tac *v1alpha1.TidbClusterAutoScaler) {
tac.Annotations[label.AnnTiDBLastAutoScalingTimestamp] = time.Now().String()
emptyAutoScalingCountAnn(tac, v1alpha1.TiDBMemberType)
}
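updateConsecutiveCount and checkConsecutiveCount, which syncTiDBAfterCalculated relies on, are defined in files not shown here. The sketch below outlines the gating idea — compare the number of consecutive identical recommendations against ScaleOutThreshold or ScaleInThreshold — under the assumption that the count is stored in a tac annotation; the key, parsing, and fallback defaults are illustrative, not the actual implementation (strconv is assumed to be imported).

// Illustrative sketch of the threshold check; not the code from this commit.
func checkConsecutiveCountSketch(tac *v1alpha1.TidbClusterAutoScaler, currentReplicas, recommendedReplicas int32) (bool, error) {
	// Hypothetical annotation carrying the consecutive-recommendation count.
	count, err := strconv.ParseInt(tac.Annotations["tidb.pingcap.com/tidb-consecutive-count"], 10, 32)
	if err != nil {
		return false, err
	}
	threshold := int32(5) // documented scale-in default
	if recommendedReplicas > currentReplicas {
		threshold = 3 // documented scale-out default
		if tac.Spec.TiDB.ScaleOutThreshold != nil {
			threshold = *tac.Spec.TiDB.ScaleOutThreshold
		}
	} else if tac.Spec.TiDB.ScaleInThreshold != nil {
		threshold = *tac.Spec.TiDB.ScaleInThreshold
	}
	return int32(count) >= threshold, nil
}

With the defaults of 3 consecutive scale-out and 5 consecutive scale-in recommendations, a single noisy metric sample no longer changes the replica count.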
(Remaining changed files not rendered.)
