-
Notifications
You must be signed in to change notification settings - Fork 501
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add auto-scaling calculation based by CPU load (#1722)
* add cpu metrics func
- Loading branch information
Showing
9 changed files
with
371 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
// Copyright 2020 PingCAP, Inc. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
package calculate | ||
|
||
import ( | ||
"encoding/json" | ||
"fmt" | ||
"math" | ||
"net/http" | ||
"strconv" | ||
|
||
"github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1" | ||
promClient "github.com/prometheus/client_golang/api" | ||
autoscalingv2beta2 "k8s.io/api/autoscaling/v2beta2" | ||
"k8s.io/apimachinery/pkg/util/sets" | ||
) | ||
|
||
// Exported PromQL templates and messages used by the CPU based auto-scaling
// calculation.
const (
	// TikvSumCpuMetricsPattern is the PromQL template that sums, per instance,
	// the increase of tikv_thread_cpu_seconds_total. Its two %s verbs are the
	// value of the cluster label and the range window, in that order.
	TikvSumCpuMetricsPattern = `sum(increase(tikv_thread_cpu_seconds_total{cluster="%s"}[%s])) by (instance)`
	// TidbSumCpuMetricsPattern is the PromQL template that sums, per instance,
	// the increase of process_cpu_seconds_total for job "tidb". Its two %s
	// verbs are the value of the cluster label and the range window, in that
	// order.
	TidbSumCpuMetricsPattern = `sum(increase(process_cpu_seconds_total{cluster="%s",job="tidb"}[%s])) by (instance)`
	// InvalidTacMetricConfigureMsg is the error format reported for an
	// unusable metric configuration. Its verbs are the namespace and the name
	// of the TidbClusterAutoScaler.
	InvalidTacMetricConfigureMsg = "tac[%s/%s] metric configuration invalid"
)

// Package-internal settings.
const (
	// queryPath is appended to the metrics URL to build the query endpoint.
	queryPath = "/api/v1/query"

	// float64EqualityThreshold is the largest absolute difference at which two
	// float64 values are still treated as equal.
	float64EqualityThreshold = 1e-9
)
|
||
type SingleQuery struct { | ||
Timestamp int64 | ||
Quary string | ||
Instances []string | ||
Metric autoscalingv2beta2.MetricSpec | ||
} | ||
|
||
func queryMetricsFromPrometheus(tac *v1alpha1.TidbClusterAutoScaler, client promClient.Client, sq *SingleQuery, resp *Response) error { | ||
query := sq.Quary | ||
timestamp := sq.Timestamp | ||
req, err := http.NewRequest("GET", fmt.Sprintf("%s%s", *tac.Spec.MetricsUrl, queryPath), nil) | ||
if err != nil { | ||
return err | ||
} | ||
q := req.URL.Query() | ||
q.Add("query", query) | ||
q.Add("time", fmt.Sprintf("%d", timestamp)) | ||
req.URL.RawQuery = q.Encode() | ||
r, body, err := client.Do(req.Context(), req) | ||
if err != nil { | ||
return err | ||
} | ||
if r.StatusCode != http.StatusOK { | ||
return fmt.Errorf("tac[%s/%s] query error, status code:%d", tac.Namespace, tac.Name, r.StatusCode) | ||
} | ||
err = json.Unmarshal(body, resp) | ||
if err != nil { | ||
return err | ||
} | ||
if resp.Status != statusSuccess { | ||
return fmt.Errorf("tac[%s/%s] query error, response status: %v", tac.Namespace, tac.Name, resp.Status) | ||
} | ||
return nil | ||
} | ||
|
||
// sumForEachInstance sum the value in Response of each instance from Prometheus | ||
func sumForEachInstance(instances []string, resp *Response) (float64, error) { | ||
if resp == nil { | ||
return 0, fmt.Errorf("metrics response from Promethus can't be empty") | ||
} | ||
s := sets.String{} | ||
for _, instance := range instances { | ||
s.Insert(instance) | ||
} | ||
sum := 0.0 | ||
if len(resp.Data.Result) < 1 { | ||
return 0, fmt.Errorf("metrics Response return zero info") | ||
} | ||
for _, r := range resp.Data.Result { | ||
if s.Has(r.Metric.Instance) { | ||
v, err := strconv.ParseFloat(r.Value[1].(string), 64) | ||
if err != nil { | ||
return 0.0, err | ||
} | ||
sum = sum + v | ||
} | ||
} | ||
return sum, nil | ||
} | ||
|
||
// calculate func calculate the recommended replicas by given usageRatio and currentReplicas | ||
func calculate(currentValue float64, targetValue float64, currentReplicas int32) (int32, error) { | ||
if almostEqual(targetValue, 0.0) { | ||
return -1, fmt.Errorf("targetValue in calculate func can't be zero") | ||
} | ||
usageRatio := currentValue / targetValue | ||
return int32(math.Ceil(usageRatio * float64(currentReplicas))), nil | ||
} | ||
|
||
func almostEqual(a, b float64) bool { | ||
return math.Abs(a-b) <= float64EqualityThreshold | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
// Copyright 2020 PingCAP, Inc. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
package calculate | ||
|
||
import ( | ||
"fmt" | ||
"time" | ||
|
||
"github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1" | ||
promClient "github.com/prometheus/client_golang/api" | ||
appsv1 "k8s.io/api/apps/v1" | ||
corev1 "k8s.io/api/core/v1" | ||
) | ||
|
||
// CpuSumMetricsErrorMsg is the error format reported when the summed cpu
// metric is negative. Its verbs are the namespace and the name of the
// TidbClusterAutoScaler followed by the queried duration.
const CpuSumMetricsErrorMsg = "tac[%s/%s] cpu sum metrics error, can't calculate the past %s cpu metrics, may caused by prometheus restart while data persistence not enabled"
|
||
//TODO: create issue to explain how auto-scaling algorithm based on cpu metrics work | ||
func CalculateRecomendedReplicasByCpuCosts(tac *v1alpha1.TidbClusterAutoScaler, sq *SingleQuery, sts *appsv1.StatefulSet, | ||
client promClient.Client, memberType v1alpha1.MemberType, duration time.Duration) (int32, error) { | ||
metric := sq.Metric | ||
instances := sq.Instances | ||
|
||
if metric.Resource == nil || metric.Resource.Target.AverageUtilization == nil { | ||
return -1, fmt.Errorf(InvalidTacMetricConfigureMsg, tac.Namespace, tac.Name) | ||
} | ||
currentReplicas := len(instances) | ||
c, err := filterContainer(tac, sts, memberType.String()) | ||
if err != nil { | ||
return -1, err | ||
} | ||
cpuRequestsRatio, err := extractCpuRequestsRatio(c) | ||
if err != nil { | ||
return -1, err | ||
} | ||
r := &Response{} | ||
err = queryMetricsFromPrometheus(tac, client, sq, r) | ||
if err != nil { | ||
return -1, err | ||
} | ||
sum, err := sumForEachInstance(instances, r) | ||
if err != nil { | ||
return -1, err | ||
} | ||
if sum < 0 { | ||
return -1, fmt.Errorf(CpuSumMetricsErrorMsg, tac.Namespace, tac.Name, duration.String()) | ||
} | ||
cpuSecsTotal := sum | ||
durationSeconds := duration.Seconds() | ||
utilizationRatio := float64(*metric.Resource.Target.AverageUtilization) / 100.0 | ||
expectedCpuSecsTotal := cpuRequestsRatio * durationSeconds * float64(currentReplicas) * utilizationRatio | ||
rc, err := calculate(cpuSecsTotal, expectedCpuSecsTotal, int32(currentReplicas)) | ||
if err != nil { | ||
return -1, err | ||
} | ||
return rc, nil | ||
} | ||
|
||
func extractCpuRequestsRatio(c *corev1.Container) (float64, error) { | ||
if c.Resources.Requests.Cpu() == nil || c.Resources.Requests.Cpu().MilliValue() < 1 { | ||
return 0, fmt.Errorf("container[%s] cpu requests is empty", c.Name) | ||
} | ||
return float64(c.Resources.Requests.Cpu().MilliValue()) / 1000.0, nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
// Copyright 2020 PingCAP, Inc. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
package calculate | ||
|
||
import ( | ||
"fmt" | ||
|
||
"github.com/pingcap/tidb-operator/pkg/apis/pingcap/v1alpha1" | ||
appsv1 "k8s.io/api/apps/v1" | ||
autoscalingv2beta2 "k8s.io/api/autoscaling/v2beta2" | ||
corev1 "k8s.io/api/core/v1" | ||
) | ||
|
||
// MetricType names a kind of metric that is supported for calculating the
// recommended replicas.
type MetricType string
|
||
const ( | ||
MetricTypeCPU MetricType = "cpu" | ||
//metricTypeQPS MetricType = "qps" | ||
) | ||
|
||
// currently, we only choose one metrics to be computed. | ||
// If there exists several metrics, we tend to choose ResourceMetricSourceType metric | ||
func FilterMetrics(metrics []autoscalingv2beta2.MetricSpec) autoscalingv2beta2.MetricSpec { | ||
for _, m := range metrics { | ||
if m.Type == autoscalingv2beta2.ResourceMetricSourceType && m.Resource != nil { | ||
return m | ||
} | ||
} | ||
return metrics[0] | ||
} | ||
|
||
// genMetricType return the supported MetricType in Operator by kubernetes auto-scaling MetricType | ||
func GenMetricType(tac *v1alpha1.TidbClusterAutoScaler, metric autoscalingv2beta2.MetricSpec) (MetricType, error) { | ||
if metric.Type == autoscalingv2beta2.ResourceMetricSourceType && metric.Resource != nil && metric.Resource.Name == corev1.ResourceCPU { | ||
return MetricTypeCPU, nil | ||
} | ||
return "", fmt.Errorf(InvalidTacMetricConfigureMsg, tac.Namespace, tac.Name) | ||
} | ||
|
||
// filterContainer is to filter the specific container from the given statefulset(tidb/tikv) | ||
func filterContainer(tac *v1alpha1.TidbClusterAutoScaler, sts *appsv1.StatefulSet, containerName string) (*corev1.Container, error) { | ||
for _, c := range sts.Spec.Template.Spec.Containers { | ||
if c.Name == containerName { | ||
return &c, nil | ||
} | ||
} | ||
return nil, fmt.Errorf("tac[%s/%s]'s Target have not %s container", tac.Namespace, tac.Name, containerName) | ||
} | ||
|
||
// statusSuccess is the Response.Status value of a successful query.
const statusSuccess = "success"
|
||
// Response is used to marshal the data queried from Prometheus | ||
type Response struct { | ||
Status string `json:"status"` | ||
Data Data `json:"data"` | ||
} | ||
|
||
type Data struct { | ||
ResultType string `json:"resultType"` | ||
Result []Result `json:"result"` | ||
} | ||
|
||
type Result struct { | ||
Metric Metric `json:"metric"` | ||
Value []interface{} `json:"value"` | ||
} | ||
|
||
// Metric holds the fields decoded from the "metric" object of a Result.
type Metric struct {
	// Cluster is decoded from the "cluster" key.
	Cluster string `json:"cluster,omitempty"`
	// Instance is decoded from the "instance" key. It is the value matched
	// against the requested instance names.
	Instance string `json:"instance"`
	// Job is decoded from the "job" key.
	Job string `json:"job,omitempty"`
	// KubernetesNamespace is decoded from the "kubernetes_namespace" key.
	KubernetesNamespace string `json:"kubernetes_namespace,omitempty"`
	// KubernetesNode is decoded from the "kubernetes_node" key.
	KubernetesNode string `json:"kubernetes_node,omitempty"`
	// KubernetesPodIp is decoded from the "kubernetes_pod_ip" key.
	KubernetesPodIp string `json:"kubernetes_pod_ip,omitempty"`
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.