use topology to achieve HA #2366

Merged (3 commits, May 8, 2020)
1 change: 1 addition & 0 deletions charts/tidb-cluster/templates/tidb-cluster.yaml
@@ -7,6 +7,7 @@ metadata:
pingcap.com/pd.{{ template "cluster.name" . }}-pd.sha: {{ include "pd-configmap.data-digest" . | quote }}
pingcap.com/tikv.{{ template "cluster.name" . }}-tikv.sha: {{ include "tikv-configmap.data-digest" . | quote }}
pingcap.com/tidb.{{ template "cluster.name" . }}-tidb.sha: {{ include "tidb-configmap.data-digest" . | quote }}
pingcap.com/ha-topology-key: {{ .Values.haTopologyKey | default "kubernetes.io/hostname" }}
{{- end }}
labels:
app.kubernetes.io/name: {{ template "chart.name" . }}
4 changes: 4 additions & 0 deletions charts/tidb-cluster/values.yaml
@@ -64,6 +64,10 @@ discovery:
# if the ConfigMap was not changed.
enableConfigMapRollout: true

# The scheduler achieves HA scheduling based on this topology key.
# You can change it to another node label.
haTopologyKey: kubernetes.io/hostname

# Whether enable the TLS connection between TiDB server components
tlsCluster:
# The steps to enable this feature:
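A quick illustration of what this key controls (not part of the chart or the operator; the helper name and the sample objects below are made up): the HA predicate groups nodes by the value of this label, so the default `kubernetes.io/hostname` spreads pods across individual nodes, while a zone label such as `topology.kubernetes.io/zone` spreads them across zones.

```go
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// podsPerTopology counts pods per distinct value of topologyKey on the nodes
// they are bound to; this is the grouping the HA predicate tries to balance.
func podsPerTopology(topologyKey string, pods []corev1.Pod, nodes []corev1.Node) map[string]int {
	nodeTopology := make(map[string]string)
	for _, node := range nodes {
		if v, ok := node.Labels[topologyKey]; ok {
			nodeTopology[node.Name] = v
		}
	}
	counts := make(map[string]int)
	for _, pod := range pods {
		if t, ok := nodeTopology[pod.Spec.NodeName]; ok {
			counts[t]++
		}
	}
	return counts
}

func main() {
	nodes := []corev1.Node{
		{ObjectMeta: metav1.ObjectMeta{Name: "node-1", Labels: map[string]string{"topology.kubernetes.io/zone": "zone-a"}}},
		{ObjectMeta: metav1.ObjectMeta{Name: "node-2", Labels: map[string]string{"topology.kubernetes.io/zone": "zone-a"}}},
	}
	pods := []corev1.Pod{
		{Spec: corev1.PodSpec{NodeName: "node-1"}},
		{Spec: corev1.PodSpec{NodeName: "node-2"}},
	}
	// With a zone key both pods fall into one topology ("zone-a": 2); with the
	// default hostname key they would count as two separate topologies.
	fmt.Println(podsPerTopology("topology.kubernetes.io/zone", pods, nodes))
}
```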
3 changes: 3 additions & 0 deletions pkg/label/label.go
@@ -61,6 +61,9 @@ const (
// BackupProtectionFinalizer is the name of finalizer on backups
BackupProtectionFinalizer string = "tidb.pingcap.com/backup-protection"

// AnnHATopologyKey is the annotation key for the node label (topology key) that high availability is based on
AnnHATopologyKey = "pingcap.com/ha-topology-key"

// AnnFailTiDBScheduler is for injecting a failure into the TiDB custom scheduler
// A pod with this annotation will produce an error when scheduled.
AnnFailTiDBScheduler string = "tidb.pingcap.com/fail-scheduler"
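The constant above is consumed by the scheduler; a minimal sketch of the lookup with its hostname fallback (the helper name is hypothetical — the PR does this inline in Filter, as the next file shows):

```go
package predicates

// annHATopologyKey mirrors label.AnnHATopologyKey from the diff above.
const annHATopologyKey = "pingcap.com/ha-topology-key"

// resolveTopologyKey returns the topology key configured on the TidbCluster
// annotations, or the hostname label when the annotation is unset, matching
// the chart default haTopologyKey: kubernetes.io/hostname.
func resolveTopologyKey(annotations map[string]string) string {
	if key := annotations[annHATopologyKey]; key != "" {
		return key
	}
	return "kubernetes.io/hostname"
}
```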
112 changes: 69 additions & 43 deletions pkg/scheduler/predicates/ha.go
@@ -120,32 +120,44 @@ func (h *ha) Filter(instanceName string, pod *apiv1.Pod, nodes []apiv1.Node) ([]
replicas := getReplicasFrom(tc, component)
klog.Infof("ha: tidbcluster %s/%s component %s replicas %d", ns, tcName, component, replicas)

allNodes := make(sets.String)
nodeMap := make(map[string][]string)
var topologyKey string
if tc.Annotations[label.AnnHATopologyKey] != "" {
topologyKey = tc.Annotations[label.AnnHATopologyKey]
} else {
topologyKey = "kubernetes.io/hostname"
}
klog.Infof("current topology key: %s", topologyKey)

allTopologies := make(sets.String)
topologyMap := make(map[string]sets.String)

for _, node := range nodes {
nodeMap[node.GetName()] = make([]string, 0)
topologyMap[node.Labels[topologyKey]] = make(sets.String)
}
for _, pod := range podList.Items {
pName := pod.GetName()
nodeName := pod.Spec.NodeName
if nodeName != "" {
allNodes.Insert(nodeName)

topology := getTopologyFromNode(topologyKey, nodeName, nodes)
if topology != "" {
allTopologies.Insert(topology)
}
if nodeName == "" || nodeMap[nodeName] == nil {
if topology == "" || topologyMap[topology] == nil {
klog.Infof("pod %s is not bind", pName)
continue
}

nodeMap[nodeName] = append(nodeMap[nodeName], pName)
topologyMap[topology] = topologyMap[topology].Insert(pName)
}
klog.V(4).Infof("nodeMap: %+v", nodeMap)
klog.V(4).Infof("topologyMap: %+v", topologyMap)

min := -1
minNodeNames := make([]string, 0)
maxPodsPerNode := 0
minTopologies := make([]string, 0)
maxPodsPerTopology := 0

if component == label.PDLabelVal {
/**
* replicas maxPodsPerNode
* replicas maxPodsPerTopology
* ---------------------------
* 1 1
* 2 1
@@ -154,20 +166,20 @@ func (h *ha) Filter(instanceName string, pod *apiv1.Pod, nodes []apiv1.Node) ([]
* 5 2
* ...
*/
maxPodsPerNode = int((replicas+1)/2) - 1
if maxPodsPerNode <= 0 {
maxPodsPerNode = 1
maxPodsPerTopology = int((replicas+1)/2) - 1
if maxPodsPerTopology <= 0 {
maxPodsPerTopology = 1
}
} else {
// 1. TiKV instances must run on at least 3 nodes, otherwise HA is not possible
if allNodes.Len() < 3 {
maxPodsPerNode = 1
// 1. TiKV instances must run on at least 3 nodes (topologies), otherwise HA is not possible
if allTopologies.Len() < 3 {
maxPodsPerTopology = 1
} else {
/**
* 2. we requires TiKV instances to run on at least 3 nodes, so max
* allowed pods on each node is ceil(replicas / 3)
* 2. we require TiKV instances to run on at least 3 nodes (topologies), so the max
* allowed pods in each topology is ceil(replicas / 3)
*
* replicas maxPodsPerNode best HA on three nodes
* replicas maxPodsPerTopology best HA on three topologies
* ---------------------------------------------------
* 3 1 1, 1, 1
* 4 2 1, 1, 2
@@ -177,57 +189,57 @@ func (h *ha) Filter(instanceName string, pod *apiv1.Pod, nodes []apiv1.Node) ([]
* 8 3 2, 3, 3
* ...
*/
maxPodsPerNode = int(math.Ceil(float64(replicas) / 3))
maxPodsPerTopology = int(math.Ceil(float64(replicas) / 3))
}
}

for nodeName, podNames := range nodeMap {
for topology, podNames := range topologyMap {
podsCount := len(podNames)

// fewer than 3 TiKV replicas cannot achieve high availability
if component == label.TiKVLabelVal && replicas < 3 {
minNodeNames = append(minNodeNames, nodeName)
klog.Infof("replicas is %d, add node %s to minNodeNames", replicas, nodeName)
minTopologies = append(minTopologies, topology)
klog.Infof("replicas is %d, add topology %s to minTopologies", replicas, topology)
continue
}

if podsCount+1 > maxPodsPerNode {
// pods on this node exceeds the limit, skip
klog.Infof("node %s has %d instances of component %s, max allowed is %d, skipping",
nodeName, podsCount, component, maxPodsPerNode)
if podsCount+1 > maxPodsPerTopology {
// pods in this topology exceed the limit, skip
klog.Infof("topology %s has %d instances of component %s, max allowed is %d, skipping",
topology, podsCount, component, maxPodsPerTopology)
continue
}

// Choose nodes which has minimum count of the component
// Choose topologies which have the minimum count of the component
if min == -1 {
min = podsCount
}
if podsCount > min {
klog.Infof("node %s podsCount %d > min %d, skipping", nodeName, podsCount, min)
klog.Infof("topology %s podsCount %d > min %d, skipping", topology, podsCount, min)
continue
}
if podsCount < min {
min = podsCount
minNodeNames = make([]string, 0)
minTopologies = make([]string, 0)
}
minNodeNames = append(minNodeNames, nodeName)
minTopologies = append(minTopologies, topology)
}

if len(minNodeNames) == 0 {
nodesStrArr := []string{}
for nodeName, podNameArr := range nodeMap {
s := fmt.Sprintf("%s (%d %s pods)",
nodeName, len(podNameArr), strings.ToLower(component))
nodesStrArr = append(nodesStrArr, s)
if len(minTopologies) == 0 {
topologyStrArr := []string{}
for topology, podNames := range topologyMap {
s := fmt.Sprintf("%s (%d %s pods)", topology, podNames.Len(), strings.ToLower(component))
topologyStrArr = append(topologyStrArr, s)
}
sort.Strings(nodesStrArr)
sort.Strings(topologyStrArr)

// example: unable to schedule to nodes: kube-node-1 (1 pd pods), kube-node-2 (1 pd pods), max pods per node: 1
errMsg := fmt.Sprintf("unable to schedule to nodes: %s, max pods per node: %d",
strings.Join(nodesStrArr, ", "), maxPodsPerNode)
// example: unable to schedule to topologies: kube-node-1 (1 pd pods), kube-node-2 (1 pd pods), max pods per topology: 1
errMsg := fmt.Sprintf("unable to schedule to topology: %s, max pods per topology: %d",
strings.Join(topologyStrArr, ", "), maxPodsPerTopology)
return nil, errors.New(errMsg)
}
return getNodeFromNames(nodes, minNodeNames), nil

return getNodeFromTopologies(nodes, topologyKey, minTopologies), nil
}

// kubernetes scheduling is parallel, to achieve HA, we must ensure the scheduling is serial,
@@ -392,3 +404,17 @@ func GetNodeNames(nodes []apiv1.Node) []string {
func getPodNameFromPVC(pvc *apiv1.PersistentVolumeClaim) string {
return strings.TrimPrefix(pvc.Name, fmt.Sprintf("%s-", pvc.Labels[label.ComponentLabelKey]))
}

func getTopologyFromNode(topologyKey string, nodeName string, nodes []apiv1.Node) string {
var topology string
for _, node := range nodes {
if _, ok := node.Labels[topologyKey]; !ok {
continue
}
if node.Name == nodeName {
topology = node.Labels[topologyKey]
break
}
}
return topology
}
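To make the per-topology caps from the comment tables concrete, here is a standalone sketch of the same arithmetic (the function and the component strings are simplified stand-ins; the PR computes this inline in Filter using label.PDLabelVal and the replica count from the TidbCluster spec):

```go
package main

import (
	"fmt"
	"math"
)

// maxPodsPerTopology reproduces the caps described in the comments: PD allows
// floor((replicas+1)/2)-1 pods per topology (never less than 1); TiKV allows 1
// per topology until at least 3 topologies are in use, then ceil(replicas/3).
func maxPodsPerTopology(component string, replicas int32, topologies int) int {
	if component == "pd" {
		max := int((replicas+1)/2) - 1
		if max <= 0 {
			max = 1
		}
		return max
	}
	if topologies < 3 {
		return 1
	}
	return int(math.Ceil(float64(replicas) / 3))
}

func main() {
	fmt.Println(maxPodsPerTopology("pd", 5, 3))   // 2, matching the PD table
	fmt.Println(maxPodsPerTopology("tikv", 8, 3)) // 3, matching the TiKV table
}
```

Running it reproduces the rows shown above: PD with 5 replicas allows 2 pods per topology, and TiKV with 8 replicas over at least 3 topologies allows 3.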