scheduling.go
/*
Copyright The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package pod
import (
"time"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/utils/clock"
"sigs.k8s.io/karpenter/pkg/apis/v1alpha5"
"sigs.k8s.io/karpenter/pkg/apis/v1beta1"
"sigs.k8s.io/karpenter/pkg/scheduling"
)
// IsActive checks if Karpenter should consider this pod as running by ensuring that the pod:
// - Isn't a terminal pod (Failed or Succeeded)
// - Isn't actively terminating
func IsActive(pod *v1.Pod) bool {
return !IsTerminal(pod) &&
!IsTerminating(pod)
}
// IsReschedulable checks if Karpenter should consider this pod when re-scheduling to new capacity by ensuring that the pod:
// - Is an active pod (isn't terminal or actively terminating) OR is owned by a StatefulSet and is terminating
// - Isn't owned by a DaemonSet
// - Isn't a mirror pod (https://kubernetes.io/docs/tasks/configure-pod-container/static-pod/)
func IsReschedulable(pod *v1.Pod) bool {
// StatefulSet pods can be handled differently here because we know that StatefulSet pods MUST
// get deleted before new pods are re-created. This means that we can model terminating StatefulSet pods
// differently for higher availability, by still considering them when scheduling to new capacity
return (IsActive(pod) || (IsOwnedByStatefulSet(pod) && IsTerminating(pod))) &&
!IsOwnedByDaemonSet(pod) &&
!IsOwnedByNode(pod)
}
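// Usage sketch (assuming metav1 refers to "k8s.io/apimachinery/pkg/apis/meta/v1", which is not imported
// above): a terminating pod owned by a StatefulSet is still considered reschedulable, while the same
// terminating pod without the StatefulSet owner would not be.
//
//	pod := &v1.Pod{}
//	pod.OwnerReferences = []metav1.OwnerReference{{APIVersion: "apps/v1", Kind: "StatefulSet", Name: "web"}}
//	pod.DeletionTimestamp = &metav1.Time{Time: time.Now()}
//	IsReschedulable(pod) // true: terminating StatefulSet pods are still modeled as needing capacity
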
// IsEvictable checks if a pod is evictable by Karpenter by ensuring that the pod:
// - Is an active pod (isn't terminal or actively terminating)
// - Doesn't tolerate the "karpenter.sh/disruption=disrupting" taint
// - Isn't a mirror pod (https://kubernetes.io/docs/tasks/configure-pod-container/static-pod/)
func IsEvictable(pod *v1.Pod) bool {
return IsActive(pod) &&
!ToleratesDisruptionNoScheduleTaint(pod) &&
!IsOwnedByNode(pod)
}
// IsWaitingEviction checks if this is a pod that we are waiting to have removed from the node by ensuring that the pod:
// - Isn't a terminal pod (Failed or Succeeded)
// - Isn't a pod that has been terminating past its terminationGracePeriodSeconds
// - Doesn't tolerate the "karpenter.sh/disruption=disrupting" taint
// - Isn't a mirror pod (https://kubernetes.io/docs/tasks/configure-pod-container/static-pod/)
func IsWaitingEviction(pod *v1.Pod, clk clock.Clock) bool {
return !IsTerminal(pod) &&
!IsStuckTerminating(pod, clk) &&
!ToleratesDisruptionNoScheduleTaint(pod) &&
// Mirror pods cannot be deleted through the API server since they are created and managed by kubelet
// This means they are effectively read-only and can't be controlled by API server calls
// https://kubernetes.io/docs/reference/generated/kubectl/kubectl-commands#drain
!IsOwnedByNode(pod)
}
// IsProvisionable checks if a pod needs to be scheduled to new capacity by Karpenter by ensuring that the pod:
// - Has been marked as "Unschedulable" in the PodScheduled reason by the kube-scheduler
// - Has not been bound to a node
// - Isn't currently preempting other pods on the cluster and about to schedule
// - Isn't owned by a DaemonSet
// - Isn't a mirror pod (https://kubernetes.io/docs/tasks/configure-pod-container/static-pod/)
func IsProvisionable(pod *v1.Pod) bool {
return FailedToSchedule(pod) &&
!IsScheduled(pod) &&
!IsPreempting(pod) &&
!IsOwnedByDaemonSet(pod) &&
!IsOwnedByNode(pod)
}
// IsDisruptable checks if a pod can be disrupted by Karpenter by validating the `karpenter.sh/do-not-disrupt` annotation on the pod.
// A pod is NOT disruptable when both of the following are true:
// - It has the `karpenter.sh/do-not-disrupt` annotation
// - It is an actively running pod
func IsDisruptable(pod *v1.Pod) bool {
return !(IsActive(pod) && HasDoNotDisrupt(pod))
}
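// Usage sketch: an actively running pod carrying the do-not-disrupt annotation is not disruptable;
// once the pod is terminal or terminating, the annotation no longer blocks disruption.
//
//	pod := &v1.Pod{}
//	pod.Annotations = map[string]string{v1beta1.DoNotDisruptAnnotationKey: "true"}
//	IsDisruptable(pod) // false while the pod is active
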
// FailedToSchedule checks that the kube-scheduler has seen this pod and has intentionally
// marked it with a condition, noting that it thinks the pod can't schedule anywhere.
// It does this by checking that the pod's "PodScheduled" status condition has the "Unschedulable" reason.
// Note that a different scheduler may be scheduling the pod and may use different semantics
// (e.g. Fargate on AWS marks unschedulable pods with MATCH_NODE_SELECTOR_FAILED). In that case, Karpenter
// won't react to the pod because the scheduler didn't add this specific condition.
func FailedToSchedule(pod *v1.Pod) bool {
for _, condition := range pod.Status.Conditions {
if condition.Type == v1.PodScheduled && condition.Reason == v1.PodReasonUnschedulable {
return true
}
}
return false
}
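// Usage sketch: a pod marked Unschedulable by the kube-scheduler that is unbound, not preempting,
// and has no DaemonSet or Node owner is exactly what IsProvisionable picks up.
//
//	pod := &v1.Pod{}
//	pod.Status.Conditions = []v1.PodCondition{{Type: v1.PodScheduled, Status: v1.ConditionFalse, Reason: v1.PodReasonUnschedulable}}
//	FailedToSchedule(pod) // true
//	IsProvisionable(pod)  // true: no NodeName, no NominatedNodeName, no DaemonSet/Node owner
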
func IsScheduled(pod *v1.Pod) bool {
return pod.Spec.NodeName != ""
}
func IsPreempting(pod *v1.Pod) bool {
return pod.Status.NominatedNodeName != ""
}
func IsTerminal(pod *v1.Pod) bool {
return pod.Status.Phase == v1.PodFailed || pod.Status.Phase == v1.PodSucceeded
}
func IsTerminating(pod *v1.Pod) bool {
return pod.DeletionTimestamp != nil
}
func IsStuckTerminating(pod *v1.Pod, clk clock.Clock) bool {
// The pod DeletionTimestamp will be set to the time the pod was deleted plus its
// grace period in seconds. We give an additional minute as a buffer to allow
// pods to force delete off the node before we actually go and terminate the node
// so that we reduce pod leaking on the cluster.
return IsTerminating(pod) && clk.Since(pod.DeletionTimestamp.Time) > time.Minute
}
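// Usage sketch (assuming metav1 = "k8s.io/apimachinery/pkg/apis/meta/v1" and
// clocktesting = "k8s.io/utils/clock/testing", neither imported above): a pod only counts as
// stuck once it has been terminating for longer than the one-minute buffer.
//
//	clk := clocktesting.NewFakeClock(time.Now())
//	pod := &v1.Pod{}
//	pod.DeletionTimestamp = &metav1.Time{Time: clk.Now()}
//	IsStuckTerminating(pod, clk) // false: still within the buffer
//	clk.Step(2 * time.Minute)
//	IsStuckTerminating(pod, clk) // true
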
func IsOwnedByStatefulSet(pod *v1.Pod) bool {
return IsOwnedBy(pod, []schema.GroupVersionKind{
{Group: "apps", Version: "v1", Kind: "StatefulSet"},
})
}
func IsOwnedByDaemonSet(pod *v1.Pod) bool {
return IsOwnedBy(pod, []schema.GroupVersionKind{
{Group: "apps", Version: "v1", Kind: "DaemonSet"},
})
}
// IsOwnedByNode returns true if the pod is a static pod owned by a specific node
func IsOwnedByNode(pod *v1.Pod) bool {
return IsOwnedBy(pod, []schema.GroupVersionKind{
{Version: "v1", Kind: "Node"},
})
}
func IsOwnedBy(pod *v1.Pod, gvks []schema.GroupVersionKind) bool {
for _, ignoredOwner := range gvks {
for _, owner := range pod.ObjectMeta.OwnerReferences {
if owner.APIVersion == ignoredOwner.GroupVersion().String() && owner.Kind == ignoredOwner.Kind {
return true
}
}
}
return false
}
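// Usage sketch (assuming metav1 = "k8s.io/apimachinery/pkg/apis/meta/v1"): ownership is matched on the
// owner reference's APIVersion and Kind, which is how DaemonSet-owned pods get filtered out of the
// provisioning and rescheduling decisions above.
//
//	pod := &v1.Pod{}
//	pod.OwnerReferences = []metav1.OwnerReference{{APIVersion: "apps/v1", Kind: "DaemonSet", Name: "node-exporter"}}
//	IsOwnedByDaemonSet(pod) // true
//	IsOwnedByNode(pod)      // false
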
func HasDoNotDisrupt(pod *v1.Pod) bool {
if pod.Annotations == nil {
return false
}
// TODO Remove checking do-not-evict as part of v1
return pod.Annotations[v1alpha5.DoNotEvictPodAnnotationKey] == "true" ||
pod.Annotations[v1beta1.DoNotDisruptAnnotationKey] == "true"
}
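// Usage sketch: both the current v1beta1 annotation and the legacy v1alpha5 do-not-evict annotation
// are honored until the legacy key is removed per the TODO above.
//
//	pod := &v1.Pod{}
//	pod.Annotations = map[string]string{v1alpha5.DoNotEvictPodAnnotationKey: "true"}
//	HasDoNotDisrupt(pod) // true via the legacy annotation
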
// ToleratesDisruptionNoScheduleTaint returns true if the pod tolerates the karpenter.sh/disruption=disrupting:NoSchedule taint
func ToleratesDisruptionNoScheduleTaint(pod *v1.Pod) bool {
return scheduling.Taints([]v1.Taint{v1beta1.DisruptionNoScheduleTaint}).Tolerates(pod) == nil
}
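// Usage sketch: a pod that tolerates the disruption taint (here via an Exists toleration on its key)
// is skipped by the eviction checks above (IsEvictable, IsWaitingEviction).
//
//	pod := &v1.Pod{}
//	pod.Spec.Tolerations = []v1.Toleration{{Key: v1beta1.DisruptionNoScheduleTaint.Key, Operator: v1.TolerationOpExists}}
//	ToleratesDisruptionNoScheduleTaint(pod) // true
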
// HasRequiredPodAntiAffinity returns true if a non-empty PodAntiAffinity/RequiredDuringSchedulingIgnoredDuringExecution
// is defined in the pod spec
func HasRequiredPodAntiAffinity(pod *v1.Pod) bool {
return HasPodAntiAffinity(pod) &&
len(pod.Spec.Affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution) != 0
}
// HasPodAntiAffinity returns true if a non-empty PodAntiAffinity is defined in the pod spec
func HasPodAntiAffinity(pod *v1.Pod) bool {
return pod.Spec.Affinity != nil && pod.Spec.Affinity.PodAntiAffinity != nil &&
(len(pod.Spec.Affinity.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution) != 0 ||
len(pod.Spec.Affinity.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution) != 0)
}
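// Usage sketch: a required pod anti-affinity term makes both checks return true; a pod with only
// preferred anti-affinity would satisfy HasPodAntiAffinity but not HasRequiredPodAntiAffinity.
//
//	pod := &v1.Pod{}
//	pod.Spec.Affinity = &v1.Affinity{PodAntiAffinity: &v1.PodAntiAffinity{
//		RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{{TopologyKey: "kubernetes.io/hostname"}},
//	}}
//	HasRequiredPodAntiAffinity(pod) // true
//	HasPodAntiAffinity(pod)         // true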