From e3e7cf4cf33931ab9621dec0a7e521adfe11bf58 Mon Sep 17 00:00:00 2001
From: Mark Mandel
Date: Tue, 30 Oct 2018 15:26:53 -0700
Subject: [PATCH] Cluster Autoscaling: safe-to-evict=false annotations for
 GameServer Pods

This is the final piece in ensuring that the Kubernetes Cluster
Autoscaler works with Agones. It ensures that `GameServer` Pods cannot
be evicted from the cluster, via the annotation the autoscaler uses to
determine that a Pod is unsafe to evict.

The same annotation has also been placed on the controller's Pod, but
it can be turned off via a Helm chart variable.

I expect that cluster autoscaling and the backing strategies will get
tweaked for performance and resource usage as we gain more real-world
experience with it, but this is working relatively nicely right now.

Closes #368
---
 docs/scheduling_autoscaling.md                | 72 +++++++++++++------
 install/helm/agones/templates/controller.yaml |  3 +-
 install/helm/agones/values.yaml               |  1 +
 install/yaml/install.yaml                     |  2 +
 pkg/apis/stable/v1alpha1/gameserver.go        |  7 +-
 pkg/apis/stable/v1alpha1/gameserver_test.go   | 37 ++++++++++
 6 files changed, 100 insertions(+), 22 deletions(-)

diff --git a/docs/scheduling_autoscaling.md b/docs/scheduling_autoscaling.md
index 1b0747dac0..1d05bf6752 100644
--- a/docs/scheduling_autoscaling.md
+++ b/docs/scheduling_autoscaling.md
@@ -7,34 +7,56 @@ Table of Contents
 =================
 
-   * [Scheduling and Autoscaling](#scheduling-and-autoscaling)
-      * [Table of Contents](#table-of-contents)
-   * [Fleet Autoscaling](#fleet-autoscaling)
-   * [Autoscaling Concepts](#autoscaling-concepts)
-      * [Allocation Scheduling](#allocation-scheduling)
-      * [Pod Scheduling](#pod-scheduling)
-      * [Fleet Scale Down Strategy](#fleet-scale-down-strategy)
-   * [Fleet Scheduling](#fleet-scheduling)
-      * [Packed](#packed)
-         * [Allocation Scheduling Strategy](#allocation-scheduling-strategy)
-         * [Pod Scheduling Strategy](#pod-scheduling-strategy)
-         * [Fleet Scale Down Strategy](#fleet-scale-down-strategy-1)
-      * [Distributed](#distributed)
-         * [Allocation Scheduling Strategy](#allocation-scheduling-strategy-1)
-         * [Pod Scheduling Strategy](#pod-scheduling-strategy-1)
-         * [Fleet Scale Down Strategy](#fleet-scale-down-strategy-2)
-
+   * [Kubernetes Cluster Autoscaler](#kubernetes-cluster-autoscaler)
+      * [Google Kubernetes Engine](#google-kubernetes-engine)
+      * [Azure Kubernetes Service](#azure-kubernetes-service)
+   * [Fleet Autoscaling](#fleet-autoscaling)
+   * [Autoscaling Concepts](#autoscaling-concepts)
+      * [Allocation Scheduling](#allocation-scheduling)
+      * [Pod Scheduling](#pod-scheduling)
+      * [Fleet Scale Down Strategy](#fleet-scale-down-strategy)
+   * [Fleet Scheduling](#fleet-scheduling)
+      * [Packed](#packed)
+         * [Allocation Scheduling Strategy](#allocation-scheduling-strategy)
+         * [Pod Scheduling Strategy](#pod-scheduling-strategy)
+         * [Fleet Scale Down Strategy](#fleet-scale-down-strategy-1)
+      * [Distributed](#distributed)
+         * [Allocation Scheduling Strategy](#allocation-scheduling-strategy-1)
+         * [Pod Scheduling Strategy](#pod-scheduling-strategy-1)
+         * [Fleet Scale Down Strategy](#fleet-scale-down-strategy-2)
 
 Scheduling and autoscaling go hand in hand, as where in the cluster `GameServers` are provisioned
 impacts how to autoscale fleets up and down (or if you would even want to)
 
+## Kubernetes Cluster Autoscaler
+
+Kubernetes has a [cluster node autoscaler that works with a wide variety of cloud providers](https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler).
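+
+For example, a minimal sketch of enabling autoscaling on an existing GKE cluster's node pool (the cluster name, zone,
+and node count bounds below are placeholders, not recommendations):
+
+```bash
+# [CLUSTER_NAME] and [ZONE] are placeholders; the min/max node bounds are illustrative
+gcloud container clusters update [CLUSTER_NAME] --zone [ZONE] \
+  --node-pool default-pool --enable-autoscaling --min-nodes 1 --max-nodes 10
+```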
+
+The default scheduling strategy (`Packed`) is designed to work with the Kubernetes autoscaler out of the box.
+
+The autoscaler will automatically add Nodes to the cluster when there is no more room to schedule `GameServers`,
+and then scale down when there are empty Nodes with no `GameServers` running on them.
+
+This means that scaling `Fleets` up and down can be used to control the size of the cluster, as the cluster autoscaler
+will adjust the number of Nodes to match the resource needs of one or more `Fleets` running on it.
+
+To enable and configure autoscaling on your cloud provider, check their [connector implementation](https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler/cloudprovider),
+or their cloud-specific documentation.
+
+### Google Kubernetes Engine
+* [Administering Clusters: Autoscaling a Cluster](https://cloud.google.com/kubernetes-engine/docs/how-to/cluster-autoscaler)
+* [Cluster Autoscaler](https://cloud.google.com/kubernetes-engine/docs/concepts/cluster-autoscaler)
+
+### Azure Kubernetes Service
+* [Cluster Autoscaler on Azure Kubernetes Service (AKS) - Preview](https://docs.microsoft.com/en-us/azure/aks/autoscaler)
+
 ## Fleet Autoscaling
 
-Fleet autoscaling is currently the only type of autoscaling that exists in Agones. It is also only available as a simple
+Fleet autoscaling is the only type of autoscaling that exists in Agones. It is currently only available as a simple
 buffer autoscaling strategy. Have a look at the [Create a Fleet Autoscaler](create_fleetautoscaler.md) quickstart,
 and the [Fleet Autoscaler Specification](fleetautoscaler_spec.md) for details.
 
-Node scaling, and more sophisticated fleet autoscaling will be coming in future releases ([design](https://github.com/GoogleCloudPlatform/agones/issues/368))
+More sophisticated fleet autoscaling will be coming in future releases.
 
 ## Autoscaling Concepts
 
@@ -88,7 +110,13 @@ for the infrastructure you use.
 
 It attempts to _pack_ as much as possible into the smallest set of nodes, to make
 scaling infrastructure down as easy as possible.
 
-This affects Allocation Scheduling, Pod Scheduling and Fleet Scale Down Scheduling.
+This affects the Cluster Autoscaler, Allocation Scheduling, Pod Scheduling and Fleet Scale Down Scheduling.
+
+#### Cluster Autoscaler
+
+To ensure that the Cluster Autoscaler doesn't attempt to evict and move `GameServer` `Pods` onto new Nodes during
+gameplay, Agones adds the annotation [`"cluster-autoscaler.kubernetes.io/safe-to-evict": "false"`](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/FAQ.md#what-types-of-pods-can-prevent-ca-from-removing-a-node)
+to the backing Pod.
 
 #### Allocation Scheduling Strategy
 
@@ -138,6 +166,10 @@ size of the cluster.
 
 This affects Allocation Scheduling, Pod Scheduling and Fleet Scale Down Scheduling.
 
+#### Cluster Autoscaler
+
+Since this strategy is not aimed at clusters that autoscale, it does nothing for the cluster autoscaler.
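+
+One way to check which behaviour a given `GameServer` `Pod` ended up with is to query the annotation directly
+(the Pod name below is a placeholder):
+
+```bash
+# [POD_NAME] is a placeholder for the backing Pod of a GameServer
+kubectl get pod [POD_NAME] -o jsonpath='{.metadata.annotations.cluster-autoscaler\.kubernetes\.io/safe-to-evict}'
+```
+
+Under `Packed` this prints `false`; under `Distributed` it prints nothing, as the annotation is not set.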
+
 #### Allocation Scheduling Strategy
 
 Under the "Distributed" strategy, allocation will prioritise allocating `GameServers` to nodes that have the least
diff --git a/install/helm/agones/templates/controller.yaml b/install/helm/agones/templates/controller.yaml
index 2e35d2cd3a..03acde420f 100644
--- a/install/helm/agones/templates/controller.yaml
+++ b/install/helm/agones/templates/controller.yaml
@@ -35,8 +35,9 @@ spec:
     type: Recreate
   template:
     metadata:
-{{- if .Values.agones.controller.generateTLS }}
       annotations:
+        cluster-autoscaler.kubernetes.io/safe-to-evict: {{ .Values.agones.controller.safeToEvict | quote }}
+{{- if .Values.agones.controller.generateTLS }}
         revision/tls-cert: {{ .Release.Revision | quote }}
 {{- end }}
       labels:
diff --git a/install/helm/agones/values.yaml b/install/helm/agones/values.yaml
index 2d0be0b6e5..5245a91d9a 100644
--- a/install/helm/agones/values.yaml
+++ b/install/helm/agones/values.yaml
@@ -22,6 +22,7 @@ agones:
   controller:
     resources: {}
     generateTLS: true
+    safeToEvict: false
   healthCheck:
     http:
       port: 8080
diff --git a/install/yaml/install.yaml b/install/yaml/install.yaml
index 4609795a21..ca4fe35c46 100644
--- a/install/yaml/install.yaml
+++ b/install/yaml/install.yaml
@@ -790,6 +790,8 @@ spec:
     type: Recreate
   template:
     metadata:
+      annotations:
+        cluster-autoscaler.kubernetes.io/safe-to-evict: "false"
       labels:
         stable.agones.dev/role: controller
         app: agones
diff --git a/pkg/apis/stable/v1alpha1/gameserver.go b/pkg/apis/stable/v1alpha1/gameserver.go
index beddc69301..c9ffe0db0b 100644
--- a/pkg/apis/stable/v1alpha1/gameserver.go
+++ b/pkg/apis/stable/v1alpha1/gameserver.go
@@ -352,13 +352,18 @@ func (gs *GameServer) podObjectMeta(pod *corev1.Pod) {
 	pod.ObjectMeta.Annotations[GameServerContainerAnnotation] = gs.Spec.Container
 	ref := metav1.NewControllerRef(gs, SchemeGroupVersion.WithKind("GameServer"))
 	pod.ObjectMeta.OwnerReferences = append(pod.ObjectMeta.OwnerReferences, *ref)
+
+	if gs.Spec.Scheduling == Packed {
+		// This means that the autoscaler cannot remove the Node that this Pod is on
+		// (and evict the Pod in the process).
+		pod.ObjectMeta.Annotations["cluster-autoscaler.kubernetes.io/safe-to-evict"] = "false"
+	}
 }
 
 // podScheduling applies the Fleet scheduling strategy to the passed in Pod
 // this sets a PreferredDuringSchedulingIgnoredDuringExecution for GameServer
 // pods to a host topology. Basically doing a half decent job of packing GameServer
 // pods together.
-// TODO: update the scheduling doc
 func (gs *GameServer) podScheduling(pod *corev1.Pod) {
 	if gs.Spec.Scheduling == Packed {
 		if pod.Spec.Affinity == nil {
diff --git a/pkg/apis/stable/v1alpha1/gameserver_test.go b/pkg/apis/stable/v1alpha1/gameserver_test.go
index 48f4cdc6e6..1775bb5cd2 100644
--- a/pkg/apis/stable/v1alpha1/gameserver_test.go
+++ b/pkg/apis/stable/v1alpha1/gameserver_test.go
@@ -285,6 +285,43 @@ func TestGameServerPod(t *testing.T) {
 	assert.True(t, metav1.IsControlledBy(pod, fixture))
 }
 
+func TestGameServerPodObjectMeta(t *testing.T) {
+	fixture := &GameServer{ObjectMeta: metav1.ObjectMeta{Name: "lucy"},
+		Spec: GameServerSpec{Container: "goat"}}
+
+	f := func(t *testing.T, gs *GameServer, pod *corev1.Pod) {
+		assert.Equal(t, gs.ObjectMeta.Name+"-", pod.ObjectMeta.GenerateName)
+		assert.Equal(t, gs.ObjectMeta.Namespace, pod.ObjectMeta.Namespace)
+		assert.Equal(t, GameServerLabelRole, pod.ObjectMeta.Labels[RoleLabel])
+		assert.Equal(t, "gameserver", pod.ObjectMeta.Labels[stable.GroupName+"/role"])
+		assert.Equal(t, gs.ObjectMeta.Name, pod.ObjectMeta.Labels[GameServerPodLabel])
+		assert.Equal(t, "goat", pod.ObjectMeta.Annotations[GameServerContainerAnnotation])
+		assert.True(t, metav1.IsControlledBy(pod, gs))
+	}
+
+	t.Run("packed", func(t *testing.T) {
+		gs := fixture.DeepCopy()
+		gs.Spec.Scheduling = Packed
+		pod := &corev1.Pod{}
+
+		gs.podObjectMeta(pod)
+		f(t, gs, pod)
+
+		assert.Equal(t, "false", pod.ObjectMeta.Annotations["cluster-autoscaler.kubernetes.io/safe-to-evict"])
+	})
+
+	t.Run("distributed", func(t *testing.T) {
+		gs := fixture.DeepCopy()
+		gs.Spec.Scheduling = Distributed
+		pod := &corev1.Pod{}
+
+		gs.podObjectMeta(pod)
+		f(t, gs, pod)
+
+		assert.Equal(t, "", pod.ObjectMeta.Annotations["cluster-autoscaler.kubernetes.io/safe-to-evict"])
+	})
+}
+
 func TestGameServerPodScheduling(t *testing.T) {
 	fixture := &corev1.Pod{Spec: corev1.PodSpec{}}