From e7b3b476d3f1e5de11cb3ec37a251631fae0f436 Mon Sep 17 00:00:00 2001 From: Minyi Zhong Date: Wed, 10 Aug 2022 14:10:13 +1000 Subject: [PATCH 1/3] cordon node before performing pre-termination checks --- docs/cycling/README.md | 6 +-- .../transitioner/transitions.go | 51 +++++++++---------- 2 files changed, 28 insertions(+), 29 deletions(-) diff --git a/docs/cycling/README.md b/docs/cycling/README.md index 6db0dcb..e17cfcf 100644 --- a/docs/cycling/README.md +++ b/docs/cycling/README.md @@ -3,12 +3,12 @@ For more concrete examples see [examples](./examples/README.md) - [Cycling Documentation](#cycling-documentation) - - [Cycle Process](#cycle-processa-name%22cycling%22a) + - [Cycle Process](#cycle-process) - [CycleNodeRequest](#cyclenoderequest) - [CycleNodeStatus](#cyclenodestatus) - [State Machine Diagram](#state-machine-diagram) - [CycleNodeRequest object](#cyclenoderequest-object) - - [Usage ](#usage-a-name%22cycling%22a) + - [Usage ](#usage-) - [Interacting with the CRDs](#interacting-with-the-crds) - [Creating](#creating) - [GET](#get) @@ -34,7 +34,7 @@ The CycleNodeRequest CRD handles a request to cycle nodes belonging to a specifi 5. In the **ScalingUp** phase, wait for the cloud provider to bring up the new nodes and then wait for the new nodes to be **Ready** in the Kubernetes API. Wait for the configured health checks on the node succeed. Transition the object to **CordoningNode**. -6. In the **CordoningNode** phase, perform the pre-termination checks and then cordon the selected nodes in the Kubernetes API. Transition the object to **WaitingTermination**. +6. In the **CordoningNode** phase, cordon the selected nodes in the Kubernetes API then perform the pre-termination checks. Transition the object to **WaitingTermination**. 7. In the **WaitingTermination** phase, create a CycleNodeStatus CRD for every node that was cordoned. Each of these CycleNodeStatuses handles the termination of an individual node. The controller will wait for a number of them to enter the **Successful** or **Failed** phase before moving on. diff --git a/pkg/controller/cyclenoderequest/transitioner/transitions.go b/pkg/controller/cyclenoderequest/transitioner/transitions.go index c9e3cf8..2e41233 100644 --- a/pkg/controller/cyclenoderequest/transitioner/transitions.go +++ b/pkg/controller/cyclenoderequest/transitioner/transitions.go @@ -411,47 +411,45 @@ func (t *CycleNodeRequestTransitioner) transitionCordoning() (reconcile.Result, t.rm.Logger.Info("Skipping pre-termination checks") } - allNodesCordoned := true + allNodesReadyForTermination := true for _, node := range t.cycleNodeRequest.Status.CurrentNodes { - // Perform pre-termination checks before the node is cordoned - // Cruicially, do this before the CNS is created for node to begin that process - // The node should be ready for termination before any of this takes place + // If the node is not already cordoned, cordon it + cordoned, err := k8s.IsCordoned(node.Name, t.rm.RawClient) + if err != nil { + t.rm.Logger.Error(err, "failed to check if node is cordoned", "nodeName", node.Name) + return t.transitionToHealing(err) + } + if !cordoned { + if err := k8s.CordonNode(node.Name, t.rm.RawClient); err != nil { + return t.transitionToHealing(err) + } + } + + // Perform pre-termination checks after the node is cordoned + // Cruicially, do this before the CNS is created for node to begin termination if !t.cycleNodeRequest.Spec.SkipPreTerminationChecks && len(t.cycleNodeRequest.Spec.PreTerminationChecks) > 0 { - // First try to send the trigger, if is has already been sent then this will + // Try to send the trigger, if is has already been sent then this will // be skipped in the function. The trigger must only be sent once if err := t.sendPreTerminationTrigger(node); err != nil { - return t.transitionToHealing(err) + return t.transitionToHealing(errors.Wrapf(err, "failed to send pre-termination trigger, %s is still cordononed", node.Name)) } // After the trigger has been sent, perform health checks to monitor if the node - // can be cordoned/terminated. If all checks pass then it can be cordoned/terminated. + // can be terminated. If all checks pass then it can be terminated. allHealthChecksPassed, err := t.performPreTerminationHealthChecks(node) if err != nil { - return t.transitionToHealing(err) + return t.transitionToHealing(errors.Wrapf(err, "failed to perform pre-termination health checks, %s is still cordononed", node.Name)) } - // If not all health checks have passed, continue to the next node instead of cordoning + // If not all health checks have passed, it is not ready for termination yet + // But we can continue to trigger checks on the other nodes if !allHealthChecksPassed { - allNodesCordoned = false + allNodesReadyForTermination = false continue } } - // If node is already cordoned, continue to the next node - cordoned, err := k8s.IsCordoned(node.Name, t.rm.RawClient) - if err != nil { - t.rm.Logger.Error(err, "failed to check if node is cordoned", "nodeName", node.Name) - return t.transitionToHealing(err) - } - if cordoned { - continue - } - - // Cordon the node and create a CycleNodeStatus CRD to do work on it - if err := k8s.CordonNode(node.Name, t.rm.RawClient); err != nil { - return t.transitionToHealing(err) - } - + // Create a CycleNodeStatus CRD to start the termination process if err := t.rm.Client.Create(context.TODO(), t.makeCycleNodeStatusForNode(node.Name)); err != nil { return t.transitionToHealing(err) } @@ -463,7 +461,8 @@ func (t *CycleNodeRequestTransitioner) transitionCordoning() (reconcile.Result, } } - if !allNodesCordoned { + // If not all nodes are ready for termination, requeue the CNR to try again + if !allNodesReadyForTermination { // Reconcile any health checks passed to the cnr object if err := t.rm.UpdateObject(t.cycleNodeRequest); err != nil { return t.transitionToHealing(err) From 91ef9533ed774fcf89c5aa386fef8087985ec241 Mon Sep 17 00:00:00 2001 From: Minyi Zhong Date: Thu, 11 Aug 2022 09:24:12 +1000 Subject: [PATCH 2/3] bump golangci-lint --- .github/workflows/golangci-lint.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/golangci-lint.yml b/.github/workflows/golangci-lint.yml index 685c33c..49fcf47 100644 --- a/.github/workflows/golangci-lint.yml +++ b/.github/workflows/golangci-lint.yml @@ -15,5 +15,5 @@ jobs: - name: golangci-lint uses: golangci/golangci-lint-action@v2 with: - version: v1.45 + version: v1.48 args: --timeout=5m From 13cb07df6819492f3169973d4bb01754b0a70b14 Mon Sep 17 00:00:00 2001 From: Minyi Zhong Date: Thu, 11 Aug 2022 09:36:35 +1000 Subject: [PATCH 3/3] fix linter error --- pkg/controller/cyclenoderequest/transitioner/checks.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/controller/cyclenoderequest/transitioner/checks.go b/pkg/controller/cyclenoderequest/transitioner/checks.go index e60e687..0436423 100644 --- a/pkg/controller/cyclenoderequest/transitioner/checks.go +++ b/pkg/controller/cyclenoderequest/transitioner/checks.go @@ -5,7 +5,7 @@ import ( "crypto/x509" "fmt" "html/template" - "io/ioutil" + "io" "net/http" "os" "regexp" @@ -141,7 +141,7 @@ func (t *CycleNodeRequestTransitioner) makeRequest(httpMethod string, httpClient defer resp.Body.Close() - bytes, err := ioutil.ReadAll(resp.Body) + bytes, err := io.ReadAll(resp.Body) if err != nil { return 0, nil, err }