Skip to content

Commit

Permalink
Merge pull request #50 from atlassian-labs/mzhong/swap-pre-terminatin…
Browse files Browse the repository at this point in the history
…g-check-order

cordon node before performing pre-termination checks
  • Loading branch information
MinyiZ authored Aug 15, 2022
2 parents 3e1b38d + 13cb07d commit 251868c
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 32 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/golangci-lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,5 @@ jobs:
- name: golangci-lint
uses: golangci/golangci-lint-action@v2
with:
version: v1.45
version: v1.48
args: --timeout=5m
6 changes: 3 additions & 3 deletions docs/cycling/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@
For more concrete examples see [examples](./examples/README.md)

- [Cycling Documentation](#cycling-documentation)
- [Cycle Process<a name="cycling"></a>](#cycle-processa-name%22cycling%22a)
- [Cycle Process<a name="cycling"></a>](#cycle-process)
- [CycleNodeRequest](#cyclenoderequest)
- [CycleNodeStatus](#cyclenodestatus)
- [State Machine Diagram](#state-machine-diagram)
- [CycleNodeRequest object](#cyclenoderequest-object)
- [Usage <a name="cycling"></a>](#usage-a-name%22cycling%22a)
- [Usage <a name="cycling"></a>](#usage-)
- [Interacting with the CRDs](#interacting-with-the-crds)
- [Creating](#creating)
- [GET](#get)
Expand All @@ -34,7 +34,7 @@ The CycleNodeRequest CRD handles a request to cycle nodes belonging to a specifi

5. In the **ScalingUp** phase, wait for the cloud provider to bring up the new nodes and then wait for the new nodes to be **Ready** in the Kubernetes API. Wait for the configured health checks on the node to succeed. Transition the object to **CordoningNode**.

6. In the **CordoningNode** phase, perform the pre-termination checks and then cordon the selected nodes in the Kubernetes API. Transition the object to **WaitingTermination**.
6. In the **CordoningNode** phase, cordon the selected nodes in the Kubernetes API, then perform the pre-termination checks. Transition the object to **WaitingTermination**.

7. In the **WaitingTermination** phase, create a CycleNodeStatus CRD for every node that was cordoned. Each of these CycleNodeStatuses handles the termination of an individual node. The controller will wait for a number of them to enter the **Successful** or **Failed** phase before moving on.

Expand Down
4 changes: 2 additions & 2 deletions pkg/controller/cyclenoderequest/transitioner/checks.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import (
"crypto/x509"
"fmt"
"html/template"
"io/ioutil"
"io"
"net/http"
"os"
"regexp"
Expand Down Expand Up @@ -141,7 +141,7 @@ func (t *CycleNodeRequestTransitioner) makeRequest(httpMethod string, httpClient

defer resp.Body.Close()

bytes, err := ioutil.ReadAll(resp.Body)
bytes, err := io.ReadAll(resp.Body)
if err != nil {
return 0, nil, err
}
Expand Down
51 changes: 25 additions & 26 deletions pkg/controller/cyclenoderequest/transitioner/transitions.go
Original file line number Diff line number Diff line change
Expand Up @@ -411,47 +411,45 @@ func (t *CycleNodeRequestTransitioner) transitionCordoning() (reconcile.Result,
t.rm.Logger.Info("Skipping pre-termination checks")
}

allNodesCordoned := true
allNodesReadyForTermination := true
for _, node := range t.cycleNodeRequest.Status.CurrentNodes {
// Perform pre-termination checks before the node is cordoned
// Cruicially, do this before the CNS is created for node to begin that process
// The node should be ready for termination before any of this takes place
// If the node is not already cordoned, cordon it
cordoned, err := k8s.IsCordoned(node.Name, t.rm.RawClient)
if err != nil {
t.rm.Logger.Error(err, "failed to check if node is cordoned", "nodeName", node.Name)
return t.transitionToHealing(err)
}
if !cordoned {
if err := k8s.CordonNode(node.Name, t.rm.RawClient); err != nil {
return t.transitionToHealing(err)
}
}

// Perform pre-termination checks after the node is cordoned
// Crucially, do this before the CNS is created for the node to begin termination
if !t.cycleNodeRequest.Spec.SkipPreTerminationChecks && len(t.cycleNodeRequest.Spec.PreTerminationChecks) > 0 {
// First try to send the trigger, if is has already been sent then this will
// Try to send the trigger; if it has already been sent then this will
// be skipped in the function. The trigger must only be sent once
if err := t.sendPreTerminationTrigger(node); err != nil {
return t.transitionToHealing(err)
return t.transitionToHealing(errors.Wrapf(err, "failed to send pre-termination trigger, %s is still cordononed", node.Name))
}

// After the trigger has been sent, perform health checks to monitor if the node
// can be cordoned/terminated. If all checks pass then it can be cordoned/terminated.
// can be terminated. If all checks pass then it can be terminated.
allHealthChecksPassed, err := t.performPreTerminationHealthChecks(node)
if err != nil {
return t.transitionToHealing(err)
return t.transitionToHealing(errors.Wrapf(err, "failed to perform pre-termination health checks, %s is still cordononed", node.Name))
}

// If not all health checks have passed, continue to the next node instead of cordoning
// If not all health checks have passed, it is not ready for termination yet
// But we can continue to trigger checks on the other nodes
if !allHealthChecksPassed {
allNodesCordoned = false
allNodesReadyForTermination = false
continue
}
}

// If node is already cordoned, continue to the next node
cordoned, err := k8s.IsCordoned(node.Name, t.rm.RawClient)
if err != nil {
t.rm.Logger.Error(err, "failed to check if node is cordoned", "nodeName", node.Name)
return t.transitionToHealing(err)
}
if cordoned {
continue
}

// Cordon the node and create a CycleNodeStatus CRD to do work on it
if err := k8s.CordonNode(node.Name, t.rm.RawClient); err != nil {
return t.transitionToHealing(err)
}

// Create a CycleNodeStatus CRD to start the termination process
if err := t.rm.Client.Create(context.TODO(), t.makeCycleNodeStatusForNode(node.Name)); err != nil {
return t.transitionToHealing(err)
}
Expand All @@ -463,7 +461,8 @@ func (t *CycleNodeRequestTransitioner) transitionCordoning() (reconcile.Result,
}
}

if !allNodesCordoned {
// If not all nodes are ready for termination, requeue the CNR to try again
if !allNodesReadyForTermination {
// Reconcile any health checks passed to the cnr object
if err := t.rm.UpdateObject(t.cycleNodeRequest); err != nil {
return t.transitionToHealing(err)
Expand Down

0 comments on commit 251868c

Please sign in to comment.