From fa950e1a48eee29269ac613f42f34cf5502b6886 Mon Sep 17 00:00:00 2001 From: Tanner Altares Date: Thu, 30 Jan 2020 15:11:59 -0600 Subject: [PATCH 1/5] support gated rollback --- pkg/apis/flagger/v1alpha3/types.go | 2 + pkg/controller/scheduler.go | 104 ++++++++++++++++++----------- 2 files changed, 68 insertions(+), 38 deletions(-) diff --git a/pkg/apis/flagger/v1alpha3/types.go b/pkg/apis/flagger/v1alpha3/types.go index 74d5fabe7..96e7f9353 100644 --- a/pkg/apis/flagger/v1alpha3/types.go +++ b/pkg/apis/flagger/v1alpha3/types.go @@ -153,6 +153,8 @@ const ( ConfirmPromotionHook HookType = "confirm-promotion" // EventHook dispatches Flagger events to the specified endpoint EventHook HookType = "event" + // RollbackHook rolls back the canary analysis if the webhook returns HTTP 200 + RollbackHook HookType = "rollback" ) // CanaryWebhook holds the reference to external checks used for canary analysis diff --git a/pkg/controller/scheduler.go b/pkg/controller/scheduler.go index 6472150f5..0c17ccfbf 100644 --- a/pkg/controller/scheduler.go +++ b/pkg/controller/scheduler.go @@ -224,6 +224,17 @@ func (c *Controller) advanceCanary(name string, namespace string, skipLivenessCh return } + // check if we should roll back + if cd.Status.Phase == flaggerv1.CanaryPhaseProgressing || + cd.Status.Phase == flaggerv1.CanaryPhaseWaiting { + if ok := c.runRollbackHooks(cd, cd.Status.Phase); ok { + c.recordEventWarningf(cd, "Rolling back %s.%s manual webhook invoked", cd.Name, cd.Namespace) + c.sendNotification(cd, "Rolling back manual webhook invoked", false, true) + c.rollback(cd, canaryController, meshRouter) + return + } + } + // route all traffic to primary if analysis has succeeded if cd.Status.Phase == flaggerv1.CanaryPhasePromoting { if provider != "kubernetes" { @@ -267,50 +278,13 @@ func (c *Controller) advanceCanary(name string, namespace string, skipLivenessCh // check if the number of failed checks reached the threshold if cd.Status.Phase == flaggerv1.CanaryPhaseProgressing && 
(!retriable || cd.Status.FailedChecks >= cd.Spec.CanaryAnalysis.Threshold) { - - if cd.Status.FailedChecks >= cd.Spec.CanaryAnalysis.Threshold { - c.recordEventWarningf(cd, "Rolling back %s.%s failed checks threshold reached %v", - cd.Name, cd.Namespace, cd.Status.FailedChecks) - c.sendNotification(cd, fmt.Sprintf("Failed checks threshold reached %v", cd.Status.FailedChecks), - false, true) - } - if !retriable { c.recordEventWarningf(cd, "Rolling back %s.%s progress deadline exceeded %v", cd.Name, cd.Namespace, err) c.sendNotification(cd, fmt.Sprintf("Progress deadline exceeded %v", err), false, true) } - - // route all traffic back to primary - primaryWeight = 100 - canaryWeight = 0 - if err := meshRouter.SetRoutes(cd, primaryWeight, canaryWeight, false); err != nil { - c.recordEventWarningf(cd, "%v", err) - return - } - - canaryPhaseFailed := cd.DeepCopy() - canaryPhaseFailed.Status.Phase = flaggerv1.CanaryPhaseFailed - c.recordEventWarningf(canaryPhaseFailed, "Canary failed! Scaling down %s.%s", - canaryPhaseFailed.Name, canaryPhaseFailed.Namespace) - - c.recorder.SetWeight(cd, primaryWeight, canaryWeight) - - // shutdown canary - if err := canaryController.Scale(cd, 0); err != nil { - c.recordEventWarningf(cd, "%v", err) - return - } - - // mark canary as failed - if err := canaryController.SyncStatus(cd, flaggerv1.CanaryStatus{Phase: flaggerv1.CanaryPhaseFailed, CanaryWeight: 0}); err != nil { - c.logger.With("canary", fmt.Sprintf("%s.%s", cd.Name, cd.Namespace)).Errorf("%v", err) - return - } - - c.recorder.SetStatus(cd, flaggerv1.CanaryPhaseFailed) - c.runPostRolloutHooks(cd, flaggerv1.CanaryPhaseFailed) + c.rollback(cd, canaryController, meshRouter) return } @@ -757,6 +731,21 @@ func (c *Controller) runPostRolloutHooks(canary *flaggerv1.Canary, phase flagger return true } +func (c *Controller) runRollbackHooks(canary *flaggerv1.Canary, phase flaggerv1.CanaryPhase) bool { + for _, webhook := range canary.Spec.CanaryAnalysis.Webhooks { + if webhook.Type == 
flaggerv1.RollbackHook { + err := CallWebhook(canary.Name, canary.Namespace, phase, webhook) + if err != nil { + c.recordEventInfof(canary, "Rollback hook %s not signaling a rollback", webhook.Name) + } else { + c.recordEventWarningf(canary, "Rollback check %s passed", webhook.Name) + return true + } + } + } + return false +} + func (c *Controller) runAnalysis(r *flaggerv1.Canary) bool { // run external checks for _, webhook := range r.Spec.CanaryAnalysis.Webhooks { @@ -878,3 +867,42 @@ func (c *Controller) runAnalysis(r *flaggerv1.Canary) bool { return true } + +func (c *Controller) rollback(canary *flaggerv1.Canary, canaryController canary.Controller, meshRouter router.Interface) { + if canary.Status.FailedChecks >= canary.Spec.CanaryAnalysis.Threshold { + c.recordEventWarningf(canary, "Rolling back %s.%s failed checks threshold reached %v", + canary.Name, canary.Namespace, canary.Status.FailedChecks) + c.sendNotification(canary, fmt.Sprintf("Failed checks threshold reached %v", canary.Status.FailedChecks), + false, true) + } + + // route all traffic back to primary + primaryWeight := 100 + canaryWeight := 0 + if err := meshRouter.SetRoutes(canary, primaryWeight, canaryWeight, false); err != nil { + c.recordEventWarningf(canary, "%v", err) + return + } + + canaryPhaseFailed := canary.DeepCopy() + canaryPhaseFailed.Status.Phase = flaggerv1.CanaryPhaseFailed + c.recordEventWarningf(canaryPhaseFailed, "Canary failed! 
Scaling down %s.%s", + canaryPhaseFailed.Name, canaryPhaseFailed.Namespace) + + c.recorder.SetWeight(canary, primaryWeight, canaryWeight) + + // shutdown canary + if err := canaryController.Scale(canary, 0); err != nil { + c.recordEventWarningf(canary, "%v", err) + return + } + + // mark canary as failed + if err := canaryController.SyncStatus(canary, flaggerv1.CanaryStatus{Phase: flaggerv1.CanaryPhaseFailed, CanaryWeight: 0}); err != nil { + c.logger.With("canary", fmt.Sprintf("%s.%s", canary.Name, canary.Namespace)).Errorf("%v", err) + return + } + + c.recorder.SetStatus(canary, flaggerv1.CanaryPhaseFailed) + c.runPostRolloutHooks(canary, flaggerv1.CanaryPhaseFailed) +} From 1d23c0f0a2224642989ea8acdd8e6451cb58d8e5 Mon Sep 17 00:00:00 2001 From: Tanner Altares Date: Wed, 5 Feb 2020 10:29:32 -0600 Subject: [PATCH 2/5] update CRD manifest to add rollback enum to webhook validation --- artifacts/flagger/crd.yaml | 1 + charts/flagger/templates/crd.yaml | 1 + kustomize/base/flagger/crd.yaml | 1 + 3 files changed, 3 insertions(+) diff --git a/artifacts/flagger/crd.yaml b/artifacts/flagger/crd.yaml index 8c8029724..0989e4032 100644 --- a/artifacts/flagger/crd.yaml +++ b/artifacts/flagger/crd.yaml @@ -249,6 +249,7 @@ spec: - confirm-promotion - post-rollout - event + - rollback url: description: URL address of this webhook type: string diff --git a/charts/flagger/templates/crd.yaml b/charts/flagger/templates/crd.yaml index ef5fed8b1..48575feff 100644 --- a/charts/flagger/templates/crd.yaml +++ b/charts/flagger/templates/crd.yaml @@ -250,6 +250,7 @@ spec: - confirm-promotion - post-rollout - event + - rollback url: description: URL address of this webhook type: string diff --git a/kustomize/base/flagger/crd.yaml b/kustomize/base/flagger/crd.yaml index 8c8029724..0989e4032 100644 --- a/kustomize/base/flagger/crd.yaml +++ b/kustomize/base/flagger/crd.yaml @@ -249,6 +249,7 @@ spec: - confirm-promotion - post-rollout - event + - rollback url: description: URL address of this 
webhook type: string From edbc373109f06286a2aadd1688ad669a4f2619a5 Mon Sep 17 00:00:00 2001 From: Tanner Altares Date: Wed, 5 Feb 2020 14:14:13 -0600 Subject: [PATCH 3/5] add docs for manual rollback --- docs/gitbook/how-it-works.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/docs/gitbook/how-it-works.md b/docs/gitbook/how-it-works.md index 7a6ad1e8a..17381fbeb 100644 --- a/docs/gitbook/how-it-works.md +++ b/docs/gitbook/how-it-works.md @@ -551,6 +551,9 @@ The canary promotion is paused until the hooks return HTTP 200. While the promotion is paused, Flagger will continue to run the metrics checks and rollout hooks. * Post-rollout hooks are executed after the canary has been promoted or rolled back. If a post rollout hook fails the error is logged. +* Rollback hooks are executed while a canary deployment is in either Progressing or Waiting status. +This provides the ability to roll back during analysis or while waiting for a confirmation. If a rollback hook +returns a successful HTTP status code, Flagger will roll back the canary deployment. * Event hooks are executed every time Flagger emits a Kubernetes event. When configured, every action that Flagger takes during a canary deployment will be sent as JSON via an HTTP POST request. @@ -584,6 +587,9 @@ Spec: timeout: 5s metadata: some: "message" + - name: "rollback gate" + type: rollback + url: http://flagger-loadtester.test/gate/halt - name: "send to Slack" type: event url: http://event-recevier.notifications/slack @@ -830,6 +836,10 @@ For manual approval of a canary deployment you can use the `confirm-rollout` and The confirmation rollout hooks are executed before the pre-rollout hooks. Flagger will halt the canary traffic shifting and analysis until the confirm webhook returns HTTP status 200. +For manual rollback of a canary deployment you can use the `rollback` webhook. The rollback hook will be called +during the analysis and confirmation states. 
If a rollback webhook returns a successful HTTP status code, Flagger +will shift all traffic back to the primary instance and fail the canary. + Manual gating with Flagger's tester: ```yaml @@ -898,4 +908,14 @@ While the promotion is paused, Flagger will continue to run the metrics checks a url: http://flagger-loadtester.test/gate/halt ``` +The `rollback` hook type can be used to manually roll back the canary promotion. + +```yaml + canaryAnalysis: + webhooks: + - name: "rollback gate" + type: rollback + url: http://flagger-loadtester.test/gate/halt +``` + If you have notifications enabled, Flagger will post a message to Slack or MS Teams if a canary promotion is waiting for approval. From 69e969ac51899babfc33491d1f5febec79244553 Mon Sep 17 00:00:00 2001 From: Tanner Altares Date: Wed, 5 Feb 2020 14:49:35 -0600 Subject: [PATCH 4/5] modify the hook name --- docs/gitbook/how-it-works.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/gitbook/how-it-works.md b/docs/gitbook/how-it-works.md index 17381fbeb..fde3232fe 100644 --- a/docs/gitbook/how-it-works.md +++ b/docs/gitbook/how-it-works.md @@ -913,7 +913,7 @@ The `rollback` hook type can be used to manually roll back the canary promotion. ```yaml canaryAnalysis: webhooks: - - name: "rollback gate" + - name: "rollback" type: rollback url: http://flagger-loadtester.test/gate/halt ``` From 402dda71e6989bd39d7c2cbe20bd04d7be288534 Mon Sep 17 00:00:00 2001 From: Tanner Altares Date: Wed, 5 Feb 2020 19:17:45 -0600 Subject: [PATCH 5/5] manual push to trigger build --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index dab3b8104..62f565466 100644 --- a/README.md +++ b/README.md @@ -193,4 +193,4 @@ If you have any questions about Flagger and progressive delivery: hands-on training and meetups in your area. * File an [issue](https://github.com/weaveworks/flagger/issues/new). -Your feedback is always welcome! +Your feedback is always welcome! 
\ No newline at end of file