Skip to content

Commit

Permalink
oadp-1.4: OADP-3227: Reconcile to fail on restore stuck in-progress (#…
Browse files Browse the repository at this point in the history
…330)

* oadp-1.4: OADP-3227: Mark InProgress backup/restore as failed upon requeuing (#315)

* Mark InProgress backup/restore as failed upon requeuing

Signed-off-by: Tiger Kaovilai <tkaovila@redhat.com>

remove uuid, return err to requeue instead of requeue: true

Signed-off-by: Tiger Kaovilai <tkaovila@redhat.com>

* cleanup to minimize diff from upstream

Signed-off-by: Scott Seago <sseago@redhat.com>

* error message update

Signed-off-by: Scott Seago <sseago@redhat.com>

* requeue on finalize status update.

Unlike the InProgress transition, there's no need to fail here,
since the Finalize steps can be repeated.

* Only run patch once for all backup finalizer return scenarios

Signed-off-by: Scott Seago <sseago@redhat.com>

---------

Signed-off-by: Tiger Kaovilai <tkaovila@redhat.com>
Signed-off-by: Scott Seago <sseago@redhat.com>
Co-authored-by: Scott Seago <sseago@redhat.com>

* oadp-1.4: OADP-3227: Reconcile To Fail: Add backup/restore trackers (#324)

* OADP-4265: Reconcile To Fail: Add backup/restore trackers

Signed-off-by: Tiger Kaovilai <tkaovila@redhat.com>

* Apply suggestions from code review: backupTracker

* Address restoreTracker feedback

Signed-off-by: Tiger Kaovilai <tkaovila@redhat.com>

* s/delete from/add to/ in the comment

* unit test fix

Signed-off-by: Tiger Kaovilai <tkaovila@redhat.com>

* backup_controller unit test

Signed-off-by: Tiger Kaovilai <tkaovila@redhat.com>

* restore_controller unit test

Signed-off-by: Tiger Kaovilai <tkaovila@redhat.com>

* `make update`

Signed-off-by: Tiger Kaovilai <tkaovila@redhat.com>

* mock patch to fail due to connection refused

Signed-off-by: Tiger Kaovilai <tkaovila@redhat.com>

---------

Signed-off-by: Tiger Kaovilai <tkaovila@redhat.com>

* regenerate mocks

Signed-off-by: Tiger Kaovilai <tkaovila@redhat.com>

---------

Signed-off-by: Tiger Kaovilai <tkaovila@redhat.com>
Signed-off-by: Scott Seago <sseago@redhat.com>
Co-authored-by: Scott Seago <sseago@redhat.com>
  • Loading branch information
kaovilai and sseago committed Jul 30, 2024
1 parent ff91229 commit c035379
Show file tree
Hide file tree
Showing 10 changed files with 611 additions and 20 deletions.
1 change: 1 addition & 0 deletions changelogs/unreleased/7863-kaovilai
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Requeue backup/restore in their respective controllers to unstick InProgress status without requiring velero pod restart.
24 changes: 24 additions & 0 deletions pkg/controller/backup_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,24 @@ func (b *backupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
switch original.Status.Phase {
case "", velerov1api.BackupPhaseNew:
// only process new backups
case velerov1api.BackupPhaseInProgress:
if b.backupTracker.Contains(original.Namespace, original.Name) {
log.Debug("Backup is in progress, skipping")
return ctrl.Result{}, nil
}
// if backup phase is in progress, we should not process it again
// we want to mark it as failed to avoid it being stuck in progress
// if so, mark it as failed, last loop did not successfully complete the backup
log.Debug("Backup has in progress status from prior reconcile, marking it as failed")
failedCopy := original.DeepCopy()
failedCopy.Status.Phase = velerov1api.BackupPhaseFailed
failedCopy.Status.FailureReason = "Backup from previous reconcile still in progress. The API Server may have been down."
if err := kubeutil.PatchResource(original, failedCopy, b.kbClient); err != nil {
// return the error so the status can be re-processed; it's currently still not completed or failed
return ctrl.Result{}, err
}
// patch to mark it as failed succeeded, do not requeue
return ctrl.Result{}, nil
default:
b.logger.WithFields(logrus.Fields{
"backup": kubeutil.NamespaceAndName(original),
Expand All @@ -248,6 +266,7 @@ func (b *backupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr

// update status
if err := kubeutil.PatchResource(original, request.Backup, b.kbClient); err != nil {
// if we fail to patch to inprogress, we will try again in the next reconcile loop
return ctrl.Result{}, errors.Wrapf(err, "error updating Backup status to %s", request.Status.Phase)
}

Expand Down Expand Up @@ -306,7 +325,12 @@ func (b *backupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
log.Info("Updating backup's final status")
if err := kubeutil.PatchResource(original, request.Backup, b.kbClient); err != nil {
log.WithError(err).Error("error updating backup's final status")
// delete from tracker so next reconcile fails the backup
b.backupTracker.Delete(original.Namespace, original.Name)
// return the error so the status can be re-processed; it's currently still not completed or failed
return ctrl.Result{}, err
}

return ctrl.Result{}, nil
}

Expand Down
76 changes: 73 additions & 3 deletions pkg/controller/backup_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"reflect"
"sort"
"strings"
"syscall"
"testing"
"time"

Expand All @@ -41,6 +42,7 @@ import (
"k8s.io/utils/clock"
testclocks "k8s.io/utils/clock/testing"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
kbclient "sigs.k8s.io/controller-runtime/pkg/client"
fakeClient "sigs.k8s.io/controller-runtime/pkg/client/fake"

Expand All @@ -58,6 +60,7 @@ import (
pluginmocks "github.com/vmware-tanzu/velero/pkg/plugin/mocks"
biav2 "github.com/vmware-tanzu/velero/pkg/plugin/velero/backupitemaction/v2"
velerotest "github.com/vmware-tanzu/velero/pkg/test"
velerotestmocks "github.com/vmware-tanzu/velero/pkg/test/mocks"
"github.com/vmware-tanzu/velero/pkg/util/boolptr"
kubeutil "github.com/vmware-tanzu/velero/pkg/util/kube"
"github.com/vmware-tanzu/velero/pkg/util/logging"
Expand Down Expand Up @@ -131,9 +134,10 @@ func TestProcessBackupNonProcessedItems(t *testing.T) {
)

c := &backupReconciler{
kbClient: velerotest.NewFakeControllerRuntimeClient(t),
formatFlag: formatFlag,
logger: logger,
kbClient: velerotest.NewFakeControllerRuntimeClient(t),
formatFlag: formatFlag,
logger: logger,
backupTracker: NewBackupTracker(),
}
if test.backup != nil {
require.NoError(t, c.kbClient.Create(context.Background(), test.backup))
Expand All @@ -150,6 +154,72 @@ func TestProcessBackupNonProcessedItems(t *testing.T) {
}
}

// OADP Carry: Test that backup that has status inProgress on reconcile is changed to failed if velero has no memory of it still in-progress.
func TestProcessBackupInProgressFailOnSecondReconcile(t *testing.T) {
	tests := []struct {
		name            string
		tracked         bool // backup is present in the reconciler's backupTracker
		reconciledPhase velerov1api.BackupPhase
		expectedErr     error
		mockFailedPatch bool // simulate an unreachable API server on Patch
	}{
		{
			name:            "InProgress backup tracked as being in-progress is not processed",
			tracked:         true,
			reconciledPhase: velerov1api.BackupPhaseInProgress,
		},
		{
			name:            "InProgress backup untracked as being in-progress is marked as failed",
			tracked:         false,
			reconciledPhase: velerov1api.BackupPhaseFailed,
		},
		{
			name:            "InProgress backup untracked is marked as failed, if patch fails, err is returned from reconcile to retry patch again, backup still inprogress",
			tracked:         false,
			reconciledPhase: velerov1api.BackupPhaseInProgress,
			mockFailedPatch: true,
			expectedErr:     syscall.ECONNREFUSED,
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			formatFlag := logging.FormatText
			logger := logging.DefaultLogger(logrus.DebugLevel, formatFlag)
			backup := defaultBackup().Phase(velerov1api.BackupPhaseInProgress).Result()

			// Default to the fake client; when simulating a patch failure, wrap it
			// in a mock whose Patch returns ECONNREFUSED but whose Get delegates
			// to the fake client so reads still succeed.
			var kclient kbclient.Client
			fakeKclient := velerotest.NewFakeControllerRuntimeClient(t, backup)
			if test.mockFailedPatch {
				mockClient := velerotestmocks.NewClient(t)
				mockClient.On("Patch", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(syscall.ECONNREFUSED).Once()
				mockClient.On("Get", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(func(ctx context.Context, key client.ObjectKey, obj client.Object, opts ...kbclient.GetOption) error {
					return fakeKclient.Get(ctx, key, obj, opts...)
				})
				kclient = mockClient
			} else {
				kclient = fakeKclient
			}
			c := &backupReconciler{
				kbClient:      kclient,
				formatFlag:    formatFlag,
				logger:        logger,
				backupTracker: NewBackupTracker(),
			}
			if test.tracked {
				c.backupTracker.Add(backup.Namespace, backup.Name)
			}
			_, err := c.Reconcile(ctx, ctrl.Request{NamespacedName: types.NamespacedName{Namespace: backup.Namespace, Name: backup.Name}})
			assert.Equal(t, test.expectedErr, err)
			reconciledBackup := velerov1api.Backup{}
			// require (not assert) so a failed Get aborts the subtest instead of
			// falling through and comparing against a zero-value phase.
			require.NoError(t, c.kbClient.Get(context.Background(), types.NamespacedName{Namespace: backup.Namespace, Name: backup.Name}, &reconciledBackup))
			assert.Equal(t, test.reconciledPhase, reconciledBackup.Status.Phase)
		})
	}
}

func TestProcessBackupValidationFailures(t *testing.T) {
defaultBackupLocation := builder.ForBackupStorageLocation("velero", "loc-1").Result()

Expand Down
32 changes: 21 additions & 11 deletions pkg/controller/backup_finalizer_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,19 +109,21 @@ func (r *backupFinalizerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
}

original := backup.DeepCopy()
patched := false
patchErr := false
defer func() {
switch backup.Status.Phase {
case
velerov1api.BackupPhaseCompleted,
velerov1api.BackupPhasePartiallyFailed,
velerov1api.BackupPhaseFailed,
velerov1api.BackupPhaseFailedValidation:
r.backupTracker.Delete(backup.Namespace, backup.Name)
}
// Always attempt to Patch the backup object and status after each reconciliation.
if err := r.client.Patch(ctx, backup, kbclient.MergeFrom(original)); err != nil {
log.WithError(err).Error("Error updating backup")
return
if !patched {
if err := r.client.Patch(ctx, backup, kbclient.MergeFrom(original)); err != nil {
log.WithError(err).Error("Error updating backup")
return
}
}
if !patchErr {
switch backup.Status.Phase {
case velerov1api.BackupPhaseCompleted, velerov1api.BackupPhasePartiallyFailed, velerov1api.BackupPhaseFailed, velerov1api.BackupPhaseFailedValidation:
r.backupTracker.Delete(backup.Namespace, backup.Name)
}
}
}()

Expand Down Expand Up @@ -223,6 +225,14 @@ func (r *backupFinalizerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
return ctrl.Result{}, errors.Wrap(err, "error uploading backup final contents")
}
}
// Even though we already have patch in a defer call, we call it explicitly here so that
// if update fails, we can requeue. Prior return statements already requeue.
patched = true
if err = r.client.Patch(ctx, backup, kbclient.MergeFrom(original)); err != nil {
log.WithError(err).Error("Error updating backup")
patchErr = true
return ctrl.Result{}, err
}
return ctrl.Result{}, nil
}

Expand Down
30 changes: 25 additions & 5 deletions pkg/controller/restore_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ type restoreReconciler struct {
newPluginManager func(logger logrus.FieldLogger) clientmgmt.Manager
backupStoreGetter persistence.ObjectBackupStoreGetter
globalCrClient client.Client
failingTracker RestoreTracker
}

type backupInfo struct {
Expand Down Expand Up @@ -138,6 +139,7 @@ func NewRestoreReconciler(
kbClient: kbClient,
logger: logger,
restoreLogLevel: restoreLogLevel,
failingTracker: NewRestoreTracker(),
metrics: metrics,
logFormat: logFormat,
clock: &clock.RealClock{},
Expand Down Expand Up @@ -215,17 +217,33 @@ func (r *restoreReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct
switch restore.Status.Phase {
case "", api.RestorePhaseNew:
// only process new restores
case api.RestorePhaseInProgress:
if !r.failingTracker.Contains(restore.Namespace, restore.Name) {
log.Debug("Restore in progress, skipping")
return ctrl.Result{}, nil
}
// if restore is in progress, we should not process it again
// we want to mark it as failed to avoid it being stuck in progress
// if so, mark it as failed, last loop did not successfully complete the restore
log.Debug("Restore has in progress status from prior reconcile, marking it as failed")
failedCopy := restore.DeepCopy()
failedCopy.Status.Phase = api.RestorePhaseFailed
failedCopy.Status.FailureReason = "Restore from previous reconcile still in progress. The API Server may have been down."
if err := kubeutil.PatchResource(restore, failedCopy, r.kbClient); err != nil {
// return the error so the status can be re-processed; it's currently still not completed or failed
return ctrl.Result{}, err
}
// patch to mark it as failed succeeded, do not requeue
r.failingTracker.Delete(restore.Namespace, restore.Name)
return ctrl.Result{}, nil
default:
r.logger.WithFields(logrus.Fields{
"restore": kubeutil.NamespaceAndName(restore),
"phase": restore.Status.Phase,
}).Debug("Restore is not handled")
return ctrl.Result{}, nil
}

// store a copy of the original restore for creating patch
original := restore.DeepCopy()

// Validate the restore and fetch the backup
info, resourceModifiers := r.validateAndComplete(restore)

Expand Down Expand Up @@ -278,8 +296,10 @@ func (r *restoreReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct

if err = kubeutil.PatchResource(original, restore, r.kbClient); err != nil {
log.WithError(errors.WithStack(err)).Info("Error updating restore's final status")
// No need to re-enqueue here, because restore's already set to InProgress before.
// Controller only handle New restore.
// add to failureTracker so next reconcile fails the restore
r.failingTracker.Add(restore.Namespace, restore.Name)
// return the error so the status can be re-processed; it's currently still not completed or failed
return ctrl.Result{}, err
}

return ctrl.Result{}, nil
Expand Down
76 changes: 76 additions & 0 deletions pkg/controller/restore_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"bytes"
"context"
"io"
"syscall"
"testing"
"time"

Expand All @@ -35,6 +36,7 @@ import (
clocktesting "k8s.io/utils/clock/testing"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
kbclient "sigs.k8s.io/controller-runtime/pkg/client"

"github.com/vmware-tanzu/velero/internal/resourcemodifiers"
"github.com/vmware-tanzu/velero/internal/volume"
Expand All @@ -48,6 +50,7 @@ import (
riav2 "github.com/vmware-tanzu/velero/pkg/plugin/velero/restoreitemaction/v2"
pkgrestore "github.com/vmware-tanzu/velero/pkg/restore"
velerotest "github.com/vmware-tanzu/velero/pkg/test"
velerotestmocks "github.com/vmware-tanzu/velero/pkg/test/mocks"
"github.com/vmware-tanzu/velero/pkg/util/logging"
"github.com/vmware-tanzu/velero/pkg/util/results"
)
Expand Down Expand Up @@ -989,6 +992,79 @@ func TestMostRecentCompletedBackup(t *testing.T) {
assert.Equal(t, expected, mostRecentCompletedBackup(backups))
}

// OADP Carry: Test that restore that has status inProgress on reconcile is changed to failed if velero has memory of it failing.
func TestProcessRestoreInProgressFailOnSecondReconcile(t *testing.T) {
	tests := []struct {
		name            string
		trackedAsFailed bool // restore is present in the reconciler's failingTracker
		reconciledPhase velerov1api.RestorePhase
		expectedErr     error
		mockFailedPatch bool // simulate an unreachable API server on Patch
	}{
		{
			name:            "InProgress restore not tracked as failing is not processed",
			trackedAsFailed: false,
			reconciledPhase: velerov1api.RestorePhaseInProgress,
		},
		{
			name:            "InProgress restore tracked as failing is marked as failed",
			trackedAsFailed: true,
			reconciledPhase: velerov1api.RestorePhaseFailed,
		},
		{
			name:            "InProgress restore tracked as failing is marked as failed, if patch fails, err is returned from reconcile to retry patch again, restore still inprogress",
			trackedAsFailed: true,
			reconciledPhase: velerov1api.RestorePhaseInProgress,
			mockFailedPatch: true,
			expectedErr:     syscall.ECONNREFUSED,
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			formatFlag := logging.FormatText
			logger := logging.DefaultLogger(logrus.DebugLevel, formatFlag)
			restore := &velerov1api.Restore{
				ObjectMeta: metav1.ObjectMeta{
					Namespace: velerov1api.DefaultNamespace,
					Name:      "restore-1",
				},
				Status: velerov1api.RestoreStatus{
					Phase: velerov1api.RestorePhaseInProgress,
				},
			}

			// Default to the fake client; when simulating a patch failure, wrap it
			// in a mock whose Patch returns ECONNREFUSED but whose Get delegates
			// to the fake client so reads still succeed.
			var kclient kbclient.Client
			fakeKclient := velerotest.NewFakeControllerRuntimeClient(t, restore)
			if test.mockFailedPatch {
				mockClient := velerotestmocks.NewClient(t)
				mockClient.On("Patch", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(syscall.ECONNREFUSED).Once()
				mockClient.On("Get", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(func(ctx context.Context, key client.ObjectKey, obj client.Object, opts ...kbclient.GetOption) error {
					return fakeKclient.Get(ctx, key, obj, opts...)
				})
				kclient = mockClient
			} else {
				kclient = fakeKclient
			}
			c := &restoreReconciler{
				kbClient:       kclient,
				logger:         logger,
				failingTracker: NewRestoreTracker(),
			}
			if test.trackedAsFailed {
				c.failingTracker.Add(restore.Namespace, restore.Name)
			}
			_, err := c.Reconcile(ctx, ctrl.Request{NamespacedName: types.NamespacedName{Namespace: restore.Namespace, Name: restore.Name}})
			assert.Equal(t, test.expectedErr, err)
			reconciledRestore := velerov1api.Restore{}
			// require (not assert) so a failed Get aborts the subtest instead of
			// falling through and comparing against a zero-value phase.
			require.NoError(t, c.kbClient.Get(context.Background(), types.NamespacedName{Namespace: restore.Namespace, Name: restore.Name}, &reconciledRestore))
			assert.Equal(t, test.reconciledPhase, reconciledRestore.Status.Phase)
		})
	}
}

func NewRestore(ns, name, backup, includeNS, includeResource string, phase velerov1api.RestorePhase) *builder.RestoreBuilder {
restore := builder.ForRestore(ns, name).Phase(phase).Backup(backup).ItemOperationTimeout(60 * time.Minute)

Expand Down
10 changes: 10 additions & 0 deletions pkg/controller/restore_tracker.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
package controller

// RestoreTracker tracks restores by namespace/name. It shares the
// BackupTracker interface, since both trackers store the same kind of
// namespace/name keys.
type RestoreTracker interface {
	BackupTracker
}

// NewRestoreTracker returns a new RestoreTracker.
// It is backed by the same implementation as NewBackupTracker.
func NewRestoreTracker() RestoreTracker {
	return NewBackupTracker()
}
Loading

0 comments on commit c035379

Please sign in to comment.