Skip to content

Commit

Permalink
Mark InProgress backup/restore as failed upon requeuing
Browse files Browse the repository at this point in the history
Signed-off-by: Tiger Kaovilai <tkaovila@redhat.com>
  • Loading branch information
kaovilai committed Jun 5, 2024
1 parent a8d77ea commit b17981a
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 3 deletions.
6 changes: 6 additions & 0 deletions pkg/apis/velero/v1/labels_annotations.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ const (
// BackupUIDLabel is the label key used to identify a backup by uid.
BackupUIDLabel = "velero.io/backup-uid"

// BackupControllerUID is the label key used to identify the controller UID that processed the backup.
BackupControllerUID = "velero.io/backup-controller-uid"

// RestoreNameLabel is the label key used to identify a restore by name.
RestoreNameLabel = "velero.io/restore-name"

Expand All @@ -32,6 +35,9 @@ const (
// RestoreUIDLabel is the label key used to identify a restore by uid.
RestoreUIDLabel = "velero.io/restore-uid"

// RestoreControllerUID is the label key used to identify the controller UID that processed the restore.
RestoreControllerUID = "velero.io/restore-controller-uid"

// PodUIDLabel is the label key used to identify a pod by uid.
PodUIDLabel = "velero.io/pod-uid"

Expand Down
28 changes: 25 additions & 3 deletions pkg/controller/backup_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@ import (
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
apimachinerytypes "k8s.io/apimachinery/pkg/types"
kerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/apimachinery/pkg/util/uuid"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/utils/clock"
ctrl "sigs.k8s.io/controller-runtime"
Expand Down Expand Up @@ -86,6 +88,7 @@ type backupReconciler struct {
maxConcurrentK8SConnections int
defaultSnapshotMoveData bool
globalCRClient kbclient.Client
uuid apimachinerytypes.UID
}

func NewBackupReconciler(
Expand Down Expand Up @@ -136,6 +139,7 @@ func NewBackupReconciler(
maxConcurrentK8SConnections: maxConcurrentK8SConnections,
defaultSnapshotMoveData: defaultSnapshotMoveData,
globalCRClient: globalCRClient,
uuid: uuid.NewUUID(),
}
b.updateTotalBackupMetric()
return b
Expand Down Expand Up @@ -230,6 +234,22 @@ func (b *backupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
switch original.Status.Phase {
case "", velerov1api.BackupPhaseNew:
// only process new backups
case velerov1api.BackupPhaseInProgress:
// If the backup is already InProgress on reconcile, do not process it again;
// instead mark it as failed so it does not stay stuck in progress.
// If this backupReconciler's uuid matches the controller UID recorded on the
// backup, this instance set it InProgress and the previous reconcile loop did
// not complete the backup.
// NOTE(review): the intent reads "if so, mark it as failed", but only the
// debug log below is inside the uuid check — the failure marking runs
// unconditionally. Confirm whether marking failed should be gated on the
// uuid match (e.g. a backup InProgress under a different controller UID).
if original.Annotations[velerov1api.BackupControllerUID] == string(b.uuid) {
log.Debug("Backup was processed by this backupReconciler and still has in progress status upon reconcile, marking it as failed")
}
failedCopy := original.DeepCopy()
failedCopy.Status.Phase = velerov1api.BackupPhaseFailed
failedCopy.Status.FailureReason = "Backup from previous reconcile still in progress"
if err := kubeutil.PatchResource(original, failedCopy, b.kbClient); err != nil {
return ctrl.Result{RequeueAfter: 5 * time.Second}, errors.Wrapf(err, "error updating Backup status to %s", failedCopy.Status.Phase)
}
// patch to mark it as failed succeeded, do not requeue
return ctrl.Result{}, nil
default:
b.logger.WithFields(logrus.Fields{
"backup": kubeutil.NamespaceAndName(original),
Expand All @@ -246,8 +266,9 @@ func (b *backupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
request.Status.Phase = velerov1api.BackupPhaseInProgress
request.Status.StartTimestamp = &metav1.Time{Time: b.clock.Now()}
}

// update status
// add uuid to backup annotations
request.Annotations[velerov1api.BackupControllerUID] = string(b.uuid)
// update status and annotations
if err := kubeutil.PatchResource(original, request.Backup, b.kbClient); err != nil {
return ctrl.Result{}, errors.Wrapf(err, "error updating Backup status to %s", request.Status.Phase)
}
Expand Down Expand Up @@ -308,7 +329,8 @@ func (b *backupReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctr
if err := kubeutil.PatchResource(original, request.Backup, b.kbClient); err != nil {
log.WithError(err).Error("error updating backup's final status")
}
return ctrl.Result{}, nil
// Requeue to re-check the backup's status after patching; if the status is
// still InProgress on the next reconcile, the backup will be marked failed.
return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
}

func (b *backupReconciler) prepareBackupRequest(backup *velerov1api.Backup, logger logrus.FieldLogger) *pkgbackup.Request {
Expand Down

0 comments on commit b17981a

Please sign in to comment.