Skip to content

Commit

Permalink
fix irrecoverable errors in async operations
Browse files Browse the repository at this point in the history
Reconciliations involving asynchronous operations can fall into a loop
where an eventual "Failed" result can block future reconciliations from
making any further changes to that particular resource to fix the
problem.

This change sets `longRunningOperationStates` for agent pools on the
corresponding AzureManagedMachinePool instead of the
AzureManagedControlPlane, since changes to that resource were not being
persisted. It also only blocks starting new operations on the status of
existing operations of the same type. In-progress PUT operations will no
longer block new DELETEs and in-progress DELETEs will not block
in-progress PUTs.

In cases where polling a future from the Azure API would eventually
return both `isDone==true` and a non-nil error, a "failed checking if
the operation was complete" message would be logged and the error would
refer to the ultimate failure unrelated to polling the future itself.
This change treats all `isDone==true` polling checks as successful and
relies on the operation's error to be captured in the future's `Result`.
The future will always be deleted from the status when the operation is
done so it can be retried the next reconciliation if it failed.
  • Loading branch information
nojnhuh committed Sep 27, 2022
1 parent d28ff9a commit cf3e347
Show file tree
Hide file tree
Showing 60 changed files with 248 additions and 319 deletions.
2 changes: 1 addition & 1 deletion azure/interfaces.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ type ClusterDescriber interface {
// AsyncStatusUpdater is an interface used to keep track of long running operations in Status that has Conditions and Futures.
type AsyncStatusUpdater interface {
SetLongRunningOperationState(*infrav1.Future)
GetLongRunningOperationState(string, string) *infrav1.Future
GetLongRunningOperationState(string, string, string) *infrav1.Future
DeleteLongRunningOperationState(string, string)
UpdatePutStatus(clusterv1.ConditionType, string, error)
UpdateDeleteStatus(clusterv1.ConditionType, string, error)
Expand Down
8 changes: 4 additions & 4 deletions azure/mock_azure/azure_mock.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions azure/scope/cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -914,8 +914,8 @@ func (s *ClusterScope) SetLongRunningOperationState(future *infrav1.Future) {
}

// GetLongRunningOperationState will get the future on the AzureCluster status.
func (s *ClusterScope) GetLongRunningOperationState(name, service string) *infrav1.Future {
return futures.Get(s.AzureCluster, name, service)
func (s *ClusterScope) GetLongRunningOperationState(name, service, futureType string) *infrav1.Future {
return futures.Get(s.AzureCluster, name, service, futureType)
}

// DeleteLongRunningOperationState will delete the future from the AzureCluster status.
Expand Down
10 changes: 5 additions & 5 deletions azure/scope/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -359,9 +359,9 @@ func (m *MachineScope) Subnet() infrav1.SubnetSpec {

// AvailabilityZone returns the AzureMachine Availability Zone.
// Priority for selecting the AZ is
// 1) Machine.Spec.FailureDomain
// 2) AzureMachine.Spec.FailureDomain (This is to support deprecated AZ)
// 3) No AZ
// 1. Machine.Spec.FailureDomain
// 2. AzureMachine.Spec.FailureDomain (This is to support deprecated AZ)
// 3. No AZ
func (m *MachineScope) AvailabilityZone() string {
if m.Machine.Spec.FailureDomain != nil {
return *m.Machine.Spec.FailureDomain
Expand Down Expand Up @@ -687,8 +687,8 @@ func (m *MachineScope) SetLongRunningOperationState(future *infrav1.Future) {
}

// GetLongRunningOperationState will get the future on the AzureMachine status.
func (m *MachineScope) GetLongRunningOperationState(name, service string) *infrav1.Future {
return futures.Get(m.AzureMachine, name, service)
func (m *MachineScope) GetLongRunningOperationState(name, service, futureType string) *infrav1.Future {
return futures.Get(m.AzureMachine, name, service, futureType)
}

// DeleteLongRunningOperationState will delete the future from the AzureMachine status.
Expand Down
8 changes: 5 additions & 3 deletions azure/scope/machinepool.go
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,9 @@ func (m *MachinePoolScope) applyAzureMachinePoolMachines(ctx context.Context) er
return nil
}

if futures.Has(m.AzureMachinePool, m.Name(), ScalesetsServiceName) {
if futures.Has(m.AzureMachinePool, m.Name(), ScalesetsServiceName, infrav1.PatchFuture) ||
futures.Has(m.AzureMachinePool, m.Name(), ScalesetsServiceName, infrav1.PutFuture) ||
futures.Has(m.AzureMachinePool, m.Name(), ScalesetsServiceName, infrav1.DeleteFuture) {
log.V(4).Info("exiting early due an in-progress long running operation on the ScaleSet")
// exit early to be less greedy about delete
return nil
Expand Down Expand Up @@ -377,8 +379,8 @@ func (m *MachinePoolScope) SetLongRunningOperationState(future *infrav1.Future)
}

// GetLongRunningOperationState will get the future on the AzureMachinePool status.
func (m *MachinePoolScope) GetLongRunningOperationState(name, service string) *infrav1.Future {
return futures.Get(m.AzureMachinePool, name, service)
func (m *MachinePoolScope) GetLongRunningOperationState(name, service, futureType string) *infrav1.Future {
return futures.Get(m.AzureMachinePool, name, service, futureType)
}

// DeleteLongRunningOperationState will delete the future from the AzureMachinePool status.
Expand Down
4 changes: 2 additions & 2 deletions azure/scope/machinepoolmachine.go
Original file line number Diff line number Diff line change
Expand Up @@ -170,8 +170,8 @@ func (s *MachinePoolMachineScope) SetLongRunningOperationState(future *infrav1.F
}

// GetLongRunningOperationState will get the future on the AzureMachinePoolMachine status.
func (s *MachinePoolMachineScope) GetLongRunningOperationState(name, service string) *infrav1.Future {
return futures.Get(s.AzureMachinePoolMachine, name, service)
func (s *MachinePoolMachineScope) GetLongRunningOperationState(name, service, futureType string) *infrav1.Future {
return futures.Get(s.AzureMachinePoolMachine, name, service, futureType)
}

// DeleteLongRunningOperationState will delete the future from the AzureMachinePoolMachine status.
Expand Down
4 changes: 2 additions & 2 deletions azure/scope/managedcontrolplane.go
Original file line number Diff line number Diff line change
Expand Up @@ -561,8 +561,8 @@ func (s *ManagedControlPlaneScope) SetLongRunningOperationState(future *infrav1.
}

// GetLongRunningOperationState will get the future on the AzureManagedControlPlane status.
func (s *ManagedControlPlaneScope) GetLongRunningOperationState(name, service string) *infrav1.Future {
return futures.Get(s.ControlPlane, name, service)
func (s *ManagedControlPlaneScope) GetLongRunningOperationState(name, service, futureType string) *infrav1.Future {
return futures.Get(s.ControlPlane, name, service, futureType)
}

// DeleteLongRunningOperationState will delete the future from the AzureManagedControlPlane status.
Expand Down
14 changes: 7 additions & 7 deletions azure/scope/managedmachinepool.go
Original file line number Diff line number Diff line change
Expand Up @@ -221,20 +221,20 @@ func (s *ManagedMachinePoolScope) SetAgentPoolReady(ready bool) {
s.InfraMachinePool.Status.Ready = ready
}

// SetLongRunningOperationState will set the future on the AzureManagedControlPlane status to allow the resource to continue
// SetLongRunningOperationState will set the future on the AzureManagedMachinePool status to allow the resource to continue
// in the next reconciliation.
func (s *ManagedMachinePoolScope) SetLongRunningOperationState(future *infrav1.Future) {
futures.Set(s.ControlPlane, future)
futures.Set(s.InfraMachinePool, future)
}

// GetLongRunningOperationState will get the future on the AzureManagedControlPlane status.
func (s *ManagedMachinePoolScope) GetLongRunningOperationState(name, service string) *infrav1.Future {
return futures.Get(s.ControlPlane, name, service)
// GetLongRunningOperationState will get the future on the AzureManagedMachinePool status.
func (s *ManagedMachinePoolScope) GetLongRunningOperationState(name, service, futureType string) *infrav1.Future {
return futures.Get(s.InfraMachinePool, name, service, futureType)
}

// DeleteLongRunningOperationState will delete the future from the AzureManagedControlPlane status.
// DeleteLongRunningOperationState will delete the future from the AzureManagedMachinePool status.
func (s *ManagedMachinePoolScope) DeleteLongRunningOperationState(name, service string) {
futures.Delete(s.ControlPlane, name, service)
futures.Delete(s.InfraMachinePool, name, service)
}

// UpdateDeleteStatus updates a condition on the AzureManagedControlPlane status after a DELETE operation.
Expand Down
7 changes: 1 addition & 6 deletions azure/services/agentpools/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -131,12 +131,7 @@ func (ac *azureClient) IsDone(ctx context.Context, future azureautorest.FutureAP
ctx, _, done := tele.StartSpanWithLogger(ctx, "agentpools.azureClient.IsDone")
defer done()

isDone, err = future.DoneWithContext(ctx, ac.agentpools)
if err != nil {
return false, errors.Wrap(err, "failed checking if the operation was complete")
}

return isDone, nil
return future.DoneWithContext(ctx, ac.agentpools)
}

// Result fetches the result of a long-running operation future.
Expand Down
8 changes: 4 additions & 4 deletions azure/services/agentpools/mock_agentpools/agentpools_mock.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

43 changes: 25 additions & 18 deletions azure/services/async/async.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,11 @@ func New(scope FutureScope, createClient Creator, deleteClient Deleter) *Service

// processOngoingOperation is a helper function that will process an ongoing operation to check if it is done.
// If it is not done, it will return a transient error.
func processOngoingOperation(ctx context.Context, scope FutureScope, client FutureHandler, resourceName string, serviceName string) (result interface{}, err error) {
func processOngoingOperation(ctx context.Context, scope FutureScope, client FutureHandler, resourceName string, serviceName string, futureType string) (result interface{}, err error) {
ctx, log, done := tele.StartSpanWithLogger(ctx, "async.Service.processOngoingOperation")
defer done()

future := scope.GetLongRunningOperationState(resourceName, serviceName)
future := scope.GetLongRunningOperationState(resourceName, serviceName, futureType)
if future == nil {
log.V(2).Info("no long running operation found", "service", serviceName, "resource", resourceName)
return nil, nil
Expand All @@ -66,26 +66,31 @@ func processOngoingOperation(ctx context.Context, scope FutureScope, client Futu
}

isDone, err := client.IsDone(ctx, sdkFuture)
if err != nil {
return nil, errors.Wrap(err, "failed checking if the operation was complete")
}

// Assume that if isDone is true, then we successfully checked that the
// operation was complete even if err is non-nil. Assume the error in that
// case is unrelated and will be captured in Result below.
if !isDone {
if err != nil {
return nil, errors.Wrap(err, "failed checking if the operation was complete")
}

// Operation is still in progress, update conditions and requeue.
log.V(2).Info("long running operation is still ongoing", "service", serviceName, "resource", resourceName)
return nil, azure.WithTransientError(azure.NewOperationNotDoneError(future), retryAfter(sdkFuture))
}
if err != nil {
log.V(2).Error(err, "error checking long running operation status after it finished")
}

// Once the operation is done, we can delete the long running operation state.
// If the operation failed, this will allow it to be retried during the next reconciliation.
// If the resource is not found, we also reset the long-running operation state so we can attempt to create it again.
// This can happen if the resource was deleted by another process before we could get the result.
scope.DeleteLongRunningOperationState(resourceName, serviceName)

// Resource has been created/deleted/updated.
log.V(2).Info("long running operation has completed", "service", serviceName, "resource", resourceName)
result, err = client.Result(ctx, sdkFuture, future.Type)
if err == nil || azure.ResourceNotFound(err) {
// Once we have the result, we can delete the long running operation state.
// If the resource is not found, we also reset the long-running operation state so we can attempt to create it again.
// This can happen if the resource was deleted by another process before we could get the result.
scope.DeleteLongRunningOperationState(resourceName, serviceName)
}
return result, err
return client.Result(ctx, sdkFuture, future.Type)
}

// CreateResource implements the logic for creating a resource Asynchronously.
Expand All @@ -95,11 +100,12 @@ func (s *Service) CreateResource(ctx context.Context, spec azure.ResourceSpecGet

resourceName := spec.ResourceName()
rgName := spec.ResourceGroupName()
futureType := infrav1.PutFuture

// Check if there is an ongoing long running operation.
future := s.Scope.GetLongRunningOperationState(resourceName, serviceName)
future := s.Scope.GetLongRunningOperationState(resourceName, serviceName, futureType)
if future != nil {
return processOngoingOperation(ctx, s.Scope, s.Creator, resourceName, serviceName)
return processOngoingOperation(ctx, s.Scope, s.Creator, resourceName, serviceName, futureType)
}

// Get the resource if it already exists, and use it to construct the desired resource parameters.
Expand Down Expand Up @@ -146,11 +152,12 @@ func (s *Service) DeleteResource(ctx context.Context, spec azure.ResourceSpecGet

resourceName := spec.ResourceName()
rgName := spec.ResourceGroupName()
futureType := infrav1.DeleteFuture

// Check if there is an ongoing long running operation.
future := s.Scope.GetLongRunningOperationState(resourceName, serviceName)
future := s.Scope.GetLongRunningOperationState(resourceName, serviceName, futureType)
if future != nil {
_, err := processOngoingOperation(ctx, s.Scope, s.Deleter, resourceName, serviceName)
_, err := processOngoingOperation(ctx, s.Scope, s.Deleter, resourceName, serviceName, futureType)
return err
}

Expand Down
Loading

0 comments on commit cf3e347

Please sign in to comment.