Skip to content

Commit

Permalink
add initial grace period before triggering a failure due to missing c…
Browse files Browse the repository at this point in the history
…omponents (project-codeflare#273)
  • Loading branch information
dgrove-oss authored Nov 27, 2024
1 parent ba18866 commit 256dbab
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 12 deletions.
25 changes: 15 additions & 10 deletions internal/controller/appwrapper/appwrapper_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -268,21 +268,26 @@ func (r *AppWrapperReconciler) Reconcile(ctx context.Context, req ctrl.Request)
}

// Detect externally deleted components and transition to Failed with no GracePeriod or retry
detailMsg := fmt.Sprintf("Only found %v deployed components, but was expecting %v", compStatus.deployed, compStatus.expected)
if compStatus.deployed != compStatus.expected {
meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
Type: string(workloadv1beta2.Unhealthy),
Status: metav1.ConditionTrue,
Reason: "MissingComponent",
Message: detailMsg,
})
r.Recorder.Event(aw, v1.EventTypeNormal, string(workloadv1beta2.Unhealthy), "MissingComponent: "+detailMsg)
return ctrl.Result{}, r.transitionToPhase(ctx, orig, aw, workloadv1beta2.AppWrapperFailed)
// There may be a lag before created resources become visible in the cache; don't react too quickly.
whenDeployed := meta.FindStatusCondition(aw.Status.Conditions, string(workloadv1beta2.ResourcesDeployed)).LastTransitionTime
graceDuration := r.admissionGraceDuration(ctx, aw)
if time.Now().After(whenDeployed.Add(graceDuration)) {
detailMsg := fmt.Sprintf("Only found %v deployed components, but was expecting %v", compStatus.deployed, compStatus.expected)
meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
Type: string(workloadv1beta2.Unhealthy),
Status: metav1.ConditionTrue,
Reason: "MissingComponent",
Message: detailMsg,
})
r.Recorder.Event(aw, v1.EventTypeNormal, string(workloadv1beta2.Unhealthy), "MissingComponent: "+detailMsg)
return ctrl.Result{}, r.transitionToPhase(ctx, orig, aw, workloadv1beta2.AppWrapperFailed)
}
}

// If a component's controller has put it into a failed state, we do not need
// to allow a grace period. The situation will not self-correct.
detailMsg = fmt.Sprintf("Found %v failed components", compStatus.failed)
detailMsg := fmt.Sprintf("Found %v failed components", compStatus.failed)
if compStatus.failed > 0 {
meta.SetStatusCondition(&aw.Status.Conditions, metav1.Condition{
Type: string(workloadv1beta2.Unhealthy),
Expand Down
9 changes: 7 additions & 2 deletions test/e2e/appwrapper_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -297,8 +297,13 @@ var _ = Describe("AppWrapper E2E Test", func() {
Expect(aw.Status.Retries).Should(Equal(int32(2)))
})

It("Deleting a Running Component yields a failed AppWrapper", func() {
aw := createAppWrapper(ctx, pytorchjob(2, 500))
It("Deleting a Running Component yields a failed AppWrapper", Label("slow"), func() {
aw := toAppWrapper(pytorchjob(2, 500))
if aw.Annotations == nil {
aw.Annotations = make(map[string]string)
}
aw.Annotations[workloadv1beta2.AdmissionGracePeriodDurationAnnotation] = "5s"
Expect(getClient(ctx).Create(ctx, aw)).To(Succeed())
appwrappers = append(appwrappers, aw)
Eventually(AppWrapperPhase(ctx, aw), 60*time.Second).Should(Equal(workloadv1beta2.AppWrapperRunning))
aw = getAppWrapper(ctx, types.NamespacedName{Name: aw.Name, Namespace: aw.Namespace})
Expand Down

0 comments on commit 256dbab

Please sign in to comment.