Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bubble up pod schedule errors to revision status #4191

Merged
merged 1 commit into from
Jun 3, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions pkg/apis/serving/v1alpha1/revision_lifecycle.go
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,12 @@ func (rs *RevisionStatus) MarkResourcesAvailable() {
revCondSet.Manage(rs).MarkTrue(RevisionConditionResourcesAvailable)
}

// MarkResourcesUnavailable changes "ResourcesAvailable" condition to false to reflect that the
// resources of the given kind and name cannot be created.
func (rs *RevisionStatus) MarkResourcesUnavailable(reason, message string) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Golint comments: exported method RevisionStatus.MarkResourcesUnavailable should have comment or be unexported. More info.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please fix this as well.

revCondSet.Manage(rs).MarkFalse(RevisionConditionResourcesAvailable, reason, message)
}

func (rs *RevisionStatus) MarkActive() {
revCondSet.Manage(rs).MarkTrue(RevisionConditionActive)
}
Expand Down
19 changes: 19 additions & 0 deletions pkg/apis/serving/v1alpha1/revision_lifecycle_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,25 @@ func TestRevisionNotOwnedStuff(t *testing.T) {
}
}

func TestRevisionResourcesUnavailable(t *testing.T) {
r := &RevisionStatus{}
r.InitializeConditions()
apitest.CheckConditionOngoing(r.duck(), RevisionConditionResourcesAvailable, t)
apitest.CheckConditionOngoing(r.duck(), RevisionConditionContainerHealthy, t)
apitest.CheckConditionOngoing(r.duck(), RevisionConditionReady, t)

const wantReason, wantMessage = "unschedulable", "insufficient energy"
r.MarkResourcesUnavailable(wantReason, wantMessage)
apitest.CheckConditionFailed(r.duck(), RevisionConditionResourcesAvailable, t)
apitest.CheckConditionFailed(r.duck(), RevisionConditionReady, t)
if got := r.GetCondition(RevisionConditionResourcesAvailable); got == nil || got.Reason != wantReason {
t.Errorf("RevisionConditionResourcesAvailable = %v, want %v", got, wantReason)
}
if got := r.GetCondition(RevisionConditionResourcesAvailable); got == nil || got.Message != wantMessage {
t.Errorf("RevisionConditionResourcesAvailable = %v, want %v", got, wantMessage)
}
}

func TestRevisionGetGroupVersionKind(t *testing.T) {
r := &Revision{}
want := schema.GroupVersionKind{
Expand Down
9 changes: 9 additions & 0 deletions pkg/reconciler/revision/reconcile_resources.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,15 @@ func (c *Reconciler) reconcileDeployment(ctx context.Context, rev *v1alpha1.Revi
// Arbitrarily grab the very first pod, as they all should be crashing
pod := pods.Items[0]

// Update the revision status if pod cannot be scheduled(possibly resource constraints)
// If pod cannot be scheduled then we expect the container status to be empty.
for _, cond := range pod.Status.Conditions {
if cond.Type == corev1.PodScheduled && cond.Status == corev1.ConditionFalse {
rev.Status.MarkResourcesUnavailable(cond.Reason, cond.Message)
break
}
}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we care about overriding this status with the statuses below if they all happen?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean updating revision status if resources are present on nodes?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NM, I think it's fine. But what I meant is code below would override the status value. But we don't have idea of priorities, so I guess it doesn't matter here.

for _, status := range pod.Status.ContainerStatuses {
if status.Name == resources.UserContainerName {
if t := status.LastTerminationState.Terminated; t != nil {
Expand Down
18 changes: 18 additions & 0 deletions pkg/reconciler/revision/table_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -456,6 +456,24 @@ func TestReconcile(t *testing.T) {
WithLogURL, AllUnknownConditions, MarkContainerExiting(5, "I failed man!")),
}},
Key: "foo/pod-error",
}, {
Name: "surface pod schedule errors",
// Test the propagation of the scheduling errors of Pod into the revision.
// This initializes the world to unschedule pod. It then verifies
// that Reconcile propagates this into the status of the Revision.
Objects: []runtime.Object{
rev("foo", "pod-schedule-error",
withK8sServiceName("a-pod-schedule-error"), WithLogURL, AllUnknownConditions, MarkActive),
kpa("foo", "pod-schedule-error"), // PA can't be ready, since no traffic.
pod("foo", "pod-schedule-error", WithUnschedulableContainer("Insufficient energy", "Unschedulable")),
deploy("foo", "pod-schedule-error"),
image("foo", "pod-schedule-error"),
},
WantStatusUpdates: []clientgotesting.UpdateActionImpl{{
Object: rev("foo", "pod-schedule-error",
WithLogURL, AllUnknownConditions, MarkResourcesUnavailable("Insufficient energy", "Unschedulable")),
}},
Key: "foo/pod-schedule-error",
}, {
Name: "ready steady state",
// Test the transition that Reconcile makes when Endpoints become ready on the
Expand Down
20 changes: 20 additions & 0 deletions pkg/reconciler/testing/functional.go
Original file line number Diff line number Diff line change
Expand Up @@ -903,6 +903,13 @@ func MarkContainerExiting(exitCode int32, message string) RevisionOption {
}
}

// MarkResourcesUnavailable calls .Status.MarkResourcesUnavailable on the Revision.
func MarkResourcesUnavailable(reason, message string) RevisionOption {
return func(r *v1alpha1.Revision) {
r.Status.MarkResourcesUnavailable(reason, message)
}
}

// MarkRevisionReady calls the necessary helpers to make the Revision Ready=True.
func MarkRevisionReady(r *v1alpha1.Revision) {
WithInitRevConditions(r)
Expand Down Expand Up @@ -1109,6 +1116,19 @@ func WithFailingContainer(name string, exitCode int, message string) PodOption {
}
}

// WithUnschedulableContainer sets the .Status.Conditionss on the pod to
// include `PodScheduled` status to `False` with the given message and reason.
func WithUnschedulableContainer(reason, message string) PodOption {
return func(pod *corev1.Pod) {
pod.Status.Conditions = []corev1.PodCondition{{
Type: corev1.PodScheduled,
Reason: reason,
Message: message,
Status: corev1.ConditionFalse,
}}
}
}

// ClusterIngressOption enables further configuration of the Cluster Ingress.
type ClusterIngressOption func(*netv1alpha1.ClusterIngress)

Expand Down
117 changes: 117 additions & 0 deletions test/e2e/pod_schedule_error_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
// +build e2e

/*
Copyright 2019 The Knative Authors

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package e2e
Copy link
Contributor

@jonjohnsonjr jonjohnsonjr Jun 3, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Needs:

// +build e2e

/*
Copyright 2019 The Knative Authors

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you so much


import (
"fmt"
"strings"
"testing"

"github.com/knative/serving/pkg/apis/serving/v1alpha1"
serviceresourcenames "github.com/knative/serving/pkg/reconciler/service/resources/names"
"github.com/knative/serving/test"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func TestPodScheduleError(t *testing.T) {
clients := Setup(t)
const (
errorReason = "RevisionFailed"
errorMsg = "nodes are available"
revisionReason = "Unschedulable"
)
names := test.ResourceNames{
Service: test.ObjectNameForTest(t),
Image: "helloworld",
}

defer test.TearDown(clients, names)
test.CleanupOnInterrupt(func() { test.TearDown(clients, names) })

t.Logf("Creating a new Service %s", names.Image)
resources := corev1.ResourceRequirements{
Limits: corev1.ResourceList{
corev1.ResourceCPU: resource.MustParse("50000"),
},
Requests: corev1.ResourceList{
corev1.ResourceMemory: resource.MustParse("350Mi"),
},
}
var (
svc *v1alpha1.Service
err error
)
if svc, err = test.CreateLatestService(t, clients, names, &test.Options{ContainerResources: resources}); err != nil {
t.Fatalf("Failed to create Service %s: %v", names.Service, err)
}

names.Config = serviceresourcenames.Configuration(svc)

err = test.WaitForServiceState(clients.ServingClient, names.Service, func(r *v1alpha1.Service) (bool, error) {
cond := r.Status.GetCondition(v1alpha1.ConfigurationConditionReady)
if cond != nil && !cond.IsUnknown() {
if strings.Contains(cond.Message, errorMsg) && cond.IsFalse() {
return true, nil
}
t.Logf("Reason: %s ; Message: %s ; Status: %s", cond.Reason, cond.Message, cond.Status)
return true, fmt.Errorf("the service %s was not marked with expected error condition (Reason=\"%s\", Message=\"%s\", Status=\"%s\"), but with (Reason=\"%s\", Message=\"%s\", Status=\"%s\")",
names.Config, errorReason, errorMsg, "False", cond.Reason, cond.Message, cond.Status)
}
return false, nil
}, "ContainerUnscheduleable")

if err != nil {
t.Fatalf("Failed to validate service state: %s", err)
}

revisionName, err := revisionFromConfiguration(clients, names.Config)
if err != nil {
t.Fatalf("Failed to get revision from configuration %s: %v", names.Config, err)
}

t.Log("When the containers are not scheduled, the revision should have error status.")
err = test.WaitForRevisionState(clients.ServingClient, revisionName, func(r *v1alpha1.Revision) (bool, error) {
cond := r.Status.GetCondition(v1alpha1.RevisionConditionReady)
if cond != nil {
if cond.Reason == revisionReason && strings.Contains(cond.Message, errorMsg) {
return true, nil
}
return true, fmt.Errorf("the revision %s was not marked with expected error condition (Reason=%q, Message=%q), but with (Reason=%q, Message=%q)",
revisionName, revisionReason, errorMsg, cond.Reason, cond.Message)
}
return false, nil
}, errorReason)

if err != nil {
t.Fatalf("Failed to validate revision state: %s", err)
}
}

// Get revision name from configuration.
func revisionFromConfiguration(clients *test.Clients, configName string) (string, error) {
config, err := clients.ServingClient.Configs.Get(configName, metav1.GetOptions{})
if err != nil {
return "", err
}
if config.Status.LatestCreatedRevisionName != "" {
return config.Status.LatestCreatedRevisionName, nil
}
return "", fmt.Errorf("no valid revision name found in configuration %s", configName)
}