Skip to content
This repository has been archived by the owner on Mar 9, 2022. It is now read-only.

Commit

Permalink
Merge pull request #1133 from Random-Liu/use-wait
Browse files Browse the repository at this point in the history
Use wait instead of `TaskExit`.
  • Loading branch information
Random-Liu committed Apr 18, 2019
2 parents f207148 + d1f9611 commit a5c5d55
Show file tree
Hide file tree
Showing 18 changed files with 387 additions and 263 deletions.
3 changes: 3 additions & 0 deletions pkg/server/container_remove.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,9 @@ func setContainerRemoving(container containerstore.Container) error {
if status.State() == runtime.ContainerState_CONTAINER_UNKNOWN {
return status, errors.New("container state is unknown, to stop first")
}
if status.Starting {
return status, errors.New("container is in starting state, can't be removed")
}
if status.Removing {
return status, errors.New("container is already in removing state")
}
Expand Down
9 changes: 9 additions & 0 deletions pkg/server/container_remove_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,13 @@ func TestSetContainerRemoving(t *testing.T) {
},
expectErr: true,
},
"should return error when container is in starting state": {
status: containerstore.Status{
CreatedAt: time.Now().UnixNano(),
Starting: true,
},
expectErr: true,
},
"should return error when container is in removing state": {
status: containerstore.Status{
CreatedAt: time.Now().UnixNano(),
Expand Down Expand Up @@ -71,6 +78,8 @@ func TestSetContainerRemoving(t *testing.T) {
} else {
assert.NoError(t, err)
assert.True(t, container.Status.Get().Removing, "removing should be set")
assert.NoError(t, resetContainerRemoving(container))
assert.False(t, container.Status.Get().Removing, "removing should be reset")
}
}
}
114 changes: 72 additions & 42 deletions pkg/server/container_start.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,64 +38,48 @@ import (

// StartContainer starts the container.
func (c *criService) StartContainer(ctx context.Context, r *runtime.StartContainerRequest) (retRes *runtime.StartContainerResponse, retErr error) {
container, err := c.containerStore.Get(r.GetContainerId())
cntr, err := c.containerStore.Get(r.GetContainerId())
if err != nil {
return nil, errors.Wrapf(err, "an error occurred when try to find container %q", r.GetContainerId())
}

var startErr error
// update container status in one transaction to avoid race with event monitor.
if err := container.Status.UpdateSync(func(status containerstore.Status) (containerstore.Status, error) {
// Always apply status change no matter startContainer fails or not. Because startContainer
// may change container state no matter it fails or succeeds.
startErr = c.startContainer(ctx, container, &status)
return status, nil
}); startErr != nil {
return nil, startErr
} else if err != nil {
return nil, errors.Wrapf(err, "failed to update container %q metadata", container.ID)
}
return &runtime.StartContainerResponse{}, nil
}

// startContainer actually starts the container. The function needs to be run in one transaction. Any updates
// to the status passed in will be applied no matter the function returns error or not.
func (c *criService) startContainer(ctx context.Context,
cntr containerstore.Container,
status *containerstore.Status) (retErr error) {
id := cntr.ID
meta := cntr.Metadata
container := cntr.Container
config := meta.Config

// Return error if container is not in created state.
if status.State() != runtime.ContainerState_CONTAINER_CREATED {
return errors.Errorf("container %q is in %s state", id, criContainerStateToString(status.State()))
}
// Do not start the container when there is a removal in progress.
if status.Removing {
return errors.Errorf("container %q is in removing state", id)
// Set starting state to prevent other start/remove operations against this container
// while it's being started.
if err := setContainerStarting(cntr); err != nil {
return nil, errors.Wrapf(err, "failed to set starting state for container %q", id)
}

defer func() {
if retErr != nil {
// Set container to exited if fail to start.
status.Pid = 0
status.FinishedAt = time.Now().UnixNano()
status.ExitCode = errorStartExitCode
status.Reason = errorStartReason
status.Message = retErr.Error()
if err := cntr.Status.UpdateSync(func(status containerstore.Status) (containerstore.Status, error) {
status.Pid = 0
status.FinishedAt = time.Now().UnixNano()
status.ExitCode = errorStartExitCode
status.Reason = errorStartReason
status.Message = retErr.Error()
return status, nil
}); err != nil {
logrus.WithError(err).Errorf("failed to set start failure state for container %q", id)
}
}
if err := resetContainerStarting(cntr); err != nil {
logrus.WithError(err).Errorf("failed to reset starting state for container %q", id)
}
}()

// Get sandbox config from sandbox store.
sandbox, err := c.sandboxStore.Get(meta.SandboxID)
if err != nil {
return errors.Wrapf(err, "sandbox %q not found", meta.SandboxID)
return nil, errors.Wrapf(err, "sandbox %q not found", meta.SandboxID)
}
sandboxID := meta.SandboxID
if sandbox.Status.Get().State != sandboxstore.StateReady {
return errors.Errorf("sandbox container %q is not running", sandboxID)
return nil, errors.Errorf("sandbox container %q is not running", sandboxID)
}

ioCreation := func(id string) (_ containerdio.IO, err error) {
Expand All @@ -110,7 +94,7 @@ func (c *criService) startContainer(ctx context.Context,

ctrInfo, err := container.Info(ctx)
if err != nil {
return errors.Wrap(err, "failed to get container info")
return nil, errors.Wrap(err, "failed to get container info")
}

var taskOpts []containerd.NewTaskOpts
Expand All @@ -120,7 +104,7 @@ func (c *criService) startContainer(ctx context.Context,
}
task, err := container.NewTask(ctx, ioCreation, taskOpts...)
if err != nil {
return errors.Wrap(err, "failed to create containerd task")
return nil, errors.Wrap(err, "failed to create containerd task")
}
defer func() {
if retErr != nil {
Expand All @@ -133,15 +117,61 @@ func (c *criService) startContainer(ctx context.Context,
}
}()

// wait is a long running background request, no timeout needed.
exitCh, err := task.Wait(ctrdutil.NamespacedContext())
if err != nil {
return nil, errors.Wrap(err, "failed to wait for containerd task")
}

// Start containerd task.
if err := task.Start(ctx); err != nil {
return errors.Wrapf(err, "failed to start containerd task %q", id)
return nil, errors.Wrapf(err, "failed to start containerd task %q", id)
}

// Update container start timestamp.
status.Pid = task.Pid()
status.StartedAt = time.Now().UnixNano()
return nil
if err := cntr.Status.UpdateSync(func(status containerstore.Status) (containerstore.Status, error) {
status.Pid = task.Pid()
status.StartedAt = time.Now().UnixNano()
return status, nil
}); err != nil {
return nil, errors.Wrapf(err, "failed to update container %q state", id)
}

// start the monitor after updating container state, this ensures that
// event monitor receives the TaskExit event and update container state
// after this.
c.eventMonitor.startExitMonitor(context.Background(), id, task.Pid(), exitCh)

return &runtime.StartContainerResponse{}, nil
}

// setContainerStarting sets the container into starting state. In starting state, the
// container will not be removed or started again.
func setContainerStarting(container containerstore.Container) error {
return container.Status.Update(func(status containerstore.Status) (containerstore.Status, error) {
// Return error if container is not in created state.
if status.State() != runtime.ContainerState_CONTAINER_CREATED {
return status, errors.Errorf("container is in %s state", criContainerStateToString(status.State()))
}
// Do not start the container when there is a removal in progress.
if status.Removing {
return status, errors.New("container is in removing state, can't be started")
}
if status.Starting {
return status, errors.New("container is already in starting state")
}
status.Starting = true
return status, nil
})
}

// resetContainerStarting resets the container starting state on start failure. So
// that we could remove the container later.
func resetContainerStarting(container containerstore.Container) error {
return container.Status.Update(func(status containerstore.Status) (containerstore.Status, error) {
status.Starting = false
return status, nil
})
}

// createContainerLoggers creates container loggers and return write closer for stdout and stderr.
Expand Down
98 changes: 98 additions & 0 deletions pkg/server/container_start_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
/*
Copyright 2019 The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package server

import (
"testing"
"time"

"github.com/stretchr/testify/assert"

containerstore "github.com/containerd/cri/pkg/store/container"
)

// TestSetContainerStarting tests setContainerStarting sets removing
// state correctly.
func TestSetContainerStarting(t *testing.T) {
testID := "test-id"
for desc, test := range map[string]struct {
status containerstore.Status
expectErr bool
}{

"should not return error when container is in created state": {
status: containerstore.Status{
CreatedAt: time.Now().UnixNano(),
},
expectErr: false,
},
"should return error when container is in running state": {
status: containerstore.Status{
CreatedAt: time.Now().UnixNano(),
StartedAt: time.Now().UnixNano(),
},
expectErr: true,
},
"should return error when container is in exited state": {
status: containerstore.Status{
CreatedAt: time.Now().UnixNano(),
StartedAt: time.Now().UnixNano(),
FinishedAt: time.Now().UnixNano(),
},
expectErr: true,
},
"should return error when container is in unknown state": {
status: containerstore.Status{
CreatedAt: 0,
StartedAt: 0,
FinishedAt: 0,
},
expectErr: true,
},
"should return error when container is in starting state": {
status: containerstore.Status{
CreatedAt: time.Now().UnixNano(),
Starting: true,
},
expectErr: true,
},
"should return error when container is in removing state": {
status: containerstore.Status{
CreatedAt: time.Now().UnixNano(),
Removing: true,
},
expectErr: true,
},
} {
t.Logf("TestCase %q", desc)
container, err := containerstore.NewContainer(
containerstore.Metadata{ID: testID},
containerstore.WithFakeStatus(test.status),
)
assert.NoError(t, err)
err = setContainerStarting(container)
if test.expectErr {
assert.Error(t, err)
assert.Equal(t, test.status, container.Status.Get(), "metadata should not be updated")
} else {
assert.NoError(t, err)
assert.True(t, container.Status.Get().Starting, "starting should be set")
assert.NoError(t, resetContainerStarting(container))
assert.False(t, container.Status.Get().Starting, "starting should be reset")
}
}
}
57 changes: 25 additions & 32 deletions pkg/server/container_stop.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
"golang.org/x/sys/unix"
runtime "k8s.io/kubernetes/pkg/kubelet/apis/cri/runtime/v1alpha2"

ctrdutil "github.com/containerd/cri/pkg/containerd/util"
"github.com/containerd/cri/pkg/store"
containerstore "github.com/containerd/cri/pkg/store/container"
)
Expand Down Expand Up @@ -74,36 +75,34 @@ func (c *criService) stopContainer(ctx context.Context, container containerstore
return errors.Wrapf(err, "failed to get task for container %q", id)
}
// Don't return for unknown state, some cleanup needs to be done.
if state != runtime.ContainerState_CONTAINER_UNKNOWN {
return nil
if state == runtime.ContainerState_CONTAINER_UNKNOWN {
return cleanupUnknownContainer(ctx, id, container)
}
// Task is an interface, explicitly set it to nil just in case.
task = nil
return nil
}

// Handle unknown state.
if state == runtime.ContainerState_CONTAINER_UNKNOWN {
status, err := getTaskStatus(ctx, task)
// Start an exit handler for containers in unknown state.
waitCtx, waitCancel := context.WithCancel(ctrdutil.NamespacedContext())
defer waitCancel()
exitCh, err := task.Wait(waitCtx)
if err != nil {
return errors.Wrapf(err, "failed to get task status for %q", id)
}
switch status.Status {
case containerd.Running, containerd.Created:
// The task is still running, continue stopping the task.
case containerd.Stopped:
// The task has exited. If the task exited after containerd
// started, the event monitor will receive its exit event; if it
// exited before containerd started, the event monitor will never
// receive its exit event.
// However, we can't tell that because the task state was not
// successfully loaded during containerd start (container is
// in UNKNOWN state).
// So always do cleanup here, just in case that we've missed the
// exit event.
return cleanupUnknownContainer(ctx, id, status, container)
default:
return errors.Wrapf(err, "unsupported task status %q", status.Status)
if !errdefs.IsNotFound(err) {
return errors.Wrapf(err, "failed to wait for task for %q", id)
}
return cleanupUnknownContainer(ctx, id, container)
}

exitCtx, exitCancel := context.WithCancel(context.Background())
stopCh := c.eventMonitor.startExitMonitor(exitCtx, id, task.Pid(), exitCh)
defer func() {
exitCancel()
// This ensures that exit monitor is stopped before
// `Wait` is cancelled, so no exit event is generated
// because of the `Wait` cancellation.
<-stopCh
}()
}

// We only need to kill the task. The event handler will Delete the
Expand Down Expand Up @@ -176,19 +175,13 @@ func (c *criService) waitContainerStop(ctx context.Context, container containers
}

// cleanupUnknownContainer cleanup stopped container in unknown state.
func cleanupUnknownContainer(ctx context.Context, id string, status containerd.Status,
cntr containerstore.Container) error {
func cleanupUnknownContainer(ctx context.Context, id string, cntr containerstore.Container) error {
// Reuse handleContainerExit to do the cleanup.
// NOTE(random-liu): If the task did exit after containerd started, both
// the event monitor and the cleanup function would update the container
// state. The final container state will be whatever being updated first.
// There is no way to completely avoid this race condition, and for best
// effort unknown state container cleanup, this seems acceptable.
return handleContainerExit(ctx, &eventtypes.TaskExit{
ContainerID: id,
ID: id,
Pid: 0,
ExitStatus: status.ExitStatus,
ExitedAt: status.ExitTime,
ExitStatus: unknownExitCode,
ExitedAt: time.Now(),
}, cntr)
}
Loading

0 comments on commit a5c5d55

Please sign in to comment.