Skip to content

Commit

Permalink
libcontainer: set cgroup config late
Browse files Browse the repository at this point in the history
Due to the fact that the init is implemented in Go (which seemingly
randomly spawns new processes and loves eating memory), most cgroup
configurations are required to have an arbitrary minimum dictated by the
init. This confuses users and makes configuration more annoying than it
should. An example of this is pids.max, where Go spawns multiple
processes that then cause init to violate the pids cgroup constraint
before the container can even start.

Solve this problem by setting the cgroup configurations as late as
possible, to avoid hitting as many of the resources hogged by the Go
init as possible. This has to be done before seccomp rules are applied,
as the parent and child must synchronise in order for the parent to
correctly set the configurations (and writes might be blocked by seccomp).

Signed-off-by: Aleksa Sarai <asarai@suse.com>
  • Loading branch information
cyphar committed Jan 11, 2016
1 parent a954834 commit 103853e
Showing 6 changed files with 111 additions and 17 deletions.
17 changes: 12 additions & 5 deletions libcontainer/factory_linux.go
Original file line number Diff line number Diff line change
@@ -5,7 +5,6 @@ package libcontainer
import (
"encoding/json"
"fmt"
"io/ioutil"
"os"
"os/exec"
"path/filepath"
@@ -226,21 +225,29 @@ func (l *LinuxFactory) StartInitialization() (err error) {
// clear the current process's environment to clean any libcontainer
// specific env vars.
os.Clearenv()
var i initer
defer func() {
// if we have an error during the initialization of the container's init then send it back to the
// parent process in the form of an initError.
if err != nil {
// ensure that any data sent from the parent is consumed so it doesn't
// receive ECONNRESET when the child writes to the pipe.
ioutil.ReadAll(pipe)
if _, ok := i.(*linuxStandardInit); ok {
// Synchronisation only necessary for standard init.
if err := json.NewEncoder(pipe).Encode(procError); err != nil {
panic(err)
}
}
if err := json.NewEncoder(pipe).Encode(newSystemError(err)); err != nil {
panic(err)
}
} else {
if err := json.NewEncoder(pipe).Encode(procStart); err != nil {
panic(err)
}
}
// ensure that this pipe is always closed
pipe.Close()
}()
i, err := newContainerInit(it, pipe)
i, err = newContainerInit(it, pipe)
if err != nil {
return err
}
9 changes: 9 additions & 0 deletions libcontainer/generic_error.go
Original file line number Diff line number Diff line change
@@ -9,6 +9,15 @@ import (
"github.com/opencontainers/runc/libcontainer/stacktrace"
)

type syncType uint8

const (
procReady syncType = iota
procError
procStart
procRun
)

var errorTemplate = template.Must(template.New("error").Parse(`Timestamp: {{.Timestamp}}
Code: {{.ECode}}
{{if .Message }}
24 changes: 24 additions & 0 deletions libcontainer/init_linux.go
Original file line number Diff line number Diff line change
@@ -5,6 +5,7 @@ package libcontainer
import (
"encoding/json"
"fmt"
"io"
"io/ioutil"
"net"
"os"
@@ -73,6 +74,7 @@ func newContainerInit(t initType, pipe *os.File) (initer, error) {
}, nil
case initStandard:
return &linuxStandardInit{
pipe: pipe,
parentPid: syscall.Getppid(),
config: config,
}, nil
@@ -140,6 +142,28 @@ func finalizeNamespace(config *initConfig) error {
return nil
}

// syncParentReady sends to the given pipe a JSON payload which indicates that
// the init is ready to Exec the child process. It then waits for the parent to
// indicate that it is cleared to Exec.
func syncParentReady(pipe io.ReadWriter) error {
// Tell parent.
if err := json.NewEncoder(pipe).Encode(procReady); err != nil {
return err
}

// Wait for parent to give the all-clear.
var procSync syncType
if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
if err == io.EOF {
return fmt.Errorf("parent closed synchronisation channel")
}
if procSync != procRun {
return fmt.Errorf("invalid synchronisation flag from parent")
}
}
return nil
}

// joinExistingNamespaces gets all the namespace paths specified for the container and
// does a setns on the namespace fd so that the current process joins the namespace.
func joinExistingNamespaces(namespaces []configs.Namespace) error {
5 changes: 5 additions & 0 deletions libcontainer/integration/exec_test.go
Original file line number Diff line number Diff line change
@@ -587,6 +587,11 @@ func testPids(t *testing.T, systemd bool) {
if err == nil {
t.Fatalf("expected fork() to fail with restrictive pids limit")
}

// Minimal restrictions are not really supported, due to quirks in using Go
// due to the fact that it spawns random processes. While we do our best with
// late setting cgroup values, it's just too unreliable with very small pids.max.
// As such, we don't test that case. YMMV.
}

func TestRunWithKernelMemory(t *testing.T) {
63 changes: 53 additions & 10 deletions libcontainer/process_linux.go
Original file line number Diff line number Diff line change
@@ -5,6 +5,7 @@ package libcontainer
import (
"encoding/json"
"errors"
"fmt"
"io"
"os"
"os/exec"
@@ -86,6 +87,7 @@ func (p *setnsProcess) start() (err error) {
if err := json.NewEncoder(p.parentPipe).Encode(p.config); err != nil {
return newSystemError(err)
}

if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
return newSystemError(err)
}
@@ -95,6 +97,7 @@ func (p *setnsProcess) start() (err error) {
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
return newSystemError(err)
}
// Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil {
p.wait()
return newSystemError(ierr)
@@ -198,15 +201,11 @@ func (p *initProcess) start() (err error) {
return newSystemError(err)
}
p.setExternalDescriptors(fds)

// Do this before syncing with child so that no children
// can escape the cgroup
if err := p.manager.Apply(p.pid()); err != nil {
return newSystemError(err)
}
if err := p.manager.Set(p.config.Config); err != nil {
return newSystemError(err)
}
defer func() {
if err != nil {
// TODO: should not be the responsibility to call here
@@ -232,13 +231,58 @@ func (p *initProcess) start() (err error) {
if err := p.sendConfig(); err != nil {
return newSystemError(err)
}
// wait for the child process to fully complete and receive an error message
// if one was encoutered
var ierr *genericError
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {

var (
procSync syncType
sentRun bool
ierr *genericError
)

loop:
for {
if err := json.NewDecoder(p.parentPipe).Decode(&procSync); err != nil {
if err == io.EOF {
break loop
}
return newSystemError(err)
}

switch procSync {
case procStart:
break loop
case procReady:
if err := p.manager.Set(p.config.Config); err != nil {
return newSystemError(err)
}
// Sync with child.
if err := json.NewEncoder(p.parentPipe).Encode(procRun); err != nil {
return newSystemError(err)
}
sentRun = true
case procError:
// wait for the child process to fully complete and receive an error message
// if one was encoutered
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
return newSystemError(err)
}
if ierr != nil {
break loop
}
// Programmer error.
panic("No error following JSON procError payload.")
default:
return newSystemError(fmt.Errorf("invalid JSON synchronisation payload from child"))
}
}
if !sentRun {
return newSystemError(fmt.Errorf("could not synchronise with container process"))
}
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
return newSystemError(err)
}
// Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil {
p.wait()
return newSystemError(ierr)
}
return nil
@@ -276,8 +320,7 @@ func (p *initProcess) sendConfig() error {
if err := json.NewEncoder(p.parentPipe).Encode(p.config); err != nil {
return err
}
// shutdown writes for the parent side of the pipe
return syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR)
return nil
}

func (p *initProcess) createNetworkInterfaces() error {
10 changes: 8 additions & 2 deletions libcontainer/standard_init_linux.go
Original file line number Diff line number Diff line change
@@ -3,6 +3,7 @@
package libcontainer

import (
"io"
"os"
"syscall"

@@ -14,6 +15,7 @@ import (
)

type linuxStandardInit struct {
pipe io.ReadWriter
parentPid int
config *initConfig
}
@@ -50,7 +52,6 @@ func (l *linuxStandardInit) Init() error {
if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil {
return err
}

label.Init()
// InitializeMountNamespace() can be executed only for a new mount namespace
if l.config.Config.Namespaces.Contains(configs.NEWNS) {
@@ -75,7 +76,6 @@ func (l *linuxStandardInit) Init() error {
return err
}
}

for _, path := range l.config.Config.ReadonlyPaths {
if err := remountReadonly(path); err != nil {
return err
@@ -90,6 +90,12 @@ func (l *linuxStandardInit) Init() error {
if err != nil {
return err
}
// Tell our parent that we're ready to Execv. This must be done before the
// Seccomp rules have been applied, because we need to be able to read and
// write to a socket.
if err := syncParentReady(l.pipe); err != nil {
return err
}
if l.config.Config.Seccomp != nil {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return err

0 comments on commit 103853e

Please sign in to comment.