From c6e8cb79260cdfc435ceecc797efb92cc5977632 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 23 Mar 2023 10:56:20 -0700 Subject: [PATCH 1/4] libct/cg/sd: refactor startUnit Move error handling earlier, removing "if err == nil" block. No change of logic. Signed-off-by: Kir Kolyshkin --- libcontainer/cgroups/systemd/common.go | 33 ++++++++++++++------------ 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/libcontainer/cgroups/systemd/common.go b/libcontainer/cgroups/systemd/common.go index 2f0767b2fd8..c0a058012ce 100644 --- a/libcontainer/cgroups/systemd/common.go +++ b/libcontainer/cgroups/systemd/common.go @@ -130,24 +130,27 @@ func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Pr _, err := c.StartTransientUnitContext(context.TODO(), unitName, "replace", properties, statusChan) return err }) - if err == nil { - timeout := time.NewTimer(30 * time.Second) - defer timeout.Stop() + if err != nil { + if !isUnitExists(err) { + return err + } + return nil + } - select { - case s := <-statusChan: - close(statusChan) - // Please refer to https://pkg.go.dev/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit - if s != "done" { - resetFailedUnit(cm, unitName) - return fmt.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s) - } - case <-timeout.C: + timeout := time.NewTimer(30 * time.Second) + defer timeout.Stop() + + select { + case s := <-statusChan: + close(statusChan) + // Please refer to https://pkg.go.dev/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit + if s != "done" { resetFailedUnit(cm, unitName) - return errors.New("Timeout waiting for systemd to create " + unitName) + return fmt.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s) } - } else if !isUnitExists(err) { - return err + case <-timeout.C: + resetFailedUnit(cm, unitName) + return errors.New("Timeout waiting for systemd to create " + unitName) } return nil From c2533420613be63ac9a84fc4363b8ddd46568985 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 23 Mar 2023 11:01:14 -0700 Subject: [PATCH 2/4] libct/cg/sd: ignore UnitExists only for Apply(-1) Commit d223e2adae83f62d5 ("Ignore error when starting transient unit that already exists" modified the code handling errors from startUnit to ignore UnitExists error. Apparently it was done so that kubelet can create the same pod slice over and over without hitting an error (see [1]). While it works for a pod slice to ensure it exists, it is a gross bug to ignore UnitExists when creating a container. In this case, the container init PID won't be added to the systemd unit (and to the required cgroup), and as a result the container will successfully run in a current user cgroup, without any cgroup limits applied. So, fix the code to only ignore UnitExists if we're not adding a process to the systemd unit. This way, kubelet will keep working as is, but runc will refuse to create containers which are not placed into a requested cgroup. [1] https://github.com/opencontainers/runc/pull/1124 Signed-off-by: Kir Kolyshkin --- libcontainer/cgroups/systemd/common.go | 10 ++++++++-- libcontainer/cgroups/systemd/v1.go | 2 +- libcontainer/cgroups/systemd/v2.go | 2 +- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/libcontainer/cgroups/systemd/common.go b/libcontainer/cgroups/systemd/common.go index c0a058012ce..31a3656ee23 100644 --- a/libcontainer/cgroups/systemd/common.go +++ b/libcontainer/cgroups/systemd/common.go @@ -124,7 +124,7 @@ func isUnitExists(err error) bool { return isDbusError(err, "org.freedesktop.systemd1.UnitExists") } -func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Property) error { +func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Property, ignoreExist bool) error { statusChan := make(chan string, 1) err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { _, err := c.StartTransientUnitContext(context.TODO(), unitName, "replace", properties, statusChan) @@ -134,7 +134,13 @@ func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Pr if !isUnitExists(err) { return err } - return nil + if ignoreExist { + // TODO: remove this hack. + // This is kubelet making sure a slice exists (see + // https://github.com/opencontainers/runc/pull/1124). + return nil + } + return err } timeout := time.NewTimer(30 * time.Second) diff --git a/libcontainer/cgroups/systemd/v1.go b/libcontainer/cgroups/systemd/v1.go index ab9333cb2c8..e76449910c2 100644 --- a/libcontainer/cgroups/systemd/v1.go +++ b/libcontainer/cgroups/systemd/v1.go @@ -204,7 +204,7 @@ func (m *LegacyManager) Apply(pid int) error { properties = append(properties, c.SystemdProps...) - if err := startUnit(m.dbus, unitName, properties); err != nil { + if err := startUnit(m.dbus, unitName, properties, pid == -1); err != nil { return err } diff --git a/libcontainer/cgroups/systemd/v2.go b/libcontainer/cgroups/systemd/v2.go index 823bf38fc17..3d66ed40610 100644 --- a/libcontainer/cgroups/systemd/v2.go +++ b/libcontainer/cgroups/systemd/v2.go @@ -291,7 +291,7 @@ func (m *UnifiedManager) Apply(pid int) error { properties = append(properties, c.SystemdProps...) - if err := startUnit(m.dbus, unitName, properties); err != nil { + if err := startUnit(m.dbus, unitName, properties, pid == -1); err != nil { return fmt.Errorf("unable to start unit %q (properties %+v): %w", unitName, properties, err) } From 1d18743f9e6767580a4276108e0c977d64fd03b6 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 23 Mar 2023 11:15:01 -0700 Subject: [PATCH 3/4] libct/cg/sd: reset-failed and retry startUnit on UnitExists In case a systemd unit fails (for example, timed out or OOM-killed), systemd keeps the unit. This prevents starting a new container with the same systemd unit name. The fix is to call reset-failed in case UnitExists error is returned, and retry once. Signed-off-by: Kir Kolyshkin --- libcontainer/cgroups/systemd/common.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/libcontainer/cgroups/systemd/common.go b/libcontainer/cgroups/systemd/common.go index 31a3656ee23..e6db845cb13 100644 --- a/libcontainer/cgroups/systemd/common.go +++ b/libcontainer/cgroups/systemd/common.go @@ -126,6 +126,9 @@ func isUnitExists(err error) bool { func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Property, ignoreExist bool) error { statusChan := make(chan string, 1) + retry := true + +retry: err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { _, err := c.StartTransientUnitContext(context.TODO(), unitName, "replace", properties, statusChan) return err @@ -140,6 +143,14 @@ func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Pr // https://github.com/opencontainers/runc/pull/1124). return nil } + if retry { + // In case a unit with the same name exists, this may + // be a leftover failed unit. Reset it, so systemd can + // remove it, and retry once. + resetFailedUnit(cm, unitName) + retry = false + goto retry + } return err } From 82bc89cd10eb5fa5fdb5d77e41b7ea3c07f9d834 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 23 Mar 2023 11:57:46 -0700 Subject: [PATCH 4/4] runc run: refuse a non-empty cgroup Commit d08bc0c1b3bb2 ("runc run: warn on non-empty cgroup") introduced a warning when a container is started in a non-empty cgroup. Such configuration has lots of issues. In addition to that, such configuration is not possible at all when using the systemd cgroup driver. As planned, let's promote this warning to an error, and fix the test case accordingly. Signed-off-by: Kir Kolyshkin --- libcontainer/factory_linux.go | 4 +--- tests/integration/cgroups.bats | 6 +++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go index b8d2d9c287d..bf8904efb85 100644 --- a/libcontainer/factory_linux.go +++ b/libcontainer/factory_linux.go @@ -77,9 +77,7 @@ func Create(root, id string, config *configs.Config) (*Container, error) { return nil, fmt.Errorf("unable to get cgroup PIDs: %w", err) } if len(pids) != 0 { - // TODO: return an error. - logrus.Warnf("container's cgroup is not empty: %d process(es) found", len(pids)) - logrus.Warn("DEPRECATED: running container in a non-empty cgroup won't be supported in runc 1.2; https://github.com/opencontainers/runc/issues/3132") + return nil, fmt.Errorf("container's cgroup is not empty: %d process(es) found", len(pids)) } } diff --git a/tests/integration/cgroups.bats b/tests/integration/cgroups.bats index 14f26d2d5a4..17e384e2c86 100644 --- a/tests/integration/cgroups.bats +++ b/tests/integration/cgroups.bats @@ -356,7 +356,7 @@ function setup() { [ "$output" = "ok" ] } -@test "runc run/create should warn about a non-empty cgroup" { +@test "runc run/create should error for a non-empty cgroup" { [ $EUID -ne 0 ] && requires rootless_cgroup set_cgroups_path @@ -366,12 +366,12 @@ function setup() { # Run a second container sharing the cgroup with the first one. runc --debug run -d --console-socket "$CONSOLE_SOCKET" ct2 - [ "$status" -eq 0 ] + [ "$status" -ne 0 ] [[ "$output" == *"container's cgroup is not empty"* ]] # Same but using runc create. runc create --console-socket "$CONSOLE_SOCKET" ct3 - [ "$status" -eq 0 ] + [ "$status" -ne 0 ] [[ "$output" == *"container's cgroup is not empty"* ]] }