From f131eaa74aa603e69b7c71ae073302b359ff1502 Mon Sep 17 00:00:00 2001 From: Valentin Rothberg Date: Tue, 28 Mar 2023 14:54:26 +0200 Subject: [PATCH] auto-update: stop+start instead of restart sytemd units It turns out the restart is _not_ a stop+start but keeps certain resources open and is subject to some timeouts that may differ across distributions' default settings. [NO NEW TESTS NEEDED] as I have absolutely no idea how to reliably cause the failure/flake/race. Also ignore ENOENTS of the CID file when removing a container which has been identified of actually fixing #17607. Fixes: #17607 Signed-off-by: Valentin Rothberg --- libpod/runtime_ctr.go | 2 +- pkg/autoupdate/autoupdate.go | 33 +++++++++++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/libpod/runtime_ctr.go b/libpod/runtime_ctr.go index 6a90785161c8..648118c3741d 100644 --- a/libpod/runtime_ctr.go +++ b/libpod/runtime_ctr.go @@ -811,7 +811,7 @@ func (r *Runtime) removeContainer(ctx context.Context, c *Container, force, remo // Remove the container's CID file on container removal. if cidFile, ok := c.config.Spec.Annotations[define.InspectAnnotationCIDFile]; ok { - if err := os.Remove(cidFile); err != nil { + if err := os.Remove(cidFile); err != nil && !errors.Is(err, os.ErrNotExist) { if cleanupErr == nil { cleanupErr = err } else { diff --git a/pkg/autoupdate/autoupdate.go b/pkg/autoupdate/autoupdate.go index 10258701f8d9..ccd81d8cdbce 100644 --- a/pkg/autoupdate/autoupdate.go +++ b/pkg/autoupdate/autoupdate.go @@ -334,8 +334,16 @@ func (t *task) rollbackImage() error { // restartSystemdUnit restarts the systemd unit the container is running in. func (u *updater) restartSystemdUnit(ctx context.Context, unit string) error { + if err := u.stopSystemdUnit(ctx, unit); err != nil { + return err + } + return u.startSystemdUnit(ctx, unit) +} + +// startSystemdUnit starts the systemd unit the container is running in. +func (u *updater) startSystemdUnit(ctx context.Context, unit string) error { restartChan := make(chan string) - if _, err := u.conn.RestartUnitContext(ctx, unit, "replace", restartChan); err != nil { + if _, err := u.conn.StartUnitContext(ctx, unit, "replace", restartChan); err != nil { return err } @@ -349,7 +357,28 @@ func (u *updater) restartSystemdUnit(ctx context.Context, unit string) error { return nil default: - return fmt.Errorf("expected %q but received %q", "done", result) + return fmt.Errorf("error starting systemd unit %q expected %q but received %q", unit, "done", result) + } +} + +// stopSystemdUnit stop the systemd unit the container is running in. +func (u *updater) stopSystemdUnit(ctx context.Context, unit string) error { + restartChan := make(chan string) + if _, err := u.conn.StopUnitContext(ctx, unit, "replace", restartChan); err != nil { + return err + } + + // Wait for the restart to finish and actually check if it was + // successful or not. + result := <-restartChan + + switch result { + case "done": + logrus.Infof("Successfully stopped systemd unit %q", unit) + return nil + + default: + return fmt.Errorf("error stopping systemd unit %q expected %q but received %q", unit, "done", result) } }