diff --git a/doc/api-extensions.md b/doc/api-extensions.md index cbe90a348f..71363c2adc 100644 --- a/doc/api-extensions.md +++ b/doc/api-extensions.md @@ -2586,3 +2586,9 @@ This extends `ovn.transit.pattern` to allow `peerName` as a template variable. ## `qemu_scriptlet` This adds the ability to run a scriptlet at various stages of startup: using the `raw.qemu.scriptlet` configuration key. + +## `instance_auto_restart` + +This introduces a new `boot.autorestart` configuration key which, when +set to `true`, causes the instance to be restarted automatically upon +unexpected exit, up to 10 times over a 1 minute period. diff --git a/doc/config_options.txt b/doc/config_options.txt index 9be0e2c9e8..fa0d8fc0de 100644 --- a/doc/config_options.txt +++ b/doc/config_options.txt @@ -402,6 +402,13 @@ For file systems (shared directories or custom volumes), this is one of: +```{config:option} boot.autorestart instance-boot +:liveupdate: "no" +:shortdesc: "Whether to automatically restart an instance on unexpected exit" +:type: "bool" +If set to `true` will attempt up to 10 restarts over a 1 minute period upon unexpected instance exit. +``` + ```{config:option} boot.autostart instance-boot :liveupdate: "no" :shortdesc: "Whether to always start the instance when the daemon starts" diff --git a/internal/instance/config.go b/internal/instance/config.go index d01001f79d..b9980b5a0c 100644 --- a/internal/instance/config.go +++ b/internal/instance/config.go @@ -29,6 +29,14 @@ var HugePageSizeSuffix = [...]string{"64KB", "1MB", "2MB", "1GB"} // InstanceConfigKeysAny is a map of config key to validator. (keys applying to containers AND virtual machines). var InstanceConfigKeysAny = map[string]func(value string) error{ + // gendoc:generate(entity=instance, group=boot, key=boot.autorestart) + // If set to `true` will attempt up to 10 restarts over a 1 minute period upon unexpected instance exit. 
+ // --- + // type: bool + // liveupdate: no + // shortdesc: Whether to automatically restart an instance on unexpected exit + "boot.autorestart": validate.Optional(validate.IsBool), + // gendoc:generate(entity=instance, group=boot, key=boot.autostart) // If set to `false`, restore the last state. // --- diff --git a/internal/server/instance/drivers/driver_common.go b/internal/server/instance/drivers/driver_common.go index a5e8cf1d4e..5b7474dbf0 100644 --- a/internal/server/instance/drivers/driver_common.go +++ b/internal/server/instance/drivers/driver_common.go @@ -42,6 +42,10 @@ import ( "github.com/lxc/incus/v6/shared/util" ) +// Track the timestamps of the last 10 auto-restarts of each instance, keyed by instance ID. +var instancesLastRestart = map[int][10]time.Time{} +var muInstancesLastRestart sync.Mutex + // ErrExecCommandNotFound indicates the command is not found. var ErrExecCommandNotFound = api.StatusErrorf(http.StatusBadRequest, "Command not found") @@ -145,6 +149,52 @@ func (d *common) ExpiryDate() time.Time { return time.Time{} } +func (d *common) shouldAutoRestart() bool { + if !util.IsTrue(d.expandedConfig["boot.autorestart"]) { + return false + } + + muInstancesLastRestart.Lock() + defer muInstancesLastRestart.Unlock() + + // Check if the instance was ever auto-restarted. + timestamps, ok := instancesLastRestart[d.id] + if !ok || len(timestamps) == 0 { + // If not, record it and allow the auto-restart. + instancesLastRestart[d.id] = [10]time.Time{time.Now()} + return true + } + + // If it has been auto-restarted, look for the oldest non-zero timestamp. + oldestIndex := 0 + validTimestamps := 0 + for i, timestamp := range timestamps { + if timestamp.IsZero() { + // We found an unused slot, let's record this restart in it and allow it. + timestamps[i] = time.Now() + instancesLastRestart[d.id] = timestamps + return true + } + + validTimestamps++ + + if timestamp.Before(timestamps[oldestIndex]) { + oldestIndex = i + } + } + + // Check if the oldest restart was more than a minute ago. 
+ if timestamps[oldestIndex].Before(time.Now().Add(-1 * time.Minute)) { + // Remove the old timestamp and replace it with ours. + timestamps[oldestIndex] = time.Now() + instancesLastRestart[d.id] = timestamps + return true + } + + // All 10 slots hold restarts from within the last minute, so deny the restart. + return false +} + // ID gets instances's ID. func (d *common) ID() int { return d.id diff --git a/internal/server/instance/drivers/driver_lxc.go b/internal/server/instance/drivers/driver_lxc.go index 39d0d08e30..e7e50fed29 100644 --- a/internal/server/instance/drivers/driver_lxc.go +++ b/internal/server/instance/drivers/driver_lxc.go @@ -3303,8 +3303,25 @@ func (d *lxc) onStop(args map[string]string) error { _ = unix.Unmount(filepath.Join(d.DevicesPath(), "lxcfs"), unix.MNT_DETACH) } + // Determine if instance should be auto-restarted. + var autoRestart bool + if target != "reboot" && op.GetInstanceInitiated() && d.shouldAutoRestart() { + autoRestart = true + + // Mark current shutdown as complete. + op.Done(nil) + + // Create a new restart operation. 
+ op, err = operationlock.CreateWaitGet(d.Project().Name, d.Name(), d.op, operationlock.ActionRestart, nil, true, false) + if err == nil { + defer op.Done(nil) + } else { + d.logger.Error("Failed to setup new restart operation", logger.Ctx{"err": err}) + } + } + // Log and emit lifecycle if not user triggered - if op.GetInstanceInitiated() { + if target != "reboot" && !autoRestart && op.GetInstanceInitiated() { ctxMap := logger.Ctx{ "action": target, "created": d.creationDate, @@ -3318,7 +3335,7 @@ func (d *lxc) onStop(args map[string]string) error { } // Reboot the container - if target == "reboot" { + if target == "reboot" || autoRestart { // Start the container again err = d.Start(false) if err != nil { diff --git a/internal/server/instance/drivers/driver_qemu.go b/internal/server/instance/drivers/driver_qemu.go index c23507b1a4..723d7d66ae 100644 --- a/internal/server/instance/drivers/driver_qemu.go +++ b/internal/server/instance/drivers/driver_qemu.go @@ -637,15 +637,32 @@ func (d *qemu) onStop(target string) error { return err } + // Determine if instance should be auto-restarted. + var autoRestart bool + if target != "reboot" && op.GetInstanceInitiated() && d.shouldAutoRestart() { + autoRestart = true + + // Mark current shutdown as complete. + op.Done(nil) + + // Create a new restart operation. + op, err = operationlock.CreateWaitGet(d.Project().Name, d.Name(), d.op, operationlock.ActionRestart, nil, true, false) + if err == nil { + defer op.Done(nil) + } else { + d.logger.Error("Failed to setup new restart operation", logger.Ctx{"err": err}) + } + } + // Log and emit lifecycle if not user triggered. 
- if op.GetInstanceInitiated() { + if target != "reboot" && !autoRestart && op.GetInstanceInitiated() { d.state.Events.SendLifecycle(d.project.Name, lifecycle.InstanceShutdown.Event(d, nil)) - } else if op.Action() != operationlock.ActionMigrate { + } else if !autoRestart && op.Action() != operationlock.ActionMigrate { d.state.Events.SendLifecycle(d.project.Name, lifecycle.InstanceStopped.Event(d, nil)) } // Reboot the instance. - if target == "reboot" { + if target == "reboot" || autoRestart { err = d.Start(false) if err != nil { op.Done(err) diff --git a/internal/server/metadata/configuration.json b/internal/server/metadata/configuration.json index 308b8d3a3e..87094776c9 100644 --- a/internal/server/metadata/configuration.json +++ b/internal/server/metadata/configuration.json @@ -453,6 +453,14 @@ "instance": { "boot": { "keys": [ + { + "boot.autorestart": { + "liveupdate": "no", + "longdesc": "If set to `true` will attempt up to 10 restarts over a 1 minute period upon unexpected instance exit.", + "shortdesc": "Whether to automatically restart an instance on unexpected exit", + "type": "bool" + } + }, { "boot.autostart": { "liveupdate": "no", diff --git a/internal/version/api.go b/internal/version/api.go index 6d2298ebac..f8d1286233 100644 --- a/internal/version/api.go +++ b/internal/version/api.go @@ -438,6 +438,7 @@ var APIExtensions = []string{ "oidc_scopes", "network_integrations_peer_name", "qemu_scriptlet", + "instance_auto_restart", } // APIExtensionsCount returns the number of available API extensions. 
diff --git a/test/suites/basic.sh b/test/suites/basic.sh index 9f08a9d618..ad5cf5b871 100644 --- a/test/suites/basic.sh +++ b/test/suites/basic.sh @@ -668,4 +668,22 @@ test_basic_usage() { incus delete -f c1 c2 c3 remaining_instances="$(incus list --format csv)" [ -z "${remaining_instances}" ] + + # Test autorestart mechanism + incus launch testimage c1 -c boot.autorestart=true + + for _ in $(seq 10); do + PID=$(incus info c1 | awk '/^PID/ {print $2}') + kill -9 "${PID}" + sleep 3 + done + + [ "$(incus list -cs -fcsv c1)" = "RUNNING" ] || false + + PID=$(incus info c1 | awk '/^PID/ {print $2}') + kill -9 "${PID}" + sleep 3 + + [ "$(incus list -cs -fcsv c1)" = "STOPPED" ] || false + incus delete -f c1 }