Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement boot.autorestart #1167

Merged
merged 7 commits into from
Aug 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions doc/api-extensions.md
Original file line number Diff line number Diff line change
Expand Up @@ -2586,3 +2586,9 @@ This extends `ovn.transit.pattern` to allow `peerName` as a template variable.
## `qemu_scriptlet`

This adds the ability to run a scriptlet at various stages of startup using the `raw.qemu.scriptlet` configuration key.

## `instance_auto_restart`

This introduces a new `boot.autorestart` configuration key which, when
set to `true`, causes the instance to be automatically restarted upon
unexpected exit, up to 10 times over a 1-minute period.
7 changes: 7 additions & 0 deletions doc/config_options.txt
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,13 @@ For file systems (shared directories or custom volumes), this is one of:

<!-- config group image-requirements end -->
<!-- config group instance-boot start -->
```{config:option} boot.autorestart instance-boot
:liveupdate: "no"
:shortdesc: "Whether to automatically restart an instance on unexpected exit"
:type: "bool"
If set to `true` will attempt up to 10 restarts over a 1 minute period upon unexpected instance exit.
```

```{config:option} boot.autostart instance-boot
:liveupdate: "no"
:shortdesc: "Whether to always start the instance when the daemon starts"
Expand Down
8 changes: 8 additions & 0 deletions internal/instance/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,14 @@ var HugePageSizeSuffix = [...]string{"64KB", "1MB", "2MB", "1GB"}

// InstanceConfigKeysAny is a map of config key to validator. (keys applying to containers AND virtual machines).
var InstanceConfigKeysAny = map[string]func(value string) error{
// gendoc:generate(entity=instance, group=boot, key=boot.autorestart)
// If set to `true` will attempt up to 10 restarts over a 1 minute period upon unexpected instance exit.
// ---
// type: bool
// liveupdate: no
// shortdesc: Whether to automatically restart an instance on unexpected exit
"boot.autorestart": validate.Optional(validate.IsBool),

// gendoc:generate(entity=instance, group=boot, key=boot.autostart)
// If set to `false`, restore the last state.
// ---
Expand Down
50 changes: 50 additions & 0 deletions internal/server/instance/drivers/driver_common.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ import (
"github.com/lxc/incus/v6/shared/util"
)

// Auto-restart tracking state shared by all instances.
var (
	// instancesLastRestart holds, per instance ID, the timestamps of its most recent auto-restarts.
	instancesLastRestart = map[int][10]time.Time{}

	// muInstancesLastRestart guards access to instancesLastRestart.
	muInstancesLastRestart sync.Mutex
)

// ErrExecCommandNotFound indicates the command is not found.
var ErrExecCommandNotFound = api.StatusErrorf(http.StatusBadRequest, "Command not found")

Expand Down Expand Up @@ -145,6 +149,52 @@ func (d *common) ExpiryDate() time.Time {
return time.Time{}
}

// shouldAutoRestart reports whether the instance may be automatically restarted
// after an unexpected exit, and records the restart attempt when it is allowed.
//
// It returns false when `boot.autorestart` is not enabled in the expanded
// config. Otherwise restart timestamps are tracked per instance ID in a fixed
// set of slots: the restart is allowed while a free slot remains, or once the
// oldest recorded restart is more than a minute old (that slot is then reused).
// When every slot holds a restart from within the last minute it returns false.
func (d *common) shouldAutoRestart() bool {
	if !util.IsTrue(d.expandedConfig["boot.autorestart"]) {
		return false
	}

	muInstancesLastRestart.Lock()
	defer muInstancesLastRestart.Unlock()

	// Read the clock once so the recorded timestamp and the window check agree.
	now := time.Now()

	// A missing entry yields an all-zero array, which the loop below treats as
	// having every slot free, so the first restart needs no special case.
	timestamps := instancesLastRestart[d.id]

	// Look for a free slot while keeping track of the oldest recorded restart.
	oldestIndex := 0
	for i, timestamp := range timestamps {
		if timestamp.IsZero() {
			// We found an unused slot, lets use it.
			timestamps[i] = now
			instancesLastRestart[d.id] = timestamps
			return true
		}

		if timestamp.Before(timestamps[oldestIndex]) {
			oldestIndex = i
		}
	}

	// All slots are used, refuse unless the oldest restart was more than a minute ago.
	if !timestamps[oldestIndex].Before(now.Add(-1 * time.Minute)) {
		return false
	}

	// Remove the old timestamp and replace it with ours.
	timestamps[oldestIndex] = now
	instancesLastRestart[d.id] = timestamps
	return true
}

// ID gets instances's ID.
func (d *common) ID() int {
return d.id
Expand Down
21 changes: 19 additions & 2 deletions internal/server/instance/drivers/driver_lxc.go
Original file line number Diff line number Diff line change
Expand Up @@ -3303,8 +3303,25 @@ func (d *lxc) onStop(args map[string]string) error {
_ = unix.Unmount(filepath.Join(d.DevicesPath(), "lxcfs"), unix.MNT_DETACH)
}

// Determine if instance should be auto-restarted.
var autoRestart bool
if target != "reboot" && op.GetInstanceInitiated() && d.shouldAutoRestart() {
autoRestart = true

// Mark current shutdown as complete.
op.Done(nil)

// Create a new restart operation.
op, err = operationlock.CreateWaitGet(d.Project().Name, d.Name(), d.op, operationlock.ActionRestart, nil, true, false)
if err == nil {
defer op.Done(nil)
} else {
d.logger.Error("Failed to setup new restart operation", logger.Ctx{"err": err})
}
}

// Log and emit lifecycle if not user triggered
if op.GetInstanceInitiated() {
if target != "reboot" && !autoRestart && op.GetInstanceInitiated() {
ctxMap := logger.Ctx{
"action": target,
"created": d.creationDate,
Expand All @@ -3318,7 +3335,7 @@ func (d *lxc) onStop(args map[string]string) error {
}

// Reboot the container
if target == "reboot" {
if target == "reboot" || autoRestart {
// Start the container again
err = d.Start(false)
if err != nil {
Expand Down
23 changes: 20 additions & 3 deletions internal/server/instance/drivers/driver_qemu.go
Original file line number Diff line number Diff line change
Expand Up @@ -637,15 +637,32 @@ func (d *qemu) onStop(target string) error {
return err
}

// Determine if instance should be auto-restarted.
var autoRestart bool
if target != "reboot" && op.GetInstanceInitiated() && d.shouldAutoRestart() {
autoRestart = true

// Mark current shutdown as complete.
op.Done(nil)

// Create a new restart operation.
op, err = operationlock.CreateWaitGet(d.Project().Name, d.Name(), d.op, operationlock.ActionRestart, nil, true, false)
if err == nil {
defer op.Done(nil)
} else {
d.logger.Error("Failed to setup new restart operation", logger.Ctx{"err": err})
}
}

// Log and emit lifecycle if not user triggered.
if op.GetInstanceInitiated() {
if target != "reboot" && !autoRestart && op.GetInstanceInitiated() {
d.state.Events.SendLifecycle(d.project.Name, lifecycle.InstanceShutdown.Event(d, nil))
} else if op.Action() != operationlock.ActionMigrate {
} else if !autoRestart && op.Action() != operationlock.ActionMigrate {
d.state.Events.SendLifecycle(d.project.Name, lifecycle.InstanceStopped.Event(d, nil))
}

// Reboot the instance.
if target == "reboot" {
if target == "reboot" || autoRestart {
err = d.Start(false)
if err != nil {
op.Done(err)
Expand Down
8 changes: 8 additions & 0 deletions internal/server/metadata/configuration.json
Original file line number Diff line number Diff line change
Expand Up @@ -453,6 +453,14 @@
"instance": {
"boot": {
"keys": [
{
"boot.autorestart": {
"liveupdate": "no",
"longdesc": "If set to `true` will attempt up to 10 restarts over a 1 minute period upon unexpected instance exit.",
"shortdesc": "Whether to automatically restart an instance on unexpected exit",
"type": "bool"
}
},
{
"boot.autostart": {
"liveupdate": "no",
Expand Down
1 change: 1 addition & 0 deletions internal/version/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -438,6 +438,7 @@ var APIExtensions = []string{
"oidc_scopes",
"network_integrations_peer_name",
"qemu_scriptlet",
"instance_auto_restart",
}

// APIExtensionsCount returns the number of available API extensions.
Expand Down
18 changes: 18 additions & 0 deletions test/suites/basic.sh
Original file line number Diff line number Diff line change
Expand Up @@ -668,4 +668,22 @@ test_basic_usage() {
incus delete -f c1 c2 c3
remaining_instances="$(incus list --format csv)"
[ -z "${remaining_instances}" ]

# Test autorestart mechanism
incus launch testimage c1 -c boot.autorestart=true

for _ in $(seq 10); do
PID=$(incus info c1 | awk '/^PID/ {print $2}')
kill -9 "${PID}"
sleep 3
done

[ "$(incus list -cs -fcsv c1)" = "RUNNING" ] || false

PID=$(incus info c1 | awk '/^PID/ {print $2}')
kill -9 "${PID}"
sleep 3

[ "$(incus list -cs -fcsv c1)" = "STOPPED" ] || false
incus delete -f c1
}
Loading