diff --git a/control/runner.go b/control/runner.go index cfd93e281..e6dc6c480 100644 --- a/control/runner.go +++ b/control/runner.go @@ -244,7 +244,7 @@ func (r *runner) HandleGomitEvent(e gomit.Event) { } if pool.Eligible() { - if pool.RestartCount() < MaxPluginRestartCount { + if pool.RestartCount() < MaxPluginRestartCount || MaxPluginRestartCount == -1 { e := r.restartPlugin(v.Key) if e != nil { runnerLog.WithFields(log.Fields{ @@ -257,9 +257,8 @@ func (r *runner) HandleGomitEvent(e gomit.Event) { runnerLog.WithFields(log.Fields{ "_block": "handle-events", - "event": v.Name, - "aplugin": v.Version, - "restart_count": pool.RestartCount(), + "aplugin": v.String, + "restart-count": pool.RestartCount(), }).Warning("plugin restarted") r.emitter.Emit(&control_event.RestartedAvailablePluginEvent{ @@ -270,6 +269,11 @@ func (r *runner) HandleGomitEvent(e gomit.Event) { Type: v.Type, }) } else { + runnerLog.WithFields(log.Fields{ + "_block": "handle-events", + "aplugin": v.String, + }).Warning("plugin disabled due to exceeding restart limit: ", MaxPluginRestartCount) + r.emitter.Emit(&control_event.MaxPluginRestartsExceededEvent{ Id: v.Id, Name: v.Name, diff --git a/docs/SNAPTELD_CONFIGURATION.md b/docs/SNAPTELD_CONFIGURATION.md index c4d26415a..7e5c8d9af 100644 --- a/docs/SNAPTELD_CONFIGURATION.md +++ b/docs/SNAPTELD_CONFIGURATION.md @@ -101,6 +101,10 @@ control: # not be loaded. Valid values are 0 - Off, 1 - Enabled, 2 - Warning plugin_trust_level: 1 + # max_plugin_restarts controls how many times a plugin is allowed to be restarted + # before failing. Snap will not disable a plugin due to failures when this value is -1. + max_plugin_restarts: 10 + # plugins section contains plugin config settings that will be applied for # plugins across tasks. 
plugins: diff --git a/docs/TASKS.md b/docs/TASKS.md index aed96b963..8760a895e 100644 --- a/docs/TASKS.md +++ b/docs/TASKS.md @@ -85,11 +85,14 @@ or without time zone offset (in that cases uppercase'Z' must be present): More on cron expressions can be found here: https://godoc.org/github.com/robfig/cron #### Max-Failures + By default, Snap will disable a task if there are 10 consecutive errors from any plugins within the workflow. The configuration -can be changed by specifying the number of failures value in the task header. If the max-failures value is -1, Snap will +can be changed by specifying the number of failures value in the task header. If the `max-failures` value is -1, Snap will not disable a task with consecutive failure. Instead, Snap will sleep for 1 second for every 10 consecutive failures and retry again. +If you intend to run tasks with `max-failures: -1`, please also configure `max_plugin_restarts: -1` in the [Snap daemon control configuration section](SNAPTELD_CONFIGURATION.md). + For more on tasks, visit [`SNAPTEL.md`](SNAPTEL.md). ### The Workflow diff --git a/examples/configs/snap-config-sample.yaml b/examples/configs/snap-config-sample.yaml index 57214d5bb..98e8f5538 100644 --- a/examples/configs/snap-config-sample.yaml +++ b/examples/configs/snap-config-sample.yaml @@ -66,7 +66,7 @@ control: plugin_trust_level: 0 # max_plugin_restarts controls how many times a plugin is allowed to be restarted - # before failing. + # before failing. Snap will not disable a plugin due to failures when this value is -1. max_plugin_restarts: 10 # plugins section contains plugin config settings that will be applied for