Skip to content

Commit

Permalink
drviver: restore pledge environment during task recovery (#44)
Browse files Browse the repository at this point in the history
Fixes a bug where tasks would be lost after a plugin / agent restart.
  • Loading branch information
shoenig authored Jul 1, 2023
1 parent e4222c1 commit d0eb363
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 6 deletions.
4 changes: 3 additions & 1 deletion pkg/pledge/exec.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,11 @@ func New(bin string, env *Environment, opts *Options) Exec {
}
}

func Recover(pid int) Exec {
func Recover(pid int, env *Environment) Exec {
return &exe{
pid: pid,
env: env,
opts: nil, // necessary?
waiter: process.WaitOnOrphan(pid),
signal: process.Interrupts(pid),
cpu: new(resources.TrackCPU),
Expand Down
26 changes: 21 additions & 5 deletions pkg/plugin/driver.go
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ func (p *PledgeDriver) StartTask(config *drivers.TaskConfig) (*drivers.TaskHandl
Env: config.Env,
Dir: config.TaskDir().Dir,
User: config.User,
Cgroup: fmt.Sprintf("/sys/fs/cgroup/nomad.slice/%s.%s.scope", config.AllocID, config.Name),
Cgroup: p.cgroup(config.AllocID, config.Name),
}

opts, err := parseOptions(config)
Expand Down Expand Up @@ -280,12 +280,24 @@ func (p *PledgeDriver) RecoverTask(handle *drivers.TaskHandle) error {
return fmt.Errorf("failed to decode task state: %w", err)
}

var taskConfig TaskConfig
if err := taskState.TaskConfig.DecodeDriverConfig(&taskConfig); err != nil {
return fmt.Errorf("failed to decode task config: %w", err)
taskState.TaskConfig = handle.Config.Copy()
stdout, stderr, err := open(handle.Config.StdoutPath, handle.Config.StderrPath)
if err != nil {
p.logger.Error("failed to re-open log files", "error", err)
return fmt.Errorf("failed to open log file(s): %w", err)
}

// re-create the environment for pledge
env := &pledge.Environment{
Out: stdout,
Err: stderr,
Env: handle.Config.Env,
Dir: handle.Config.TaskDir().Dir,
User: handle.Config.User,
Cgroup: p.cgroup(handle.Config.AllocID, handle.Config.Name),
}

runner := pledge.Recover(taskState.PID)
runner := pledge.Recover(taskState.PID, env)
recHandle := task.RecreateHandle(runner, taskState.TaskConfig, taskState.StartedAt)
p.tasks.Set(taskState.TaskConfig.ID, recHandle)
return nil
Expand Down Expand Up @@ -439,3 +451,7 @@ func (p *PledgeDriver) ExecTask(taskID string, cmd []string, timeout time.Durati
// todo
return nil, fmt.Errorf("ExecTask not implemented")
}

func (*PledgeDriver) cgroup(allocID, task string) string {
return fmt.Sprintf("/sys/fs/cgroup/nomad.slice/%s.%s.scope", allocID, task)
}

0 comments on commit d0eb363

Please sign in to comment.