Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Support OOMKilled with container-set. Fixes #10063 #11484

Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 26 additions & 17 deletions cmd/argoexec/commands/emissary.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,24 +70,38 @@ func NewEmissaryCommand() *cobra.Command {
return fmt.Errorf("failed to unmarshal template: %w", err)
}

// setup signal handlers
signals := make(chan os.Signal, 1)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moved from lower in the file.

defer close(signals)
signal.Notify(signals)
defer signal.Reset()

for _, x := range template.ContainerSet.GetGraph() {
if x.Name == containerName {
for _, y := range x.Dependencies {
logger.Infof("waiting for dependency %q", y)
for {
data, err := os.ReadFile(filepath.Clean(varRunArgo + "/ctr/" + y + "/exitcode"))
if os.IsNotExist(err) {
time.Sleep(time.Second)
continue
}
exitCode, err := strconv.Atoi(string(data))
if err != nil {
return fmt.Errorf("failed to read exit-code of dependency %q: %w", y, err)
}
if exitCode != 0 {
return fmt.Errorf("dependency %q exited with non-zero code: %d", y, exitCode)
select {
// If we receive a terminated or killed signal, we should exit immediately.
case s := <-signals:
if s == osspecific.Term || s == os.Kill {
return fmt.Errorf("received %q signal while waiting for dependency", s)
}
default:
data, err := os.ReadFile(filepath.Clean(varRunArgo + "/ctr/" + y + "/exitcode"))
if os.IsNotExist(err) {
time.Sleep(time.Second)
continue
}
exitCode, err := strconv.Atoi(string(data))
if err != nil {
return fmt.Errorf("failed to read exit-code of dependency %q: %w", y, err)
}
if exitCode != 0 {
return fmt.Errorf("dependency %q exited with non-zero code: %d", y, exitCode)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need a rebase here

}
break
}
break
}
}
}
Expand Down Expand Up @@ -117,11 +131,6 @@ func NewEmissaryCommand() *cobra.Command {
}

cmdErr := retry.OnError(backoff, func(error) bool { return true }, func() error {
// setup signal handlers
signals := make(chan os.Signal, 1)
defer close(signals)
signal.Notify(signals)
defer signal.Reset()

command, closer, err := startCommand(name, args, template)
if err != nil {
Expand Down
23 changes: 23 additions & 0 deletions test/e2e/signals_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,29 @@ func (s *SignalsSuite) TestSignaled() {
})
}

func (s *SignalsSuite) TestSignaledContainerSet() {
s.Given().
Workflow("@testdata/signaled-container-set-workflow.yaml").
When().
SubmitWorkflow().
WaitForWorkflow().
Then().
ExpectWorkflow(func(t *testing.T, metadata *metav1.ObjectMeta, status *wfv1.WorkflowStatus) {
assert.Equal(t, wfv1.WorkflowFailed, status.Phase)
assert.Equal(t, "OOMKilled (exit code 137)", status.Message)
one := status.Nodes.FindByDisplayName("one")
if assert.NotNil(t, one) {
assert.Equal(t, wfv1.NodeFailed, one.Phase)
assert.Equal(t, "OOMKilled (exit code 137): ", one.Message)
}
two := status.Nodes.FindByDisplayName("two")
if assert.NotNil(t, two) {
assert.Equal(t, wfv1.NodeError, two.Phase)
assert.Equal(t, "Error (exit code 64): received \"terminated\" signal while waiting for dependency", two.Message)
}
})
}

func TestSignalsSuite(t *testing.T) {
suite.Run(t, new(SignalsSuite))
}
50 changes: 50 additions & 0 deletions test/e2e/testdata/signaled-container-set-workflow.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
apiVersion: argoproj.io/v1alpha1
kind: Workflow
metadata:
name: example
namespace: argo
spec:
templates:
- name: entrypoint
metadata:
containerSet:
containers:
- name: one
resources:
requests:
memory: "50Mi"
cpu: "50m"
limits:
memory: "50Mi"
image: argoproj/argosay:v2
command:
- bash
- '-c'
args:
- |
/bin/bash <<'EOF'
echo "hello one"
apt update -y
apt install stress -y
echo 'stress --vm 1 --vm-bytes 512M --vm-hang 100' > abc.sh
bash abc.sh
EOF
- name: two
resources:
requests:
memory: "150Mi"
cpu: "50m"
limits:
memory: "250Mi"
image: argoproj/argosay:v2
command:
- bash
- '-c'
args:
- |
/bin/bash <<'EOF'
echo "hello world"
EOF
dependencies:
- one
entrypoint: entrypoint
10 changes: 9 additions & 1 deletion workflow/controller/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -1422,8 +1422,16 @@ func getExitCode(pod *apiv1.Pod) *int32 {
}

func podHasContainerNeedingTermination(pod *apiv1.Pod, tmpl wfv1.Template) bool {
// pod needs to be terminated if any of the following are true:
// 1. any main container has exited with non-zero exit code
// 2. all main containers have exited
// pod termination will cause the wait container to finish
for _, c := range pod.Status.ContainerStatuses {
if tmpl.IsMainContainerName(c.Name) && c.State.Terminated != nil && c.State.Terminated.ExitCode != 0 {
return true
}
}
for _, c := range pod.Status.ContainerStatuses {
// Only clean up pod when all main containers are terminated
if tmpl.IsMainContainerName(c.Name) && c.State.Terminated == nil {
return false
}
Expand Down
4 changes: 4 additions & 0 deletions workflow/executor/os-specific/signal_darwin.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@ import (
"github.com/argoproj/argo-workflows/v3/util/errors"
)

var (
Term = syscall.SIGTERM
)

func CanIgnoreSignal(s os.Signal) bool {
return s == syscall.SIGCHLD || s == syscall.SIGURG
}
Expand Down
4 changes: 4 additions & 0 deletions workflow/executor/os-specific/signal_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ import (
"github.com/argoproj/argo-workflows/v3/util/errors"
)

var (
Term = os.Interrupt
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is to os.Term because Windows does not have that. We fake it.

)

func CanIgnoreSignal(s os.Signal) bool {
return false
}
Expand Down