Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

ErrorSpec: runtime part #3585

Merged
merged 29 commits into from
Sep 18, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,6 @@ __pycache__
/subprojects/GOPATH/pkg/
*.egg-info/
.openpai/
.ipynb_checkpoints/
.ipynb_checkpoints/
/src/kube-runtime/GOPATH/bin/
/src/kube-runtime/GOPATH/pkg/
140 changes: 137 additions & 3 deletions src/k8s-job-exit-spec/config/k8s-job-exit-spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ schema:
- PAI_RUNTIME
- PAI_K8S
- PAI_FC
- PAI_HW
- PAI_UNKNOWN
- UNKNOWN
- field: type
Expand Down Expand Up @@ -580,7 +581,7 @@ spec:

###########################################################################
# Range: [129, 192]
# Owner: PAI_FC
# Owner: PAI_FC and PAI_RUNTIME
# Description: User Container issued failures:
# -> Involuntary failures caused by OS Signal
###########################################################################
Expand All @@ -598,6 +599,9 @@ spec:
solution:
- "Wait result from next retry"
- "Contact Cluster Admin"
pattern:
runtimeContainerPatterns:
- exitCode: 130

- code: 131
phrase: ContainerSigQuitReceived
Expand All @@ -613,6 +617,9 @@ spec:
solution:
- "Wait result from next retry"
- "Contact Cluster Admin"
pattern:
runtimeContainerPatterns:
- exitCode: 131

- code: 132
phrase: ContainerSigIllReceived
Expand All @@ -627,6 +634,9 @@ spec:
- "User program executes an illegal, malformed, unknown, or privileged machine instruction"
solution:
- "Check container log and fix your program bug"
pattern:
runtimeContainerPatterns:
- exitCode: 132

- code: 134
phrase: ContainerSigAbrtReceived
Expand All @@ -642,6 +652,9 @@ spec:
solution:
- "Check container log and find root cause"
- "Wait result from next retry"
pattern:
runtimeContainerPatterns:
- exitCode: 134

- code: 135
phrase: ContainerSigBusReceived
Expand All @@ -656,6 +669,9 @@ spec:
- "User program accesses an unaligned memory address"
solution:
- "Check container log and fix your program bug"
pattern:
runtimeContainerPatterns:
- exitCode: 135

- code: 136
phrase: ContainerSigFpeReceived
Expand All @@ -670,6 +686,9 @@ spec:
- "User program division by zero"
solution:
- "Check container log and fix your program bug"
pattern:
runtimeContainerPatterns:
- exitCode: 136

- code: 137
phrase: ContainerSigKillReceived
Expand All @@ -685,6 +704,9 @@ spec:
solution:
- "Wait result from next retry"
- "Contact Cluster Admin"
pattern:
runtimeContainerPatterns:
- exitCode: 137

- code: 139
phrase: ContainerSigSegvReceived
Expand All @@ -699,6 +721,9 @@ spec:
- "User program accesses an illegal memory address"
solution:
- "Check container log and fix your program bug"
pattern:
runtimeContainerPatterns:
- exitCode: 139

- code: 141
phrase: ContainerSigPipeReceived
Expand All @@ -713,6 +738,9 @@ spec:
- "User program writes to a pipe without a process connected to the other end"
solution:
- "Check container log and fix your program bug"
pattern:
runtimeContainerPatterns:
- exitCode: 141

- code: 143
phrase: ContainerSigTermReceived
Expand All @@ -728,6 +756,9 @@ spec:
solution:
- "Wait result from next retry"
- "Contact Cluster Admin"
pattern:
runtimeContainerPatterns:
- exitCode: 143


###########################################################################
Expand All @@ -736,7 +767,80 @@ spec:
# Description: User Container issued failures:
Copy link
Member

@yqwang-ms yqwang-ms Sep 10, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

# -> Voluntary failures caused by Container itself
###########################################################################
- code: 240
- code: 220
phrase: UserCommandExitWithError
issuer: USER_CONTAINER
causer: USER_CONTAINER
type: USER_FAILURE
stage: RUNNING
behavior: PERMANENT
reaction: NEVER_RETRY
reason: "User command is invalid"
repro:
- "Start a job with invalid command"
solution:
- "Check the user command"
- "Check the PATH environment variable in the container"
pattern:
runtimeContainerPatterns:
- exitCode: 127
userLogRegex: "command not found"

- code: 221
phrase: ContainerTensorflowOOMKilled
issuer: PAI_RUNTIME
causer: USER_CONTAINER
type: USER_FAILURE
stage: RUNNING
behavior: PERMANENT
reaction: NEVER_RETRY
reason: "Tensorflow failed due to out of memory"
repro:
- "Tensorflow uses more memory than it is allocated"
solution:
- "Increase per task memory request"
- "Decrease per task memory usage by such as increasing task number"
- "Tuning the tensorflow program"
pattern:
runtimeContainerPatterns:
- userLogRegex: "(?msi)tensorflow.*ResourceExhaustedError.*OOM.*"

- code: 222
phrase: ContainerMPISegvFault
issuer: PAI_RUNTIME
causer: USER_CONTAINER
type: USER_FAILURE
stage: RUNNING
behavior: PERMANENT
reaction: NEVER_RETRY
reason: "MPI failed due to memory segmentation fault"
repro:
- "MPI accesses unmapped memory region"
solution:
- "Check container log and fix your program bug"
pattern:
runtimeContainerPatterns:
- userLogRegex: "(?msi)Signal code: Address not mapped.*"

- code: 223
phrase: ContainerCudaUncorrectableECCError
issuer: PAI_RUNTIME
causer: PAI_HW
type: PLATFORM_FAILURE
stage: RUNNING
behavior: TRANSIENT
reaction: ALWAYS_RETRY
reason: "Container failed due to GPU uncorrectable ECC error"
repro:
- "Run tensorflow on a GPU which has ECC error"
solution:
- "Wait result from next retry"
- "Contact Cluster Admin"
pattern:
runtimeContainerPatterns:
- userLogRegex: "(?msi)CUDA_ERROR_ECC_UNCORRECTABLE.*"

- code: 249
phrase: PAIRuntimeSSHBarrierTimeout
issuer: PAI_RUNTIME
causer: PAI_RUNTIME
Expand Down Expand Up @@ -798,10 +902,40 @@ spec:
solution:
- "Contact PAI Dev to fix the PAI Runtime bug"

- code: 253
phrase: ContainerPortConflict
issuer: PAI_RUNTIME
causer: PAI_RUNTIME
type: PLATFORM_FAILURE
stage: LAUNCHING
behavior: TRANSIENT
reaction: ALWAYS_RETRY
reason: "Can not alloc request ports due to ports is occupied"
repro:
- "Run task in a host which most of ports are occupied"
solution:
- "Wait result from next retry"
- "Decrease per task ports request"

- code: 255
phrase: PAIRuntimeUnknownFailed
issuer: PAI_RUNTIME
causer: UNKNOWN
type: UNKNOWN_FAILURE
stage: UNKNOWN
behavior: UNKNOWN
reaction: RETRY_TO_MAX
reason: "User command failed but the failure cannot be recognized by PAI Runtime"
repro:
- "User program directly exits with exitcode 1"
solution:
- "Check container log and find root cause"
- "Wait result from next retry"

Copy link
Member

@yqwang-ms yqwang-ms Sep 10, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be unknown; please refer to the previous spec on YARN:

  • code: 255
    phrase: PAIRuntimeUnknownFailed
    issuer: PAI_RUNTIME
    causer: UNKNOWN
    type: UNKNOWN_FAILURE
    stage: COMPLETING
    behavior: UNKNOWN
    reaction: RETRY_TO_MAX
    reason: "Container failed but the failure cannot be recognized by PAI Runtime"
    repro:
    • "User program directly exits with exitcode 1"
      solution:
    • "Check container log and find root cause"
    • "Wait result from next retry" #Closed


################################
# Range: {Undefined Negative ExitCodes}
# Owner: PAI_LAUNCHER
# Owner: PAI_FC
# Description: Shadow Fallback ExitCode
################################
# Here the code -8000 is just used to represent all undefined negative exitcodes
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[prune]
go-tests = true
unused-packages = true

[[constraint]]
name = "gopkg.in/yaml.v2"
version = "2.2.2"
Copy link
Member

@yqwang-ms yqwang-ms Sep 6, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not use the built-in yaml? #Closed

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can't find a built-in yaml lib. Am I missing something?


In reply to: 321699635 [](ancestors = 321699635)


[[constraint]]
name = "github.com/stretchr/testify"
version = "1.3.0"
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/bin/bash

# MIT License
#
# Copyright (c) Microsoft Corporation. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE

# Build script: compiles the exithandler binary from cmd/exithandler/ and
# places it in ${PROJECT_DIR}/dist/runtime, replacing any previous output.
# Requires the `go` toolchain on PATH. Takes no arguments.

# Abort on any failing command, unset variable, or failing pipeline stage.
set -o errexit
set -o nounset
set -o pipefail

# Absolute directory containing this script; quoted so that paths with
# whitespace or glob characters are handled correctly.
BASH_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# PROJECT_DIR is two levels above this script's directory.
# NOTE(review): the previous comment required PROJECT_DIR to be
# ${GOPATH}/src/github.com/microsoft/hivedscheduler, which looks copied from
# another project -- confirm the intended GOPATH location for this project.
PROJECT_DIR="${BASH_DIR}/../.."
DIST_DIR="${PROJECT_DIR}/dist/runtime"

# The relative source path passed to `go build` below is resolved from here.
cd "${PROJECT_DIR}"

# Start from a clean output directory. `:?` aborts instead of running
# `rm -rf` on an empty path should DIST_DIR ever end up unset or empty.
rm -rf -- "${DIST_DIR:?}"
mkdir -p -- "${DIST_DIR}"

# The source glob is intentionally left unquoted so the shell expands it to
# the individual files under cmd/exithandler/.
go build -o "${DIST_DIR}/exithandler" cmd/exithandler/*
chmod a+x "${DIST_DIR}/exithandler"

echo "Succeeded to build binary distribution into ${DIST_DIR}:"
cd "${DIST_DIR}" && ls -lR .
Loading