This repository has been archived by the owner on Jun 6, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 549
ErrorSpec: runtime part #3585
Merged
Merged
ErrorSpec: runtime part #3585
Changes from 22 commits
Commits
Show all changes
29 commits
Select commit
Hold shift + click to select a range
f2c8685
init runtime error spec info
Binyang2014 b760ebd
add tests
Binyang2014 5003a6f
minor change
Binyang2014 5108fdf
add build command
Binyang2014 7751716
fix build issue
Binyang2014 88bf36d
deploy change
Binyang2014 e7c8b94
add config
Binyang2014 1563fc8
add copy error sepc
Binyang2014 2531a7a
e2e fix
Binyang2014 1a7e0fa
ret code change
Binyang2014 d55482d
merge master branch
Binyang2014 d8bb812
change args seq
Binyang2014 bf31292
terminate fix
Binyang2014 39cabb7
minor change
Binyang2014 e1f8130
change config name
Binyang2014 31535bb
runtime fix
Binyang2014 6dca174
fix review comments
Binyang2014 0f87190
minor fix
Binyang2014 bfc872d
fix review comments
Binyang2014 b6f9bcb
fix issues
Binyang2014 91c58ea
add error spec for port conflict
Binyang2014 93f4b96
comment fix
Binyang2014 1b21556
fix error aggregator bug
Binyang2014 30cdc51
fix review comments
Binyang2014 2092999
Merge branch 'master' into binyli/runtime_error
Binyang2014 a0da0d2
fix init container bug
Binyang2014 6b03022
make kube-runtime only be delpoyed in k8s
Binyang2014 b902bd9
remove useless code
Binyang2014 c19e164
export PAI_WORD_DIR
Binyang2014 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -64,6 +64,7 @@ schema: | |
- PAI_RUNTIME | ||
- PAI_K8S | ||
- PAI_FC | ||
- PAI_HW | ||
- PAI_UNKNOWN | ||
- UNKNOWN | ||
- field: type | ||
|
@@ -564,7 +565,7 @@ spec: | |
reaction: RETRY_TO_MAX | ||
reason: "Failed to start docker container due to unknown error" | ||
repro: | ||
- "Submit a Pod with unknown capability specified in its SecurityContext" | ||
- "Submit a Pod with unknown capability specified in its SecurityContext" | ||
solution: | ||
- "Check diagnostics and find root cause" | ||
- "Wait result from next retry" | ||
|
@@ -598,6 +599,9 @@ spec: | |
solution: | ||
- "Wait result from next retry" | ||
- "Contact Cluster Admin" | ||
pattern: | ||
runtimeContainerPatterns: | ||
- exitCode: 130 | ||
|
||
- code: 131 | ||
phrase: ContainerSigQuitReceived | ||
|
@@ -613,6 +617,9 @@ spec: | |
solution: | ||
- "Wait result from next retry" | ||
- "Contact Cluster Admin" | ||
pattern: | ||
runtimeContainerPatterns: | ||
- exitCode: 131 | ||
|
||
- code: 132 | ||
phrase: ContainerSigIllReceived | ||
|
@@ -627,6 +634,9 @@ spec: | |
- "User program executes an illegal, malformed, unknown, or privileged machine instruction" | ||
solution: | ||
- "Check container log and fix your program bug" | ||
pattern: | ||
runtimeContainerPatterns: | ||
- exitCode: 132 | ||
|
||
- code: 134 | ||
phrase: ContainerSigAbrtReceived | ||
|
@@ -642,6 +652,9 @@ spec: | |
solution: | ||
- "Check container log and find root cause" | ||
- "Wait result from next retry" | ||
pattern: | ||
runtimeContainerPatterns: | ||
- exitCode: 134 | ||
|
||
- code: 135 | ||
phrase: ContainerSigBusReceived | ||
|
@@ -656,6 +669,9 @@ spec: | |
- "User program accesses an unaligned memory address" | ||
solution: | ||
- "Check container log and fix your program bug" | ||
pattern: | ||
runtimeContainerPatterns: | ||
- exitCode: 135 | ||
|
||
- code: 136 | ||
phrase: ContainerSigFpeReceived | ||
|
@@ -670,6 +686,9 @@ spec: | |
- "User program division by zero" | ||
solution: | ||
- "Check container log and fix your program bug" | ||
pattern: | ||
runtimeContainerPatterns: | ||
- exitCode: 136 | ||
|
||
- code: 137 | ||
phrase: ContainerSigKillReceived | ||
|
@@ -685,6 +704,9 @@ spec: | |
solution: | ||
- "Wait result from next retry" | ||
- "Contact Cluster Admin" | ||
pattern: | ||
runtimeContainerPatterns: | ||
- exitCode: 137 | ||
|
||
- code: 139 | ||
phrase: ContainerSigSegvReceived | ||
|
@@ -699,6 +721,9 @@ spec: | |
- "User program accesses an illegal memory address" | ||
solution: | ||
- "Check container log and fix your program bug" | ||
pattern: | ||
runtimeContainerPatterns: | ||
- exitCode: 139 | ||
|
||
- code: 141 | ||
phrase: ContainerSigPipeReceived | ||
|
@@ -713,6 +738,9 @@ spec: | |
- "User program writes to a pipe without a process connected to the other end" | ||
solution: | ||
- "Check container log and fix your program bug" | ||
pattern: | ||
runtimeContainerPatterns: | ||
- exitCode: 141 | ||
|
||
- code: 143 | ||
phrase: ContainerSigTermReceived | ||
|
@@ -728,6 +756,9 @@ spec: | |
solution: | ||
- "Wait result from next retry" | ||
- "Contact Cluster Admin" | ||
pattern: | ||
runtimeContainerPatterns: | ||
- exitCode: 143 | ||
|
||
|
||
########################################################################### | ||
|
@@ -736,7 +767,80 @@ spec: | |
# Description: User Container issued failures: | ||
# -> Voluntary failures caused by Container itself | ||
########################################################################### | ||
- code: 240 | ||
- code: 220 | ||
phrase: UserCommandExitWithError | ||
issuer: USER_CONTAINER | ||
causer: USER_CONTAINER | ||
type: USER_FAILURE | ||
stage: RUNNING | ||
behavior: PERMANENT | ||
reaction: NEVER_RETRY | ||
reason: "User command is invalid" | ||
repro: | ||
- "Start a job with invalid command" | ||
solution: | ||
- "Check the user command" | ||
- "Check the PATH environment variable in the container" | ||
pattern: | ||
runtimeContainerPatterns: | ||
- exitCode: 127 | ||
userLogRegex: "command not found" | ||
|
||
- code: 221 | ||
phrase: ContainerTensorflowOOMKilled | ||
issuer: PAI_RUNTIME | ||
causer: USER_CONTAINER | ||
type: USER_FAILURE | ||
stage: RUNNING | ||
behavior: PERMANENT | ||
reaction: NEVER_RETRY | ||
reason: "Tensorflow failed due to out of memory" | ||
repro: | ||
- "Tensorflow uses more memory than it is allocated" | ||
solution: | ||
- "Increase per task memory request" | ||
- "Decrease per task memory usage by such as increasing task number" | ||
- "Tuning the tensorflow program" | ||
pattern: | ||
runtimeContainerPatterns: | ||
- userLogRegex: "(?msi)tensorflow.*ResourceExhaustedError.*OOM.*" | ||
|
||
- code: 222 | ||
phrase: ContainerMPISegvFault | ||
issuer: PAI_RUNTIME | ||
causer: USER_CONTAINER | ||
type: USER_FAILURE | ||
stage: RUNNING | ||
behavior: PERMANENT | ||
reaction: NEVER_RETRY | ||
reason: "MPI failed due to memory segmentation fault" | ||
repro: | ||
- "MPI accesses unmapped memory region" | ||
solution: | ||
- "Check container log and fix your program bug" | ||
pattern: | ||
runtimeContainerPatterns: | ||
- userLogRegex: "(?msi)Signal code: Address not mapped.*" | ||
|
||
- code: 223 | ||
phrase: ContainerCudaUncorrectableECCError | ||
issuer: PAI_RUNTIME | ||
causer: PAI_HW | ||
type: PLATFORM_FAILURE | ||
stage: RUNNING | ||
behavior: TRANSIENT | ||
reaction: ALWAYS_RETRY | ||
reason: "Container failed due to GPU uncorrectable ECC error" | ||
repro: | ||
- "Run tensorflow on a GPU which has ECC error" | ||
solution: | ||
- "Wait result from next retry" | ||
- "Contact Cluster Admin" | ||
pattern: | ||
runtimeContainerPatterns: | ||
- userLogRegex: "(?msi)CUDA_ERROR_ECC_UNCORRECTABLE.*" | ||
|
||
- code: 249 | ||
phrase: PAIRuntimeSSHBarrierTimeout | ||
issuer: PAI_RUNTIME | ||
causer: PAI_RUNTIME | ||
|
@@ -798,10 +902,40 @@ spec: | |
solution: | ||
- "Contact PAI Dev to fix the PAI Runtime bug" | ||
|
||
- code: 253 | ||
phrase: ContainerPortConflict | ||
issuer: PAI_RUNTIME | ||
causer: PAI_RUNTIME | ||
type: PLATFORM_FAILURE | ||
stage: LAUNCHING | ||
behavior: TRANSIENT | ||
reaction: ALWAYS_RETRY | ||
reason: "Can not alloc request ports due to ports is occupied" | ||
repro: | ||
- "Run task in a host which most of ports are occupied" | ||
solution: | ||
- "Wait result from next retry" | ||
- "Decrease per task ports request" | ||
|
||
- code: 255 | ||
phrase: PAIRuntimeUnknownFailed | ||
issuer: PAI_RUNTIME | ||
causer: UNKNOWN | ||
type: UNKNOWN_FAILURE | ||
stage: UNKNOWN | ||
behavior: UNKNOWN | ||
reaction: RETRY_TO_MAX | ||
reason: "User command failed but the failure cannot be recognized by PAI Runtime" | ||
repro: | ||
- "User program directly exits with exitcode 1" | ||
solution: | ||
- "Check container log and find root cause" | ||
- "Wait result from next retry" | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This should be unknown; please refer to the previous spec on YARN:
|
||
|
||
################################ | ||
# Range: {Undefined Negative ExitCodes} | ||
# Owner: PAI_LAUNCHER | ||
# Owner: PAI_FC | ||
# Description: Shadow Fallback ExitCode | ||
################################ | ||
# Here the code -8000 is just used to represent all undefined negative exitcodes | ||
|
44 changes: 44 additions & 0 deletions
44
src/kube-runtime/GOPATH/src/github.com/microsoft/runtime/Gopkg.lock
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
11 changes: 11 additions & 0 deletions
11
src/kube-runtime/GOPATH/src/github.com/microsoft/runtime/Gopkg.toml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
[prune] | ||
go-tests = true | ||
unused-packages = true | ||
|
||
[[constraint]] | ||
name = "gopkg.in/yaml.v2" | ||
version = "2.2.2" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why not use a built-in YAML library? #Closed There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I can't find a built-in YAML library. Am I missing something? In reply to: 321699635 [](ancestors = 321699635) |
||
|
||
[[constraint]] | ||
name = "github.com/stretchr/testify" | ||
version = "1.3.0" |
43 changes: 43 additions & 0 deletions
43
src/kube-runtime/GOPATH/src/github.com/microsoft/runtime/build/runtime/go-build.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
#!/bin/bash | ||
|
||
# MIT License | ||
# | ||
# Copyright (c) Microsoft Corporation. All rights reserved. | ||
# | ||
# Permission is hereby granted, free of charge, to any person obtaining a copy | ||
# of this software and associated documentation files (the "Software"), to deal | ||
# in the Software without restriction, including without limitation the rights | ||
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
# copies of the Software, and to permit persons to whom the Software is | ||
# furnished to do so, subject to the following conditions: | ||
# | ||
# The above copyright notice and this permission notice shall be included in all | ||
# copies or substantial portions of the Software. | ||
# | ||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
# SOFTWARE | ||
|
||
set -o errexit | ||
set -o nounset | ||
set -o pipefail | ||
|
||
BASH_DIR=$(cd $(dirname ${BASH_SOURCE}) && pwd) | ||
# Ensure ${PROJECT_DIR} is ${GOPATH}/src/github.com/microsoft/runtime | ||
PROJECT_DIR=${BASH_DIR}/../.. | ||
DIST_DIR=${PROJECT_DIR}/dist/runtime | ||
|
||
cd ${PROJECT_DIR} | ||
|
||
rm -rf ${DIST_DIR} | ||
mkdir -p ${DIST_DIR} | ||
|
||
go build -o ${DIST_DIR}/exithandler cmd/exithandler/* | ||
chmod a+x ${DIST_DIR}/exithandler | ||
|
||
echo Succeeded to build binary distribution into ${DIST_DIR}: | ||
cd ${DIST_DIR} && ls -lR . |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should the codes from the link below be added? https://github.com/microsoft/frameworkcontroller/blob/b28b9a3d8d3aac29a17e719b750c53b7d11890f3/example/config/default/frameworkcontroller.yaml#L232-L259 #Closed