From 2adba1b9865c0e4e3c003aedd532304cd6a63e3d Mon Sep 17 00:00:00 2001 From: WangDian Date: Fri, 6 Sep 2019 19:36:57 +0800 Subject: [PATCH 1/9] Add ssh barrier Add ssh barrier --- src/kube-runtime/src/plugins/ssh/README.md | 3 + .../src/plugins/ssh/sshbarrier.sh | 75 +++++++++++++++++++ src/kube-runtime/src/plugins/ssh/sshd.sh | 4 + .../test/sshbarrier_test_job.yaml | 60 +++++++++++++++ 4 files changed, 142 insertions(+) create mode 100644 src/kube-runtime/src/plugins/ssh/sshbarrier.sh create mode 100644 src/kube-runtime/test/sshbarrier_test_job.yaml diff --git a/src/kube-runtime/src/plugins/ssh/README.md b/src/kube-runtime/src/plugins/ssh/README.md index cee04f2216..fc64c6d17d 100644 --- a/src/kube-runtime/src/plugins/ssh/README.md +++ b/src/kube-runtime/src/plugins/ssh/README.md @@ -10,9 +10,12 @@ extras: - plugin: ssh parameters: jobssh: boolean + sshbarrier: + - taskrole userssh: type: string value: string ``` - jobssh: true to enable job container wise ssh, false to disable. +- sshbarrier: wait until can ssh to all defined taskrole's job containers. Only valid when jobssh is true. - userssh: currently the userssh type should be system|custom. Type system means the value is a key stored in PAI, and type custom means the value is the string defined in job config. \ No newline at end of file diff --git a/src/kube-runtime/src/plugins/ssh/sshbarrier.sh b/src/kube-runtime/src/plugins/ssh/sshbarrier.sh new file mode 100644 index 0000000000..3b7fe5f12c --- /dev/null +++ b/src/kube-runtime/src/plugins/ssh/sshbarrier.sh @@ -0,0 +1,75 @@ +#!/bin/bash +# Copyright (c) Microsoft Corporation +# All rights reserved. 
+# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated +# documentation files (the "Software"), to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and +# to permit persons to whom the Software is furnished to do so, subject to the following conditions: +# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING +# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +# no set -o errexit because use exitcode to judge ssh connectivity +set -o nounset +set -o pipefail + +readonly MAX_RETRY_COUNT=10 +readonly RETRY_INTERVAL=1 + +function check_ssh_connection() +{ + ssh -q -o BatchMode=yes -o StrictHostKeyChecking=no $1 "exit 0" + _RCODE=$? 
+ return $_RCODE +} + + +instanceToCheck=() +# Set ssh config for all task role instances +taskRoleInstanceArray=(${PAI_TASK_ROLE_INSTANCES//,/ }) + +for validTaskRole in $@; do + for i in "${taskRoleInstanceArray[@]}"; do + instancePair=(${i//:/ }) + taskRole=${instancePair[0]} + index=${instancePair[1]} + + if [[ $taskRole -ne $validTaskRole ]]; then + continue + fi + + if [[ $taskrole = $FC_TASKROLE_NAME ]] && [[ $index = $FC_TASK_INDEX ]]; then + continue + fi + + instanceToCheck+=("${taskrole}-${index}") + done +done + +retryCount=0 +while [ ${#instanceToCheck[@]} -ne 0 ] +do + if [ $retryCount -ge $MAX_RETRY_COUNT ]; then + exit 240 + fi + + instanceFailed=() + for instance in "${instanceToCheck[@]}"; do + check_ssh_connection "$instance" + if [ $? -ne 0 ]; then + instanceFailed+=("$instance") + fi + done + instanceToCheck=(${instanceFailed[*]}) + + ((retryCount++)) + sleep $RETRY_INTERVAL +done + diff --git a/src/kube-runtime/src/plugins/ssh/sshd.sh b/src/kube-runtime/src/plugins/ssh/sshd.sh index 8e7a4cdd9b..5c58cef858 100644 --- a/src/kube-runtime/src/plugins/ssh/sshd.sh +++ b/src/kube-runtime/src/plugins/ssh/sshd.sh @@ -16,6 +16,10 @@ # DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +set -o errexit +set -o nounset +set -o pipefail + PAI_WORK_DIR=/usr/local/pai SSH_DIR=/root/.ssh diff --git a/src/kube-runtime/test/sshbarrier_test_job.yaml b/src/kube-runtime/test/sshbarrier_test_job.yaml new file mode 100644 index 0000000000..c2dc3ca9cc --- /dev/null +++ b/src/kube-runtime/test/sshbarrier_test_job.yaml @@ -0,0 +1,60 @@ +protocolVersion: 2 +name: horovod_pytorch_synthetic_benchmark +type: job +version: horovod0.16.4-tf1.12.0-torch1.1.0-mxnet1.4.1-py3.5 +contributor: OpenPAI +description: | + This is a distributed synthetic benchmark for Horovod with PyTorch backend running on OpenPAI. 
+ It runs [Horovod with Open MPI](https://github.com/horovod/horovod/blob/master/docs/mpirun.rst). +parameters: + model: resnet50 + batchsize: 64 + +prerequisites: + - protocolVersion: 2 + name: horovod_official + type: dockerimage + contributor : Horovod + uri : horovod/horovod:0.16.4-tf1.12.0-torch1.1.0-mxnet1.4.1-py3.5 + +taskRoles: + master: + instances: 1 + completion: + minSucceededInstances: 1 + dockerImage: horovod_official + resourcePerInstance: + cpu: 16 + memoryMB: 16384 + gpu: 4 + commands: + - > + horovodrun -np 8 -H master-0:4,worker-0:4 + python pytorch_synthetic_benchmark.py + --model <% $parameters.model %> + --batch-size <% $parameters.batchsize %> + worker: + instances: 1 + dockerImage: horovod_official + resourcePerInstance: + cpu: 16 + memoryMB: 16384 + gpu: 4 + commands: + - sleep infinity + +extras: + com.microsoft.pai.runtimeplugin: + - plugin: ssh + taskroles: + - master + parameters: + jobssh: true + sshbarrier: + - master + - worker + - plugin: ssh + taskroles: + - worker + parameters: + jobssh: true \ No newline at end of file From 65031a0ec13d3821e18186e61717d305111751da Mon Sep 17 00:00:00 2001 From: WangDian Date: Fri, 6 Sep 2019 19:59:12 +0800 Subject: [PATCH 2/9] Add unsaved codes --- src/kube-runtime/src/plugins/ssh/init.py | 18 ++++++++++++------ src/kube-runtime/src/plugins/ssh/sshbarrier.sh | 3 +++ src/kube-runtime/test/sshbarrier_test_job.yaml | 10 +++++----- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/src/kube-runtime/src/plugins/ssh/init.py b/src/kube-runtime/src/plugins/ssh/init.py index 7167acbfbf..76a6905188 100644 --- a/src/kube-runtime/src/plugins/ssh/init.py +++ b/src/kube-runtime/src/plugins/ssh/init.py @@ -33,12 +33,12 @@ if __name__ == "__main__": [parameters, pre_script, post_script] = plugin_init() - cmdParams = [] - if parameters is not None: + if parameters is not None: if "jobssh" in parameters: - cmdParams.append(str(parameters["jobssh"]).lower()) + jobssh = 
str(parameters["jobssh"]).lower()) else: - cmdParams.append("false") + jobssh = "false" + cmdParams = [jobssh] if "userssh" in parameters: if "type" in parameters["userssh"] and "value" in parameters["userssh"]: @@ -46,5 +46,11 @@ cmdParams.append("\'{}\'".format(parameters["userssh"]["value"])) # write call to real executable script - command = "{}/sshd.sh {}\n".format(os.path.dirname(os.path.abspath(__file__)), " ".join(cmdParams)) - inject_commands([command], pre_script) + command = ["{}/sshd.sh {}\n".format(os.path.dirname(os.path.abspath(__file__)), " ".join(cmdParams))] + + # ssh barrier + if jobssh == "true" and "sshbarrier" in parameters: + barrierParams = " ".join('"{}"'.format(tr) for tr in parameters["sshbarrier"]) + command.append("{}/sshbarrier.sh {}\n".format(os.path.dirname(os.path.abspath(__file__)), barrierParams)) + + inject_commands(command, pre_script) diff --git a/src/kube-runtime/src/plugins/ssh/sshbarrier.sh b/src/kube-runtime/src/plugins/ssh/sshbarrier.sh index 3b7fe5f12c..8ac469ace0 100644 --- a/src/kube-runtime/src/plugins/ssh/sshbarrier.sh +++ b/src/kube-runtime/src/plugins/ssh/sshbarrier.sh @@ -53,6 +53,8 @@ for validTaskRole in $@; do done done +echo "Checking instances: ${instanceToCheck[*]}" + retryCount=0 while [ ${#instanceToCheck[@]} -ne 0 ] do @@ -73,3 +75,4 @@ do sleep $RETRY_INTERVAL done +echo "All ssh connections are set, continue." 
diff --git a/src/kube-runtime/test/sshbarrier_test_job.yaml b/src/kube-runtime/test/sshbarrier_test_job.yaml index c2dc3ca9cc..eb791bb91f 100644 --- a/src/kube-runtime/test/sshbarrier_test_job.yaml +++ b/src/kube-runtime/test/sshbarrier_test_job.yaml @@ -1,5 +1,5 @@ protocolVersion: 2 -name: horovod_pytorch_synthetic_benchmark +name: sshbarrier_test_job type: job version: horovod0.16.4-tf1.12.0-torch1.1.0-mxnet1.4.1-py3.5 contributor: OpenPAI @@ -24,9 +24,9 @@ taskRoles: minSucceededInstances: 1 dockerImage: horovod_official resourcePerInstance: - cpu: 16 + cpu: 4 memoryMB: 16384 - gpu: 4 + gpu: 1 commands: - > horovodrun -np 8 -H master-0:4,worker-0:4 @@ -37,9 +37,9 @@ taskRoles: instances: 1 dockerImage: horovod_official resourcePerInstance: - cpu: 16 + cpu: 4 memoryMB: 16384 - gpu: 4 + gpu: 1 commands: - sleep infinity From 70722ef08cc671b4ff2cdf9aeb3ddd0e98ea3d83 Mon Sep 17 00:00:00 2001 From: WangDian Date: Fri, 6 Sep 2019 20:29:58 +0800 Subject: [PATCH 3/9] Bug fix --- src/kube-runtime/src/plugins/ssh/init.py | 4 ++-- src/kube-runtime/src/plugins/ssh/sshbarrier.sh | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/kube-runtime/src/plugins/ssh/init.py b/src/kube-runtime/src/plugins/ssh/init.py index 76a6905188..0a6e426aa9 100644 --- a/src/kube-runtime/src/plugins/ssh/init.py +++ b/src/kube-runtime/src/plugins/ssh/init.py @@ -33,9 +33,9 @@ if __name__ == "__main__": [parameters, pre_script, post_script] = plugin_init() - if parameters is not None: + if parameters is not None: if "jobssh" in parameters: - jobssh = str(parameters["jobssh"]).lower()) + jobssh = str(parameters["jobssh"]).lower() else: jobssh = "false" cmdParams = [jobssh] diff --git a/src/kube-runtime/src/plugins/ssh/sshbarrier.sh b/src/kube-runtime/src/plugins/ssh/sshbarrier.sh index 8ac469ace0..6eb398c26f 100644 --- a/src/kube-runtime/src/plugins/ssh/sshbarrier.sh +++ b/src/kube-runtime/src/plugins/ssh/sshbarrier.sh @@ -17,7 +17,7 @@ # OUT OF OR IN CONNECTION WITH THE 
SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. # no set -o errexit because use exitcode to judge ssh connectivity -set -o nounset +# no set -o nounset because use empty array to judge end set -o pipefail readonly MAX_RETRY_COUNT=10 @@ -45,11 +45,11 @@ for validTaskRole in $@; do continue fi - if [[ $taskrole = $FC_TASKROLE_NAME ]] && [[ $index = $FC_TASK_INDEX ]]; then + if [[ $taskRole = $FC_TASKROLE_NAME ]] && [[ $index = $FC_TASK_INDEX ]]; then continue fi - instanceToCheck+=("${taskrole}-${index}") + instanceToCheck+=("${taskRole}-${index}") done done @@ -75,4 +75,4 @@ do sleep $RETRY_INTERVAL done -echo "All ssh connections are set, continue." +echo "All ssh connections are set, continue..." From 999a5df91ffa1685e21c3fa8b556184154c1b37a Mon Sep 17 00:00:00 2001 From: WangDian Date: Mon, 9 Sep 2019 19:00:53 +0800 Subject: [PATCH 4/9] Make sshbarriertaskroles as optional paramters --- src/kube-runtime/src/plugins/ssh/README.md | 8 +++-- src/kube-runtime/src/plugins/ssh/init.py | 7 ++-- .../src/plugins/ssh/sshbarrier.sh | 35 ++++++++++--------- .../test/sshbarrier_test_job.yaml | 4 +-- src/kube-runtime/test/test_runtime.py | 10 ++++++ 5 files changed, 40 insertions(+), 24 deletions(-) diff --git a/src/kube-runtime/src/plugins/ssh/README.md b/src/kube-runtime/src/plugins/ssh/README.md index fc64c6d17d..2bb66c6feb 100644 --- a/src/kube-runtime/src/plugins/ssh/README.md +++ b/src/kube-runtime/src/plugins/ssh/README.md @@ -10,12 +10,14 @@ extras: - plugin: ssh parameters: jobssh: boolean - sshbarrier: + sshbarrier: boolean + sshbarriertaskroles: - taskrole userssh: type: string value: string ``` - jobssh: true to enable job container wise ssh, false to disable. -- sshbarrier: wait until can ssh to all defined taskrole's job containers. Only valid when jobssh is true. -- userssh: currently the userssh type should be system|custom. Type system means the value is a key stored in PAI, and type custom means the value is the string defined in job config. 
\ No newline at end of file +- sshbarrier: if set to true, wait until ssh connections to all corresponding job containers can be established. If not set, the default value is false. +- sshbarriertaskroles: only valid if sshbarrier is set to true. Defines the task roles that the barrier will test ssh to. If not defined, all taskroles will be included. +- userssh: currently the userssh type should be ```custom```. Type ```custom``` means use the userssh value as the SSH public key to run job. User can use the corresponding SSH private key to connect to job container. \ No newline at end of file diff --git a/src/kube-runtime/src/plugins/ssh/init.py b/src/kube-runtime/src/plugins/ssh/init.py index 0a6e426aa9..7a113076a7 100644 --- a/src/kube-runtime/src/plugins/ssh/init.py +++ b/src/kube-runtime/src/plugins/ssh/init.py @@ -49,8 +49,11 @@ command = ["{}/sshd.sh {}\n".format(os.path.dirname(os.path.abspath(__file__)), " ".join(cmdParams))] # ssh barrier - if jobssh == "true" and "sshbarrier" in parameters: - barrierParams = " ".join('"{}"'.format(tr) for tr in parameters["sshbarrier"]) + if jobssh == "true" and "sshbarrier" in parameters and str(parameters["sshbarrier"]).lower() == "true": + if "sshbarriertaskroles" in parameters: + barrierParams = " ".join('"{}"'.format(tr) for tr in parameters["sshbarrier"]) + else: + barrierParams = "" command.append("{}/sshbarrier.sh {}\n".format(os.path.dirname(os.path.abspath(__file__)), barrierParams)) inject_commands(command, pre_script) diff --git a/src/kube-runtime/src/plugins/ssh/sshbarrier.sh b/src/kube-runtime/src/plugins/ssh/sshbarrier.sh index 6eb398c26f..bee0c3b0a7 100644 --- a/src/kube-runtime/src/plugins/ssh/sshbarrier.sh +++ b/src/kube-runtime/src/plugins/ssh/sshbarrier.sh @@ -20,7 +20,7 @@ # no set -o nounset because use empty array to judge end set -o pipefail -readonly MAX_RETRY_COUNT=10 +readonly MAX_RETRY_COUNT=20 readonly RETRY_INTERVAL=1 function check_ssh_connection() @@ -34,34 +34,37 @@ function check_ssh_connection() instanceToCheck=() # 
Set ssh config for all task role instances taskRoleInstanceArray=(${PAI_TASK_ROLE_INSTANCES//,/ }) +barrierTaskRoles=$@ -for validTaskRole in $@; do - for i in "${taskRoleInstanceArray[@]}"; do - instancePair=(${i//:/ }) - taskRole=${instancePair[0]} - index=${instancePair[1]} +for i in "${taskRoleInstanceArray[@]}"; do + instancePair=(${i//:/ }) + taskRole=${instancePair[0]} + index=${instancePair[1]} - if [[ $taskRole -ne $validTaskRole ]]; then - continue - fi + if [[ $taskRole = $FC_TASKROLE_NAME ]] && [[ $index = $FC_TASK_INDEX ]]; then + continue + fi - if [[ $taskRole = $FC_TASKROLE_NAME ]] && [[ $index = $FC_TASK_INDEX ]]; then - continue +# If barrier task roles defined, then only check instances for defined task roles. Otherwise check all instances. + if [ ${#barrierTaskRoles[@]} != 0 ]; then + if [[ " ${barrierTaskRoles[@]} " =~ " ${value} " ]]; then + instanceToCheck+=("${taskRole}-${index}") fi - + else instanceToCheck+=("${taskRole}-${index}") - done + fi done -echo "Checking instances: ${instanceToCheck[*]}" - retryCount=0 while [ ${#instanceToCheck[@]} -ne 0 ] do if [ $retryCount -ge $MAX_RETRY_COUNT ]; then + echo "SSH barrier reaches max retry count. Failed instances: ${instanceToCheck[*]} Exit..." exit 240 fi + echo "Trying to SSH to instances: ${instanceToCheck[*]}" + instanceFailed=() for instance in "${instanceToCheck[@]}"; do check_ssh_connection "$instance" @@ -75,4 +78,4 @@ do sleep $RETRY_INTERVAL done -echo "All ssh connections are set, continue..." +echo "All ssh connections are established, continue..." 
diff --git a/src/kube-runtime/test/sshbarrier_test_job.yaml b/src/kube-runtime/test/sshbarrier_test_job.yaml index eb791bb91f..c71a05ac2b 100644 --- a/src/kube-runtime/test/sshbarrier_test_job.yaml +++ b/src/kube-runtime/test/sshbarrier_test_job.yaml @@ -50,9 +50,7 @@ extras: - master parameters: jobssh: true - sshbarrier: - - master - - worker + sshbarrier: true - plugin: ssh taskroles: - worker diff --git a/src/kube-runtime/test/test_runtime.py b/src/kube-runtime/test/test_runtime.py index c1071d69d6..eed75ae0dc 100644 --- a/src/kube-runtime/test/test_runtime.py +++ b/src/kube-runtime/test/test_runtime.py @@ -52,6 +52,16 @@ def test_ssh_plugin(self): commands = [[],[]] init_plugins(jobconfig, commands, "../src/plugins", ".", "worker") + def test_ssh_plugin_barrier(self): + job_path = "sshbarrier_test_job.yaml" + if os.path.exists(job_path): + with open(job_path, 'rt') as f: + jobconfig = yaml.load(f) + commands = [[],[]] + init_plugins(jobconfig, commands, "../src/plugins", ".", "master") + commands = [[],[]] + init_plugins(jobconfig, commands, "../src/plugins", ".", "worker") + if __name__ == '__main__': unittest.main() \ No newline at end of file From 1397d5f104bace5586d0c84dfa25b2460c7ef8ce Mon Sep 17 00:00:00 2001 From: WangDian Date: Tue, 10 Sep 2019 10:32:46 +0800 Subject: [PATCH 5/9] Bug fix --- src/kube-runtime/src/plugins/ssh/init.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kube-runtime/src/plugins/ssh/init.py b/src/kube-runtime/src/plugins/ssh/init.py index 7a113076a7..a84f708719 100644 --- a/src/kube-runtime/src/plugins/ssh/init.py +++ b/src/kube-runtime/src/plugins/ssh/init.py @@ -51,7 +51,7 @@ # ssh barrier if jobssh == "true" and "sshbarrier" in parameters and str(parameters["sshbarrier"]).lower() == "true": if "sshbarriertaskroles" in parameters: - barrierParams = " ".join('"{}"'.format(tr) for tr in parameters["sshbarrier"]) + barrierParams = " ".join('"{}"'.format(tr) for tr in parameters["sshbarriertaskroles"]) else: 
barrierParams = "" command.append("{}/sshbarrier.sh {}\n".format(os.path.dirname(os.path.abspath(__file__)), barrierParams)) From ccf1a14431ea8e3121670b0c3ad4be74c29e6e1d Mon Sep 17 00:00:00 2001 From: WangDian Date: Tue, 10 Sep 2019 10:48:29 +0800 Subject: [PATCH 6/9] Bug fix --- src/kube-runtime/src/plugins/ssh/sshbarrier.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kube-runtime/src/plugins/ssh/sshbarrier.sh b/src/kube-runtime/src/plugins/ssh/sshbarrier.sh index bee0c3b0a7..4bb8711e0f 100644 --- a/src/kube-runtime/src/plugins/ssh/sshbarrier.sh +++ b/src/kube-runtime/src/plugins/ssh/sshbarrier.sh @@ -47,7 +47,7 @@ for i in "${taskRoleInstanceArray[@]}"; do # If barrier task roles defined, then only check instances for defined task roles. Otherwise check all instances. if [ ${#barrierTaskRoles[@]} != 0 ]; then - if [[ " ${barrierTaskRoles[@]} " =~ " ${value} " ]]; then + if [[ " ${barrierTaskRoles[@]} " =~ " ${taskRole} " ]]; then instanceToCheck+=("${taskRole}-${index}") fi else From b6cdc20a512bc7c39e7006582093dec3a9631188 Mon Sep 17 00:00:00 2001 From: WangDian Date: Tue, 10 Sep 2019 11:58:23 +0800 Subject: [PATCH 7/9] Add error spec for SSH barrier. Refine code Add error spec for SSH barrier. 
Refine code based on code review --- .../config/k8s-job-exit-spec.yaml | 18 ++++++++++ .../src/plugins/ssh/sshbarrier.sh | 36 ++++++++++--------- 2 files changed, 37 insertions(+), 17 deletions(-) diff --git a/src/k8s-job-exit-spec/config/k8s-job-exit-spec.yaml b/src/k8s-job-exit-spec/config/k8s-job-exit-spec.yaml index 923f79974d..08252f6df9 100644 --- a/src/k8s-job-exit-spec/config/k8s-job-exit-spec.yaml +++ b/src/k8s-job-exit-spec/config/k8s-job-exit-spec.yaml @@ -827,3 +827,21 @@ spec: - "PAI Runtime exits with exitcode 1" solution: - "Contact PAI Dev to fix PAI Runtime bugs" + +- code: 240 + phrase: PAIRuntimeSSHBarrierTimeout + issuer: PAI_RUNTIME + causer: PAI_RUNTIME + type: PLATFORM_FAILURE + stage: RUNNING + behavior: PERMANENT + reaction: NEVER_RETRY + reason: "SSH barrier reaches max retry count" + repro: + - "SSH barrier reaches max retry count, please check if SSH plugin is correctly configured" + solution: + - "Check job config to confirm all SSH barrier relied task roles enabled SSH" + pattern: + runtimeContainerPatterns: + - exitCode: 10 + platformLogRegex: "SSH barrier reaches max retry count" \ No newline at end of file diff --git a/src/kube-runtime/src/plugins/ssh/sshbarrier.sh b/src/kube-runtime/src/plugins/ssh/sshbarrier.sh index 4bb8711e0f..c2cb56c53e 100644 --- a/src/kube-runtime/src/plugins/ssh/sshbarrier.sh +++ b/src/kube-runtime/src/plugins/ssh/sshbarrier.sh @@ -30,13 +30,15 @@ function check_ssh_connection() return $_RCODE } +taskRolesToCheck=() +for barrierTaskRole in $@; do + taskRolesToCheck+=($barrierTaskRole) +done -instanceToCheck=() +instancesToCheck=() # Set ssh config for all task role instances -taskRoleInstanceArray=(${PAI_TASK_ROLE_INSTANCES//,/ }) -barrierTaskRoles=$@ - -for i in "${taskRoleInstanceArray[@]}"; do +taskRoleInstances=(${PAI_TASK_ROLE_INSTANCES//,/ }) +for i in "${taskRoleInstances[@]}"; do instancePair=(${i//:/ }) taskRole=${instancePair[0]} index=${instancePair[1]} @@ -46,33 +48,33 @@ for i in 
"${taskRoleInstanceArray[@]}"; do fi # If barrier task roles defined, then only check instances for defined task roles. Otherwise check all instances. - if [ ${#barrierTaskRoles[@]} != 0 ]; then - if [[ " ${barrierTaskRoles[@]} " =~ " ${taskRole} " ]]; then - instanceToCheck+=("${taskRole}-${index}") + if [[ ${#taskRolesToCheck[@]} != 0 ]]; then + if [[ ${taskRolesToCheck[@]} =~ ${taskRole} ]]; then + instancesToCheck+=("${taskRole}-${index}") fi else - instanceToCheck+=("${taskRole}-${index}") + instancesToCheck+=("${taskRole}-${index}") fi done retryCount=0 -while [ ${#instanceToCheck[@]} -ne 0 ] +while [[ ${#instancesToCheck[@]} != 0 ]] do - if [ $retryCount -ge $MAX_RETRY_COUNT ]; then - echo "SSH barrier reaches max retry count. Failed instances: ${instanceToCheck[*]} Exit..." - exit 240 + if [[ $retryCount > $MAX_RETRY_COUNT ]]; then + echo "SSH barrier reaches max retry count. Failed instances: ${instancesToCheck[*]} Exit..." >&2 + exit 10 fi - echo "Trying to SSH to instances: ${instanceToCheck[*]}" + echo "Trying to SSH to instances: ${instancesToCheck[*]}" instanceFailed=() - for instance in "${instanceToCheck[@]}"; do + for instance in "${instancesToCheck[@]}"; do check_ssh_connection "$instance" - if [ $? -ne 0 ]; then + if [[ $? 
!= 0 ]]; then instanceFailed+=("$instance") fi done - instanceToCheck=(${instanceFailed[*]}) + instancesToCheck=(${instanceFailed[*]}) ((retryCount++)) sleep $RETRY_INTERVAL From 9d94c82f9c07cd16b956ff5b0c3d39dec68c6ffd Mon Sep 17 00:00:00 2001 From: WangDian Date: Tue, 10 Sep 2019 12:39:01 +0800 Subject: [PATCH 8/9] Refine while --- src/kube-runtime/src/plugins/ssh/sshbarrier.sh | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/kube-runtime/src/plugins/ssh/sshbarrier.sh b/src/kube-runtime/src/plugins/ssh/sshbarrier.sh index c2cb56c53e..5233256263 100644 --- a/src/kube-runtime/src/plugins/ssh/sshbarrier.sh +++ b/src/kube-runtime/src/plugins/ssh/sshbarrier.sh @@ -58,13 +58,8 @@ for i in "${taskRoleInstances[@]}"; do done retryCount=0 -while [[ ${#instancesToCheck[@]} != 0 ]] +while true do - if [[ $retryCount > $MAX_RETRY_COUNT ]]; then - echo "SSH barrier reaches max retry count. Failed instances: ${instancesToCheck[*]} Exit..." >&2 - exit 10 - fi - echo "Trying to SSH to instances: ${instancesToCheck[*]}" instanceFailed=() @@ -74,9 +69,17 @@ do instanceFailed+=("$instance") fi done - instancesToCheck=(${instanceFailed[*]}) + [[ ${#instanceFailed[@]} = 0 ]] && break + + if (( $retryCount >= $MAX_RETRY_COUNT )); then + echo "SSH barrier reaches max retry count. Failed instances: ${instancesToCheck[*]} Exit..." 
>&2 + exit 10 + fi + + instancesToCheck=(${instanceFailed[*]}) ((retryCount++)) + sleep $RETRY_INTERVAL done From a46c5ef0f2fc01816342a5e8931dce70b24511a9 Mon Sep 17 00:00:00 2001 From: WangDian Date: Tue, 10 Sep 2019 13:34:39 +0800 Subject: [PATCH 9/9] Adjust test job and exit spec --- .../config/k8s-job-exit-spec.yaml | 36 +++++++++---------- .../test/sshbarrier_test_job.yaml | 11 +++--- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/src/k8s-job-exit-spec/config/k8s-job-exit-spec.yaml b/src/k8s-job-exit-spec/config/k8s-job-exit-spec.yaml index 08252f6df9..94361a80f8 100644 --- a/src/k8s-job-exit-spec/config/k8s-job-exit-spec.yaml +++ b/src/k8s-job-exit-spec/config/k8s-job-exit-spec.yaml @@ -736,6 +736,24 @@ spec: # Description: User Container issued failures: # -> Voluntary failures caused by Container itself ########################################################################### +- code: 240 + phrase: PAIRuntimeSSHBarrierTimeout + issuer: PAI_RUNTIME + causer: PAI_RUNTIME + type: PLATFORM_FAILURE + stage: RUNNING + behavior: PERMANENT + reaction: NEVER_RETRY + reason: "SSH barrier reaches max retry count" + repro: + - "SSH barrier reaches max retry count, please check if SSH plugin is correctly configured" + solution: + - "Check job config to confirm all SSH barrier relied task roles enabled SSH" + pattern: + runtimeContainerPatterns: + - exitCode: 10 + platformLogRegex: "SSH barrier reaches max retry count" + - code: 250 phrase: FrameworkBarrierTransientFailed issuer: PAI_RUNTIME @@ -827,21 +845,3 @@ spec: - "PAI Runtime exits with exitcode 1" solution: - "Contact PAI Dev to fix PAI Runtime bugs" - -- code: 240 - phrase: PAIRuntimeSSHBarrierTimeout - issuer: PAI_RUNTIME - causer: PAI_RUNTIME - type: PLATFORM_FAILURE - stage: RUNNING - behavior: PERMANENT - reaction: NEVER_RETRY - reason: "SSH barrier reaches max retry count" - repro: - - "SSH barrier reaches max retry count, please check if SSH plugin is correctly configured" - solution: - 
- "Check job config to confirm all SSH barrier relied task roles enabled SSH" - pattern: - runtimeContainerPatterns: - - exitCode: 10 - platformLogRegex: "SSH barrier reaches max retry count" \ No newline at end of file diff --git a/src/kube-runtime/test/sshbarrier_test_job.yaml b/src/kube-runtime/test/sshbarrier_test_job.yaml index c71a05ac2b..ce1fc4278c 100644 --- a/src/kube-runtime/test/sshbarrier_test_job.yaml +++ b/src/kube-runtime/test/sshbarrier_test_job.yaml @@ -24,12 +24,13 @@ taskRoles: minSucceededInstances: 1 dockerImage: horovod_official resourcePerInstance: - cpu: 4 + cpu: 8 memoryMB: 16384 - gpu: 1 + gpu: 2 commands: + - sleep 10 - > - horovodrun -np 8 -H master-0:4,worker-0:4 + horovodrun -np 4 -H master-0:2,worker-0:2 python pytorch_synthetic_benchmark.py --model <% $parameters.model %> --batch-size <% $parameters.batchsize %> @@ -37,9 +38,9 @@ taskRoles: instances: 1 dockerImage: horovod_official resourcePerInstance: - cpu: 4 + cpu: 8 memoryMB: 16384 - gpu: 1 + gpu: 2 commands: - sleep infinity