diff --git a/cmd/experimental/kjobctl/docs/commands/kjobctl_create/kjobctl_create_slurm.md b/cmd/experimental/kjobctl/docs/commands/kjobctl_create/kjobctl_create_slurm.md index cf79a1cc77..0dba3309b4 100644 --- a/cmd/experimental/kjobctl/docs/commands/kjobctl_create/kjobctl_create_slurm.md +++ b/cmd/experimental/kjobctl/docs/commands/kjobctl_create/kjobctl_create_slurm.md @@ -71,7 +71,7 @@ kjobctl create slurm --profile APPLICATION_PROFILE_NAME [--localqueue LOCAL_QUEU - --init-image string     Default: "bash:5-alpine3.20" + --init-image string     Default: "registry.k8s.io/busybox:1.27.2" diff --git a/cmd/experimental/kjobctl/pkg/builder/slurm_builder.go b/cmd/experimental/kjobctl/pkg/builder/slurm_builder.go index 6b674a1674..bf0d4ee61b 100644 --- a/cmd/experimental/kjobctl/pkg/builder/slurm_builder.go +++ b/cmd/experimental/kjobctl/pkg/builder/slurm_builder.go @@ -24,7 +24,6 @@ import ( "fmt" "math" "os" - "slices" "strconv" "strings" "text/template" @@ -282,7 +281,7 @@ func (b *slurmBuilder) build(ctx context.Context) (runtime.Object, []runtime.Obj job.Spec.Template.Spec.InitContainers = append(job.Spec.Template.Spec.InitContainers, corev1.Container{ Name: "slurm-init-env", Image: b.initImage, - Command: []string{"bash", slurmInitEntrypointFilenamePath}, + Command: []string{"sh", slurmInitEntrypointFilenamePath}, VolumeMounts: []corev1.VolumeMount{ { Name: "slurm-scripts", @@ -485,22 +484,30 @@ func (b *slurmBuilder) build(ctx context.Context) (runtime.Object, []runtime.Obj return job, []runtime.Object{configMap, service}, nil } -func (b *slurmBuilder) buildIndexesMap() map[int32][]int32 { - indexMap := make(map[int32][]int32) +func (b *slurmBuilder) buildArrayIndexes() string { nTasks := ptr.Deref(b.nTasks, 1) + length := int64(math.Ceil(float64(len(b.arrayIndexes.Indexes)) / float64(nTasks))) + containerIndexes := make([][]string, length) + var ( completionIndex int32 containerIndex int32 ) for _, index := range b.arrayIndexes.Indexes { - indexMap[completionIndex] = append(indexMap[completionIndex], index) + containerIndexes[completionIndex] = append(containerIndexes[completionIndex], fmt.Sprint(index)) containerIndex++ if containerIndex >= nTasks { containerIndex = 0 completionIndex++ } } - return indexMap + + completionIndexes := make([]string, length) + for completionIndex, containerIndexes := range containerIndexes { + completionIndexes[completionIndex] = strings.Join(containerIndexes, ",") + } + + return strings.Join(completionIndexes, ";") } type slurmInitEntrypointScript struct { @@ -546,18 +553,6 @@ func (b *slurmBuilder) buildInitEntrypointScript() (string, error) { nTasks := ptr.Deref(b.nTasks, 1) nodes := ptr.Deref(b.nodes, 1) - indexesMap := b.buildIndexesMap() - keyValues := make([]string, 0, len(indexesMap)) - for key, value := range indexesMap { - strIndexes := make([]string, 0, len(value)) - for _, index := range value { - strIndexes = append(strIndexes, fmt.Sprintf("%d", index)) - } - keyValues = append(keyValues, fmt.Sprintf(`["%d"]="%s"`, key, strings.Join(strIndexes, ","))) - } - - slices.Sort(keyValues) - var gpusPerTask, memPerCPU, memPerGPU string if b.gpusPerTask != nil { gpus := make([]string, 0) @@ -579,7 +574,7 @@ func (b *slurmBuilder) buildInitEntrypointScript() (string, error) { } scriptValues := slurmInitEntrypointScript{ - ArrayIndexes: strings.Join(keyValues, " "), + ArrayIndexes: b.buildArrayIndexes(), EnvsPath: slurmEnvsPath, SbatchEnvFilename: slurmSbatchEnvFilename, diff --git a/cmd/experimental/kjobctl/pkg/builder/slurm_builder_test.go b/cmd/experimental/kjobctl/pkg/builder/slurm_builder_test.go index da2e400bcb..3193443987 100644 --- a/cmd/experimental/kjobctl/pkg/builder/slurm_builder_test.go +++ b/cmd/experimental/kjobctl/pkg/builder/slurm_builder_test.go @@ -238,7 +238,7 @@ func TestSlurmBuilderDo(t *testing.T) { Mode(v1alpha1.SlurmMode). Subdomain("profile-slurm"). WithInitContainer(*wrappers.MakeContainer("slurm-init-env", "bash:latest"). - Command("bash", "/slurm/scripts/init-entrypoint.sh"). + Command("sh", "/slurm/scripts/init-entrypoint.sh"). WithVolumeMount(corev1.VolumeMount{Name: "slurm-scripts", MountPath: "/slurm/scripts"}). WithVolumeMount(corev1.VolumeMount{Name: "slurm-env", MountPath: "/slurm/env"}). Obj()). @@ -282,7 +282,7 @@ func TestSlurmBuilderDo(t *testing.T) { Mode(v1alpha1.SlurmMode). Data(map[string]string{ "script": "#!/bin/bash\nsleep 300'", - "init-entrypoint.sh": `#!/usr/local/bin/bash + "init-entrypoint.sh": `#!/bin/sh set -o errexit set -o nounset @@ -292,16 +292,14 @@ set -x # External variables # JOB_COMPLETION_INDEX - completion index of the job. -for i in {0..1} -do - # ["COMPLETION_INDEX"]="CONTAINER_INDEX_1,CONTAINER_INDEX_2" - declare -A array_indexes=(["0"]="1" ["1"]="2" ["2"]="3" ["3"]="4" ["4"]="5") # Requires bash v4+ +array_indexes="1;2;3;4;5" +container_indexes=$(echo "$array_indexes" | awk -F';' -v idx="$JOB_COMPLETION_INDEX" '{print $((idx + 1))}') - container_indexes=${array_indexes[${JOB_COMPLETION_INDEX}]} - container_indexes=(${container_indexes//,/ }) +for i in $(seq 0 1) +do + container_index=$(echo "$container_indexes" | awk -F',' -v idx="$i" '{print $((idx + 1))}') - if [[ ! -v container_indexes[$i] ]]; - then + if [ -z "$container_index" ]; then break fi @@ -341,9 +339,9 @@ SLURM_SUBMIT_DIR=/slurm/scripts SLURM_SUBMIT_HOST=$HOSTNAME SLURM_JOB_NODELIST=profile-slurm-0.profile-slurm,profile-slurm-1.profile-slurm SLURM_JOB_FIRST_NODE=profile-slurm-0.profile-slurm -SLURM_JOB_ID=$(( JOB_COMPLETION_INDEX * 1 + i + 1 )) -SLURM_JOBID=$(( JOB_COMPLETION_INDEX * 1 + i + 1 )) -SLURM_ARRAY_TASK_ID=${container_indexes[$i]} +SLURM_JOB_ID=$(expr $JOB_COMPLETION_INDEX \* 1 + $i + 1) +SLURM_JOBID=$(expr $JOB_COMPLETION_INDEX \* 1 + $i + 1) +SLURM_ARRAY_TASK_ID=$container_index EOF done diff --git a/cmd/experimental/kjobctl/pkg/builder/templates/slurm_init_entrypoint_script.sh.tmpl b/cmd/experimental/kjobctl/pkg/builder/templates/slurm_init_entrypoint_script.sh.tmpl index d5d747288a..5f1e02942b 100644 --- a/cmd/experimental/kjobctl/pkg/builder/templates/slurm_init_entrypoint_script.sh.tmpl +++ b/cmd/experimental/kjobctl/pkg/builder/templates/slurm_init_entrypoint_script.sh.tmpl @@ -1,4 +1,4 @@ -#!/usr/local/bin/bash +#!/bin/sh set -o errexit set -o nounset @@ -8,16 +8,14 @@ set -x # External variables # JOB_COMPLETION_INDEX - completion index of the job. -for i in {0..{{.SlurmNTasksPerNode}}} -do - # ["COMPLETION_INDEX"]="CONTAINER_INDEX_1,CONTAINER_INDEX_2" - declare -A array_indexes=({{.ArrayIndexes}}) # Requires bash v4+ +array_indexes="{{.ArrayIndexes}}" +container_indexes=$(echo "$array_indexes" | awk -F';' -v idx="$JOB_COMPLETION_INDEX" '{print $((idx + 1))}') - container_indexes=${array_indexes[${JOB_COMPLETION_INDEX}]} - container_indexes=(${container_indexes//,/ }) +for i in $(seq 0 {{.SlurmNTasksPerNode}}) +do + container_index=$(echo "$container_indexes" | awk -F',' -v idx="$i" '{print $((idx + 1))}') - if [[ ! -v container_indexes[$i] ]]; - then + if [ -z "$container_index" ]; then break fi @@ -57,9 +55,9 @@ SLURM_SUBMIT_DIR={{.SlurmSubmitDir}} SLURM_SUBMIT_HOST=$HOSTNAME SLURM_JOB_NODELIST={{.SlurmJobNodeList}} SLURM_JOB_FIRST_NODE={{.SlurmJobFirstNode}} -SLURM_JOB_ID=$(( JOB_COMPLETION_INDEX * {{.SlurmNTasksPerNode}} + i + {{.SlurmArrayJobID}} )) -SLURM_JOBID=$(( JOB_COMPLETION_INDEX * {{.SlurmNTasksPerNode}} + i + {{.SlurmArrayJobID}} )) -SLURM_ARRAY_TASK_ID=${container_indexes[$i]} +SLURM_JOB_ID=$(expr $JOB_COMPLETION_INDEX \* {{.SlurmNTasksPerNode}} + $i + 1) +SLURM_JOBID=$(expr $JOB_COMPLETION_INDEX \* {{.SlurmNTasksPerNode}} + $i + 1) +SLURM_ARRAY_TASK_ID=$container_index EOF done diff --git a/cmd/experimental/kjobctl/pkg/cmd/create/create.go b/cmd/experimental/kjobctl/pkg/cmd/create/create.go index ccec998d2f..cd49c8a44a 100644 --- a/cmd/experimental/kjobctl/pkg/cmd/create/create.go +++ b/cmd/experimental/kjobctl/pkg/cmd/create/create.go @@ -351,7 +351,7 @@ var createModeSubcommands = map[string]modeSubcommand{ subcmd.Flags().BoolVar(&o.IgnoreUnknown, ignoreUnknownFlagName, false, "Ignore all the unsupported flags in the bash script.") - subcmd.Flags().StringVar(&o.InitImage, initImageFlagName, "bash:5-alpine3.20", + subcmd.Flags().StringVar(&o.InitImage, initImageFlagName, "registry.k8s.io/busybox:1.27.2", "The image used for the init container.") subcmd.Flags().BoolVar(&o.SkipPriorityValidation, skipPriorityValidationFlagName, false, "Skip workload priority class validation. Add priority class label even if the class does not exist.") diff --git a/cmd/experimental/kjobctl/pkg/cmd/create/create_test.go b/cmd/experimental/kjobctl/pkg/cmd/create/create_test.go index 7aa1b47b76..0058e561fa 100644 --- a/cmd/experimental/kjobctl/pkg/cmd/create/create_test.go +++ b/cmd/experimental/kjobctl/pkg/cmd/create/create_test.go @@ -724,8 +724,8 @@ func TestCreateCmd(t *testing.T) { Profile("profile"). Mode(v1alpha1.SlurmMode). Subdomain("profile-slurm"). - WithInitContainer(*wrappers.MakeContainer("slurm-init-env", "bash:5-alpine3.20"). - Command("bash", "/slurm/scripts/init-entrypoint.sh"). + WithInitContainer(*wrappers.MakeContainer("slurm-init-env", "registry.k8s.io/busybox:1.27.2"). + Command("sh", "/slurm/scripts/init-entrypoint.sh"). WithVolumeMount(corev1.VolumeMount{Name: "slurm-scripts", MountPath: "/slurm/scripts"}). WithVolumeMount(corev1.VolumeMount{Name: "slurm-env", MountPath: "/slurm/env"}). Obj()). @@ -778,7 +778,7 @@ func TestCreateCmd(t *testing.T) { Mode(v1alpha1.SlurmMode). Data(map[string]string{ "script": "#!/bin/bash\nsleep 300'", - "init-entrypoint.sh": `#!/usr/local/bin/bash + "init-entrypoint.sh": `#!/bin/sh set -o errexit set -o nounset @@ -788,16 +788,14 @@ set -x # External variables # JOB_COMPLETION_INDEX - completion index of the job. -for i in {0..1} -do - # ["COMPLETION_INDEX"]="CONTAINER_INDEX_1,CONTAINER_INDEX_2" - declare -A array_indexes=(["0"]="0") # Requires bash v4+ +array_indexes="0" +container_indexes=$(echo "$array_indexes" | awk -F';' -v idx="$JOB_COMPLETION_INDEX" '{print $((idx + 1))}') - container_indexes=${array_indexes[${JOB_COMPLETION_INDEX}]} - container_indexes=(${container_indexes//,/ }) +for i in $(seq 0 1) +do + container_index=$(echo "$container_indexes" | awk -F',' -v idx="$i" '{print $((idx + 1))}') - if [[ ! -v container_indexes[$i] ]]; - then + if [ -z "$container_index" ]; then break fi @@ -837,9 +835,9 @@ SLURM_SUBMIT_DIR=/slurm/scripts SLURM_SUBMIT_HOST=$HOSTNAME SLURM_JOB_NODELIST=profile-slurm-0.profile-slurm SLURM_JOB_FIRST_NODE=profile-slurm-0.profile-slurm -SLURM_JOB_ID=$(( JOB_COMPLETION_INDEX * 1 + i + 1 )) -SLURM_JOBID=$(( JOB_COMPLETION_INDEX * 1 + i + 1 )) -SLURM_ARRAY_TASK_ID=${container_indexes[$i]} +SLURM_JOB_ID=$(expr $JOB_COMPLETION_INDEX \* 1 + $i + 1) +SLURM_JOBID=$(expr $JOB_COMPLETION_INDEX \* 1 + $i + 1) +SLURM_ARRAY_TASK_ID=$container_index EOF done @@ -982,7 +980,7 @@ error_path=$(unmask_filename "$SBATCH_ERROR") LocalQueue("lq1"). Subdomain("profile-slurm"). WithInitContainer(*wrappers.MakeContainer("slurm-init-env", "bash:latest"). - Command("bash", "/slurm/scripts/init-entrypoint.sh"). + Command("sh", "/slurm/scripts/init-entrypoint.sh"). WithVolumeMount(corev1.VolumeMount{Name: "slurm-scripts", MountPath: "/slurm/scripts"}). WithVolumeMount(corev1.VolumeMount{Name: "slurm-env", MountPath: "/slurm/env"}). Obj()). @@ -1055,7 +1053,7 @@ error_path=$(unmask_filename "$SBATCH_ERROR") LocalQueue("lq1"). Data(map[string]string{ "script": "#!/bin/bash\nsleep 300'", - "init-entrypoint.sh": `#!/usr/local/bin/bash + "init-entrypoint.sh": `#!/bin/sh set -o errexit set -o nounset @@ -1065,16 +1063,14 @@ set -x # External variables # JOB_COMPLETION_INDEX - completion index of the job. -for i in {0..3} -do - # ["COMPLETION_INDEX"]="CONTAINER_INDEX_1,CONTAINER_INDEX_2" - declare -A array_indexes=(["0"]="0,1,2" ["1"]="3,4,5" ["2"]="6,7,8" ["3"]="9,10,11" ["4"]="12,13,14" ["5"]="15,16,17" ["6"]="18,19,20" ["7"]="21,22,23" ["8"]="24,25") # Requires bash v4+ +array_indexes="0,1,2;3,4,5;6,7,8;9,10,11;12,13,14;15,16,17;18,19,20;21,22,23;24,25" +container_indexes=$(echo "$array_indexes" | awk -F';' -v idx="$JOB_COMPLETION_INDEX" '{print $((idx + 1))}') - container_indexes=${array_indexes[${JOB_COMPLETION_INDEX}]} - container_indexes=(${container_indexes//,/ }) +for i in $(seq 0 3) +do + container_index=$(echo "$container_indexes" | awk -F',' -v idx="$i" '{print $((idx + 1))}') - if [[ ! -v container_indexes[$i] ]]; - then + if [ -z "$container_index" ]; then break fi @@ -1114,9 +1110,9 @@ SLURM_SUBMIT_DIR=/slurm/scripts SLURM_SUBMIT_HOST=$HOSTNAME SLURM_JOB_NODELIST=profile-slurm-fpxnj-0.profile-slurm-fpxnj,profile-slurm-fpxnj-1.profile-slurm-fpxnj SLURM_JOB_FIRST_NODE=profile-slurm-fpxnj-0.profile-slurm-fpxnj -SLURM_JOB_ID=$(( JOB_COMPLETION_INDEX * 3 + i + 1 )) -SLURM_JOBID=$(( JOB_COMPLETION_INDEX * 3 + i + 1 )) -SLURM_ARRAY_TASK_ID=${container_indexes[$i]} +SLURM_JOB_ID=$(expr $JOB_COMPLETION_INDEX \* 3 + $i + 1) +SLURM_JOBID=$(expr $JOB_COMPLETION_INDEX \* 3 + $i + 1) +SLURM_ARRAY_TASK_ID=$container_index EOF done