From 5b3e7672c8b5403118b22611546fd17674f60dbb Mon Sep 17 00:00:00 2001 From: Andrey Velichkevich Date: Mon, 14 Feb 2022 20:31:58 +0000 Subject: [PATCH 1/4] Fix default label for Training Operators --- examples/v1beta1/kubeflow-training-operator/mxjob-byteps.yaml | 2 +- pkg/apis/controller/experiments/v1beta1/constants.go | 3 ++- pkg/ui/v1beta1/frontend/src/reducers/general.js | 4 ++-- test/e2e/v1beta1/argo_workflow.py | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/v1beta1/kubeflow-training-operator/mxjob-byteps.yaml b/examples/v1beta1/kubeflow-training-operator/mxjob-byteps.yaml index 962425e41a0..4f52a95c721 100644 --- a/examples/v1beta1/kubeflow-training-operator/mxjob-byteps.yaml +++ b/examples/v1beta1/kubeflow-training-operator/mxjob-byteps.yaml @@ -23,7 +23,7 @@ spec: primaryContainerName: mxnet # In this example we can collect metrics only from the Worker pods. primaryPodLabels: - replica-type: worker + training.kubeflow.org/replica-type: worker trialParameters: - name: learningRate description: Learning rate for the training model diff --git a/pkg/apis/controller/experiments/v1beta1/constants.go b/pkg/apis/controller/experiments/v1beta1/constants.go index 992f3e181cb..5111cf2788a 100644 --- a/pkg/apis/controller/experiments/v1beta1/constants.go +++ b/pkg/apis/controller/experiments/v1beta1/constants.go @@ -38,7 +38,7 @@ const ( var ( // DefaultKubeflowJobPrimaryPodLabels is the default value of spec.trialTemplate.primaryPodLabels for Kubeflow Training Job. - DefaultKubeflowJobPrimaryPodLabels = map[string]string{"job-role": "master"} + DefaultKubeflowJobPrimaryPodLabels = map[string]string{"training.kubeflow.org/job-role": "master"} // KubeflowJobKinds is the list of Kubeflow Training Job kinds. KubeflowJobKinds = map[string]bool{ @@ -46,5 +46,6 @@ var ( "PyTorchJob": true, "XGBoostJob": true, "MXJob": true, + "MPIJob": true, } ) diff --git a/pkg/ui/v1beta1/frontend/src/reducers/general.js b/pkg/ui/v1beta1/frontend/src/reducers/general.js index 7fafd67bee5..e967f237df0 100644 --- a/pkg/ui/v1beta1/frontend/src/reducers/general.js +++ b/pkg/ui/v1beta1/frontend/src/reducers/general.js @@ -50,14 +50,14 @@ const initialState = { value: 'status.conditions.#(type=="Complete")#|#(status=="True")#', description: `Condition when Trial custom resource is succeeded. Default value for k8s BatchJob: status.conditions.#(type=="Complete")#|#(status=="True")#. - Default value for Kubeflow Job (TFJob, PyTorchJob, XGBoostJob, MXJob): status.conditions.#(type=="Succeeded")#|#(status=="True")#.`, + Default value for Kubeflow Job (TFJob, PyTorchJob, XGBoostJob, MXJob, MPIJob): status.conditions.#(type=="Succeeded")#|#(status=="True")#.`, }, { name: 'FailureCondition', value: 'status.conditions.#(type=="Failed")#|#(status=="True")#', description: `Condition when Trial custom resource is failed. Default value for k8s BatchJob: status.conditions.#(type=="Failed")#|#(status=="True")#. - Default value for Kubeflow Job (TFJob, PyTorchJob, XGBoostJob, MXJob): status.conditions.#(type=="Failed")#|#(status=="True")#.`, + Default value for Kubeflow Job (TFJob, PyTorchJob, XGBoostJob, MXJob, MPIJob): status.conditions.#(type=="Failed")#|#(status=="True")#.`, }, { name: 'Retain', diff --git a/test/e2e/v1beta1/argo_workflow.py b/test/e2e/v1beta1/argo_workflow.py index 504091928c3..01d7e6124fa 100644 --- a/test/e2e/v1beta1/argo_workflow.py +++ b/test/e2e/v1beta1/argo_workflow.py @@ -167,7 +167,7 @@ def create_task_template(self, task_name, exec_image, command): }, { "name": "EXTRA_REPOS", - "value": "kubeflow/testing@HEAD;kubeflow/manifests@v1.4-branch" + "value": "kubeflow/testing@HEAD;kubeflow/manifests@v1.5-branch" }, # Set GOPATH to test_dir because Katib repo is located under /src/github.com/kubeflow/katib { From 6339e48f01847372132477d123d6fe48dc72f164 Mon Sep 17 00:00:00 2001 From: Andrey Velichkevich Date: Mon, 14 Feb 2022 22:18:11 +0000 Subject: [PATCH 2/4] Fix version comment --- test/e2e/v1beta1/scripts/setup-katib.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/e2e/v1beta1/scripts/setup-katib.sh b/test/e2e/v1beta1/scripts/setup-katib.sh index 8d55959ffaa..99a1704796c 100755 --- a/test/e2e/v1beta1/scripts/setup-katib.sh +++ b/test/e2e/v1beta1/scripts/setup-katib.sh @@ -41,8 +41,8 @@ cat "manifests/v1beta1/components/controller/katib-config.yaml" echo "Creating Kubeflow namespace" kubectl create namespace kubeflow -echo "Deploying training-operator from kubeflow/manifests v1.4 branch" cd "${MANIFESTS_DIR}/apps/training-operator/upstream/overlays/kubeflow" +echo "Deploying Training Operator from kubeflow/manifests $(git branch --show-current)" kustomize build . | kubectl apply -f - echo "Deploying Katib" From bd06b24c0a7f7a58616753072f283642cec6c023 Mon Sep 17 00:00:00 2001 From: Andrey Velichkevich Date: Mon, 14 Feb 2022 23:15:32 +0000 Subject: [PATCH 3/4] Change the docs --- docs/proposals/metrics-collector.md | 2 +- docs/proposals/trial-custom-crd.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/proposals/metrics-collector.md b/docs/proposals/metrics-collector.md index 82ee716af02..4a780c621c3 100644 --- a/docs/proposals/metrics-collector.md +++ b/docs/proposals/metrics-collector.md @@ -123,7 +123,7 @@ In the namespace with `katib.kubeflow.org/metrics-collector-injection=enabled` l In **Pod Level Injecting**, -1. Job operators (_e.x. TFjob/PyTorchjob_) tag the `job-role: master` ([#1064](https://github.com/kubeflow/tf-operator/pull/1064)) label on the master pod. +1. Job operators (_e.x. TFjob/PyTorchjob_) tag the `training.kubeflow.org/job-role: master` ([#1064](https://github.com/kubeflow/tf-operator/pull/1064)) label on the master pod. 2. The webhook inject the metric collector only if the webhook recognizes this label. 3. The webhook uses [ObjectSelector](https://github.com/kubernetes/kubernetes/pull/78505) to skip on irrelevant objects in order to optimize the performance. 4. ObjectSelector is only supported above _Kubernetes v1.15_. Without this new feature, there may be a [performance issue](https://github.com/kubeflow/katib/issues/685#issuecomment-516226070) in webhook. In this situation, the following **Job Level Injecting** mode may be a better option. diff --git a/docs/proposals/trial-custom-crd.md b/docs/proposals/trial-custom-crd.md index e3e5082a15d..a939f77011a 100644 --- a/docs/proposals/trial-custom-crd.md +++ b/docs/proposals/trial-custom-crd.md @@ -124,7 +124,7 @@ For example, for TFJob: ```yaml . . . PrimaryPodLabel: - "job-role": "master" + "training.kubeflow.org/job-role": "master" . . . ``` From 1517c4abcbf398365370ba42be39ab6cd80b0704 Mon Sep 17 00:00:00 2001 From: Andrey Velichkevich Date: Mon, 14 Feb 2022 23:18:58 +0000 Subject: [PATCH 4/4] Change git command --- test/e2e/v1beta1/scripts/setup-katib.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/e2e/v1beta1/scripts/setup-katib.sh b/test/e2e/v1beta1/scripts/setup-katib.sh index 99a1704796c..4d2f9314189 100755 --- a/test/e2e/v1beta1/scripts/setup-katib.sh +++ b/test/e2e/v1beta1/scripts/setup-katib.sh @@ -42,7 +42,7 @@ echo "Creating Kubeflow namespace" kubectl create namespace kubeflow cd "${MANIFESTS_DIR}/apps/training-operator/upstream/overlays/kubeflow" -echo "Deploying Training Operator from kubeflow/manifests $(git branch --show-current)" +echo "Deploying Training Operator from kubeflow/manifests $(git rev-parse --abbrev-ref HEAD)" kustomize build . | kubectl apply -f - echo "Deploying Katib"