From 287e868023f315b00171e29e006d306fd8584eb1 Mon Sep 17 00:00:00 2001
From: Andrey Velichkevich
Date: Tue, 3 Aug 2021 00:45:11 +0100
Subject: [PATCH] Add support for XGBoost Operator with LightGBM example (#1603)

* Add support for XGBoost Operator

* Specify Tag for LightGBM image
---
 README.md                                   |   2 +
 examples/v1beta1/README.md                  |   6 +
 examples/v1beta1/xgboost-lightgbm.yaml      | 123 ++++++++++++++++++
 .../components/controller/controller.yaml   |   2 +
 .../v1beta1/components/controller/rbac.yaml |   7 +
 5 files changed, 140 insertions(+)
 create mode 100644 examples/v1beta1/xgboost-lightgbm.yaml

diff --git a/README.md b/README.md
index 6c3d1696e28..a5291fcb6d9 100644
--- a/README.md
+++ b/README.md
@@ -125,6 +125,8 @@ Katib has these CRD examples in upstream:
 
 - [Kubeflow `MPIJob`](https://www.kubeflow.org/docs/components/training/mpi/)
 
+- [Kubeflow `XGBoostJob`](https://github.com/kubeflow/xgboost-operator)
+
 - [Tekton `Pipeline`](https://github.com/tektoncd/pipeline)
 
 Thus, Katib supports multiple frameworks with the help of different job kinds.
diff --git a/examples/v1beta1/README.md b/examples/v1beta1/README.md
index c8c2f7e1143..a472428a296 100644
--- a/examples/v1beta1/README.md
+++ b/examples/v1beta1/README.md
@@ -391,3 +391,9 @@ docker.io/inaccel/jupyter:lab
 ```
 docker.io/kubeflow/mpi-horovod-mnist
 ```
+
+- XGBoost operator LightGBM dist example, [source](https://github.com/kubeflow/xgboost-operator/tree/master/config/samples/lightgbm-dist).
+
+```
+docker.io/kubeflowkatib/xgboost-lightgbm
+```
diff --git a/examples/v1beta1/xgboost-lightgbm.yaml b/examples/v1beta1/xgboost-lightgbm.yaml
new file mode 100644
index 00000000000..762f6275242
--- /dev/null
+++ b/examples/v1beta1/xgboost-lightgbm.yaml
@@ -0,0 +1,123 @@
+apiVersion: kubeflow.org/v1beta1
+kind: Experiment
+metadata:
+  namespace: kubeflow
+  name: xgboost-lightgbm
+spec:
+  objective:
+    type: maximize
+    goal: 0.99
+    objectiveMetricName: valid_1 auc
+    additionalMetricNames:
+      - valid_1 binary_logloss
+      - training auc
+      - training binary_logloss
+  metricsCollectorSpec:
+    source:
+      filter:
+        metricsFormat:
+          - "(\\w+\\s\\w+)\\s:\\s((-?\\d+)(\\.\\d+)?)"
+  algorithm:
+    algorithmName: random
+  parallelTrialCount: 2
+  maxTrialCount: 6
+  maxFailedTrialCount: 3
+  parameters:
+    - name: lr
+      parameterType: double
+      feasibleSpace:
+        min: "0.01"
+        max: "0.1"
+    - name: num-leaves
+      parameterType: int
+      feasibleSpace:
+        min: "50"
+        max: "60"
+        step: "1"
+  trialTemplate:
+    primaryPodLabels:
+      job-role: master
+    primaryContainerName: xgboostjob
+    successCondition: status.conditions.#(type=="Succeeded")#|#(status=="True")#
+    failureCondition: status.conditions.#(type=="Failed")#|#(status=="True")#
+    trialParameters:
+      - name: learningRate
+        description: Learning rate for the training model
+        reference: lr
+      - name: numberLeaves
+        description: Number of leaves for one tree
+        reference: num-leaves
+    trialSpec:
+      # TODO (andreyvelich): Change to kubeflow.org/v1 once all-in-one operator is finished.
+      apiVersion: xgboostjob.kubeflow.org/v1
+      kind: XGBoostJob
+      spec:
+        xgbReplicaSpecs:
+          Master:
+            replicas: 1
+            restartPolicy: Never
+            template:
+              spec:
+                containers:
+                  - name: xgboostjob
+                    image: docker.io/kubeflowkatib/xgboost-lightgbm:1.0
+                    ports:
+                      - containerPort: 9991
+                        name: xgboostjob-port
+                    imagePullPolicy: Always
+                    args:
+                      - --job_type=Train
+                      - --metric=binary_logloss,auc
+                      - --learning_rate=${trialParameters.learningRate}
+                      - --num_leaves=${trialParameters.numberLeaves}
+                      - --num_trees=100
+                      - --boosting_type=gbdt
+                      - --objective=binary
+                      - --metric_freq=1
+                      - --is_training_metric=true
+                      - --max_bin=255
+                      - --data=data/binary.train
+                      - --valid_data=data/binary.test
+                      - --tree_learner=feature
+                      - --feature_fraction=0.8
+                      - --bagging_freq=5
+                      - --bagging_fraction=0.8
+                      - --min_data_in_leaf=50
+                      - --min_sum_hessian_in_leaf=50
+                      - --is_enable_sparse=true
+                      - --use_two_round_loading=false
+                      - --is_save_binary_file=false
+          Worker:
+            replicas: 2
+            restartPolicy: ExitCode
+            template:
+              spec:
+                containers:
+                  - name: xgboostjob
+                    image: docker.io/kubeflowkatib/xgboost-lightgbm:1.0
+                    ports:
+                      - containerPort: 9991
+                        name: xgboostjob-port
+                    imagePullPolicy: Always
+                    args:
+                      - --job_type=Train
+                      - --metric=binary_logloss,auc
+                      - --learning_rate=${trialParameters.learningRate}
+                      - --num_leaves=${trialParameters.numberLeaves}
+                      - --num_trees=100
+                      - --boosting_type=gbdt
+                      - --objective=binary
+                      - --metric_freq=1
+                      - --is_training_metric=true
+                      - --max_bin=255
+                      - --data=data/binary.train
+                      - --valid_data=data/binary.test
+                      - --tree_learner=feature
+                      - --feature_fraction=0.8
+                      - --bagging_freq=5
+                      - --bagging_fraction=0.8
+                      - --min_data_in_leaf=50
+                      - --min_sum_hessian_in_leaf=50
+                      - --is_enable_sparse=true
+                      - --use_two_round_loading=false
+                      - --is_save_binary_file=false
diff --git a/manifests/v1beta1/components/controller/controller.yaml b/manifests/v1beta1/components/controller/controller.yaml
index 7d88dfd018c..23738828865 100644
--- a/manifests/v1beta1/components/controller/controller.yaml
+++ b/manifests/v1beta1/components/controller/controller.yaml
@@ -30,6 +30,8 @@ spec:
             - "--trial-resources=TFJob.v1.kubeflow.org"
             - "--trial-resources=PyTorchJob.v1.kubeflow.org"
             - "--trial-resources=MPIJob.v1.kubeflow.org"
+            # TODO (andreyvelich): Change to v1.kubeflow.org once all-in-one operator is finished.
+            - "--trial-resources=XGBoostJob.v1.xgboostjob.kubeflow.org"
             - "--trial-resources=PipelineRun.v1beta1.tekton.dev"
           ports:
             - containerPort: 8443
diff --git a/manifests/v1beta1/components/controller/rbac.yaml b/manifests/v1beta1/components/controller/rbac.yaml
index 3d760395b7f..73d1ba3b05d 100644
--- a/manifests/v1beta1/components/controller/rbac.yaml
+++ b/manifests/v1beta1/components/controller/rbac.yaml
@@ -55,6 +55,13 @@ rules:
       - mpijobs
     verbs:
       - "*"
+  # TODO (andreyvelich): Move to "apiGroup: kubeflow.org" once all-in-one operator is finished.
+  - apiGroups:
+      - xgboostjob.kubeflow.org
+    resources:
+      - xgboostjobs
+    verbs:
+      - "*"
   - apiGroups:
       - tekton.dev
     resources: