diff --git a/samples/core/xgboost_training_cm/README.md b/samples/core/xgboost_training_cm/README.md
index 3bd175f0f74..bad38e1334a 100644
--- a/samples/core/xgboost_training_cm/README.md
+++ b/samples/core/xgboost_training_cm/README.md
@@ -2,7 +2,7 @@
 The `xgboost_training_cm.py` pipeline creates XGBoost models on structured data in CSV format. Both classification and regression are supported.
 
-The pipeline starts by creating an Google DataProc cluster, and then running analysis, transformation, distributed training and
+The pipeline starts by creating a Google Dataproc cluster, and then running analysis, transformation, distributed training and
 prediction in the created cluster. Then a single node confusion-matrix and ROC aggregator is used (for classification case) to
 provide the confusion matrix data, and ROC data to the front end, respectively.
 
@@ -30,11 +30,18 @@ Open the Kubeflow pipelines UI. Create a new pipeline, and then upload the compi
 
 ## Run
 
-Most arguments come with default values. Only `output` and `project` need to be filled always.
-
-* `output` is a Google Storage path which holds
-pipeline run results. Note that each pipeline run will create a unique directory under `output` so it will not override previous results.
-* `project` is a GCP project.
+All arguments come with default values. This pipeline is preloaded as a demo pipeline in the Kubeflow Pipelines UI, so you can run it without any changes.
+
+## Modifying the pipeline
+For additional exploration, you may change some of the parameters or the pipeline inputs that are currently fixed in the pipeline definition.
+
+* `output` is a Google Storage path which holds pipeline run results.
+Note that each pipeline run will create a unique directory under `output` so it will not overwrite previous results.
+* `workers` is the number of worker nodes used for this training.
+* `rounds` is the number of XGBoost training iterations. Set the value to 200 to get a reasonably trained model.
+* `train_data` points to a CSV file that contains the training data. For a sample see 'gs://ml-pipeline-playground/sfpd/train.csv'.
+* `eval_data` points to a CSV file that contains the evaluation data. For a sample see 'gs://ml-pipeline-playground/sfpd/eval.csv'.
+* `schema` points to a schema file for the train and eval datasets. For a sample see 'gs://ml-pipeline-playground/sfpd/schema.json'.
 
 ## Components source
 
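A minimal usage sketch to complement the updated README (not part of this PR): submitting the sample with the documented parameters, assuming the KFP v1 SDK is installed, `xgboost_training_cm.py` is importable from the working directory, and a Kubeflow Pipelines endpoint is configured; `gs://your-bucket` is a placeholder.

```python
# Hypothetical submission sketch; assumes the KFP v1 SDK and a configured
# Kubeflow Pipelines endpoint. 'gs://your-bucket' is a placeholder bucket.
import kfp

# The pipeline function defined in samples/core/xgboost_training_cm/xgboost_training_cm.py.
from xgboost_training_cm import xgb_train_pipeline

client = kfp.Client()  # uses the in-cluster or locally configured KFP host
client.create_run_from_pipeline_func(
    xgb_train_pipeline,
    arguments={
        'output': 'gs://your-bucket',  # each run writes under a unique RUN_ID subdirectory
        'rounds': 200,                 # per the README, 200 yields a reasonably trained model
        'workers': 2,                  # number of Dataproc worker nodes
    },
)
```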
diff --git a/samples/core/xgboost_training_cm/xgboost_training_cm.py b/samples/core/xgboost_training_cm/xgboost_training_cm.py
index 665c36eacc3..8705c806f5b 100644
--- a/samples/core/xgboost_training_cm/xgboost_training_cm.py
+++ b/samples/core/xgboost_training_cm/xgboost_training_cm.py
@@ -22,7 +22,7 @@ import subprocess
 
 diagnose_me_op = components.load_component_from_url(
-    'https://raw.githubusercontent.com/kubeflow/pipelines/df450617af6e385da8c436628afafb1c76ca6c79/components/diagnostics/diagnose_me/component.yaml')
+    'https://raw.githubusercontent.com/kubeflow/pipelines/566dddfdfc0a6a725b6e50ea85e73d8d5578bbb9/components/diagnostics/diagnose_me/component.yaml')
 
 confusion_matrix_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/0ad0b368802eca8ca73b40fe08adb6d97af6a62f/components/local/confusion_matrix/component.yaml')
 
@@ -207,19 +207,20 @@ def dataproc_predict_op(
 def xgb_train_pipeline(
     output='gs://{{kfp-default-bucket}}',
     project='{{kfp-project-id}}',
-    cluster_name='xgb-%s' % dsl.RUN_ID_PLACEHOLDER,
-    region='us-central1',
-    train_data='gs://ml-pipeline-playground/sfpd/train.csv',
-    eval_data='gs://ml-pipeline-playground/sfpd/eval.csv',
-    schema='gs://ml-pipeline-playground/sfpd/schema.json',
-    target='resolution',
-    execution_mode='HALT_ON_ERROR',
-    required_apis='stackdriver.googleapis.com, storage-api.googleapis.com, bigquery.googleapis.com, dataflow.googleapis.com, dataproc.googleapis.com',
-    rounds=200,
-    workers=2,
-    true_label='ACTION',
+    diagnostic_mode='HALT_ON_ERROR',
+    rounds=5,
+    workers=1,
 ):
     output_template = str(output) + '/' + dsl.RUN_ID_PLACEHOLDER + '/data'
+    region = 'us-central1'
+    quota_check = [{'region': region, 'metric': 'CPUS', 'quota_needed': 1.0}]
+    train_data = 'gs://ml-pipeline-playground/sfpd/train.csv'
+    eval_data = 'gs://ml-pipeline-playground/sfpd/eval.csv'
+    schema = 'gs://ml-pipeline-playground/sfpd/schema.json'
+    true_label = 'ACTION'
+    target = 'resolution'
+    required_apis = 'storage-api.googleapis.com, dataproc.googleapis.com'
+    cluster_name = 'xgb-%s' % dsl.RUN_ID_PLACEHOLDER
 
     # Current GCP pyspark/spark op do not provide outputs as return values, instead,
     # we need to use strings to pass the uri around.
@@ -231,9 +232,10 @@ def xgb_train_pipeline(
 
     _diagnose_me_op = diagnose_me_op(
         bucket=output,
-        execution_mode=execution_mode,
+        execution_mode=diagnostic_mode,
         project_id=project,
-        target_apis=required_apis)
+        target_apis=required_apis,
+        quota_check=quota_check)
 
     with dsl.ExitHandler(exit_op=dataproc_delete_cluster_op(
         project_id=project,
diff --git a/test/sample-test/configs/xgboost_training_cm.config.yaml b/test/sample-test/configs/xgboost_training_cm.config.yaml
index 626a7a46a50..999777646eb 100644
--- a/test/sample-test/configs/xgboost_training_cm.config.yaml
+++ b/test/sample-test/configs/xgboost_training_cm.config.yaml
@@ -16,9 +16,7 @@ test_name: xgboost_training_cm
 arguments:
   output:
   project: ml-pipeline-test
-  train_data: gs://ml-pipeline-dataset/sample-test/sfpd/train_20.csv
-  eval_data: gs://ml-pipeline-dataset/sample-test/sfpd/eval_5.csv
-  schema: gs://ml-pipeline-dataset/sample-test/sfpd/schema.json
   rounds: 5
   workers: 2
+  diagnostic_mode: False
 test_timeout: 3600 # xgboost needs extra time, 60 * 60 secs
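Note that after this change `train_data`, `eval_data`, `schema`, and the other dataset settings are fixed inside the pipeline body rather than exposed as pipeline parameters, which is why the test config above no longer passes them. A minimal sketch of pointing the sample at your own dataset by editing the function body (the `gs://my-bucket/...` paths are hypothetical placeholders):

```python
# Inside xgb_train_pipeline in xgboost_training_cm.py, swap the hard-coded
# dataset locations for your own. The gs://my-bucket/... paths below are
# hypothetical placeholders, not paths this PR introduces.
train_data = 'gs://my-bucket/sfpd/train.csv'   # was 'gs://ml-pipeline-playground/sfpd/train.csv'
eval_data = 'gs://my-bucket/sfpd/eval.csv'     # was 'gs://ml-pipeline-playground/sfpd/eval.csv'
schema = 'gs://my-bucket/sfpd/schema.json'     # was 'gs://ml-pipeline-playground/sfpd/schema.json'
```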