From 8bca4adf51ceea72bbf2bb2107c457758ead18e6 Mon Sep 17 00:00:00 2001
From: Ning Gao
Date: Thu, 20 Dec 2018 17:47:55 -0800
Subject: [PATCH 1/7] add another sample test that runs the current sample
 code instead of newly built component images

---
 test/sample-test/run_test.sh |  69 +++++++------
 test/sample_test.yaml        | 189 +++++++++++++++++++++++++++++++++++
 2 files changed, 230 insertions(+), 28 deletions(-)
 create mode 100644 test/sample_test.yaml
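Note: the guards added below rewrite the released gcr.io component image
references in each sample only when an override variable is supplied; when the
variable is empty, the samples compile against the images they already
reference, which is the point of this new test. A minimal sketch of the
substitution pattern, with a hypothetical IMAGE_OVERRIDE variable and
sample.py path used purely for illustration:

    #!/bin/bash
    # Empty override means "keep the image the sample already references".
    IMAGE_OVERRIDE=""

    if [ -n "${IMAGE_OVERRIDE}" ]; then
      # \([a-zA-Z0-9_.-]\)\+ matches the current tag, so the whole
      # name:tag reference is swapped for the override, in place.
      sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-dataflow-tft:\([a-zA-Z0-9_.-]\)\+|${IMAGE_OVERRIDE}|g" sample.py
    fi

(The quoted "${IMAGE_OVERRIDE}" in the test above is deliberate; patch 4 below
fixes the unquoted form that this patch first introduced.)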
diff --git a/test/sample-test/run_test.sh b/test/sample-test/run_test.sh
index 29471e97b4b..d28c49238ea 100755
--- a/test/sample-test/run_test.sh
+++ b/test/sample-test/run_test.sh
@@ -146,10 +146,12 @@ if [ "$TEST_NAME" == 'tf-training' ]; then
 
   # Compile samples
   cd ${BASE_DIR}/samples/kubeflow-tf
-  sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-dataflow-tft:\([a-zA-Z0-9_.-]\)\+|${DATAFLOW_TFT_IMAGE}|g" kubeflow-training-classification.py
-  sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-kubeflow-tf-trainer:\([a-zA-Z0-9_.-]\)\+|${KUBEFLOW_DNNTRAINER_IMAGE}|g" kubeflow-training-classification.py
-  sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-dataflow-tf-predict:\([a-zA-Z0-9_.-]\)\+|${DATAFLOW_PREDICT_IMAGE}|g" kubeflow-training-classification.py
-  sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-local-confusion-matrix:\([a-zA-Z0-9_.-]\)\+|${LOCAL_CONFUSIONMATRIX_IMAGE}|g" kubeflow-training-classification.py
+  if [ -n ${DATAFLOW_TFT_IMAGE} ];then
+    sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-dataflow-tft:\([a-zA-Z0-9_.-]\)\+|${DATAFLOW_TFT_IMAGE}|g" kubeflow-training-classification.py
+    sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-kubeflow-tf-trainer:\([a-zA-Z0-9_.-]\)\+|${KUBEFLOW_DNNTRAINER_IMAGE}|g" kubeflow-training-classification.py
+    sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-dataflow-tf-predict:\([a-zA-Z0-9_.-]\)\+|${DATAFLOW_PREDICT_IMAGE}|g" kubeflow-training-classification.py
+    sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-local-confusion-matrix:\([a-zA-Z0-9_.-]\)\+|${LOCAL_CONFUSIONMATRIX_IMAGE}|g" kubeflow-training-classification.py
+  fi
 
   dsl-compile --py kubeflow-training-classification.py --output kubeflow-training-classification.tar.gz
 
@@ -165,14 +167,16 @@ elif [ "$TEST_NAME" == "tfx" ]; then
 
   # Compile samples
   cd ${BASE_DIR}/samples/tfx
-  sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-dataflow-tft:\([a-zA-Z0-9_.-]\)\+|${DATAFLOW_TFT_IMAGE}|g" taxi-cab-classification-pipeline.py
-  sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-dataflow-tf-predict:\([a-zA-Z0-9_.-]\)\+|${DATAFLOW_PREDICT_IMAGE}|g" taxi-cab-classification-pipeline.py
-  sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-dataflow-tfdv:\([a-zA-Z0-9_.-]\)\+|${DATAFLOW_TFDV_IMAGE}|g" taxi-cab-classification-pipeline.py
-  sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-dataflow-tfma:\([a-zA-Z0-9_.-]\)\+|${DATAFLOW_TFMA_IMAGE}|g" taxi-cab-classification-pipeline.py
-  sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-kubeflow-tf-trainer:\([a-zA-Z0-9_.-]\)\+|${KUBEFLOW_DNNTRAINER_IMAGE}|g" taxi-cab-classification-pipeline.py
-  sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-kubeflow-deployer:\([a-zA-Z0-9_.-]\)\+|${KUBEFLOW_DEPLOYER_IMAGE}|g" taxi-cab-classification-pipeline.py
-  sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-local-confusion-matrix:\([a-zA-Z0-9_.-]\)\+|${LOCAL_CONFUSIONMATRIX_IMAGE}|g" taxi-cab-classification-pipeline.py
-  sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-local-roc:\([a-zA-Z0-9_.-]\)\+|${LOCAL_ROC_IMAGE}|g" taxi-cab-classification-pipeline.py
+  if [ -n ${DATAFLOW_TFT_IMAGE} ];then
+    sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-dataflow-tft:\([a-zA-Z0-9_.-]\)\+|${DATAFLOW_TFT_IMAGE}|g" taxi-cab-classification-pipeline.py
+    sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-dataflow-tf-predict:\([a-zA-Z0-9_.-]\)\+|${DATAFLOW_PREDICT_IMAGE}|g" taxi-cab-classification-pipeline.py
+    sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-dataflow-tfdv:\([a-zA-Z0-9_.-]\)\+|${DATAFLOW_TFDV_IMAGE}|g" taxi-cab-classification-pipeline.py
+    sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-dataflow-tfma:\([a-zA-Z0-9_.-]\)\+|${DATAFLOW_TFMA_IMAGE}|g" taxi-cab-classification-pipeline.py
+    sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-kubeflow-tf-trainer:\([a-zA-Z0-9_.-]\)\+|${KUBEFLOW_DNNTRAINER_IMAGE}|g" taxi-cab-classification-pipeline.py
+    sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-kubeflow-deployer:\([a-zA-Z0-9_.-]\)\+|${KUBEFLOW_DEPLOYER_IMAGE}|g" taxi-cab-classification-pipeline.py
+    sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-local-confusion-matrix:\([a-zA-Z0-9_.-]\)\+|${LOCAL_CONFUSIONMATRIX_IMAGE}|g" taxi-cab-classification-pipeline.py
+    sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-local-roc:\([a-zA-Z0-9_.-]\)\+|${LOCAL_ROC_IMAGE}|g" taxi-cab-classification-pipeline.py
+  fi
 
   dsl-compile --py taxi-cab-classification-pipeline.py --output taxi-cab-classification-pipeline.tar.gz
 
   cd "${TEST_DIR}"
@@ -251,15 +255,16 @@ elif [ "$TEST_NAME" == "xgboost" ]; then
 
   # Compile samples
   cd ${BASE_DIR}/samples/xgboost-spark
-  sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-dataproc-create-cluster:\([a-zA-Z0-9_.-]\)\+|${DATAPROC_CREATE_CLUSTER_IMAGE}|g" xgboost-training-cm.py
-  sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-dataproc-delete-cluster:\([a-zA-Z0-9_.-]\)\+|${DATAPROC_DELETE_CLUSTER_IMAGE}|g" xgboost-training-cm.py
-  sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-dataproc-analyze:\([a-zA-Z0-9_.-]\)\+|${DATAPROC_ANALYZE_IMAGE}|g" xgboost-training-cm.py
-  sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-dataproc-transform:\([a-zA-Z0-9_.-]\)\+|${DATAPROC_TRANSFORM_IMAGE}|g" xgboost-training-cm.py
-  sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-dataproc-train:\([a-zA-Z0-9_.-]\)\+|${DATAPROC_TRAIN_IMAGE}|g" xgboost-training-cm.py
-  sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-dataproc-predict:\([a-zA-Z0-9_.-]\)\+|${DATAPROC_PREDICT_IMAGE}|g" xgboost-training-cm.py
-  sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-local-roc:\([a-zA-Z0-9_.-]\)\+|${LOCAL_ROC_IMAGE}|g" xgboost-training-cm.py
-  sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-local-confusion-matrix:\([a-zA-Z0-9_.-]\)\+|${LOCAL_CONFUSIONMATRIX_IMAGE}|g" xgboost-training-cm.py
-
+  if [ -n ${DATAPROC_CREATE_CLUSTER_IMAGE} ];then
+    sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-dataproc-create-cluster:\([a-zA-Z0-9_.-]\)\+|${DATAPROC_CREATE_CLUSTER_IMAGE}|g" xgboost-training-cm.py
+    sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-dataproc-delete-cluster:\([a-zA-Z0-9_.-]\)\+|${DATAPROC_DELETE_CLUSTER_IMAGE}|g" xgboost-training-cm.py
+    sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-dataproc-analyze:\([a-zA-Z0-9_.-]\)\+|${DATAPROC_ANALYZE_IMAGE}|g" xgboost-training-cm.py
+    sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-dataproc-transform:\([a-zA-Z0-9_.-]\)\+|${DATAPROC_TRANSFORM_IMAGE}|g" xgboost-training-cm.py
+    sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-dataproc-train:\([a-zA-Z0-9_.-]\)\+|${DATAPROC_TRAIN_IMAGE}|g" xgboost-training-cm.py
+    sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-dataproc-predict:\([a-zA-Z0-9_.-]\)\+|${DATAPROC_PREDICT_IMAGE}|g" xgboost-training-cm.py
+    sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-local-roc:\([a-zA-Z0-9_.-]\)\+|${LOCAL_ROC_IMAGE}|g" xgboost-training-cm.py
+    sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-local-confusion-matrix:\([a-zA-Z0-9_.-]\)\+|${LOCAL_CONFUSIONMATRIX_IMAGE}|g" xgboost-training-cm.py
+  fi
 
   dsl-compile --py xgboost-training-cm.py --output xgboost-training-cm.tar.gz
 
   cd "${TEST_DIR}"
@@ -281,13 +286,21 @@ elif [ "$TEST_NAME" == "notebook-tfx" ]; then
   cd ${BASE_DIR}/samples/notebooks
   export LC_ALL=C.UTF-8
   export LANG=C.UTF-8
-  papermill --prepare-only -p EXPERIMENT_NAME notebook-tfx-test -p OUTPUT_DIR ${RESULTS_GCS_DIR} -p PROJECT_NAME ml-pipeline-test \
-    -p BASE_IMAGE ${TARGET_IMAGE_PREFIX}pusherbase:dev -p TARGET_IMAGE ${TARGET_IMAGE_PREFIX}pusher:dev \
-    -p KFP_PACKAGE /tmp/kfp.tar.gz -p DEV_DEPLOYER_MODEL ${DEV_DEPLOYER_MODEL}.${MODEL_VERSION} -p PROD_DEPLOYER_MODEL ${PROD_DEPLOYER_MODEL}.${MODEL_VERSION} \
-    -p DATAFLOW_TFDV_IMAGE ${DATAFLOW_TFDV_IMAGE} -p DATAFLOW_TFT_IMAGE ${DATAFLOW_TFT_IMAGE} -p DATAFLOW_TFMA_IMAGE ${DATAFLOW_TFMA_IMAGE} -p DATAFLOW_TF_PREDICT_IMAGE ${DATAFLOW_PREDICT_IMAGE} \
-    -p KUBEFLOW_TF_TRAINER_IMAGE ${KUBEFLOW_DNNTRAINER_IMAGE} -p KUBEFLOW_DEPLOYER_IMAGE ${KUBEFLOW_DEPLOYER_IMAGE} \
-    -p TRAIN_DATA gs://ml-pipeline-dataset/sample-test/taxi-cab-classification/train50.csv -p EVAL_DATA gs://ml-pipeline-dataset/sample-test/taxi-cab-classification/eval20.csv \
-    -p HIDDEN_LAYER_SIZE 10 -p STEPS 50 KubeFlow\ Pipeline\ Using\ TFX\ OSS\ Components.ipynb notebook-tfx.ipynb
+  if [ -n ${DATAFLOW_TFT_IMAGE} ];then
+    papermill --prepare-only -p EXPERIMENT_NAME notebook-tfx-test -p OUTPUT_DIR ${RESULTS_GCS_DIR} -p PROJECT_NAME ml-pipeline-test \
+      -p BASE_IMAGE ${TARGET_IMAGE_PREFIX}pusherbase:dev -p TARGET_IMAGE ${TARGET_IMAGE_PREFIX}pusher:dev \
+      -p KFP_PACKAGE /tmp/kfp.tar.gz -p DEV_DEPLOYER_MODEL ${DEV_DEPLOYER_MODEL}.${MODEL_VERSION} -p PROD_DEPLOYER_MODEL ${PROD_DEPLOYER_MODEL}.${MODEL_VERSION} \
+      -p DATAFLOW_TFDV_IMAGE ${DATAFLOW_TFDV_IMAGE} -p DATAFLOW_TFT_IMAGE ${DATAFLOW_TFT_IMAGE} -p DATAFLOW_TFMA_IMAGE ${DATAFLOW_TFMA_IMAGE} -p DATAFLOW_TF_PREDICT_IMAGE ${DATAFLOW_PREDICT_IMAGE} \
+      -p KUBEFLOW_TF_TRAINER_IMAGE ${KUBEFLOW_DNNTRAINER_IMAGE} -p KUBEFLOW_DEPLOYER_IMAGE ${KUBEFLOW_DEPLOYER_IMAGE} \
+      -p TRAIN_DATA gs://ml-pipeline-dataset/sample-test/taxi-cab-classification/train50.csv -p EVAL_DATA gs://ml-pipeline-dataset/sample-test/taxi-cab-classification/eval20.csv \
+      -p HIDDEN_LAYER_SIZE 10 -p STEPS 50 KubeFlow\ Pipeline\ Using\ TFX\ OSS\ Components.ipynb notebook-tfx.ipynb
+  else
+    papermill --prepare-only -p EXPERIMENT_NAME notebook-tfx-test -p OUTPUT_DIR ${RESULTS_GCS_DIR} -p PROJECT_NAME ml-pipeline-test \
+      -p BASE_IMAGE ${TARGET_IMAGE_PREFIX}pusherbase:dev -p TARGET_IMAGE ${TARGET_IMAGE_PREFIX}pusher:dev \
+      -p KFP_PACKAGE /tmp/kfp.tar.gz -p DEV_DEPLOYER_MODEL ${DEV_DEPLOYER_MODEL}.${MODEL_VERSION} -p PROD_DEPLOYER_MODEL ${PROD_DEPLOYER_MODEL}.${MODEL_VERSION} \
+      -p TRAIN_DATA gs://ml-pipeline-dataset/sample-test/taxi-cab-classification/train50.csv -p EVAL_DATA gs://ml-pipeline-dataset/sample-test/taxi-cab-classification/eval20.csv \
+      -p HIDDEN_LAYER_SIZE 10 -p STEPS 50 KubeFlow\ Pipeline\ Using\ TFX\ OSS\ Components.ipynb notebook-tfx.ipynb
+  fi
   jupyter nbconvert --to python notebook-tfx.ipynb
   pip3 install tensorflow==1.8.0
   ipython notebook-tfx.py
diff --git a/test/sample_test.yaml b/test/sample_test.yaml
new file mode 100644
index 00000000000..2e9aac399c0
--- /dev/null
+++ b/test/sample_test.yaml
@@ -0,0 +1,189 @@
+# Copyright 2018 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: argoproj.io/v1alpha1
+kind: Workflow
+metadata:
+  generateName: sample-test-
+spec:
+  entrypoint: sample-test
+  volumes:
+  - name: gcp-credentials
+    secret:
+      secretName: user-gcp-sa
+  arguments:
+    parameters:
+    - name: image-build-context-gcs-uri
+    - name: target-image-prefix
+    - name: test-results-gcs-dir
+    - name: sample-tests-image-suffix
+      value: sample-tests
+    - name: namespace
+      value: kubeflow
+  templates:
+  - name: sample-test
+    inputs:
+      parameters:
+      - name: target-image-prefix
+      - name: test-results-gcs-dir
+      - name: sample-tests-image-suffix
+      - name: namespace
+    steps:
+    - - name: build-sample-tests-image
+        template: build-image-by-dockerfile
+        arguments:
+          parameters:
+          - name: docker-path
+            value: .
+          - name: docker-file
+            value: test/sample-test/Dockerfile
+          - name: image-name
+            value: "{{inputs.parameters.target-image-prefix}}{{inputs.parameters.sample-tests-image-suffix}}"
+    - - name: run-tf-training-tests
+        template: run-sample-tests
+        arguments:
+          parameters:
+          - name: test-results-gcs-dir
+            value: "{{inputs.parameters.test-results-gcs-dir}}"
+          - name: target-image-prefix
+            value: "{{inputs.parameters.target-image-prefix}}"
+          - name: sample-tests-image
+            value: "{{inputs.parameters.target-image-prefix}}{{inputs.parameters.sample-tests-image-suffix}}"
+          - name: namespace
+            value: "{{inputs.parameters.namespace}}"
+          - name: test-name
+            value: "tf-training"
+      - name: run-tfx-tests
+        template: run-sample-tests
+        arguments:
+          parameters:
+          - name: test-results-gcs-dir
+            value: "{{inputs.parameters.test-results-gcs-dir}}"
+          - name: target-image-prefix
+            value: "{{inputs.parameters.target-image-prefix}}"
+          - name: sample-tests-image
+            value: "{{inputs.parameters.target-image-prefix}}{{inputs.parameters.sample-tests-image-suffix}}"
+          - name: namespace
+            value: "{{inputs.parameters.namespace}}"
+          - name: test-name
+            value: "tfx"
+      - name: run-xgboost-tests
+        template: run-sample-tests
+        arguments:
+          parameters:
+          - name: test-results-gcs-dir
+            value: "{{inputs.parameters.test-results-gcs-dir}}"
+          - name: target-image-prefix
+            value: "{{inputs.parameters.target-image-prefix}}"
+          - name: sample-tests-image
+            value: "{{inputs.parameters.target-image-prefix}}{{inputs.parameters.sample-tests-image-suffix}}"
+          - name: namespace
+            value: "{{inputs.parameters.namespace}}"
+          - name: test-name
+            value: "xgboost"
+      - name: run-notebook-tfx-tests
+        template: run-sample-tests
+        arguments:
+          parameters:
+          - name: test-results-gcs-dir
+            value: "{{inputs.parameters.test-results-gcs-dir}}"
+          - name: target-image-prefix
+            value: "{{inputs.parameters.target-image-prefix}}"
+          - name: sample-tests-image
+            value: "{{inputs.parameters.target-image-prefix}}{{inputs.parameters.sample-tests-image-suffix}}"
+          - name: namespace
+            value: "{{inputs.parameters.namespace}}"
+          - name: test-name
+            value: "notebook-tfx"
+      - name: run-notebook-lightweight-tests
+        template: run-sample-tests
+        arguments:
+          parameters:
+          - name: test-results-gcs-dir
+            value: "{{inputs.parameters.test-results-gcs-dir}}"
+          - name: target-image-prefix
+            value: "{{inputs.parameters.target-image-prefix}}"
+          - name: sample-tests-image
+            value: "{{inputs.parameters.target-image-prefix}}{{inputs.parameters.sample-tests-image-suffix}}"
+          - name: namespace
+            value: "{{inputs.parameters.namespace}}"
+          - name: test-name
+            value: "notebook-lightweight"
+
+
+  # Build and push image
+  - name: build-image-by-dockerfile
+    inputs:
+      parameters:
+      # GCS URI prefix pointing to a .tar.gz archive of Docker build context
+      - name: image-build-context-gcs-uri
+        value: "{{workflow.parameters.image-build-context-gcs-uri}}"
+      # The relative code path to the Dockerfile
+      - name: docker-path
+      # Name of the Docker file to use. "Dockerfile" by default
+      - name: docker-file
+        value: Dockerfile
+      - name: image-name
+    outputs:
+      parameters:
+      - name: strict-image-name
+        valueFrom:
+          path: /outputs/strict-image-name/file
+    container:
+      image: gcr.io/ml-pipeline-test/image-builder:v20181128-0.1.3-rc.1-109-ga5a14dc-e3b0c4
+      imagePullPolicy: 'Always'
+      args: [
+        "--image-build-context-gcs-uri", "{{inputs.parameters.image-build-context-gcs-uri}}",
+        "--docker_path", "{{inputs.parameters.docker-path}}",
+        "--docker_file", "{{inputs.parameters.docker-file}}",
+        "--image_name", "{{inputs.parameters.image-name}}",
+      ]
+      env:
+      - name: DOCKER_HOST
+        value: 127.0.0.1
+      - name: GOOGLE_APPLICATION_CREDENTIALS
+        value: /secret/gcp-credentials/user-gcp-sa.json
+      volumeMounts:
+      - name: gcp-credentials
+        mountPath: /secret/gcp-credentials
+    sidecars:
+    - name: dind
+      image: docker:17.10-dind
+      securityContext:
+        privileged: true
+      mirrorVolumeMounts: true
+
+  - name: run-sample-tests
+    inputs:
+      parameters:
+      - name: test-results-gcs-dir
+      - name: target-image-prefix
+      - name: sample-tests-image
+      - name: namespace
+      - name: test-name
+    container:
+      image: "{{inputs.parameters.sample-tests-image}}"
+      args: [
+        "--results-gcs-dir", "{{inputs.parameters.test-results-gcs-dir}}",
+        "--target-image-prefix", "{{inputs.parameters.target-image-prefix}}",
+        "--namespace", "{{inputs.parameters.namespace}}",
+        "--test-name", "{{inputs.parameters.test-name}}",
+      ]
+      env:
+      - name: GOOGLE_APPLICATION_CREDENTIALS
+        value: /secret/gcp-credentials/user-gcp-sa.json
+      volumeMounts:
+      - name: gcp-credentials
+        mountPath: /secret/gcp-credentials
From 299e94d8f13e8cb86396c432700cc8ae29b72293 Mon Sep 17 00:00:00 2001
From: Ning Gao
Date: Thu, 20 Dec 2018 17:49:06 -0800
Subject: [PATCH 2/7] rename sample test yamls

---
 test/{sample_test_v2.yaml => sample_test_components.yaml} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename test/{sample_test_v2.yaml => sample_test_components.yaml} (100%)

diff --git a/test/sample_test_v2.yaml b/test/sample_test_components.yaml
similarity index 100%
rename from test/sample_test_v2.yaml
rename to test/sample_test_components.yaml

From 7545c6cba5d4d8447b39beb50fb901cd9d044396 Mon Sep 17 00:00:00 2001
From: Ning Gao
Date: Thu, 20 Dec 2018 17:53:12 -0800
Subject: [PATCH 3/7] use the v2 name

---
 test/{sample_test.yaml => sample_test_v2.yaml} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename test/{sample_test.yaml => sample_test_v2.yaml} (100%)

diff --git a/test/sample_test.yaml b/test/sample_test_v2.yaml
similarity index 100%
rename from test/sample_test.yaml
rename to test/sample_test_v2.yaml

From 115ea3bcc2dcc479c387297d58efd028632a08b6 Mon Sep 17 00:00:00 2001
From: Ning Gao
Date: Thu, 20 Dec 2018 18:47:39 -0800
Subject: [PATCH 4/7] fix bash bug: quote variables in [ -n ] tests

---
 test/sample-test/run_test.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
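Note: this is the bug in the guards introduced by patch 1. With an unquoted
operand, [ -n ${VAR} ] expands to [ -n ] when VAR is empty, and a one-argument
test succeeds whenever that argument is a non-empty string ("-n" itself), so
the branch was always taken. Quoting restores the intended emptiness check.
A minimal bash demonstration:

    #!/bin/bash
    VAR=""

    if [ -n ${VAR} ]; then
      # Taken even though VAR is empty: the test degenerates to [ -n ].
      echo "unquoted: branch taken"
    fi

    if [ -n "${VAR}" ]; then
      echo "quoted: branch taken"
    else
      echo "quoted: branch skipped"   # this is what actually prints
    fi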
diff --git a/test/sample-test/run_test.sh b/test/sample-test/run_test.sh
index d28c49238ea..f524af13c19 100755
--- a/test/sample-test/run_test.sh
+++ b/test/sample-test/run_test.sh
@@ -146,7 +146,7 @@ if [ "$TEST_NAME" == 'tf-training' ]; then
 
   # Compile samples
   cd ${BASE_DIR}/samples/kubeflow-tf
-  if [ -n ${DATAFLOW_TFT_IMAGE} ];then
+  if [ -n "${DATAFLOW_TFT_IMAGE}" ];then
     sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-dataflow-tft:\([a-zA-Z0-9_.-]\)\+|${DATAFLOW_TFT_IMAGE}|g" kubeflow-training-classification.py
     sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-kubeflow-tf-trainer:\([a-zA-Z0-9_.-]\)\+|${KUBEFLOW_DNNTRAINER_IMAGE}|g" kubeflow-training-classification.py
     sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-dataflow-tf-predict:\([a-zA-Z0-9_.-]\)\+|${DATAFLOW_PREDICT_IMAGE}|g" kubeflow-training-classification.py
@@ -167,7 +167,7 @@ elif [ "$TEST_NAME" == "tfx" ]; then
 
   # Compile samples
   cd ${BASE_DIR}/samples/tfx
-  if [ -n ${DATAFLOW_TFT_IMAGE} ];then
+  if [ -n "${DATAFLOW_TFT_IMAGE}" ];then
     sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-dataflow-tft:\([a-zA-Z0-9_.-]\)\+|${DATAFLOW_TFT_IMAGE}|g" taxi-cab-classification-pipeline.py
     sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-dataflow-tf-predict:\([a-zA-Z0-9_.-]\)\+|${DATAFLOW_PREDICT_IMAGE}|g" taxi-cab-classification-pipeline.py
     sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-dataflow-tfdv:\([a-zA-Z0-9_.-]\)\+|${DATAFLOW_TFDV_IMAGE}|g" taxi-cab-classification-pipeline.py
@@ -255,7 +255,7 @@ elif [ "$TEST_NAME" == "xgboost" ]; then
 
   # Compile samples
   cd ${BASE_DIR}/samples/xgboost-spark
-  if [ -n ${DATAPROC_CREATE_CLUSTER_IMAGE} ];then
+  if [ -n "${DATAPROC_CREATE_CLUSTER_IMAGE}" ];then
     sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-dataproc-create-cluster:\([a-zA-Z0-9_.-]\)\+|${DATAPROC_CREATE_CLUSTER_IMAGE}|g" xgboost-training-cm.py
     sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-dataproc-delete-cluster:\([a-zA-Z0-9_.-]\)\+|${DATAPROC_DELETE_CLUSTER_IMAGE}|g" xgboost-training-cm.py
     sed -i -e "s|gcr.io/ml-pipeline/ml-pipeline-dataproc-analyze:\([a-zA-Z0-9_.-]\)\+|${DATAPROC_ANALYZE_IMAGE}|g" xgboost-training-cm.py
@@ -286,7 +286,7 @@ elif [ "$TEST_NAME" == "notebook-tfx" ]; then
   cd ${BASE_DIR}/samples/notebooks
   export LC_ALL=C.UTF-8
   export LANG=C.UTF-8
-  if [ -n ${DATAFLOW_TFT_IMAGE} ];then
+  if [ -n "${DATAFLOW_TFT_IMAGE}" ];then
     papermill --prepare-only -p EXPERIMENT_NAME notebook-tfx-test -p OUTPUT_DIR ${RESULTS_GCS_DIR} -p PROJECT_NAME ml-pipeline-test \
       -p BASE_IMAGE ${TARGET_IMAGE_PREFIX}pusherbase:dev -p TARGET_IMAGE ${TARGET_IMAGE_PREFIX}pusher:dev \
       -p KFP_PACKAGE /tmp/kfp.tar.gz -p DEV_DEPLOYER_MODEL ${DEV_DEPLOYER_MODEL}.${MODEL_VERSION} -p PROD_DEPLOYER_MODEL ${PROD_DEPLOYER_MODEL}.${MODEL_VERSION} \

From 31f45f3bfd6f98064689af92b797ecc6181e617a Mon Sep 17 00:00:00 2001
From: Ning Gao
Date: Thu, 3 Jan 2019 10:15:44 -0800
Subject: [PATCH 5/7] tf-training bug fix: remove stray trailing comma

---
 samples/kubeflow-tf/kubeflow-training-classification.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
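Note: the trailing comma on the removed line below makes the right-hand side a
one-element tuple, so the op's image attribute was being set to a tuple rather
than an image string. A standalone Python reproduction of the pitfall (the Op
class is a stand-in, not the sample's actual type):

    class Op:
        """Stand-in for a pipeline op, used only to illustrate the bug."""
        pass

    op = Op()

    # Buggy: the trailing comma turns the assignment into a 1-tuple.
    op.image = 'gcr.io/project/trainer-gpu:tag',
    print(type(op.image))   # <class 'tuple'>

    # Fixed, as in this patch: a plain string assignment.
    op.image = 'gcr.io/project/trainer-gpu:tag'
    print(type(op.image))   # <class 'str'>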
diff --git a/samples/kubeflow-tf/kubeflow-training-classification.py b/samples/kubeflow-tf/kubeflow-training-classification.py
index ddcc0181697..20fa9257f5a 100755
--- a/samples/kubeflow-tf/kubeflow-training-classification.py
+++ b/samples/kubeflow-tf/kubeflow-training-classification.py
@@ -52,7 +52,7 @@ def kubeflow_tf_training_op(transformed_data_dir, schema: 'GcsUri[text/json]', l
         file_outputs = {'train': '/output.txt'}
     )
     if use_gpu:
-        kubeflow_tf_training_op.image = 'gcr.io/ml-pipeline/ml-pipeline-kubeflow-tf-trainer-gpu:85c6413a2e13da4b8f198aeac1abc2f3a74fe789',
+        kubeflow_tf_training_op.image = 'gcr.io/ml-pipeline/ml-pipeline-kubeflow-tf-trainer-gpu:85c6413a2e13da4b8f198aeac1abc2f3a74fe789'
         kubeflow_tf_training_op.set_gpu_limit(1)
 
     return kubeflow_tf_training_op

From 5eee9305beb0792730cc9f7c808f1f1600c5b209 Mon Sep 17 00:00:00 2001
From: Ning Gao
Date: Thu, 3 Jan 2019 11:31:29 -0800
Subject: [PATCH 6/7] output argo log in case of exceptions for tf-training
 sample

---
 test/sample-test/run_kubeflow_test.py | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)
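Note: moving the Argo log dump into a finally block guarantees it runs even
when the status check on the response raises (for example, a missing run
status), which is exactly when the log is most valuable. A self-contained
sketch of the pattern; the client object and its _get_workflow_json helper
mirror the test utility but are assumptions here, and capture_output/text
require Python 3.7+:

    import subprocess

    def check_run_and_dump_logs(client, run_id, namespace):
        response = client.wait_for_run_completion(run_id, 1200)
        try:
            succ = response.run.status.lower() == 'succeeded'
        finally:
            # Runs whether or not the status check above raised.
            workflow_id = client._get_workflow_json(run_id)['metadata']['name']
            log = subprocess.run(
                ['argo', 'logs', '-n', namespace, '-w', workflow_id],
                capture_output=True, text=True).stdout
            print('=========Argo Workflow Log=========')
            print(log)
        return succ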
diff --git a/test/sample-test/run_kubeflow_test.py b/test/sample-test/run_kubeflow_test.py
index 15ef3faddea..1f614c546d6 100644
--- a/test/sample-test/run_kubeflow_test.py
+++ b/test/sample-test/run_kubeflow_test.py
@@ -86,17 +86,18 @@ def main():
   ###### Monitor Job ######
   start_time = datetime.now()
   response = client.wait_for_run_completion(run_id, 1200)
-  succ = (response.run.status.lower()=='succeeded')
-  end_time = datetime.now()
-  elapsed_time = (end_time - start_time).seconds
-  utils.add_junit_test(test_cases, 'job completion', succ, 'waiting for job completion failure', elapsed_time)
-
-  ###### Output Argo Log for Debugging ######
-  workflow_json = client._get_workflow_json(run_id)
-  workflow_id = workflow_json['metadata']['name']
-  argo_log, _ = utils.run_bash_command('argo logs -n {} -w {}'.format(args.namespace, workflow_id))
-  print("=========Argo Workflow Log=========")
-  print(argo_log)
+  try:
+    succ = (response.run.status.lower()=='succeeded')
+    end_time = datetime.now()
+    elapsed_time = (end_time - start_time).seconds
+    utils.add_junit_test(test_cases, 'job completion', succ, 'waiting for job completion failure', elapsed_time)
+  finally:
+    ###### Output Argo Log for Debugging ######
+    workflow_json = client._get_workflow_json(run_id)
+    workflow_id = workflow_json['metadata']['name']
+    argo_log, _ = utils.run_bash_command('argo logs -n {} -w {}'.format(args.namespace, workflow_id))
+    print("=========Argo Workflow Log=========")
+    print(argo_log)
 
   if not succ:
     utils.write_junit_xml(test_name, args.result, test_cases)

From 83907e5693cbbb14fce6cf1cd9b66f40bd2f2b29 Mon Sep 17 00:00:00 2001
From: Ning Gao
Date: Thu, 3 Jan 2019 11:32:52 -0800
Subject: [PATCH 7/7] disable gpu

---
 samples/kubeflow-tf/kubeflow-training-classification.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/samples/kubeflow-tf/kubeflow-training-classification.py b/samples/kubeflow-tf/kubeflow-training-classification.py
index 20fa9257f5a..245c7b22928 100755
--- a/samples/kubeflow-tf/kubeflow-training-classification.py
+++ b/samples/kubeflow-tf/kubeflow-training-classification.py
@@ -102,7 +102,7 @@ def kubeflow_training(output, project,
     # TODO: use the argo job name as the workflow
     workflow = '{{workflow.name}}'
     # set the flag to use GPU trainer
-    use_gpu = True
+    use_gpu = False
 
     preprocess = dataflow_tf_transform_op(train, evaluation, schema, project, preprocess_mode, '', '%s/%s/transformed' % (output, workflow)).apply(gcp.use_gcp_secret('user-gcp-sa'))
     training = kubeflow_tf_training_op(preprocess.output, schema, learning_rate, hidden_layer_size, steps, target, '', '%s/%s/train' % (output, workflow), use_gpu=use_gpu).apply(gcp.use_gcp_secret('user-gcp-sa'))
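Note: with the workflow from patch 1 in place, the whole sample suite can be
exercised against the current sample code. Assuming the Argo CLI is installed,
and with the parameter values below as placeholders for a real project, a run
would look like:

    # Submit the sample-test workflow (values are illustrative).
    argo submit test/sample_test.yaml -n kubeflow \
      -p image-build-context-gcs-uri=gs://my-bucket/build-context.tar.gz \
      -p target-image-prefix=gcr.io/my-project/sample-test- \
      -p test-results-gcs-dir=gs://my-bucket/results

    # Watch progress and fetch logs, as the test harness does in patch 6.
    argo list -n kubeflow
    argo logs -n kubeflow -w <workflow-name>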