diff --git a/components/kubeflow/container/deployer/deploy.sh b/components/kubeflow/container/deployer/deploy.sh index 198ee480880..a0fb8cfc603 100755 --- a/components/kubeflow/container/deployer/deploy.sh +++ b/components/kubeflow/container/deployer/deploy.sh @@ -14,6 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +set -x + KUBERNETES_NAMESPACE="${KUBERNETES_NAMESPACE:-default}" SERVER_NAME="${SERVER_NAME:-model-server}" @@ -115,10 +117,11 @@ ks apply default -c server echo "Waiting for the TF Serving deployment to show up..." timeout="1000" start_time=`date +%s` -while [[ $(kubectl get deploy --selector=app="${SERVER_NAME}" 2>&1|wc -l) != "2" ]];do +while [[ $(kubectl get deploy --namespace "${KUBERNETES_NAMESPACE}" --selector=app="${SERVER_NAME}" 2>&1|wc -l) != "2" ]];do current_time=`date +%s` - elapsed_time=$(expr $current_time - $start_time) - if [[ $elapsed_time > $timeout ]];then + elapsed_time=$(expr $current_time + 1 - $start_time) + if [[ $elapsed_time -gt $timeout ]];then + echo "timeout" exit 1 fi sleep 2 @@ -128,11 +131,12 @@ echo "Waiting for the valid workflow json..." start_time=`date +%s` exit_code="1" while [[ $exit_code != "0" ]];do - kubectl get deploy --selector=app="${SERVER_NAME}" --output=jsonpath='{.items[0].status.availableReplicas}' - exit_code = $? + kubectl get deploy --namespace "${KUBERNETES_NAMESPACE}" --selector=app="${SERVER_NAME}" --output=jsonpath='{.items[0].status.availableReplicas}' + exit_code=$? current_time=`date +%s` - elapsed_time=$(expr $current_time - $start_time) - if [[ $elapsed_time > $timeout ]];then + elapsed_time=$(expr $current_time + 1 - $start_time) + if [[ $elapsed_time -gt $timeout ]];then + echo "timeout" exit 1 fi sleep 2 @@ -140,10 +144,11 @@ done echo "Waiting for the TF Serving deployment to have at least one available replica..." start_time=`date +%s` -while [[ $(kubectl get deploy --selector=app="${SERVER_NAME}" --output=jsonpath='{.items[0].status.availableReplicas}') < "1" ]]; do +while [[ $(kubectl get deploy --namespace "${KUBERNETES_NAMESPACE}" --selector=app="${SERVER_NAME}" --output=jsonpath='{.items[0].status.availableReplicas}') < "1" ]]; do current_time=`date +%s` - elapsed_time=$(expr $current_time - $start_time) - if [[ $elapsed_time > $timeout ]];then + elapsed_time=$(expr $current_time + 1 - $start_time) + if [[ $elapsed_time -gt $timeout ]];then + echo "timeout" exit 1 fi sleep 5 @@ -153,10 +158,11 @@ echo "Obtaining the pod name..." start_time=`date +%s` pod_name="" while [[ $pod_name == "" ]];do - pod_name=$(kubectl get pods --selector=app=${SERVER_NAME} --template '{{range .items}}{{.metadata.name}}{{"\n"}}{{end}}') + pod_name=$(kubectl get pods --namespace "${KUBERNETES_NAMESPACE}" --selector=app="${SERVER_NAME}" --template '{{range .items}}{{.metadata.name}}{{"\n"}}{{end}}') current_time=`date +%s` - elapsed_time=$(expr $current_time - start_time) - if [[ $elapsed_time > $timeout ]];then + elapsed_time=$(expr $current_time + 1 - $start_time) + if [[ $elapsed_time -gt $timeout ]];then + echo "timeout" exit 1 fi sleep 2 @@ -168,21 +174,23 @@ echo "Waiting for the TF Serving pod to start running..." start_time=`date +%s` exit_code="1" while [[ $exit_code != "0" ]];do - kubectl get po ${pod_name} -o jsonpath='{.status.containerStatuses[0].state.running}' - exit_code = $? + kubectl get po ${pod_name} --namespace "${KUBERNETES_NAMESPACE}" -o jsonpath='{.status.containerStatuses[0].state.running}' + exit_code=$? current_time=`date +%s` - elapsed_time=$(expr $current_time - $start_time) - if [[ $elapsed_time > $timeout ]];then + elapsed_time=$(expr $current_time + 1 - $start_time) + if [[ $elapsed_time -gt $timeout ]];then + echo "timeout" exit 1 fi sleep 2 done start_time=`date +%s` -while [ -z "$(kubectl get po ${pod_name} -o jsonpath='{.status.containerStatuses[0].state.running}')" ]; do +while [ -z "$(kubectl get po ${pod_name} --namespace "${KUBERNETES_NAMESPACE}" -o jsonpath='{.status.containerStatuses[0].state.running}')" ]; do current_time=`date +%s` - elapsed_time=$(expr $current_time - $start_time) - if [[ $elapsed_time > $timeout ]];then + elapsed_time=$(expr $current_time + 1 - $start_time) + if [[ $elapsed_time -gt $timeout ]];then + echo "timeout" exit 1 fi sleep 5 @@ -191,4 +199,4 @@ done # Wait a little while and then grab the logs of the running server sleep 10 echo "Logs from the TF Serving pod:" -kubectl logs ${pod_name} +kubectl logs ${pod_name} --namespace "${KUBERNETES_NAMESPACE}" diff --git a/samples/README.md b/samples/README.md index 5c1a05b3c3c..52dd836bc18 100644 --- a/samples/README.md +++ b/samples/README.md @@ -125,17 +125,17 @@ Each pipeline is identified as a python function. For example: ```python @kfp.dsl.pipeline( - name='TFMA Trainer', - description='A trainer that does end-to-end training for TFMA models.' + name='TFX Trainer', + description='A trainer that does end-to-end training for TFX models.' ) def train( output_path, train_data=kfp.dsl.PipelineParam('train-data', - value='gs://ml-pipeline-playground/tfma/taxi-cab-classification/train.csv'), + value='gs://ml-pipeline-playground/tfx/taxi-cab-classification/train.csv'), eval_data=kfp.dsl.PipelineParam('eval-data', - value='gs://ml-pipeline-playground/tfma/taxi-cab-classification/eval.csv'), + value='gs://ml-pipeline-playground/tfx/taxi-cab-classification/eval.csv'), schema=kfp.dsl.PipelineParam('schema', - value='gs://ml-pipeline-playground/tfma/taxi-cab-classification/schema.json'), + value='gs://ml-pipeline-playground/tfx/taxi-cab-classification/schema.json'), target=kfp.dsl.PipelineParam('target', value='tips'), learning_rate=kfp.dsl.PipelineParam('learning-rate', value=0.1), hidden_layer_size=kfp.dsl.PipelineParam('hidden-layer-size', value='100,50'), diff --git a/samples/tfma/README.md b/samples/tfx/README.md similarity index 100% rename from samples/tfma/README.md rename to samples/tfx/README.md diff --git a/samples/tfma/taxi-cab-classification-pipeline.py b/samples/tfx/taxi-cab-classification-pipeline.py similarity index 93% rename from samples/tfma/taxi-cab-classification-pipeline.py rename to samples/tfx/taxi-cab-classification-pipeline.py index 47e79ca241d..b471e8549b3 100755 --- a/samples/tfma/taxi-cab-classification-pipeline.py +++ b/samples/tfx/taxi-cab-classification-pipeline.py @@ -106,7 +106,7 @@ def dataflow_tf_predict_op(evaluation_data: 'GcsUri', schema: 'GcsUri[text/json] def kubeflow_deploy_op(model: 'TensorFlow model', tf_server_name, step_name='deploy'): return dsl.ContainerOp( name = step_name, - image = 'gcr.io/ml-pipeline/ml-pipeline-kubeflow-deployer:0.0.42', + image = 'gcr.io/ml-pipeline/ml-pipeline-kubeflow-deployer:dev', #TODO: change the tag to the release versions when new releases are built with the updated image arguments = [ '--model-path', model, '--server-name', tf_server_name @@ -115,7 +115,7 @@ def kubeflow_deploy_op(model: 'TensorFlow model', tf_server_name, step_name='dep @dsl.pipeline( - name='TFMA Taxi Cab Classification Pipeline Example', + name='TFX Taxi Cab Classification Pipeline Example', description='Example pipeline that does classification with model analysis based on a public BigQuery dataset.' ) def taxi_cab_classification( @@ -124,23 +124,23 @@ def taxi_cab_classification( column_names: dsl.PipelineParam=dsl.PipelineParam( name='column-names', - value='gs://ml-pipeline-playground/tfma/taxi-cab-classification/column-names.json'), + value='gs://ml-pipeline-playground/tfx/taxi-cab-classification/column-names.json'), key_columns: dsl.PipelineParam=dsl.PipelineParam( name='key-columns', value='trip_start_timestamp'), train: dsl.PipelineParam=dsl.PipelineParam( name='train', - value='gs://ml-pipeline-playground/tfma/taxi-cab-classification/train.csv'), + value='gs://ml-pipeline-playground/tfx/taxi-cab-classification/train.csv'), evaluation: dsl.PipelineParam=dsl.PipelineParam( name='evaluation', - value='gs://ml-pipeline-playground/tfma/taxi-cab-classification/eval.csv'), + value='gs://ml-pipeline-playground/tfx/taxi-cab-classification/eval.csv'), validation_mode: dsl.PipelineParam=dsl.PipelineParam( name='validation-mode', value='local'), preprocess_mode: dsl.PipelineParam=dsl.PipelineParam( name='preprocess-mode', value='local'), preprocess_module: dsl.PipelineParam=dsl.PipelineParam( name='preprocess-module', - value='gs://ml-pipeline-playground/tfma/taxi-cab-classification/preprocessing.py'), + value='gs://ml-pipeline-playground/tfx/taxi-cab-classification/preprocessing.py'), target: dsl.PipelineParam=dsl.PipelineParam( name='target', value='tips'), learning_rate: dsl.PipelineParam=dsl.PipelineParam(name='learning-rate', value=0.1), diff --git a/samples/tfma/taxi-cab-classification/column-names.json b/samples/tfx/taxi-cab-classification/column-names.json similarity index 100% rename from samples/tfma/taxi-cab-classification/column-names.json rename to samples/tfx/taxi-cab-classification/column-names.json diff --git a/samples/tfma/taxi-cab-classification/eval.csv b/samples/tfx/taxi-cab-classification/eval.csv similarity index 100% rename from samples/tfma/taxi-cab-classification/eval.csv rename to samples/tfx/taxi-cab-classification/eval.csv diff --git a/samples/tfma/taxi-cab-classification/preprocessing.py b/samples/tfx/taxi-cab-classification/preprocessing.py similarity index 100% rename from samples/tfma/taxi-cab-classification/preprocessing.py rename to samples/tfx/taxi-cab-classification/preprocessing.py diff --git a/samples/tfma/taxi-cab-classification/schema.json b/samples/tfx/taxi-cab-classification/schema.json similarity index 100% rename from samples/tfma/taxi-cab-classification/schema.json rename to samples/tfx/taxi-cab-classification/schema.json diff --git a/samples/tfma/taxi-cab-classification/train.csv b/samples/tfx/taxi-cab-classification/train.csv similarity index 100% rename from samples/tfma/taxi-cab-classification/train.csv rename to samples/tfx/taxi-cab-classification/train.csv diff --git a/test/presubmit-tests.sh b/test/presubmit-tests.sh index 667f8681804..2a919111349 100755 --- a/test/presubmit-tests.sh +++ b/test/presubmit-tests.sh @@ -72,7 +72,7 @@ if [ "$CLUSTER_TYPE" == "create-gke" ]; then echo "Delete cluster..." gcloud container clusters delete ${TEST_CLUSTER} --async } - trap delete_cluster EXIT + #trap delete_cluster EXIT gcloud config set project ml-pipeline-test gcloud config set compute/zone us-central1-a diff --git a/test/sample-test/Dockerfile b/test/sample-test/Dockerfile index ee4e6c2a496..df1e41cc1e9 100644 --- a/test/sample-test/Dockerfile +++ b/test/sample-test/Dockerfile @@ -16,7 +16,7 @@ RUN pip3 install minio COPY ./run_test.sh /run_test.sh COPY ./run_kubeflow_test.py /run_kubeflow_test.py -COPY ./run_tfma_test.py /run_tfma_test.py +COPY ./run_tfx_test.py /run_tfx_test.py COPY ./utils.py /utils.py RUN chmod +x /run_test.sh diff --git a/test/sample-test/run_test.sh b/test/sample-test/run_test.sh index 331ad8ca211..1e81effb889 100755 --- a/test/sample-test/run_test.sh +++ b/test/sample-test/run_test.sh @@ -130,12 +130,12 @@ if [ "$TEST_NAME" == 'tf-training' ]; then echo "Copy the test results to GCS ${RESULTS_GCS_DIR}/" gsutil cp ${SAMPLE_KUBEFLOW_TEST_RESULT} ${RESULTS_GCS_DIR}/${SAMPLE_KUBEFLOW_TEST_RESULT} -elif [ "$TEST_NAME" == "tfma" ]; then - SAMPLE_TFMA_TEST_RESULT=junit_SampleTFMAOutput.xml - SAMPLE_TFMA_TEST_OUTPUT=${RESULTS_GCS_DIR} +elif [ "$TEST_NAME" == "tfx" ]; then + SAMPLE_TFX_TEST_RESULT=junit_SampleTFXOutput.xml + SAMPLE_TFX_TEST_OUTPUT=${RESULTS_GCS_DIR} # Compile samples - cd ${BASE_DIR}/samples/tfma + cd ${BASE_DIR}/samples/tfx DATAFLOW_TFT_IMAGE_FOR_SED=$(echo ${DATAFLOW_TFT_IMAGE}|sed -e "s/\//\\\\\//g"|sed -e "s/\./\\\\\./g") DATAFLOW_PREDICT_IMAGE_FOR_SED=$(echo ${DATAFLOW_PREDICT_IMAGE}|sed -e "s/\//\\\\\//g"|sed -e "s/\./\\\\\./g") DATAFLOW_TFDV_IMAGE_FOR_SED=$(echo ${DATAFLOW_TFDV_IMAGE}|sed -e "s/\//\\\\\//g"|sed -e "s/\./\\\\\./g") @@ -153,8 +153,8 @@ elif [ "$TEST_NAME" == "tfma" ]; then dsl-compile --py taxi-cab-classification-pipeline.py --output taxi-cab-classification-pipeline.tar.gz cd / - python3 run_tfma_test.py --input ${BASE_DIR}/samples/tfma/taxi-cab-classification-pipeline.tar.gz --result $SAMPLE_TFMA_TEST_RESULT --output $SAMPLE_TFMA_TEST_OUTPUT + python3 run_tfx_test.py --input ${BASE_DIR}/samples/tfx/taxi-cab-classification-pipeline.tar.gz --result $SAMPLE_TFX_TEST_RESULT --output $SAMPLE_TFX_TEST_OUTPUT echo "Copy the test results to GCS ${RESULTS_GCS_DIR}/" - gsutil cp ${SAMPLE_TFMA_TEST_RESULT} ${RESULTS_GCS_DIR}/${SAMPLE_TFMA_TEST_RESULT} + gsutil cp ${SAMPLE_TFX_TEST_RESULT} ${RESULTS_GCS_DIR}/${SAMPLE_TFX_TEST_RESULT} fi diff --git a/test/sample-test/run_tfma_test.py b/test/sample-test/run_tfx_test.py similarity index 97% rename from test/sample-test/run_tfma_test.py rename to test/sample-test/run_tfx_test.py index f5cc12afd1a..e18fcd1ca4f 100644 --- a/test/sample-test/run_tfma_test.py +++ b/test/sample-test/run_tfx_test.py @@ -47,7 +47,7 @@ def parse_arguments(): def main(): args = parse_arguments() test_cases = [] - test_name = 'TFMA Sample Test' + test_name = 'TFX Sample Test' ###### Initialization ###### client = Client() @@ -59,13 +59,13 @@ def main(): exit() ###### Create Experiment ###### - experiment_name = 'TFMA sample experiment' + experiment_name = 'TFX sample experiment' response = client.create_experiment(experiment_name) experiment_id = response.id utils.add_junit_test(test_cases, 'create experiment', True) ###### Create Job ###### - job_name = 'TFMA_sample' + job_name = 'TFX_sample' params = {'output': args.output, 'project': 'ml-pipeline-test', 'column-names': 'gs://ml-pipeline-dataset/sample-test/taxi-cab-classification/column-names.json', diff --git a/test/sample_test.yaml b/test/sample_test.yaml index cca3c7007a0..f249517706e 100644 --- a/test/sample_test.yaml +++ b/test/sample_test.yaml @@ -100,7 +100,7 @@ spec: - name: image-suffix value: "{{inputs.parameters.dataflow-tfma-image-suffix}}" - name: build-script - value: components/dataflow/containers/tfma/build.sh + value: components/dataflow/containers/tfx/build.sh - name: build-dataflow-tfdv-image template: build-image-by-script arguments: @@ -201,7 +201,7 @@ spec: value: "gcr.io/{{steps.get-project.outputs.result}}/{{inputs.parameters.commit-sha}}/{{inputs.parameters.sample-tests-image-suffix}}" - name: test-name value: "tf-training" - - name: run-tfma-tests + - name: run-tfx-tests template: run-sample-tests arguments: parameters: @@ -226,7 +226,7 @@ spec: - name: sample-tests-image value: "gcr.io/{{steps.get-project.outputs.result}}/{{inputs.parameters.commit-sha}}/{{inputs.parameters.sample-tests-image-suffix}}" - name: test-name - value: "tfma" + value: "tfx" - name: get-project script: