TFMA deployer bug fix #27

Merged (8 commits) on Nov 5, 2018

50 changes: 29 additions & 21 deletions components/kubeflow/container/deployer/deploy.sh
@@ -14,6 +14,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

+set -x
+
KUBERNETES_NAMESPACE="${KUBERNETES_NAMESPACE:-default}"
SERVER_NAME="${SERVER_NAME:-model-server}"

@@ -115,10 +117,11 @@ ks apply default -c server
echo "Waiting for the TF Serving deployment to show up..."
timeout="1000"
start_time=`date +%s`
-while [[ $(kubectl get deploy --selector=app="${SERVER_NAME}" 2>&1|wc -l) != "2" ]];do
+while [[ $(kubectl get deploy --namespace "${KUBERNETES_NAMESPACE}" --selector=app="${SERVER_NAME}" 2>&1|wc -l) != "2" ]];do
current_time=`date +%s`
-elapsed_time=$(expr $current_time - $start_time)
-if [[ $elapsed_time > $timeout ]];then
+elapsed_time=$(expr $current_time + 1 - $start_time)
+if [[ $elapsed_time -gt $timeout ]];then
echo "timeout"
exit 1
fi
sleep 2
@@ -128,22 +131,24 @@ echo "Waiting for the valid workflow json..."
start_time=`date +%s`
exit_code="1"
while [[ $exit_code != "0" ]];do
-kubectl get deploy --selector=app="${SERVER_NAME}" --output=jsonpath='{.items[0].status.availableReplicas}'
-exit_code = $?
+kubectl get deploy --namespace "${KUBERNETES_NAMESPACE}" --selector=app="${SERVER_NAME}" --output=jsonpath='{.items[0].status.availableReplicas}'
+exit_code=$?
current_time=`date +%s`
-elapsed_time=$(expr $current_time - $start_time)
-if [[ $elapsed_time > $timeout ]];then
+elapsed_time=$(expr $current_time + 1 - $start_time)
+if [[ $elapsed_time -gt $timeout ]];then
echo "timeout"
exit 1
fi
sleep 2
done

echo "Waiting for the TF Serving deployment to have at least one available replica..."
start_time=`date +%s`
-while [[ $(kubectl get deploy --selector=app="${SERVER_NAME}" --output=jsonpath='{.items[0].status.availableReplicas}') < "1" ]]; do
+while [[ $(kubectl get deploy --namespace "${KUBERNETES_NAMESPACE}" --selector=app="${SERVER_NAME}" --output=jsonpath='{.items[0].status.availableReplicas}') < "1" ]]; do
current_time=`date +%s`
-elapsed_time=$(expr $current_time - $start_time)
-if [[ $elapsed_time > $timeout ]];then
+elapsed_time=$(expr $current_time + 1 - $start_time)
+if [[ $elapsed_time -gt $timeout ]];then
echo "timeout"
exit 1
fi
sleep 5
@@ -153,10 +158,11 @@ echo "Obtaining the pod name..."
start_time=`date +%s`
pod_name=""
while [[ $pod_name == "" ]];do
-pod_name=$(kubectl get pods --selector=app=${SERVER_NAME} --template '{{range .items}}{{.metadata.name}}{{"\n"}}{{end}}')
+pod_name=$(kubectl get pods --namespace "${KUBERNETES_NAMESPACE}" --selector=app="${SERVER_NAME}" --template '{{range .items}}{{.metadata.name}}{{"\n"}}{{end}}')
current_time=`date +%s`
-elapsed_time=$(expr $current_time - start_time)
-if [[ $elapsed_time > $timeout ]];then
+elapsed_time=$(expr $current_time + 1 - $start_time)
+if [[ $elapsed_time -gt $timeout ]];then
echo "timeout"
exit 1
fi
sleep 2
@@ -168,21 +174,23 @@ echo "Waiting for the TF Serving pod to start running..."
start_time=`date +%s`
exit_code="1"
while [[ $exit_code != "0" ]];do
-kubectl get po ${pod_name} -o jsonpath='{.status.containerStatuses[0].state.running}'
-exit_code = $?
+kubectl get po ${pod_name} --namespace "${KUBERNETES_NAMESPACE}" -o jsonpath='{.status.containerStatuses[0].state.running}'
+exit_code=$?
current_time=`date +%s`
-elapsed_time=$(expr $current_time - $start_time)
-if [[ $elapsed_time > $timeout ]];then
+elapsed_time=$(expr $current_time + 1 - $start_time)
+if [[ $elapsed_time -gt $timeout ]];then
echo "timeout"
exit 1
fi
sleep 2
done

start_time=`date +%s`
-while [ -z "$(kubectl get po ${pod_name} -o jsonpath='{.status.containerStatuses[0].state.running}')" ]; do
+while [ -z "$(kubectl get po ${pod_name} --namespace "${KUBERNETES_NAMESPACE}" -o jsonpath='{.status.containerStatuses[0].state.running}')" ]; do
current_time=`date +%s`
-elapsed_time=$(expr $current_time - $start_time)
-if [[ $elapsed_time > $timeout ]];then
+elapsed_time=$(expr $current_time + 1 - $start_time)
+if [[ $elapsed_time -gt $timeout ]];then
echo "timeout"
exit 1
fi
sleep 5
@@ -191,4 +199,4 @@ done
# Wait a little while and then grab the logs of the running server
sleep 10
echo "Logs from the TF Serving pod:"
-kubectl logs ${pod_name}
+kubectl logs ${pod_name} --namespace "${KUBERNETES_NAMESPACE}"
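
The deploy.sh changes above come down to three recurring fixes: every kubectl call is scoped with --namespace "${KUBERNETES_NAMESPACE}", assignments drop the illegal spaces (exit_code=$? instead of exit_code = $?, which bash parses as a command named exit_code), and the timeout check uses the arithmetic -gt operator instead of >, which inside [[ ]] is a lexicographic string comparison (so "9" > "1000" is true). A minimal standalone sketch of the corrected wait-with-timeout pattern, not the exact script, with variable names mirroring the ones above:

```sh
#!/bin/bash
# Sketch of the polling pattern deploy.sh uses while waiting for TF Serving.
KUBERNETES_NAMESPACE="${KUBERNETES_NAMESPACE:-default}"
SERVER_NAME="${SERVER_NAME:-model-server}"
timeout=1000
start_time=$(date +%s)

while true; do
  # Scope the query to the target namespace so the loop watches the right deployment.
  replicas=$(kubectl get deploy --namespace "${KUBERNETES_NAMESPACE}" \
    --selector=app="${SERVER_NAME}" \
    --output=jsonpath='{.items[0].status.availableReplicas}' 2>/dev/null)
  exit_code=$?   # no spaces around "=": "exit_code = $?" would fail
  if [[ $exit_code -eq 0 && -n "$replicas" && "$replicas" -ge 1 ]]; then
    break
  fi
  elapsed_time=$(( $(date +%s) - start_time ))
  # -gt compares numbers; ">" inside [[ ]] compares strings lexicographically.
  if [[ $elapsed_time -gt $timeout ]]; then
    echo "timeout"
    exit 1
  fi
  sleep 2
done
```
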
10 changes: 5 additions & 5 deletions samples/README.md
@@ -125,17 +125,17 @@ Each pipeline is identified as a python function. For example:

```python
@kfp.dsl.pipeline(
-name='TFMA Trainer',
-description='A trainer that does end-to-end training for TFMA models.'
+name='TFX Trainer',
+description='A trainer that does end-to-end training for TFX models.'
)
def train(
output_path,
train_data=kfp.dsl.PipelineParam('train-data',
-value='gs://ml-pipeline-playground/tfma/taxi-cab-classification/train.csv'),
+value='gs://ml-pipeline-playground/tfx/taxi-cab-classification/train.csv'),
eval_data=kfp.dsl.PipelineParam('eval-data',
-value='gs://ml-pipeline-playground/tfma/taxi-cab-classification/eval.csv'),
+value='gs://ml-pipeline-playground/tfx/taxi-cab-classification/eval.csv'),
schema=kfp.dsl.PipelineParam('schema',
-value='gs://ml-pipeline-playground/tfma/taxi-cab-classification/schema.json'),
+value='gs://ml-pipeline-playground/tfx/taxi-cab-classification/schema.json'),
target=kfp.dsl.PipelineParam('target', value='tips'),
learning_rate=kfp.dsl.PipelineParam('learning-rate', value=0.1),
hidden_layer_size=kfp.dsl.PipelineParam('hidden-layer-size', value='100,50'),
File renamed without changes.
@@ -106,7 +106,7 @@ def dataflow_tf_predict_op(evaluation_data: 'GcsUri', schema: 'GcsUri[text/json]
def kubeflow_deploy_op(model: 'TensorFlow model', tf_server_name, step_name='deploy'):
return dsl.ContainerOp(
name = step_name,
-image = 'gcr.io/ml-pipeline/ml-pipeline-kubeflow-deployer:0.0.42',
+image = 'gcr.io/ml-pipeline/ml-pipeline-kubeflow-deployer:dev', #TODO: change the tag to the release versions when new releases are built with the updated image
arguments = [
'--model-path', model,
'--server-name', tf_server_name
@@ -115,7 +115,7 @@ def kubeflow_deploy_op(model: 'TensorFlow model', tf_server_name, step_name='dep


@dsl.pipeline(
-name='TFMA Taxi Cab Classification Pipeline Example',
+name='TFX Taxi Cab Classification Pipeline Example',
description='Example pipeline that does classification with model analysis based on a public BigQuery dataset.'
)
def taxi_cab_classification(
@@ -124,23 +124,23 @@ def taxi_cab_classification(

column_names: dsl.PipelineParam=dsl.PipelineParam(
name='column-names',
-value='gs://ml-pipeline-playground/tfma/taxi-cab-classification/column-names.json'),
+value='gs://ml-pipeline-playground/tfx/taxi-cab-classification/column-names.json'),
key_columns: dsl.PipelineParam=dsl.PipelineParam(
name='key-columns',
value='trip_start_timestamp'),
train: dsl.PipelineParam=dsl.PipelineParam(
name='train',
-value='gs://ml-pipeline-playground/tfma/taxi-cab-classification/train.csv'),
+value='gs://ml-pipeline-playground/tfx/taxi-cab-classification/train.csv'),
evaluation: dsl.PipelineParam=dsl.PipelineParam(
name='evaluation',
-value='gs://ml-pipeline-playground/tfma/taxi-cab-classification/eval.csv'),
+value='gs://ml-pipeline-playground/tfx/taxi-cab-classification/eval.csv'),
validation_mode: dsl.PipelineParam=dsl.PipelineParam(
name='validation-mode', value='local'),
preprocess_mode: dsl.PipelineParam=dsl.PipelineParam(
name='preprocess-mode', value='local'),
preprocess_module: dsl.PipelineParam=dsl.PipelineParam(
name='preprocess-module',
-value='gs://ml-pipeline-playground/tfma/taxi-cab-classification/preprocessing.py'),
+value='gs://ml-pipeline-playground/tfx/taxi-cab-classification/preprocessing.py'),
target: dsl.PipelineParam=dsl.PipelineParam(
name='target', value='tips'),
learning_rate: dsl.PipelineParam=dsl.PipelineParam(name='learning-rate', value=0.1),
2 changes: 1 addition & 1 deletion test/presubmit-tests.sh
@@ -72,7 +72,7 @@ if [ "$CLUSTER_TYPE" == "create-gke" ]; then
echo "Delete cluster..."
gcloud container clusters delete ${TEST_CLUSTER} --async
}
-trap delete_cluster EXIT
+#trap delete_cluster EXIT

gcloud config set project ml-pipeline-test
gcloud config set compute/zone us-central1-a
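
For context, the line commented out above registers a bash EXIT trap: delete_cluster runs whenever presubmit-tests.sh exits, so the GKE test cluster is torn down automatically; with the trap disabled, the cluster stays up after the run. A minimal sketch of that pattern (the TEST_CLUSTER default is illustrative):

```sh
#!/bin/bash
# Sketch of the cleanup-on-exit pattern from presubmit-tests.sh.
TEST_CLUSTER="${TEST_CLUSTER:-sample-test-cluster}"   # illustrative default

delete_cluster() {
  echo "Delete cluster..."
  gcloud container clusters delete "${TEST_CLUSTER}" --async
}

# Run delete_cluster whenever the script exits, on success or failure.
trap delete_cluster EXIT
```
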
2 changes: 1 addition & 1 deletion test/sample-test/Dockerfile
@@ -16,7 +16,7 @@ RUN pip3 install minio

COPY ./run_test.sh /run_test.sh
COPY ./run_kubeflow_test.py /run_kubeflow_test.py
-COPY ./run_tfma_test.py /run_tfma_test.py
+COPY ./run_tfx_test.py /run_tfx_test.py
COPY ./utils.py /utils.py
RUN chmod +x /run_test.sh

12 changes: 6 additions & 6 deletions test/sample-test/run_test.sh
@@ -130,12 +130,12 @@ if [ "$TEST_NAME" == 'tf-training' ]; then

echo "Copy the test results to GCS ${RESULTS_GCS_DIR}/"
gsutil cp ${SAMPLE_KUBEFLOW_TEST_RESULT} ${RESULTS_GCS_DIR}/${SAMPLE_KUBEFLOW_TEST_RESULT}
elif [ "$TEST_NAME" == "tfma" ]; then
SAMPLE_TFMA_TEST_RESULT=junit_SampleTFMAOutput.xml
SAMPLE_TFMA_TEST_OUTPUT=${RESULTS_GCS_DIR}
elif [ "$TEST_NAME" == "tfx" ]; then
SAMPLE_TFX_TEST_RESULT=junit_SampleTFXOutput.xml
SAMPLE_TFX_TEST_OUTPUT=${RESULTS_GCS_DIR}

# Compile samples
-cd ${BASE_DIR}/samples/tfma
+cd ${BASE_DIR}/samples/tfx
DATAFLOW_TFT_IMAGE_FOR_SED=$(echo ${DATAFLOW_TFT_IMAGE}|sed -e "s/\//\\\\\//g"|sed -e "s/\./\\\\\./g")
DATAFLOW_PREDICT_IMAGE_FOR_SED=$(echo ${DATAFLOW_PREDICT_IMAGE}|sed -e "s/\//\\\\\//g"|sed -e "s/\./\\\\\./g")
DATAFLOW_TFDV_IMAGE_FOR_SED=$(echo ${DATAFLOW_TFDV_IMAGE}|sed -e "s/\//\\\\\//g"|sed -e "s/\./\\\\\./g")
@@ -153,8 +153,8 @@ elif [ "$TEST_NAME" == "tfma" ]; then
dsl-compile --py taxi-cab-classification-pipeline.py --output taxi-cab-classification-pipeline.tar.gz

cd /
-python3 run_tfma_test.py --input ${BASE_DIR}/samples/tfma/taxi-cab-classification-pipeline.tar.gz --result $SAMPLE_TFMA_TEST_RESULT --output $SAMPLE_TFMA_TEST_OUTPUT
+python3 run_tfx_test.py --input ${BASE_DIR}/samples/tfx/taxi-cab-classification-pipeline.tar.gz --result $SAMPLE_TFX_TEST_RESULT --output $SAMPLE_TFX_TEST_OUTPUT

echo "Copy the test results to GCS ${RESULTS_GCS_DIR}/"
-gsutil cp ${SAMPLE_TFMA_TEST_RESULT} ${RESULTS_GCS_DIR}/${SAMPLE_TFMA_TEST_RESULT}
+gsutil cp ${SAMPLE_TFX_TEST_RESULT} ${RESULTS_GCS_DIR}/${SAMPLE_TFX_TEST_RESULT}
fi
@@ -47,7 +47,7 @@ def parse_arguments():
def main():
args = parse_arguments()
test_cases = []
-test_name = 'TFMA Sample Test'
+test_name = 'TFX Sample Test'

###### Initialization ######
client = Client()
@@ -59,13 +59,13 @@ def main():
exit()

###### Create Experiment ######
-experiment_name = 'TFMA sample experiment'
+experiment_name = 'TFX sample experiment'
response = client.create_experiment(experiment_name)
experiment_id = response.id
utils.add_junit_test(test_cases, 'create experiment', True)

###### Create Job ######
-job_name = 'TFMA_sample'
+job_name = 'TFX_sample'
params = {'output': args.output,
'project': 'ml-pipeline-test',
'column-names': 'gs://ml-pipeline-dataset/sample-test/taxi-cab-classification/column-names.json',
6 changes: 3 additions & 3 deletions test/sample_test.yaml
@@ -100,7 +100,7 @@ spec:
- name: image-suffix
value: "{{inputs.parameters.dataflow-tfma-image-suffix}}"
- name: build-script
-value: components/dataflow/containers/tfma/build.sh
+value: components/dataflow/containers/tfx/build.sh
- name: build-dataflow-tfdv-image
template: build-image-by-script
arguments:
@@ -201,7 +201,7 @@
value: "gcr.io/{{steps.get-project.outputs.result}}/{{inputs.parameters.commit-sha}}/{{inputs.parameters.sample-tests-image-suffix}}"
- name: test-name
value: "tf-training"
-- name: run-tfma-tests
+- name: run-tfx-tests
template: run-sample-tests
arguments:
parameters:
Expand All @@ -226,7 +226,7 @@ spec:
- name: sample-tests-image
value: "gcr.io/{{steps.get-project.outputs.result}}/{{inputs.parameters.commit-sha}}/{{inputs.parameters.sample-tests-image-suffix}}"
- name: test-name
value: "tfma"
value: "tfx"

- name: get-project
script: