diff --git a/README.md b/README.md
index c48a4aaee..e471d3549 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,4 @@
 ## A repository to host extended examples and tutorials for kubeflow.
 
 1. [Github issue summarization using sequence-to-sequence learning](./github_issue_summarization) by [Hamel Husain](https://github.com/hamelsmu)
+1. [MNIST example using S3 for Training, Serving, and Tensorboard monitoring. Automated using Argo and Kubeflow](./mnist) by [Elson Rodriguez](https://github.com/elsonrodriguez)
diff --git a/mnist/Dockerfile.ksonnet b/mnist/Dockerfile.ksonnet
new file mode 100644
index 000000000..cbb476233
--- /dev/null
+++ b/mnist/Dockerfile.ksonnet
@@ -0,0 +1,41 @@
+# This container is for running ksonnet within Kubernetes
+FROM ubuntu:16.04
+
+ENV KUBECTL_VERSION=v1.9.2 \
+    KSONNET_VERSION=0.8.0
+
+# Run update and install in one layer so a cached, stale package index is never
+# reused, skip recommended extras, and drop the index to keep the layer small.
+# ca-certificates is listed explicitly because curl needs it for HTTPS downloads.
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        ca-certificates \
+        curl \
+    && rm -rf /var/lib/apt/lists/*
+
+# Fetch ksonnet; -f makes curl fail the build on an HTTP error, and the tarball
+# is deleted in the same layer so it does not stay in the image.
+RUN curl -fsSL -o /tmp/ks.tar.gz https://github.com/ksonnet/ksonnet/releases/download/v${KSONNET_VERSION}/ks_${KSONNET_VERSION}_linux_amd64.tar.gz \
+    && tar -zxf /tmp/ks.tar.gz -C /usr/bin/ --strip-components=1 ks_${KSONNET_VERSION}_linux_amd64/ks \
+    && rm /tmp/ks.tar.gz \
+    && chmod +x /usr/bin/ks
+
+RUN curl -fsSL -o /usr/bin/kubectl https://storage.googleapis.com/kubernetes-release/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl \
+    && chmod +x /usr/bin/kubectl
+
+# ksonnet doesn't work without a kubeconfig; the following adds a utility that
+# generates a kubeconfig from a service account. The URLs are pinned to a commit.
+RUN curl -fsSL -o /usr/bin/create-kubeconfig https://raw.githubusercontent.com/zlabjp/kubernetes-scripts/cb265de1d4c4dc4ad0f15f4aaaf5b936dcf639a5/create-kubeconfig \
+    && curl -fsSL -o /usr/bin/create-kubeconfig.LICENSE https://raw.githubusercontent.com/zlabjp/kubernetes-scripts/cb265de1d4c4dc4ad0f15f4aaaf5b936dcf639a5/LICENSE.txt \
+    && chmod +x /usr/bin/create-kubeconfig
+
+# Pre-create a kubeconfig with a "default" context selected.
+RUN kubectl config set-context default --cluster=default \
+    && kubectl config use-context default
+
+ENV USER=root
+
+COPY ksonnet-entrypoint.sh /
+RUN chmod +x /ksonnet-entrypoint.sh
+
+ENTRYPOINT ["/ksonnet-entrypoint.sh"]
diff --git a/mnist/Dockerfile.model b/mnist/Dockerfile.model
new file mode 100644
index 000000000..025a586b0
--- /dev/null
+++ b/mnist/Dockerfile.model
@@ -0,0 +1,8 @@
+#This container contains your model and any helper scripts specific to your model.
+FROM tensorflow/tensorflow:1.5.1
+
+COPY model.py /opt/model.py
+RUN chmod +x /opt/model.py
+
+ENTRYPOINT ["/usr/bin/python"]
+CMD ["/opt/model.py"]
diff --git a/mnist/README.md b/mnist/README.md
new file mode 100644
index 000000000..de5f61978
--- /dev/null
+++ b/mnist/README.md
@@ -0,0 +1,352 @@
+# Training MNIST using Kubeflow, S3, and Argo.
+
+This example guides you through the process of taking an example model, modifying it to run better within Kubeflow, and serving the resulting trained model. We will be using Argo to manage the workflow, Tensorflow's S3 support for saving model training info, Tensorboard to visualize the training, and Kubeflow to deploy the Tensorflow operator and serve the model.
+
+## Prerequisites
+
+Before we get started there are a few requirements.
+
+### Kubernetes Cluster Environment
+
+Your cluster must:
+
+- Be at least version 1.9
+- Have access to an S3-compatible object store ([Amazon S3](https://aws.amazon.com/s3/), [Google Storage](https://cloud.google.com/storage/docs/interoperability), [Minio](https://www.minio.io/kubernetes.html))
+- Contain 3 nodes of at least 8 cores and 16 GB of RAM.
+ +If using GKE, the following will provision a cluster with the required features: + +``` +export CLOUDSDK_CONTAINER_USE_CLIENT_CERTIFICATE=True +gcloud alpha container clusters create ${USER} --enable-kubernetes-alpha --machine-type=n1-standard-8 --num-nodes=3 --disk-size=200 --zone=us-west1-a --cluster-version=1.9.3-gke.0 --image-type=UBUNTU +``` + +NOTE: You must be a Kubernetes admin to follow this guide. If you are not an admin, please contact your local cluster administrator for a client cert, or credentials to pass into the following commands: + +``` +$ kubectl config set-credentials <username> --username=<admin_username> --password=<admin_password> +$ kubectl config set-context <context_name> --cluster=<cluster_name> --user=<username> --namespace=<namespace> +$ kubectl config use-context <context_name> +``` + +### Local Setup + +You also need the following command line tools: + +- [kubectl](https://kubernetes.io/docs/tasks/tools/install-kubectl/) +- [argo](https://github.com/argoproj/argo/blob/master/demo.md#1-download-argo) +- [ksonnet](https://ksonnet.io/#get-started) + +To run the client at the end of the example, you must have the packages listed in [requirements.txt](requirements.txt) installed in your active python environment. + +``` +pip install -r requirements.txt +``` + +NOTE: These instructions rely on Github, and may cause issues if behind a firewall with many Github users. Make sure you [generate and export this token](https://help.github.com/articles/creating-a-personal-access-token-for-the-command-line/): + +``` +export GITHUB_TOKEN=xxxxxxxx +``` + +## Modifying existing examples + +Many examples online use models that are unconfigurable, or don't work well in distributed mode. We will modify one of these [examples](https://github.com/tensorflow/tensorflow/blob/9a24e8acfcd8c9046e1abaac9dbf5e146186f4c2/tensorflow/examples/learn/mnist.py) to be better suited for distributed training and model serving. + +### Prepare model + +There is a delta between existing distributed mnist examples and what's needed to run well as a TFJob. + +Basically, we must: + +1. 
Add options in order to make the model configurable. +1. Use `tf.estimator.train_and_evaluate` to enable model exporting and serving. +1. Define serving signatures for model serving. + +The resulting model is [model.py](model.py). + +### Build and push model image. + +With our code ready, we will now build/push the docker image. + +``` +DOCKER_BASE_URL=docker.io/elsonrodriguez # Put your docker registry here +docker build . --no-cache -f Dockerfile.model -t ${DOCKER_BASE_URL}/mytfmodel:1.0 + +docker push ${DOCKER_BASE_URL}/mytfmodel:1.0 +``` + +## Preparing your Kubernetes Cluster + +With our data and workloads ready, now the cluster must be prepared. We will be deploying the TF Operator, and Argo to help manage our training job. + +In the following instructions we will install our required components to a single namespace. For these instructions we will assume the chosen namespace is `tfworkflow`: + +### Deploying Tensorflow Operator and Argo. + +We are using the Tensorflow operator to automate the deployment of our distributed model training, and Argo to create the overall training pipeline. The easiest way to install these components on your Kubernetes cluster is by using Kubeflow's ksonnet prototypes. 
+ +``` +NAMESPACE=tfworkflow +APP_NAME=my-kubeflow +ks init ${APP_NAME} +cd ${APP_NAME} + +ks registry add kubeflow github.com/kubeflow/kubeflow/tree/master/kubeflow + +ks pkg install kubeflow/core@1a6fc9d0e19e456b784ba1c23c03ec47648819d0 +ks pkg install kubeflow/argo@8d617d68b707d52a5906d38b235e04e540f2fcf7 + +# Deploy TF Operator and Argo +kubectl create namespace ${NAMESPACE} +ks generate core kubeflow-core --name=kubeflow-core --namespace=${NAMESPACE} +ks generate argo kubeflow-argo --name=kubeflow-argo --namespace=${NAMESPACE} + +ks apply default -c kubeflow-core +ks apply default -c kubeflow-argo + +# Switch context for the rest of the example +kubectl config set-context $(kubectl config current-context) --namespace=${NAMESPACE} +cd - + +# Create a user for our workflow +kubectl apply -f tf-user.yaml +``` + +You can check to make sure the components have deployed: + +``` +$ kubectl get pods -l name=tf-job-operator +NAME READY STATUS RESTARTS AGE +tf-job-operator-78757955b-2glvj 1/1 Running 0 1m + +$ kubectl get pods -l app=workflow-controller +NAME READY STATUS RESTARTS AGE +workflow-controller-7d8f4bc5df-4zltg 1/1 Running 0 1m + +$ kubectl get crd +NAME AGE +tfjobs.kubeflow.org 1m +workflows.argoproj.io 1m + +$ argo list +NAME STATUS AGE DURATION +``` + +### Creating secrets for our workflow +For fetching and uploading data, our workflow requires S3 credentials. These credentials will be provided as kubernetes secrets: + +``` +export S3_ENDPOINT=s3.us-west-2.amazonaws.com +export AWS_ENDPOINT_URL=https://${S3_ENDPOINT} +export AWS_ACCESS_KEY_ID=xxxxx +export AWS_SECRET_ACCESS_KEY=xxxxx +export BUCKET_NAME=mybucket + +kubectl create secret generic aws-creds --from-literal=awsAccessKeyID=${AWS_ACCESS_KEY_ID} \ + --from-literal=awsSecretAccessKey=${AWS_SECRET_ACCESS_KEY} +``` + +## Defining your training workflow + +This is the bulk of the work, let's walk through what is needed: + +1. Train the model +1. Export the model +1. 
Serve the model + +Now let's look at how this is represented in our [example workflow](model-train.yaml) + +The argo workflow can be daunting, but basically our steps above extrapolate as follows: + +1. `get-workflow-info`: Generate and set variables for consumption in the rest of the pipeline. +1. `tensorboard`: Tensorboard is spawned, configured to watch the S3 URL for the training output. +1. `train-model`: A TFJob is spawned taking in variables such as number of workers, what path the datasets are at, which model container image, etc. The model is exported at the end. +1. `serve-model`: Optionally, the model is served. + +With our workflow defined, we can now execute it. + +## Submitting your training workflow + +First we need to set a few variables in our workflow. Make sure to set your docker registry or remove the `IMAGE` parameters in order to use our defaults: + +``` +DOCKER_BASE_URL=docker.io/elsonrodriguez # Put your docker registry here +export S3_DATA_URL=s3://${BUCKET_NAME}/data/mnist/ +export S3_TRAIN_BASE_URL=s3://${BUCKET_NAME}/models +export AWS_REGION=us-west-2 +export JOB_NAME=myjob-$(uuidgen | cut -c -5 | tr '[:upper:]' '[:lower:]') +export TF_MODEL_IMAGE=${DOCKER_BASE_URL}/mytfmodel:1.0 +export TF_WORKER=3 +export MODEL_TRAIN_STEPS=200 +``` + +Next, submit your workflow. + +``` +argo submit model-train.yaml -n ${NAMESPACE} --serviceaccount tf-user \ + -p aws-endpoint-url=${AWS_ENDPOINT_URL} \ + -p s3-endpoint=${S3_ENDPOINT} \ + -p aws-region=${AWS_REGION} \ + -p tf-model-image=${TF_MODEL_IMAGE} \ + -p s3-data-url=${S3_DATA_URL} \ + -p s3-train-base-url=${S3_TRAIN_BASE_URL} \ + -p job-name=${JOB_NAME} \ + -p tf-worker=${TF_WORKER} \ + -p model-train-steps=${MODEL_TRAIN_STEPS} \ + -p namespace=${NAMESPACE} +``` + +Your training workflow should now be executing. 
+ +You can verify and keep track of your workflow using the argo commands: +``` +$ argo list +NAME STATUS AGE DURATION +tf-workflow-h7hwh Running 1h 1h + +$ argo get tf-workflow-h7hwh +``` + +## Monitoring + +There are various ways to monitor the workflow/training job. In addition to using `kubectl` to query for the status of `pods`, some basic dashboards are also available. + +### Argo UI + +The Argo UI is useful for seeing what stage your workflow is in: + +``` +PODNAME=$(kubectl get pod -l app=argo-ui -n${NAMESPACE} -o jsonpath='{.items[0].metadata.name}') +kubectl port-forward ${PODNAME} 8001:8001 +``` + +You should now be able to visit [http://127.0.0.1:8001](http://127.0.0.1:8001) to see the status of your workflows. + +### Tensorboard + +Tensorboard is deployed just before training starts. To connect: + +``` +PODNAME=$(kubectl get pod -l app=tensorboard-${JOB_NAME} -o jsonpath='{.items[0].metadata.name}') +kubectl port-forward ${PODNAME} 6006:6006 +``` + +Tensorboard can now be accessed at [http://127.0.0.1:6006](http://127.0.0.1:6006). + +## Using Tensorflow serving + +By default the workflow deploys our model via Tensorflow Serving. 
Included in this example is a client that can query your model and provide results: + +``` +POD_NAME=$(kubectl get pod -l=app=mnist-${JOB_NAME} -o jsonpath='{.items[0].metadata.name}') +kubectl port-forward ${POD_NAME} 9000:9000 & +TF_MNIST_IMAGE_PATH=data/7.png python mnist_client.py +``` + +This should result in output similar to this, depending on how well your model was trained: +``` +outputs { + key: "classes" + value { + dtype: DT_UINT8 + tensor_shape { + dim { + size: 1 + } + } + int_val: 7 + } +} +outputs { + key: "predictions" + value { + dtype: DT_FLOAT + tensor_shape { + dim { + size: 1 + } + dim { + size: 10 + } + } + float_val: 0.0 + float_val: 0.0 + float_val: 0.0 + float_val: 0.0 + float_val: 0.0 + float_val: 0.0 + float_val: 0.0 + float_val: 1.0 + float_val: 0.0 + float_val: 0.0 + } +} + + +............................ +............................ +............................ +............................ +............................ +............................ +............................ +..............@@@@@@........ +..........@@@@@@@@@@........ +........@@@@@@@@@@@@........ +........@@@@@@@@.@@@........ +........@@@@....@@@@........ +................@@@@........ +...............@@@@......... +...............@@@@......... +...............@@@.......... +..............@@@@.......... +..............@@@........... +.............@@@@........... +.............@@@............ +............@@@@............ +............@@@............. +............@@@............. +...........@@@.............. +..........@@@@.............. +..........@@@@.............. +..........@@................ +............................ +Your model says the above number is... 7! +``` + +You can also omit `TF_MNIST_IMAGE_PATH`, and the client will pick a random number from the mnist test data. Run it repeatedly and see how your model fares! + +### Disabling Serving + +Model serving can be turned off by passing in `-p model-serving=false` to the `model-train.yaml` workflow. 
Then if you wish to serve your model after training, use the `model-deploy.yaml` workflow. Simply pass in the name of the desired finished argo workflow as an argument: + +``` +WORKFLOW=<workflow-name> +argo submit model-deploy.yaml -n ${NAMESPACE} -p workflow=${WORKFLOW} --serviceaccount=tf-user +``` + +## Submitting new argo jobs + +If you want to rerun your workflow from scratch, then you will need to provide a new `job-name` to the argo workflow. For example: + +``` +#We're re-using previous variables except JOB_NAME +export JOB_NAME=myawesomejob + +argo submit model-train.yaml -n ${NAMESPACE} --serviceaccount tf-user \ + -p aws-endpoint-url=${AWS_ENDPOINT_URL} \ + -p s3-endpoint=${S3_ENDPOINT} \ + -p aws-region=${AWS_REGION} \ + -p tf-model-image=${TF_MODEL_IMAGE} \ + -p s3-data-url=${S3_DATA_URL} \ + -p s3-train-base-url=${S3_TRAIN_BASE_URL} \ + -p job-name=${JOB_NAME} \ + -p tf-worker=${TF_WORKER} \ + -p model-train-steps=${MODEL_TRAIN_STEPS} \ + -p namespace=${NAMESPACE} +``` + +## Conclusion and Next Steps + +This is an example of what your machine learning pipeline can look like. Feel free to play with the tunables and see if you can increase your model's accuracy (increasing `model-train-steps` can go a long way). 
diff --git a/mnist/data/0.png b/mnist/data/0.png new file mode 100644 index 000000000..2b01c3617 Binary files /dev/null and b/mnist/data/0.png differ diff --git a/mnist/data/1.png b/mnist/data/1.png new file mode 100644 index 000000000..a1d9dd16a Binary files /dev/null and b/mnist/data/1.png differ diff --git a/mnist/data/2.png b/mnist/data/2.png new file mode 100644 index 000000000..e21302bc6 Binary files /dev/null and b/mnist/data/2.png differ diff --git a/mnist/data/3.png b/mnist/data/3.png new file mode 100644 index 000000000..5a948fa67 Binary files /dev/null and b/mnist/data/3.png differ diff --git a/mnist/data/4.png b/mnist/data/4.png new file mode 100644 index 000000000..501447be2 Binary files /dev/null and b/mnist/data/4.png differ diff --git a/mnist/data/5.png b/mnist/data/5.png new file mode 100644 index 000000000..2b22a2df7 Binary files /dev/null and b/mnist/data/5.png differ diff --git a/mnist/data/6.png b/mnist/data/6.png new file mode 100644 index 000000000..96ae62151 Binary files /dev/null and b/mnist/data/6.png differ diff --git a/mnist/data/7.png b/mnist/data/7.png new file mode 100644 index 000000000..b91b82aa9 Binary files /dev/null and b/mnist/data/7.png differ diff --git a/mnist/data/8.png b/mnist/data/8.png new file mode 100644 index 000000000..1149eedd3 Binary files /dev/null and b/mnist/data/8.png differ diff --git a/mnist/data/9.png b/mnist/data/9.png new file mode 100644 index 000000000..5b469fba2 Binary files /dev/null and b/mnist/data/9.png differ diff --git a/mnist/ksonnet-entrypoint.sh b/mnist/ksonnet-entrypoint.sh new file mode 100644 index 000000000..93e9f9ce2 --- /dev/null +++ b/mnist/ksonnet-entrypoint.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +SERVICE_ACCOUNT=${SERVICE_ACCOUNT:-default} + +create-kubeconfig ${SERVICE_ACCOUNT} > kubeconfig.tmp +cp kubeconfig.tmp ~/.kube/config + +kubectl config set-cluster default --server="https://${KUBERNETES_SERVICE_HOST}:${KUBERNETES_SERVICE_PORT}" + +exec /bin/bash "$@" diff --git 
a/mnist/mnist_client.py b/mnist/mnist_client.py new file mode 100644 index 000000000..7d8558f92 --- /dev/null +++ b/mnist/mnist_client.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python2.7 + +import os +import random +import numpy + +from PIL import Image + +import tensorflow as tf +from tensorflow.examples.tutorials.mnist import input_data +from tensorflow_serving.apis import predict_pb2 +from tensorflow_serving.apis import prediction_service_pb2 + +from grpc.beta import implementations + +from mnist import MNIST # pylint: disable=no-name-in-module + +TF_MODEL_SERVER_HOST = os.getenv("TF_MODEL_SERVER_HOST", "127.0.0.1") +TF_MODEL_SERVER_PORT = int(os.getenv("TF_MODEL_SERVER_PORT", 9000)) +TF_DATA_DIR = os.getenv("TF_DATA_DIR", "/tmp/data/") +TF_MNIST_IMAGE_PATH = os.getenv("TF_MNIST_IMAGE_PATH", None) +TF_MNIST_TEST_IMAGE_NUMBER = int(os.getenv("TF_MNIST_TEST_IMAGE_NUMBER", -1)) + +if TF_MNIST_IMAGE_PATH != None: + raw_image = Image.open(TF_MNIST_IMAGE_PATH) + int_image = numpy.array(raw_image) + image = numpy.reshape(int_image, 784).astype(numpy.float32) +elif TF_MNIST_TEST_IMAGE_NUMBER > -1: + test_data_set = input_data.read_data_sets(TF_DATA_DIR, one_hot=True).test + image = test_data_set.images[TF_MNIST_TEST_IMAGE_NUMBER] +else: + test_data_set = input_data.read_data_sets(TF_DATA_DIR, one_hot=True).test + image = random.choice(test_data_set.images) + +channel = implementations.insecure_channel( + TF_MODEL_SERVER_HOST, TF_MODEL_SERVER_PORT) +stub = prediction_service_pb2.beta_create_PredictionService_stub(channel) + +request = predict_pb2.PredictRequest() +request.model_spec.name = "mnist" +request.model_spec.signature_name = "serving_default" +request.inputs['x'].CopyFrom( + tf.contrib.util.make_tensor_proto(image, shape=[1, 28, 28])) + +result = stub.Predict(request, 10.0) # 10 secs timeout + +print(result) +print(MNIST.display(image, threshold=0)) +print("Your model says the above number is... %d!" 
% + result.outputs["classes"].int_val[0]) diff --git a/mnist/model-deploy.yaml b/mnist/model-deploy.yaml new file mode 100644 index 000000000..f3fd2dcf4 --- /dev/null +++ b/mnist/model-deploy.yaml @@ -0,0 +1,144 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Workflow +metadata: + generateName: tf-workflow- +spec: + entrypoint: deploy-model + # Parameters can be passed/overridden via the argo CLI. + # To override the printed message, run `argo submit` with the -p option: + # $ argo submit examples/arguments-parameters.yaml -p message="goodbye world" + arguments: + parameters: + - name: workflow + value: workflow-name + templates: + - name: deploy-model + steps: + - - name: get-workflow-info + template: get-workflow-info + - - name: serve-model + template: tf-inference + arguments: + parameters: + - name: s3-model-url + value: "{{steps.get-workflow-info.outputs.parameters.s3-model-url}}" + - name: s3-exported-url + value: "{{steps.get-workflow-info.outputs.parameters.s3-exported-url}}" + - name: aws-secret + value: "{{steps.get-workflow-info.outputs.parameters.aws-secret}}" + - name: namespace + value: "{{steps.get-workflow-info.outputs.parameters.namespace}}" + - name: aws-region + value: "{{steps.get-workflow-info.outputs.parameters.aws-region}}" + - name: s3-endpoint + value: "{{steps.get-workflow-info.outputs.parameters.s3-endpoint}}" + - name: s3-use-https + value: "{{steps.get-workflow-info.outputs.parameters.s3-use-https}}" + - name: s3-verify-ssl + value: "{{steps.get-workflow-info.outputs.parameters.s3-verify-ssl}}" + - name: job-name + value: "{{steps.get-workflow-info.outputs.parameters.job-name}}" + - name: tf-serving-image + value: "{{steps.get-workflow-info.outputs.parameters.tf-serving-image}}" + - name: model-serving-servicetype + value: "{{steps.get-workflow-info.outputs.parameters.model-serving-servicetype}}" + - name: model-serving-ks-url + value: "{{steps.get-workflow-info.outputs.parameters.model-serving-ks-url}}" + - name: model-serving-ks-tag + 
+            value: "{{steps.get-workflow-info.outputs.parameters.model-serving-ks-tag}}"
+          - name: model-name
+            value: "{{steps.get-workflow-info.outputs.parameters.model-name}}"
+  - name: get-workflow-info  # reads values from the workflow named by the `workflow` parameter and writes each to /tmp/NAME for the outputs below
+    container:
+      image: nervana/circleci:master
+      imagePullPolicy: Always
+      command: ["bash", "-c", "-x", 'set -eo pipefail; for var in s3-model-url s3-exported-url; do kubectl get workflow {{workflow.parameters.workflow}} -o json | jq -r ".status.nodes[] | select(.name|contains(\"get-workflow-info\")) | .outputs.parameters[] | select(.name == \"${var}\") | .value" > /tmp/${var} ; done; for var in job-name namespace aws-secret aws-region s3-endpoint s3-use-https s3-verify-ssl tf-serving-image model-serving-servicetype model-serving-ks-url model-serving-ks-tag model-name; do kubectl get workflow {{workflow.parameters.workflow}} -o jsonpath="{.spec.arguments.parameters[?(.name==\"${var}\")].value}" > /tmp/${var}; done']  # set -eo pipefail: a failed kubectl/jq fails the step instead of producing empty parameters
+    outputs:
+      parameters:
+      - name: s3-model-url
+        valueFrom:
+          path: /tmp/s3-model-url
+      - name: s3-exported-url
+        valueFrom:
+          path: /tmp/s3-exported-url
+      - name: aws-secret
+        valueFrom:
+          path: /tmp/aws-secret
+      - name: namespace
+        valueFrom:
+          path: /tmp/namespace
+      - name: aws-region
+        valueFrom:
+          path: /tmp/aws-region
+      - name: s3-endpoint
+        valueFrom:
+          path: /tmp/s3-endpoint
+      - name: s3-use-https
+        valueFrom:
+          path: /tmp/s3-use-https
+      - name: s3-verify-ssl
+        valueFrom:
+          path: /tmp/s3-verify-ssl
+      - name: job-name
+        valueFrom:
+          path: /tmp/job-name
+      - name: tf-serving-image
+        valueFrom:
+          path: /tmp/tf-serving-image
+      - name: model-serving-servicetype
+        valueFrom:
+          path: /tmp/model-serving-servicetype
+      - name: model-serving-ks-url
+        valueFrom:
+          path: /tmp/model-serving-ks-url
+      - name: model-serving-ks-tag
+        valueFrom:
+          path: /tmp/model-serving-ks-tag
+      - name: model-name
+        valueFrom:
+          path: /tmp/model-name
+  - name: tf-inference
+    inputs:
+      parameters:
+      - name: s3-model-url
+      - name: s3-exported-url
+      - name: aws-secret
+      - name: namespace
+      - name: aws-region
+      - name: s3-endpoint
+      - name: s3-use-https
+      - name: s3-verify-ssl
+      - name: job-name
+      - name: tf-serving-image
+      - name: model-serving-servicetype
+      - name: model-serving-ks-url
+      - name: model-serving-ks-tag
+      - name: model-name
+    script:
+      image: elsonrodriguez/ksonnet:0.8.0-test6
+      command: ["/ksonnet-entrypoint.sh"]
+      source: |
+        ks init my-model-server
+        cd my-model-server
+        ks registry add kubeflow {{inputs.parameters.model-serving-ks-url}}
+        ks pkg install kubeflow/tf-serving@{{inputs.parameters.model-serving-ks-tag}}
+        ks env add default
+        # TODO change mnist name to be specific to a job. Right now mnist name is required to serve the model.
+        ks generate tf-serving {{inputs.parameters.model-name}} --name=mnist-{{inputs.parameters.job-name}} --namespace={{inputs.parameters.namespace}} --model_path={{inputs.parameters.s3-exported-url}}
+        ks param set {{inputs.parameters.model-name}} model_server_image {{inputs.parameters.tf-serving-image}}
+        ks param set {{inputs.parameters.model-name}} model_name {{inputs.parameters.model-name}}
+        ks param set {{inputs.parameters.model-name}} namespace {{inputs.parameters.namespace}}
+        ks param set {{inputs.parameters.model-name}} service_type {{inputs.parameters.model-serving-servicetype}}
+        ks param set {{inputs.parameters.model-name}} s3_create_secret false
+        ks param set {{inputs.parameters.model-name}} s3_secret_name {{inputs.parameters.aws-secret}}
+        ks param set {{inputs.parameters.model-name}} s3_secret_accesskeyid_key_name awsAccessKeyID
+        ks param set {{inputs.parameters.model-name}} s3_secret_secretaccesskey_key_name awsSecretAccessKey
+        ks param set {{inputs.parameters.model-name}} s3_aws_region {{inputs.parameters.aws-region}}
+        ks param set {{inputs.parameters.model-name}} s3_endpoint {{inputs.parameters.s3-endpoint}}
+        ks param set {{inputs.parameters.model-name}} s3_use_https \'{{inputs.parameters.s3-use-https}}\'
+        ks param set {{inputs.parameters.model-name}} s3_verify_ssl \'{{inputs.parameters.s3-verify-ssl}}\'
+        ks apply default -c {{inputs.parameters.model-name}}
+      #FIXME This doesn't actually work in the current version of argo. We're using a default of `tf-user` in the container entrypoint for now.
+      env:
+      - name: SERVICE_ACCOUNT
+        value: tf-user
diff --git a/mnist/model-train.yaml b/mnist/model-train.yaml
new file mode 100644
index 000000000..23a3e7770
--- /dev/null
+++ b/mnist/model-train.yaml
@@ -0,0 +1,368 @@
+apiVersion: argoproj.io/v1alpha1
+kind: Workflow
+metadata:
+  generateName: tf-workflow-
+spec:
+  entrypoint: tests
+  onExit: exit-handler
+  # Parameters can be passed/overridden via the argo CLI.
+  # To override the printed message, run `argo submit` with the -p option:
+  # $ argo submit examples/arguments-parameters.yaml -p message="goodbye world"
+  arguments:
+    parameters:
+    - name: tf-worker # number of tf workers
+      value: 1
+    - name: tf-ps # number of tf parameter servers
+      value: 2
+    - name: tf-model-image
+      value: elsonrodriguez/mytfmodel:1.0
+    - name: tf-serving-image #this image is a mirror of a private kubeflow-ci image
+      value: elsonrodriguez/model-server:1.0
+    - name: tf-tensorboard-image
+      value: tensorflow/tensorflow:1.5.1
+    - name: ks-image
+      value: elsonrodriguez/ksonnet:0.8.0-test7
+    - name: model-name
+      value: mnist
+    - name: model-hidden-units
+      value: 100
+    - name: model-train-steps
+      value: 200
+    - name: model-batch-size
+      value: 100
+    - name: model-learning-rate
+      value: 0.01
+    - name: model-serving
+      value: true
+    - name: model-serving-servicetype
+      value: ClusterIP
+    - name: model-serving-ks-url
+      value: github.com/kubeflow/kubeflow/tree/master/kubeflow
+    - name: model-serving-ks-tag
+      value: 1f474f30
+    - name: job-name
+      value: myjob
+    - name: namespace
+      value: default
+    - name: s3-data-url
+      value: s3://mybucket/data/mnist/
+    - name: s3-train-base-url
+      value: s3://mybucket/models
+    - name: aws-endpoint-url
+      value: https://s3.us-west-1.amazonaws.com
+    - name: s3-endpoint
+      value:
s3.us-west-1.amazonaws.com + - name: s3-use-https + value: true + - name: s3-verify-ssl + value: true + - name: aws-region + value: us-west-1 + - name: aws-secret + value: aws-creds + volumes: + - name: training-data + emptyDir: {} + - name: training-output + templates: + - name: tests + steps: + - - name: get-workflow-info + template: get-workflow-info + - - name: tensorboard + template: tf-tensorboard + arguments: + parameters: + - name: s3-model-url + value: "{{steps.get-workflow-info.outputs.parameters.s3-model-url}}" + - - name: train-model + template: tf-train + arguments: + parameters: + - name: s3-model-url + value: "{{steps.get-workflow-info.outputs.parameters.s3-model-url}}" + - - name: serve-model + template: tf-inference + arguments: + parameters: + - name: s3-exported-url + value: "{{steps.get-workflow-info.outputs.parameters.s3-exported-url}}" + when: "{{workflow.parameters.model-serving}} == true" + - name: exit-handler + steps: + - - name: cleanup + template: clean + - name: get-workflow-info + container: + image: nervana/circleci:master + imagePullPolicy: Always + command: ["bash", "-c", "echo '{{workflow.parameters.s3-train-base-url}}/{{workflow.parameters.job-name}}/' | tr -d '[:space:]' > /tmp/s3-model-url; echo '{{workflow.parameters.s3-train-base-url}}/{{workflow.parameters.job-name}}/export/{{workflow.parameters.model-name}}/' | tr -d '[:space:]' > /tmp/s3-exported-url"] + outputs: + parameters: + - name: s3-model-url + valueFrom: + path: /tmp/s3-model-url + - name: s3-exported-url + valueFrom: + path: /tmp/s3-exported-url + - name: tf-train + inputs: + parameters: + - name: s3-model-url + resource: + action: apply + # NOTE: need to detect master node complete + successCondition: status.state == Succeeded + manifest: | + apiVersion: "kubeflow.org/v1alpha1" + kind: "TFJob" + metadata: + name: {{workflow.parameters.job-name}} + namespace: {{workflow.parameters.namespace}} + spec: + replicaSpecs: + - replicas: 1 + tfReplicaType: MASTER + 
template: + spec: + serviceAccountName: tf-job-operator + containers: + - image: {{workflow.parameters.tf-model-image}} + name: tensorflow + imagePullPolicy: Always + env: + - name: TF_MODEL_DIR + value: {{inputs.parameters.s3-model-url}} + - name: TF_EXPORT_DIR + value: {{workflow.parameters.model-name}} + - name: TF_TRAIN_STEPS + value: "{{workflow.parameters.model-train-steps}}" + - name: TF_TF_BATCH_SIZE + value: "{{workflow.parameters.model-batch-size}}" + - name: TF_LEARNING_RATE + value: "{{workflow.parameters.model-learning-rate}}" + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: {{workflow.parameters.aws-secret}} + key: awsAccessKeyID + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: {{workflow.parameters.aws-secret}} + key: awsSecretAccessKey + - name: AWS_DEFAULT_REGION + value: {{workflow.parameters.aws-region}} + - name: AWS_REGION + value: {{workflow.parameters.aws-region}} + - name: S3_REGION + value: {{workflow.parameters.aws-region}} + - name: S3_USE_HTTPS + value: "{{workflow.parameters.s3-use-https}}" + - name: S3_VERIFY_SSL + value: "{{workflow.parameters.s3-verify-ssl}}" + - name: S3_ENDPOINT + value: {{workflow.parameters.s3-endpoint}} + restartPolicy: OnFailure + - replicas: {{workflow.parameters.tf-worker}} + tfReplicaType: WORKER + template: + spec: + serviceAccountName: tf-job-operator + containers: + - image: {{workflow.parameters.tf-model-image}} + name: tensorflow + imagePullPolicy: Always + env: + - name: TF_MODEL_DIR + value: {{inputs.parameters.s3-model-url}} + - name: TF_EXPORT_DIR + value: {{workflow.parameters.model-name}} + - name: TF_TRAIN_STEPS + value: "{{workflow.parameters.model-train-steps}}" + - name: TF_TF_BATCH_SIZE + value: "{{workflow.parameters.model-batch-size}}" + - name: TF_LEARNING_RATE + value: "{{workflow.parameters.model-learning-rate}}" + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: {{workflow.parameters.aws-secret}} + key: awsAccessKeyID + - name: 
AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: {{workflow.parameters.aws-secret}} + key: awsSecretAccessKey + - name: AWS_DEFAULT_REGION + value: {{workflow.parameters.aws-region}} + - name: AWS_REGION + value: {{workflow.parameters.aws-region}} + - name: S3_REGION + value: {{workflow.parameters.aws-region}} + - name: S3_USE_HTTPS + value: "{{workflow.parameters.s3-use-https}}" + - name: S3_VERIFY_SSL + value: "{{workflow.parameters.s3-verify-ssl}}" + - name: S3_ENDPOINT + value: {{workflow.parameters.s3-endpoint}} + restartPolicy: OnFailure + - replicas: {{workflow.parameters.tf-ps}} + tfReplicaType: PS + template: + spec: + containers: + - image: {{workflow.parameters.tf-model-image}} + name: tensorflow + imagePullPolicy: Always + env: + - name: TF_MODEL_DIR + value: {{inputs.parameters.s3-model-url}} + - name: TF_EXPORT_DIR + value: {{workflow.parameters.model-name}} + - name: TF_TRAIN_STEPS + value: "{{workflow.parameters.model-train-steps}}" + - name: TF_TF_BATCH_SIZE + value: "{{workflow.parameters.model-batch-size}}" + - name: TF_LEARNING_RATE + value: "{{workflow.parameters.model-learning-rate}}" + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: {{workflow.parameters.aws-secret}} + key: awsAccessKeyID + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: {{workflow.parameters.aws-secret}} + key: awsSecretAccessKey + - name: AWS_DEFAULT_REGION + value: {{workflow.parameters.aws-region}} + - name: AWS_REGION + value: {{workflow.parameters.aws-region}} + - name: S3_REGION + value: {{workflow.parameters.aws-region}} + - name: S3_USE_HTTPS + value: "{{workflow.parameters.s3-use-https}}" + - name: S3_VERIFY_SSL + value: "{{workflow.parameters.s3-verify-ssl}}" + - name: S3_ENDPOINT + value: {{workflow.parameters.s3-endpoint}} + restartPolicy: OnFailure + terminationPolicy: + chief: + replicaIndex: 0 + replicaName: MASTER + - name: tf-tensorboard + inputs: + parameters: + - name: s3-model-url + resource: + action: apply + 
manifest: | + apiVersion: extensions/v1beta1 + kind: Deployment + metadata: + labels: + app: tensorboard-{{workflow.parameters.job-name}} + name: tensorboard-{{workflow.parameters.job-name}} + namespace: {{workflow.parameters.namespace}} + spec: + replicas: 1 + selector: + matchLabels: + app: tensorboard-{{workflow.parameters.job-name}} + template: + metadata: + labels: + app: tensorboard-{{workflow.parameters.job-name}} + spec: + containers: + - name: tensorboard-{{workflow.parameters.job-name}} + image: {{workflow.parameters.tf-tensorboard-image}} + imagePullPolicy: Always + command: + - /usr/local/bin/tensorboard + args: + - --logdir + - {{inputs.parameters.s3-model-url}} + env: + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + key: awsAccessKeyID + name: {{workflow.parameters.aws-secret}} + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + key: awsSecretAccessKey + name: {{workflow.parameters.aws-secret}} + - name: AWS_REGION + value: {{workflow.parameters.aws-region}} + - name: S3_REGION + value: {{workflow.parameters.aws-region}} + - name: S3_USE_HTTPS + value: "{{workflow.parameters.s3-use-https}}" + - name: S3_VERIFY_SSL + value: "{{workflow.parameters.s3-verify-ssl}}" + - name: S3_ENDPOINT + value: {{workflow.parameters.s3-endpoint}} + ports: + - containerPort: 6006 + protocol: TCP + dnsPolicy: ClusterFirst + restartPolicy: Always + --- + apiVersion: v1 + kind: Service + metadata: + labels: + app: tensorboard-{{workflow.parameters.job-name}} + name: tensorboard-{{workflow.parameters.job-name}} + namespace: {{workflow.parameters.namespace}} + spec: + ports: + - port: 80 + protocol: TCP + targetPort: 6006 + selector: + app: tensorboard-{{workflow.parameters.job-name}} + sessionAffinity: None + type: ClusterIP + - name: tf-inference + inputs: + parameters: + - name: s3-exported-url + script: + image: "{{workflow.parameters.ks-image}}" + command: ["/ksonnet-entrypoint.sh"] + source: | + ks init my-model-server + cd my-model-server + ks 
registry add kubeflow {{workflow.parameters.model-serving-ks-url}} + ks pkg install kubeflow/tf-serving@{{workflow.parameters.model-serving-ks-tag}} + ks env add default + # TODO change mnist name to be specific to a job. Right now mnist name is required to serve the model. + ks generate tf-serving {{workflow.parameters.model-name}} --name=mnist-{{workflow.parameters.job-name}} --namespace={{workflow.parameters.namespace}} --model_path={{inputs.parameters.s3-exported-url}} + ks param set {{workflow.parameters.model-name}} model_server_image {{workflow.parameters.tf-serving-image}} + ks param set {{workflow.parameters.model-name}} model_name {{workflow.parameters.model-name}} + ks param set {{workflow.parameters.model-name}} namespace {{workflow.parameters.namespace}} + ks param set {{workflow.parameters.model-name}} service_type {{workflow.parameters.model-serving-servicetype}} + ks param set {{workflow.parameters.model-name}} s3_create_secret false + ks param set {{workflow.parameters.model-name}} s3_secret_name {{workflow.parameters.aws-secret}} + ks param set {{workflow.parameters.model-name}} s3_secret_accesskeyid_key_name awsAccessKeyID + ks param set {{workflow.parameters.model-name}} s3_secret_secretaccesskey_key_name awsSecretAccessKey + ks param set {{workflow.parameters.model-name}} s3_aws_region {{workflow.parameters.aws-region}} + ks param set {{workflow.parameters.model-name}} s3_endpoint {{workflow.parameters.s3-endpoint}} + ks param set {{workflow.parameters.model-name}} s3_use_https \'{{workflow.parameters.s3-use-https}}\' + ks param set {{workflow.parameters.model-name}} s3_verify_ssl \'{{workflow.parameters.s3-verify-ssl}}\' + ks apply default -c {{workflow.parameters.model-name}} + #FIXME This doesn't actually work in the current version of argo. We're using a default of `tf-user` in the container entrypoint for now. 
+ env: + - name: SERVICE_ACCOUNT + value: tf-user + - name: clean + container: + image: nervana/circleci:master + imagePullPolicy: Always + command: ["bash", "-c", "kubectl delete tfjob {{workflow.parameters.job-name}} || true"] diff --git a/mnist/model.py b/mnist/model.py new file mode 100644 index 000000000..1d53ece26 --- /dev/null +++ b/mnist/model.py @@ -0,0 +1,178 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""This showcases how simple it is to build image classification networks. + +It follows description from this TensorFlow tutorial: + https://www.tensorflow.org/versions/master/tutorials/mnist/pros/index.html#deep-mnist-for-experts +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys +import numpy as np +import tensorflow as tf + +# Configure model options +TF_DATA_DIR = os.getenv("TF_DATA_DIR", "/tmp/data/") +TF_MODEL_DIR = os.getenv("TF_MODEL_DIR", None) +TF_EXPORT_DIR = os.getenv("TF_EXPORT_DIR", "mnist/") +TF_MODEL_TYPE = os.getenv("TF_MODEL_TYPE", "CNN") +TF_TRAIN_STEPS = int(os.getenv("TF_TRAIN_STEPS", 200)) +TF_BATCH_SIZE = int(os.getenv("TF_BATCH_SIZE", 100)) +TF_LEARNING_RATE = float(os.getenv("TF_LEARNING_RATE", 0.01)) + +N_DIGITS = 10 # Number of digits. +X_FEATURE = 'x' # Name of the input feature. 
+
+
+def conv_model(features, labels, mode):
+  """2-layer convolution model."""
+  # Reshape feature to 4d tensor with 2nd and 3rd dimensions being
+  # image width and height final dimension being the number of color channels.
+  feature = tf.reshape(features[X_FEATURE], [-1, 28, 28, 1])
+
+  # First conv layer will compute 32 features for each 5x5 patch
+  with tf.variable_scope('conv_layer1'):
+    h_conv1 = tf.layers.conv2d(
+        feature,
+        filters=32,
+        kernel_size=[5, 5],
+        padding='same',
+        activation=tf.nn.relu)
+    h_pool1 = tf.layers.max_pooling2d(
+        h_conv1, pool_size=2, strides=2, padding='same')
+
+  # Second conv layer will compute 64 features for each 5x5 patch.
+  with tf.variable_scope('conv_layer2'):
+    h_conv2 = tf.layers.conv2d(
+        h_pool1,
+        filters=64,
+        kernel_size=[5, 5],
+        padding='same',
+        activation=tf.nn.relu)
+    h_pool2 = tf.layers.max_pooling2d(
+        h_conv2, pool_size=2, strides=2, padding='same')
+    # reshape tensor into a batch of vectors
+    h_pool2_flat = tf.reshape(h_pool2, [-1, 7 * 7 * 64])
+
+  # Densely connected layer with 1024 neurons.
+  h_fc1 = tf.layers.dense(h_pool2_flat, 1024, activation=tf.nn.relu)
+  h_fc1 = tf.layers.dropout(
+      h_fc1,
+      rate=0.5,
+      training=(mode == tf.estimator.ModeKeys.TRAIN))  # dropout is active only while training
+
+  # Compute logits (1 per class) and compute loss.
+  logits = tf.layers.dense(h_fc1, N_DIGITS, activation=None)
+  predict = tf.nn.softmax(logits)
+  classes = tf.cast(tf.argmax(predict, 1), tf.uint8)
+
+  # Compute predictions.
+  predicted_classes = tf.argmax(logits, 1)
+  if mode == tf.estimator.ModeKeys.PREDICT:
+    predictions = {
+        'class': predicted_classes,
+        'prob': tf.nn.softmax(logits)
+    }
+    return tf.estimator.EstimatorSpec(mode, predictions=predictions,
+                                      export_outputs={'classes':
+                                                      tf.estimator.export.PredictOutput({"predictions": predict,
+                                                                                         "classes": classes})})
+
+  # Compute loss.
+  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
+
+  # Create training op.
+  if mode == tf.estimator.ModeKeys.TRAIN:
+    optimizer = tf.train.GradientDescentOptimizer(
+        learning_rate=TF_LEARNING_RATE)
+    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
+    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
+
+  # Compute evaluation metrics.
+  eval_metric_ops = {
+      'accuracy': tf.metrics.accuracy(
+          labels=labels, predictions=predicted_classes)
+  }
+  return tf.estimator.EstimatorSpec(
+      mode, loss=loss, eval_metric_ops=eval_metric_ops)
+
+
+def cnn_serving_input_receiver_fn():  # serving input for the CNN: float32 placeholder of shape [None, 28, 28]
+  inputs = {X_FEATURE: tf.placeholder(tf.float32, [None, 28, 28])}
+  return tf.estimator.export.ServingInputReceiver(inputs, inputs)
+
+
+def linear_serving_input_receiver_fn():  # serving input for the linear model: float32 placeholder of shape (784,)
+  inputs = {X_FEATURE: tf.placeholder(tf.float32, (784,))}
+  return tf.estimator.export.ServingInputReceiver(inputs, inputs)
+
+
+def main(unused_args):  # pylint: disable=unused-argument
+  tf.logging.set_verbosity(tf.logging.INFO)
+
+  # Download and load MNIST dataset.
+  mnist = tf.contrib.learn.datasets.DATASETS['mnist'](TF_DATA_DIR)
+  train_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={X_FEATURE: mnist.train.images},
+      y=mnist.train.labels.astype(np.int32),
+      batch_size=TF_BATCH_SIZE,
+      num_epochs=None,
+      shuffle=True)
+  test_input_fn = tf.estimator.inputs.numpy_input_fn(
+      x={X_FEATURE: mnist.test.images},  # evaluate on the held-out test split, not the training data
+      y=mnist.test.labels.astype(np.int32),
+      num_epochs=1,
+      shuffle=False)
+
+  if TF_MODEL_TYPE == "LINEAR":
+    # Linear classifier.
+    feature_columns = [
+        tf.feature_column.numeric_column(
+            X_FEATURE, shape=mnist.train.images.shape[1:])]
+
+    classifier = tf.estimator.LinearClassifier(
+        feature_columns=feature_columns, n_classes=N_DIGITS, model_dir=TF_MODEL_DIR)
+    classifier.train(input_fn=train_input_fn, steps=TF_TRAIN_STEPS)
+    scores = classifier.evaluate(input_fn=test_input_fn)
+    print('Accuracy (LinearClassifier): {0:f}'.format(scores['accuracy']))
+    # FIXME This doesn't seem to work. sticking to CNN for the example for now.
+    classifier.export_savedmodel(
+        TF_EXPORT_DIR, linear_serving_input_receiver_fn)
+  elif TF_MODEL_TYPE == "CNN":
+    # Convolutional network
+    training_config = tf.estimator.RunConfig(
+        model_dir=TF_MODEL_DIR, save_summary_steps=100, save_checkpoints_steps=1000)
+    classifier = tf.estimator.Estimator(
+        model_fn=conv_model, model_dir=TF_MODEL_DIR, config=training_config)
+    export_final = tf.estimator.FinalExporter(
+        TF_EXPORT_DIR, serving_input_receiver_fn=cnn_serving_input_receiver_fn)
+    train_spec = tf.estimator.TrainSpec(
+        input_fn=train_input_fn, max_steps=TF_TRAIN_STEPS)
+    eval_spec = tf.estimator.EvalSpec(input_fn=test_input_fn,
+                                      steps=1,
+                                      exporters=export_final,
+                                      throttle_secs=1,
+                                      start_delay_secs=1)
+    tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec)
+  else:
+    print("No such model type: %s" % TF_MODEL_TYPE)
+    sys.exit(1)
+
+
+if __name__ == '__main__':
+  tf.app.run()
diff --git a/mnist/requirements.txt b/mnist/requirements.txt
new file mode 100644
index 000000000..fd2085ac9
--- /dev/null
+++ b/mnist/requirements.txt
@@ -0,0 +1,22 @@
+absl-py==0.1.10
+backports.weakref==1.0.post1
+bleach==1.5.0
+enum34==1.1.6
+funcsigs==1.0.2
+futures==3.2.0
+grpc==0.3.post19
+grpcio==1.9.1
+html5lib==0.9999999
+Markdown==2.6.11
+mock==2.0.0
+msgpack-python==0.5.4
+numpy==1.14.0
+pbr==3.1.1
+Pillow==5.0.0
+protobuf==3.5.1
+python-mnist==0.5
+six==1.11.0
+tensorflow==1.5.0
+tensorflow-serving-api==1.4.0
+tensorflow-tensorboard==1.5.1
+Werkzeug==0.14.1
diff --git a/mnist/tf-user.yaml b/mnist/tf-user.yaml
new file mode 100644
index 000000000..14db65573
--- /dev/null
+++ b/mnist/tf-user.yaml
@@ -0,0 +1,97 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: tf-user
+rules:
+- apiGroups:
+  - ""
+  resources:
+  - pods
+  - pods/exec
+  verbs:
+  - create
+  - get
+  - list
+  - watch
+  - update
+  - patch
+- apiGroups:
+  - ""
+  resources:
+  - configmaps
+  - serviceaccounts
+  - secrets
+  verbs:
+  - get
+  - watch
+  - list
+- apiGroups:
+  - ""
+  resources:
+  - persistentvolumeclaims
+  verbs:
+  - create
+  - delete
+- apiGroups:
+  - ""
+  resources:
+  - services
+  verbs:
+  - create
+  - get
+  - list
+  - watch
+  - update
+  - patch
+- apiGroups:
+  - apps
+  - extensions
+  resources:
+  - deployments
+  verbs:
+  - create
+  - get
+  - list
+  - watch
+  - update
+  - patch
+  - delete
+- apiGroups:
+  - argoproj.io
+  resources:
+  - workflows
+  verbs:
+  - get
+  - list
+  - watch
+  - update
+  - patch
+- apiGroups:
+  - kubeflow.org
+  resources:
+  - tfjobs
+  verbs:
+  - create
+  - get
+  - list
+  - watch
+  - update
+  - patch
+  - delete
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: tf-user
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: tf-user
+subjects:
+- kind: ServiceAccount
+  name: tf-user
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: tf-user