Make distributed training work; Create some components to train models
* Check in a ksonnet component to train a model using the tinyparam
  hyperparameter set.

* We want to check in the ksonnet component to facilitate reproducibility.
  We need a better way to separate the particular experiments used for
  the CS demo effort from the jobs we want customers to try.

  Related to kubeflow#239: train a high quality model.

* Check in the cs_demo ks environment; this was being ignored as a result of
  .gitignore

Make distributed training work kubeflow#208

* We got distributed synchronous training to work with Tensor2Tensor 1.10.
* This required creating a simple Python script to start the TF standard
  server and run it as a sidecar of the chief pod and as the main container
  for the workers/ps; a rough sketch of such a script is shown below.

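For illustration only, here is a minimal sketch of what such a standard-server
sidecar might look like; the script actually checked in with this commit may
differ in names and details:

    # Hypothetical sidecar: start a TF standard server from TF_CONFIG.
    import json
    import os

    import tensorflow as tf

    def main():
        tf_config = json.loads(os.environ.get("TF_CONFIG", "{}"))
        cluster = tf.train.ClusterSpec(tf_config.get("cluster", {}))
        task = tf_config.get("task", {})
        server = tf.train.Server(cluster,
                                 job_name=task.get("type"),
                                 task_index=task.get("index", 0))
        # Workers and parameter servers simply serve until the pod is deleted.
        server.join()

    if __name__ == "__main__":
        main()
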
* Rename the model to kf_similarity_transformer to be consistent with other
  code.
  * We don't use the default name so that we don't inadvertently pick up
    the SimilarityTransformer model defined in the Tensor2Tensor project.

* Replace build.sh with a Makefile; this makes it easier to add variant commands.
  * Use the Git hash rather than a random ID as the image tag.
  * Add a label to the docker image to indicate the git version.

* Put the Makefile at the top of the code_search tree; this makes it easier
  to pull in all the different sources for the Docker images.

* Add an option to build the Docker images with GCB; this is more efficient
  when you are on a poor network connection because you don't have to download
  images locally.
    * Use jsonnet to define and parameterize the GCB workflow.

* Build separate docker images for running Dataflow and for running the trainer.
  This helps avoid versioning conflicts caused by different versions of protobuf
  pulled in by the TF version used as the base image vs. the version used
  with Apache Beam.

Fix kubeflow#310 - Training fails with GPUs.

* Changes to support distributed training.
* Simplify t2t-entrypoint.sh so that all we do is parse TF_CONFIG
  and pass the requisite config information as command line arguments;
  everything else can be set in the K8s spec. (A rough sketch of this
  parsing logic appears at the end of this message.)

* Upgrade to T2T 1.10.
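
For illustration only, a rough Python rendering of the kind of TF_CONFIG
parsing the entrypoint performs; the real entrypoint is a shell script, and
the flag names below (--master, --ps_replicas, --worker_replicas, --worker_id,
--sync) are assumptions that may not match the T2T 1.10 flags exactly:

    # Hypothetical sketch of translating TF_CONFIG into trainer flags.
    import json
    import os
    import subprocess
    import sys

    def tf_config_to_flags():
        config = json.loads(os.environ.get("TF_CONFIG", "{}"))
        cluster = config.get("cluster", {})
        task = config.get("task", {})
        master = (cluster.get("master") or ["localhost:2222"])[0]
        return [
            "--master=grpc://" + master,
            "--ps_replicas=%d" % len(cluster.get("ps", [])),
            "--worker_replicas=%d" % (len(cluster.get("worker", [])) + 1),
            "--worker_id=%d" % task.get("index", 0),
            "--sync",
        ]

    if __name__ == "__main__":
        # Append the derived flags to whatever command was passed in
        # (e.g. t2t-trainer) and run it.
        sys.exit(subprocess.call(sys.argv[1:] + tf_config_to_flags()))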
jlewi committed Nov 8, 2018
1 parent 1043bc0 commit da0dd33
Showing 20 changed files with 444 additions and 226 deletions.
110 changes: 1 addition & 109 deletions code_search/.gitignore
@@ -1,109 +1 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

# Virtual Environments
venv/

*.key.json
build/**
83 changes: 83 additions & 0 deletions code_search/Makefile
@@ -0,0 +1,83 @@
IMG = gcr.io/kubeflow-examples/code-search

# List any changed files. We only include files in the notebooks directory
# because that is the code in the docker image.
# In particular we exclude changes to the ksonnet configs.
CHANGED_FILES := $(shell git diff-files --relative=code_search)

ifeq ($(strip $(CHANGED_FILES)),)
# Changed files is empty; not dirty
# Don't include --dirty because it could be dirty if files outside the ones we care
# about changed.
GIT_VERSION := $(shell git log --pretty=format:'%h' -n 1)
else
GIT_VERSION := $(shell git log --pretty=format:'%h' -n 1)-dirty-$(shell git diff | shasum -a256 | cut -c -6)
endif

TAG := $(shell date +v%Y%m%d)-$(GIT_VERSION)
all: build

TF_VERSION=1.11.0

# To build without the cache set the environment variable
# export DOCKER_BUILD_OPTS=--no-cache
build-cpu:
docker build -f "./docker/t2t/Dockerfile" \
-t $(IMG):$(TAG) \
--label=git-versions=$(GIT_VERSION) \
--build-arg BASE_IMAGE_TAG=$(TF_VERSION) \
./
@echo Built $(IMG):$(TAG)

# TODO(jlewi): We could always use build.jsonnet and then just
# parse out the docker build command.
build-gpu:
docker build -f "./docker/t2t/Dockerfile" \
-t $(IMG)-gpu:$(TAG) \
--label=git-versions=$(GIT_VERSION) \
--build-arg BASE_IMAGE_TAG=$(TF_VERSION)-gpu \
./
@echo Built $(IMG)-gpu:$(TAG)

build-dataflow:
docker build -f "./docker/t2t/Dockerfile.dataflow" \
-t $(IMG)-dataflow:$(TAG) \
--label=git-versions=$(GIT_VERSION) \
.
@echo Built $(IMG)-dataflow:$(TAG)

build: build-cpu build-gpu build-dataflow

# Build using GCB. This is useful if we are on a slow internet connection
# and don't want to pull images locally.
build-gcb:
mkdir -p build
jsonnet ./docker/t2t/build.jsonnet --ext-str gitVersion=$(GIT_VERSION) --ext-str tag=$(TAG) \
> ./build/build.json
cp -r ./docker ./build/
cp -r ./src ./build/
rm -rf ./build/src/code_search/dataflow/cli/test_data
rm -rf ./build/src/code_search/t2t/test_data
gcloud builds submit --project=kubeflow-ci --config=./build/build.json ./build


# Build but don't attach the latest tag. This allows manual testing/inspection of the image
# first.
push-cpu: build-cpu
gcloud docker --authorize-only
docker push $(IMG):$(TAG)
@echo Pushed $(IMG):$(TAG)

push-gpu: build-gpu
gcloud docker --authorize-only
docker push $(IMG)-gpu:$(TAG)
@echo Pushed $(IMG)-gpu:$(TAG)

push-trainer: push-cpu push-gpu

push-dataflow: build-dataflow
gcloud docker --authorize-only
docker push $(IMG)-dataflow:$(TAG)
@echo Pushed $(IMG)-dataflow:$(TAG)

push: push-cpu push-gpu push-dataflow
9 changes: 9 additions & 0 deletions code_search/demo/README.md
@@ -24,6 +24,15 @@ jlewi@ ran experiments that produced the following results
| Preprocessed data| gs://code-search-demo/20181104/data/func-doc-pairs-00???-of-00100.csv | This is the output of the Dataflow preprocessing job
| Training data | gs://code-search-demo/20181104/data/kf_github_function_docstring-train-00???-of-00100 | TFRecord files produced by running T2T datagen

## Performance

| hparams | Resources | Steps/sec
|---------|-----------|----------
| transformer_tiny | 1 CPU worker | ~1.8 global steps/sec
| transformer_base_single_gpu | 1 GPU worker (K80) | ~3.22611 global steps/sec
| transformer_base | 1 chief with K80, 8 workers with 1 K80 each, sync training | ~0.0588723 global steps/sec
| transformer_base | 1 chief (no GPU), 8 workers (no GPU), sync training | ~0.707014 global steps/sec




24 changes: 5 additions & 19 deletions code_search/docker/t2t/Dockerfile
@@ -2,36 +2,22 @@ ARG BASE_IMAGE_TAG=1.8.0

FROM tensorflow/tensorflow:$BASE_IMAGE_TAG

RUN pip --no-cache-dir install tensor2tensor~=1.8.0 oauth2client~=4.1.0 &&\
apt-get update && apt-get install -y jq &&\
RUN pip --no-cache-dir install oauth2client~=4.1.0 &&\
apt-get update && apt-get install -y jq git &&\
rm -rf /var/lib/apt/lists/*


# These dependencies are primarily needed with Dataflow
# so we need to install them for Python2.
# We do this before copying the code because we don't want to have to
# reinstall the requirements just because the code changed.
COPY src/requirements.txt /tmp/requirements.txt
RUN pip install -r /tmp/requirements.txt
RUN pip install https://github.com/kubeflow/batch-predict/tarball/master

# install the spacy model
RUN python -m spacy download en

# The version of protobuf that ends up getting installed as a result
# of the above commands is too old (3.5); we end up getting errors
# when importing apache_beam. Upgrading appears to fix this.
RUN pip install -U protobuf==3.6.1
RUN pip --no-cache-dir install tensor2tensor~=1.10.0

ADD src/code_search /app/code_search
ADD src /src

ADD docker/t2t/t2t-entrypoint.sh /usr/local/sbin/t2t-entrypoint
ADD docker/t2t/run_and_wait.sh /usr/local/sbin/run_and_wait.sh
RUN chmod a+x /usr/local/sbin/run_and_wait.sh

WORKDIR /app

ENV PYTHONIOENCODING=utf-8 T2T_USR_DIR=/app/code_search/t2t

VOLUME ["/data", "/output"]

ENTRYPOINT ["bash"]
22 changes: 22 additions & 0 deletions code_search/docker/t2t/Dockerfile.dataflow
@@ -0,0 +1,22 @@
# Dockerfile suitable for submitting Dataflow jobs.
# We don't use the Docker image used for running the training jobs
# because we have different versioning requirements.
FROM python:2.7-jessie

# These dependencies are primarily needed with Dataflow
# so we need to install them for Python2.
# We do this before copying the code because we don't want to have to
# reinstall the requirements just because the code changed.
COPY src/requirements.txt /tmp/requirements.txt
RUN pip install -r /tmp/requirements.txt
RUN pip install https://github.com/kubeflow/batch-predict/tarball/master

# install the spacy model
RUN python -m spacy download en

ADD src/code_search /app/code_search
ADD src /src

WORKDIR /app

ENV PYTHONIOENCODING=utf-8 T2T_USR_DIR=/app/code_search/t2t
1 change: 1 addition & 0 deletions code_search/docker/t2t/README.md
@@ -0,0 +1 @@
To build the docker images, use the Makefile at the root of the code_search tree.
21 changes: 21 additions & 0 deletions code_search/docker/t2t/build.jsonnet
@@ -0,0 +1,21 @@
{

"steps": [
{
"name": "gcr.io/cloud-builders/docker",
"args": ["build", "-t", "gcr.io/kubeflow-examples/code-search:" + std.extVar("tag"),
"--label=git-versions=" + std.extVar("gitVersion"),
"--build-arg", "BASE_IMAGE_TAG=1.11.0",
"./docker/t2t"],
},
{
"name": "gcr.io/cloud-builders/docker",
"args": ["build", "-t", "gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag"),
"--label=git-versions=" + std.extVar("gitVersion"),
"--build-arg", "BASE_IMAGE_TAG=1.11.0-gpu",
"./docker/t2t"],
},
],
"images": ["gcr.io/kubeflow-examples/code-search:" + std.extVar("tag"),
"gcr.io/kubeflow-examples/code-search-gpu:" + std.extVar("tag")],
}
43 changes: 0 additions & 43 deletions code_search/docker/t2t/build.sh

This file was deleted.

12 changes: 12 additions & 0 deletions code_search/docker/t2t/run_and_wait.sh
@@ -0,0 +1,12 @@
#!/usr/bin/env bash
# This script is a thin wrapper for T2T jobs: it runs whatever command was
# entered and then sleeps so that fluentd has time to capture the logs.
# (T2T doesn't respect TF_CONFIG; translating TF_CONFIG into command line
# arguments is handled by t2t-entrypoint, not by this script.)
set -x
env | sort
echo running "$@"
"$@"

# Sleep to give fluentd time to capture logs
sleep 120
