diff --git a/images/gcb_build/image_build.json b/images/gcb_build/image_build.json index 55db86ce4..741787669 100644 --- a/images/gcb_build/image_build.json +++ b/images/gcb_build/image_build.json @@ -1,6 +1,6 @@ { "images": [ - "gcr.io/kubeflow-ci/test-worker:v20190207-281ec9b-dirty-939141", + "gcr.io/kubeflow-ci/test-worker:v20190302-c0829e8-dirty-f1d98c", "gcr.io/kubeflow-ci/test-worker:latest" ], "steps": [ @@ -19,7 +19,7 @@ "args": [ "build", "-t", - "gcr.io/kubeflow-ci/test-worker:v20190207-281ec9b-dirty-939141", + "gcr.io/kubeflow-ci/test-worker:v20190302-c0829e8-dirty-f1d98c", "--label=git-versions=", "--file=./Dockerfile", "--cache-from=gcr.io/kubeflow-ci/test-worker:latest", @@ -34,7 +34,7 @@ { "args": [ "tag", - "gcr.io/kubeflow-ci/test-worker:v20190207-281ec9b-dirty-939141", + "gcr.io/kubeflow-ci/test-worker:v20190302-c0829e8-dirty-f1d98c", "gcr.io/kubeflow-ci/test-worker:latest" ], "id": "tag-test-worker", diff --git a/py/kubeflow/testing/create_kf_instance.py b/py/kubeflow/testing/create_kf_instance.py index f9285821c..c9d3cd18a 100644 --- a/py/kubeflow/testing/create_kf_instance.py +++ b/py/kubeflow/testing/create_kf_instance.py @@ -5,17 +5,39 @@ """ import argparse import logging +import json import os import re import yaml +from googleapiclient import discovery from google.cloud import storage from kubeflow.testing import util from retrying import retry +from oauth2client.client import GoogleCredentials @retry(wait_fixed=60000, stop_max_attempt_number=5) -def kfctl_apply_with_retry(kfctl, cwd, env): - util.run([kfctl, "apply", "all"], cwd=cwd, env=env) +def run_with_retry(*args, **kwargs): + util.run(*args, **kwargs) + +def delete_storage_deployment(project, name): + credentials = GoogleCredentials.get_application_default() + dm = discovery.build("deploymentmanager", "v2", credentials=credentials) + + deployments_client = dm.deployments() + + try: + op = deployments_client.delete(project=project, deployment=name, + deletePolicy="DELETE").execute() + except Exception as e: + if hasattr(e, 'content'): + m = json.loads(e.content) + if m.get("error", {}).get("code") == 404: + return + raise + raise + + util.wait_for_gcp_operation(dm.operations(), project, None, op["name"]) def main(): # pylint: disable=too-many-locals,too-many-statements logging.basicConfig(level=logging.INFO, @@ -27,10 +49,6 @@ def main(): # pylint: disable=too-many-locals,too-many-statements parser = argparse.ArgumentParser() - parser.add_argument( - "--base_name", default="kf-v0-4", type=str, - help=("The base name for the deployment typically kf-vX-Y or kf-vmaster.")) - parser.add_argument( "--project", default="kubeflow-ci", type=str, help=("The project.")) @@ -54,13 +72,13 @@ def main(): # pylint: disable=too-many-locals,too-many-statements type=str, help=("Directory to store kubeflow apps.")) parser.add_argument( - "--deployment_worker_cluster", - default="kubeflow-testing", - type=str, help=("Name of cluster deployment cronjob workers use.")) + "--name", + default="", type=str, help=("Name for the deployment.")) parser.add_argument( - "--cluster_num", - default="", type=int, help=("Number of cluster to deploy to.")) + "--snapshot_file", + default="", type=str, help=("A json file containing information about the " + "snapshot to use.")) parser.add_argument( "--timestamp", @@ -85,15 +103,16 @@ def main(): # pylint: disable=too-many-locals,too-many-statements git_describe = util.run(["git", "describe", "--tags", "--always", "--dirty"], cwd=args.kubeflow_repo).strip("'") - # 
TODO(https://github.com/kubeflow/testing/issues/95): We want to cycle - # between N different names e.g. - # kf-vX-Y-n00, kf-vX-Y-n01, ... kf-vX-Y-n05 - # The reason to reuse names is because for IAP we need to manually - # set the redirect URIs. So we want to cycle between a set of known - # endpoints. We should add logic to automatically recycle deployments. - # i.e. we should find the oldest one and reuse that. - num = args.cluster_num - name = "{0}-n{1:02d}".format(args.base_name, num) + timestamp = args.timestamp + if args.snapshot_file: + logging.info("Loading info from snapshot file %s", args.snapshot_file) + with open(args.snapshot_file) as hf: + snapshot_info = json.load(hf) + name = snapshot_info["name"] + timestamp = snapshot_info.get("timestamp", "") + else: + name = args.name + # Clean up previous deployment. We are not able to run "kfctl delete all" # since we are not able to guarantee apps config in repository is up to date. util.run(["rm", "-rf", name], cwd=args.apps_dir) @@ -103,12 +122,12 @@ def main(): # pylint: disable=too-many-locals,too-many-statements # re-create it. delete_deployment = os.path.join(args.kubeflow_repo, "scripts", "gke", "delete_deployment.sh") + util.run([delete_deployment, "--project=" + args.project, "--deployment=" + name, "--zone=" + args.zone], cwd=args.apps_dir) - # Create a dummy kubeconfig in cronjob worker. - util.run(["gcloud", "container", "clusters", "get-credentials", args.deployment_worker_cluster, - "--zone", args.zone, "--project", args.project], cwd=args.apps_dir) + # Delete script doesn't delete storage deployment by design. + delete_storage_deployment(args.project, name + "-storage") app_dir = os.path.join(args.apps_dir, name) kfctl = os.path.join(args.kubeflow_repo, "scripts", "kfctl.sh") @@ -125,8 +144,8 @@ def main(): # pylint: disable=too-many-locals,too-many-statements "PURPOSE": "kf-test-cluster", }, } - if args.timestamp: - app["labels"]["SNAPSHOT_TIMESTAMP"] = args.timestamp + if timestamp: + app["labels"]["SNAPSHOT_TIMESTAMP"] = timestamp if args.job_name: app["labels"]["DEPLOYMENT_JOB"] = args.job_name labels = app.get("labels", {}) @@ -140,24 +159,34 @@ def main(): # pylint: disable=too-many-locals,too-many-statements val = re.sub(r"[^a-z0-9\-_]", "-", val) label_args.append("{key}={val}".format(key=k.lower(), val=val)) - util.run([kfctl, "generate", "all"], cwd=app_dir) - util.run(["ks", "generate", "seldon", "seldon"], cwd=ks_app_dir) env = {} env.update(os.environ) env.update(oauth_info) + + # We need to apply platform before doing generate k8s because we need + # to have a cluster for ksonnet. # kfctl apply all might break during cronjob invocation when depending # components are not ready. Make it retry several times should be enough. 
-  kfctl_apply_with_retry(kfctl, app_dir, env)
+  run_with_retry([kfctl, "generate", "platform"], cwd=app_dir, env=env)
+  run_with_retry([kfctl, "apply", "platform"], cwd=app_dir, env=env)
+  run_with_retry([kfctl, "generate", "k8s"], cwd=app_dir, env=env)
+  run_with_retry([kfctl, "apply", "k8s"], cwd=app_dir, env=env)
+  run_with_retry(["ks", "generate", "seldon", "seldon"], cwd=ks_app_dir, env=env)
 
   logging.info("Annotating cluster with labels: %s", str(label_args))
-  util.run(["gcloud", "container", "clusters", "update", name,
-            "--zone", args.zone,
+
+  # Set labels on the deployment
+  util.run(["gcloud", "--project", args.project,
+            "deployment-manager", "deployments", "update", name,
             "--update-labels", ",".join(label_args)],
-           cwd=app_dir)
+           cwd=app_dir)
+
+  # To work around Let's Encrypt certificate issues, create a self-signed
+  # certificate.
   util.run(["gcloud", "container", "clusters", "get-credentials", name,
             "--zone", args.zone,
-            "--protject", args.project])
+            "--project", args.project])
 
   tls_endpoint = "--host=%s.endpoints.kubeflow-ci.cloud.goog" % name
   util.run(["kube-rsa", tls_endpoint])
   util.run(["kubectl", "-n", "kubeflow", "create", "secret", "tls",
diff --git a/py/kubeflow/testing/util.py b/py/kubeflow/testing/util.py
index e5515f9c0..7d80fb66f 100755
--- a/py/kubeflow/testing/util.py
+++ b/py/kubeflow/testing/util.py
@@ -331,6 +331,51 @@ def wait_for_operation(client,
   # Linter complains if we don't have a return here even though its unreachable.
   return None
 
+def wait_for_gcp_operation(client,
+                           project,
+                           zone,
+                           op_id,
+                           timeout=datetime.timedelta(hours=1),
+                           polling_interval=datetime.timedelta(seconds=5)):
+  """Wait for the specified operation to complete.
+
+  Args:
+    client: Operations client for the API that owns the operation; should
+      have a get method.
+    project: The project.
+    zone: Zone. Set to None if it's a global operation.
+    op_id: Operation id.
+    timeout: A datetime.timedelta expressing the amount of time to wait before
+      giving up.
+    polling_interval: A datetime.timedelta to represent the amount of time to
+      wait between requests polling for the operation status.
+
+  Returns:
+    op: The final operation.
+
+  Raises:
+    TimeoutError: if we timeout waiting for the operation to complete.
+  """
+  endtime = datetime.datetime.now() + timeout
+  while True:
+    if zone:
+      op = client.get(
+        projectId=project, zone=zone, operationId=op_id).execute()
+    else:
+      op = client.get(
+        project=project, operation=op_id).execute()
+
+    status = op.get("status", "")
+    # TODO: Need to handle other statuses.
+    if status == "DONE":
+      return op
+    if datetime.datetime.now() > endtime:
+      raise TimeoutError(
+        "Timed out waiting for op: {0} to complete.".format(op_id))
+    time.sleep(polling_interval.total_seconds())
+
+  # Linter complains if we don't have a return here even though it's unreachable.
+  return None
 
 def configure_kubectl(project, zone, cluster_name):
   logging.info("Configuring kubectl")
diff --git a/test-infra/auto-deploy/Dockerfile b/test-infra/auto-deploy/Dockerfile
index 878cb492d..daa356bfe 100644
--- a/test-infra/auto-deploy/Dockerfile
+++ b/test-infra/auto-deploy/Dockerfile
@@ -1,142 +1,12 @@
 # Docker image for nightly deployment cronjob.
- -FROM ubuntu:xenial +# +FROM gcr.io/kubeflow-ci/test-worker:v20190302-c0829e8-dirty-f1d98c MAINTAINER Gabriel Wen -# Never prompt the user for choices on installation/configuration of packages -ENV DEBIAN_FRONTEND=noninteractive -ENV TERM=linux -ENV LC_ALL=C.UTF-8 -ENV LANG=C.UTF-8 - -# gcc & python-dev are needed so we can install crcmod for gsutil -# also includes installations for Python3 -RUN set -ex \ - && apt-get update -yqq \ - && apt-get install -yqq --no-install-recommends \ - build-essential \ - curl \ - wget \ - git \ - jq \ - zip \ - unzip \ - gcc \ - ssh \ - python-dev \ - python-setuptools \ - python-pip \ - python3-dev \ - python3-setuptools \ - python3-pip \ - && python -V \ - && python3 -V \ - && apt-get clean \ - && rm -rf \ - /var/lib/apt/lists/* \ - /tmp/* \ - /var/tmp/* \ - /usr/share/man \ - /usr/share/doc \ - /usr/share/doc-base - -# Install go -RUN cd /tmp && \ - wget -O /tmp/go.tar.gz https://redirector.gvt1.com/edgedl/go/go1.9.2.linux-amd64.tar.gz && \ - tar -C /usr/local -xzf go.tar.gz - -# Install gcloud -ENV PATH=/usr/local/go/bin:/google-cloud-sdk/bin:/workspace:${PATH} \ - CLOUDSDK_CORE_DISABLE_PROMPTS=1 - -RUN wget -q https://dl.google.com/dl/cloudsdk/channels/rapid/google-cloud-sdk.tar.gz && \ - tar xzf google-cloud-sdk.tar.gz -C / && \ - rm google-cloud-sdk.tar.gz && \ - /google-cloud-sdk/install.sh \ - --disable-installation-options \ - --bash-completion=false \ - --path-update=false \ - --usage-reporting=false && \ - gcloud components install alpha beta - -# Install yarn -RUN curl -sS http://dl.yarnpkg.com/debian/pubkey.gpg | apt-key add - \ - && echo "deb http://dl.yarnpkg.com/debian/ stable main" | tee /etc/apt/sources.list.d/yarn.list \ - && apt-get update -yqq \ - && apt-get install -yqq --no-install-recommends yarn - -# Install glide -RUN cd /tmp && \ - wget -O glide-v0.13.0-linux-amd64.tar.gz \ - https://github.com/Masterminds/glide/releases/download/v0.13.0/glide-v0.13.0-linux-amd64.tar.gz && \ - tar -xvf glide-v0.13.0-linux-amd64.tar.gz && \ - mv ./linux-amd64/glide /usr/local/bin/ - -# Install ksonnet. We install multiple versions of ks to support different versions -# of ksonnet applications. Newer versions of ksonnet are backwards compatible but -# that can require upgrading the app which isn't something we want to be forced to. -# (see https://github.com/kubeflow/testing/issues/220). 
-RUN cd /tmp && \ - wget -O ks.tar.gz \ - https://github.com/ksonnet/ksonnet/releases/download/v0.11.0/ks_0.11.0_linux_amd64.tar.gz && \ - tar -xvf ks.tar.gz && \ - mv ks_0.11.0_linux_amd64/ks /usr/local/bin && \ - chmod a+x /usr/local/bin/ks - -RUN cd /tmp && \ - wget -O ks-12.tar.gz \ - https://github.com/ksonnet/ksonnet/releases/download/v0.12.0/ks_0.12.0_linux_amd64.tar.gz && \ - tar -xvf ks-12.tar.gz && \ - mv ks_0.12.0_linux_amd64/ks /usr/local/bin/ks-12 && \ - chmod a+x /usr/local/bin/ks-12 - -RUN cd /tmp && \ - wget -O ks-13.tar.gz \ - https://github.com/ksonnet/ksonnet/releases/download/v0.13.1/ks_0.13.1_linux_amd64.tar.gz && \ - tar -xvf ks-13.tar.gz && \ - mv ks_0.13.1_linux_amd64/ks /usr/local/bin/ks-13 && \ - chmod a+x /usr/local/bin/ks-13 - -RUN cd /tmp && \ - wget https://github.com/google/jsonnet/archive/v0.11.2.tar.gz && \ - tar -xvf v0.11.2.tar.gz && \ - cd jsonnet-0.11.2 && \ - make && \ - mv jsonnet /usr/local/bin && \ - rm -rf /tmp/v0.11.2.tar.gz && \ - rm -rf /tmp/jsonnet-0.11.2 - -# Install various python libraries for both Python 2 and 3 (for now) -# Don't upgrade pip for now because it seems to be broken -# https://github.com/pypa/pip/issues/5240 -COPY ./Pipfile ./Pipfile.lock /tmp/ - -RUN cd /tmp/ && \ - pip2 install -U wheel filelock && \ - pip2 install pipenv && \ - pipenv install --system --two && \ - pip3 install -U wheel filelock - -RUN pip3 install pipenv==2018.10.9 -RUN cd /tmp/ && pipenv install --system --three - -# Install docker. -RUN curl https://get.docker.com/ | sh - -# Install kubectl -RUN curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.13.0/bin/linux/amd64/kubectl && \ - mv kubectl /usr/local/bin && \ - chmod a+x /usr/local/bin/kubectl - -# Work around for https://github.com/ksonnet/ksonnet/issues/298 -ENV USER root - -# Purpose of init.sh is to have a script as kickstarter. This script is used to pull fresh copy from +# Purpose of auto_deploy.sh is to have a script as kickstarter. This script is used to pull fresh copy from # Github and run with them. COPY checkout_lib /usr/local/bin/py/checkout_lib -COPY lib-args.sh /usr/local/lib -RUN chmod a+x /usr/local/lib/lib-args.sh -COPY init.sh /usr/local/bin -RUN chmod a+x /usr/local/bin/init.sh -COPY checkout-snapshot.sh /usr/local/bin -RUN chmod a+x /usr/local/bin/checkout-snapshot.sh +COPY lib-args.sh /usr/local/bin +RUN chmod a+x /usr/local/bin/lib-args.sh +COPY auto_deploy.sh /usr/local/bin +RUN chmod a+x /usr/local/bin/auto_deploy.sh diff --git a/test-infra/auto-deploy/Makefile b/test-infra/auto-deploy/Makefile index 9980c2438..0e019da33 100644 --- a/test-infra/auto-deploy/Makefile +++ b/test-infra/auto-deploy/Makefile @@ -39,8 +39,10 @@ push: build # Add tag starting hitting problems so tag and push docker tag $(IMG):$(TAG) $(IMG):latest gcloud docker -- push $(IMG):$(TAG) + ./set_images.sh $(IMG):$(TAG) build-gcb: gcloud builds submit --machine-type=n1-highcpu-32 --project=kubeflow-ci \ --tag=$(IMG):$(TAG) \ --timeout=3600 + ./set_images.sh $(IMG):$(TAG) diff --git a/test-infra/auto-deploy/auto_deploy.sh b/test-infra/auto-deploy/auto_deploy.sh new file mode 100755 index 000000000..37a4dc45d --- /dev/null +++ b/test-infra/auto-deploy/auto_deploy.sh @@ -0,0 +1,71 @@ +#!/bin/bash +# +# To run this locally: +# ./auto_deploy.sh --data_dir=/tmp/data --repos='kubeflow/kubeflow;jlewi/testing@auto_manual' --project=kubeflow-ci --base_name=kf-vmaster --max_num_cluster=5 --zone=us-east1-b + +set -ex + +# Include library that helps on argument parsing. 
+DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" > /dev/null && pwd)" +. ${DIR}/lib-args.sh + +# Deployment configs. +required_args=(data_dir repos project job_labels base_name max_num_cluster zone) + +parseArgs $* +validateRequiredArgs ${required_args} + +# Activate service account auth. +if [ ! -z ${GOOGLE_APPLICATION_CREDENTIALS} ]; then + gcloud auth activate-service-account --key-file=${GOOGLE_APPLICATION_CREDENTIALS} + gcloud config list +fi + +export PYTHONPATH="${PYTHONPATH}:/usr/local/bin/py" + +# Extract worker job name using checkout_util. +# We make the data dir dependent on the job name. +# This way if the pod exits with an error and gets retried +# it will use the same directory +if [ ! -z ${job_labels} ]; then + header="from checkout_lib import checkout_util;" + job_name="checkout_util.get_job_name(\"${job_labels}\")" + get_job_name="${header} print(${job_name})" + job_name=$(python -c "${get_job_name}") + echo job_name=${job_name} + data_dir=${data_dir}/${job_name} +fi + +echo data_dir=${data_dir} + +# Get a snapshot of the repos. +python -m checkout_lib.snapshot_kf_deployment \ + --snapshot_repos=${repos} \ + --base_name=${base_name} \ + --project=${project} \ + --job_labels=${job_labels} \ + --data_dir=${data_dir} \ + --max_cluster_num=${max_num_cluster} \ + --github_token_file=${github_token_file} + +# Check out fresh copy of KF and deployment workflow. +python -m checkout_lib.repo_clone_snapshot \ + --data_dir=${data_dir} + +export PYTHONPATH="${PYTHONPATH}:${data_dir}/testing/py" + +# Create the deployment +KF_DIR=${data_dir}/kubeflow + +# Directory where apps should be checked out. +APPS_DIR=${data_dir} + +# Trigger create_kf_instance. +python -m kubeflow.testing.create_kf_instance \ + --kubeflow_repo=${KF_DIR} \ + --apps_dir=${APPS_DIR} \ + --project=${project} \ + --snapshot_file=${data_dir}/snapshot.json \ + --zone=${zone} + +# TODO(gabrielwen): Push changes to app folders to git. \ No newline at end of file diff --git a/test-infra/auto-deploy/checkout-snapshot.sh b/test-infra/auto-deploy/checkout-snapshot.sh deleted file mode 100755 index 34aeb4fda..000000000 --- a/test-infra/auto-deploy/checkout-snapshot.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash -# Simple way to take a snapshot of a github repository at a given commit. -set -xe - -# Include library that helps on argument parsing. -. /usr/local/lib/lib-args.sh - -required_args=(src_dir repo_owner repo_name branch commit_sha) -parseArgs $* -validateRequiredArgs ${required_args} - -ORIGIN_DIR=$PWD - -mkdir -p ${src_dir}/${repo_owner} -REPO_DIR=${src_dir}/${repo_owner}/${repo_name} - -echo "Checking out git repo: ${repo_owner}/${repo_name}.git at branch ${branch}" -git clone --single-branch --branch ${branch} \ - https://github.com/${repo_owner}/${repo_name}.git ${REPO_DIR} - -cd ${REPO_DIR} - -echo "Taking snapshot at ${commit_sha}" -git reset --hard ${commit_sha} - -cd ${ORIGIN_DIR} diff --git a/test-infra/auto-deploy/checkout_lib/checkout_util.py b/test-infra/auto-deploy/checkout_lib/checkout_util.py index 3d01b7b55..e8f452af2 100644 --- a/test-infra/auto-deploy/checkout_lib/checkout_util.py +++ b/test-infra/auto-deploy/checkout_lib/checkout_util.py @@ -37,6 +37,9 @@ def get_job_name(label_file): return job_name raise RuntimeError("Not able to find job_name from labels.") +# TODO(jlewi): I don't think we need this anymore +# We should now be using the downward API to set a unique directory based +# on the pod name. 
def get_snapshot_path(nfs_path, job_name): """Helper function to format folder path for snapshots given mounted NFS path and job name. diff --git a/test-infra/auto-deploy/checkout_lib/repo_clone_snapshot.py b/test-infra/auto-deploy/checkout_lib/repo_clone_snapshot.py index 7024c020d..1894958e8 100644 --- a/test-infra/auto-deploy/checkout_lib/repo_clone_snapshot.py +++ b/test-infra/auto-deploy/checkout_lib/repo_clone_snapshot.py @@ -10,8 +10,6 @@ import os import subprocess -import checkout_util - def main(): logging.basicConfig(level=logging.INFO, format=('%(levelname)s|%(asctime)s' @@ -23,53 +21,39 @@ def main(): parser = argparse.ArgumentParser() parser.add_argument( - "--src_dir", default="", type=str, - help=("Directory to write repositories to.")) - - parser.add_argument( - "--project", default="kubeflow-ci", type=str, help=("The project.")) - - parser.add_argument( - "--zone", default="us-east1-d", type=str, help=("The zone to deploy in.")) - - parser.add_argument( - "--repo_owner", default="kubeflow", type=str, help=("Repository owner.")) - - parser.add_argument( - "--job_labels", - default="/etc/pod-info/labels", - type=str, help=("DownwardAPIVolumeFile for job labels.")) - - parser.add_argument( - "--nfs_path", default="", type=str, help=("GCP Filestore PVC mount path.")) + "--data_dir", default="", type=str, help=("Directory to store the data.")) args = parser.parse_args() - job_name = checkout_util.get_job_name(args.job_labels) - snapshot_path = checkout_util.get_snapshot_path(args.nfs_path, job_name) - logging.info("Job name: %s", job_name) + snapshot_path = os.path.join(args.data_dir, "snapshot.json") logging.info("Reading: %s", snapshot_path) - snapshot = json.load(open(os.path.join(snapshot_path, "snapshot.json"), "r")) + snapshot = json.load(open(snapshot_path, "r")) logging.info("Snapshot: %s", str(snapshot)) - repos = snapshot.get("repos", {}) + repos = snapshot.get("repos", []) + for repo in repos: - branch = repos.get(repo, {}).get("branch", "") - sha = repos.get(repo, {}).get("sha", "") + branch = repo.get("branch", "") + sha = repo.get("sha", "") logging.info("Checking out: %s at branch %s with SHA %s", repo, branch, sha) - subprocess.call(("/usr/local/bin/checkout-snapshot.sh " - "--src_dir={src_dir} " - "--repo_owner={repo_owner} " - "--repo_name={repo_name} " - "--branch={branch} " - "--commit_sha={sha}").format( - src_dir=args.src_dir, - repo_owner=args.repo_owner, - repo_name=repo, - branch=branch, - sha=sha - ), - shell=True) + + target_dir = os.path.join(args.data_dir, repo["repo"]) + if os.path.exists(target_dir): + logging.info("Directory %s already exists; not checking out repo", + repo["repo"]) + continue + git_url = "https://github.com/{repo_owner}/{repo_name}.git".format( + repo_owner=repo["owner"], repo_name=repo["repo"],) + command = ["git", "clone", "--single-branch", "--branch", branch, + git_url, repo["repo"]] + logging.info("Executing: %s", command) + subprocess.check_call(command, cwd=args.data_dir) + + logging.info("Taking snapshot at %s", sha) + command = ["git", "reset", "--hard", sha] + logging.info("Executing: %s", command) + subprocess.check_call(command, cwd=os.path.join(args.data_dir, + repo["repo"])) if __name__ == '__main__': main() diff --git a/test-infra/auto-deploy/checkout_lib/snapshot_kf_deployment.py b/test-infra/auto-deploy/checkout_lib/snapshot_kf_deployment.py index 488bf835f..5e419e921 100644 --- a/test-infra/auto-deploy/checkout_lib/snapshot_kf_deployment.py +++ b/test-infra/auto-deploy/checkout_lib/snapshot_kf_deployment.py @@ 
-11,58 +11,62 @@
 import json
 import logging
 import os
+import re
 
 import requests
 
 import checkout_util
 
-import googleapiclient
 from googleapiclient import discovery
 from oauth2client.client import GoogleCredentials
 
-RESOURCE_LABELS = "resourceLabels"
-SNAPSHOT_TIMESTAMP = "snapshot_timestamp"
-
-def get_deployment_cluster(project, location, base_name, cluster_nums):
+def get_deployment_name(project, base_name, max_num):
   """Retrieve deployment metadata from GCP and choose the oldest cluster.
 
   Args:
     project: Name of GCP project.
-    location: Cluster location.
     base_name: Base name of clusters.
-    cluster_nums: A list of integers as suffix of cluster names.
+    max_num: Maximum number of deployments.
 
   Returns:
-    integer as the number points to the oldest cluster.
+    Name to use for the deployment.
   """
   credentials = GoogleCredentials.get_application_default()
-  container = discovery.build("container", "v1", credentials=credentials)
-  # Using clusters client instead of deployments as deployments API doesn't
-  # return deployment labels anymore.
-  clusters_client = container.projects().locations().clusters()
-  cluster_timestamps = []
-  for n in cluster_nums:
-    cluster = "{0}-n{1:02d}".format(base_name, n)
-    name = "projects/{p}/locations/{l}/clusters/{c}".format(
-      p=project,
-      l=location,
-      c=cluster)
-    logging.info("Getting cluster info: %s", name)
-    try:
-      info = clusters_client.get(name=name, fields=RESOURCE_LABELS).execute()
-      if (RESOURCE_LABELS in info and
-          SNAPSHOT_TIMESTAMP in info.get(RESOURCE_LABELS, {})):
-        cluster_timestamps.append({"num": n, "timestamp": info.get(
-          RESOURCE_LABELS, {}).get(SNAPSHOT_TIMESTAMP, "")})
-    except googleapiclient.errors.HttpError as e:
-      logging.error("Cluster %s not reachable, deploying to it: %s",
-                    cluster, str(e))
-      return n
-
-  if not cluster_timestamps:
-    raise RuntimeError("Not able to find available cluster to deploy to.")
-
-  cluster_timestamps.sort(key=lambda x: x.get("timestamp", ""))
-  return cluster_timestamps[0].get("num", 0)
+
+  dm = discovery.build("deploymentmanager", "v2", credentials=credentials)
+  dm_client = dm.deployments()
+
+  matching = {}
+
+  next_page_token = None
+
+  m = re.compile(base_name + r"-n\d\d$")
+  while True:
+    deployments = dm_client.list(project=project,
+                                 pageToken=next_page_token).execute()
+
+    for d in deployments["deployments"]:
+      if m.match(d["name"]):
+        matching[d["name"]] = d
+
+    if "nextPageToken" not in deployments:
+      break
+    next_page_token = deployments["nextPageToken"]
+
+  # Check if there are any unused deployments.
+
+  allowed = set()
+  for num in range(max_num):
+    allowed.add("{0}-n{1:02d}".format(base_name, num))
+
+  remaining = sorted(allowed - set(matching.keys()))
+
+  if remaining:
+    return remaining[0]
+
+  # Sort matching items by create time and reuse the oldest one.
+  results = list(matching.values())
+  results.sort(key=lambda x: x.get("insertTime", ""))
+  return results[0]["name"]
 
 def repo_snapshot_hash(github_token, repo_owner, repo, branch, snapshot_time):
   """Look into commit history and pick the latest commit SHA.
@@ -112,22 +116,18 @@ def sort_by_time(record):
   return sha_time[0].get("sha", "") # pylint: disable=unsubscriptable-object
 
-def lock_and_write(folder, payload):
-  dirname = os.path.dirname(folder)
-  dir_lock = filelock.FileLock(os.path.join(dirname, "dir.lock"))
-  with dir_lock:
-    if not os.path.exists(folder):
-      os.makedirs(folder)
-    file_lock = filelock.FileLock(os.path.join(folder, "file.lock"))
+def lock_and_write(target, payload):
+  dirname = os.path.dirname(target)
+  if not os.path.exists(dirname):
+    os.makedirs(dirname)
+  file_lock = filelock.FileLock(os.path.join(dirname, "file.lock"))
   with file_lock:
-    path = os.path.join(folder, "snapshot.json")
-    if os.path.exists(path):
+    if os.path.exists(target):
       return
-    logging.info("Writing to file: %s", path)
-    with open(path, "w") as f:
+    logging.info("Writing to file: %s", target)
+    with open(target, "w") as f:
       f.write(payload)
 
-
 def main():
   logging.basicConfig(level=logging.INFO,
                       format=('%(levelname)s|%(asctime)s'
@@ -139,10 +139,10 @@ def main():
   parser = argparse.ArgumentParser()
 
   parser.add_argument(
-    "snapshot_repos", nargs="+",
-    help=("Repositories needed to take snapshot. Should be in the format of "
-          "/. If branch_name is not given, default is "
-          "master."))
+    "--snapshot_repos", type=str,
+    help=("A semi-colon separated list of repositories to check out, "
+          "in the form {ORG}/{REPO}@{BRANCH};{ORG2}/{REPO2}@{BRANCH}. "
+          "If no branch is given, master is used."))
 
   parser.add_argument(
     "--base_name", default="kf-v0-4", type=str,
@@ -155,67 +155,77 @@ def main():
   parser.add_argument(
     "--project", default="kubeflow-ci", type=str, help=("The GCP project."))
 
-  parser.add_argument(
-    "--zone", default="us-east1-d", type=str, help=("The zone to deploy in."))
-
-  parser.add_argument(
-    "--repo_owner", default="kubeflow", type=str, help=("Github repo owner."))
-
   parser.add_argument(
     "--github_token_file",
-    default="/secret/github-token/github_token",
+    default="",
     type=str, help=("The file containing Github API token."))
 
   parser.add_argument(
     "--job_labels",
-    default="/etc/pod-info/labels",
+    default="",
     type=str, help=("DownwardAPIVolumeFile for job labels."))
 
   parser.add_argument(
-    "--nfs_path",
-    default="", type=str, help=("GCP Filestore PVC mount path."))
+    "--data_dir",
+    default="", type=str, help=("Directory where data should be written."))
 
   args = parser.parse_args()
 
-  token_file = open(args.github_token_file, "r")
-  github_token = token_file.readline()
-  token_file.close()
+  github_token = None
+  if args.github_token_file:
+    logging.info("Reading GITHUB_TOKEN from file: %s", args.github_token_file)
+    token_file = open(args.github_token_file, "r")
+    github_token = token_file.readline()
+    token_file.close()
+  else:
+    logging.info("Looking for GITHUB token in environment variable "
+                 "GITHUB_TOKEN")
+    github_token = os.getenv("GITHUB_TOKEN", "")
 
-  cluster_num = get_deployment_cluster(args.project, args.zone,
-                                       args.base_name, [
-                                         n for n in range(args.max_cluster_num)])
+  if not github_token:
+    raise ValueError("No GITHUB token set")
 
-  logging.info("Deploying to %d", cluster_num)
+  name = get_deployment_name(args.project, args.base_name, args.max_cluster_num)
 
-  job_name = checkout_util.get_job_name(args.job_labels)
+  logging.info("Using deployment name %s", name)
+
+  job_name = ""
+  if args.job_labels:
+    logging.info("Reading labels from file %s", args.job_labels)
+    job_name = checkout_util.get_job_name(args.job_labels)
   logging.info("Job name: %s", job_name)
 
   logging.info("Repos: %s", str(args.snapshot_repos))
   logging.info("Project: %s",
args.project) - logging.info("Repo owner: %s", args.repo_owner) snapshot_time = datetime.datetime.utcnow().isoformat() logging.info("Snapshotting at %s", snapshot_time) repo_snapshot = { "timestamp": snapshot_time, - "cluster_num": cluster_num, - "repos": {}, + "name": name, + "repos": [], } - for repo_args in args.snapshot_repos: - repo_branch = repo_args.split("/") - repo = repo_branch[0] # pylint: disable=unsubscriptable-object - branch = repo_branch[1] if len(repo_branch) > 1 else "master" # pylint: disable=unsubscriptable-object - sha = repo_snapshot_hash(github_token, args.repo_owner, repo, branch, + for repo_path in args.snapshot_repos.split(";"): + if "@" in repo_path: + repo, branch = repo_path.split("@") + else: + repo = repo_path + branch = "master" + + repo_org, repo_name = repo.split("/") + sha = repo_snapshot_hash(github_token, repo_org, repo_name, branch, snapshot_time) logging.info("Snapshot repo %s at %s, branch is %s", repo, sha, branch) - repo_snapshot["repos"][repo] = { + repo_snapshot["repos"].append({ + "owner": repo_org, + "repo": repo_name, "sha": sha, "branch": branch - } + }) logging.info("Snapshot = %s", str(repo_snapshot)) - folder = checkout_util.get_snapshot_path(args.nfs_path, job_name) - lock_and_write(folder, json.dumps(repo_snapshot)) + snapshot_path = os.path.join(args.data_dir, "snapshot.json") + lock_and_write(snapshot_path, json.dumps(repo_snapshot)) if __name__ == '__main__': diff --git a/test-infra/auto-deploy/deploy-cron-master.yaml b/test-infra/auto-deploy/deploy-cron-master.yaml index ab4adbb65..4995670d7 100644 --- a/test-infra/auto-deploy/deploy-cron-master.yaml +++ b/test-infra/auto-deploy/deploy-cron-master.yaml @@ -5,43 +5,41 @@ metadata: clusterName: kubeflow-testing namespace: kubeflow-test-infra spec: - concurrencyPolicy: "Forbid" - # Deploy every 8 hours. 
- schedule: "0 */8 * * *" + concurrencyPolicy: Forbid + schedule: 0 */12 * * * jobTemplate: spec: + backoffLimit: 2 template: spec: containers: - name: deploy-worker - image: gcr.io/kubeflow-ci/deploy-worker:live + image: gcr.io/kubeflow-ci/deploy-worker:v20190302-afc8ef8-dirty-201fe5 env: - name: GOOGLE_APPLICATION_CREDENTIALS value: /secret/gcp-credentials/key.json command: - - /usr/local/bin/init.sh - - --src_dir=/src - - --repo_owner=kubeflow - - --repo_branches=kubeflow/master,testing/master + - /usr/local/bin/auto_deploy.sh + - --repos=kubeflow/kubeflow;kubeflow/testing - --project=kubeflow-ci - - --worker_cluster=kubeflow-testing - --job_labels=/etc/pod-info/labels - - --nfs_mnt=/mnt/test-data-volume + - --data_dir=/mnt/test-data-volume/auto_deploy - --base_name=kf-vmaster - --max_num_cluster=5 - --zone=us-east1-b + - --github_token_file=/secret/github-token/github_token volumeMounts: - name: gcp-credentials - mountPath: "/secret/gcp-credentials" + mountPath: /secret/gcp-credentials readOnly: true - name: pod-info - mountPath: "/etc/pod-info" + mountPath: /etc/pod-info readOnly: true - name: github-token - mountPath: "/secret/github-token" + mountPath: /secret/github-token readOnly: true - name: test-data-volume - mountPath: "/mnt/test-data-volume" + mountPath: /mnt/test-data-volume readOnly: false restartPolicy: Never volumes: @@ -57,6 +55,6 @@ spec: - name: pod-info downwardAPI: items: - - path: "labels" - fieldRef: - fieldPath: metadata.labels + - path: labels + fieldRef: + fieldPath: metadata.labels diff --git a/test-infra/auto-deploy/deploy-cron-v0-4.yaml b/test-infra/auto-deploy/deploy-cron-v0-4.yaml index 971d9c6e6..35e86f839 100644 --- a/test-infra/auto-deploy/deploy-cron-v0-4.yaml +++ b/test-infra/auto-deploy/deploy-cron-v0-4.yaml @@ -5,43 +5,41 @@ metadata: clusterName: kubeflow-testing namespace: kubeflow-test-infra spec: - concurrencyPolicy: "Forbid" - # Deploy every 8 hours. 
- schedule: "0 */8 * * *" + concurrencyPolicy: Forbid + schedule: 0 */12 * * * jobTemplate: spec: + backoffLimit: 2 template: spec: containers: - name: deploy-worker - image: gcr.io/kubeflow-ci/deploy-worker:live + image: gcr.io/kubeflow-ci/deploy-worker:v20190302-afc8ef8-dirty-201fe5 env: - name: GOOGLE_APPLICATION_CREDENTIALS value: /secret/gcp-credentials/key.json command: - - /usr/local/bin/init.sh - - --src_dir=/src - - --repo_owner=kubeflow - - --repo_branches=kubeflow/v0.4-branch,testing/master + - /usr/local/bin/auto_deploy.sh + - --repos=kubeflow/kubeflow;kubeflow/testing@v0.4-branch - --project=kubeflow-ci - - --worker_cluster=kubeflow-testing - --job_labels=/etc/pod-info/labels - - --nfs_mnt=/mnt/test-data-volume - - --base_name=kf-v0-4 + - --data_dir=/mnt/test-data-volume/auto_deploy + - --base_name=kf-vmaster - --max_num_cluster=5 - --zone=us-east1-b + - --github_token_file=/secret/github-token/github_token volumeMounts: - name: gcp-credentials - mountPath: "/secret/gcp-credentials" + mountPath: /secret/gcp-credentials readOnly: true - name: pod-info - mountPath: "/etc/pod-info" + mountPath: /etc/pod-info readOnly: true - name: github-token - mountPath: "/secret/github-token" + mountPath: /secret/github-token readOnly: true - name: test-data-volume - mountPath: "/mnt/test-data-volume" + mountPath: /mnt/test-data-volume readOnly: false restartPolicy: Never volumes: @@ -57,6 +55,6 @@ spec: - name: pod-info downwardAPI: items: - - path: "labels" - fieldRef: - fieldPath: metadata.labels + - path: labels + fieldRef: + fieldPath: metadata.labels diff --git a/test-infra/auto-deploy/deploy-master.yaml b/test-infra/auto-deploy/deploy-master.yaml index 9d08eced3..08cbbccbe 100644 --- a/test-infra/auto-deploy/deploy-master.yaml +++ b/test-infra/auto-deploy/deploy-master.yaml @@ -3,39 +3,45 @@ kind: Job metadata: generateName: deploy-master- namespace: kubeflow-test-infra + labels: + app: deploy-master + version: master spec: + backoffLimit: 1 template: + metadata: + labels: + job: deploy-master + version: master spec: containers: - name: deploy-worker - image: gcr.io/kubeflow-ci/deploy-worker:v20190228-e625074-dirty-decd62 + image: gcr.io/kubeflow-ci/deploy-worker:v20190302-afc8ef8-dirty-201fe5 env: - name: GOOGLE_APPLICATION_CREDENTIALS value: /secret/gcp-credentials/key.json command: - - /usr/local/bin/init.sh - - --src_dir=/src - - --repo_owner=kubeflow - - --repo_branches=kubeflow/master,testing/master + - /usr/local/bin/auto_deploy.sh + - --repos=kubeflow/kubeflow;jlewi/testing@auto_manual - --project=kubeflow-ci - - --worker_cluster=kubeflow-testing - --job_labels=/etc/pod-info/labels - - --nfs_mnt=/mnt/test-data-volume + - --data_dir=/mnt/test-data-volume/auto_deploy - --base_name=kf-vmaster - --max_num_cluster=5 - --zone=us-east1-b + - --github_token_file=/secret/github-token/github_token volumeMounts: - name: gcp-credentials - mountPath: "/secret/gcp-credentials" + mountPath: /secret/gcp-credentials readOnly: true - name: pod-info - mountPath: "/etc/pod-info" + mountPath: /etc/pod-info readOnly: true - name: github-token - mountPath: "/secret/github-token" + mountPath: /secret/github-token readOnly: true - name: test-data-volume - mountPath: "/mnt/test-data-volume" + mountPath: /mnt/test-data-volume readOnly: false restartPolicy: Never volumes: @@ -51,6 +57,6 @@ spec: - name: pod-info downwardAPI: items: - - path: "labels" - fieldRef: - fieldPath: metadata.labels + - path: labels + fieldRef: + fieldPath: metadata.labels diff --git a/test-infra/auto-deploy/deployment-workflows.sh 
b/test-infra/auto-deploy/deployment-workflows.sh deleted file mode 100755 index 6f0c12b25..000000000 --- a/test-infra/auto-deploy/deployment-workflows.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/bin/bash -# Workflow used to invoke deployment process. -set -ex - -# Include library that helps on argument parsing. -. /usr/local/lib/lib-args.sh - -# Deployment configs. -required_args=(src_dir repo_owner project worker_cluster job_labels nfs_mnt \ - base_name zone) - -parseArgs $* -validateRequiredArgs ${required_args} - -APPS_DIR=${src_dir}/${repo_owner}/testing/test-infra -KF_DIR=${src_dir}/${repo_owner}/kubeflow - -# Extract worker job name using checkout_util. -header="from checkout_lib import checkout_util;" -job_name="checkout_util.get_job_name(\"${job_labels}\")" -get_job_name="${header} print(${job_name})" -job_name=$(python -c "${get_job_name}") - -# Load snapshot JSON. -get_path="checkout_util.get_snapshot_path(\"${nfs_mnt}\", \"${job_name}\")" -get_snapshot_path="${header} print(${get_path})" -snapshot_path=$(python -c "${get_snapshot_path}") - -# Extract cluster_num from JSON file. -read_snapshot="cat ${snapshot_path}/snapshot.json" -get_cluster_num="jq .cluster_num" -get_timestamp="jq .timestamp" -cluster_num=$(${read_snapshot} | ${get_cluster_num}) -timestamp=$(${read_snapshot} | ${get_timestamp}) - -# Trigger create_kf_instance. -python -m kubeflow.testing.create_kf_instance \ - --base=${base_name} \ - --kubeflow_repo=${KF_DIR} \ - --apps_dir=${APPS_DIR} \ - --project=${project} \ - --deployment_worker_cluster=${worker_cluster} \ - --cluster_num=${cluster_num} \ - --timestamp=${timestamp} \ - --job_name=${job_name} \ - --zone=${zone} - -# TODO(gabrielwen): Push changes to app folders to git. diff --git a/test-infra/auto-deploy/init.sh b/test-infra/auto-deploy/init.sh deleted file mode 100755 index ce48a62cb..000000000 --- a/test-infra/auto-deploy/init.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/bin/bash -set -ex - -# Include library that helps on argument parsing. -. /usr/local/lib/lib-args.sh - -# Deployment configs. -required_args=(src_dir repo_owner repo_branches project worker_cluster \ - job_labels nfs_mnt base_name max_num_cluster zone) - -parseArgs $* -validateRequiredArgs ${required_args} - -# Activate service account auth. -gcloud auth activate-service-account --key-file=${GOOGLE_APPLICATION_CREDENTIALS} -gcloud config list - -export PYTHONPATH="${PYTHONPATH}:/usr/local/bin/py" - -# Split args by comma and replace with space. -repos=$(echo ${repo_branches} | tr "," " ") - -python -m checkout_lib.snapshot_kf_deployment \ - ${repos} \ - --base_name=${base_name} \ - --project=${project} \ - --repo_owner=${repo_owner} \ - --job_labels=${job_labels} \ - --nfs_path=${nfs_mnt} \ - --max_cluster_num=${max_num_cluster} \ - --zone=${zone} - -# Check out fresh copy of KF and deployment workflow. -python -m checkout_lib.repo_clone_snapshot \ - --src_dir=${src_dir} \ - --project=${project} \ - --repo_owner=${repo_owner} \ - --job_labels=${job_labels} \ - --nfs_path=${nfs_mnt} \ - --zone=${zone} - -export PYTHONPATH="${PYTHONPATH}:${src_dir}/${repo_owner}/testing/py" - -# Initiate deployment workflow. 
-${src_dir}/${repo_owner}/testing/test-infra/auto-deploy/deployment-workflows.sh \
-  --src_dir=${src_dir} \
-  --repo_owner=${repo_owner} \
-  --project=${project} \
-  --worker_cluster=${worker_cluster} \
-  --job_labels=${job_labels} \
-  --nfs_mnt=${nfs_mnt} \
-  --base_name=${base_name} \
-  --zone=${zone}
diff --git a/test-infra/auto-deploy/lib-args.sh b/test-infra/auto-deploy/lib-args.sh
index e826add0a..d2f20fcbf 100755
--- a/test-infra/auto-deploy/lib-args.sh
+++ b/test-infra/auto-deploy/lib-args.sh
@@ -8,11 +8,18 @@ parseArgs() {
     # Parameters should be of the form
     # --{name}=${value}
     echo parsing "$1"
+
     if [[ $1 =~ ^--(.*)=(.*)$ ]]; then
       _name=${BASH_REMATCH[1]}
-      _value=${BASH_REMATCH[2]}
-      eval ${_name}="${_value}"
+      if [[ ${_name} == "repos" ]]; then
+        # We handle repos specially because its value contains ";", which causes problems for eval.
+        repos=${BASH_REMATCH[2]}
+      else
+        _value=${BASH_REMATCH[2]}
+        eval ${_name}="${_value}"
+      fi
+
     elif [[ $1 =~ ^--(.*)$ ]]; then
       _name=${BASH_REMATCH[1]}
       _value=true
diff --git a/test-infra/auto-deploy/set_images.sh b/test-infra/auto-deploy/set_images.sh
new file mode 100755
index 000000000..c2feb0322
--- /dev/null
+++ b/test-infra/auto-deploy/set_images.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+# A simple shell script to set the images in the yaml files.
+set -ex
+
+IMAGE=$1
+
+substitute() {
+  WHAT=$1
+  TARGET=$2
+
+  yq --yaml-output -r ${WHAT} ${TARGET} > /tmp/${TARGET}.new
+  mv /tmp/${TARGET}.new ${TARGET}
+}
+
+substitute ".spec.template.spec.containers[0].image=\"${IMAGE}\"" deploy-master.yaml
+substitute ".spec.jobTemplate.spec.template.spec.containers[0].image=\"${IMAGE}\"" deploy-cron-master.yaml
+substitute ".spec.jobTemplate.spec.template.spec.containers[0].image=\"${IMAGE}\"" deploy-cron-v0-4.yaml
\ No newline at end of file
diff --git a/test-infra/ks_app/components/nfs-external.jsonnet b/test-infra/ks_app/components/nfs-external.jsonnet
index ad6496ca7..8fa5dd214 100644
--- a/test-infra/ks_app/components/nfs-external.jsonnet
+++ b/test-infra/ks_app/components/nfs-external.jsonnet
@@ -54,6 +54,8 @@ local pvc = {
         storage: "500Mi",
       },
     },
+    storageClassName: storageClassName,
+    volumeName: "gcfs",
   },
 };
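For reference, a minimal sketch of the snapshot.json file that snapshot_kf_deployment.py writes to --data_dir and that repo_clone_snapshot.py and create_kf_instance.py --snapshot_file read back, based on the fields assembled in this change (the timestamp, deployment name, and SHA values below are illustrative placeholders, not real values):

  {
    "timestamp": "2019-03-02T01:23:45.678901",
    "name": "kf-vmaster-n00",
    "repos": [
      {"owner": "kubeflow", "repo": "kubeflow", "sha": "<commit-sha>", "branch": "master"},
      {"owner": "kubeflow", "repo": "testing", "sha": "<commit-sha>", "branch": "master"}
    ]
  }

repo_clone_snapshot.py clones each listed repo at the given branch under --data_dir and resets it to the recorded SHA, while create_kf_instance.py takes the deployment name and snapshot timestamp from the same file.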