Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve auto_deploy to support changing zone and testing changes. #323

Merged
merged 5 commits into from
Mar 8, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions images/gcb_build/image_build.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"images": [
"gcr.io/kubeflow-ci/test-worker:v20190207-281ec9b-dirty-939141",
"gcr.io/kubeflow-ci/test-worker:v20190302-c0829e8-dirty-f1d98c",
"gcr.io/kubeflow-ci/test-worker:latest"
],
"steps": [
Expand All @@ -19,7 +19,7 @@
"args": [
"build",
"-t",
"gcr.io/kubeflow-ci/test-worker:v20190207-281ec9b-dirty-939141",
"gcr.io/kubeflow-ci/test-worker:v20190302-c0829e8-dirty-f1d98c",
"--label=git-versions=",
"--file=./Dockerfile",
"--cache-from=gcr.io/kubeflow-ci/test-worker:latest",
Expand All @@ -34,7 +34,7 @@
{
"args": [
"tag",
"gcr.io/kubeflow-ci/test-worker:v20190207-281ec9b-dirty-939141",
"gcr.io/kubeflow-ci/test-worker:v20190302-c0829e8-dirty-f1d98c",
"gcr.io/kubeflow-ci/test-worker:latest"
],
"id": "tag-test-worker",
Expand Down
93 changes: 61 additions & 32 deletions py/kubeflow/testing/create_kf_instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,39 @@
"""
import argparse
import logging
import json
import os
import re
import yaml

from googleapiclient import discovery
from google.cloud import storage
from kubeflow.testing import util
from retrying import retry
from oauth2client.client import GoogleCredentials

@retry(wait_fixed=60000, stop_max_attempt_number=5)
def kfctl_apply_with_retry(kfctl, cwd, env):
util.run([kfctl, "apply", "all"], cwd=cwd, env=env)
def run_with_retry(*args, **kwargs):
util.run(*args, **kwargs)

def _is_not_found_error(error):
  """Return True if error carries a JSON body that reports code 404.

  Parsing lives in its own function so that any failure while decoding the
  body is contained here and can never replace the exception the caller is
  handling.

  Args:
    error: The exception raised by the API request.

  Returns:
    True only when error has a `content` attribute holding a JSON object of
    the form {"error": {"code": 404, ...}}; False in every other case.
  """
  content = getattr(error, "content", None)
  if content is None:
    return False
  if isinstance(content, bytes):
    content = content.decode("utf-8", "replace")
  try:
    body = json.loads(content)
  except (TypeError, ValueError):
    # Not JSON (e.g. an HTML error page); cannot be classified as a 404.
    return False
  if not isinstance(body, dict):
    return False
  details = body.get("error")
  return isinstance(details, dict) and details.get("code") == 404


def delete_storage_deployment(project, name):
  """Delete a Deployment Manager deployment and wait for the delete to finish.

  A deployment that does not exist (the API answers with a 404 error body) is
  treated as already deleted and the function returns immediately.

  Args:
    project: Project that owns the deployment.
    name: Name of the deployment to delete.

  Raises:
    Exception: The original error from the delete request whenever it is not
      a 404; errors raised while waiting for the operation also propagate.
  """
  credentials = GoogleCredentials.get_application_default()
  dm = discovery.build("deploymentmanager", "v2", credentials=credentials)

  deployments_client = dm.deployments()

  try:
    op = deployments_client.delete(project=project, deployment=name,
                                   deletePolicy="DELETE").execute()
  except Exception as e:  # pylint: disable=broad-except
    if _is_not_found_error(e):
      logging.info("Deployment %s not found in project %s; nothing to delete.",
                   name, project)
      return
    # Anything that is not a clean 404 is re-raised unchanged.
    raise

  util.wait_for_gcp_operation(dm.operations(), project, None, op["name"])

def main(): # pylint: disable=too-many-locals,too-many-statements
logging.basicConfig(level=logging.INFO,
Expand All @@ -27,10 +49,6 @@ def main(): # pylint: disable=too-many-locals,too-many-statements

parser = argparse.ArgumentParser()

parser.add_argument(
"--base_name", default="kf-v0-4", type=str,
help=("The base name for the deployment typically kf-vX-Y or kf-vmaster."))

parser.add_argument(
"--project", default="kubeflow-ci", type=str, help=("The project."))

Expand All @@ -54,13 +72,13 @@ def main(): # pylint: disable=too-many-locals,too-many-statements
type=str, help=("Directory to store kubeflow apps."))

parser.add_argument(
"--deployment_worker_cluster",
default="kubeflow-testing",
type=str, help=("Name of cluster deployment cronjob workers use."))
"--name",
default="", type=str, help=("Name for the deployment."))

parser.add_argument(
"--cluster_num",
default="", type=int, help=("Number of cluster to deploy to."))
"--snapshot_file",
default="", type=str, help=("A json file containing information about the "
"snapshot to use."))

parser.add_argument(
"--timestamp",
Expand All @@ -85,15 +103,16 @@ def main(): # pylint: disable=too-many-locals,too-many-statements
git_describe = util.run(["git", "describe", "--tags", "--always", "--dirty"],
cwd=args.kubeflow_repo).strip("'")

# TODO(https://github.com/kubeflow/testing/issues/95): We want to cycle
# between N different names e.g.
# kf-vX-Y-n00, kf-vX-Y-n01, ... kf-vX-Y-n05
# The reason to reuse names is because for IAP we need to manually
# set the redirect URIs. So we want to cycle between a set of known
# endpoints. We should add logic to automatically recycle deployments.
# i.e. we should find the oldest one and reuse that.
num = args.cluster_num
name = "{0}-n{1:02d}".format(args.base_name, num)
timestamp = args.timestamp
if args.snapshot_file:
logging.info("Loading info from snapshot file %s", args.snapshot_file)
with open(args.snapshot_file) as hf:
snapshot_info = json.load(hf)
name = snapshot_info["name"]
timestamp = snapshot_info.get("timestamp", "")
else:
name = args.name

# Clean up previous deployment. We are not able to run "kfctl delete all"
# since we are not able to guarantee apps config in repository is up to date.
util.run(["rm", "-rf", name], cwd=args.apps_dir)
Expand All @@ -103,12 +122,12 @@ def main(): # pylint: disable=too-many-locals,too-many-statements
# re-create it.
delete_deployment = os.path.join(args.kubeflow_repo, "scripts", "gke",
"delete_deployment.sh")

util.run([delete_deployment, "--project=" + args.project,
"--deployment=" + name, "--zone=" + args.zone], cwd=args.apps_dir)

# Create a dummy kubeconfig in cronjob worker.
util.run(["gcloud", "container", "clusters", "get-credentials", args.deployment_worker_cluster,
"--zone", args.zone, "--project", args.project], cwd=args.apps_dir)
# Delete script doesn't delete storage deployment by design.
delete_storage_deployment(args.project, name + "-storage")

app_dir = os.path.join(args.apps_dir, name)
kfctl = os.path.join(args.kubeflow_repo, "scripts", "kfctl.sh")
Expand All @@ -125,8 +144,8 @@ def main(): # pylint: disable=too-many-locals,too-many-statements
"PURPOSE": "kf-test-cluster",
},
}
if args.timestamp:
app["labels"]["SNAPSHOT_TIMESTAMP"] = args.timestamp
if timestamp:
app["labels"]["SNAPSHOT_TIMESTAMP"] = timestamp
if args.job_name:
app["labels"]["DEPLOYMENT_JOB"] = args.job_name
labels = app.get("labels", {})
Expand All @@ -140,24 +159,34 @@ def main(): # pylint: disable=too-many-locals,too-many-statements
val = re.sub(r"[^a-z0-9\-_]", "-", val)
label_args.append("{key}={val}".format(key=k.lower(), val=val))

util.run([kfctl, "generate", "all"], cwd=app_dir)
util.run(["ks", "generate", "seldon", "seldon"], cwd=ks_app_dir)

env = {}
env.update(os.environ)
env.update(oauth_info)

# We need to apply platform before doing generate k8s because we need
# to have a cluster for ksonnet.
# kfctl apply all might break during cronjob invocation when dependent
# components are not ready. Retrying several times should be enough.
kfctl_apply_with_retry(kfctl, app_dir, env)
run_with_retry([kfctl, "generate", "platform"], cwd=app_dir, env=env)
run_with_retry([kfctl, "apply", "platform"], cwd=app_dir, env=env)
run_with_retry([kfctl, "generate", "k8s"], cwd=app_dir, env=env)
run_with_retry([kfctl, "apply", "k8s"], cwd=app_dir, env=env)
run_with_retry(["ks", "generate", "seldon", "seldon"], cwd=ks_app_dir, env=env)

logging.info("Annotating cluster with labels: %s", str(label_args))
util.run(["gcloud", "container", "clusters", "update", name,
"--zone", args.zone,

# Set labels on the deployment
util.run(["gcloud", "--project", args.project,
"deployment-manager", "deployments", "update", name,
"--update-labels", ",".join(label_args)],
cwd=app_dir)
cwd=app_dir)

# To work around lets-encrypt certificate usage, create a self-signed
# certificate.
util.run(["gcloud", "container", "clusters", "get-credentials", name,
"--zone", args.zone,
"--protject", args.project])
"--project", args.project])
tls_endpoint = "--host=%s.endpoints.kubeflow-ci.cloud.goog" % name
util.run(["kube-rsa", tls_endpoint])
util.run(["kubectl", "-n", "kubeflow", "create", "secret", "tls",
Expand Down
45 changes: 45 additions & 0 deletions py/kubeflow/testing/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,6 +331,51 @@ def wait_for_operation(client,
# Linter complains if we don't have a return here even though it's unreachable.
return None

def wait_for_gcp_operation(client,
                           project,
                           zone,
                           op_id,
                           timeout=datetime.timedelta(hours=1),
                           polling_interval=datetime.timedelta(seconds=5)):
  """Wait for the specified operation to complete.

  Polls the operation until its status is "DONE". An operation that finishes
  with an "error" field is still returned to the caller, but the error is
  logged so that a failed operation is not mistaken for a success.

  Args:
    client: Operations client for the API that owns the operation; should
      have get.
    project: project
    zone: Zone. Set to None if it's a global operation.
    op_id: Operation id.
    timeout: A datetime.timedelta expressing the amount of time to wait before
      giving up.
    polling_interval: A datetime.timedelta to represent the amount of time to
      wait between requests polling for the operation status.

  Returns:
    op: The final operation.

  Raises:
    TimeoutError: if we timeout waiting for the operation to complete.
  """
  endtime = datetime.datetime.now() + timeout
  while True:
    # Zonal and global operations are addressed with different keyword names.
    if zone:
      request = client.get(
        projectId=project, zone=zone, operationId=op_id)
    else:
      request = client.get(
        project=project, operation=op_id)
    op = request.execute()

    status = op.get("status", "")
    if status == "DONE":
      if op.get("error"):
        # DONE only means the operation stopped; surface any failure details.
        logging.error("Operation %s finished with errors: %s", op_id,
                      op["error"])
      return op
    if datetime.datetime.now() > endtime:
      raise TimeoutError(
        "Timed out waiting for op: {0} to complete.".format(op_id))
    time.sleep(polling_interval.total_seconds())

  # Linter complains if we don't have a return here even though it's unreachable.
  return None

def configure_kubectl(project, zone, cluster_name):
logging.info("Configuring kubectl")
Expand Down
144 changes: 7 additions & 137 deletions test-infra/auto-deploy/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,142 +1,12 @@
# Docker image for nightly deployment cronjob.

FROM ubuntu:xenial
#
FROM gcr.io/kubeflow-ci/test-worker:v20190302-c0829e8-dirty-f1d98c
MAINTAINER Gabriel Wen

# Never prompt the user for choices on installation/configuration of packages
ENV DEBIAN_FRONTEND=noninteractive
ENV TERM=linux
ENV LC_ALL=C.UTF-8
ENV LANG=C.UTF-8

# gcc & python-dev are needed so we can install crcmod for gsutil
# also includes installations for Python3
RUN set -ex \
&& apt-get update -yqq \
&& apt-get install -yqq --no-install-recommends \
build-essential \
curl \
wget \
git \
jq \
zip \
unzip \
gcc \
ssh \
python-dev \
python-setuptools \
python-pip \
python3-dev \
python3-setuptools \
python3-pip \
&& python -V \
&& python3 -V \
&& apt-get clean \
&& rm -rf \
/var/lib/apt/lists/* \
/tmp/* \
/var/tmp/* \
/usr/share/man \
/usr/share/doc \
/usr/share/doc-base

# Install go
RUN cd /tmp && \
wget -O /tmp/go.tar.gz https://redirector.gvt1.com/edgedl/go/go1.9.2.linux-amd64.tar.gz && \
tar -C /usr/local -xzf go.tar.gz

# Install gcloud
ENV PATH=/usr/local/go/bin:/google-cloud-sdk/bin:/workspace:${PATH} \
CLOUDSDK_CORE_DISABLE_PROMPTS=1

RUN wget -q https://dl.google.com/dl/cloudsdk/channels/rapid/google-cloud-sdk.tar.gz && \
tar xzf google-cloud-sdk.tar.gz -C / && \
rm google-cloud-sdk.tar.gz && \
/google-cloud-sdk/install.sh \
--disable-installation-options \
--bash-completion=false \
--path-update=false \
--usage-reporting=false && \
gcloud components install alpha beta

# Install yarn
RUN curl -sS http://dl.yarnpkg.com/debian/pubkey.gpg | apt-key add - \
&& echo "deb http://dl.yarnpkg.com/debian/ stable main" | tee /etc/apt/sources.list.d/yarn.list \
&& apt-get update -yqq \
&& apt-get install -yqq --no-install-recommends yarn

# Install glide
RUN cd /tmp && \
wget -O glide-v0.13.0-linux-amd64.tar.gz \
https://github.com/Masterminds/glide/releases/download/v0.13.0/glide-v0.13.0-linux-amd64.tar.gz && \
tar -xvf glide-v0.13.0-linux-amd64.tar.gz && \
mv ./linux-amd64/glide /usr/local/bin/

# Install ksonnet. We install multiple versions of ks to support different versions
# of ksonnet applications. Newer versions of ksonnet are backwards compatible but
# that can require upgrading the app which isn't something we want to be forced to.
# (see https://github.com/kubeflow/testing/issues/220).
RUN cd /tmp && \
wget -O ks.tar.gz \
https://github.com/ksonnet/ksonnet/releases/download/v0.11.0/ks_0.11.0_linux_amd64.tar.gz && \
tar -xvf ks.tar.gz && \
mv ks_0.11.0_linux_amd64/ks /usr/local/bin && \
chmod a+x /usr/local/bin/ks

RUN cd /tmp && \
wget -O ks-12.tar.gz \
https://github.com/ksonnet/ksonnet/releases/download/v0.12.0/ks_0.12.0_linux_amd64.tar.gz && \
tar -xvf ks-12.tar.gz && \
mv ks_0.12.0_linux_amd64/ks /usr/local/bin/ks-12 && \
chmod a+x /usr/local/bin/ks-12

RUN cd /tmp && \
wget -O ks-13.tar.gz \
https://github.com/ksonnet/ksonnet/releases/download/v0.13.1/ks_0.13.1_linux_amd64.tar.gz && \
tar -xvf ks-13.tar.gz && \
mv ks_0.13.1_linux_amd64/ks /usr/local/bin/ks-13 && \
chmod a+x /usr/local/bin/ks-13

RUN cd /tmp && \
wget https://github.com/google/jsonnet/archive/v0.11.2.tar.gz && \
tar -xvf v0.11.2.tar.gz && \
cd jsonnet-0.11.2 && \
make && \
mv jsonnet /usr/local/bin && \
rm -rf /tmp/v0.11.2.tar.gz && \
rm -rf /tmp/jsonnet-0.11.2

# Install various python libraries for both Python 2 and 3 (for now)
# Don't upgrade pip for now because it seems to be broken
# https://github.com/pypa/pip/issues/5240
COPY ./Pipfile ./Pipfile.lock /tmp/

RUN cd /tmp/ && \
pip2 install -U wheel filelock && \
pip2 install pipenv && \
pipenv install --system --two && \
pip3 install -U wheel filelock

RUN pip3 install pipenv==2018.10.9
RUN cd /tmp/ && pipenv install --system --three

# Install docker.
RUN curl https://get.docker.com/ | sh

# Install kubectl
RUN curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.13.0/bin/linux/amd64/kubectl && \
mv kubectl /usr/local/bin && \
chmod a+x /usr/local/bin/kubectl

# Work around for https://github.com/ksonnet/ksonnet/issues/298
ENV USER root

# Purpose of init.sh is to have a script as kickstarter. This script is used to pull fresh copy from
# Purpose of auto_deploy.sh is to have a script as kickstarter. This script is used to pull fresh copy from
# Github and run with them.
COPY checkout_lib /usr/local/bin/py/checkout_lib
COPY lib-args.sh /usr/local/lib
RUN chmod a+x /usr/local/lib/lib-args.sh
COPY init.sh /usr/local/bin
RUN chmod a+x /usr/local/bin/init.sh
COPY checkout-snapshot.sh /usr/local/bin
RUN chmod a+x /usr/local/bin/checkout-snapshot.sh
COPY lib-args.sh /usr/local/bin
RUN chmod a+x /usr/local/bin/lib-args.sh
COPY auto_deploy.sh /usr/local/bin
RUN chmod a+x /usr/local/bin/auto_deploy.sh
Loading