Skip to content

Commit

Permalink
A simple Python script to deploy Kubeflow using the GCP blueprint. (k…
Browse files Browse the repository at this point in the history
…ubeflow#652)

* Create a simple script to deploy Kubeflow using the GCP blueprint.
  This is basically just a wrapper around make commands.

  * This is the first step in setting up auto deployments of the GCP
    blueprint for CI purposes.

* Fix some bugs in the management cluster that popped up while testing
  the blueprint

  * Fix CNRM install for the kubeflow-ci-deployment namespace.
    CNRM wasn't properly configured to administer that namespace.
   The appropriate role bindings weren't being created in the correct
  namespaces and the statefulset was using the host project and not the
  managed project.

  * See kubeflow#644 for reference on the management cluster setup

  * We should use the kubeflow-testing@kubeflow-ci service account
    and not a service account owned by project kubeflow-ci-deployment
    as the latter is being GC'd by our cleanup ci scripts which breaks
    the management cluster.

  * Also per kubeflow#644 permissions are now set at the folder level to
    prevent the permissions from being GC'd.
  • Loading branch information
jlewi authored May 5, 2020
1 parent 5ce11a5 commit 832a723
Show file tree
Hide file tree
Showing 4 changed files with 210 additions and 52 deletions.
191 changes: 191 additions & 0 deletions py/kubeflow/testing/create_kf_from_gcp_blueprint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
"""Create a Kubeflow instance using a GCP blueprint.
The purpose of this script is to automate the creation of Kubeflow Deployments
corresponding to different versions of Kubeflow.
This script should replace create_kf_instance and potentially
create_unique_kf_instance.py.
Unlike create_unique_kf_instance.py this script
1. Uses GCP blueprints(https://github.com/kubeflow/gcp-blueprints) to deploy
kubeflow
2. This script doesn't do any Git checkouts.
* Assumption is Git repos are already checkout (e.g. via Tekton)
This script doesn't do any cleanup because we will rely on cleanup_ci to GC old
auto deployments.
TODO(jlewi): We should add commonLabels to all the GCP infrastructure to
make it easy to delete.
"""
import datetime
import fire
import logging
import os
import re
import retrying
import subprocess
import uuid
import yaml

from google.cloud import storage
from kubeflow.testing import gcp_util
from kubeflow.testing import util


# Default GCS URI of the YAML file holding the IAP OAuth client ID & secret
# consumed by deploy() when no oauth_file argument is supplied.
DEFAULT_OAUTH_FILE = ("gs://kubeflow-ci-deployment_kf-data/"
                      "kf-iap-oauth.kubeflow-ci-deployment.yaml")

class ApiNotEnabledError(Exception):
  """Indicates a required GCP API is not enabled.

  NOTE(review): not raised anywhere in this module; presumably defined for
  callers or future use — confirm before removing.
  """
  pass

def get_oauth(project, oauth_file):
  """Fetch the IAP OAuth client information from a YAML file stored in GCS.

  Args:
    project: GCP project used for the GCS client (billing/credentials scope).
    oauth_file: GCS URI (gs://bucket/path) of the YAML file containing the
      OAuth client ID and secret.

  Returns:
    dict parsed from the YAML file contents.
  """
  bucket_name, blob_path = util.split_gcs_uri(oauth_file)

  client = storage.Client(project=project)
  bucket = client.get_bucket(bucket_name)

  blob = bucket.get_blob(blob_path)
  contents = blob.download_as_string()

  # Use safe_load: the file is plain key/value data, and yaml.load without an
  # explicit Loader is deprecated and can instantiate arbitrary Python objects.
  oauth_info = yaml.safe_load(contents)
  return oauth_info

def add_common_labels(kustomization_file, labels):
  """Add every key/value pair in labels as a commonLabel.

  Shells out to `kustomize edit` instead of rewriting the YAML in Python so
  that comments and kpt annotations in the kustomization file are preserved.

  Args:
    kustomization_file: Path to the kustomization.yaml to modify.
    labels: Dict of label key -> value to add.
  """
  working_dir = os.path.dirname(kustomization_file)
  for key, value in labels.items():
    command = ["kustomize", "edit", "add", "label", "-f", f"{key}:{value}"]
    util.run(command, cwd=working_dir)


class BlueprintRunner:
  """Entry point (exposed through python-fire) for deploying the blueprint."""

  @staticmethod
  def deploy(blueprint_dir, management_context, name="kf-vbp-{uid}",
             project="kubeflow-ci-deployment",
             location="us-central1", zone="us-central1-f",
             oauth_file=DEFAULT_OAUTH_FILE):
    """Deploy the blueprint.

    Args:
      blueprint_dir: The directory where
        https://github.com/kubeflow/gcp-blueprints/tree/master/kubeflow is
        checked out.
      management_context: The name of the management context.
      name: Name for the deployment. This can be a python format string
        with the variable uid. Uid will automatically be substituted
        for a unique value based on the time.
      project: The GCP project where the blueprint should be created.
      location: The zone or region where Kubeflow should be deployed.
      zone: The zone to use for disks; must be in the same region as location
        when using a regional cluster and must be location when location
        is a zone.
      oauth_file: The file containing the OAuth client ID & secret for IAP.
    """
    # Wait for credentials to deal with workload identity issues.
    gcp_util.get_gcp_credentials()

    try:
      util.run(["make", "get-pkg"], cwd=blueprint_dir)
    except subprocess.CalledProcessError as e:
      # e.output is None when the output wasn't captured; normalize so the
      # regex checks below can't raise TypeError and mask the real failure.
      output = e.output or ""
      if re.search(".*resources must be annotated with config.kubernetes.io/"
                   "index.*", output):
        logging.warning(f"make get-pkg returned error: {output}; ignoring "
                        "and continuing")
      elif re.search(".*already exists.*", output):
        logging.warning("The package directory already exists; continuing")
      else:
        logging.error(f"Command exited with error: {output}")
        raise

    # Point the blueprint at the management cluster context.
    util.run(["kpt", "cfg", "set", "instance", "mgmt-ctxt", management_context],
             cwd=blueprint_dir)

    # We need to keep the name short to avoid hitting limits with certificates.
    uid = datetime.datetime.now().strftime("%m%d") + "-"
    uid = uid + uuid.uuid4().hex[0:3]

    name = name.format(uid=uid)
    logging.info("Using name %s", name)

    values = {
        "name": name,
        "gcloud.core.project": project,
        "gcloud.compute.zone": zone,
        "location": location,
    }

    # Apply the kpt setters in both the upstream manifests and the instance.
    for subdir in ["./upstream/manifests/gcp", "./instance"]:
      for k, v in values.items():
        util.run(["kpt", "cfg", "set", subdir, k, v],
                 cwd=blueprint_dir)

    # TODO(jlewi): We should add an expiration time; either as a label
    # or as an annotation.
    # GCP labels can only take as input alphanumeric characters, hyphens, and
    # underscores. Replace invalid characters with hyphens.
    labels = {"purpose": "kf-test-cluster",
              "auto-deploy": "true",}

    kustomization_file = os.path.join(blueprint_dir, "instance", "gcp_config",
                                      "kustomization.yaml")

    add_common_labels(kustomization_file, labels)

    oauth_info = get_oauth(project, oauth_file)

    # Pass the OAuth client info to `make apply` through the environment.
    env = {}
    env.update(os.environ)
    env.update(oauth_info)

    # To work around various bugs in our manifests that can be fixed by
    # retrying, we check whether a known transient error occurred and retry.
    # As these issues are fixed we should remove the retries.
    retryable_errors = [
        # TODO(https://github.com/kubeflow/manifests/issues/1149):
        # Once this is fixed we should be able to remove this.
        re.compile(".*no matches for kind \"Application\" in version "
                   "\"app.k8s.io/v1beta1\""),
    ]

    # The total time to wait needs to take into account the actual time
    # it takes to run otherwise we won't retry.
    total_time = datetime.timedelta(minutes=30)

    def is_retryable_exception(exception):
      """Return True if we should retry, False otherwise."""
      if not isinstance(exception, subprocess.CalledProcessError):
        return False

      # Guard against exception.output being None (output not captured).
      output = exception.output or ""
      for m in retryable_errors:
        if m.search(output):
          logging.warning("make apply failed with retryable error. The "
                          f"output matched regex: {m.pattern}")
          return True

      return False

    @retrying.retry(stop_max_delay=total_time.total_seconds() * 1000,
                    retry_on_exception=is_retryable_exception)
    def run_apply():
      util.run(["make", "apply"], cwd=blueprint_dir, env=env)

    run_apply()

if __name__ == "__main__":
  # Configure structured logging before handing control to fire.
  log_format = ('%(levelname)s|%(asctime)s'
                '|%(pathname)s|%(lineno)d| %(message)s')
  logging.basicConfig(level=logging.INFO,
                      format=log_format,
                      datefmt='%Y-%m-%dT%H:%M:%S')
  logging.getLogger().setLevel(logging.INFO)
  try:
    fire.Fire(BlueprintRunner)
  except subprocess.CalledProcessError as e:
    # Surface subprocess output in the log before re-raising for a non-zero exit.
    logging.error(f"Subprocess exited with error; output:\n{e.output}")
    raise
2 changes: 2 additions & 0 deletions py/kubeflow/testing/create_kf_instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
import os
import re
import requests
import retrying

import shutil
import subprocess
import tempfile
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ kind: ServiceAccount
metadata:
annotations:
cnrm.cloud.google.com/version: 1.7.1
iam.gke.io/gcp-service-account: cnrm-kf-ci-deployment@kubeflow-ci-deployment.iam.gserviceaccount.com # {"type":"string","x-kustomize":{"partialSetters":[{"name":"managed_project","value":"kubeflow-ci-deployment"},{"name":"managed_gsa_name","value":"cnrm-kf-ci-deployment"}]}}
iam.gke.io/gcp-service-account: kubeflow-testing@kubeflow-ci.iam.gserviceaccount.com # {"type":"string","x-kustomize":{"partialSetters":[{"name":"managed_project","value":"kubeflow-ci-deployment"},{"name":"managed_gsa_name","value":"cnrm-kf-ci-deployment"}]}}
labels:
cnrm.cloud.google.com/scoped-namespace: kubeflow-ci-deployment # {"type":"string","x-kustomize":{"partialSetters":[{"name":"managed_project","value":"kubeflow-ci-deployment"}]}}
cnrm.cloud.google.com/system: "true"
Expand All @@ -24,7 +24,7 @@ metadata:
cnrm.cloud.google.com/scoped-namespace: kubeflow-ci-deployment # {"type":"string","x-kustomize":{"partialSetters":[{"name":"managed_project","value":"kubeflow-ci-deployment"}]}}
cnrm.cloud.google.com/system: "true"
name: cnrm-admin-binding-kubeflow-ci-deployment # {"type":"string","x-kustomize":{"partialSetters":[{"name":"managed_project","value":"kubeflow-ci-deployment"}]}}
namespace: cnrm-system
namespace: kubeflow-ci-deployment # {"type":"string","x-kustomize":{"partialSetters":[{"name":"managed_project","value":"kubeflow-ci-deployment"}]}}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
Expand All @@ -43,7 +43,7 @@ metadata:
cnrm.cloud.google.com/scoped-namespace: kubeflow-ci-deployment # {"type":"string","x-kustomize":{"partialSetters":[{"name":"managed_project","value":"kubeflow-ci-deployment"}]}}
cnrm.cloud.google.com/system: "true"
name: cnrm-manager-ns-binding-kubeflow-ci-deployment # {"type":"string","x-kustomize":{"partialSetters":[{"name":"managed_project","value":"kubeflow-ci-deployment"}]}}
namespace: cnrm-system
namespace: kubeflow-ci-deployment # {"type":"string","x-kustomize":{"partialSetters":[{"name":"managed_project","value":"kubeflow-ci-deployment"}]}}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
Expand Down Expand Up @@ -143,7 +143,7 @@ spec:
spec:
containers:
- args:
- --scoped-namespace=kubeflow-ci # {"type":"string","x-kustomize":{"partialSetters":[{"name":"host_project","value":"kubeflow-ci"}]}}
- --scoped-namespace=kubeflow-ci-deployment # {"type":"string","x-kustomize":{"partialSetters":[{"name":"managed_project","value":"kubeflow-ci-deployment"}]}}
- --stderrthreshold=INFO
- --prometheus-scrape-endpoint=:8888
command:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,49 +1,14 @@
# Define the Google Service Account to be used with CNRM
# in the management cluster.
# Also define the workload identity binding.
# This file exists a placeholder.
#
# To setup the appropriate GSA in our test infra management cluster
# we deviated from the GCP Kubeflow blueprint.
#
# These resources should be created with AnthosCLI since we
# need to bootstrap the management cluster.
apiVersion: iam.cnrm.cloud.google.com/v1beta1
kind: IAMServiceAccount
metadata:
name: cnrm-kf-ci-deployment # {"type":"string","x-kustomize":{"partialSetters":[{"name":"managed_gsa_name","value":"cnrm-kf-ci-deployment"}]}}
namespace: "kubeflow-ci-deployment" # {"type":"string","x-kustomize":{"setBy":"kpt","setter":{"name":"managed_project","value":"kubeflow-ci-deployment"}}}
spec:
displayName: Service account for CNRM
---
# TODO(jlewi): Switch to IAMPolicyMember once anthos CLI supports that.
apiVersion: iam.cnrm.cloud.google.com/v1alpha1
kind: IAMPolicy
metadata:
name: cnrm-system-kubeflow-ci-deployment-wi # {"type":"string","x-kustomize":{"partialSetters":[{"name":"managed_project","value":"kubeflow-ci-deployment"}]}}
namespace: "kubeflow-ci-deployment" # {"type":"string","x-kustomize":{"setBy":"kpt","setter":{"name":"managed_project","value":"kubeflow-ci-deployment"}}}
spec:
resourceRef:
apiVersion: iam.cnrm.cloud.google.com/v1alpha1
kind: IAMServiceAccount
name: cnrm-kf-ci-deployment # {"type":"string","x-kustomize":{"partialSetters":[{"name":"managed_gsa_name","value":"cnrm-kf-ci-deployment"}]}}
bindings:
- role: roles/iam.workloadIdentityUser
members:
# We use a partial setter for the entire workload id pool; i.e. ${PROJECT}.svc.id.goog
# Because in the case where the managed project and host project are the same setters
# for host and managed project would be the same and kpt would no longer know which field goes where.
- serviceAccount:kubeflow-ci.svc.id.goog[cnrm-system/cnrm-controller-manager-kubeflow-ci-deployment] # {"type":"string","x-kustomize":{"setBy":"kpt","partialSetters":[{"name":"host_id_pool","value":"kubeflow-ci.svc.id.goog"},{"name":"managed_project","value":"kubeflow-ci-deployment"}]}}
---
# Make the GCP SA a project owner
# TODO(jlewi): AnthosCLI doesn't appear to support IAMPolicy Member yet so
# as a work around you will need to use gcloud

apiVersion: iam.cnrm.cloud.google.com/v1beta1
kind: IAMPolicyMember
metadata:
name: cnrm-system-kubeflow-ci-deployment-owner # {"type":"string","x-kustomize":{"partialSetters":[{"name":"managed_project","value":"kubeflow-ci-deployment"}]}}
namespace: "kubeflow-ci-deployment" # {"type":"string","x-kustomize":{"setBy":"kpt","setter":{"name":"managed_project","value":"kubeflow-ci-deployment"}}}
spec:
member: serviceAccount:cnrm-kf-ci-deployment@kubeflow-ci-deployment.iam.gserviceaccount.com # {"type":"string","x-kustomize":{"partialSetters":[{"name":"managed_project","value":"kubeflow-ci-deployment"},{"name":"managed_gsa_name","value":"cnrm-kf-ci-deployment"}]}}
role: roles/owner
resourceRef:
apiVersion: resourcemanager.cnrm.cloud.google.com/v1beta1
kind: Project
external: projects/kubeflow-ci-deployment # {"type":"string","x-kustomize":{"partialSetters":[{"name":"managed_project","value":"kubeflow-ci-deployment"}]}}
# We used the service account "kubeflow-testing@kubeflow-ci.iam.gserviceaccount.com"
# so we don't need to create a namespace specific google service account.
#
# We also granted permissions to this GSA on the folder ci-projects and not on individual
# projects. Likewise we added the workload identity binding to this GSA for the
# CNRM KSA.
#
# We did that through the Cloud console and gcloud so we don't have YAML specs
# for it.

0 comments on commit 832a723

Please sign in to comment.