From 832a723bc3a625bd9254feabd298c361739f6b1b Mon Sep 17 00:00:00 2001 From: Jeremy Lewi Date: Tue, 5 May 2020 14:21:09 -0700 Subject: [PATCH] A simply python script to deploy Kubeflow using the GCP blueprint. (#652) * Create a simple script to deploy Kubeflow using the GCP blueprint. This is basically just a wrapper around make commands. * This is the first step in setting up auto deployments of the GCP blueprint for CI purposes. * Fix some bugs in the management cluster that popped up while testing the blueprint * Fix CNRM install for the kubeflow-ci-deployment namespace. CNRM wasn't properly configured to administer that namespace. The appropriate role bindings weren't being created in the correct namespaces and the statefulset was using the host project and not the managed project. * See #644 for reference on the management cluster settup * We should use the kubeflow-testing@kubeflow-ci service account and not a service account owned by project kubeflow-ci-deployment as the latter is being GC'd by our cleanup ci scripts which breaks the management cluster. * Also per #644 permissions are now set at the folder level to prevent the permissions from being GC'd. --- .../testing/create_kf_from_gcp_blueprint.py | 191 ++++++++++++++++++ py/kubeflow/testing/create_kf_instance.py | 2 + .../per-namespace-components.yaml | 8 +- .../service_account.yaml | 61 ++---- 4 files changed, 210 insertions(+), 52 deletions(-) create mode 100644 py/kubeflow/testing/create_kf_from_gcp_blueprint.py diff --git a/py/kubeflow/testing/create_kf_from_gcp_blueprint.py b/py/kubeflow/testing/create_kf_from_gcp_blueprint.py new file mode 100644 index 00000000000..047bd2c4a12 --- /dev/null +++ b/py/kubeflow/testing/create_kf_from_gcp_blueprint.py @@ -0,0 +1,191 @@ +"""Create a Kubeflow instance using a GCP blueprint. + +The purpose of this script is to automate the creation of Kubeflow Deployments +corresponding to different versions of Kubeflow. 
+
+This script should replace create_kf_instance and potentially
+create_unique_kf_instance.py.
+
+Unlike create_unique_kf_instance.py this script
+
+1. Uses GCP blueprints (https://github.com/kubeflow/gcp-blueprints) to deploy
+   kubeflow
+2. This script doesn't do any Git checkouts.
+   * Assumption is Git repos are already checked out (e.g. via Tekton)
+
+This script doesn't do any cleanup because we will rely on cleanup_ci to GC old
+auto deployments.
+
+TODO(jlewi): We should add commonLabels to all the GCP infrastructure to
+make it easy to delete.
+"""
+import datetime
+import fire
+import logging
+import os
+import re
+import retrying
+import subprocess
+import uuid
+import yaml
+
+from google.cloud import storage
+from kubeflow.testing import gcp_util
+from kubeflow.testing import util
+
+
+DEFAULT_OAUTH_FILE = ("gs://kubeflow-ci-deployment_kf-data/"
+                      "kf-iap-oauth.kubeflow-ci-deployment.yaml")
+
+class ApiNotEnabledError(Exception):
+  pass
+
+def get_oauth(project, oauth_file):
+  """Get the OAuth information"""
+  bucket, blob_path = util.split_gcs_uri(oauth_file)
+
+  client = storage.Client(project=project)
+  bucket = client.get_bucket(bucket)
+
+  blob = bucket.get_blob(blob_path)
+  contents = blob.download_as_string()
+
+  oauth_info = yaml.load(contents)
+  return oauth_info
+
+def add_common_labels(kustomization_file, labels):
+  kustomize_dir = os.path.dirname(kustomization_file)
+  for k, v in labels.items():
+    # We shell out to kustomize edit because we want to preserve
+    # comments and kpt annotations in the file.
+    util.run(["kustomize", "edit", "add", "label", "-f", f"{k}:{v}"],
+             cwd=kustomize_dir)
+
+
+class BlueprintRunner:
+  @staticmethod
+  def deploy(blueprint_dir, management_context, name="kf-vbp-{uid}",
+             project="kubeflow-ci-deployment",
+             location="us-central1", zone="us-central1-f",
+             oauth_file=DEFAULT_OAUTH_FILE):
+    """Deploy the blueprint:
+
+    Args:
+      blueprint_dir: The directory where
+        https://github.com/kubeflow/gcp-blueprints/tree/master/kubeflow is checked
+        out.
+      management_context: The name of the management context.
+      name: Name for the deployment. This can be a python format string
+        with the variable uid. Uid will automatically be substituted
+        for a unique value based on the time.
+      project: The GCP project where the blueprint should be created.
+      location: The zone or region where Kubeflow should be deployed.
+      zone: The zone to use for disks must be in the same region as location
+        when using a regional cluster and must be location when location
+        is zone.
+      oauth_file: The file containing the OAuth client ID & secret for IAP.
+    """
+    # Wait for credentials to deal with workload identity issues
+    gcp_util.get_gcp_credentials()
+
+    try:
+      util.run(["make", "get-pkg"], cwd=blueprint_dir)
+    except subprocess.CalledProcessError as e:
+      if re.search(".*resources must be annotated with config.kubernetes.io/"
+                   "index.*", e.output):
+        logging.warning(f"make get-pkg returned error: {e.output}; ignoring "
+                        "and continuing")
+
+      elif re.search(".*already exists.*", e.output):
+        logging.warning("The package directory already exists; continuing")
+      else:
+        logging.error(f"Command exited with error: {e.output}")
+        raise
+
+    util.run(["kpt", "cfg", "set", "instance", "mgmt-ctxt", management_context],
+             cwd=blueprint_dir)
+
+    # We need to keep the name short to avoid hitting limits with certificates.
+    uid = datetime.datetime.now().strftime("%m%d") + "-"
+    uid = uid + uuid.uuid4().hex[0:3]
+
+    name = name.format(uid=uid)
+    logging.info("Using name %s", name)
+
+    values = {
+        "name": name,
+        "gcloud.core.project": project,
+        "gcloud.compute.zone": zone,
+        "location": location,
+    }
+
+    for subdir in ["./upstream/manifests/gcp", "./instance"]:
+      for k, v in values.items():
+        util.run(["kpt", "cfg", "set", subdir, k, v],
+                 cwd=blueprint_dir)
+
+    # TODO(jlewi): We should add an expiration time; either as a label
+    # or as an annotation.
+    # GCP labels can only take as input alphanumeric characters, hyphens, and
+    # underscores. Replace invalid characters with hyphens.
+    labels = {"purpose": "kf-test-cluster",
+              "auto-deploy": "true",}
+
+    kustomization_file = os.path.join(blueprint_dir, "instance", "gcp_config",
+                                      "kustomization.yaml")
+
+    add_common_labels(kustomization_file, labels)
+
+    oauth_info = get_oauth(project, oauth_file)
+
+    env = {}
+    env.update(os.environ)
+    env.update(oauth_info)
+
+    # To work around various bugs in our manifests that can be fixed by
+    # retrying we see if a particular error occurs and then retry.
+    # As these issues are fixed we should remove the retries.
+    retryable_errors = [
+        # TODO(https://github.com/kubeflow/manifests/issues/1149):
+        # Once this is fixed we should be able to remove this.
+        re.compile(".*no matches for kind \"Application\" in version "
+                   "\"app.k8s.io/v1beta1\""),
+    ]
+
+    # The total time to wait needs to take into account the actual time
+    # it takes to run otherwise we won't retry.
+    total_time = datetime.timedelta(minutes=30)
+
+    def is_retryable_esception(exception):
+      """Return True if we should retry, False otherwise"""
+
+      if not isinstance(exception, subprocess.CalledProcessError):
+        return False
+
+      for m in retryable_errors:
+        if m.search(exception.output):
+          logging.warning("make apply failed with retryable error. 
The " + f"output matched regex: {m.pattern}") + return True + + return False + + @retrying.retry(stop_max_delay=total_time.total_seconds() * 1000, + retry_on_exception=is_retryable_esception) + def run_apply(): + util.run(["make", "apply"], cwd=blueprint_dir, env=env) + + run_apply() + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, + format=('%(levelname)s|%(asctime)s' + '|%(pathname)s|%(lineno)d| %(message)s'), + datefmt='%Y-%m-%dT%H:%M:%S', + ) + logging.getLogger().setLevel(logging.INFO) + try: + fire.Fire(BlueprintRunner) + except subprocess.CalledProcessError as e: + logging.error(f"Subprocess exited with error; output:\n{e.output}") + raise diff --git a/py/kubeflow/testing/create_kf_instance.py b/py/kubeflow/testing/create_kf_instance.py index e0836f0656d..7703f9e994f 100644 --- a/py/kubeflow/testing/create_kf_instance.py +++ b/py/kubeflow/testing/create_kf_instance.py @@ -12,6 +12,8 @@ import os import re import requests +import retrying + import shutil import subprocess import tempfile diff --git a/test-infra/management/instance/cnrm-install-kubeflow-ci-deployment/per-namespace-components.yaml b/test-infra/management/instance/cnrm-install-kubeflow-ci-deployment/per-namespace-components.yaml index bc5afda8426..517fc71ca30 100644 --- a/test-infra/management/instance/cnrm-install-kubeflow-ci-deployment/per-namespace-components.yaml +++ b/test-infra/management/instance/cnrm-install-kubeflow-ci-deployment/per-namespace-components.yaml @@ -8,7 +8,7 @@ kind: ServiceAccount metadata: annotations: cnrm.cloud.google.com/version: 1.7.1 - iam.gke.io/gcp-service-account: cnrm-kf-ci-deployment@kubeflow-ci-deployment.iam.gserviceaccount.com # {"type":"string","x-kustomize":{"partialSetters":[{"name":"managed_project","value":"kubeflow-ci-deployment"},{"name":"managed_gsa_name","value":"cnrm-kf-ci-deployment"}]}} + iam.gke.io/gcp-service-account: kubeflow-testing@kubeflow-ci.iam.gserviceaccount.com # 
{"type":"string","x-kustomize":{"partialSetters":[{"name":"managed_project","value":"kubeflow-ci-deployment"},{"name":"managed_gsa_name","value":"cnrm-kf-ci-deployment"}]}} labels: cnrm.cloud.google.com/scoped-namespace: kubeflow-ci-deployment # {"type":"string","x-kustomize":{"partialSetters":[{"name":"managed_project","value":"kubeflow-ci-deployment"}]}} cnrm.cloud.google.com/system: "true" @@ -24,7 +24,7 @@ metadata: cnrm.cloud.google.com/scoped-namespace: kubeflow-ci-deployment # {"type":"string","x-kustomize":{"partialSetters":[{"name":"managed_project","value":"kubeflow-ci-deployment"}]}} cnrm.cloud.google.com/system: "true" name: cnrm-admin-binding-kubeflow-ci-deployment # {"type":"string","x-kustomize":{"partialSetters":[{"name":"managed_project","value":"kubeflow-ci-deployment"}]}} - namespace: cnrm-system + namespace: kubeflow-ci-deployment # {"type":"string","x-kustomize":{"partialSetters":[{"name":"managed_project","value":"kubeflow-ci-deployment"}]}} roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole @@ -43,7 +43,7 @@ metadata: cnrm.cloud.google.com/scoped-namespace: kubeflow-ci-deployment # {"type":"string","x-kustomize":{"partialSetters":[{"name":"managed_project","value":"kubeflow-ci-deployment"}]}} cnrm.cloud.google.com/system: "true" name: cnrm-manager-ns-binding-kubeflow-ci-deployment # {"type":"string","x-kustomize":{"partialSetters":[{"name":"managed_project","value":"kubeflow-ci-deployment"}]}} - namespace: cnrm-system + namespace: kubeflow-ci-deployment # {"type":"string","x-kustomize":{"partialSetters":[{"name":"managed_project","value":"kubeflow-ci-deployment"}]}} roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole @@ -143,7 +143,7 @@ spec: spec: containers: - args: - - --scoped-namespace=kubeflow-ci # {"type":"string","x-kustomize":{"partialSetters":[{"name":"host_project","value":"kubeflow-ci"}]}} + - --scoped-namespace=kubeflow-ci-deployment # 
{"type":"string","x-kustomize":{"partialSetters":[{"name":"managed_project","value":"kubeflow-ci-deployment"}]}}
         - --stderrthreshold=INFO
         - --prometheus-scrape-endpoint=:8888
         command:
diff --git a/test-infra/management/instance/cnrm-install-kubeflow-ci-deployment/service_account.yaml b/test-infra/management/instance/cnrm-install-kubeflow-ci-deployment/service_account.yaml
index b7d74472c75..ef805b3b156 100644
--- a/test-infra/management/instance/cnrm-install-kubeflow-ci-deployment/service_account.yaml
+++ b/test-infra/management/instance/cnrm-install-kubeflow-ci-deployment/service_account.yaml
@@ -1,49 +1,14 @@
-# Define the Google Service Account to be used with CNRM
-# in the management cluster.
-# Also define the workload identity binding.
+# This file exists as a placeholder.
+#
+# To set up the appropriate GSA in our test infra management cluster
+# we deviated from the GCP Kubeflow blueprint.
 #
-# These resources should be created with AnthosCLI since we
-# need to bootstrap the management cluster.
-apiVersion: iam.cnrm.cloud.google.com/v1beta1
-kind: IAMServiceAccount
-metadata:
-  name: cnrm-kf-ci-deployment # {"type":"string","x-kustomize":{"partialSetters":[{"name":"managed_gsa_name","value":"cnrm-kf-ci-deployment"}]}}
-  namespace: "kubeflow-ci-deployment" # {"type":"string","x-kustomize":{"setBy":"kpt","setter":{"name":"managed_project","value":"kubeflow-ci-deployment"}}}
-spec:
-  displayName: Service account for CNRM
----
-# TODO(jlewi): Switch to IAMPolicyMember once anthos CLI supports that.
-apiVersion: iam.cnrm.cloud.google.com/v1alpha1 -kind: IAMPolicy -metadata: - name: cnrm-system-kubeflow-ci-deployment-wi # {"type":"string","x-kustomize":{"partialSetters":[{"name":"managed_project","value":"kubeflow-ci-deployment"}]}} - namespace: "kubeflow-ci-deployment" # {"type":"string","x-kustomize":{"setBy":"kpt","setter":{"name":"managed_project","value":"kubeflow-ci-deployment"}}} -spec: - resourceRef: - apiVersion: iam.cnrm.cloud.google.com/v1alpha1 - kind: IAMServiceAccount - name: cnrm-kf-ci-deployment # {"type":"string","x-kustomize":{"partialSetters":[{"name":"managed_gsa_name","value":"cnrm-kf-ci-deployment"}]}} - bindings: - - role: roles/iam.workloadIdentityUser - members: - # We use a partial setter for the entire workload id pool; i.e. ${PROJECT}.svc.id.goog - # Because in the case where the managed project and host project are the same setters - # for host and managed project would be the same and kpt would no longer know which field goes where. - - serviceAccount:kubeflow-ci.svc.id.goog[cnrm-system/cnrm-controller-manager-kubeflow-ci-deployment] # {"type":"string","x-kustomize":{"setBy":"kpt","partialSetters":[{"name":"host_id_pool","value":"kubeflow-ci.svc.id.goog"},{"name":"managed_project","value":"kubeflow-ci-deployment"}]}} ---- -# Make the GCP SA a project owner -# TODO(jlewi): AnthosCLI doesn't appear to support IAMPolicy Member yet so -# as a work around you will need to use gcloud - -apiVersion: iam.cnrm.cloud.google.com/v1beta1 -kind: IAMPolicyMember -metadata: - name: cnrm-system-kubeflow-ci-deployment-owner # {"type":"string","x-kustomize":{"partialSetters":[{"name":"managed_project","value":"kubeflow-ci-deployment"}]}} - namespace: "kubeflow-ci-deployment" # {"type":"string","x-kustomize":{"setBy":"kpt","setter":{"name":"managed_project","value":"kubeflow-ci-deployment"}}} -spec: - member: serviceAccount:cnrm-kf-ci-deployment@kubeflow-ci-deployment.iam.gserviceaccount.com # 
{"type":"string","x-kustomize":{"partialSetters":[{"name":"managed_project","value":"kubeflow-ci-deployment"},{"name":"managed_gsa_name","value":"cnrm-kf-ci-deployment"}]}} - role: roles/owner - resourceRef: - apiVersion: resourcemanager.cnrm.cloud.google.com/v1beta1 - kind: Project - external: projects/kubeflow-ci-deployment # {"type":"string","x-kustomize":{"partialSetters":[{"name":"managed_project","value":"kubeflow-ci-deployment"}]}} +# We used the service account "kubeflow-testing@kubeflow-ci.iam.gserviceaccount.com" +# so we don't need to create a namespace specific google service account. +# +# We also granted permissions to this GSA on the folder ci-projects and not on individual +# projects. Likewise we added the workload identity binding to this GSA for the +# CNRM KSA. +# +# We did that through the Cloud console and gcloud so we don't have YAML specs +# for it.