add katib studyjob launcher (kubeflow#754)

* add katib studyjob launcher * delete tmp file * fix link error to tf-launcher * import studyjob client from katib project * specify output file with a parameter undo tf-launcher
alibaba · Mar 4, 2019 · 7737025 · 7737025
1 parent 6ed804b
commit 7737025
Show file tree

Hide file tree

Showing 6 changed files with 355 additions and 0 deletions.
diff --git a/components/kubeflow/katib-launcher/Dockerfile b/components/kubeflow/katib-launcher/Dockerfile
@@ -0,0 +1,29 @@
+# Copyright 2019 The Kubeflow Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+FROM ubuntu:16.04
+
+# Install OS packages and the launcher's Python dependencies, then download and
+# unpack the Katib sources whose Python directories are put on PYTHONPATH below.
+# NOTE(review): master.zip follows the tip of Katib's master branch, so the
+# image contents can differ between builds - consider pinning a release or commit.
+RUN apt-get update -y && \
+    apt-get install --no-install-recommends -y -q ca-certificates python-dev python-setuptools wget unzip git && \
+    easy_install pip && \
+    pip install pyyaml==3.12 six==1.11.0 requests==2.18.4 grpcio gcloud google-api-python-client protobuf kubernetes && \
+    wget https://github.com/kubeflow/katib/archive/master.zip && unzip master.zip
+
+# Expose the unpacked Katib Python directories. Presumably these supply the
+# api_pb2, api_pb2_grpc and study_job_client modules imported by
+# launch_study_job.py - TODO confirm against the Katib tree.
+ENV PYTHONPATH $PYTHONPATH:/katib-master/pkg/api/python:/katib-master/py
+
+# build/ is assembled by build_image.sh from ./src plus license.sh and
+# third_party_licenses.csv.
+ADD build /ml
+
+# Run the license script with the CSV and the target directory as arguments
+# (what the script does with them is not visible in this change).
+RUN mkdir /usr/licenses && \
+    /ml/license.sh /ml/third_party_licenses.csv /usr/licenses
+
+# The container runs the StudyJob launcher; its flags come from the caller.
+ENTRYPOINT ["python", "/ml/launch_study_job.py"]
diff --git a/components/kubeflow/katib-launcher/build_image.sh b/components/kubeflow/katib-launcher/build_image.sh
@@ -0,0 +1,58 @@
+#!/bin/bash -e
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Parse command-line options. The leading ':' in the optstring selects getopts'
+# silent mode: an unknown option yields '?', and an option that is missing its
+# argument yields ':' with the option letter in OPTARG.
+while getopts ":hp:t:i:" opt; do
+  case "${opt}" in
+    h) echo "-p: project name"
+       echo "-t: tag name"
+       echo "-i: image name. If provided, project name and tag name are not necessary"
+       exit
+      ;;
+    p) PROJECT_ID=${OPTARG}
+      ;;
+    t) TAG_NAME=${OPTARG}
+      ;;
+    i) LAUNCHER_IMAGE_NAME=${OPTARG}
+      ;;
+    # Previously unhandled: a missing argument was silently ignored.
+    :) echo "Option -${OPTARG} requires an argument" >&2
+      exit 1
+      ;;
+    # Report bad usage on stderr and with a non-zero status so callers notice.
+    \? ) echo "Usage: cmd [-p] project [-t] tag [-i] image" >&2
+      exit 1
+      ;;
+  esac
+done
+
+# Stage the Docker build context: the launcher sources plus the shared license
+# script and CSV from two directories up. The Dockerfile ADDs ./build as /ml.
+mkdir -p ./build
+rsync -arvp ./src/ ./build/
+
+cp ../../license.sh ./build
+cp ../../third_party_licenses.csv ./build
+
+# Name used for the locally built image and as the repository name under gcr.io.
+LOCAL_LAUNCHER_IMAGE_NAME=ml-pipeline-kubeflow-studyjob
+
+docker build -t ${LOCAL_LAUNCHER_IMAGE_NAME} .
+if [ -z "${LAUNCHER_IMAGE_NAME}" ]; then
+  # No explicit image name (-i): push to gcr.io/<project>/<local name>:<tag>.
+  if [ -z "${TAG_NAME}" ]; then
+    # Default tag: date, `git describe` output, and the first 6 hex characters
+    # of the SHA-256 of the current `git diff`.
+    TAG_NAME=$(date +v%Y%m%d)-$(git describe --tags --always --dirty)-$(git diff | shasum -a256 | cut -c -6)
+  fi
+  if [ -z "${PROJECT_ID}" ]; then
+    # Default project: the one configured in the local gcloud settings.
+    PROJECT_ID=$(gcloud config config-helper --format "value(configuration.properties.core.project)")
+  fi
+  docker tag ${LOCAL_LAUNCHER_IMAGE_NAME} gcr.io/${PROJECT_ID}/${LOCAL_LAUNCHER_IMAGE_NAME}:${TAG_NAME}
+  docker push gcr.io/${PROJECT_ID}/${LOCAL_LAUNCHER_IMAGE_NAME}:${TAG_NAME}
+else
+  # Explicit image name (-i): tag and push exactly that name.
+  docker tag ${LOCAL_LAUNCHER_IMAGE_NAME} ${LAUNCHER_IMAGE_NAME}
+  docker push ${LAUNCHER_IMAGE_NAME}
+fi
+
+# Remove the staged build context.
+# NOTE(review): the shebang enables `-e`, so a failing command above aborts the
+# script before this cleanup runs and ./build is left behind.
+rm -rf ./build
diff --git a/components/kubeflow/katib-launcher/kubeflow_katib_launcher_op.py b/components/kubeflow/katib-launcher/kubeflow_katib_launcher_op.py
@@ -0,0 +1,40 @@
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import kfp.dsl as dsl
+
+def kubeflow_studyjob_launcher_op(name, namespace, optimizationtype, objectivevaluename, optimizationgoal, requestcount, metricsnames,
+                                  parameterconfigs, nasConfig, workertemplatepath, mcollectortemplatepath, suggestionspec,
+                                  studyjob_timeout_minutes, output_file='/output.txt', step_name='StudyJob-Launcher'):
+    """Build the pipeline step that runs the Katib StudyJob launcher image.
+
+    Every StudyJob setting is forwarded unchanged to the container as the
+    command-line flag of the same name (see launch_study_job.py for how each
+    flag is parsed).
+
+    Args:
+        name, namespace, optimizationtype, objectivevaluename,
+        optimizationgoal, requestcount, metricsnames, parameterconfigs,
+        nasConfig, workertemplatepath, mcollectortemplatepath,
+        suggestionspec: Values for the corresponding launcher flags.
+        studyjob_timeout_minutes: Value for --studyjobtimeoutminutes.
+        output_file: Path passed as --outputfile and registered as the
+            step's 'hyperparameter' file output.
+        step_name: Name given to the ContainerOp.
+
+    Returns:
+        The dsl.ContainerOp describing the launcher step.
+    """
+    launcher_args = [
+        '--name', name,
+        '--namespace', namespace,
+        '--optimizationtype', optimizationtype,
+        '--objectivevaluename', objectivevaluename,
+        '--optimizationgoal', optimizationgoal,
+        '--requestcount', requestcount,
+        '--metricsnames', metricsnames,
+        '--parameterconfigs', parameterconfigs,
+        '--nasConfig', nasConfig,
+        '--workertemplatepath', workertemplatepath,
+        '--mcollectortemplatepath', mcollectortemplatepath,
+        '--suggestionspec', suggestionspec,
+        '--outputfile', output_file,
+        '--studyjobtimeoutminutes', studyjob_timeout_minutes,
+    ]
+    outputs = {'hyperparameter': output_file}
+    return dsl.ContainerOp(
+        name=step_name,
+        image='liuhougangxa/ml-pipeline-kubeflow-studyjob:latest',
+        arguments=launcher_args,
+        file_outputs=outputs,
+    )
diff --git a/components/kubeflow/katib-launcher/src/__init__.py b/components/kubeflow/katib-launcher/src/__init__.py
@@ -0,0 +1,15 @@
+# Copyright 2019 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .kubeflow_katib_launcher_op import kubeflow_studyjob_launcher_op
diff --git a/components/kubeflow/katib-launcher/src/hp.template.yaml b/components/kubeflow/katib-launcher/src/hp.template.yaml
@@ -0,0 +1,49 @@
+# StudyJob template consumed by _generate_studyjob_yaml in launch_study_job.py.
+# Most values here are placeholders:
+#   - metadata.name, metadata.namespace, spec.studyName, spec.optimizationtype,
+#     spec.objectivevaluename, spec.optimizationgoal and spec.requestcount are
+#     always overwritten.
+#   - spec.metricsnames, spec.nasConfig, spec.parameterconfigs and
+#     spec.suggestionSpec are replaced when a value is supplied, otherwise removed.
+#   - spec.workerSpec / spec.metricsCollectorSpec get their templatePath filled
+#     in when a path is supplied, otherwise the whole section is removed.
+apiVersion: "kubeflow.org/v1alpha1"
+kind: StudyJob
+metadata:
+  namespace: kubeflow
+  labels:
+    controller-tools.k8s.io: "1.0"
+  name: study-example
+spec:
+  studyName: study-example
+  owner: crd
+  optimizationtype: ""
+  objectivevaluename: ""
+  optimizationgoal: 0.99
+  requestcount: 4
+  metricsnames:
+    - accuracy_1
+  nasConfig:
+    graphConfig:
+      numLayers: 8
+      inputSize:
+        - 32
+        - 32
+        - 3
+      outputSize:
+        - 10
+    operations:
+      - operationType: convolution
+        parameterconfigs:
+          - name: filter_size
+            parametertype: categorical
+            feasible:
+              list:
+              - "3"
+              - "5"
+              - "7"
+  parameterconfigs:
+    - name: --learning_rate
+      parametertype: double
+      feasible:
+        min: "0.01"
+        max: "0.05"
+  workerSpec:
+    goTemplate:
+      templatePath: ""
+  metricsCollectorSpec:
+    goTemplate:
+      templatePath: ""
+  suggestionSpec:
+    suggestionAlgorithm: "random"
diff --git a/components/kubeflow/katib-launcher/src/launch_study_job.py b/components/kubeflow/katib-launcher/src/launch_study_job.py
@@ -0,0 +1,164 @@
+# Copyright 2018 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import datetime
+import json
+import os
+import logging
+import requests
+import subprocess
+import yaml
+import grpc
+
+import api_pb2
+import api_pb2_grpc
+
+from kubernetes import client as k8s_client
+from kubernetes import config
+import study_job_client
+
+def yamlOrJsonStr(str):
+    """Parse a command-line string as JSON, falling back to YAML.
+
+    Used as an argparse ``type=`` converter for the structured StudyJob flags.
+
+    Args:
+        str: Raw argument text. The name shadows the builtin; it is kept so
+            the function's signature stays unchanged.
+
+    Returns:
+        The decoded object, or None when the input is None or empty.
+    """
+    if str is None or str == "":
+        return None
+    try:
+        return json.loads(str)
+    except (TypeError, ValueError):
+        # Not JSON (json decode errors are ValueErrors): treat it as YAML.
+        # NOTE(review): yaml.load can construct arbitrary Python objects from
+        # tagged input; switch to yaml.safe_load if this text is ever untrusted.
+        return yaml.load(str)
+
+def strToList(str):
+    """Split a comma-separated string into a list of trimmed, non-empty items.
+
+    Args:
+        str: Comma-separated text, e.g. "accuracy,loss". The name shadows the
+            builtin; it is kept so the function's signature stays unchanged.
+
+    Returns:
+        A list of the items with surrounding whitespace removed. Empty or
+        missing input gives an empty list rather than [''], so callers that
+        test the result for truthiness see "no items".
+    """
+    if not str:
+        return []
+    return [item.strip() for item in str.split(",") if item.strip()]
+
+def _update_or_pop(spec, name, value):
+    """Set spec[name] to value, or remove the key when value is falsy.
+
+    Args:
+        spec: Dict to modify in place.
+        name: Key to set or remove.
+        value: New value; None, empty containers and other falsy values mean
+            "remove the key".
+    """
+    if value:
+        spec[name] = value
+    else:
+        # The default avoids a KeyError when the key is already absent.
+        spec.pop(name, None)
+
+def _generate_studyjob_yaml(src_filename, name, namespace, optimizationtype, objectivevaluename, optimizationgoal, requestcount,
+                            metricsnames, parameterconfigs, nasConfig, workertemplatepath, mcollectortemplatepath, suggestionspec):
+  """Build a StudyJob manifest dict from the YAML template at src_filename.
+
+  The scalar settings always overwrite the template values. The structured
+  settings (parameterconfigs, nasConfig, metricsnames, suggestionSpec) replace
+  the template entry when given and remove it otherwise. The worker and
+  metrics-collector sections keep only their templatePath when a path is
+  given and are removed otherwise.
+
+  Returns:
+    The populated manifest as a dict.
+  """
+  with open(src_filename, 'r') as template:
+    studyjob = yaml.load(template)
+
+  metadata = studyjob['metadata']
+  metadata['name'] = name
+  metadata['namespace'] = namespace
+
+  spec = studyjob['spec']
+  spec['studyName'] = name
+  spec['optimizationtype'] = optimizationtype
+  spec['objectivevaluename'] = objectivevaluename
+  spec['optimizationgoal'] = optimizationgoal
+  spec['requestcount'] = requestcount
+
+  # Optional structured fields: replace the template entry or drop it.
+  optional_fields = (
+      ('parameterconfigs', parameterconfigs),
+      ('nasConfig', nasConfig),
+      ('metricsnames', metricsnames),
+      ('suggestionSpec', suggestionspec),
+  )
+  for field, value in optional_fields:
+    _update_or_pop(spec, field, value)
+
+  # Optional go-template sections: fill in the path or drop the section.
+  template_sections = (
+      ('workerSpec', workertemplatepath),
+      ('metricsCollectorSpec', mcollectortemplatepath),
+  )
+  for section, template_path in template_sections:
+    if not template_path:
+      spec.pop(section)
+    else:
+      spec[section]['goTemplate']['templatePath'] = template_path
+
+  return studyjob
+
+def get_best_trial(trial_id):
+    """Fetch the trial with the given id from the vizier-core manager.
+
+    Opens an insecure gRPC channel to the fixed address
+    "vizier-core.kubeflow:6789" and issues a GetTrial request. Despite the
+    name, no ranking happens here: the caller passes in the id of the trial
+    it wants.
+
+    Returns:
+        The ``trial`` field of the GetTrial response.
+    """
+    manager_address = "vizier-core.kubeflow:6789"
+    with grpc.insecure_channel(manager_address) as channel:
+        manager = api_pb2_grpc.ManagerStub(channel)
+        request = api_pb2.GetTrialRequest(trial_id=trial_id)
+        reply = manager.GetTrial(request)
+        return reply.trial
+
+def main(argv=None):
+  """Create a StudyJob, wait for it to finish and record the best trial.
+
+  Steps: parse the flags, fill in hp.template.yaml (located next to this
+  file), create the StudyJob with in-cluster credentials, wait until it is
+  "Completed" or "Failed" (or the timeout elapses), write the parameters of
+  the trial named by status.bestTrialId as a JSON object to --outputfile
+  when the study completed, and finally delete the StudyJob.
+
+  Args:
+    argv: Argument list to parse; None means sys.argv[1:].
+  """
+  parser = argparse.ArgumentParser(description='Kubeflow StudyJob launcher')
+  parser.add_argument('--name', type=str,
+                      help='StudyJob name.')
+  parser.add_argument('--namespace', type=str,
+                      default='kubeflow',
+                      help='StudyJob namespace.')
+  parser.add_argument('--optimizationtype', type=str,
+                      default='minimize',
+                      help='Direction of optimization. minimize or maximize.')
+  parser.add_argument('--objectivevaluename', type=str,
+                      help='Objective value name which trainer optimizes.')
+  parser.add_argument('--optimizationgoal', type=float,
+                      help='Stop studying once objectivevaluename value ' +
+                           'exceeds optimizationgoal')
+  parser.add_argument('--requestcount', type=int,
+                      default=1,
+                      help='The times asking request to suggestion service.')
+  parser.add_argument('--metricsnames', type=strToList,
+                      help='StudyJob metrics name list.')
+  parser.add_argument('--parameterconfigs', type=yamlOrJsonStr,
+                      default={},
+                      help='StudyJob parameterconfigs.')
+  parser.add_argument('--nasConfig', type=yamlOrJsonStr,
+                      default={},
+                      help='StudyJob nasConfig.')
+  parser.add_argument('--workertemplatepath', type=str,
+                      default="",
+                      help='StudyJob worker spec.')
+  parser.add_argument('--mcollectortemplatepath', type=str,
+                      default="",
+                      help='StudyJob metrics collector spec.')
+  parser.add_argument('--suggestionspec', type=yamlOrJsonStr,
+                      default={},
+                      help='StudyJob suggestion spec.')
+  parser.add_argument('--outputfile', type=str,
+                      default='/output.txt',
+                      help='The file which stores the best trial of the studyJob.')
+  parser.add_argument('--studyjobtimeoutminutes', type=int,
+                      default=10,
+                      help='Time in minutes to wait for the StudyJob to complete')
+
+  # Honor the argv parameter; parse_args(None) still reads sys.argv[1:].
+  args = parser.parse_args(argv)
+
+  logging.getLogger().setLevel(logging.INFO)
+
+  logging.info('Generating studyjob template.')
+  template_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'hp.template.yaml')
+  content_yaml = _generate_studyjob_yaml(template_file, args.name, args.namespace, args.optimizationtype, args.objectivevaluename,
+                                         args.optimizationgoal, args.requestcount, args.metricsnames, args.parameterconfigs,
+                                         args.nasConfig, args.workertemplatepath, args.mcollectortemplatepath, args.suggestionspec)
+
+  config.load_incluster_config()
+  api_client = k8s_client.ApiClient()
+  create_response = study_job_client.create_study_job(api_client, content_yaml)
+  job_name = create_response['metadata']['name']
+  job_namespace = create_response['metadata']['namespace']
+
+  expected_condition = ["Completed", "Failed"]
+  wait_response = study_job_client.wait_for_condition(
+      api_client, job_namespace, job_name, expected_condition,
+      timeout=datetime.timedelta(minutes=args.studyjobtimeoutminutes))
+
+  condition = wait_response.get("status", {}).get("condition")
+  if condition == "Completed":
+    trial = get_best_trial(wait_response["status"]["bestTrialId"])
+    best_parameters = {ps.name: ps.value for ps in trial.parameter_set}
+    with open(args.outputfile, 'w') as f:
+      f.write(json.dumps(best_parameters))
+    logging.info('Study success.')
+  else:
+    # No output file is written in this case; make the reason visible.
+    logging.error('Study did not complete; final condition: %s', condition)
+
+  # The StudyJob is removed whatever its final condition was.
+  study_job_client.delete_study_job(api_client, job_name, job_namespace)
+
+if __name__ == "__main__":
+  main()