Skip to content

Commit

Permalink
add katib studyjob launcher (kubeflow#754)
Browse files Browse the repository at this point in the history
* add katib studyjob launcher

* delete tmp file

* fix link error to tf-launcher

* import studyjob client from katib project

* specify output file with a parameter
undo tf-launcher
  • Loading branch information
hougangliu authored and k8s-ci-robot committed Mar 4, 2019
1 parent 6ed804b commit 7737025
Show file tree
Hide file tree
Showing 6 changed files with 355 additions and 0 deletions.
29 changes: 29 additions & 0 deletions components/kubeflow/katib-launcher/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Copyright 2019 The Kubeflow Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Image for the Katib StudyJob launcher.
FROM ubuntu:16.04

# Install Python tooling, the launcher's pip dependencies and the Katib
# sources in one layer. The downloaded archive and the apt package lists are
# removed in the same layer so they do not bloat the final image.
# NOTE(review): the Katib archive tracks the moving `master` branch, so image
# builds are not reproducible; consider pinning a tag or commit.
RUN apt-get update -y && \
    apt-get install --no-install-recommends -y -q ca-certificates python-dev python-setuptools wget unzip git && \
    easy_install pip && \
    pip install pyyaml==3.12 six==1.11.0 requests==2.18.4 grpcio gcloud google-api-python-client protobuf kubernetes && \
    wget https://github.com/kubeflow/katib/archive/master.zip && unzip master.zip && \
    rm master.zip && \
    rm -rf /var/lib/apt/lists/*

# Put the unpacked Katib Python directories on the import path; presumably
# this is where the launcher's api_pb2 / study_job_client imports resolve.
ENV PYTHONPATH $PYTHONPATH:/katib-master/pkg/api/python:/katib-master/py

# Copy the build context staged in ./build (launcher sources, license.sh and
# third_party_licenses.csv). COPY is used because no archive extraction or
# remote fetch is needed.
COPY build /ml

RUN mkdir /usr/licenses && \
    /ml/license.sh /ml/third_party_licenses.csv /usr/licenses

ENTRYPOINT ["python", "/ml/launch_study_job.py"]
58 changes: 58 additions & 0 deletions components/kubeflow/katib-launcher/build_image.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/bin/bash -e
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Parse command-line options. The leading ':' in the optstring selects silent
# error reporting, so missing arguments (':') and unknown options ('?') are
# reported by the cases below.
while getopts ":hp:t:i:" opt; do
  case "${opt}" in
    h) echo "-p: project name"
       echo "-t: tag name"
       echo "-i: image name. If provided, project name and tag name are not necessary"
       exit
      ;;
    p) PROJECT_ID=${OPTARG}
      ;;
    t) TAG_NAME=${OPTARG}
      ;;
    i) LAUNCHER_IMAGE_NAME=${OPTARG}
      ;;
    :) echo "Option -${OPTARG} requires an argument" >&2
       exit 1
      ;;
    \? ) echo "Usage: cmd [-p] project [-t] tag [-i] image" >&2
       exit 1
      ;;
  esac
done

# Stage the Docker build context. The trap removes it on every exit path, so
# a failing command (the shebang requests `-e`) does not leave ./build behind.
mkdir -p ./build
trap 'rm -rf ./build' EXIT
rsync -arvp ./src/ ./build/

cp ../../license.sh ./build
cp ../../third_party_licenses.csv ./build

LOCAL_LAUNCHER_IMAGE_NAME=ml-pipeline-kubeflow-studyjob

docker build -t "${LOCAL_LAUNCHER_IMAGE_NAME}" .
if [ -z "${LAUNCHER_IMAGE_NAME}" ]; then
  # No explicit image name: derive gcr.io/<project>/<local name>:<tag>.
  if [ -z "${TAG_NAME}" ]; then
    # Default tag: build date, git description, and a short hash of the
    # uncommitted diff.
    TAG_NAME="$(date +v%Y%m%d)-$(git describe --tags --always --dirty)-$(git diff | shasum -a256 | cut -c -6)"
  fi
  if [ -z "${PROJECT_ID}" ]; then
    PROJECT_ID="$(gcloud config config-helper --format "value(configuration.properties.core.project)")"
  fi
  docker tag "${LOCAL_LAUNCHER_IMAGE_NAME}" "gcr.io/${PROJECT_ID}/${LOCAL_LAUNCHER_IMAGE_NAME}:${TAG_NAME}"
  docker push "gcr.io/${PROJECT_ID}/${LOCAL_LAUNCHER_IMAGE_NAME}:${TAG_NAME}"
else
  docker tag "${LOCAL_LAUNCHER_IMAGE_NAME}" "${LAUNCHER_IMAGE_NAME}"
  docker push "${LAUNCHER_IMAGE_NAME}"
fi
40 changes: 40 additions & 0 deletions components/kubeflow/katib-launcher/kubeflow_katib_launcher_op.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import kfp.dsl as dsl

def kubeflow_studyjob_launcher_op(name, namespace, optimizationtype, objectivevaluename, optimizationgoal, requestcount, metricsnames,
                                  parameterconfigs, nasConfig, workertemplatepath, mcollectortemplatepath, suggestionspec,
                                  studyjob_timeout_minutes, output_file='/output.txt', step_name='StudyJob-Launcher',
                                  image='liuhougangxa/ml-pipeline-kubeflow-studyjob:latest'):
    """Build the pipeline step that runs the Katib StudyJob launcher container.

    Each study-related argument is forwarded unchanged to the container as
    the command-line flag of the same name.

    Args:
        name: Value for ``--name``.
        namespace: Value for ``--namespace``.
        optimizationtype: Value for ``--optimizationtype``.
        objectivevaluename: Value for ``--objectivevaluename``.
        optimizationgoal: Value for ``--optimizationgoal``.
        requestcount: Value for ``--requestcount``.
        metricsnames: Value for ``--metricsnames``.
        parameterconfigs: Value for ``--parameterconfigs``.
        nasConfig: Value for ``--nasConfig``.
        workertemplatepath: Value for ``--workertemplatepath``.
        mcollectortemplatepath: Value for ``--mcollectortemplatepath``.
        suggestionspec: Value for ``--suggestionspec``.
        studyjob_timeout_minutes: Value for ``--studyjobtimeoutminutes``.
        output_file: Path passed as ``--outputfile``; the same path is
            exposed as the step's ``hyperparameter`` file output.
        step_name: Name given to the pipeline step.
        image: Container image that runs the launcher. Defaults to the image
            this op has always used, so existing callers are unaffected.

    Returns:
        The ``dsl.ContainerOp`` for the launcher step.
    """
    launcher_arguments = [
        '--name', name,
        '--namespace', namespace,
        "--optimizationtype", optimizationtype,
        "--objectivevaluename", objectivevaluename,
        "--optimizationgoal", optimizationgoal,
        "--requestcount", requestcount,
        "--metricsnames", metricsnames,
        "--parameterconfigs", parameterconfigs,
        "--nasConfig", nasConfig,
        "--workertemplatepath", workertemplatepath,
        "--mcollectortemplatepath", mcollectortemplatepath,
        "--suggestionspec", suggestionspec,
        "--outputfile", output_file,
        '--studyjobtimeoutminutes', studyjob_timeout_minutes,
    ]
    return dsl.ContainerOp(
        name=step_name,
        image=image,
        arguments=launcher_arguments,
        file_outputs={'hyperparameter': output_file}
    )
15 changes: 15 additions & 0 deletions components/kubeflow/katib-launcher/src/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .kubeflow_katib_launcher_op import kubeflow_studyjob_launcher_op
49 changes: 49 additions & 0 deletions components/kubeflow/katib-launcher/src/hp.template.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# StudyJob manifest template read by launch_study_job.py.
# The launcher overwrites metadata.name, metadata.namespace, spec.studyName,
# spec.optimizationtype, spec.objectivevaluename, spec.optimizationgoal and
# spec.requestcount. It replaces parameterconfigs, nasConfig, metricsnames and
# suggestionSpec when a value is supplied and removes them otherwise. The
# workerSpec / metricsCollectorSpec templatePath is set when a path is given;
# otherwise the whole section is removed. Values below are placeholders.
apiVersion: "kubeflow.org/v1alpha1"
kind: StudyJob
metadata:
namespace: kubeflow
labels:
controller-tools.k8s.io: "1.0"
name: study-example
spec:
studyName: study-example
owner: crd
optimizationtype: ""
objectivevaluename: ""
optimizationgoal: 0.99
requestcount: 4
metricsnames:
- accuracy_1
nasConfig:
graphConfig:
numLayers: 8
inputSize:
- 32
- 32
- 3
outputSize:
- 10
operations:
- operationType: convolution
parameterconfigs:
- name: filter_size
parametertype: categorical
feasible:
list:
- "3"
- "5"
- "7"
parameterconfigs:
- name: --learning_rate
parametertype: double
feasible:
min: "0.01"
max: "0.05"
workerSpec:
goTemplate:
templatePath: ""
metricsCollectorSpec:
goTemplate:
templatePath: ""
suggestionSpec:
suggestionAlgorithm: "random"
164 changes: 164 additions & 0 deletions components/kubeflow/katib-launcher/src/launch_study_job.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import datetime
import json
import os
import logging
import requests
import subprocess
import yaml
import grpc

import api_pb2
import api_pb2_grpc

from kubernetes import client as k8s_client
from kubernetes import config
import study_job_client

def yamlOrJsonStr(str):
    """Parse a command-line string as JSON, falling back to YAML.

    Used as an argparse ``type=`` converter.

    Args:
        str: Raw argument text. The name shadows the builtin; it is kept so
            the function's signature stays unchanged for existing callers.

    Returns:
        None when ``str`` is None or empty; otherwise the object decoded by
        ``json.loads`` or, when the text is not valid JSON, by ``yaml.load``.
    """
    if str is None or str == "":
        return None
    try:
        return json.loads(str)
    except ValueError:
        # Invalid JSON (json.JSONDecodeError is a ValueError subclass): try
        # YAML instead. Catching only ValueError keeps KeyboardInterrupt and
        # unrelated errors from being swallowed.
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary Python objects from tagged input; this text comes from
        # the command line, so consider switching to yaml.safe_load.
        return yaml.load(str)

def strToList(str):
    """Split a comma-separated string into a list of trimmed, non-empty items.

    Used as an argparse ``type=`` converter.

    Args:
        str: Comma-separated text, e.g. ``"accuracy, loss"``. The name
            shadows the builtin; it is kept so the signature is unchanged.

    Returns:
        The items with surrounding whitespace removed. Empty items are
        dropped, so an empty string gives ``[]`` rather than ``[""]`` and a
        caller's truthiness check can treat "nothing supplied" as empty.
    """
    trimmed = (item.strip() for item in str.split(","))
    return [item for item in trimmed if item]

def _update_or_pop(spec, name, value):
if value:
spec[name] = value
else:
spec.pop(name)

def _generate_studyjob_yaml(src_filename, name, namespace, optimizationtype, objectivevaluename, optimizationgoal, requestcount,
                            metricsnames, parameterconfigs, nasConfig, workertemplatepath, mcollectortemplatepath, suggestionspec):
    """Build a StudyJob manifest dict from the YAML template at ``src_filename``.

    Args:
        src_filename: Path to the template file (hp.template.yaml).
        name: Written to ``metadata.name`` and ``spec.studyName``.
        namespace: Written to ``metadata.namespace``.
        optimizationtype: Written to ``spec.optimizationtype``.
        objectivevaluename: Written to ``spec.objectivevaluename``.
        optimizationgoal: Written to ``spec.optimizationgoal``.
        requestcount: Written to ``spec.requestcount``.
        metricsnames: Replaces ``spec.metricsnames``; the key is removed
            when the value is falsy.
        parameterconfigs: Replaces ``spec.parameterconfigs``; removed when
            falsy.
        nasConfig: Replaces ``spec.nasConfig``; removed when falsy.
        workertemplatepath: Written to
            ``spec.workerSpec.goTemplate.templatePath``; when falsy the
            whole ``workerSpec`` section is removed.
        mcollectortemplatepath: Written to
            ``spec.metricsCollectorSpec.goTemplate.templatePath``; when
            falsy the whole ``metricsCollectorSpec`` section is removed.
        suggestionspec: Replaces ``spec.suggestionSpec``; removed when falsy.

    Returns:
        The modified manifest as a dict.
    """
    with open(src_filename, 'r') as f:
        # The template is plain YAML data, so safe_load yields the same
        # structure as yaml.load. Calling yaml.load without a Loader is
        # deprecated and rejected by newer PyYAML releases.
        content = yaml.safe_load(f)

    metadata = content['metadata']
    spec = content['spec']

    metadata['name'] = name
    metadata['namespace'] = namespace
    spec['studyName'] = name
    spec['optimizationtype'] = optimizationtype
    spec['objectivevaluename'] = objectivevaluename
    spec['optimizationgoal'] = optimizationgoal
    spec['requestcount'] = requestcount

    _update_or_pop(spec, 'parameterconfigs', parameterconfigs)
    _update_or_pop(spec, 'nasConfig', nasConfig)
    _update_or_pop(spec, 'metricsnames', metricsnames)
    _update_or_pop(spec, 'suggestionSpec', suggestionspec)

    if workertemplatepath:
        spec['workerSpec']['goTemplate']['templatePath'] = workertemplatepath
    else:
        # Tolerate a template that already omits the section.
        spec.pop('workerSpec', None)

    if mcollectortemplatepath:
        spec['metricsCollectorSpec']['goTemplate']['templatePath'] = mcollectortemplatepath
    else:
        spec.pop('metricsCollectorSpec', None)

    return content

def get_best_trial(trial_id, vizier_core="vizier-core.kubeflow:6789"):
    """Fetch a trial by id from the vizier-core manager over gRPC.

    Args:
        trial_id: Id of the trial to fetch; the launcher passes the
            StudyJob's ``bestTrialId``.
        vizier_core: ``host:port`` address of the manager service. Defaults
            to the address this function previously hard-coded.

    Returns:
        The ``trial`` field of the ``GetTrial`` response.
    """
    # The channel is opened without transport security and is closed when
    # the ``with`` block exits.
    with grpc.insecure_channel(vizier_core) as channel:
        stub = api_pb2_grpc.ManagerStub(channel)
        response = stub.GetTrial(api_pb2.GetTrialRequest(trial_id=trial_id))
        return response.trial

def main(argv=None):
    """Create a StudyJob, wait for it to finish and record the best trial.

    The job manifest is generated from ``hp.template.yaml`` next to this
    script, submitted through ``study_job_client``, and awaited until its
    condition is ``Completed`` or ``Failed``. On completion the parameters of
    the trial named by ``status.bestTrialId`` are written to ``--outputfile``
    as a JSON object mapping parameter name to value. The StudyJob is deleted
    afterwards, including when waiting or trial retrieval raises.

    Args:
        argv: Optional list of command-line arguments. When None, argparse
            reads them from ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description='Kubeflow StudyJob launcher')
    parser.add_argument('--name', type=str,
                        help='StudyJob name.')
    parser.add_argument('--namespace', type=str,
                        default='kubeflow',
                        help='StudyJob namespace.')
    parser.add_argument('--optimizationtype', type=str,
                        default='minimize',
                        help='Direction of optimization. minimize or maximize.')
    parser.add_argument('--objectivevaluename', type=str,
                        help='Objective value name which trainer optimizes.')
    parser.add_argument('--optimizationgoal', type=float,
                        help='Stop studying once objectivevaluename value ' +
                             'exceeds optimizationgoal')
    parser.add_argument('--requestcount', type=int,
                        default=1,
                        help='The times asking request to suggestion service.')
    parser.add_argument('--metricsnames', type=strToList,
                        help='StudyJob metrics name list.')
    parser.add_argument('--parameterconfigs', type=yamlOrJsonStr,
                        default={},
                        help='StudyJob parameterconfigs.')
    parser.add_argument('--nasConfig', type=yamlOrJsonStr,
                        default={},
                        help='StudyJob nasConfig.')
    parser.add_argument('--workertemplatepath', type=str,
                        default="",
                        help='StudyJob worker spec.')
    parser.add_argument('--mcollectortemplatepath', type=str,
                        default="",
                        help='StudyJob metrics collector spec.')
    parser.add_argument('--suggestionspec', type=yamlOrJsonStr,
                        default={},
                        help='StudyJob suggestion spec.')
    parser.add_argument('--outputfile', type=str,
                        default='/output.txt',
                        help='The file which stores the best trial of the studyJob.')
    parser.add_argument('--studyjobtimeoutminutes', type=int,
                        default=10,
                        help='Time in minutes to wait for the StudyJob to complete')

    # Honor the argv parameter; passing None keeps the sys.argv behavior.
    args = parser.parse_args(argv)

    logging.getLogger().setLevel(logging.INFO)

    logging.info('Generating studyjob template.')
    template_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'hp.template.yaml')
    content_yaml = _generate_studyjob_yaml(template_file, args.name, args.namespace, args.optimizationtype, args.objectivevaluename,
                                           args.optimizationgoal, args.requestcount, args.metricsnames, args.parameterconfigs,
                                           args.nasConfig, args.workertemplatepath, args.mcollectortemplatepath, args.suggestionspec)

    config.load_incluster_config()
    api_client = k8s_client.ApiClient()
    create_response = study_job_client.create_study_job(api_client, content_yaml)
    job_name = create_response['metadata']['name']
    job_namespace = create_response['metadata']['namespace']

    try:
        expected_condition = ["Completed", "Failed"]
        wait_response = study_job_client.wait_for_condition(
            api_client, job_namespace, job_name, expected_condition,
            timeout=datetime.timedelta(minutes=args.studyjobtimeoutminutes))
        condition = wait_response.get("status", {}).get("condition")
        if condition == "Completed":
            trial = get_best_trial(wait_response["status"]["bestTrialId"])
            ps_dict = {ps.name: ps.value for ps in trial.parameter_set}
            with open(args.outputfile, 'w') as f:
                f.write(json.dumps(ps_dict))
            logging.info('Study success.')
        else:
            # Previously a non-completed study ended silently; make the
            # reason for the missing output file visible in the logs.
            logging.error('Study did not complete; final condition: %s', condition)
    finally:
        # Always clean up the StudyJob that was created above, even when
        # waiting or fetching the best trial raised.
        study_job_client.delete_study_job(api_client, job_name, job_namespace)

# Script entry point: run the launcher only when executed directly.
if __name__ == "__main__":
    main()

0 comments on commit 7737025

Please sign in to comment.