diff --git a/components/kubeflow/katib-launcher/Dockerfile b/components/kubeflow/katib-launcher/Dockerfile index 651eaae1621..81d927e559a 100644 --- a/components/kubeflow/katib-launcher/Dockerfile +++ b/components/kubeflow/katib-launcher/Dockerfile @@ -19,7 +19,7 @@ RUN apt-get update -y && \ pip install pyyaml==3.12 six==1.11.0 requests==2.18.4 grpcio gcloud google-api-python-client protobuf kubernetes && \ wget https://github.com/kubeflow/katib/archive/master.zip && unzip master.zip -ENV PYTHONPATH $PYTHONPATH:/katib-master/pkg/api/python:/katib-master/py +ENV PYTHONPATH $PYTHONPATH:/katib-master/pkg/api/v1alpha1/python:/katib-master/py ADD build /ml diff --git a/components/kubeflow/katib-launcher/OWNERS b/components/kubeflow/katib-launcher/OWNERS new file mode 100644 index 00000000000..808ae38f7a9 --- /dev/null +++ b/components/kubeflow/katib-launcher/OWNERS @@ -0,0 +1,4 @@ +approvers: + - hougangliu +reviewers: + - hougangliu diff --git a/components/kubeflow/katib-launcher/component.yaml b/components/kubeflow/katib-launcher/component.yaml index 0cbcdf4c88a..49c847f5921 100644 --- a/components/kubeflow/katib-launcher/component.yaml +++ b/components/kubeflow/katib-launcher/component.yaml @@ -5,7 +5,7 @@ inputs: - {name: Namespace, type: String, default: kubeflow, description: 'Namespace.'} - {name: Optimization type, type: String, default: minimize, description: 'Direction of optimization. minimize or maximize.'} - {name: Objective value name, type: String, description: 'Objective value name which trainer optimizes.'} -- {name: Optimization goal, type: Float, description: 'Stop studying once objectivevaluename value exceeds optimizationgoal'} +- {name: Optimization goal, type: Float, description: 'Stop studying once objectivevaluename value exceeds optimizationgoal.'} - {name: Request count, type: Integer, default: 1, description: 'Number of requests to the suggestion service.'} - {name: Metrics names, type: String, description: 'List of metric names (comma-delimited).'} - {name: Parameter configs, type: YAML, default: '', description: 'Parameter configs (YAML/JSON format).'} @@ -13,7 +13,8 @@ inputs: - {name: Worker template path, type: String, default: '', description: 'Worker spec.'} - {name: Metrics collector template path, type: String, default: '', description: 'Metrics collector spec.'} - {name: Suggestion spec, type: YAML, default: '', description: 'Suggestion spec (YAML/JSON format).'} -- {name: StudyJob timeout minutes, type: Integer, default: '10', description: 'Time in minutes to wait for the StudyJob to complete'} +- {name: StudyJob timeout minutes, type: Integer, default: '10', description: 'Time in minutes to wait for the StudyJob to complete.'} +- {name: Delete finished job, type: Boolean, default: 'True', description: 'Whether to delete the job after it is finished.'} outputs: - {name: Best parameter set, type: JSON, description: 'The parameter set of the best StudyJob trial.'} implementation: @@ -34,5 +35,6 @@ implementation: --mcollectortemplatepath, {inputValue: Metrics collector template path}, --suggestionspec, {inputValue: Suggestion spec}, --studyjobtimeoutminutes, {inputValue: StudyJob timeout minutes}, + --deleteAfterDone, {inputValue: Delete finished job}, --outputfile, {outputPath: Best parameter set}, ] diff --git a/components/kubeflow/katib-launcher/kubeflow_katib_launcher_op.py b/components/kubeflow/katib-launcher/kubeflow_katib_launcher_op.py index 5a0a931d2ee..8db12e933ec 100644 --- a/components/kubeflow/katib-launcher/kubeflow_katib_launcher_op.py +++ b/components/kubeflow/katib-launcher/kubeflow_katib_launcher_op.py @@ -16,7 +16,7 @@ def kubeflow_studyjob_launcher_op(name, namespace, optimizationtype, objectivevaluename, optimizationgoal, requestcount, metricsnames, parameterconfigs, nasConfig, workertemplatepath, mcollectortemplatepath, suggestionspec, - studyjob_timeout_minutes, output_file='/output.txt', step_name='StudyJob-Launcher'): + studyjob_timeout_minutes, delete=True, output_file='/output.txt', step_name='StudyJob-Launcher'): return dsl.ContainerOp( name = step_name, image = 'liuhougangxa/ml-pipeline-kubeflow-studyjob:latest', @@ -34,6 +34,7 @@ def kubeflow_studyjob_launcher_op(name, namespace, optimizationtype, objectiveva "--mcollectortemplatepath", mcollectortemplatepath, "--suggestionspec", suggestionspec, "--outputfile", output_file, + "--deleteAfterDone", delete, '--studyjobtimeoutminutes', studyjob_timeout_minutes, ], file_outputs = {'hyperparameter': output_file} diff --git a/components/kubeflow/katib-launcher/src/launch_study_job.py b/components/kubeflow/katib-launcher/src/launch_study_job.py index 1def4b51be2..8f901a00d82 100644 --- a/components/kubeflow/katib-launcher/src/launch_study_job.py +++ b/components/kubeflow/katib-launcher/src/launch_study_job.py @@ -13,6 +13,7 @@ # limitations under the License. import argparse import datetime +from distutils.util import strtobool import json import os import logging @@ -121,6 +122,9 @@ def main(argv=None): parser.add_argument('--outputfile', type=str, default='/output.txt', help='The file which stores the best trial of the studyJob.') + parser.add_argument('--deleteAfterDone', type=strtobool, + default=True, + help='When studyjob done, delete the studyjob automatically if it is True.') parser.add_argument('--studyjobtimeoutminutes', type=int, default=10, help='Time in minutes to wait for the StudyJob to complete') @@ -129,7 +133,6 @@ def main(argv=None): logging.getLogger().setLevel(logging.INFO) - logging.info('Generating studyjob template.') template_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'hp.template.yaml') content_yaml = _generate_studyjob_yaml(template_file, args.name, args.namespace, args.optimizationtype, args.objectivevaluename, @@ -157,8 +160,8 @@ def main(argv=None): f.write(json.dumps(ps_dict)) if succ: logging.info('Study success.') - - study_job_client.delete_study_job(api_client, job_name, job_namespace) + if args.deleteAfterDone: + study_job_client.delete_study_job(api_client, job_name, job_namespace) if __name__== "__main__": main()