From b1eb0e989c19a4d46980e6ce14ab1db0ffbd2913 Mon Sep 17 00:00:00 2001 From: avelichk Date: Tue, 2 Jun 2020 15:26:25 +0100 Subject: [PATCH 01/10] Add new Trial Template API --- .../experiments/v1beta1/experiment_types.go | 31 ++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/pkg/apis/controller/experiments/v1beta1/experiment_types.go b/pkg/apis/controller/experiments/v1beta1/experiment_types.go index 483010c6f3e..318be04b23e 100644 --- a/pkg/apis/controller/experiments/v1beta1/experiment_types.go +++ b/pkg/apis/controller/experiments/v1beta1/experiment_types.go @@ -19,6 +19,7 @@ import ( common "github.com/kubeflow/katib/pkg/apis/controller/common/v1beta1" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" ) type ExperimentSpec struct { @@ -187,8 +188,36 @@ type FeasibleSpace struct { } type TrialTemplate struct { - Retain bool `json:"retain,omitempty"` + // Retain indicates that Trial resources must be not cleanup + Retain bool `json:"retain,omitempty"` + GoTemplate *GoTemplate `json:"goTemplate,omitempty"` + + // List of parameres that are used in Trial template + TrialParameters []TrialParameterSpec `json:"trialParameters,omitempty"` + + // Trial spec contains Trial template in unstructured format + TrialSpec *unstructured.Unstructured `json:"trialSpec,omitempty"` + + // Name of config map where Trial template is located + ConfigMapName string `json:"configMapName,omitempty"` + + // Namespace of config map where Trial template is located + ConfigMapNamespace string `json:"configMapNamespace,omitempty"` + + // Path in config map where Trial template is located + TemplatePath string `json:"templatePath,omitempty"` +} + +type TrialParameterSpec struct { + // Name of the parameter that must be replaced in Trial template + Name string `json:"name,omitempty"` + + // Description of the parameter + Description string `json:"description,omitempty"` + + // Reference to the parameter in search space + Reference string `json:"reference,omitempty"` } type TemplateSpec struct { From 59ea302d3e9ebd05d917826bdbb46cc7cbf61397 Mon Sep 17 00:00:00 2001 From: avelichk Date: Thu, 4 Jun 2020 19:29:11 +0100 Subject: [PATCH 02/10] Add TrialSource in TrialTemplate Remove GoTemplate from Experiment API --- .../experiments/v1beta1/experiment_types.go | 29 ++++++++++--------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/pkg/apis/controller/experiments/v1beta1/experiment_types.go b/pkg/apis/controller/experiments/v1beta1/experiment_types.go index 318be04b23e..56c03b647b5 100644 --- a/pkg/apis/controller/experiments/v1beta1/experiment_types.go +++ b/pkg/apis/controller/experiments/v1beta1/experiment_types.go @@ -187,18 +187,31 @@ type FeasibleSpace struct { Step string `json:"step,omitempty"` } +// TrialTemplate describes structure of Trial template type TrialTemplate struct { // Retain indicates that Trial resources must be not cleanup Retain bool `json:"retain,omitempty"` - GoTemplate *GoTemplate `json:"goTemplate,omitempty"` + // Source for Trial template (unstructured structure or config map) + TrialSource `json:",inline"` // List of parameres that are used in Trial template TrialParameters []TrialParameterSpec `json:"trialParameters,omitempty"` +} + +// TrialSource represent the source for Trial template +// Only one source can be specified +type TrialSource struct { - // Trial spec contains Trial template in unstructured format + // TrialSpec represents Trial template in unstructured format TrialSpec *unstructured.Unstructured `json:"trialSpec,omitempty"` + // ConfigMap spec represents a reference to ConfigMap + ConfigMap *ConfigMapSource `json:"configMap,omitempty"` +} + +// ConfigMapSource references the config map where Trial template is located +type ConfigMapSource struct { // Name of config map where Trial template is located ConfigMapName string `json:"configMapName,omitempty"` @@ -209,6 +222,7 @@ type TrialTemplate struct { TemplatePath string `json:"templatePath,omitempty"` } +// TrialParameterSpec describes parameters that must be replaced in Trial template type TrialParameterSpec struct { // Name of the parameter that must be replaced in Trial template Name string `json:"name,omitempty"` @@ -220,17 +234,6 @@ type TrialParameterSpec struct { Reference string `json:"reference,omitempty"` } -type TemplateSpec struct { - ConfigMapName string `json:"configMapName,omitempty"` - ConfigMapNamespace string `json:"configMapNamespace,omitempty"` - TemplatePath string `json:"templatePath,omitempty"` -} - -type GoTemplate struct { - TemplateSpec *TemplateSpec `json:"templateSpec,omitempty"` - RawTemplate string `json:"rawTemplate,omitempty"` -} - // +genclient // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object From 99ef718b8bf57a511100e001ec52bee8a164f674 Mon Sep 17 00:00:00 2001 From: avelichk Date: Fri, 5 Jun 2020 22:32:54 +0100 Subject: [PATCH 03/10] Add init logic for new trial template --- examples/v1beta1/random-example.yaml | 52 ++-- .../v1beta1/experiment_defaults.go | 16 +- .../v1beta1/zz_generated.deepcopy.go | 79 ++++-- .../controller/trials/v1beta1/trial_types.go | 3 +- .../trials/v1beta1/zz_generated.deepcopy.go | 4 + .../experiment/experiment_util.go | 14 +- .../experiment/manifest/generator.go | 122 ++++---- .../experiment/manifest/generator_test.go | 263 ++++++++++-------- .../trial/trial_controller.go | 14 +- .../v1beta1/experiment/manifest/generator.go | 26 +- .../v1beta1/experiment/validator/validator.go | 51 ++-- 11 files changed, 340 insertions(+), 304 deletions(-) diff --git a/examples/v1beta1/random-example.yaml b/examples/v1beta1/random-example.yaml index 0ecf61cd1df..0a763d5a34c 100644 --- a/examples/v1beta1/random-example.yaml +++ b/examples/v1beta1/random-example.yaml @@ -18,17 +18,17 @@ spec: maxTrialCount: 12 maxFailedTrialCount: 3 parameters: - - name: --lr + - name: lr parameterType: double feasibleSpace: min: "0.01" max: "0.03" - - name: --num-layers + - name: num-layers parameterType: int feasibleSpace: min: "2" max: "5" - - name: --optimizer + - name: optimizer parameterType: categorical feasibleSpace: list: @@ -36,26 +36,30 @@ spec: - adam - ftrl trialTemplate: - goTemplate: - rawTemplate: |- - apiVersion: batch/v1 - kind: Job - metadata: - name: {{.Trial}} - namespace: {{.NameSpace}} - spec: - template: - spec: - containers: - - name: {{.Trial}} + trialParameters: + - name: learningRate + description: Learning rate for the training model + reference: lr + - name: numberLayers + description: Number of training model layers + reference: num-layers + - name: optimizer + description: Training model optimizer (sdg, adam or ftrl) + reference: optimizer + trialSpec: + apiVersion: batch/v1 + kind: Job + spec: + template: + spec: + containers: + - name: training-container image: docker.io/kubeflowkatib/mxnet-mnist command: - - "python3" - - "/opt/mxnet-mnist/mnist.py" - - "--batch-size=64" - {{- with .HyperParameters}} - {{- range .}} - - "{{.Name}}={{.Value}}" - {{- end}} - {{- end}} - restartPolicy: Never + - "python3" + - "/opt/mxnet-mnist/mnist.py" + - "--batch-size=64" + - "--lr=${trialParameters.learningRate}" + - "--num-layers=${trialParameters.numberLayers}" + - "--optimizer=${trialParameters.optimizer}" + restartPolicy: Never diff --git a/pkg/apis/controller/experiments/v1beta1/experiment_defaults.go b/pkg/apis/controller/experiments/v1beta1/experiment_defaults.go index f2c9d43935b..7d47e41e8b1 100644 --- a/pkg/apis/controller/experiments/v1beta1/experiment_defaults.go +++ b/pkg/apis/controller/experiments/v1beta1/experiment_defaults.go @@ -47,6 +47,7 @@ func (e *Experiment) setDefaultResumePolicy() { } } +// TODO: Verify it func (e *Experiment) setDefaultTrialTemplate() { t := e.Spec.TrialTemplate if t == nil { @@ -54,14 +55,13 @@ func (e *Experiment) setDefaultTrialTemplate() { Retain: true, } } - if t.GoTemplate == nil { - t.GoTemplate = &GoTemplate{} - } - if t.GoTemplate.RawTemplate == "" && t.GoTemplate.TemplateSpec == nil { - t.GoTemplate.TemplateSpec = &TemplateSpec{ - ConfigMapNamespace: consts.DefaultKatibNamespace, - ConfigMapName: DefaultTrialConfigMapName, - TemplatePath: DefaultTrialTemplatePath, + if t.TrialSource.TrialSpec == nil && t.TrialSource.ConfigMap == nil { + t.TrialSource = TrialSource{ + ConfigMap: &ConfigMapSource{ + ConfigMapNamespace: consts.DefaultKatibNamespace, + ConfigMapName: DefaultTrialConfigMapName, + TemplatePath: DefaultTrialTemplatePath, + }, } } e.Spec.TrialTemplate = t diff --git a/pkg/apis/controller/experiments/v1beta1/zz_generated.deepcopy.go b/pkg/apis/controller/experiments/v1beta1/zz_generated.deepcopy.go index 28377d4449e..b66f5de2617 100644 --- a/pkg/apis/controller/experiments/v1beta1/zz_generated.deepcopy.go +++ b/pkg/apis/controller/experiments/v1beta1/zz_generated.deepcopy.go @@ -24,6 +24,22 @@ import ( runtime "k8s.io/apimachinery/pkg/runtime" ) +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ConfigMapSource) DeepCopyInto(out *ConfigMapSource) { + *out = *in + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ConfigMapSource. +func (in *ConfigMapSource) DeepCopy() *ConfigMapSource { + if in == nil { + return nil + } + out := new(ConfigMapSource) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Experiment) DeepCopyInto(out *Experiment) { *out = *in @@ -248,27 +264,6 @@ func (in *FeasibleSpace) DeepCopy() *FeasibleSpace { return out } -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *GoTemplate) DeepCopyInto(out *GoTemplate) { - *out = *in - if in.TemplateSpec != nil { - in, out := &in.TemplateSpec, &out.TemplateSpec - *out = new(TemplateSpec) - **out = **in - } - return -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GoTemplate. -func (in *GoTemplate) DeepCopy() *GoTemplate { - if in == nil { - return nil - } - out := new(GoTemplate) - in.DeepCopyInto(out) - return out -} - // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *GraphConfig) DeepCopyInto(out *GraphConfig) { *out = *in @@ -387,17 +382,42 @@ func (in *ParameterSpec) DeepCopy() *ParameterSpec { } // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. -func (in *TemplateSpec) DeepCopyInto(out *TemplateSpec) { +func (in *TrialParameterSpec) DeepCopyInto(out *TrialParameterSpec) { + *out = *in + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrialParameterSpec. +func (in *TrialParameterSpec) DeepCopy() *TrialParameterSpec { + if in == nil { + return nil + } + out := new(TrialParameterSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TrialSource) DeepCopyInto(out *TrialSource) { *out = *in + if in.TrialSpec != nil { + in, out := &in.TrialSpec, &out.TrialSpec + *out = (*in).DeepCopy() + } + if in.ConfigMap != nil { + in, out := &in.ConfigMap, &out.ConfigMap + *out = new(ConfigMapSource) + **out = **in + } return } -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TemplateSpec. -func (in *TemplateSpec) DeepCopy() *TemplateSpec { +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrialSource. +func (in *TrialSource) DeepCopy() *TrialSource { if in == nil { return nil } - out := new(TemplateSpec) + out := new(TrialSource) in.DeepCopyInto(out) return out } @@ -405,10 +425,11 @@ func (in *TemplateSpec) DeepCopy() *TemplateSpec { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *TrialTemplate) DeepCopyInto(out *TrialTemplate) { *out = *in - if in.GoTemplate != nil { - in, out := &in.GoTemplate, &out.GoTemplate - *out = new(GoTemplate) - (*in).DeepCopyInto(*out) + in.TrialSource.DeepCopyInto(&out.TrialSource) + if in.TrialParameters != nil { + in, out := &in.TrialParameters, &out.TrialParameters + *out = make([]TrialParameterSpec, len(*in)) + copy(*out, *in) } return } diff --git a/pkg/apis/controller/trials/v1beta1/trial_types.go b/pkg/apis/controller/trials/v1beta1/trial_types.go index 3579f0af88d..4da68aab28f 100644 --- a/pkg/apis/controller/trials/v1beta1/trial_types.go +++ b/pkg/apis/controller/trials/v1beta1/trial_types.go @@ -19,6 +19,7 @@ import ( common "github.com/kubeflow/katib/pkg/apis/controller/common/v1beta1" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" ) type TrialSpec struct { @@ -32,7 +33,7 @@ type TrialSpec struct { // runtime object. The trial operator should create the resource as written, // and let the corresponding resource controller (e.g. tf-operator) handle // the rest. - RunSpec string `json:"runSpec,omitempty"` + RunSpec *unstructured.Unstructured `json:"runSpec,omitempty"` // Whether to retain the trial run object after completed. RetainRun bool `json:"retainRun,omitempty"` diff --git a/pkg/apis/controller/trials/v1beta1/zz_generated.deepcopy.go b/pkg/apis/controller/trials/v1beta1/zz_generated.deepcopy.go index 6474671f43d..60144a3e70c 100644 --- a/pkg/apis/controller/trials/v1beta1/zz_generated.deepcopy.go +++ b/pkg/apis/controller/trials/v1beta1/zz_generated.deepcopy.go @@ -116,6 +116,10 @@ func (in *TrialSpec) DeepCopyInto(out *TrialSpec) { *out = make([]commonv1beta1.ParameterAssignment, len(*in)) copy(*out, *in) } + if in.RunSpec != nil { + in, out := &in.RunSpec, &out.RunSpec + *out = (*in).DeepCopy() + } in.MetricsCollector.DeepCopyInto(&out.MetricsCollector) return } diff --git a/pkg/controller.v1beta1/experiment/experiment_util.go b/pkg/controller.v1beta1/experiment/experiment_util.go index 9ba59b159c6..d6e0d986716 100644 --- a/pkg/controller.v1beta1/experiment/experiment_util.go +++ b/pkg/controller.v1beta1/experiment/experiment_util.go @@ -1,15 +1,11 @@ package experiment import ( - "bytes" "context" - "fmt" "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/types" - k8syaml "k8s.io/apimachinery/pkg/util/yaml" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/reconcile" @@ -25,7 +21,6 @@ const ( ) func (r *ReconcileExperiment) createTrialInstance(expInstance *experimentsv1beta1.Experiment, trialAssignment *suggestionsv1beta1.TrialAssignment) error { - BUFSIZE := 1024 logger := log.WithValues("Experiment", types.NamespacedName{Name: expInstance.GetName(), Namespace: expInstance.GetNamespace()}) trial := &trialsv1beta1.Trial{} @@ -42,7 +37,8 @@ func (r *ReconcileExperiment) createTrialInstance(expInstance *experimentsv1beta hps := trialAssignment.ParameterAssignments trial.Spec.ParameterAssignments = trialAssignment.ParameterAssignments - runSpec, err := r.GetRunSpecWithHyperParameters(expInstance, expInstance.GetName(), trial.Name, trial.Namespace, hps) + + runSpec, err := r.GetRunSpecWithHyperParameters(expInstance, trial.Name, trial.Namespace, hps) if err != nil { logger.Error(err, "Fail to get RunSpec from experiment", expInstance.Name) return err @@ -53,12 +49,6 @@ func (r *ReconcileExperiment) createTrialInstance(expInstance *experimentsv1beta trial.Spec.RetainRun = expInstance.Spec.TrialTemplate.Retain } - buf := bytes.NewBufferString(runSpec) - job := &unstructured.Unstructured{} - if err := k8syaml.NewYAMLOrJSONDecoder(buf, BUFSIZE).Decode(job); err != nil { - return fmt.Errorf("Invalid spec.trialTemplate: %v.", err) - } - if expInstance.Spec.MetricsCollectorSpec != nil { trial.Spec.MetricsCollector = *expInstance.Spec.MetricsCollectorSpec } diff --git a/pkg/controller.v1beta1/experiment/manifest/generator.go b/pkg/controller.v1beta1/experiment/manifest/generator.go index 93827f379b0..2114f687535 100644 --- a/pkg/controller.v1beta1/experiment/manifest/generator.go +++ b/pkg/controller.v1beta1/experiment/manifest/generator.go @@ -3,9 +3,11 @@ package manifest import ( "bytes" "errors" - "text/template" + "fmt" + "strings" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "sigs.k8s.io/controller-runtime/pkg/client" commonapiv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/common/v1beta1" @@ -13,6 +15,7 @@ import ( "github.com/kubeflow/katib/pkg/controller.v1beta1/consts" "github.com/kubeflow/katib/pkg/util/v1beta1/katibclient" "github.com/kubeflow/katib/pkg/util/v1beta1/katibconfig" + k8syaml "k8s.io/apimachinery/pkg/util/yaml" ) const ( @@ -22,8 +25,8 @@ const ( // Generator is the type for manifests Generator. type Generator interface { InjectClient(c client.Client) - GetRunSpec(e *experimentsv1beta1.Experiment, experiment, trial, namespace string) (string, error) - GetRunSpecWithHyperParameters(e *experimentsv1beta1.Experiment, experiment, trial, namespace string, hps []commonapiv1beta1.ParameterAssignment) (string, error) + // GetRunSpec(e *experimentsv1beta1.Experiment, experiment, trial, namespace string) (string, error) + GetRunSpecWithHyperParameters(experiment *experimentsv1beta1.Experiment, trialName, trialNamespace string, assignments []commonapiv1beta1.ParameterAssignment) (*unstructured.Unstructured, error) GetSuggestionConfigData(algorithmName string) (map[string]string, error) GetMetricsCollectorImage(cKind commonapiv1beta1.CollectorKind) (string, error) } @@ -57,77 +60,82 @@ func (g *DefaultGenerator) GetSuggestionConfigData(algorithmName string) (map[st return katibconfig.GetSuggestionConfigData(algorithmName, g.client.GetClient()) } -// GetRunSpec get the specification for trial. -func (g *DefaultGenerator) GetRunSpec(e *experimentsv1beta1.Experiment, experiment, trial, namespace string) (string, error) { - params := trialTemplateParams{ - Experiment: experiment, - Trial: trial, - NameSpace: namespace, - } - return g.getRunSpec(e, params) -} - // GetRunSpecWithHyperParameters get the specification for trial with hyperparameters. -func (g *DefaultGenerator) GetRunSpecWithHyperParameters(e *experimentsv1beta1.Experiment, experiment, trial, namespace string, hps []commonapiv1beta1.ParameterAssignment) (string, error) { - params := trialTemplateParams{ - Experiment: experiment, - Trial: trial, - NameSpace: namespace, - HyperParameters: hps, +func (g *DefaultGenerator) GetRunSpecWithHyperParameters(experiment *experimentsv1beta1.Experiment, trialName, trialNamespace string, assignments []commonapiv1beta1.ParameterAssignment) (*unstructured.Unstructured, error) { + + // Get string Trial template from Experiment spec + trialTemplate, err := g.getTrialTemplate(experiment) + if err != nil { + return nil, err } - return g.getRunSpec(e, params) -} -func (g *DefaultGenerator) getRunSpec(e *experimentsv1beta1.Experiment, params trialTemplateParams) (string, error) { - var buf bytes.Buffer - trialTemplate, err := g.getTrialTemplate(e) + // Apply parameters to Trial Template from assignment + replacedTemplate, err := g.applyParameters(trialTemplate, experiment.Spec.TrialTemplate.TrialParameters, assignments) if err != nil { - return "", err + return nil, err } - if err := trialTemplate.Execute(&buf, params); err != nil { - return "", err + // Convert Trial template to unstructured + replacedTemplateBytes := bytes.NewBufferString(replacedTemplate) + runSpec := &unstructured.Unstructured{} + err = k8syaml.NewYAMLOrJSONDecoder(replacedTemplateBytes, 1024).Decode(runSpec) + if err != nil { + return nil, err } - return buf.String(), nil + + // Set name and namespace for Run Spec + runSpec.SetName(trialName) + runSpec.SetNamespace(trialNamespace) + + return runSpec, nil } -func (g *DefaultGenerator) getTrialTemplate(instance *experimentsv1beta1.Experiment) (*template.Template, error) { - var err error - var tpl *template.Template = nil +func (g *DefaultGenerator) applyParameters(trialTemplate string, trialParams []experimentsv1beta1.TrialParameterSpec, assignments []commonapiv1beta1.ParameterAssignment) (string, error) { + // Number of parameters must be equal + if len(assignments) != len(trialParams) { + return "", fmt.Errorf("Number of Trial assignment from Suggestion: %v not equal to number Trial parameters from Experiment: %v", len(assignments), len(trialParams)) + } + // Convert parameter assignment to map key = parameter name, value = parameter value + assignmentsMap := make(map[string]string) + for _, assignment := range assignments { + assignmentsMap[assignment.Name] = assignment.Value + } + + // Replacing parameters from Trial parameters + for _, parameter := range trialParams { + + if parameterValue, ok := assignmentsMap[parameter.Reference]; ok { + trialTemplate = strings.Replace(trialTemplate, fmt.Sprintf("${trialParameters.%v}", parameter.Name), parameterValue, -1) + } else { + return "", fmt.Errorf("Unable to find parameter: %v in parameter assignment %v", parameter.Reference, assignmentsMap) + } + } + return trialTemplate, nil +} - trialTemplate := instance.Spec.TrialTemplate - if trialTemplate.GoTemplate.RawTemplate != "" { - tpl, err = template.New("Trial").Parse(trialTemplate.GoTemplate.RawTemplate) +func (g *DefaultGenerator) getTrialTemplate(instance *experimentsv1beta1.Experiment) (string, error) { + var trialTemplateString string + trialSource := instance.Spec.TrialTemplate.TrialSource + if trialSource.TrialSpec != nil { + trialTemplateByte, err := trialSource.TrialSpec.MarshalJSON() if err != nil { - return nil, err + return "", err } + trialTemplateString = string(trialTemplateByte) } else { - templateSpec := trialTemplate.GoTemplate.TemplateSpec - configMapNS := templateSpec.ConfigMapNamespace - configMapName := templateSpec.ConfigMapName - templatePath := templateSpec.TemplatePath - + configMapNS := trialSource.ConfigMap.ConfigMapNamespace + configMapName := trialSource.ConfigMap.ConfigMapName + templatePath := trialSource.ConfigMap.TemplatePath configMap, err := g.client.GetConfigMap(configMapName, configMapNS) if err != nil { - return nil, err + return "", err } - - if configMapTemplate, ok := configMap[templatePath]; !ok { + var ok bool + trialTemplateString, ok = configMap[templatePath] + if !ok { err = errors.New(string(metav1.StatusReasonNotFound)) - return nil, err - } else { - tpl, err = template.New("Trial").Parse(configMapTemplate) - if err != nil { - return nil, err - } + return "", err } } - return tpl, nil -} - -type trialTemplateParams struct { - Experiment string - Trial string - NameSpace string - HyperParameters []commonapiv1beta1.ParameterAssignment + return trialTemplateString, nil } diff --git a/pkg/controller.v1beta1/experiment/manifest/generator_test.go b/pkg/controller.v1beta1/experiment/manifest/generator_test.go index 229c3b0e018..9a885df574f 100644 --- a/pkg/controller.v1beta1/experiment/manifest/generator_test.go +++ b/pkg/controller.v1beta1/experiment/manifest/generator_test.go @@ -1,41 +1,25 @@ package manifest import ( + "bytes" "reflect" "testing" - "text/template" "github.com/golang/mock/gomock" - commonapiv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/common/v1beta1" experimentsv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/experiments/v1beta1" katibclientmock "github.com/kubeflow/katib/pkg/mock/v1beta1/util/katibclient" + batchv1 "k8s.io/api/batch/v1" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + k8syaml "k8s.io/apimachinery/pkg/util/yaml" ) -const ( - rawTemplate = `apiVersion: batch/v1 -kind: Job -metadata: -name: {{.Trial}} -namespace: {{.NameSpace}} -spec: - template: - spec: - containers: - - name: {{.Trial}} - image: katib/mxnet-mnist-example - command: - - "python" - - "/mxnet/example/image-classification/train_mnist.py" - - "--batch-size=64" - {{- with .HyperParameters}} - {{- range .}} - - "{{.Name}}={{.Value}}" - {{- end}} - {{- end}}` -) +func TestGetRunSpecWithHP(t *testing.T) { + tc := newFakeInstance() -func TestGetTrialTemplateConfigMap(t *testing.T) { mockCtrl := gomock.NewController(t) defer mockCtrl.Finish() @@ -45,61 +29,64 @@ func TestGetTrialTemplateConfigMap(t *testing.T) { client: c, } - templatePath := "test.yaml" - - c.EXPECT().GetConfigMap(gomock.Any(), gomock.Any()).Return(map[string]string{ - templatePath: rawTemplate, - }, nil) + // TODO: Add more test cases + actual, err := p.GetRunSpecWithHyperParameters(tc, "trial-name", "trial-namespace", []commonapiv1beta1.ParameterAssignment{ + { + Name: "lr", + Value: "0.05", + }, + { + Name: "num-layers", + Value: "5", + }, + }) - instance := newFakeInstance() - instance.Spec.TrialTemplate.GoTemplate.TemplateSpec = &experimentsv1beta1.TemplateSpec{ - TemplatePath: templatePath, - } - instance.Spec.TrialTemplate.GoTemplate.RawTemplate = "" - actual, err := p.getTrialTemplate(instance) - if err != nil { - t.Errorf("Expected nil, got %v", err) - } - expected, err := template.New("Trial").Parse(rawTemplate) if err != nil { t.Errorf("Expected nil, got %v", err) } - if !reflect.DeepEqual(expected, actual) { - t.Errorf("Expected %v, got %v", *expected, *actual) - } -} - -func TestGetTrialTemplate(t *testing.T) { - mockCtrl := gomock.NewController(t) - defer mockCtrl.Finish() - c := katibclientmock.NewMockClient(mockCtrl) - - p := &DefaultGenerator{ - client: c, + expectedJob := batchv1.Job{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "batch/v1", + Kind: "Job", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "trial-name", + Namespace: "trial-namespace", + }, + Spec: batchv1.JobSpec{ + Template: v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + v1.Container{ + Name: "training-container", + Image: "docker.io/kubeflowkatib/mxnet-mnist", + Command: []string{ + "python3", + "/opt/mxnet-mnist/mnist.py", + "--lr=0.05", + "--num-layers=5", + }, + }, + }, + }, + }, + }, } - tc := newFakeInstance() + expected := &unstructured.Unstructured{} - expected, err := template.New("Trial"). - Parse(tc.Spec.TrialTemplate.GoTemplate.RawTemplate) + expected.Object, err = runtime.DefaultUnstructuredConverter.ToUnstructured(&expectedJob) if err != nil { - t.Errorf("Failed to compose expected result") + t.Errorf("ToUnstructured failed: %v", err) } - actual, err := p.getTrialTemplate(tc) - if err != nil { - t.Errorf("Expected nil, got %v", err) - } - - if !reflect.DeepEqual(expected, actual) { - t.Errorf("Expected %v, got %v", *expected, *actual) + if !reflect.DeepEqual(expected.Object, actual.Object) { + t.Errorf("Expected %v\n got %v", expected.Object, actual.Object) } } -func TestGetRunSpec(t *testing.T) { - tc := newFakeInstance() - +func TestGetRunSpecWithHPConfigMap(t *testing.T) { mockCtrl := gomock.NewController(t) defer mockCtrl.Finish() @@ -109,78 +96,120 @@ func TestGetRunSpec(t *testing.T) { client: c, } - actual, err := p.GetRunSpec(tc, "", "test", "testns") - if err != nil { - t.Errorf("Expected nil, got %v", err) - } - expected := `apiVersion: batch/v1 + templatePath := "trial-template.yaml" + + trialSpec := `apiVersion: batch/v1 kind: Job -metadata: -name: test -namespace: testns spec: - template: - spec: - containers: - - name: test - image: katib/mxnet-mnist-example - command: - - "python" - - "/mxnet/example/image-classification/train_mnist.py" - - "--batch-size=64"` - if expected != actual { - t.Errorf("Expected %s, got %s", expected, actual) - } -} - -func TestGetRunSpecWithHP(t *testing.T) { - tc := newFakeInstance() - - mockCtrl := gomock.NewController(t) - defer mockCtrl.Finish() + template: + spec: + containers: + - name: training-container + image: docker.io/kubeflowkatib/mxnet-mnist + command: + - "python3" + - "/opt/mxnet-mnist/mnist.py" + - "--lr=${trialParameters.learningRate}" + - "--num-layers=${trialParameters.numberLayers}"` - c := katibclientmock.NewMockClient(mockCtrl) + c.EXPECT().GetConfigMap(gomock.Any(), gomock.Any()).Return(map[string]string{ + templatePath: trialSpec, + }, nil) - p := &DefaultGenerator{ - client: c, + instance := newFakeInstance() + instance.Spec.TrialTemplate.TrialSource.ConfigMap = &experimentsv1beta1.ConfigMapSource{ + TemplatePath: templatePath, } - - actual, err := p.GetRunSpecWithHyperParameters(tc, "", "test", "testns", []commonapiv1beta1.ParameterAssignment{ + instance.Spec.TrialTemplate.TrialSource.TrialSpec = nil + actual, err := p.GetRunSpecWithHyperParameters(instance, "trial-name", "trial-namespace", []commonapiv1beta1.ParameterAssignment{ { - Name: "testname", - Value: "testvalue", + Name: "lr", + Value: "0.05", + }, + { + Name: "num-layers", + Value: "5", }, }) if err != nil { t.Errorf("Expected nil, got %v", err) } - expected := `apiVersion: batch/v1 + // We can't compare structures, because trialSpec is a string and creationTimestamp was not added + expectedJob := `apiVersion: batch/v1 kind: Job metadata: -name: test -namespace: testns + name: trial-name + namespace: trial-namespace spec: - template: - spec: - containers: - - name: test - image: katib/mxnet-mnist-example - command: - - "python" - - "/mxnet/example/image-classification/train_mnist.py" - - "--batch-size=64" - - "testname=testvalue"` - if expected != actual { - t.Errorf("Expected %s, got %s", expected, actual) + template: + spec: + containers: + - name: training-container + image: docker.io/kubeflowkatib/mxnet-mnist + command: + - "python3" + - "/opt/mxnet-mnist/mnist.py" + - "--lr=0.05" + - "--num-layers=5"` + + expectedBytes := bytes.NewBufferString(expectedJob) + expected := &unstructured.Unstructured{} + err = k8syaml.NewYAMLOrJSONDecoder(expectedBytes, 1024).Decode(expected) + if err != nil { + t.Errorf("NewYAMLOrJSONDecoder failed: %v", err) + } + if !reflect.DeepEqual(expected, actual) { + t.Errorf("Expected %s\n got %s", expected.Object, actual.Object) } } func newFakeInstance() *experimentsv1beta1.Experiment { + + trialTemplateJob := &batchv1.Job{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "batch/v1", + Kind: "Job", + }, + Spec: batchv1.JobSpec{ + Template: v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + v1.Container{ + Name: "training-container", + Image: "docker.io/kubeflowkatib/mxnet-mnist", + Command: []string{ + "python3", + "/opt/mxnet-mnist/mnist.py", + "--lr=${trialParameters.learningRate}", + "--num-layers=${trialParameters.numberLayers}", + }, + }, + }, + }, + }, + }, + } + + trialSpec := &unstructured.Unstructured{} + trialSpec.Object, _ = runtime.DefaultUnstructuredConverter.ToUnstructured(trialTemplateJob) + return &experimentsv1beta1.Experiment{ Spec: experimentsv1beta1.ExperimentSpec{ TrialTemplate: &experimentsv1beta1.TrialTemplate{ - GoTemplate: &experimentsv1beta1.GoTemplate{ - RawTemplate: rawTemplate, + TrialSource: experimentsv1beta1.TrialSource{ + TrialSpec: trialSpec, + }, + TrialParameters: []experimentsv1beta1.TrialParameterSpec{ + experimentsv1beta1.TrialParameterSpec{ + Name: "learningRate", + Description: "Learning Rate", + Reference: "lr", + }, + experimentsv1beta1.TrialParameterSpec{ + Name: "numberLayers", + Description: "Number of layers", + Reference: "num-layers", + }, }, }, }, diff --git a/pkg/controller.v1beta1/trial/trial_controller.go b/pkg/controller.v1beta1/trial/trial_controller.go index 95027b05cb1..18d39f66de3 100644 --- a/pkg/controller.v1beta1/trial/trial_controller.go +++ b/pkg/controller.v1beta1/trial/trial_controller.go @@ -17,7 +17,6 @@ limitations under the License. package trial import ( - "bytes" "context" "fmt" @@ -31,7 +30,6 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" - k8syaml "k8s.io/apimachinery/pkg/util/yaml" "k8s.io/client-go/tools/record" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller" @@ -228,6 +226,7 @@ func (r *ReconcileTrial) reconcileTrial(instance *trialsv1beta1.Trial) error { logger.Error(err, "Failed to create the provider") return err } + // Currently jobCondition - part of commonv1 TF package for all jobs jobCondition, err := jobProvider.GetDeployedJobStatus(deployedJob) if err != nil { logger.Error(err, "Get deployed status error") @@ -259,7 +258,7 @@ func (r *ReconcileTrial) reconcileJob(instance *trialsv1beta1.Trial, desiredJob kind := desiredJob.GetKind() gvk := schema.FromAPIVersionAndKind(apiVersion, kind) - // Add annotation to desired Job + // Add annotation to desired Job to disable istio sidecar err = util.TrainingJobAnnotations(desiredJob) if err != nil { logger.Error(err, "TrainingJobAnnotations error") @@ -314,15 +313,10 @@ func (r *ReconcileTrial) reconcileJob(instance *trialsv1beta1.Trial, desiredJob func (r *ReconcileTrial) getDesiredJobSpec(instance *trialsv1beta1.Trial) (*unstructured.Unstructured, error) { - bufSize := 1024 logger := log.WithValues("Trial", types.NamespacedName{Name: instance.GetName(), Namespace: instance.GetNamespace()}) - buf := bytes.NewBufferString(instance.Spec.RunSpec) - desiredJobSpec := &unstructured.Unstructured{} - if err := k8syaml.NewYAMLOrJSONDecoder(buf, bufSize).Decode(desiredJobSpec); err != nil { - logger.Error(err, "Yaml decode error") - return nil, err - } + desiredJobSpec := instance.Spec.RunSpec + if err := controllerutil.SetControllerReference(instance, desiredJobSpec, r.scheme); err != nil { logger.Error(err, "Set controller reference error") return nil, err diff --git a/pkg/mock/v1beta1/experiment/manifest/generator.go b/pkg/mock/v1beta1/experiment/manifest/generator.go index 62878b5a3a4..c92167a35e1 100644 --- a/pkg/mock/v1beta1/experiment/manifest/generator.go +++ b/pkg/mock/v1beta1/experiment/manifest/generator.go @@ -8,6 +8,7 @@ import ( gomock "github.com/golang/mock/gomock" v1beta1 "github.com/kubeflow/katib/pkg/apis/controller/common/v1beta1" v1beta10 "github.com/kubeflow/katib/pkg/apis/controller/experiments/v1beta1" + unstructured "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" reflect "reflect" client "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -50,34 +51,19 @@ func (mr *MockGeneratorMockRecorder) GetMetricsCollectorImage(arg0 interface{}) return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetMetricsCollectorImage", reflect.TypeOf((*MockGenerator)(nil).GetMetricsCollectorImage), arg0) } -// GetRunSpec mocks base method -func (m *MockGenerator) GetRunSpec(arg0 *v1beta10.Experiment, arg1, arg2, arg3 string) (string, error) { - m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "GetRunSpec", arg0, arg1, arg2, arg3) - ret0, _ := ret[0].(string) - ret1, _ := ret[1].(error) - return ret0, ret1 -} - -// GetRunSpec indicates an expected call of GetRunSpec -func (mr *MockGeneratorMockRecorder) GetRunSpec(arg0, arg1, arg2, arg3 interface{}) *gomock.Call { - mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetRunSpec", reflect.TypeOf((*MockGenerator)(nil).GetRunSpec), arg0, arg1, arg2, arg3) -} - // GetRunSpecWithHyperParameters mocks base method -func (m *MockGenerator) GetRunSpecWithHyperParameters(arg0 *v1beta10.Experiment, arg1, arg2, arg3 string, arg4 []v1beta1.ParameterAssignment) (string, error) { +func (m *MockGenerator) GetRunSpecWithHyperParameters(arg0 *v1beta10.Experiment, arg1, arg2 string, arg3 []v1beta1.ParameterAssignment) (*unstructured.Unstructured, error) { m.ctrl.T.Helper() - ret := m.ctrl.Call(m, "GetRunSpecWithHyperParameters", arg0, arg1, arg2, arg3, arg4) - ret0, _ := ret[0].(string) + ret := m.ctrl.Call(m, "GetRunSpecWithHyperParameters", arg0, arg1, arg2, arg3) + ret0, _ := ret[0].(*unstructured.Unstructured) ret1, _ := ret[1].(error) return ret0, ret1 } // GetRunSpecWithHyperParameters indicates an expected call of GetRunSpecWithHyperParameters -func (mr *MockGeneratorMockRecorder) GetRunSpecWithHyperParameters(arg0, arg1, arg2, arg3, arg4 interface{}) *gomock.Call { +func (mr *MockGeneratorMockRecorder) GetRunSpecWithHyperParameters(arg0, arg1, arg2, arg3 interface{}) *gomock.Call { mr.mock.ctrl.T.Helper() - return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetRunSpecWithHyperParameters", reflect.TypeOf((*MockGenerator)(nil).GetRunSpecWithHyperParameters), arg0, arg1, arg2, arg3, arg4) + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetRunSpecWithHyperParameters", reflect.TypeOf((*MockGenerator)(nil).GetRunSpecWithHyperParameters), arg0, arg1, arg2, arg3) } // GetSuggestionConfigData mocks base method diff --git a/pkg/webhook/v1beta1/experiment/validator/validator.go b/pkg/webhook/v1beta1/experiment/validator/validator.go index f154d96cc9a..ee6015712e9 100644 --- a/pkg/webhook/v1beta1/experiment/validator/validator.go +++ b/pkg/webhook/v1beta1/experiment/validator/validator.go @@ -1,7 +1,6 @@ package validator import ( - "bytes" "fmt" "path/filepath" "regexp" @@ -10,7 +9,6 @@ import ( "k8s.io/apimachinery/pkg/api/equality" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - k8syaml "k8s.io/apimachinery/pkg/util/yaml" "sigs.k8s.io/controller-runtime/pkg/client" logf "sigs.k8s.io/controller-runtime/pkg/runtime/log" @@ -130,31 +128,32 @@ func (g *DefaultValidator) validateResumePolicy(resume experimentsv1beta1.Resume return nil } +//TODO: Add Validation for new Trial Template func (g *DefaultValidator) validateTrialTemplate(instance *experimentsv1beta1.Experiment) error { - trialName := fmt.Sprintf("%s-trial", instance.GetName()) - runSpec, err := g.GetRunSpec(instance, instance.GetName(), trialName, instance.GetNamespace()) - if err != nil { - return fmt.Errorf("Invalid spec.trialTemplate: %v.", err) - } - - bufSize := 1024 - buf := bytes.NewBufferString(runSpec) - - job := &unstructured.Unstructured{} - if err := k8syaml.NewYAMLOrJSONDecoder(buf, bufSize).Decode(job); err != nil { - return fmt.Errorf("Invalid spec.trialTemplate: %v.", err) - } - - if err := g.validateSupportedJob(job); err != nil { - return fmt.Errorf("Invalid spec.trialTemplate: %v.", err) - } - - if job.GetNamespace() != instance.GetNamespace() { - return fmt.Errorf("Invalid spec.trialTemplate: metadata.namespace should be %s or {{.NameSpace}}", instance.GetNamespace()) - } - if job.GetName() != trialName { - return fmt.Errorf("Invalid spec.trialTemplate: metadata.name should be {{.Trial}}") - } + // trialName := fmt.Sprintf("%s-trial", instance.GetName()) + // runSpec, err := g.GetRunSpec(instance, instance.GetName(), trialName, instance.GetNamespace()) + // if err != nil { + // return fmt.Errorf("Invalid spec.trialTemplate: %v.", err) + // } + + // bufSize := 1024 + // buf := bytes.NewBufferString(runSpec) + + // job := &unstructured.Unstructured{} + // if err := k8syaml.NewYAMLOrJSONDecoder(buf, bufSize).Decode(job); err != nil { + // return fmt.Errorf("Invalid spec.trialTemplate: %v.", err) + // } + + // if err := g.validateSupportedJob(job); err != nil { + // return fmt.Errorf("Invalid spec.trialTemplate: %v.", err) + // } + + // if job.GetNamespace() != instance.GetNamespace() { + // return fmt.Errorf("Invalid spec.trialTemplate: metadata.namespace should be %s or {{.NameSpace}}", instance.GetNamespace()) + // } + // if job.GetName() != trialName { + // return fmt.Errorf("Invalid spec.trialTemplate: metadata.name should be {{.Trial}}") + // } return nil } From 05c1ef6d34b29d0f7e89e38dc0349816a1cf704a Mon Sep 17 00:00:00 2001 From: avelichk Date: Sun, 7 Jun 2020 22:12:50 +0100 Subject: [PATCH 04/10] Change examples Implement new Trial Template to controller --- .../v1beta1/bayesianoptimization-example.yaml | 52 +- examples/v1beta1/cmaes-example.yaml | 52 +- .../custom-metricscollector-example.yaml | 46 +- .../file-metricscollector-example.yaml | 48 +- examples/v1beta1/grid-example.yaml | 54 +- examples/v1beta1/hyperband-example.yaml | 58 +- examples/v1beta1/nas/darts-example-gpu.yaml | 44 +- examples/v1beta1/nas/enas-example-cpu.yaml | 50 +- examples/v1beta1/nas/enas-example-gpu.yaml | 54 +- examples/v1beta1/never-resume-example.yaml | 52 +- examples/v1beta1/pytorchjob-example.yaml | 57 +- examples/v1beta1/tfjob-example.yaml | 36 +- examples/v1beta1/tpe-example.yaml | 52 +- .../experiment/experiment_controller.go | 16 +- .../experiment/experiment_controller_test.go | 196 ++++-- .../experiment/experiment_util.go | 1 + .../experiment/manifest/generator.go | 17 +- .../experiment/manifest/generator_test.go | 21 +- .../trial/trial_controller_test.go | 116 ++-- pkg/controller.v1beta1/util/unstructured.go | 57 ++ pkg/job/v1beta1/job.go | 2 +- pkg/job/v1beta1/kubeflow.go | 2 +- .../experiment/validator/validator_test.go | 567 +++++++++--------- 23 files changed, 923 insertions(+), 727 deletions(-) create mode 100644 pkg/controller.v1beta1/util/unstructured.go diff --git a/examples/v1beta1/bayesianoptimization-example.yaml b/examples/v1beta1/bayesianoptimization-example.yaml index 102dc413362..ae9faf5bc9a 100644 --- a/examples/v1beta1/bayesianoptimization-example.yaml +++ b/examples/v1beta1/bayesianoptimization-example.yaml @@ -21,17 +21,17 @@ spec: maxTrialCount: 12 maxFailedTrialCount: 3 parameters: - - name: --lr + - name: lr parameterType: double feasibleSpace: min: "0.01" max: "0.03" - - name: --num-layers + - name: num-layers parameterType: int feasibleSpace: min: "2" max: "5" - - name: --optimizer + - name: optimizer parameterType: categorical feasibleSpace: list: @@ -39,26 +39,30 @@ spec: - adam - ftrl trialTemplate: - goTemplate: - rawTemplate: |- - apiVersion: batch/v1 - kind: Job - metadata: - name: {{.Trial}} - namespace: {{.NameSpace}} - spec: - template: - spec: - containers: - - name: {{.Trial}} + trialParameters: + - name: learningRate + description: Learning rate for the training model + reference: lr + - name: numberLayers + description: Number of training model layers + reference: num-layers + - name: optimizer + description: Training model optimizer (sdg, adam or ftrl) + reference: optimizer + trialSpec: + apiVersion: batch/v1 + kind: Job + spec: + template: + spec: + containers: + - name: training-container image: docker.io/kubeflowkatib/mxnet-mnist command: - - "python3" - - "/opt/mxnet-mnist/mnist.py" - - "--batch-size=64" - {{- with .HyperParameters}} - {{- range .}} - - "{{.Name}}={{.Value}}" - {{- end}} - {{- end}} - restartPolicy: Never + - "python3" + - "/opt/mxnet-mnist/mnist.py" + - "--batch-size=64" + - "--lr=${trialParameters.learningRate}" + - "--num-layers=${trialParameters.numberLayers}" + - "--optimizer=${trialParameters.optimizer}" + restartPolicy: Never diff --git a/examples/v1beta1/cmaes-example.yaml b/examples/v1beta1/cmaes-example.yaml index 9ec37f9ecb7..9f1ad36df5a 100644 --- a/examples/v1beta1/cmaes-example.yaml +++ b/examples/v1beta1/cmaes-example.yaml @@ -18,17 +18,17 @@ spec: maxTrialCount: 12 maxFailedTrialCount: 3 parameters: - - name: --lr + - name: lr parameterType: double feasibleSpace: min: "0.01" max: "0.03" - - name: --num-layers + - name: num-layers parameterType: int feasibleSpace: min: "2" max: "5" - - name: --optimizer + - name: optimizer parameterType: categorical feasibleSpace: list: @@ -36,26 +36,30 @@ spec: - adam - ftrl trialTemplate: - goTemplate: - rawTemplate: |- - apiVersion: batch/v1 - kind: Job - metadata: - name: {{.Trial}} - namespace: {{.NameSpace}} - spec: - template: - spec: - containers: - - name: {{.Trial}} + trialParameters: + - name: learningRate + description: Learning rate for the training model + reference: lr + - name: numberLayers + description: Number of training model layers + reference: num-layers + - name: optimizer + description: Training model optimizer (sdg, adam or ftrl) + reference: optimizer + trialSpec: + apiVersion: batch/v1 + kind: Job + spec: + template: + spec: + containers: + - name: training-container image: docker.io/kubeflowkatib/mxnet-mnist command: - - "python3" - - "/opt/mxnet-mnist/mnist.py" - - "--batch-size=64" - {{- with .HyperParameters}} - {{- range .}} - - "{{.Name}}={{.Value}}" - {{- end}} - {{- end}} - restartPolicy: Never + - "python3" + - "/opt/mxnet-mnist/mnist.py" + - "--batch-size=64" + - "--lr=${trialParameters.learningRate}" + - "--num-layers=${trialParameters.numberLayers}" + - "--optimizer=${trialParameters.optimizer}" + restartPolicy: Never diff --git a/examples/v1beta1/custom-metricscollector-example.yaml b/examples/v1beta1/custom-metricscollector-example.yaml index 59ea1a0a286..939a74fcd13 100644 --- a/examples/v1beta1/custom-metricscollector-example.yaml +++ b/examples/v1beta1/custom-metricscollector-example.yaml @@ -41,38 +41,38 @@ spec: maxTrialCount: 12 maxFailedTrialCount: 3 parameters: - - name: --lr + - name: lr parameterType: double feasibleSpace: min: "0.01" max: "0.03" - - name: --momentum + - name: momentum parameterType: double feasibleSpace: min: "0.3" max: "0.7" trialTemplate: - goTemplate: - rawTemplate: |- - apiVersion: batch/v1 - kind: Job - metadata: - name: {{.Trial}} - namespace: {{.NameSpace}} - spec: - template: - spec: - containers: - - name: {{.Trial}} + trialParameters: + - name: learningRate + description: Learning rate for the training model + reference: lr + - name: momentum + description: Momentum for the training model + reference: momentum + trialSpec: + apiVersion: batch/v1 + kind: Job + spec: + template: + spec: + containers: + - name: training-container image: docker.io/kubeflowkatib/pytorch-mnist imagePullPolicy: Always command: - - "python" - - "/var/mnist.py" - - "--epochs=1" - {{- with .HyperParameters}} - {{- range .}} - - "{{.Name}}={{.Value}}" - {{- end}} - {{- end}} - restartPolicy: Never + - "python" + - "/var/mnist.py" + - "--epochs=1" + - "--lr=${trialParameters.learningRate}" + - "--momentum=${trialParameters.momentum}" + restartPolicy: Never diff --git a/examples/v1beta1/file-metricscollector-example.yaml b/examples/v1beta1/file-metricscollector-example.yaml index 539870bda40..d10b0d5c68d 100644 --- a/examples/v1beta1/file-metricscollector-example.yaml +++ b/examples/v1beta1/file-metricscollector-example.yaml @@ -28,38 +28,38 @@ spec: maxTrialCount: 12 maxFailedTrialCount: 3 parameters: - - name: --lr + - name: lr parameterType: double feasibleSpace: min: "0.01" max: "0.03" - - name: --momentum + - name: momentum parameterType: double feasibleSpace: min: "0.3" max: "0.7" trialTemplate: - goTemplate: - rawTemplate: |- - apiVersion: batch/v1 - kind: Job - metadata: - name: {{.Trial}} - namespace: {{.NameSpace}} - spec: - template: - spec: - containers: - - name: {{.Trial}} - image: docker.io/kubeflowkatib/pytorch-mnist:1.0 + trialParameters: + - name: learningRate + description: Learning rate for the training model + reference: lr + - name: momentum + description: Momentum for the training model + reference: momentum + trialSpec: + apiVersion: batch/v1 + kind: Job + spec: + template: + spec: + containers: + - name: training-container + image: docker.io/kubeflowkatib/pytorch-mnist imagePullPolicy: Always command: - - "python" - - "/var/mnist.py" - - "--epochs=1" - {{- with .HyperParameters}} - {{- range .}} - - "{{.Name}}={{.Value}}" - {{- end}} - {{- end}} - restartPolicy: Never + - "python" + - "/var/mnist.py" + - "--epochs=1" + - "--lr=${trialParameters.learningRate}" + - "--momentum=${trialParameters.momentum}" + restartPolicy: Never diff --git a/examples/v1beta1/grid-example.yaml b/examples/v1beta1/grid-example.yaml index 6d1f48b5f48..9e847a0cf54 100644 --- a/examples/v1beta1/grid-example.yaml +++ b/examples/v1beta1/grid-example.yaml @@ -18,23 +18,23 @@ spec: maxTrialCount: 12 maxFailedTrialCount: 3 parameters: - - name: --lr + - name: lr parameterType: double feasibleSpace: min: "0.001" max: "0.01" step: "0.001" - - name: --num-layers + - name: num-layers parameterType: int feasibleSpace: min: "2" max: "5" - - name: --num-epochs + - name: num-epochs parameterType: int feasibleSpace: min: "10" max: "15" - - name: --optimizer + - name: optimizer parameterType: categorical feasibleSpace: list: @@ -42,26 +42,30 @@ spec: - adam - ftrl trialTemplate: - goTemplate: - rawTemplate: |- - apiVersion: batch/v1 - kind: Job - metadata: - name: {{.Trial}} - namespace: {{.NameSpace}} - spec: - template: - spec: - containers: - - name: {{.Trial}} + trialParameters: + - name: learningRate + description: Learning rate for the training model + reference: lr + - name: numberLayers + description: Number of training model layers + reference: num-layers + - name: optimizer + description: Training model optimizer (sdg, adam or ftrl) + reference: optimizer + trialSpec: + apiVersion: batch/v1 + kind: Job + spec: + template: + spec: + containers: + - name: training-container image: docker.io/kubeflowkatib/mxnet-mnist command: - - "python3" - - "/opt/mxnet-mnist/mnist.py" - - "--batch-size=64" - {{- with .HyperParameters}} - {{- range .}} - - "{{.Name}}={{.Value}}" - {{- end}} - {{- end}} - restartPolicy: Never + - "python3" + - "/opt/mxnet-mnist/mnist.py" + - "--batch-size=64" + - "--lr=${trialParameters.learningRate}" + - "--num-layers=${trialParameters.numberLayers}" + - "--optimizer=${trialParameters.optimizer}" + restartPolicy: Never diff --git a/examples/v1beta1/hyperband-example.yaml b/examples/v1beta1/hyperband-example.yaml index 8dea1acc07c..f5b55980c07 100644 --- a/examples/v1beta1/hyperband-example.yaml +++ b/examples/v1beta1/hyperband-example.yaml @@ -23,49 +23,57 @@ spec: value: "9" maxFailedTrialCount: 9 parameters: - - name: --lr + - name: lr parameterType: double feasibleSpace: min: "0.01" max: "0.03" - - name: --num-layers + - name: num-layers parameterType: int feasibleSpace: min: "2" max: "5" - - name: --optimizer + - name: optimizer parameterType: categorical feasibleSpace: list: - sgd - adam - ftrl - - name: --num-epochs + - name: num-epochs parameterType: int feasibleSpace: min: "20" max: "20" trialTemplate: - goTemplate: - rawTemplate: |- - apiVersion: batch/v1 - kind: Job - metadata: - name: {{.Trial}} - namespace: {{.NameSpace}} - spec: - template: - spec: - containers: - - name: {{.Trial}} + trialParameters: + - name: learningRate + description: Learning rate for the training model + reference: lr + - name: numberLayers + description: Number of training model layers + reference: num-layers + - name: optimizer + description: Training model optimizer (sdg, adam or ftrl) + reference: optimizer + - name: numberEpochs + description: Number of epochs to train the model + reference: num-epochs + trialSpec: + apiVersion: batch/v1 + kind: Job + spec: + template: + spec: + containers: + - name: training-container image: docker.io/kubeflowkatib/mxnet-mnist command: - - "python3" - - "/opt/mxnet-mnist/mnist.py" - - "--batch-size=64" - {{- with .HyperParameters}} - {{- range .}} - - "{{.Name}}={{.Value}}" - {{- end}} - {{- end}} - restartPolicy: Never + - "python3" + - "/opt/mxnet-mnist/mnist.py" + - "--batch-size=64" + - "--lr=${trialParameters.learningRate}" + - "--num-layers=${trialParameters.numberLayers}" + - "--optimizer=${trialParameters.optimizer}" + - "--num-epochs=${trialParameters.numberEpochs}" + restartPolicy: Never diff --git a/examples/v1beta1/nas/darts-example-gpu.yaml b/examples/v1beta1/nas/darts-example-gpu.yaml index a44a6c111c1..c7fc6fdb31c 100644 --- a/examples/v1beta1/nas/darts-example-gpu.yaml +++ b/examples/v1beta1/nas/darts-example-gpu.yaml @@ -57,29 +57,33 @@ spec: - "3" - operationType: skip_connection trialTemplate: - goTemplate: - rawTemplate: |- - apiVersion: batch/v1 - kind: Job - metadata: - name: {{.Trial}} - namespace: {{.NameSpace}} - spec: - template: - spec: - containers: - - name: {{.Trial}} + trialParameters: + - name: algorithmSettings + description: Algorithm settings of DARTS Experiment + reference: algorithm-settings + - name: searchSpace + description: Search Space of DARTS Experiment + reference: search-space + - name: num-layers + description: Number of layers of Neural Network + reference: numberLayers + trialSpec: + apiVersion: batch/v1 + kind: Job + spec: + template: + spec: + containers: + - name: training-container image: docker.io/kubeflowkatib/darts-cnn-cifar10 imagePullPolicy: Always command: - - "python3" - - "run_trial.py" - {{- with .HyperParameters}} - {{- range .}} - - "--{{.Name}}=\"{{.Value}}\"" - {{- end}} - {{- end}} + - python3 + - run_trial.py + - --algorithm-settings="${trialParameters.algorithmSettings}" + - --search-space="${trialParameters.searchSpace}" + - --num-layers="${trialParameters.numberLayers}" resources: limits: nvidia.com/gpu: 1 - restartPolicy: Never + restartPolicy: Never diff --git a/examples/v1beta1/nas/enas-example-cpu.yaml b/examples/v1beta1/nas/enas-example-cpu.yaml index af45ec9a13b..b979deb948e 100644 --- a/examples/v1beta1/nas/enas-example-cpu.yaml +++ b/examples/v1beta1/nas/enas-example-cpu.yaml @@ -23,31 +23,6 @@ spec: objectiveMetricName: Validation-Accuracy algorithm: algorithmName: enas - trialTemplate: - goTemplate: - rawTemplate: |- - apiVersion: batch/v1 - kind: Job - metadata: - name: {{.Trial}} - namespace: {{.NameSpace}} - spec: - template: - spec: - containers: - - name: {{.Trial}} - image: docker.io/kubeflowkatib/enas-cnn-cifar10-cpu - command: - - "python3" - - "-u" - - "RunTrial.py" - {{- with .HyperParameters}} - {{- range .}} - - "--{{.Name}}=\"{{.Value}}\"" - {{- end}} - {{- end}} - - "--num_epochs=1" - restartPolicy: Never nasConfig: graphConfig: numLayers: 1 @@ -147,3 +122,28 @@ spec: min: "2" max: "3" step: "1" + trialTemplate: + trialParameters: + - name: neuralNetworkArchitecture + description: NN architecture contains operations ID on each NN layer and skip connections between layers + reference: architecture + - name: neuralNetworkConfig + description: Configuration contains NN number of layers, input and output sizes, description what each operation ID means + reference: nn_config + trialSpec: + apiVersion: batch/v1 + kind: Job + spec: + template: + spec: + containers: + - name: training-container + image: docker.io/kubeflowkatib/enas-cnn-cifar10-cpu + command: + - python3 + - -u + - RunTrial.py + - --num_epochs=1 + - --architecture="${trialParameters.neuralNetworkArchitecture}" + - --nn_config="${trialParameters.neuralNetworkConfig}" + restartPolicy: Never diff --git a/examples/v1beta1/nas/enas-example-gpu.yaml b/examples/v1beta1/nas/enas-example-gpu.yaml index 1799c45f20b..938c263e844 100644 --- a/examples/v1beta1/nas/enas-example-gpu.yaml +++ b/examples/v1beta1/nas/enas-example-gpu.yaml @@ -20,33 +20,6 @@ spec: objectiveMetricName: Validation-Accuracy algorithm: algorithmName: enas - trialTemplate: - goTemplate: - rawTemplate: |- - apiVersion: batch/v1 - kind: Job - metadata: - name: {{.Trial}} - namespace: {{.NameSpace}} - spec: - template: - spec: - containers: - - name: {{.Trial}} - image: docker.io/kubeflowkatib/enas-cnn-cifar10-gpu - command: - - "python3" - - "-u" - - "RunTrial.py" - {{- with .HyperParameters}} - {{- range .}} - - "--{{.Name}}=\"{{.Value}}\"" - {{- end}} - {{- end}} - resources: - limits: - nvidia.com/gpu: 1 - restartPolicy: Never nasConfig: graphConfig: numLayers: 8 @@ -146,3 +119,30 @@ spec: min: "2" max: "3" step: "1" + trialTemplate: + trialParameters: + - name: neuralNetworkArchitecture + description: NN architecture contains operations ID on each NN layer and skip connections between layers + reference: architecture + - name: neuralNetworkConfig + description: Configuration contains NN number of layers, input and output sizes, description what each operation ID means + reference: nn_config + trialSpec: + apiVersion: batch/v1 + kind: Job + spec: + template: + spec: + containers: + - name: training-container + image: docker.io/kubeflowkatib/enas-cnn-cifar10-gpu + command: + - python3 + - -u + - RunTrial.py + - --architecture="${trialParameters.neuralNetworkArchitecture}" + - --nn_config="${trialParameters.neuralNetworkConfig}" + resources: + limits: + nvidia.com/gpu: 1 + restartPolicy: Never diff --git a/examples/v1beta1/never-resume-example.yaml b/examples/v1beta1/never-resume-example.yaml index 8ee684e6317..bbcfb79c29e 100644 --- a/examples/v1beta1/never-resume-example.yaml +++ b/examples/v1beta1/never-resume-example.yaml @@ -19,17 +19,17 @@ spec: maxFailedTrialCount: 3 resumePolicy: Never parameters: - - name: --lr + - name: lr parameterType: double feasibleSpace: min: "0.01" max: "0.03" - - name: --num-layers + - name: num-layers parameterType: int feasibleSpace: min: "2" max: "5" - - name: --optimizer + - name: optimizer parameterType: categorical feasibleSpace: list: @@ -37,26 +37,30 @@ spec: - adam - ftrl trialTemplate: - goTemplate: - rawTemplate: |- - apiVersion: batch/v1 - kind: Job - metadata: - name: {{.Trial}} - namespace: {{.NameSpace}} - spec: - template: - spec: - containers: - - name: {{.Trial}} + trialParameters: + - name: learningRate + description: Learning rate for the training model + reference: lr + - name: numberLayers + description: Number of training model layers + reference: num-layers + - name: optimizer + description: Training model optimizer (sdg, adam or ftrl) + reference: optimizer + trialSpec: + apiVersion: batch/v1 + kind: Job + spec: + template: + spec: + containers: + - name: training-container image: docker.io/kubeflowkatib/mxnet-mnist command: - - "python3" - - "/opt/mxnet-mnist/mnist.py" - - "--batch-size=64" - {{- with .HyperParameters}} - {{- range .}} - - "{{.Name}}={{.Value}}" - {{- end}} - {{- end}} - restartPolicy: Never + - "python3" + - "/opt/mxnet-mnist/mnist.py" + - "--batch-size=64" + - "--lr=${trialParameters.learningRate}" + - "--num-layers=${trialParameters.numberLayers}" + - "--optimizer=${trialParameters.optimizer}" + restartPolicy: Never diff --git a/examples/v1beta1/pytorchjob-example.yaml b/examples/v1beta1/pytorchjob-example.yaml index 33a5a888819..0bf9bcebbff 100644 --- a/examples/v1beta1/pytorchjob-example.yaml +++ b/examples/v1beta1/pytorchjob-example.yaml @@ -13,16 +13,30 @@ spec: objectiveMetricName: accuracy algorithm: algorithmName: random + parameters: + - name: lr + parameterType: double + feasibleSpace: + min: "0.01" + max: "0.05" + - name: momentum + parameterType: double + feasibleSpace: + min: "0.5" + max: "0.9" trialTemplate: - goTemplate: - rawTemplate: |- - apiVersion: "kubeflow.org/v1" - kind: PyTorchJob - metadata: - name: {{.Trial}} - namespace: {{.NameSpace}} - spec: - pytorchReplicaSpecs: + trialParameters: + - name: learningRate + description: Learning rate for the training model + reference: lr + - name: momentum + description: Momentum for the training model + reference: momentum + trialSpec: + apiVersion: "kubeflow.org/v1" + kind: PyTorchJob + spec: + pytorchReplicaSpecs: Master: replicas: 1 restartPolicy: OnFailure @@ -35,11 +49,8 @@ spec: command: - "python" - "/var/mnist.py" - {{- with .HyperParameters}} - {{- range .}} - - "{{.Name}}={{.Value}}" - {{- end}} - {{- end}} + - "--lr=${trialParameters.learningRate}" + - "--momentum=${trialParameters.momentum}" Worker: replicas: 2 restartPolicy: OnFailure @@ -52,19 +63,5 @@ spec: command: - "python" - "/var/mnist.py" - {{- with .HyperParameters}} - {{- range .}} - - "{{.Name}}={{.Value}}" - {{- end}} - {{- end}} - parameters: - - name: --lr - parameterType: double - feasibleSpace: - min: "0.01" - max: "0.05" - - name: --momentum - parameterType: double - feasibleSpace: - min: "0.5" - max: "0.9" + - "--lr=${trialParameters.learningRate}" + - "--momentum=${trialParameters.momentum}" diff --git a/examples/v1beta1/tfjob-example.yaml b/examples/v1beta1/tfjob-example.yaml index ade5c38379a..153cf50839c 100644 --- a/examples/v1beta1/tfjob-example.yaml +++ b/examples/v1beta1/tfjob-example.yaml @@ -21,41 +21,41 @@ spec: collector: kind: TensorFlowEvent parameters: - - name: --learning_rate + - name: learning_rate parameterType: double feasibleSpace: min: "0.01" max: "0.05" - - name: --batch_size + - name: batch_size parameterType: int feasibleSpace: min: "100" max: "200" trialTemplate: - goTemplate: - rawTemplate: |- - apiVersion: "kubeflow.org/v1" - kind: TFJob - metadata: - name: {{.Trial}} - namespace: {{.NameSpace}} - spec: - tfReplicaSpecs: + trialParameters: + - name: learningRate + description: Learning rate for the training model + reference: learning_rate + - name: batchSize + description: Batch Size + reference: batch_size + trialSpec: + apiVersion: "kubeflow.org/v1" + kind: TFJob + spec: + tfReplicaSpecs: Worker: - replicas: 1 + replicas: 2 restartPolicy: OnFailure template: spec: containers: - - name: tensorflow + - name: tensorflow image: gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0 imagePullPolicy: Always command: - "python" - "/var/tf_mnist/mnist_with_summaries.py" - "--log_dir=/train/metrics" - {{- with .HyperParameters}} - {{- range .}} - - "{{.Name}}={{.Value}}" - {{- end}} - {{- end}} + - "--learning_rate=${trialParameters.learningRate}" + - "--batch_size=${trialParameters.batchSize}" diff --git a/examples/v1beta1/tpe-example.yaml b/examples/v1beta1/tpe-example.yaml index 5f9714c34d8..63291cb1a2e 100644 --- a/examples/v1beta1/tpe-example.yaml +++ b/examples/v1beta1/tpe-example.yaml @@ -18,17 +18,17 @@ spec: maxTrialCount: 12 maxFailedTrialCount: 3 parameters: - - name: --lr + - name: lr parameterType: double feasibleSpace: min: "0.01" max: "0.03" - - name: --num-layers + - name: num-layers parameterType: int feasibleSpace: min: "2" max: "5" - - name: --optimizer + - name: optimizer parameterType: categorical feasibleSpace: list: @@ -36,26 +36,30 @@ spec: - adam - ftrl trialTemplate: - goTemplate: - rawTemplate: |- - apiVersion: batch/v1 - kind: Job - metadata: - name: {{.Trial}} - namespace: {{.NameSpace}} - spec: - template: - spec: - containers: - - name: {{.Trial}} + trialParameters: + - name: learningRate + description: Learning rate for the training model + reference: lr + - name: numberLayers + description: Number of training model layers + reference: num-layers + - name: optimizer + description: Training model optimizer (sdg, adam or ftrl) + reference: optimizer + trialSpec: + apiVersion: batch/v1 + kind: Job + spec: + template: + spec: + containers: + - name: training-container image: docker.io/kubeflowkatib/mxnet-mnist command: - - "python3" - - "/opt/mxnet-mnist/mnist.py" - - "--batch-size=64" - {{- with .HyperParameters}} - {{- range .}} - - "{{.Name}}={{.Value}}" - {{- end}} - {{- end}} - restartPolicy: Never + - "python3" + - "/opt/mxnet-mnist/mnist.py" + - "--batch-size=64" + - "--lr=${trialParameters.learningRate}" + - "--num-layers=${trialParameters.numberLayers}" + - "--optimizer=${trialParameters.optimizer}" + restartPolicy: Never diff --git a/pkg/controller.v1beta1/experiment/experiment_controller.go b/pkg/controller.v1beta1/experiment/experiment_controller.go index 82cb10e5b9d..7e30f49807b 100644 --- a/pkg/controller.v1beta1/experiment/experiment_controller.go +++ b/pkg/controller.v1beta1/experiment/experiment_controller.go @@ -325,7 +325,6 @@ func (r *ReconcileExperiment) ReconcileTrials(instance *experimentsv1beta1.Exper //skip if no trials need to be created if addCount > 0 { //create "addCount" number of trials - logger.Info("CreateTrials", "addCount", addCount) if err := r.createTrials(instance, trials, addCount); err != nil { logger.Error(err, "Create trials error") return err @@ -341,15 +340,24 @@ func (r *ReconcileExperiment) createTrials(instance *experimentsv1beta1.Experime logger := log.WithValues("Experiment", types.NamespacedName{Name: instance.GetName(), Namespace: instance.GetNamespace()}) currentCount := int32(len(trialList)) + logger.Info("Reconcile Suggestion", "addCount", addCount) trials, err := r.ReconcileSuggestions(instance, currentCount, addCount) if err != nil { logger.Error(err, "Get suggestions error") return err } + var trialNames []string for _, trial := range trials { - if err = r.createTrialInstance(instance, &trial); err != nil { - logger.Error(err, "Create trial instance error", "trial", trial) - continue + trialNames = append(trialNames, trial.Name) + } + // If Trial Assignment is not ready we don't need to create Trials + if len(trialNames) != 0 { + logger.Info("Create Trials", "trialNames", trialNames) + for _, trial := range trials { + if err = r.createTrialInstance(instance, &trial); err != nil { + logger.Error(err, "Create trial instance error", "trial", trial) + continue + } } } return nil diff --git a/pkg/controller.v1beta1/experiment/experiment_controller_test.go b/pkg/controller.v1beta1/experiment/experiment_controller_test.go index 038172a6002..135c02adb36 100644 --- a/pkg/controller.v1beta1/experiment/experiment_controller_test.go +++ b/pkg/controller.v1beta1/experiment/experiment_controller_test.go @@ -23,9 +23,13 @@ import ( suggestionsv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/suggestions/v1beta1" trialsv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/trials/v1beta1" "github.com/kubeflow/katib/pkg/controller.v1beta1/consts" - "github.com/kubeflow/katib/pkg/controller.v1beta1/experiment/util" + experimentUtil "github.com/kubeflow/katib/pkg/controller.v1beta1/experiment/util" + util "github.com/kubeflow/katib/pkg/controller.v1beta1/util" manifestmock "github.com/kubeflow/katib/pkg/mock/v1beta1/experiment/manifest" suggestionmock "github.com/kubeflow/katib/pkg/mock/v1beta1/experiment/suggestion" + kubeflowcommonv1 "github.com/kubeflow/tf-operator/pkg/apis/common/v1" + tfv1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1" + v1 "k8s.io/api/core/v1" ) const ( @@ -125,8 +129,12 @@ func TestReconcileExperiment(t *testing.T) { Name: trialKey.Name, ParameterAssignments: []commonapiv1beta1.ParameterAssignment{ { - Name: "--lr", - Value: "0.5", + Name: "lr", + Value: "0.01", + }, + { + Name: "num-layers", + Value: "5", }, }, }, @@ -137,30 +145,66 @@ func TestReconcileExperiment(t *testing.T) { mockCtrl3 := gomock.NewController(t) defer mockCtrl3.Finish() generator := manifestmock.NewMockGenerator(mockCtrl) + + returnedTFJob := &tfv1.TFJob{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "kubeflow.org/v1", + Kind: "TFJob", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "trial-name", + Namespace: "trial-namespace", + }, + Spec: tfv1.TFJobSpec{ + TFReplicaSpecs: map[tfv1.TFReplicaType]*kubeflowcommonv1.ReplicaSpec{ + tfv1.TFReplicaTypePS: &kubeflowcommonv1.ReplicaSpec{ + Replicas: func() *int32 { i := int32(1); return &i }(), + RestartPolicy: kubeflowcommonv1.RestartPolicyOnFailure, + Template: v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + v1.Container{ + Name: "tensorflow", + Image: "gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0", + Command: []string{ + "python", + "/var/tf_mnist/mnist_with_summaries.py", + "--log_dir=/train/metrics", + "--lr=0.01", + "--num-layers=5", + }, + VolumeMounts: []v1.VolumeMount{ + v1.VolumeMount{ + Name: "train", + MountPath: "/train", + }, + }, + }, + }, + Volumes: []v1.Volume{ + v1.Volume{ + Name: "train", + VolumeSource: v1.VolumeSource{ + PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ + ClaimName: "tfevent-volume", + }, + }, + }, + }, + }, + }, + }, + }, + }, + } + returnedUnstructured, err := util.ConvertObjectToUnstructured(returnedTFJob) + if err != nil { + t.Errorf("ConvertObjectToUnstructured failed: %v", err) + } generator.EXPECT().GetRunSpecWithHyperParameters(gomock.Any(), gomock.Any(), - gomock.Any(), gomock.Any(), gomock.Any()).Return(`apiVersion: "kubeflow.org/v1" -kind: "TFJob" -metadata: - name: "test" - namespace: "default" -spec: - tfReplicaSpecs: - PS: - replicas: 2 - restartPolicy: Never - template: - spec: - containers: - - name: tensorflow - image: kubeflow/tf-dist-mnist-test:1.0 - Worker: - replicas: 4 - restartPolicy: Never - template: - spec: - containers: - - name: tensorflow - image: kubeflow/tf-dist-mnist-test:1.0`, nil).AnyTimes() + gomock.Any(), gomock.Any()).Return( + returnedUnstructured, + nil).AnyTimes() mgr, err := manager.New(cfg, manager.Options{}) g.Expect(err).NotTo(gomega.HaveOccurred()) @@ -171,7 +215,7 @@ spec: scheme: mgr.GetScheme(), Suggestion: suggestion, Generator: generator, - collector: util.NewExpsCollector(mgr.GetCache(), prometheus.NewRegistry()), + collector: experimentUtil.NewExpsCollector(mgr.GetCache(), prometheus.NewRegistry()), } r.updateStatusHandler = func(instance *experimentsv1beta1.Experiment) error { if !instance.IsCreated() { @@ -222,6 +266,57 @@ spec: func newFakeInstance() *experimentsv1beta1.Experiment { var parallelCount int32 = 1 var goal float64 = 99.9 + + trialTemplateJob := &tfv1.TFJob{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "kubeflow.org/v1", + Kind: "TFJob", + }, + Spec: tfv1.TFJobSpec{ + TFReplicaSpecs: map[tfv1.TFReplicaType]*kubeflowcommonv1.ReplicaSpec{ + tfv1.TFReplicaTypePS: &kubeflowcommonv1.ReplicaSpec{ + Replicas: func() *int32 { i := int32(1); return &i }(), + RestartPolicy: kubeflowcommonv1.RestartPolicyOnFailure, + Template: v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + v1.Container{ + Name: "tensorflow", + Image: "gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0", + Command: []string{ + "python", + "/var/tf_mnist/mnist_with_summaries.py", + "--log_dir=/train/metrics", + "--lr=${trialParameters.learningRate}", + "--num-layers=${trialParameters.numberLayers}", + }, + VolumeMounts: []v1.VolumeMount{ + v1.VolumeMount{ + Name: "train", + MountPath: "/train", + }, + }, + }, + }, + Volumes: []v1.Volume{ + v1.Volume{ + Name: "train", + VolumeSource: v1.VolumeSource{ + PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ + ClaimName: "tfevent-volume", + }, + }, + }, + }, + }, + }, + }, + }, + }, + } + + trialSpec, _ := util.ConvertObjectToUnstructured(trialTemplateJob) + return &experimentsv1beta1.Experiment{ ObjectMeta: metav1.ObjectMeta{ Name: experimentName, @@ -236,39 +331,20 @@ func newFakeInstance() *experimentsv1beta1.Experiment { ObjectiveMetricName: "accuracy", }, TrialTemplate: &experimentsv1beta1.TrialTemplate{ - GoTemplate: &experimentsv1beta1.GoTemplate{ - RawTemplate: `apiVersion: "kubeflow.org/v1" -kind: TFJob -metadata: - name: {{.Trial}} - namespace: {{.NameSpace}} -spec: - tfReplicaSpecs: - Worker: - replicas: 1 - restartPolicy: OnFailure - template: - spec: - containers: - - name: tensorflow - image: gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0 - imagePullPolicy: Always - command: - - "python" - - "/var/tf_mnist/mnist_with_summaries.py" - - "--log_dir=/train/{{.Trial}}" - {{- with .HyperParameters}} - {{- range .}} - - "{{.Name}}={{.Value}}" - {{- end}} - {{- end}} - volumeMounts: - - mountPath: "/train" - name: "train" - volumes: - - name: "train" - persistentVolumeClaim: - claimName: "tfevent-volume"`, + TrialParameters: []experimentsv1beta1.TrialParameterSpec{ + experimentsv1beta1.TrialParameterSpec{ + Name: "learningRate", + Description: "Learning Rate", + Reference: "lr", + }, + experimentsv1beta1.TrialParameterSpec{ + Name: "numberLayers", + Description: "Number of layers", + Reference: "num-layers", + }, + }, + TrialSource: experimentsv1beta1.TrialSource{ + TrialSpec: trialSpec, }, }, }, diff --git a/pkg/controller.v1beta1/experiment/experiment_util.go b/pkg/controller.v1beta1/experiment/experiment_util.go index d6e0d986716..3e8513222aa 100644 --- a/pkg/controller.v1beta1/experiment/experiment_util.go +++ b/pkg/controller.v1beta1/experiment/experiment_util.go @@ -39,6 +39,7 @@ func (r *ReconcileExperiment) createTrialInstance(expInstance *experimentsv1beta trial.Spec.ParameterAssignments = trialAssignment.ParameterAssignments runSpec, err := r.GetRunSpecWithHyperParameters(expInstance, trial.Name, trial.Namespace, hps) + logger.Info("RUN SPEC-------------", "runSpec", runSpec) if err != nil { logger.Error(err, "Fail to get RunSpec from experiment", expInstance.Name) return err diff --git a/pkg/controller.v1beta1/experiment/manifest/generator.go b/pkg/controller.v1beta1/experiment/manifest/generator.go index 2114f687535..55531ee80bd 100644 --- a/pkg/controller.v1beta1/experiment/manifest/generator.go +++ b/pkg/controller.v1beta1/experiment/manifest/generator.go @@ -1,7 +1,6 @@ package manifest import ( - "bytes" "errors" "fmt" "strings" @@ -13,9 +12,9 @@ import ( commonapiv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/common/v1beta1" experimentsv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/experiments/v1beta1" "github.com/kubeflow/katib/pkg/controller.v1beta1/consts" + util "github.com/kubeflow/katib/pkg/controller.v1beta1/util" "github.com/kubeflow/katib/pkg/util/v1beta1/katibclient" "github.com/kubeflow/katib/pkg/util/v1beta1/katibconfig" - k8syaml "k8s.io/apimachinery/pkg/util/yaml" ) const ( @@ -25,6 +24,7 @@ const ( // Generator is the type for manifests Generator. type Generator interface { InjectClient(c client.Client) + // TODO (andreyvelich): Add this after changing validation for new Trial Template // GetRunSpec(e *experimentsv1beta1.Experiment, experiment, trial, namespace string) (string, error) GetRunSpecWithHyperParameters(experiment *experimentsv1beta1.Experiment, trialName, trialNamespace string, assignments []commonapiv1beta1.ParameterAssignment) (*unstructured.Unstructured, error) GetSuggestionConfigData(algorithmName string) (map[string]string, error) @@ -75,11 +75,9 @@ func (g *DefaultGenerator) GetRunSpecWithHyperParameters(experiment *experiments return nil, err } // Convert Trial template to unstructured - replacedTemplateBytes := bytes.NewBufferString(replacedTemplate) - runSpec := &unstructured.Unstructured{} - err = k8syaml.NewYAMLOrJSONDecoder(replacedTemplateBytes, 1024).Decode(runSpec) + runSpec, err := util.ConvertStringToUnstructured(replacedTemplate) if err != nil { - return nil, err + return nil, fmt.Errorf("ConvertStringToUnstructured failed: %v", err) } // Set name and namespace for Run Spec @@ -114,13 +112,14 @@ func (g *DefaultGenerator) applyParameters(trialTemplate string, trialParams []e func (g *DefaultGenerator) getTrialTemplate(instance *experimentsv1beta1.Experiment) (string, error) { var trialTemplateString string + var err error + trialSource := instance.Spec.TrialTemplate.TrialSource if trialSource.TrialSpec != nil { - trialTemplateByte, err := trialSource.TrialSpec.MarshalJSON() + trialTemplateString, err = util.ConvertUnstructuredToString(trialSource.TrialSpec) if err != nil { - return "", err + return "", fmt.Errorf("ConvertUnstructuredToString failed: %v", err) } - trialTemplateString = string(trialTemplateByte) } else { configMapNS := trialSource.ConfigMap.ConfigMapNamespace configMapName := trialSource.ConfigMap.ConfigMapName diff --git a/pkg/controller.v1beta1/experiment/manifest/generator_test.go b/pkg/controller.v1beta1/experiment/manifest/generator_test.go index 9a885df574f..fa54a532527 100644 --- a/pkg/controller.v1beta1/experiment/manifest/generator_test.go +++ b/pkg/controller.v1beta1/experiment/manifest/generator_test.go @@ -1,20 +1,17 @@ package manifest import ( - "bytes" "reflect" "testing" "github.com/golang/mock/gomock" commonapiv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/common/v1beta1" experimentsv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/experiments/v1beta1" + util "github.com/kubeflow/katib/pkg/controller.v1beta1/util" katibclientmock "github.com/kubeflow/katib/pkg/mock/v1beta1/util/katibclient" batchv1 "k8s.io/api/batch/v1" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/runtime" - k8syaml "k8s.io/apimachinery/pkg/util/yaml" ) func TestGetRunSpecWithHP(t *testing.T) { @@ -74,11 +71,9 @@ func TestGetRunSpecWithHP(t *testing.T) { }, } - expected := &unstructured.Unstructured{} - - expected.Object, err = runtime.DefaultUnstructuredConverter.ToUnstructured(&expectedJob) + expected, err := util.ConvertObjectToUnstructured(expectedJob) if err != nil { - t.Errorf("ToUnstructured failed: %v", err) + t.Errorf("ConvertObjectToUnstructured failed: %v", err) } if !reflect.DeepEqual(expected.Object, actual.Object) { @@ -152,11 +147,9 @@ spec: - "--lr=0.05" - "--num-layers=5"` - expectedBytes := bytes.NewBufferString(expectedJob) - expected := &unstructured.Unstructured{} - err = k8syaml.NewYAMLOrJSONDecoder(expectedBytes, 1024).Decode(expected) + expected, err := util.ConvertStringToUnstructured(expectedJob) if err != nil { - t.Errorf("NewYAMLOrJSONDecoder failed: %v", err) + t.Errorf("ConvertStringToUnstructured failed: %v", err) } if !reflect.DeepEqual(expected, actual) { t.Errorf("Expected %s\n got %s", expected.Object, actual.Object) @@ -189,9 +182,7 @@ func newFakeInstance() *experimentsv1beta1.Experiment { }, }, } - - trialSpec := &unstructured.Unstructured{} - trialSpec.Object, _ = runtime.DefaultUnstructuredConverter.ToUnstructured(trialTemplateJob) + trialSpec, _ := util.ConvertObjectToUnstructured(trialTemplateJob) return &experimentsv1beta1.Experiment{ Spec: experimentsv1beta1.ExperimentSpec{ diff --git a/pkg/controller.v1beta1/trial/trial_controller_test.go b/pkg/controller.v1beta1/trial/trial_controller_test.go index 8f91a817319..286b1bd2a37 100644 --- a/pkg/controller.v1beta1/trial/trial_controller_test.go +++ b/pkg/controller.v1beta1/trial/trial_controller_test.go @@ -1,20 +1,21 @@ package trial import ( - "bytes" "testing" "time" "github.com/golang/mock/gomock" + tfv1 "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1" "github.com/onsi/gomega" "github.com/prometheus/client_golang/prometheus" "golang.org/x/net/context" corev1 "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" + + util "github.com/kubeflow/katib/pkg/controller.v1beta1/util" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/types" - k8syaml "k8s.io/apimachinery/pkg/util/yaml" "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/reconcile" logf "sigs.k8s.io/controller-runtime/pkg/runtime/log" @@ -23,18 +24,20 @@ import ( trialsv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/trials/v1beta1" api_pb "github.com/kubeflow/katib/pkg/apis/manager/v1beta1" managerclientmock "github.com/kubeflow/katib/pkg/mock/v1beta1/trial/managerclient" + kubeflowcommonv1 "github.com/kubeflow/tf-operator/pkg/apis/common/v1" ) const ( - trialName = "foo" - namespace = "default" + trialName = "trial-name" + trialNamespace = "trial-namespace" + tfJobName = "tfjob-name" timeout = time.Second * 40 ) -var expectedRequest = reconcile.Request{NamespacedName: types.NamespacedName{Name: trialName, Namespace: namespace}} +var expectedRequest = reconcile.Request{NamespacedName: types.NamespacedName{Name: trialName, Namespace: trialNamespace}} var expectedResult = reconcile.Result{Requeue: true} -var tfJobKey = types.NamespacedName{Name: "test", Namespace: namespace} +var tfJobKey = types.NamespacedName{Name: tfJobName, Namespace: trialNamespace} func init() { logf.SetLogger(logf.ZapLogger(true)) @@ -146,12 +149,8 @@ func TestReconcileTFJobTrial(t *testing.T) { } g.Expect(err).NotTo(gomega.HaveOccurred()) - tfJob := &unstructured.Unstructured{} - bufSize := 1024 - buf := bytes.NewBufferString(instance.Spec.RunSpec) - if err := k8syaml.NewYAMLOrJSONDecoder(buf, bufSize).Decode(tfJob); err != nil { - t.Errorf("Expected nil, got %v", err) - } + tfJob := instance.Spec.RunSpec + g.Eventually(func() error { return c.Get(context.TODO(), tfJobKey, tfJob) }, timeout). Should(gomega.Succeed()) @@ -162,14 +161,15 @@ func TestReconcileTFJobTrial(t *testing.T) { // Manually delete TFJob since GC isn't enabled in the test control plane g.Eventually(func() error { return c.Delete(context.TODO(), tfJob) }, timeout). - Should(gomega.MatchError("tfjobs.kubeflow.org \"test\" not found")) + Should(gomega.MatchError("tfjobs.kubeflow.org \"tfjob-name\" not found")) g.Expect(c.Delete(context.TODO(), instance)).NotTo(gomega.HaveOccurred()) } func TestReconcileCompletedTFJobTrial(t *testing.T) { g := gomega.NewGomegaWithT(t) instance := newFakeTrialWithTFJob() - instance.Name = "tfjob-trial" + // Trial name must be different to avoid errors + instance.Name = "new-trial-name" mockCtrl := gomock.NewController(t) defer mockCtrl.Finish() @@ -240,37 +240,73 @@ func TestReconcileCompletedTFJobTrial(t *testing.T) { func newFakeTrialWithTFJob() *trialsv1beta1.Trial { objectiveSpec := commonv1beta1.ObjectiveSpec{ObjectiveMetricName: "test"} + runSpecTFJob := &tfv1.TFJob{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "kubeflow.org/v1", + Kind: "TFJob", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: tfJobName, + Namespace: trialNamespace, + }, + Spec: tfv1.TFJobSpec{ + TFReplicaSpecs: map[tfv1.TFReplicaType]*kubeflowcommonv1.ReplicaSpec{ + tfv1.TFReplicaTypePS: &kubeflowcommonv1.ReplicaSpec{ + Replicas: func() *int32 { i := int32(2); return &i }(), + RestartPolicy: kubeflowcommonv1.RestartPolicyNever, + Template: v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + v1.Container{ + Name: "tensorflow", + Image: "gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0", + Command: []string{ + "python", + "/var/tf_mnist/mnist_with_summaries.py", + "--log_dir=/train/metrics", + "--lr=0.01", + "--num-layers=5", + }, + }, + }, + }, + }, + }, + tfv1.TFReplicaTypeWorker: &kubeflowcommonv1.ReplicaSpec{ + Replicas: func() *int32 { i := int32(4); return &i }(), + RestartPolicy: kubeflowcommonv1.RestartPolicyNever, + Template: v1.PodTemplateSpec{ + Spec: v1.PodSpec{ + Containers: []v1.Container{ + v1.Container{ + Name: "tensorflow", + Image: "gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0", + Command: []string{ + "python", + "/var/tf_mnist/mnist_with_summaries.py", + "--log_dir=/train/metrics", + "--lr=0.01", + "--num-layers=5", + }, + }, + }, + }, + }, + }, + }, + }, + } + + runSpec, _ := util.ConvertObjectToUnstructured(runSpecTFJob) + t := &trialsv1beta1.Trial{ ObjectMeta: metav1.ObjectMeta{ Name: trialName, - Namespace: namespace, + Namespace: trialNamespace, }, Spec: trialsv1beta1.TrialSpec{ Objective: &objectiveSpec, - RunSpec: `apiVersion: "kubeflow.org/v1" -kind: "TFJob" -metadata: - name: "test" - namespace: "default" -spec: - tfReplicaSpecs: - PS: - replicas: 2 - restartPolicy: Never - template: - spec: - containers: - - name: tensorflow - image: kubeflow/tf-dist-mnist-test:1.0 - Worker: - replicas: 4 - restartPolicy: Never - template: - spec: - containers: - - name: tensorflow - image: kubeflow/tf-dist-mnist-test:1.0 -`, + RunSpec: runSpec, }, } return t diff --git a/pkg/controller.v1beta1/util/unstructured.go b/pkg/controller.v1beta1/util/unstructured.go new file mode 100644 index 00000000000..1f538c8e59f --- /dev/null +++ b/pkg/controller.v1beta1/util/unstructured.go @@ -0,0 +1,57 @@ +package util + +import ( + "bytes" + + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + k8syaml "k8s.io/apimachinery/pkg/util/yaml" + logf "sigs.k8s.io/controller-runtime/pkg/runtime/log" +) + +const ( + bufferSize = 1024 +) + +var ( + logUnstructured = logf.Log.WithName("util-unstructured") +) + +// ConvertUnstructuredToString returns string from Unstructured value +func ConvertUnstructuredToString(in *unstructured.Unstructured) (string, error) { + inByte, err := in.MarshalJSON() + if err != nil { + logUnstructured.Error(err, "MarshalJSON failed") + return "", err + } + + return string(inByte), nil +} + +// ConvertStringToUnstructured returns Unstructured from string value +func ConvertStringToUnstructured(in string) (*unstructured.Unstructured, error) { + inBytes := bytes.NewBufferString(in) + out := &unstructured.Unstructured{} + + err := k8syaml.NewYAMLOrJSONDecoder(inBytes, bufferSize).Decode(out) + if err != nil { + logUnstructured.Error(err, "Decode Unstructured to String failed") + return nil, err + } + + return out, nil +} + +// ConvertObjectToUnstructured returns Unstructured from Kubernetes Object value +func ConvertObjectToUnstructured(in interface{}) (*unstructured.Unstructured, error) { + out := &unstructured.Unstructured{} + var err error + + out.Object, err = runtime.DefaultUnstructuredConverter.ToUnstructured(&in) + if err != nil { + logUnstructured.Error(err, "Convert Object to Unstructured failed") + return nil, err + } + + return out, nil +} diff --git a/pkg/job/v1beta1/job.go b/pkg/job/v1beta1/job.go index 2e55a8928ba..ac8caf009f5 100644 --- a/pkg/job/v1beta1/job.go +++ b/pkg/job/v1beta1/job.go @@ -34,7 +34,7 @@ func (j Job) GetDeployedJobStatus( } // Job does not have the running condition in status, thus we think // the job is running when it is created. - jobLogger.Info("NestedFieldCopy", "err", "status cannot be found in job") + jobLogger.Info("NestedFieldCopy", "Info", "Job doesn't have status yet") return nil, nil } diff --git a/pkg/job/v1beta1/kubeflow.go b/pkg/job/v1beta1/kubeflow.go index 925e5f752f7..a4e698c3aec 100644 --- a/pkg/job/v1beta1/kubeflow.go +++ b/pkg/job/v1beta1/kubeflow.go @@ -36,7 +36,7 @@ func (k Kubeflow) GetDeployedJobStatus( return nil, unerr } kfLogger.Info("NestedFieldCopy unstructured to status error", - "err", "Status is not found in job") + "Info", "Kubeflow Job doesn't have status yet") return nil, nil } diff --git a/pkg/webhook/v1beta1/experiment/validator/validator_test.go b/pkg/webhook/v1beta1/experiment/validator/validator_test.go index 2a692ff037a..f39f96fe868 100644 --- a/pkg/webhook/v1beta1/experiment/validator/validator_test.go +++ b/pkg/webhook/v1beta1/experiment/validator/validator_test.go @@ -1,7 +1,6 @@ package validator import ( - "strings" "testing" "github.com/golang/mock/gomock" @@ -11,7 +10,6 @@ import ( commonv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/common/v1beta1" experimentsv1beta1 "github.com/kubeflow/katib/pkg/apis/controller/experiments/v1beta1" - "github.com/kubeflow/katib/pkg/controller.v1beta1/consts" manifestmock "github.com/kubeflow/katib/pkg/mock/v1beta1/experiment/manifest" v1 "k8s.io/api/core/v1" ) @@ -20,280 +18,281 @@ func init() { logf.SetLogger(logf.ZapLogger(false)) } -func TestValidateTFJobTrialTemplate(t *testing.T) { - trialTFJobTemplate := `apiVersion: "kubeflow.org/v1" -kind: "TFJob" -metadata: - name: "dist-mnist-for-e2e-test" -spec: - tfReplicaSpecs: - Worker: - template: - spec: - containers: - - name: tensorflow - image: gaocegege/mnist:1` - - mockCtrl := gomock.NewController(t) - defer mockCtrl.Finish() - - p := manifestmock.NewMockGenerator(mockCtrl) - g := New(p) - - p.EXPECT().GetRunSpec(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(trialTFJobTemplate, nil) - - instance := newFakeInstance() - if err := g.(*DefaultValidator).validateTrialTemplate(instance); err == nil { - t.Errorf("Expected error, got nil") - } -} - -func TestValidateJobTrialTemplate(t *testing.T) { - trialJobTemplate := `apiVersion: batch/v1 -kind: Job -metadata: - name: fake-trial - namespace: fakens -spec: - template: - spec: - containers: - - name: fake-trial - image: test-image` - - mockCtrl := gomock.NewController(t) - defer mockCtrl.Finish() - - p := manifestmock.NewMockGenerator(mockCtrl) - g := New(p) - - invalidYaml := strings.Replace(trialJobTemplate, "- name", "- * -", -1) - invalidJobType := strings.Replace(trialJobTemplate, "Job", "NewJobType", -1) - invalidNamespace := strings.Replace(trialJobTemplate, "fakens", "not-fakens", -1) - invalidJobName := strings.Replace(trialJobTemplate, "fake-trial", "new-name", -1) - - validRun := p.EXPECT().GetRunSpec(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(trialJobTemplate, nil) - invalidYamlRun := p.EXPECT().GetRunSpec(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(invalidYaml, nil) - invalidJobTypeRun := p.EXPECT().GetRunSpec(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(invalidJobType, nil) - invalidNamespaceRun := p.EXPECT().GetRunSpec(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(invalidNamespace, nil) - invalidJobNameRun := p.EXPECT().GetRunSpec(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(invalidJobName, nil) - - gomock.InOrder( - validRun, - invalidYamlRun, - invalidJobTypeRun, - invalidNamespaceRun, - invalidJobNameRun, - ) - - tcs := []struct { - Instance *experimentsv1beta1.Experiment - Err bool - }{ - { - Instance: func() *experimentsv1beta1.Experiment { - i := newFakeInstance() - i.Spec.TrialTemplate = newFakeTrialTemplate(trialJobTemplate) - return i - }(), - Err: false, - }, - { - Instance: func() *experimentsv1beta1.Experiment { - i := newFakeInstance() - i.Spec.TrialTemplate = newFakeTrialTemplate(invalidYaml) - return i - }(), - Err: true, - }, - { - Instance: func() *experimentsv1beta1.Experiment { - i := newFakeInstance() - i.Spec.TrialTemplate = newFakeTrialTemplate(invalidJobType) - return i - }(), - Err: true, - }, - { - Instance: func() *experimentsv1beta1.Experiment { - i := newFakeInstance() - i.Spec.TrialTemplate = newFakeTrialTemplate(invalidNamespace) - return i - }(), - Err: true, - }, - { - Instance: func() *experimentsv1beta1.Experiment { - i := newFakeInstance() - i.Spec.TrialTemplate = newFakeTrialTemplate(invalidJobName) - return i - }(), - Err: true, - }, - } - for _, tc := range tcs { - err := g.(*DefaultValidator).validateTrialTemplate(tc.Instance) - if !tc.Err && err != nil { - t.Errorf("Expected nil, got %v", err) - } else if tc.Err && err == nil { - t.Errorf("Expected err, got nil") - } - } -} - -func TestValidateExperiment(t *testing.T) { - mockCtrl := gomock.NewController(t) - defer mockCtrl.Finish() - - p := manifestmock.NewMockGenerator(mockCtrl) - g := New(p) - - trialJobTemplate := `apiVersion: "batch/v1" -kind: "Job" -metadata: - name: "fake-trial" - namespace: fakens` - - suggestionConfigData := map[string]string{} - suggestionConfigData[consts.LabelSuggestionImageTag] = "algorithmImage" - fakeNegativeInt := int32(-1) - - p.EXPECT().GetRunSpec(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(trialJobTemplate, nil).AnyTimes() - p.EXPECT().GetSuggestionConfigData(gomock.Any()).Return(suggestionConfigData, nil).AnyTimes() - p.EXPECT().GetMetricsCollectorImage(gomock.Any()).Return("metricsCollectorImage", nil).AnyTimes() - - tcs := []struct { - Instance *experimentsv1beta1.Experiment - Err bool - oldInstance *experimentsv1beta1.Experiment - }{ - //Objective - { - Instance: func() *experimentsv1beta1.Experiment { - i := newFakeInstance() - i.Spec.Objective = nil - return i - }(), - Err: true, - }, - { - Instance: func() *experimentsv1beta1.Experiment { - i := newFakeInstance() - i.Spec.Objective.Type = commonv1beta1.ObjectiveTypeUnknown - return i - }(), - Err: true, - }, - { - Instance: func() *experimentsv1beta1.Experiment { - i := newFakeInstance() - i.Spec.Objective.ObjectiveMetricName = "" - return i - }(), - Err: true, - }, - //Algorithm - { - Instance: func() *experimentsv1beta1.Experiment { - i := newFakeInstance() - i.Spec.Algorithm = nil - return i - }(), - Err: true, - }, - { - Instance: func() *experimentsv1beta1.Experiment { - i := newFakeInstance() - i.Spec.Algorithm.AlgorithmName = "" - return i - }(), - Err: true, - }, - { - Instance: newFakeInstance(), - Err: false, - }, - { - Instance: func() *experimentsv1beta1.Experiment { - i := newFakeInstance() - i.Spec.MaxFailedTrialCount = &fakeNegativeInt - return i - }(), - Err: true, - }, - { - Instance: func() *experimentsv1beta1.Experiment { - i := newFakeInstance() - i.Spec.MaxTrialCount = &fakeNegativeInt - return i - }(), - Err: true, - }, - { - Instance: func() *experimentsv1beta1.Experiment { - i := newFakeInstance() - i.Spec.ParallelTrialCount = &fakeNegativeInt - return i - }(), - Err: true, - }, - { - Instance: newFakeInstance(), - Err: false, - oldInstance: newFakeInstance(), - }, - { - Instance: newFakeInstance(), - Err: true, - oldInstance: func() *experimentsv1beta1.Experiment { - i := newFakeInstance() - i.Spec.Algorithm.AlgorithmName = "not-test" - return i - }(), - }, - { - Instance: newFakeInstance(), - Err: true, - oldInstance: func() *experimentsv1beta1.Experiment { - i := newFakeInstance() - i.Spec.ResumePolicy = "invalid-policy" - return i - }(), - }, - { - Instance: func() *experimentsv1beta1.Experiment { - i := newFakeInstance() - i.Spec.Parameters = []experimentsv1beta1.ParameterSpec{} - i.Spec.NasConfig = nil - return i - }(), - Err: true, - }, - { - Instance: func() *experimentsv1beta1.Experiment { - i := newFakeInstance() - i.Spec.NasConfig = &experimentsv1beta1.NasConfig{ - Operations: []experimentsv1beta1.Operation{ - { - OperationType: "op1", - }, - }, - } - return i - }(), - Err: true, - }, - } - - for _, tc := range tcs { - err := g.ValidateExperiment(tc.Instance, tc.oldInstance) - if !tc.Err && err != nil { - t.Errorf("Expected nil, got %v", err) - } else if tc.Err && err == nil { - t.Errorf("Expected err, got nil") - } - } -} +// TODO (andreyvelich): Refactor this test after changing validation for new Trial Template +// func TestValidateTFJobTrialTemplate(t *testing.T) { +// trialTFJobTemplate := `apiVersion: "kubeflow.org/v1" +// kind: "TFJob" +// metadata: +// name: "dist-mnist-for-e2e-test" +// spec: +// tfReplicaSpecs: +// Worker: +// template: +// spec: +// containers: +// - name: tensorflow +// image: gaocegege/mnist:1` + +// mockCtrl := gomock.NewController(t) +// defer mockCtrl.Finish() + +// p := manifestmock.NewMockGenerator(mockCtrl) +// g := New(p) + +// p.EXPECT().GetRunSpec(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(trialTFJobTemplate, nil) + +// instance := newFakeInstance() +// if err := g.(*DefaultValidator).validateTrialTemplate(instance); err == nil { +// t.Errorf("Expected error, got nil") +// } +// } + +// func TestValidateJobTrialTemplate(t *testing.T) { +// trialJobTemplate := `apiVersion: batch/v1 +// kind: Job +// metadata: +// name: fake-trial +// namespace: fakens +// spec: +// template: +// spec: +// containers: +// - name: fake-trial +// image: test-image` + +// mockCtrl := gomock.NewController(t) +// defer mockCtrl.Finish() + +// p := manifestmock.NewMockGenerator(mockCtrl) +// g := New(p) + +// invalidYaml := strings.Replace(trialJobTemplate, "- name", "- * -", -1) +// invalidJobType := strings.Replace(trialJobTemplate, "Job", "NewJobType", -1) +// invalidNamespace := strings.Replace(trialJobTemplate, "fakens", "not-fakens", -1) +// invalidJobName := strings.Replace(trialJobTemplate, "fake-trial", "new-name", -1) + +// validRun := p.EXPECT().GetRunSpec(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(trialJobTemplate, nil) +// invalidYamlRun := p.EXPECT().GetRunSpec(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(invalidYaml, nil) +// invalidJobTypeRun := p.EXPECT().GetRunSpec(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(invalidJobType, nil) +// invalidNamespaceRun := p.EXPECT().GetRunSpec(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(invalidNamespace, nil) +// invalidJobNameRun := p.EXPECT().GetRunSpec(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(invalidJobName, nil) + +// gomock.InOrder( +// validRun, +// invalidYamlRun, +// invalidJobTypeRun, +// invalidNamespaceRun, +// invalidJobNameRun, +// ) + +// tcs := []struct { +// Instance *experimentsv1beta1.Experiment +// Err bool +// }{ +// { +// Instance: func() *experimentsv1beta1.Experiment { +// i := newFakeInstance() +// i.Spec.TrialTemplate = newFakeTrialTemplate(trialJobTemplate) +// return i +// }(), +// Err: false, +// }, +// { +// Instance: func() *experimentsv1beta1.Experiment { +// i := newFakeInstance() +// i.Spec.TrialTemplate = newFakeTrialTemplate(invalidYaml) +// return i +// }(), +// Err: true, +// }, +// { +// Instance: func() *experimentsv1beta1.Experiment { +// i := newFakeInstance() +// i.Spec.TrialTemplate = newFakeTrialTemplate(invalidJobType) +// return i +// }(), +// Err: true, +// }, +// { +// Instance: func() *experimentsv1beta1.Experiment { +// i := newFakeInstance() +// i.Spec.TrialTemplate = newFakeTrialTemplate(invalidNamespace) +// return i +// }(), +// Err: true, +// }, +// { +// Instance: func() *experimentsv1beta1.Experiment { +// i := newFakeInstance() +// i.Spec.TrialTemplate = newFakeTrialTemplate(invalidJobName) +// return i +// }(), +// Err: true, +// }, +// } +// for _, tc := range tcs { +// err := g.(*DefaultValidator).validateTrialTemplate(tc.Instance) +// if !tc.Err && err != nil { +// t.Errorf("Expected nil, got %v", err) +// } else if tc.Err && err == nil { +// t.Errorf("Expected err, got nil") +// } +// } +// } + +// func TestValidateExperiment(t *testing.T) { +// mockCtrl := gomock.NewController(t) +// defer mockCtrl.Finish() + +// p := manifestmock.NewMockGenerator(mockCtrl) +// g := New(p) + +// trialJobTemplate := `apiVersion: "batch/v1" +// kind: "Job" +// metadata: +// name: "fake-trial" +// namespace: fakens` + +// suggestionConfigData := map[string]string{} +// suggestionConfigData[consts.LabelSuggestionImageTag] = "algorithmImage" +// fakeNegativeInt := int32(-1) + +// p.EXPECT().GetRunSpec(gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).Return(trialJobTemplate, nil).AnyTimes() +// p.EXPECT().GetSuggestionConfigData(gomock.Any()).Return(suggestionConfigData, nil).AnyTimes() +// p.EXPECT().GetMetricsCollectorImage(gomock.Any()).Return("metricsCollectorImage", nil).AnyTimes() + +// tcs := []struct { +// Instance *experimentsv1beta1.Experiment +// Err bool +// oldInstance *experimentsv1beta1.Experiment +// }{ +// //Objective +// { +// Instance: func() *experimentsv1beta1.Experiment { +// i := newFakeInstance() +// i.Spec.Objective = nil +// return i +// }(), +// Err: true, +// }, +// { +// Instance: func() *experimentsv1beta1.Experiment { +// i := newFakeInstance() +// i.Spec.Objective.Type = commonv1beta1.ObjectiveTypeUnknown +// return i +// }(), +// Err: true, +// }, +// { +// Instance: func() *experimentsv1beta1.Experiment { +// i := newFakeInstance() +// i.Spec.Objective.ObjectiveMetricName = "" +// return i +// }(), +// Err: true, +// }, +// //Algorithm +// { +// Instance: func() *experimentsv1beta1.Experiment { +// i := newFakeInstance() +// i.Spec.Algorithm = nil +// return i +// }(), +// Err: true, +// }, +// { +// Instance: func() *experimentsv1beta1.Experiment { +// i := newFakeInstance() +// i.Spec.Algorithm.AlgorithmName = "" +// return i +// }(), +// Err: true, +// }, +// { +// Instance: newFakeInstance(), +// Err: false, +// }, +// { +// Instance: func() *experimentsv1beta1.Experiment { +// i := newFakeInstance() +// i.Spec.MaxFailedTrialCount = &fakeNegativeInt +// return i +// }(), +// Err: true, +// }, +// { +// Instance: func() *experimentsv1beta1.Experiment { +// i := newFakeInstance() +// i.Spec.MaxTrialCount = &fakeNegativeInt +// return i +// }(), +// Err: true, +// }, +// { +// Instance: func() *experimentsv1beta1.Experiment { +// i := newFakeInstance() +// i.Spec.ParallelTrialCount = &fakeNegativeInt +// return i +// }(), +// Err: true, +// }, +// { +// Instance: newFakeInstance(), +// Err: false, +// oldInstance: newFakeInstance(), +// }, +// { +// Instance: newFakeInstance(), +// Err: true, +// oldInstance: func() *experimentsv1beta1.Experiment { +// i := newFakeInstance() +// i.Spec.Algorithm.AlgorithmName = "not-test" +// return i +// }(), +// }, +// { +// Instance: newFakeInstance(), +// Err: true, +// oldInstance: func() *experimentsv1beta1.Experiment { +// i := newFakeInstance() +// i.Spec.ResumePolicy = "invalid-policy" +// return i +// }(), +// }, +// { +// Instance: func() *experimentsv1beta1.Experiment { +// i := newFakeInstance() +// i.Spec.Parameters = []experimentsv1beta1.ParameterSpec{} +// i.Spec.NasConfig = nil +// return i +// }(), +// Err: true, +// }, +// { +// Instance: func() *experimentsv1beta1.Experiment { +// i := newFakeInstance() +// i.Spec.NasConfig = &experimentsv1beta1.NasConfig{ +// Operations: []experimentsv1beta1.Operation{ +// { +// OperationType: "op1", +// }, +// }, +// } +// return i +// }(), +// Err: true, +// }, +// } + +// for _, tc := range tcs { +// err := g.ValidateExperiment(tc.Instance, tc.oldInstance) +// if !tc.Err && err != nil { +// t.Errorf("Expected nil, got %v", err) +// } else if tc.Err && err == nil { +// t.Errorf("Expected err, got nil") +// } +// } +// } func TestValidateMetricsCollector(t *testing.T) { @@ -553,11 +552,11 @@ func newFakeInstance() *experimentsv1beta1.Experiment { } } -func newFakeTrialTemplate(template string) *experimentsv1beta1.TrialTemplate { - return &experimentsv1beta1.TrialTemplate{ - Retain: false, - GoTemplate: &experimentsv1beta1.GoTemplate{ - RawTemplate: template, - }, - } -} +// func newFakeTrialTemplate(template string) *experimentsv1beta1.TrialTemplate { +// return &experimentsv1beta1.TrialTemplate{ +// Retain: false, +// GoTemplate: &experimentsv1beta1.GoTemplate{ +// RawTemplate: template, +// }, +// } +// } From cebec02ce21976e455eaf52a42266490578a6322 Mon Sep 17 00:00:00 2001 From: avelichk Date: Mon, 8 Jun 2020 01:44:29 +0100 Subject: [PATCH 05/10] Modify trial template configMap for new version of Trial Template Fix experiment defaults change valid/invalid experiment --- .../trialTemplateConfigmap.yaml | 76 ++++++++++++--- .../trialTemplateConfigmapLabeled.yaml | 95 ------------------- .../v1beta1/experiment_defaults.go | 20 +++- .../experiment/experiment_util.go | 1 - test/e2e/v1beta1/invalid-experiment.yaml | 52 +++++----- test/e2e/v1beta1/valid-experiment.yaml | 53 ++++++----- 6 files changed, 141 insertions(+), 156 deletions(-) delete mode 100644 manifests/v1beta1/katib-controller/trialTemplateConfigmapLabeled.yaml diff --git a/manifests/v1beta1/katib-controller/trialTemplateConfigmap.yaml b/manifests/v1beta1/katib-controller/trialTemplateConfigmap.yaml index eff3c7c3baa..876ee59bd53 100644 --- a/manifests/v1beta1/katib-controller/trialTemplateConfigmap.yaml +++ b/manifests/v1beta1/katib-controller/trialTemplateConfigmap.yaml @@ -3,26 +3,74 @@ kind: ConfigMap metadata: name: trial-template namespace: kubeflow + labels: + app: katib-trial-templates data: defaultTrialTemplate.yaml: |- apiVersion: batch/v1 kind: Job - metadata: - name: {{.Trial}} - namespace: {{.NameSpace}} spec: template: spec: containers: - - name: {{.Trial}} - image: docker.io/kubeflowkatib/mxnet-mnist - command: - - "python3" - - "/opt/mxnet-mnist/mnist.py" - - "--batch-size=64" - {{- with .HyperParameters}} - {{- range .}} - - "{{.Name}}={{.Value}}" - {{- end}} - {{- end}} + - name: training-container + image: docker.io/kubeflowkatib/mxnet-mnist + command: + - "python3" + - "/opt/mxnet-mnist/mnist.py" + - "--batch-size=64" + - "--lr=${trialParameters.learningRate}" + - "--num-layers=${trialParameters.numberLayers}" + - "--optimizer=${trialParameters.optimizer}" restartPolicy: Never + # For ConfigMap templates double quotes must set in commands to correct parse JSON parameters in Trial Template (e.g nn_config, architecture) + enasCPUTemplate: |- + apiVersion: batch/v1 + kind: Job + spec: + template: + spec: + containers: + - name: training-container + image: docker.io/kubeflowkatib/enas-cnn-cifar10-cpu + command: + - python3 + - -u + - RunTrial.py + - --num_epochs=1 + - "--architecture=\"${trialParameters.neuralNetworkArchitecture}\"" + - "--nn_config=\"${trialParameters.neuralNetworkConfig}\"" + restartPolicy: Never + pytorchJobTemplate: |- + apiVersion: "kubeflow.org/v1" + kind: PyTorchJob + spec: + pytorchReplicaSpecs: + Master: + replicas: 1 + restartPolicy: OnFailure + template: + spec: + containers: + - name: pytorch + image: gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0 + imagePullPolicy: Always + command: + - "python" + - "/var/mnist.py" + - "--lr=${trialParameters.learningRate}" + - "--momentum=${trialParameters.momentum}" + Worker: + replicas: 2 + restartPolicy: OnFailure + template: + spec: + containers: + - name: pytorch + image: gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0 + imagePullPolicy: Always + command: + - "python" + - "/var/mnist.py" + - "--lr=${trialParameters.learningRate}" + - "--momentum=${trialParameters.momentum}" diff --git a/manifests/v1beta1/katib-controller/trialTemplateConfigmapLabeled.yaml b/manifests/v1beta1/katib-controller/trialTemplateConfigmapLabeled.yaml deleted file mode 100644 index 211f77bd17d..00000000000 --- a/manifests/v1beta1/katib-controller/trialTemplateConfigmapLabeled.yaml +++ /dev/null @@ -1,95 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: trial-template-labeled - namespace: kubeflow - labels: - app: katib-trial-templates -data: - defaultTrialTemplate.yaml: |- - apiVersion: batch/v1 - kind: Job - metadata: - name: {{.Trial}} - namespace: {{.NameSpace}} - spec: - template: - spec: - containers: - - name: {{.Trial}} - image: docker.io/kubeflowkatib/mxnet-mnist - command: - - "python3" - - "/opt/mxnet-mnist/mnist.py" - - "--batch-size=64" - {{- with .HyperParameters}} - {{- range .}} - - "{{.Name}}={{.Value}}" - {{- end}} - {{- end}} - restartPolicy: Never - enasCPUTemplate: |- - apiVersion: batch/v1 - kind: Job - metadata: - name: {{.Trial}} - namespace: {{.NameSpace}} - spec: - template: - spec: - containers: - - name: {{.Trial}} - image: docker.io/kubeflowkatib/enas-cnn-cifar10-cpu - command: - - "python3.5" - - "-u" - - "RunTrial.py" - {{- with .HyperParameters}} - {{- range .}} - - "--{{.Name}}=\"{{.Value}}\"" - {{- end}} - {{- end}} - - "--num_epochs=1" - restartPolicy: Never - pytorchJobTemplate: |- - apiVersion: "kubeflow.org/v1" - kind: PyTorchJob - metadata: - name: {{.Trial}} - namespace: {{.NameSpace}} - spec: - pytorchReplicaSpecs: - Master: - replicas: 1 - restartPolicy: OnFailure - template: - spec: - containers: - - name: pytorch - image: gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0 - imagePullPolicy: Always - command: - - "python" - - "/var/mnist.py" - {{- with .HyperParameters}} - {{- range .}} - - "{{.Name}}={{.Value}}" - {{- end}} - {{- end}} - Worker: - replicas: 2 - restartPolicy: OnFailure - template: - spec: - containers: - - name: pytorch - image: gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0 - imagePullPolicy: Always - command: - - "python" - - "/var/mnist.py" - {{- with .HyperParameters}} - {{- range .}} - - "{{.Name}}={{.Value}}" - {{- end}} - {{- end}} diff --git a/pkg/apis/controller/experiments/v1beta1/experiment_defaults.go b/pkg/apis/controller/experiments/v1beta1/experiment_defaults.go index 7d47e41e8b1..888b71811a3 100644 --- a/pkg/apis/controller/experiments/v1beta1/experiment_defaults.go +++ b/pkg/apis/controller/experiments/v1beta1/experiment_defaults.go @@ -47,7 +47,6 @@ func (e *Experiment) setDefaultResumePolicy() { } } -// TODO: Verify it func (e *Experiment) setDefaultTrialTemplate() { t := e.Spec.TrialTemplate if t == nil { @@ -55,7 +54,7 @@ func (e *Experiment) setDefaultTrialTemplate() { Retain: true, } } - if t.TrialSource.TrialSpec == nil && t.TrialSource.ConfigMap == nil { + if t.TrialSource.TrialSpec == nil && t.TrialSource.ConfigMap == nil && t.TrialParameters == nil { t.TrialSource = TrialSource{ ConfigMap: &ConfigMapSource{ ConfigMapNamespace: consts.DefaultKatibNamespace, @@ -63,6 +62,23 @@ func (e *Experiment) setDefaultTrialTemplate() { TemplatePath: DefaultTrialTemplatePath, }, } + t.TrialParameters = []TrialParameterSpec{ + TrialParameterSpec{ + Name: "learningRate", + Description: "Learning rate for the training model", + Reference: "lr", + }, + TrialParameterSpec{ + Name: "numberLayers", + Description: "Number of training model layers", + Reference: "num-layers", + }, + TrialParameterSpec{ + Name: "optimizer", + Description: "Training model optimizer (sdg, adam or ftrl)", + Reference: "optimizer", + }, + } } e.Spec.TrialTemplate = t } diff --git a/pkg/controller.v1beta1/experiment/experiment_util.go b/pkg/controller.v1beta1/experiment/experiment_util.go index 3e8513222aa..d6e0d986716 100644 --- a/pkg/controller.v1beta1/experiment/experiment_util.go +++ b/pkg/controller.v1beta1/experiment/experiment_util.go @@ -39,7 +39,6 @@ func (r *ReconcileExperiment) createTrialInstance(expInstance *experimentsv1beta trial.Spec.ParameterAssignments = trialAssignment.ParameterAssignments runSpec, err := r.GetRunSpecWithHyperParameters(expInstance, trial.Name, trial.Namespace, hps) - logger.Info("RUN SPEC-------------", "runSpec", runSpec) if err != nil { logger.Error(err, "Fail to get RunSpec from experiment", expInstance.Name) return err diff --git a/test/e2e/v1beta1/invalid-experiment.yaml b/test/e2e/v1beta1/invalid-experiment.yaml index e3d4e097770..04eb5cb94d2 100644 --- a/test/e2e/v1beta1/invalid-experiment.yaml +++ b/test/e2e/v1beta1/invalid-experiment.yaml @@ -14,40 +14,48 @@ spec: - Train-accuracy algorithm: algorithmName: random - trialTemplate: - goTemplate: - rawTemplate: |- - apiVersion: batch/v1 - kind: invalid-kind # invalid - metadata: - name: {{.Trial}} - namespace: {{.NameSpace}} - spec: - template: - spec: - containers: - - name: {{.Trial}} - image: docker.io/kubeflowkatib/mxnet-mnist - command: - - "python3" - - "/opt/mxnet-mnist/mnist.py" - - "--batch-size=64" - restartPolicy: Never parameters: - - name: --lr + - name: lr parameterType: double feasibleSpace: min: "0.01" max: "0.03" - - name: --num-layers + - name: num-layers parameterType: int feasibleSpace: min: "2" max: "5" - - name: --optimizer + - name: optimizer parameterType: categorical feasibleSpace: list: - sgd - adam - ftrl + trialTemplate: + trialParameters: + - name: learningRate + description: Learning rate for the training model + reference: lr + - name: numberLayers + description: Number of training model layers + reference: num-layers + - name: optimizer + description: Training model optimizer (sdg, adam or ftrl) + reference: optimizer + trialSpec: + apiVersion: batch/v1 # Invalid, Kind must be specified + spec: + template: + spec: + containers: + - name: training-container + image: docker.io/kubeflowkatib/mxnet-mnist + command: + - "python3" + - "/opt/mxnet-mnist/mnist.py" + - "--batch-size=64" + - "--lr=${trialParameters.learningRate}" + - "--num-layers=${trialParameters.numberLayers}" + - "--optimizer=${trialParameters.optimizer}" + restartPolicy: Never diff --git a/test/e2e/v1beta1/valid-experiment.yaml b/test/e2e/v1beta1/valid-experiment.yaml index e5cf7225fc2..dc3699f64ab 100644 --- a/test/e2e/v1beta1/valid-experiment.yaml +++ b/test/e2e/v1beta1/valid-experiment.yaml @@ -14,40 +14,49 @@ spec: - Train-accuracy algorithm: algorithmName: random - trialTemplate: - goTemplate: - rawTemplate: |- - apiVersion: batch/v1 - kind: Job - metadata: - name: {{.Trial}} - namespace: {{.NameSpace}} - spec: - template: - spec: - containers: - - name: {{.Trial}} - image: docker.io/kubeflowkatib/mxnet-mnist - command: - - "python3" - - "/opt/mxnet-mnist/mnist.py" - - "--batch-size=64" - restartPolicy: Never parameters: - - name: --lr + - name: lr parameterType: double feasibleSpace: min: "0.01" max: "0.03" - - name: --num-layers + - name: num-layers parameterType: int feasibleSpace: min: "2" max: "5" - - name: --optimizer + - name: optimizer parameterType: categorical feasibleSpace: list: - sgd - adam - ftrl + trialTemplate: + trialParameters: + - name: learningRate + description: Learning rate for the training model + reference: lr + - name: numberLayers + description: Number of training model layers + reference: num-layers + - name: optimizer + description: Training model optimizer (sdg, adam or ftrl) + reference: optimizer + trialSpec: + apiVersion: batch/v1 + kind: Job + spec: + template: + spec: + containers: + - name: training-container + image: docker.io/kubeflowkatib/mxnet-mnist + command: + - "python3" + - "/opt/mxnet-mnist/mnist.py" + - "--batch-size=64" + - "--lr=${trialParameters.learningRate}" + - "--num-layers=${trialParameters.numberLayers}" + - "--optimizer=${trialParameters.optimizer}" + restartPolicy: Never From bca595e8bdd460d1bd1d21f6b19f6449b941332d Mon Sep 17 00:00:00 2001 From: avelichk Date: Mon, 8 Jun 2020 02:15:51 +0100 Subject: [PATCH 06/10] Run gofmt --- .../v1beta1/experiment_defaults.go | 6 +++--- .../experiment/experiment_controller_test.go | 20 +++++++++---------- .../experiment/manifest/generator_test.go | 8 ++++---- .../trial/trial_controller_test.go | 8 ++++---- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/pkg/apis/controller/experiments/v1beta1/experiment_defaults.go b/pkg/apis/controller/experiments/v1beta1/experiment_defaults.go index 888b71811a3..529b1da12cf 100644 --- a/pkg/apis/controller/experiments/v1beta1/experiment_defaults.go +++ b/pkg/apis/controller/experiments/v1beta1/experiment_defaults.go @@ -63,17 +63,17 @@ func (e *Experiment) setDefaultTrialTemplate() { }, } t.TrialParameters = []TrialParameterSpec{ - TrialParameterSpec{ + { Name: "learningRate", Description: "Learning rate for the training model", Reference: "lr", }, - TrialParameterSpec{ + { Name: "numberLayers", Description: "Number of training model layers", Reference: "num-layers", }, - TrialParameterSpec{ + { Name: "optimizer", Description: "Training model optimizer (sdg, adam or ftrl)", Reference: "optimizer", diff --git a/pkg/controller.v1beta1/experiment/experiment_controller_test.go b/pkg/controller.v1beta1/experiment/experiment_controller_test.go index 135c02adb36..cd21844db52 100644 --- a/pkg/controller.v1beta1/experiment/experiment_controller_test.go +++ b/pkg/controller.v1beta1/experiment/experiment_controller_test.go @@ -157,13 +157,13 @@ func TestReconcileExperiment(t *testing.T) { }, Spec: tfv1.TFJobSpec{ TFReplicaSpecs: map[tfv1.TFReplicaType]*kubeflowcommonv1.ReplicaSpec{ - tfv1.TFReplicaTypePS: &kubeflowcommonv1.ReplicaSpec{ + tfv1.TFReplicaTypePS: { Replicas: func() *int32 { i := int32(1); return &i }(), RestartPolicy: kubeflowcommonv1.RestartPolicyOnFailure, Template: v1.PodTemplateSpec{ Spec: v1.PodSpec{ Containers: []v1.Container{ - v1.Container{ + { Name: "tensorflow", Image: "gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0", Command: []string{ @@ -174,7 +174,7 @@ func TestReconcileExperiment(t *testing.T) { "--num-layers=5", }, VolumeMounts: []v1.VolumeMount{ - v1.VolumeMount{ + { Name: "train", MountPath: "/train", }, @@ -182,7 +182,7 @@ func TestReconcileExperiment(t *testing.T) { }, }, Volumes: []v1.Volume{ - v1.Volume{ + { Name: "train", VolumeSource: v1.VolumeSource{ PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ @@ -274,13 +274,13 @@ func newFakeInstance() *experimentsv1beta1.Experiment { }, Spec: tfv1.TFJobSpec{ TFReplicaSpecs: map[tfv1.TFReplicaType]*kubeflowcommonv1.ReplicaSpec{ - tfv1.TFReplicaTypePS: &kubeflowcommonv1.ReplicaSpec{ + tfv1.TFReplicaTypePS: { Replicas: func() *int32 { i := int32(1); return &i }(), RestartPolicy: kubeflowcommonv1.RestartPolicyOnFailure, Template: v1.PodTemplateSpec{ Spec: v1.PodSpec{ Containers: []v1.Container{ - v1.Container{ + { Name: "tensorflow", Image: "gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0", Command: []string{ @@ -291,7 +291,7 @@ func newFakeInstance() *experimentsv1beta1.Experiment { "--num-layers=${trialParameters.numberLayers}", }, VolumeMounts: []v1.VolumeMount{ - v1.VolumeMount{ + { Name: "train", MountPath: "/train", }, @@ -299,7 +299,7 @@ func newFakeInstance() *experimentsv1beta1.Experiment { }, }, Volumes: []v1.Volume{ - v1.Volume{ + { Name: "train", VolumeSource: v1.VolumeSource{ PersistentVolumeClaim: &v1.PersistentVolumeClaimVolumeSource{ @@ -332,12 +332,12 @@ func newFakeInstance() *experimentsv1beta1.Experiment { }, TrialTemplate: &experimentsv1beta1.TrialTemplate{ TrialParameters: []experimentsv1beta1.TrialParameterSpec{ - experimentsv1beta1.TrialParameterSpec{ + { Name: "learningRate", Description: "Learning Rate", Reference: "lr", }, - experimentsv1beta1.TrialParameterSpec{ + { Name: "numberLayers", Description: "Number of layers", Reference: "num-layers", diff --git a/pkg/controller.v1beta1/experiment/manifest/generator_test.go b/pkg/controller.v1beta1/experiment/manifest/generator_test.go index fa54a532527..a0fe320631c 100644 --- a/pkg/controller.v1beta1/experiment/manifest/generator_test.go +++ b/pkg/controller.v1beta1/experiment/manifest/generator_test.go @@ -55,7 +55,7 @@ func TestGetRunSpecWithHP(t *testing.T) { Template: v1.PodTemplateSpec{ Spec: v1.PodSpec{ Containers: []v1.Container{ - v1.Container{ + { Name: "training-container", Image: "docker.io/kubeflowkatib/mxnet-mnist", Command: []string{ @@ -167,7 +167,7 @@ func newFakeInstance() *experimentsv1beta1.Experiment { Template: v1.PodTemplateSpec{ Spec: v1.PodSpec{ Containers: []v1.Container{ - v1.Container{ + { Name: "training-container", Image: "docker.io/kubeflowkatib/mxnet-mnist", Command: []string{ @@ -191,12 +191,12 @@ func newFakeInstance() *experimentsv1beta1.Experiment { TrialSpec: trialSpec, }, TrialParameters: []experimentsv1beta1.TrialParameterSpec{ - experimentsv1beta1.TrialParameterSpec{ + { Name: "learningRate", Description: "Learning Rate", Reference: "lr", }, - experimentsv1beta1.TrialParameterSpec{ + { Name: "numberLayers", Description: "Number of layers", Reference: "num-layers", diff --git a/pkg/controller.v1beta1/trial/trial_controller_test.go b/pkg/controller.v1beta1/trial/trial_controller_test.go index 286b1bd2a37..73b1ed8f7eb 100644 --- a/pkg/controller.v1beta1/trial/trial_controller_test.go +++ b/pkg/controller.v1beta1/trial/trial_controller_test.go @@ -251,13 +251,13 @@ func newFakeTrialWithTFJob() *trialsv1beta1.Trial { }, Spec: tfv1.TFJobSpec{ TFReplicaSpecs: map[tfv1.TFReplicaType]*kubeflowcommonv1.ReplicaSpec{ - tfv1.TFReplicaTypePS: &kubeflowcommonv1.ReplicaSpec{ + tfv1.TFReplicaTypePS: { Replicas: func() *int32 { i := int32(2); return &i }(), RestartPolicy: kubeflowcommonv1.RestartPolicyNever, Template: v1.PodTemplateSpec{ Spec: v1.PodSpec{ Containers: []v1.Container{ - v1.Container{ + { Name: "tensorflow", Image: "gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0", Command: []string{ @@ -272,13 +272,13 @@ func newFakeTrialWithTFJob() *trialsv1beta1.Trial { }, }, }, - tfv1.TFReplicaTypeWorker: &kubeflowcommonv1.ReplicaSpec{ + tfv1.TFReplicaTypeWorker: { Replicas: func() *int32 { i := int32(4); return &i }(), RestartPolicy: kubeflowcommonv1.RestartPolicyNever, Template: v1.PodTemplateSpec{ Spec: v1.PodSpec{ Containers: []v1.Container{ - v1.Container{ + { Name: "tensorflow", Image: "gcr.io/kubeflow-ci/tf-mnist-with-summaries:1.0", Command: []string{ From 29c73832b7931de563835b63aa019ec424d57031 Mon Sep 17 00:00:00 2001 From: avelichk Date: Mon, 8 Jun 2020 02:37:38 +0100 Subject: [PATCH 07/10] Fix hyperband --- examples/v1beta1/hyperband-example.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/v1beta1/hyperband-example.yaml b/examples/v1beta1/hyperband-example.yaml index f5b55980c07..1316cd27248 100644 --- a/examples/v1beta1/hyperband-example.yaml +++ b/examples/v1beta1/hyperband-example.yaml @@ -16,7 +16,7 @@ spec: algorithmName: hyperband algorithmSettings: - name: "resource_name" - value: "--num-epochs" + value: "num-epochs" - name: "eta" value: "3" - name: "r_l" From 9428fc0c9183dad114fc6d08824528b6d9fed6c7 Mon Sep 17 00:00:00 2001 From: avelichk Date: Mon, 8 Jun 2020 03:30:03 +0100 Subject: [PATCH 08/10] Remove num-epochs from grid example --- examples/v1beta1/grid-example.yaml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/examples/v1beta1/grid-example.yaml b/examples/v1beta1/grid-example.yaml index 9e847a0cf54..07e7f94f94a 100644 --- a/examples/v1beta1/grid-example.yaml +++ b/examples/v1beta1/grid-example.yaml @@ -29,11 +29,6 @@ spec: feasibleSpace: min: "2" max: "5" - - name: num-epochs - parameterType: int - feasibleSpace: - min: "10" - max: "15" - name: optimizer parameterType: categorical feasibleSpace: From 62931074e9392932b5157c2446b675ebaf2a5452 Mon Sep 17 00:00:00 2001 From: avelichk Date: Mon, 8 Jun 2020 04:13:54 +0100 Subject: [PATCH 09/10] Add tag to file mc example --- examples/v1beta1/file-metricscollector-example.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/v1beta1/file-metricscollector-example.yaml b/examples/v1beta1/file-metricscollector-example.yaml index d10b0d5c68d..7f5449a59d8 100644 --- a/examples/v1beta1/file-metricscollector-example.yaml +++ b/examples/v1beta1/file-metricscollector-example.yaml @@ -54,7 +54,7 @@ spec: spec: containers: - name: training-container - image: docker.io/kubeflowkatib/pytorch-mnist + image: docker.io/kubeflowkatib/pytorch-mnist:1.0 imagePullPolicy: Always command: - "python" From d441cb4ba4f0037d1a48fa72ad59954420ee1e35 Mon Sep 17 00:00:00 2001 From: avelichk Date: Tue, 9 Jun 2020 15:21:03 +0100 Subject: [PATCH 10/10] Modify create Trials loop --- .../experiment/experiment_controller.go | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/pkg/controller.v1beta1/experiment/experiment_controller.go b/pkg/controller.v1beta1/experiment/experiment_controller.go index 7e30f49807b..0362dd38b2a 100644 --- a/pkg/controller.v1beta1/experiment/experiment_controller.go +++ b/pkg/controller.v1beta1/experiment/experiment_controller.go @@ -348,18 +348,17 @@ func (r *ReconcileExperiment) createTrials(instance *experimentsv1beta1.Experime } var trialNames []string for _, trial := range trials { + if err = r.createTrialInstance(instance, &trial); err != nil { + logger.Error(err, "Create trial instance error", "trial", trial) + continue + } trialNames = append(trialNames, trial.Name) } - // If Trial Assignment is not ready we don't need to create Trials + // Print created Trial names if len(trialNames) != 0 { - logger.Info("Create Trials", "trialNames", trialNames) - for _, trial := range trials { - if err = r.createTrialInstance(instance, &trial); err != nil { - logger.Error(err, "Create trial instance error", "trial", trial) - continue - } - } + logger.Info("Created Trials", "trialNames", trialNames) } + return nil }