diff --git a/Gopkg.lock b/Gopkg.lock index dfa7ab8e50b..ea8c8496cd9 100644 --- a/Gopkg.lock +++ b/Gopkg.lock @@ -33,14 +33,6 @@ pruneopts = "T" revision = "de5bf2ad457846296e2031421a34e2568e304e35" -[[projects]] - digest = "1:9187ad53cd8b43cc82d1615ca6319503735fd87cd2b4dbeb80c57831e9affd79" - name = "github.com/caicloud/serving-controller" - packages = ["pkg/consts"] - pruneopts = "T" - revision = "bec825cac979e73109541ca7518dd6b96ce9e68a" - version = "v0.0.1-rc.5" - [[projects]] digest = "1:9f42202ac457c462ad8bb9642806d275af9ab4850cf0b1960b9c6f083d4a309a" name = "github.com/davecgh/go-spew" @@ -344,7 +336,10 @@ [[projects]] digest = "1:83ac030e105f19c505940ec8cbeab42c08a46c4acaf019d601aac265097ff386" name = "github.com/kubeflow/pytorch-operator" - packages = ["pkg/apis/pytorch/v1beta1"] + packages = [ + "pkg/apis/pytorch/v1beta1", + "pkg/apis/pytorch/v1beta2", + ] pruneopts = "T" revision = "da7798e7c2c127a270735c409f9305f3a6c06fd8" version = "v0.5.0-rc.1" @@ -354,7 +349,9 @@ name = "github.com/kubeflow/tf-operator" packages = [ "pkg/apis/common/v1beta1", + "pkg/apis/common/v1beta2", "pkg/apis/tensorflow/v1beta1", + "pkg/apis/tensorflow/v1beta2", ] pruneopts = "T" revision = "c2849477dffdeacc2ebc11de66f826a6ce5cf690" @@ -1142,7 +1139,6 @@ analyzer-version = 1 input-imports = [ "git.apache.org/thrift.git/lib/go/thrift", - "github.com/caicloud/serving-controller/pkg/consts", "github.com/emicklei/go-restful", "github.com/go-sql-driver/mysql", "github.com/golang/glog", @@ -1155,8 +1151,11 @@ "github.com/grpc-ecosystem/grpc-gateway/runtime", "github.com/grpc-ecosystem/grpc-gateway/utilities", "github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1beta1", + "github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1beta2", "github.com/kubeflow/tf-operator/pkg/apis/common/v1beta1", + "github.com/kubeflow/tf-operator/pkg/apis/common/v1beta2", "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1beta1", + "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1beta2", "github.com/onsi/ginkgo", "github.com/onsi/gomega", "github.com/pressly/chi", diff --git a/examples/v1alpha2/pytorchjob-example.yaml b/examples/v1alpha2/pytorchjob-example.yaml new file mode 100644 index 00000000000..b80c8d8ded0 --- /dev/null +++ b/examples/v1alpha2/pytorchjob-example.yaml @@ -0,0 +1,71 @@ +apiVersion: "kubeflow.org/v1alpha2" +kind: Experiment +metadata: + namespace: kubeflow + name: random-experiment +spec: + parallelTrialCount: 3 + maxTrialCount: 12 + maxFailedTrialCount: 3 + objective: + type: maximize + goal: 0.99 + objectiveMetricName: accuracy + algorithm: + algorithmName: random + trialTemplate: + retain: true + goTemplate: + rawTemplate: |- + apiVersion: "kubeflow.org/v1beta2" + kind: PyTorchJob + metadata: + name: {{.Trial}} + namespace: {{.NameSpace}} + spec: + pytorchReplicaSpecs: + Master: + replicas: 1 + restartPolicy: OnFailure + template: + spec: + containers: + - name: pytorch + image: gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0 + imagePullPolicy: Always + command: + - "python" + - "/var/mnist.py" + {{- with .HyperParameters}} + {{- range .}} + - "{{.Name}}={{.Value}}" + {{- end}} + {{- end}} + Worker: + replicas: 2 + restartPolicy: OnFailure + template: + spec: + containers: + - name: pytorch + image: gcr.io/kubeflow-ci/pytorch-dist-mnist-test:v1.0 + imagePullPolicy: Always + command: + - "python" + - "/var/mnist.py" + {{- with .HyperParameters}} + {{- range .}} + - "{{.Name}}={{.Value}}" + {{- end}} + {{- end}} + parameters: + - name: --lr + parameterType: double + feasibleSpace: + min: "0.01" + max: "0.05" + - name: --momentum + parameterType: double + feasibleSpace: + min: "0.5" + max: "0.9" diff --git a/pkg/controller/v1alpha2/experiment/util/runjob_util.go b/pkg/api/operators/apis/addtoscheme_pytorchjob_v1beta2.go similarity index 60% rename from pkg/controller/v1alpha2/experiment/util/runjob_util.go rename to pkg/api/operators/apis/addtoscheme_pytorchjob_v1beta2.go index 765141243b6..7e1f2d30cce 100644 --- a/pkg/controller/v1alpha2/experiment/util/runjob_util.go +++ b/pkg/api/operators/apis/addtoscheme_pytorchjob_v1beta2.go @@ -1,11 +1,8 @@ /* - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -13,20 +10,13 @@ See the License for the specific language governing permissions and limitations under the License. */ -package util +package apis import ( - "k8s.io/apimachinery/pkg/runtime/schema" + "github.com/kubeflow/pytorch-operator/pkg/apis/pytorch/v1beta2" ) -func GetSupportedJobList() []schema.GroupVersionKind { - // TODO: append other supported jobs, such as tfjob, pytorch and so on - supportedJobList := []schema.GroupVersionKind{ - schema.GroupVersionKind{ - Group: "batch", - Version: "v1", - Kind: "Job", - }, - } - return supportedJobList +func init() { + // Register the types with the Scheme so the components can map objects to GroupVersionKinds and back + AddToSchemes = append(AddToSchemes, v1beta2.SchemeBuilder.AddToScheme) } diff --git a/pkg/api/operators/apis/addtoscheme_tfjob_v1beta2.go b/pkg/api/operators/apis/addtoscheme_tfjob_v1beta2.go new file mode 100644 index 00000000000..3c177474783 --- /dev/null +++ b/pkg/api/operators/apis/addtoscheme_tfjob_v1beta2.go @@ -0,0 +1,22 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package apis + +import ( + "github.com/kubeflow/tf-operator/pkg/apis/tensorflow/v1beta2" +) + +func init() { + // Register the types with the Scheme so the components can map objects to GroupVersionKinds and back + AddToSchemes = append(AddToSchemes, v1beta2.SchemeBuilder.AddToScheme) +} diff --git a/pkg/common/v1alpha2/common.go b/pkg/common/v1alpha2/common.go new file mode 100644 index 00000000000..4671c4af30f --- /dev/null +++ b/pkg/common/v1alpha2/common.go @@ -0,0 +1,26 @@ +package v1alpha2 + +import ( + "k8s.io/apimachinery/pkg/runtime/schema" +) + +func GetSupportedJobList() []schema.GroupVersionKind { + supportedJobList := []schema.GroupVersionKind{ + schema.GroupVersionKind{ + Group: "batch", + Version: "v1", + Kind: "Job", + }, + schema.GroupVersionKind{ + Group: "kubeflow.org", + Version: "v1beta2", + Kind: "TFJob", + }, + schema.GroupVersionKind{ + Group: "kubeflow.org", + Version: "v1beta2", + Kind: "PyTorchJob", + }, + } + return supportedJobList +} diff --git a/pkg/controller/v1alpha2/experiment/util/webhook_util.go b/pkg/controller/v1alpha2/experiment/util/webhook_util.go index 4f481d82e33..b50a4267615 100644 --- a/pkg/controller/v1alpha2/experiment/util/webhook_util.go +++ b/pkg/controller/v1alpha2/experiment/util/webhook_util.go @@ -22,14 +22,16 @@ import ( "fmt" logger "log" - commonv1alpha2 "github.com/kubeflow/katib/pkg/api/operators/apis/common/v1alpha2" - ep_v1alpha2 "github.com/kubeflow/katib/pkg/api/operators/apis/experiment/v1alpha2" batchv1beta "k8s.io/api/batch/v1beta1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" k8syaml "k8s.io/apimachinery/pkg/util/yaml" + + commonapiv1alpha2 "github.com/kubeflow/katib/pkg/api/operators/apis/common/v1alpha2" + experimentsv1alpha2 "github.com/kubeflow/katib/pkg/api/operators/apis/experiment/v1alpha2" + commonv1alpha2 "github.com/kubeflow/katib/pkg/common/v1alpha2" ) -func ValidateExperiment(instance *ep_v1alpha2.Experiment) error { +func ValidateExperiment(instance *experimentsv1alpha2.Experiment) error { if !instance.IsCreated() { if err := validateForCreate(instance); err != nil { return err @@ -65,17 +67,17 @@ func ValidateExperiment(instance *ep_v1alpha2.Experiment) error { return nil } -func validateAlgorithmSettings(inst *ep_v1alpha2.Experiment) error { +func validateAlgorithmSettings(inst *experimentsv1alpha2.Experiment) error { // TODO: it need call ValidateAlgorithmSettings API of vizier-core manager, implement it when vizier-core done return nil } -func validateObjective(obj *commonv1alpha2.ObjectiveSpec) error { +func validateObjective(obj *commonapiv1alpha2.ObjectiveSpec) error { if obj == nil { return fmt.Errorf("No spec.objective specified.") } - if obj.Type != commonv1alpha2.ObjectiveTypeMinimize && obj.Type != commonv1alpha2.ObjectiveTypeMaximize { - return fmt.Errorf("spec.objective.type must be %s or %s.", commonv1alpha2.ObjectiveTypeMinimize, commonv1alpha2.ObjectiveTypeMaximize) + if obj.Type != commonapiv1alpha2.ObjectiveTypeMinimize && obj.Type != commonapiv1alpha2.ObjectiveTypeMaximize { + return fmt.Errorf("spec.objective.type must be %s or %s.", commonapiv1alpha2.ObjectiveTypeMinimize, commonapiv1alpha2.ObjectiveTypeMaximize) } if obj.ObjectiveMetricName == "" { return fmt.Errorf("No spec.objective.objectiveMetricName specified.") @@ -83,7 +85,7 @@ func validateObjective(obj *commonv1alpha2.ObjectiveSpec) error { return nil } -func validateAlgorithm(ag *ep_v1alpha2.AlgorithmSpec) error { +func validateAlgorithm(ag *experimentsv1alpha2.AlgorithmSpec) error { if ag == nil { return fmt.Errorf("No spec.algorithm specified.") } @@ -94,7 +96,7 @@ func validateAlgorithm(ag *ep_v1alpha2.AlgorithmSpec) error { return nil } -func validateTrialTemplate(instance *ep_v1alpha2.Experiment) error { +func validateTrialTemplate(instance *experimentsv1alpha2.Experiment) error { trialName := fmt.Sprintf("%s-trial", instance.GetName()) trialParams := TrialTemplateParams{ Experiment: instance.GetName(), @@ -129,16 +131,16 @@ func validateTrialTemplate(instance *ep_v1alpha2.Experiment) error { func validateSupportedJob(job *unstructured.Unstructured) error { gvk := job.GroupVersionKind() - supportedJobs := GetSupportedJobList() + supportedJobs := commonv1alpha2.GetSupportedJobList() for _, sJob := range supportedJobs { if gvk == sJob { return nil } } - return fmt.Errorf("Cannot support to run job: %v", gvk) + return fmt.Errorf("Job type %v not supported", gvk) } -func validateForCreate(inst *ep_v1alpha2.Experiment) error { +func validateForCreate(inst *experimentsv1alpha2.Experiment) error { if _, err := GetExperimentFromDB(inst); err != nil { if err != sql.ErrNoRows { return fmt.Errorf("Fail to check record for the experiment in DB: %v", err) @@ -149,7 +151,7 @@ func validateForCreate(inst *ep_v1alpha2.Experiment) error { } } -func validateMetricsCollector(inst *ep_v1alpha2.Experiment) error { +func validateMetricsCollector(inst *experimentsv1alpha2.Experiment) error { BUFSIZE := 1024 experimentName := inst.GetName() trialName := fmt.Sprintf("%s-trial", inst.GetName()) diff --git a/pkg/controller/v1alpha2/trial/trial_controller.go b/pkg/controller/v1alpha2/trial/trial_controller.go index 5b963dab684..287acd5a0ea 100644 --- a/pkg/controller/v1alpha2/trial/trial_controller.go +++ b/pkg/controller/v1alpha2/trial/trial_controller.go @@ -20,10 +20,10 @@ import ( "bytes" "context" - batchv1 "k8s.io/api/batch/v1" batchv1beta "k8s.io/api/batch/v1beta1" "k8s.io/apimachinery/pkg/api/equality" "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime" @@ -40,10 +40,13 @@ import ( "sigs.k8s.io/controller-runtime/pkg/source" trialsv1alpha2 "github.com/kubeflow/katib/pkg/api/operators/apis/trial/v1alpha2" - "github.com/kubeflow/katib/pkg/controller/v1alpha2/trial/util" + commonv1alpha2 "github.com/kubeflow/katib/pkg/common/v1alpha2" + trialutil "github.com/kubeflow/katib/pkg/controller/v1alpha2/trial/util" ) -var log = logf.Log.WithName("trial-controller") +var ( + log = logf.Log.WithName("trial-controller") +) /** * USER ACTION REQUIRED: This is a scaffold file intended for the user to modify with their own Controller @@ -77,16 +80,25 @@ func add(mgr manager.Manager, r reconcile.Reconciler) error { return err } - err = c.Watch(&source.Kind{Type: &batchv1.Job{}}, - &handler.EnqueueRequestForOwner{ - IsController: true, - OwnerType: &trialsv1alpha2.Trial{}, - }) - if err != nil { - log.Error(err, "Job watch error") - return err + for _, gvk := range commonv1alpha2.GetSupportedJobList() { + unstructuredJob := &unstructured.Unstructured{} + unstructuredJob.SetGroupVersionKind(gvk) + err = c.Watch( + &source.Kind{Type: unstructuredJob}, + &handler.EnqueueRequestForOwner{ + IsController: true, + OwnerType: &trialsv1alpha2.Trial{}, + }) + if err != nil { + if meta.IsNoMatchError(err) { + log.Info("Job watch error. CRD might be missing. Please install CRD and restart katib-controller", "CRD Kind", gvk.Kind) + continue + } + return err + } else { + log.Info("Job watch added successfully", "CRD Kind", gvk.Kind) + } } - log.Info("Trial controller created") return nil } @@ -129,7 +141,7 @@ func (r *ReconcileTrial) Reconcile(request reconcile.Request) (reconcile.Result, } if !instance.IsCreated() { //Trial not created in DB - err = util.CreateTrialInDB(instance) + err = trialutil.CreateTrialInDB(instance) if err != nil { logger.Error(err, "Create trial in DB error") return reconcile.Result{}, err @@ -139,7 +151,7 @@ func (r *ReconcileTrial) Reconcile(request reconcile.Request) (reconcile.Result, instance.Status.StartTime = &now } msg := "Trial is created" - instance.MarkTrialStatusCreated(util.TrialCreatedReason, msg) + instance.MarkTrialStatusCreated(trialutil.TrialCreatedReason, msg) requeue = true } else { @@ -153,7 +165,7 @@ func (r *ReconcileTrial) Reconcile(request reconcile.Request) (reconcile.Result, if !equality.Semantic.DeepEqual(original.Status, instance.Status) { //assuming that only status change - err = util.UpdateTrialStatusInDB(instance) + err = trialutil.UpdateTrialStatusInDB(instance) if err != nil { logger.Error(err, "Update trial status in DB error") return reconcile.Result{}, err @@ -187,11 +199,11 @@ func (r *ReconcileTrial) reconcileTrial(instance *trialsv1alpha2.Trial) error { //Job already exists //TODO Can desired Spec differ from deployedSpec? if deployedJob != nil { - if err = util.UpdateTrialStatusCondition(instance, deployedJob); err != nil { + if err = trialutil.UpdateTrialStatusCondition(instance, deployedJob); err != nil { logger.Error(err, "Update trial status condition error") return err } - if err = util.UpdateTrialStatusObservation(instance, deployedJob); err != nil { + if err = trialutil.UpdateTrialStatusObservation(instance, deployedJob); err != nil { logger.Error(err, "Update trial status observation error") return err } @@ -231,7 +243,7 @@ func (r *ReconcileTrial) reconcileJob(instance *trialsv1alpha2.Trial, desiredJob } msg := "Trial is running" - instance.MarkTrialStatusRunning(util.TrialRunningReason, msg) + instance.MarkTrialStatusRunning(trialutil.TrialRunningReason, msg) return deployedJob, nil } diff --git a/pkg/controller/v1alpha2/trial/util/status_util.go b/pkg/controller/v1alpha2/trial/util/status_util.go index 64688f922aa..73477ee4df0 100644 --- a/pkg/controller/v1alpha2/trial/util/status_util.go +++ b/pkg/controller/v1alpha2/trial/util/status_util.go @@ -24,7 +24,7 @@ import ( logf "sigs.k8s.io/controller-runtime/pkg/runtime/log" trialsv1alpha2 "github.com/kubeflow/katib/pkg/api/operators/apis/trial/v1alpha2" - commonv1beta1 "github.com/kubeflow/tf-operator/pkg/apis/common/v1beta1" + commonv1beta2 "github.com/kubeflow/tf-operator/pkg/apis/common/v1beta2" ) var log = logf.Log.WithName("trial-status-util") @@ -62,7 +62,7 @@ func UpdateTrialStatusCondition(instance *trialsv1alpha2.Trial, deployedJob *uns instance.MarkTrialStatusFailed(TrialFailedReason, msg) } default: - jobStatus := commonv1beta1.JobStatus{} + jobStatus := commonv1beta2.JobStatus{} err := runtime.DefaultUnstructuredConverter.FromUnstructured(statusMap, &jobStatus) if err != nil { @@ -71,10 +71,10 @@ func UpdateTrialStatusCondition(instance *trialsv1alpha2.Trial, deployedJob *uns } if len(jobStatus.Conditions) > 0 { lc := jobStatus.Conditions[len(jobStatus.Conditions)-1] - if lc.Type == commonv1beta1.JobSucceeded { + if lc.Type == commonv1beta2.JobSucceeded { msg := "Trial has succeeded" instance.MarkTrialStatusSucceeded(TrialSucceededReason, msg) - } else if lc.Type == commonv1beta1.JobFailed { + } else if lc.Type == commonv1beta2.JobFailed { msg := "Trial has failed" instance.MarkTrialStatusFailed(TrialFailedReason, msg) }