From 53cc7cbd37aac099876b466038c38382df69fb18 Mon Sep 17 00:00:00 2001 From: Wen Zhou Date: Tue, 5 Nov 2024 11:41:58 +0100 Subject: [PATCH] feat: add support for trainingoperator to component Signed-off-by: Wen Zhou --- apis/components/v1/trainingoperator_types.go | 54 ++-- apis/components/v1/zz_generated.deepcopy.go | 36 ++- .../v1/datasciencecluster_types.go | 3 +- ...ents.opendatahub.io_trainingoperators.yaml | 44 ++- .../trainingoperator/trainingoperator.go | 113 ------- .../trainingoperator/zz_generated.deepcopy.go | 39 --- ...ents.opendatahub.io_trainingoperators.yaml | 44 ++- .../trainingoperator/trainingoperator.go | 60 ++++ .../trainingoperator_controller.go | 75 +++-- .../trainingoperator_controller_actions.go | 59 ++++ .../datasciencecluster_controller.go | 9 + .../datasciencecluster/kubebuilder_rbac.go | 2 +- controllers/dscinitialization/suite_test.go | 1 + controllers/webhook/webhook_suite_test.go | 1 + docs/api-overview.md | 67 +++-- main.go | 6 + pkg/upgrade/upgrade.go | 5 +- tests/e2e/controller_test.go | 2 + tests/e2e/dashboard_test.go | 2 +- tests/e2e/helper_test.go | 7 +- tests/e2e/kfto_test.go | 278 ++++++++++++++++++ tests/e2e/odh_manager_test.go | 6 + tests/e2e/ray_test.go | 2 +- 23 files changed, 664 insertions(+), 251 deletions(-) delete mode 100644 components/trainingoperator/trainingoperator.go delete mode 100644 components/trainingoperator/zz_generated.deepcopy.go create mode 100644 controllers/components/trainingoperator/trainingoperator.go create mode 100644 controllers/components/trainingoperator/trainingoperator_controller_actions.go create mode 100644 tests/e2e/kfto_test.go diff --git a/apis/components/v1/trainingoperator_types.go b/apis/components/v1/trainingoperator_types.go index 1adc3a2fa21..d5b8c9863b0 100644 --- a/apis/components/v1/trainingoperator_types.go +++ b/apis/components/v1/trainingoperator_types.go @@ -21,26 +21,21 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) -// EDIT THIS FILE! 
THIS IS SCAFFOLDING FOR YOU TO OWN! -// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized. - -// TrainingOperatorSpec defines the desired state of TrainingOperator -type TrainingOperatorSpec struct { - // INSERT ADDITIONAL SPEC FIELDS - desired state of cluster - // Important: Run "make" to regenerate code after modifying this file +const ( + TrainingOperatorComponentName = "trainingoperator" + // value should match what is set in the XValidation below + TrainingOperatorInstanceName = "default-trainingoperator" + TrainingOperatorKind = "TrainingOperator" +) - // Foo is an example field of TrainingOperator. Edit trainingoperator_types.go to remove/update - Foo string `json:"foo,omitempty"` -} - -// TrainingOperatorStatus defines the observed state of TrainingOperator -type TrainingOperatorStatus struct { - components.Status `json:",inline"` -} +// NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized. 
// +kubebuilder:object:root=true // +kubebuilder:subresource:status // +kubebuilder:resource:scope=Cluster +// +kubebuilder:validation:XValidation:rule="self.metadata.name == 'default-trainingoperator'",message="TrainingOperator name must be default-trainingoperator" +// +kubebuilder:printcolumn:name="Ready",type=string,JSONPath=`.status.conditions[?(@.type=="Ready")].status`,description="Ready" +// +kubebuilder:printcolumn:name="Reason",type=string,JSONPath=`.status.conditions[?(@.type=="Ready")].reason`,description="Reason" // TrainingOperator is the Schema for the trainingoperators API type TrainingOperator struct { @@ -51,16 +46,21 @@ type TrainingOperator struct { Status TrainingOperatorStatus `json:"status,omitempty"` } -func (c *TrainingOperator) GetDevFlags() *components.DevFlags { - return nil +// TrainingOperatorSpec defines the desired state of TrainingOperator +type TrainingOperatorSpec struct { + TrainingOperatorCommonSpec `json:",inline"` } -func (c *TrainingOperator) GetStatus() *components.Status { - return &c.Status.Status +type TrainingOperatorCommonSpec struct { + components.DevFlagsSpec `json:",inline"` } -// +kubebuilder:object:root=true +// TrainingOperatorStatus defines the observed state of TrainingOperator +type TrainingOperatorStatus struct { + components.Status `json:",inline"` +} +// +kubebuilder:object:root=true // TrainingOperatorList contains a list of TrainingOperator type TrainingOperatorList struct { metav1.TypeMeta `json:",inline"` @@ -71,3 +71,17 @@ type TrainingOperatorList struct { func init() { SchemeBuilder.Register(&TrainingOperator{}, &TrainingOperatorList{}) } + +func (c *TrainingOperator) GetDevFlags() *components.DevFlags { + return c.Spec.DevFlags +} +func (c *TrainingOperator) GetStatus() *components.Status { + return &c.Status.Status +} + +// DSCTrainingOperator contains all the configuration exposed in DSC instance for TrainingOperator component +type DSCTrainingOperator struct { + components.ManagementSpec 
`json:",inline"` + // configuration fields common across components + TrainingOperatorCommonSpec `json:",inline"` +} diff --git a/apis/components/v1/zz_generated.deepcopy.go b/apis/components/v1/zz_generated.deepcopy.go index d357c5076e6..861abab5453 100644 --- a/apis/components/v1/zz_generated.deepcopy.go +++ b/apis/components/v1/zz_generated.deepcopy.go @@ -180,6 +180,23 @@ func (in *DSCRay) DeepCopy() *DSCRay { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *DSCTrainingOperator) DeepCopyInto(out *DSCTrainingOperator) { + *out = *in + out.ManagementSpec = in.ManagementSpec + in.TrainingOperatorCommonSpec.DeepCopyInto(&out.TrainingOperatorCommonSpec) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new DSCTrainingOperator. +func (in *DSCTrainingOperator) DeepCopy() *DSCTrainingOperator { + if in == nil { + return nil + } + out := new(DSCTrainingOperator) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *Dashboard) DeepCopyInto(out *Dashboard) { *out = *in @@ -867,7 +884,7 @@ func (in *TrainingOperator) DeepCopyInto(out *TrainingOperator) { *out = *in out.TypeMeta = in.TypeMeta in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) - out.Spec = in.Spec + in.Spec.DeepCopyInto(&out.Spec) in.Status.DeepCopyInto(&out.Status) } @@ -889,6 +906,22 @@ func (in *TrainingOperator) DeepCopyObject() runtime.Object { return nil } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *TrainingOperatorCommonSpec) DeepCopyInto(out *TrainingOperatorCommonSpec) { + *out = *in + in.DevFlagsSpec.DeepCopyInto(&out.DevFlagsSpec) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingOperatorCommonSpec. 
+func (in *TrainingOperatorCommonSpec) DeepCopy() *TrainingOperatorCommonSpec { + if in == nil { + return nil + } + out := new(TrainingOperatorCommonSpec) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *TrainingOperatorList) DeepCopyInto(out *TrainingOperatorList) { *out = *in @@ -924,6 +957,7 @@ func (in *TrainingOperatorList) DeepCopyObject() runtime.Object { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *TrainingOperatorSpec) DeepCopyInto(out *TrainingOperatorSpec) { *out = *in + in.TrainingOperatorCommonSpec.DeepCopyInto(&out.TrainingOperatorCommonSpec) } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingOperatorSpec. diff --git a/apis/datasciencecluster/v1/datasciencecluster_types.go b/apis/datasciencecluster/v1/datasciencecluster_types.go index cd473164f24..f79a1e28565 100644 --- a/apis/datasciencecluster/v1/datasciencecluster_types.go +++ b/apis/datasciencecluster/v1/datasciencecluster_types.go @@ -31,7 +31,6 @@ import ( "github.com/opendatahub-io/opendatahub-operator/v2/components/kserve" "github.com/opendatahub-io/opendatahub-operator/v2/components/kueue" "github.com/opendatahub-io/opendatahub-operator/v2/components/modelmeshserving" - "github.com/opendatahub-io/opendatahub-operator/v2/components/trainingoperator" "github.com/opendatahub-io/opendatahub-operator/v2/components/trustyai" "github.com/opendatahub-io/opendatahub-operator/v2/components/workbenches" "github.com/opendatahub-io/opendatahub-operator/v2/pkg/cluster" @@ -81,7 +80,7 @@ type Components struct { ModelRegistry componentsv1.DSCModelRegistry `json:"modelregistry,omitempty"` // Training Operator component configuration. 
- TrainingOperator trainingoperator.TrainingOperator `json:"trainingoperator,omitempty"` + TrainingOperator componentsv1.DSCTrainingOperator `json:"trainingoperator,omitempty"` } // ComponentsStatus defines the custom status of DataScienceCluster components. diff --git a/bundle/manifests/components.opendatahub.io_trainingoperators.yaml b/bundle/manifests/components.opendatahub.io_trainingoperators.yaml index 0e9c5608d68..a8f2acea895 100644 --- a/bundle/manifests/components.opendatahub.io_trainingoperators.yaml +++ b/bundle/manifests/components.opendatahub.io_trainingoperators.yaml @@ -14,7 +14,16 @@ spec: singular: trainingoperator scope: Cluster versions: - - name: v1 + - additionalPrinterColumns: + - description: Ready + jsonPath: .status.conditions[?(@.type=="Ready")].status + name: Ready + type: string + - description: Reason + jsonPath: .status.conditions[?(@.type=="Ready")].reason + name: Reason + type: string + name: v1 schema: openAPIV3Schema: description: TrainingOperator is the Schema for the trainingoperators API @@ -39,10 +48,32 @@ spec: spec: description: TrainingOperatorSpec defines the desired state of TrainingOperator properties: - foo: - description: Foo is an example field of TrainingOperator. Edit trainingoperator_types.go - to remove/update - type: string + devFlags: + description: Add developer fields + properties: + manifests: + description: List of custom manifests for the given component + items: + properties: + contextDir: + default: manifests + description: contextDir is the relative path to the folder + containing manifests in a repository, default value "manifests" + type: string + sourcePath: + default: "" + description: 'sourcePath is the subpath within contextDir + where kustomize builds start. Examples include any sub-folder + or path: `base`, `overlays/dev`, `default`, `odh` etc.' + type: string + uri: + default: "" + description: uri is the URI point to a git repo with tag/branch. + e.g. 
https://github.com/org/repo/tarball/ + type: string + type: object + type: array + type: object type: object status: description: TrainingOperatorStatus defines the observed state of TrainingOperator @@ -110,6 +141,9 @@ spec: type: string type: object type: object + x-kubernetes-validations: + - message: TrainingOperator name must be default-trainingoperator + rule: self.metadata.name == 'default-trainingoperator' served: true storage: true subresources: diff --git a/components/trainingoperator/trainingoperator.go b/components/trainingoperator/trainingoperator.go deleted file mode 100644 index a6a7c8f87e7..00000000000 --- a/components/trainingoperator/trainingoperator.go +++ /dev/null @@ -1,113 +0,0 @@ -// Package trainingoperator provides utility functions to config trainingoperator as part of the stack -// which makes managing distributed compute infrastructure in the cloud easy and intuitive for Data Scientists -// +groupName=datasciencecluster.opendatahub.io -package trainingoperator - -import ( - "context" - "fmt" - "path/filepath" - - "github.com/go-logr/logr" - operatorv1 "github.com/openshift/api/operator/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "sigs.k8s.io/controller-runtime/pkg/client" - logf "sigs.k8s.io/controller-runtime/pkg/log" - - dsciv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/dscinitialization/v1" - "github.com/opendatahub-io/opendatahub-operator/v2/components" - "github.com/opendatahub-io/opendatahub-operator/v2/pkg/cluster" - "github.com/opendatahub-io/opendatahub-operator/v2/pkg/deploy" -) - -var ( - ComponentName = "trainingoperator" - TrainingOperatorPath = deploy.DefaultManifestPath + "/" + ComponentName + "/rhoai" -) - -// Verifies that TrainingOperator implements ComponentInterface. -var _ components.ComponentInterface = (*TrainingOperator)(nil) - -// TrainingOperator struct holds the configuration for the TrainingOperator component. 
-// +kubebuilder:object:generate=true -type TrainingOperator struct { - components.Component `json:""` -} - -func (r *TrainingOperator) Init(ctx context.Context, _ cluster.Platform) error { - log := logf.FromContext(ctx).WithName(ComponentName) - - var imageParamMap = map[string]string{ - "odh-training-operator-controller-image": "RELATED_IMAGE_ODH_TRAINING_OPERATOR_IMAGE", - } - - if err := deploy.ApplyParams(TrainingOperatorPath, imageParamMap); err != nil { - log.Error(err, "failed to update image", "path", TrainingOperatorPath) - } - - return nil -} - -func (r *TrainingOperator) OverrideManifests(ctx context.Context, _ cluster.Platform) error { - // If devflags are set, update default manifests path - if len(r.DevFlags.Manifests) != 0 { - manifestConfig := r.DevFlags.Manifests[0] - if err := deploy.DownloadManifests(ctx, ComponentName, manifestConfig); err != nil { - return err - } - // If overlay is defined, update paths - defaultKustomizePath := "rhoai" - if manifestConfig.SourcePath != "" { - defaultKustomizePath = manifestConfig.SourcePath - } - TrainingOperatorPath = filepath.Join(deploy.DefaultManifestPath, ComponentName, defaultKustomizePath) - } - - return nil -} - -func (r *TrainingOperator) GetComponentName() string { - return ComponentName -} - -func (r *TrainingOperator) ReconcileComponent(ctx context.Context, cli client.Client, l logr.Logger, - owner metav1.Object, dscispec *dsciv1.DSCInitializationSpec, platform cluster.Platform, _ bool) error { - enabled := r.GetManagementState() == operatorv1.Managed - monitoringEnabled := dscispec.Monitoring.ManagementState == operatorv1.Managed - - if enabled { - if r.DevFlags != nil { - // Download manifests and update paths - if err := r.OverrideManifests(ctx, platform); err != nil { - return err - } - } - } - // Deploy Training Operator - if err := deploy.DeployManifestsFromPath(ctx, cli, owner, TrainingOperatorPath, dscispec.ApplicationsNamespace, ComponentName, enabled); err != nil { - return err - } - 
l.Info("apply manifests done") - - if enabled { - if err := cluster.WaitForDeploymentAvailable(ctx, cli, ComponentName, dscispec.ApplicationsNamespace, 20, 2); err != nil { - return fmt.Errorf("deployment for %s is not ready to server: %w", ComponentName, err) - } - } - - // CloudService Monitoring handling - if platform == cluster.ManagedRhods { - if err := r.UpdatePrometheusConfig(cli, l, enabled && monitoringEnabled, ComponentName); err != nil { - return err - } - if err := deploy.DeployManifestsFromPath(ctx, cli, owner, - filepath.Join(deploy.DefaultManifestPath, "monitoring", "prometheus", "apps"), - dscispec.Monitoring.Namespace, - "prometheus", true); err != nil { - return err - } - l.Info("updating SRE monitoring done") - } - - return nil -} diff --git a/components/trainingoperator/zz_generated.deepcopy.go b/components/trainingoperator/zz_generated.deepcopy.go deleted file mode 100644 index 57245a95044..00000000000 --- a/components/trainingoperator/zz_generated.deepcopy.go +++ /dev/null @@ -1,39 +0,0 @@ -//go:build !ignore_autogenerated - -/* -Copyright 2023. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -// Code generated by controller-gen. DO NOT EDIT. - -package trainingoperator - -import () - -// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
-func (in *TrainingOperator) DeepCopyInto(out *TrainingOperator) { - *out = *in - in.Component.DeepCopyInto(&out.Component) -} - -// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new TrainingOperator. -func (in *TrainingOperator) DeepCopy() *TrainingOperator { - if in == nil { - return nil - } - out := new(TrainingOperator) - in.DeepCopyInto(out) - return out -} diff --git a/config/crd/bases/components.opendatahub.io_trainingoperators.yaml b/config/crd/bases/components.opendatahub.io_trainingoperators.yaml index 630a7ad0f16..261ae83051f 100644 --- a/config/crd/bases/components.opendatahub.io_trainingoperators.yaml +++ b/config/crd/bases/components.opendatahub.io_trainingoperators.yaml @@ -14,7 +14,16 @@ spec: singular: trainingoperator scope: Cluster versions: - - name: v1 + - additionalPrinterColumns: + - description: Ready + jsonPath: .status.conditions[?(@.type=="Ready")].status + name: Ready + type: string + - description: Reason + jsonPath: .status.conditions[?(@.type=="Ready")].reason + name: Reason + type: string + name: v1 schema: openAPIV3Schema: description: TrainingOperator is the Schema for the trainingoperators API @@ -39,10 +48,32 @@ spec: spec: description: TrainingOperatorSpec defines the desired state of TrainingOperator properties: - foo: - description: Foo is an example field of TrainingOperator. Edit trainingoperator_types.go - to remove/update - type: string + devFlags: + description: Add developer fields + properties: + manifests: + description: List of custom manifests for the given component + items: + properties: + contextDir: + default: manifests + description: contextDir is the relative path to the folder + containing manifests in a repository, default value "manifests" + type: string + sourcePath: + default: "" + description: 'sourcePath is the subpath within contextDir + where kustomize builds start. Examples include any sub-folder + or path: `base`, `overlays/dev`, `default`, `odh` etc.' 
+ type: string + uri: + default: "" + description: uri is the URI point to a git repo with tag/branch. + e.g. https://github.com/org/repo/tarball/ + type: string + type: object + type: array + type: object type: object status: description: TrainingOperatorStatus defines the observed state of TrainingOperator @@ -110,6 +141,9 @@ spec: type: string type: object type: object + x-kubernetes-validations: + - message: TrainingOperator name must be default-trainingoperator + rule: self.metadata.name == 'default-trainingoperator' served: true storage: true subresources: diff --git a/controllers/components/trainingoperator/trainingoperator.go b/controllers/components/trainingoperator/trainingoperator.go new file mode 100644 index 00000000000..3c51fe8e525 --- /dev/null +++ b/controllers/components/trainingoperator/trainingoperator.go @@ -0,0 +1,60 @@ +package trainingoperator + +import ( + "fmt" + + operatorv1 "github.com/openshift/api/operator/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + componentsv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/components/v1" + dscv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/datasciencecluster/v1" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/cluster" + odhdeploy "github.com/opendatahub-io/opendatahub-operator/v2/pkg/deploy" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/metadata/annotations" +) + +const ( + ComponentName = componentsv1.TrainingOperatorComponentName +) + +var ( + DefaultPath = odhdeploy.DefaultManifestPath + "/" + ComponentName + "/rhoai" +) + +// for DSC to get component TrainingOperator's CR. 
+func GetComponentCR(dsc *dscv1.DataScienceCluster) *componentsv1.TrainingOperator { + trainingoperatorAnnotations := make(map[string]string) + switch dsc.Spec.Components.TrainingOperator.ManagementState { + case operatorv1.Managed, operatorv1.Removed: + trainingoperatorAnnotations[annotations.ManagementStateAnnotation] = string(dsc.Spec.Components.TrainingOperator.ManagementState) + default: // Force and Unmanaged case for unknown values, we do not support these yet + trainingoperatorAnnotations[annotations.ManagementStateAnnotation] = "Unknown" + } + + return &componentsv1.TrainingOperator{ + TypeMeta: metav1.TypeMeta{ + Kind: componentsv1.TrainingOperatorKind, + APIVersion: componentsv1.GroupVersion.String(), + }, + ObjectMeta: metav1.ObjectMeta{ + Name: componentsv1.TrainingOperatorInstanceName, + Annotations: trainingoperatorAnnotations, + }, + Spec: componentsv1.TrainingOperatorSpec{ + TrainingOperatorCommonSpec: dsc.Spec.Components.TrainingOperator.TrainingOperatorCommonSpec, + }, + } +} + +// Init for set images. 
+func Init(platform cluster.Platform) error { + imageParamMap := map[string]string{ + "odh-training-operator-controller-image": "RELATED_IMAGE_ODH_TRAINING_OPERATOR_IMAGE", + } + + if err := odhdeploy.ApplyParams(DefaultPath, imageParamMap); err != nil { + return fmt.Errorf("failed to update images on path %s: %w", DefaultPath, err) + } + + return nil +} diff --git a/controllers/components/trainingoperator/trainingoperator_controller.go b/controllers/components/trainingoperator/trainingoperator_controller.go index 138247fe644..e98a32ad52b 100644 --- a/controllers/components/trainingoperator/trainingoperator_controller.go +++ b/controllers/components/trainingoperator/trainingoperator_controller.go @@ -19,40 +19,57 @@ package trainingoperator import ( "context" - "k8s.io/apimachinery/pkg/runtime" + promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + extv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/builder" componentsv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/components/v1" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/controller/actions/deploy" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/controller/actions/render" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/controller/actions/updatestatus" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/controller/predicates/resources" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/controller/reconciler" + "github.com/opendatahub-io/opendatahub-operator/v2/pkg/metadata/labels" ) -// TrainingOperatorReconciler reconciles a TrainingOperator object. 
-type TrainingOperatorReconciler struct { - client.Client - Scheme *runtime.Scheme -} +func NewComponentReconciler(ctx context.Context, mgr ctrl.Manager) error { + _, err := reconciler.ComponentReconcilerFor[*componentsv1.TrainingOperator]( + mgr, + componentsv1.TrainingOperatorInstanceName, + &componentsv1.TrainingOperator{}, + ). + // customized Owns() for Component with new predicates + Owns(&corev1.ConfigMap{}). + Owns(&promv1.PodMonitor{}). + Owns(&rbacv1.ClusterRoleBinding{}). + Owns(&rbacv1.ClusterRole{}). + Owns(&corev1.ServiceAccount{}). + Owns(&appsv1.Deployment{}, builder.WithPredicates(resources.NewDeploymentPredicate())). + Watches(&extv1.CustomResourceDefinition{}). // call ForLabel() + new predicates + // Add TrainingOperator-specific actions + WithAction(initialize). + WithAction(devFlags). + WithAction(render.NewAction( + render.WithCache(true, render.DefaultCachingKeyFn), + render.WithLabel(labels.ODH.Component(ComponentName), "true"), + render.WithLabel(labels.K8SCommon.PartOf, ComponentName), + )). + WithAction(deploy.NewAction( + deploy.WithFieldOwner(componentsv1.TrainingOperatorInstanceName), + deploy.WithLabel(labels.ComponentManagedBy, componentsv1.TrainingOperatorInstanceName), + )). + WithAction(updatestatus.NewAction( + updatestatus.WithSelectorLabel(labels.ComponentManagedBy, componentsv1.TrainingOperatorInstanceName), + )). + Build(ctx) -// Reconcile is part of the main kubernetes reconciliation loop which aims to -// move the current state of the cluster closer to the desired state. -// TODO(user): Modify the Reconcile function to compare the state specified by -// the TrainingOperator object against the actual cluster state, and then -// perform operations to make the cluster state reflect the state specified by -// the user. 
-// -// For more details, check Reconcile and its Result here: -// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.12.2/pkg/reconcile -func (r *TrainingOperatorReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { - _ = log.FromContext(ctx) - - // TODO(user): your logic here - - return ctrl.Result{}, nil -} + if err != nil { + return err // no need to customize the error, it is done in the caller main + } -// SetupWithManager sets up the controller with the Manager. -func (r *TrainingOperatorReconciler) SetupWithManager(mgr ctrl.Manager) error { - return ctrl.NewControllerManagedBy(mgr). - For(&componentsv1.TrainingOperator{}). - Complete(r) + return nil } diff --git a/controllers/components/trainingoperator/trainingoperator_controller_actions.go b/controllers/components/trainingoperator/trainingoperator_controller_actions.go new file mode 100644 index 00000000000..9906e9fc33c --- /dev/null +++ b/controllers/components/trainingoperator/trainingoperator_controller_actions.go @@ -0,0 +1,59 @@ +/* +Copyright 2023. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package trainingoperator + +import ( + "context" + "fmt" + + componentsv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/components/v1" + odhtypes "github.com/opendatahub-io/opendatahub-operator/v2/pkg/controller/types" + odhdeploy "github.com/opendatahub-io/opendatahub-operator/v2/pkg/deploy" +) + +func initialize(ctx context.Context, rr *odhtypes.ReconciliationRequest) error { + rr.Manifests = append(rr.Manifests, odhtypes.ManifestInfo{ + Path: odhdeploy.DefaultManifestPath, + ContextDir: ComponentName, + SourcePath: "rhoai", + }) + return nil +} + +func devFlags(ctx context.Context, rr *odhtypes.ReconciliationRequest) error { + kfto, ok := rr.Instance.(*componentsv1.TrainingOperator) + if !ok { + return fmt.Errorf("resource instance %v is not a componentsv1.TrainingOperator)", rr.Instance) + } + + if kfto.Spec.DevFlags == nil { + return nil + } + if len(kfto.Spec.DevFlags.Manifests) != 0 { + manifestConfig := kfto.Spec.DevFlags.Manifests[0] + if err := odhdeploy.DownloadManifests(ctx, ComponentName, manifestConfig); err != nil { + return err + } + if manifestConfig.SourcePath != "" { + rr.Manifests[0].Path = odhdeploy.DefaultManifestPath + rr.Manifests[0].ContextDir = ComponentName + rr.Manifests[0].SourcePath = manifestConfig.SourcePath + } + } + // TODO: Implement devflags logmode logic + return nil +} diff --git a/controllers/datasciencecluster/datasciencecluster_controller.go b/controllers/datasciencecluster/datasciencecluster_controller.go index 7f00a7ec38c..f57fab3fc98 100644 --- a/controllers/datasciencecluster/datasciencecluster_controller.go +++ b/controllers/datasciencecluster/datasciencecluster_controller.go @@ -56,6 +56,7 @@ import ( dashboardctrl "github.com/opendatahub-io/opendatahub-operator/v2/controllers/components/dashboard" modelregistryctrl "github.com/opendatahub-io/opendatahub-operator/v2/controllers/components/modelregistry" rayctrl "github.com/opendatahub-io/opendatahub-operator/v2/controllers/components/ray" + kftoctrl 
"github.com/opendatahub-io/opendatahub-operator/v2/controllers/components/trainingoperator" "github.com/opendatahub-io/opendatahub-operator/v2/controllers/status" "github.com/opendatahub-io/opendatahub-operator/v2/pkg/cluster" odhClient "github.com/opendatahub-io/opendatahub-operator/v2/pkg/controller/client" @@ -270,6 +271,14 @@ func (r *DataScienceClusterReconciler) Reconcile(ctx context.Context, req ctrl.R componentErrors = multierror.Append(componentErrors, err) } + // Deploy TrainingOperator + if instance, err = r.ReconcileComponent(ctx, instance, componentsv1.TrainingOperatorComponentName, func() (error, bool) { + kfto := kftoctrl.GetComponentCR(instance) + return r.apply(ctx, instance, kfto), instance.Spec.Components.TrainingOperator.ManagementState == operatorv1.Managed + }); err != nil { + componentErrors = multierror.Append(componentErrors, err) + } + // Process errors for components if componentErrors != nil { log.Info("DataScienceCluster Deployment Incomplete.") diff --git a/controllers/datasciencecluster/kubebuilder_rbac.go b/controllers/datasciencecluster/kubebuilder_rbac.go index 6a91f3a588c..0320a1a639e 100644 --- a/controllers/datasciencecluster/kubebuilder_rbac.go +++ b/controllers/datasciencecluster/kubebuilder_rbac.go @@ -210,7 +210,7 @@ package datasciencecluster // +kubebuilder:rbac:groups="datasciencepipelinesapplications.opendatahub.io",resources=datasciencepipelinesapplications,verbs=create;delete;list;update;watch;patch;get // +kubebuilder:rbac:groups="argoproj.io",resources=workflows,verbs=* -// TODO: KFTO +// TrainingOperator // +kubebuilder:rbac:groups=components.opendatahub.io,resources=trainingoperators,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=components.opendatahub.io,resources=trainingoperators/status,verbs=get;update;patch // +kubebuilder:rbac:groups=components.opendatahub.io,resources=trainingoperators/finalizers,verbs=update diff --git a/controllers/dscinitialization/suite_test.go 
b/controllers/dscinitialization/suite_test.go index 618b9a62e91..1ab7dfa10c7 100644 --- a/controllers/dscinitialization/suite_test.go +++ b/controllers/dscinitialization/suite_test.go @@ -77,6 +77,7 @@ func TestDataScienceClusterInitialization(t *testing.T) { var testScheme = runtime.NewScheme() +//nolint:fatcontext var _ = BeforeSuite(func() { // can't use suite's context as the manager should survive the function gCtx, gCancel = context.WithCancel(context.Background()) diff --git a/controllers/webhook/webhook_suite_test.go b/controllers/webhook/webhook_suite_test.go index cb116af69ba..961374324d0 100644 --- a/controllers/webhook/webhook_suite_test.go +++ b/controllers/webhook/webhook_suite_test.go @@ -79,6 +79,7 @@ func TestAPIs(t *testing.T) { RunSpecs(t, "Webhook Suite") } +//nolint:fatcontext var _ = BeforeSuite(func() { // can't use suite's context as the manager should survive the function gCtx, gCancel = context.WithCancel(context.Background()) diff --git a/docs/api-overview.md b/docs/api-overview.md index 7e16979b22a..23af5e59620 100644 --- a/docs/api-overview.md +++ b/docs/api-overview.md @@ -172,6 +172,23 @@ DSCRay contains all the configuration exposed in DSC instance for Ray component +_Appears in:_ +- [Components](#components) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `managementState` _[ManagementState](#managementstate)_ | Set to one of the following values:

- "Managed" : the operator is actively managing the component and trying to keep it active.
It will only upgrade the component if it is safe to do so

- "Removed" : the operator is actively managing the component and will not install it,
or if it is installed, the operator will try to remove it | | Enum: [Managed Removed]
| +| `devFlags` _[DevFlags](#devflags)_ | Add developer fields | | | + + +#### DSCTrainingOperator + + + +DSCTrainingOperator contains all the configuration exposed in DSC instance for TrainingOperator component + + + _Appears in:_ - [Components](#components) @@ -790,6 +807,23 @@ _Appears in:_ | `status` _[TrainingOperatorStatus](#trainingoperatorstatus)_ | | | | +#### TrainingOperatorCommonSpec + + + + + + + +_Appears in:_ +- [DSCTrainingOperator](#dsctrainingoperator) +- [TrainingOperatorSpec](#trainingoperatorspec) + +| Field | Description | Default | Validation | +| --- | --- | --- | --- | +| `devFlags` _[DevFlags](#devflags)_ | Add developer fields | | | + + #### TrainingOperatorList @@ -823,7 +857,7 @@ _Appears in:_ | Field | Description | Default | Validation | | --- | --- | --- | --- | -| `foo` _string_ | Foo is an example field of TrainingOperator. Edit trainingoperator_types.go to remove/update | | | +| `devFlags` _[DevFlags](#devflags)_ | Add developer fields | | | #### TrainingOperatorStatus @@ -1040,7 +1074,6 @@ _Appears in:_ - [Kserve](#kserve) - [Kueue](#kueue) - [ModelMeshServing](#modelmeshserving) -- [TrainingOperator](#trainingoperator) - [TrustyAI](#trustyai) - [Workbenches](#workbenches) @@ -1084,12 +1117,15 @@ _Appears in:_ - [DSCDashboard](#dscdashboard) - [DSCModelRegistry](#dscmodelregistry) - [DSCRay](#dscray) +- [DSCTrainingOperator](#dsctrainingoperator) - [DashboardCommonSpec](#dashboardcommonspec) - [DashboardSpec](#dashboardspec) - [ModelRegistryCommonSpec](#modelregistrycommonspec) - [ModelRegistrySpec](#modelregistryspec) - [RayCommonSpec](#raycommonspec) - [RaySpec](#rayspec) +- [TrainingOperatorCommonSpec](#trainingoperatorcommonspec) +- [TrainingOperatorSpec](#trainingoperatorspec) | Field | Description | Default | Validation | | --- | --- | --- | --- | @@ -1109,6 +1145,7 @@ _Appears in:_ - [DSCDashboard](#dscdashboard) - [DSCModelRegistry](#dscmodelregistry) - [DSCRay](#dscray) +- [DSCTrainingOperator](#dsctrainingoperator) | 
Field | Description | Default | Validation | | --- | --- | --- | --- | @@ -1258,30 +1295,6 @@ _Appears in:_ -## datasciencecluster.opendatahub.io/trainingoperator - -Package trainingoperator provides utility functions to config trainingoperator as part of the stack -which makes managing distributed compute infrastructure in the cloud easy and intuitive for Data Scientists - - - -#### TrainingOperator - - - -TrainingOperator struct holds the configuration for the TrainingOperator component. - - - -_Appears in:_ -- [Components](#components) - -| Field | Description | Default | Validation | -| --- | --- | --- | --- | -| `Component` _[Component](#component)_ | | | | - - - ## datasciencecluster.opendatahub.io/trustyai Package trustyai provides utility functions to config TrustyAI, a bias/fairness and explainability toolkit @@ -1389,7 +1402,7 @@ _Appears in:_ | `ray` _[DSCRay](#dscray)_ | Ray component configuration. | | | | `trustyai` _[TrustyAI](#trustyai)_ | TrustyAI component configuration. | | | | `modelregistry` _[DSCModelRegistry](#dscmodelregistry)_ | ModelRegistry component configuration. | | | -| `trainingoperator` _[TrainingOperator](#trainingoperator)_ | Training Operator component configuration. | | | +| `trainingoperator` _[DSCTrainingOperator](#dsctrainingoperator)_ | Training Operator component configuration. 
| | | #### ComponentsStatus diff --git a/main.go b/main.go index 1c49ead8c82..7cd18e10954 100644 --- a/main.go +++ b/main.go @@ -66,6 +66,7 @@ import ( dashboardctrl "github.com/opendatahub-io/opendatahub-operator/v2/controllers/components/dashboard" modelregistryctrl "github.com/opendatahub-io/opendatahub-operator/v2/controllers/components/modelregistry" rayctrl "github.com/opendatahub-io/opendatahub-operator/v2/controllers/components/ray" + kftoctrl "github.com/opendatahub-io/opendatahub-operator/v2/controllers/components/trainingoperator" dscctrl "github.com/opendatahub-io/opendatahub-operator/v2/controllers/datasciencecluster" dscictrl "github.com/opendatahub-io/opendatahub-operator/v2/controllers/dscinitialization" "github.com/opendatahub-io/opendatahub-operator/v2/controllers/secretgenerator" @@ -420,5 +421,10 @@ func CreateComponentReconcilers(ctx context.Context, mgr manager.Manager) error return err } + if err := kftoctrl.NewComponentReconciler(ctx, mgr); err != nil { + setupLog.Error(err, "unable to create controller", "controller", "TrainingOperatorReconciler") + return err + } + return nil } diff --git a/pkg/upgrade/upgrade.go b/pkg/upgrade/upgrade.go index 2a6963bdf00..c47af2381ef 100644 --- a/pkg/upgrade/upgrade.go +++ b/pkg/upgrade/upgrade.go @@ -37,7 +37,6 @@ import ( "github.com/opendatahub-io/opendatahub-operator/v2/components/kserve" "github.com/opendatahub-io/opendatahub-operator/v2/components/kueue" "github.com/opendatahub-io/opendatahub-operator/v2/components/modelmeshserving" - "github.com/opendatahub-io/opendatahub-operator/v2/components/trainingoperator" "github.com/opendatahub-io/opendatahub-operator/v2/components/trustyai" "github.com/opendatahub-io/opendatahub-operator/v2/components/workbenches" "github.com/opendatahub-io/opendatahub-operator/v2/pkg/cluster" @@ -97,8 +96,8 @@ func CreateDefaultDSC(ctx context.Context, cli client.Client) error { ModelRegistry: componentsv1.DSCModelRegistry{ ManagementSpec: 
components.ManagementSpec{ManagementState: operatorv1.Managed}, }, - TrainingOperator: trainingoperator.TrainingOperator{ - Component: componentsold.Component{ManagementState: operatorv1.Managed}, + TrainingOperator: componentsv1.DSCTrainingOperator{ + ManagementSpec: components.ManagementSpec{ManagementState: operatorv1.Managed}, }, }, }, diff --git a/tests/e2e/controller_test.go b/tests/e2e/controller_test.go index e1aca7abfa4..aa52ef41264 100644 --- a/tests/e2e/controller_test.go +++ b/tests/e2e/controller_test.go @@ -122,6 +122,8 @@ func TestOdhOperator(t *testing.T) { t.Run("validate installation of Ray Component", rayTestSuite) t.Run("validate installation of ModelRegistry Component", modelRegistryTestSuite) + t.Run("validate installation of TrainingOperator Component", kftoTestSuite) + // Run deletion if skipDeletion is not set if !skipDeletion { // this is a negative test case, since by using the positive CM('true'), even CSV gets deleted which leaves no operator pod in prow diff --git a/tests/e2e/dashboard_test.go b/tests/e2e/dashboard_test.go index 746eedbf8b6..f433db93bfd 100644 --- a/tests/e2e/dashboard_test.go +++ b/tests/e2e/dashboard_test.go @@ -119,7 +119,7 @@ func (tc *DashboardTestCtx) testOwnerReferences() error { } // Test Dashboard CR ownerref - if tc.testDashboardInstance.OwnerReferences[0].Kind != "DataScienceCluster" { + if tc.testDashboardInstance.OwnerReferences[0].Kind != dscKind { return fmt.Errorf("expected ownerreference DataScienceCluster not found. 
Got ownereferrence: %v", tc.testDashboardInstance.OwnerReferences[0].Kind) } diff --git a/tests/e2e/helper_test.go b/tests/e2e/helper_test.go index ef82d7ad6db..651ad0ab3d4 100644 --- a/tests/e2e/helper_test.go +++ b/tests/e2e/helper_test.go @@ -32,7 +32,6 @@ import ( "github.com/opendatahub-io/opendatahub-operator/v2/components/kserve" "github.com/opendatahub-io/opendatahub-operator/v2/components/kueue" "github.com/opendatahub-io/opendatahub-operator/v2/components/modelmeshserving" - "github.com/opendatahub-io/opendatahub-operator/v2/components/trainingoperator" "github.com/opendatahub-io/opendatahub-operator/v2/components/trustyai" "github.com/opendatahub-io/opendatahub-operator/v2/components/workbenches" "github.com/opendatahub-io/opendatahub-operator/v2/controllers/components/modelregistry" @@ -175,9 +174,9 @@ func setupDSCInstance(name string) *dscv1.DataScienceCluster { RegistriesNamespace: modelregistry.DefaultModelRegistriesNamespace, }, }, - TrainingOperator: trainingoperator.TrainingOperator{ - Component: componentsold.Component{ - ManagementState: operatorv1.Removed, + TrainingOperator: componentsv1.DSCTrainingOperator{ + ManagementSpec: components.ManagementSpec{ + ManagementState: operatorv1.Managed, }, }, }, diff --git a/tests/e2e/kfto_test.go b/tests/e2e/kfto_test.go new file mode 100644 index 00000000000..738bf11da33 --- /dev/null +++ b/tests/e2e/kfto_test.go @@ -0,0 +1,278 @@ +package e2e_test + +import ( + "context" + "errors" + "fmt" + "reflect" + "testing" + "time" + + operatorv1 "github.com/openshift/api/operator/v1" + "github.com/stretchr/testify/require" + autoscalingv1 "k8s.io/api/autoscaling/v1" + k8serr "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/client-go/util/retry" + "sigs.k8s.io/controller-runtime/pkg/client" + + componentsv1 "github.com/opendatahub-io/opendatahub-operator/v2/apis/components/v1" + 
"github.com/opendatahub-io/opendatahub-operator/v2/pkg/metadata/labels" +) + +type TrainingOperatorTestCtx struct { + testCtx *testContext + testTrainingOperatorInstance componentsv1.TrainingOperator +} + +func kftoTestSuite(t *testing.T) { + kftoCtx := TrainingOperatorTestCtx{} + var err error + kftoCtx.testCtx, err = NewTestContext() + require.NoError(t, err) + + testCtx := kftoCtx.testCtx + + t.Run(testCtx.testDsc.Name, func(t *testing.T) { + // creation + t.Run("Creation of TrainingOperator CR", func(t *testing.T) { + err = kftoCtx.testTrainingOperatorCreation() + require.NoError(t, err, "error creating TrainingOperator CR") + }) + + t.Run("Validate TrainingOperator instance", func(t *testing.T) { + err = kftoCtx.validateTrainingOperator() + require.NoError(t, err, "error validating TrainingOperator instance") + }) + + t.Run("Validate Ownerrefrences exist", func(t *testing.T) { + err = kftoCtx.testOwnerReferences() + require.NoError(t, err, "error getting all TrainingOperator's Ownerrefrences") + }) + + t.Run("Validate TrainingOperator Ready", func(t *testing.T) { + err = kftoCtx.validateTrainingOperatorReady() + require.NoError(t, err, "TrainingOperator instance is not Ready") + }) + + // reconcile + t.Run("Validate Controller reconcile", func(t *testing.T) { + err = kftoCtx.testUpdateOnTrainingOperatorResources() + require.NoError(t, err, "error testing updates for TrainingOperator's managed resources") + }) + + t.Run("Validate Disabling TrainingOperator Component", func(t *testing.T) { + err = kftoCtx.testUpdateTrainingOperatorComponentDisabled() + require.NoError(t, err, "error testing kfto component enabled field") + }) + }) +} + +func (tc *TrainingOperatorTestCtx) testTrainingOperatorCreation() error { + if tc.testCtx.testDsc.Spec.Components.TrainingOperator.ManagementState != operatorv1.Managed { + return nil + } + + err := tc.testCtx.wait(func(ctx context.Context) (bool, error) { + existingTrainingOperatorList := &componentsv1.TrainingOperatorList{} + + 
if err := tc.testCtx.customClient.List(ctx, existingTrainingOperatorList); err != nil { + return false, err + } + + switch { + case len(existingTrainingOperatorList.Items) == 1: + tc.testTrainingOperatorInstance = existingTrainingOperatorList.Items[0] + return true, nil + case len(existingTrainingOperatorList.Items) > 1: + return false, fmt.Errorf( + "unexpected TrainingOperator CR instances. Expected 1 , Found %v instance", len(existingTrainingOperatorList.Items)) + default: + return false, nil + } + }) + + if err != nil { + return fmt.Errorf("unable to find TrainingOperator CR instance: %w", err) + } + + return nil +} + +func (tc *TrainingOperatorTestCtx) validateTrainingOperator() error { + // TrainingOperator spec should match the spec of TrainingOperator component in DSC + if !reflect.DeepEqual(tc.testCtx.testDsc.Spec.Components.TrainingOperator.TrainingOperatorCommonSpec, tc.testTrainingOperatorInstance.Spec.TrainingOperatorCommonSpec) { + err := fmt.Errorf("expected .spec for TrainingOperator %v, got %v", + tc.testCtx.testDsc.Spec.Components.TrainingOperator.TrainingOperatorCommonSpec, tc.testTrainingOperatorInstance.Spec.TrainingOperatorCommonSpec) + return err + } + return nil +} + +func (tc *TrainingOperatorTestCtx) testOwnerReferences() error { + if len(tc.testTrainingOperatorInstance.OwnerReferences) != 1 { + return errors.New("expect CR has ownerreferences set") + } + + // Test TrainingOperator CR ownerref + if tc.testTrainingOperatorInstance.OwnerReferences[0].Kind != dscKind { + return fmt.Errorf("expected ownerreference DataScienceCluster not found. 
Got ownereferrence: %v", + tc.testTrainingOperatorInstance.OwnerReferences[0].Kind) + } + + // Test TrainingOperator resources + appDeployments, err := tc.testCtx.kubeClient.AppsV1().Deployments(tc.testCtx.applicationsNamespace).List(tc.testCtx.ctx, metav1.ListOptions{ + LabelSelector: labels.ODH.Component(componentsv1.TrainingOperatorComponentName), + }) + if err != nil { + return fmt.Errorf("error listing component deployments %w", err) + } + // test any one deployment for ownerreference + if len(appDeployments.Items) != 0 && appDeployments.Items[0].OwnerReferences[0].Kind != componentsv1.TrainingOperatorKind { + return fmt.Errorf("expected ownerreference not found. Got ownereferrence: %v", + appDeployments.Items[0].OwnerReferences) + } + + return nil +} + +// Verify TrainingOperator instance is in Ready phase when kfto deployments are up and running. +func (tc *TrainingOperatorTestCtx) validateTrainingOperatorReady() error { + err := wait.PollUntilContextTimeout(tc.testCtx.ctx, generalRetryInterval, componentReadyTimeout, true, func(ctx context.Context) (bool, error) { + key := types.NamespacedName{Name: tc.testTrainingOperatorInstance.Name} + kfto := &componentsv1.TrainingOperator{} + + err := tc.testCtx.customClient.Get(ctx, key, kfto) + if err != nil { + return false, err + } + return kfto.Status.Phase == readyStatus, nil + }) + + if err != nil { + return fmt.Errorf("error waiting Ready state for TrainingOperator %v: %w", tc.testTrainingOperatorInstance.Name, err) + } + + return nil +} + +func (tc *TrainingOperatorTestCtx) testUpdateOnTrainingOperatorResources() error { + // Test Updating TrainingOperator Replicas + + appDeployments, err := tc.testCtx.kubeClient.AppsV1().Deployments(tc.testCtx.applicationsNamespace).List(tc.testCtx.ctx, metav1.ListOptions{ + LabelSelector: labels.ComponentManagedBy + "=" + tc.testTrainingOperatorInstance.Name, + }) + if err != nil { + return err + } + + if len(appDeployments.Items) != 1 { + return fmt.Errorf("error getting 
deployment for component %s", tc.testTrainingOperatorInstance.Name) + } + + const expectedReplica int32 = 2 // from 1 to 2 + + testDeployment := appDeployments.Items[0] + patchedReplica := &autoscalingv1.Scale{ + ObjectMeta: metav1.ObjectMeta{ + Name: testDeployment.Name, + Namespace: testDeployment.Namespace, + }, + Spec: autoscalingv1.ScaleSpec{ + Replicas: expectedReplica, + }, + Status: autoscalingv1.ScaleStatus{}, + } + updatedDep, err := tc.testCtx.kubeClient.AppsV1().Deployments(tc.testCtx.applicationsNamespace).UpdateScale(tc.testCtx.ctx, + testDeployment.Name, patchedReplica, metav1.UpdateOptions{}) + if err != nil { + return fmt.Errorf("error patching component resources : %w", err) + } + if updatedDep.Spec.Replicas != patchedReplica.Spec.Replicas { + return fmt.Errorf("failed to patch replicas : expect to be %v but got %v", patchedReplica.Spec.Replicas, updatedDep.Spec.Replicas) + } + + // Sleep for 20 seconds to allow the operator to reconcile + // we expect it should not revert back to original value because of AllowList + time.Sleep(2 * generalRetryInterval) + reconciledDep, err := tc.testCtx.kubeClient.AppsV1().Deployments(tc.testCtx.applicationsNamespace).Get(tc.testCtx.ctx, testDeployment.Name, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("error getting component resource after reconcile: %w", err) + } + if *reconciledDep.Spec.Replicas != expectedReplica { + return fmt.Errorf("failed to revert back replicas : expect to be %v but got %v", expectedReplica, *reconciledDep.Spec.Replicas) + } + + return nil +} + +func (tc *TrainingOperatorTestCtx) testUpdateTrainingOperatorComponentDisabled() error { + // Test Updating TrainingOperator to be disabled + var kftoDeploymentName string + + if tc.testCtx.testDsc.Spec.Components.TrainingOperator.ManagementState == operatorv1.Managed { + appDeployments, err := tc.testCtx.kubeClient.AppsV1().Deployments(tc.testCtx.applicationsNamespace).List(tc.testCtx.ctx, metav1.ListOptions{ + LabelSelector: 
labels.ODH.Component(componentsv1.TrainingOperatorComponentName), + }) + if err != nil { + return fmt.Errorf("error getting enabled component %v", componentsv1.TrainingOperatorComponentName) + } + if len(appDeployments.Items) > 0 { + kftoDeploymentName = appDeployments.Items[0].Name + if appDeployments.Items[0].Status.ReadyReplicas == 0 { + return fmt.Errorf("error getting enabled component: %s its deployment 'ReadyReplicas'", kftoDeploymentName) + } + } + } else { + return errors.New("kfto spec should be in 'enabled: true' state in order to perform test") + } + + // Disable component TrainingOperator + err := retry.RetryOnConflict(retry.DefaultRetry, func() error { + // refresh DSC instance in case it was updated during the reconcile + err := tc.testCtx.customClient.Get(tc.testCtx.ctx, types.NamespacedName{Name: tc.testCtx.testDsc.Name}, tc.testCtx.testDsc) + if err != nil { + return fmt.Errorf("error getting resource %w", err) + } + // Disable the Component + tc.testCtx.testDsc.Spec.Components.TrainingOperator.ManagementState = operatorv1.Removed + + // Try to update + err = tc.testCtx.customClient.Update(tc.testCtx.ctx, tc.testCtx.testDsc) + // Wrap the error with %w (never %v) so the original stays in the chain + // and RetryOnConflict can still identify a conflict and retry.
+ if err != nil { + return fmt.Errorf("error updating component from 'enabled: true' to 'enabled: false': %w", err) + } + + return nil + }) + if err != nil { + return fmt.Errorf("error after retry %w", err) + } + + if err = tc.testCtx.wait(func(ctx context.Context) (bool, error) { + // Verify kfto CR is deleted + kfto := &componentsv1.TrainingOperator{} + err = tc.testCtx.customClient.Get(ctx, client.ObjectKey{Name: tc.testTrainingOperatorInstance.Name}, kfto) + return k8serr.IsNotFound(err), nil + }); err != nil { + return fmt.Errorf("component kfto is disabled, should not get the TrainingOperator CR %v", tc.testTrainingOperatorInstance.Name) + } + + // Sleep for 20 seconds to allow the operator to reconcile + time.Sleep(2 * generalRetryInterval) + _, err = tc.testCtx.kubeClient.AppsV1().Deployments(tc.testCtx.applicationsNamespace).Get(tc.testCtx.ctx, kftoDeploymentName, metav1.GetOptions{}) + if err != nil { + if k8serr.IsNotFound(err) { + return nil // correct result: should not find deployment after we disable it already + } + return fmt.Errorf("error getting component resource after reconcile: %w", err) + } + return fmt.Errorf("component %v is disabled, should not get its deployment %v from NS %v any more", + componentsv1.TrainingOperatorKind, + kftoDeploymentName, + tc.testCtx.applicationsNamespace) +} diff --git a/tests/e2e/odh_manager_test.go b/tests/e2e/odh_manager_test.go index 90fbba36943..89020b4267d 100644 --- a/tests/e2e/odh_manager_test.go +++ b/tests/e2e/odh_manager_test.go @@ -56,4 +56,10 @@ func (tc *testContext) validateOwnedCRDs(t *testing.T) { require.NoErrorf(t, tc.validateCRD("modelregistries.components.opendatahub.io"), "error in validating CRD : modelregistries.components.opendatahub.io") }) + + t.Run("Validate TrainingOperator CRD", func(t *testing.T) { + t.Parallel() + require.NoErrorf(t, tc.validateCRD("trainingoperators.components.opendatahub.io"), + "error in validating CRD : trainingoperators.components.opendatahub.io") + }) } diff 
--git a/tests/e2e/ray_test.go b/tests/e2e/ray_test.go index 30defef3992..f40f197d791 100644 --- a/tests/e2e/ray_test.go +++ b/tests/e2e/ray_test.go @@ -117,7 +117,7 @@ func (tc *RayTestCtx) testOwnerReferences() error { } // Test Ray CR ownerref - if tc.testRayInstance.OwnerReferences[0].Kind != "DataScienceCluster" { + if tc.testRayInstance.OwnerReferences[0].Kind != dscKind { return fmt.Errorf("expected ownerreference DataScienceCluster not found. Got ownereferrence: %v", tc.testRayInstance.OwnerReferences[0].Kind) }