Skip to content

Commit

Permalink
KEP-2170: Add manifests for Kubeflow Training V2 (#2289)
Browse files Browse the repository at this point in the history
* KEP-2170: Add manifests for Kubeflow Training V2

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Fix invalid name for webhook config in cert

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Fix integration tests

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Move kubebuilder markers to runtime framework

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

* Use Kubernetes recommended labels

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>

---------

Signed-off-by: Andrey Velichkevich <andrey.velichkevich@gmail.com>
  • Loading branch information
andreyvelich authored Oct 21, 2024
1 parent 81d02bf commit fd4d102
Show file tree
Hide file tree
Showing 22 changed files with 264 additions and 13 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/publish-core-images.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
dockerfile: build/images/training-operator/Dockerfile
platforms: linux/amd64,linux/arm64,linux/ppc64le
tag-prefix: v1
- component-name: training-operator
- component-name: training-operator-v2
dockerfile: cmd/training-operator.v2alpha1/Dockerfile
platforms: linux/amd64,linux/arm64,linux/ppc64le
tag-prefix: v2alpha1
Expand Down
4 changes: 3 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,10 @@ manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and Cust
output:crd:artifacts:config=manifests/base/crds \
output:rbac:artifacts:config=manifests/base/rbac \
output:webhook:artifacts:config=manifests/base/webhook
$(CONTROLLER_GEN) "crd:generateEmbeddedObjectMeta=true" "webhook" paths="./pkg/apis/kubeflow.org/v2alpha1/...;./pkg/webhook.v2/..." \
$(CONTROLLER_GEN) "crd:generateEmbeddedObjectMeta=true" rbac:roleName=training-operator-v2 webhook \
paths="./pkg/apis/kubeflow.org/v2alpha1/...;./pkg/controller.v2/...;./pkg/runtime.v2/...;./pkg/webhook.v2/...;./pkg/cert/..." \
output:crd:artifacts:config=manifests/v2/base/crds \
output:rbac:artifacts:config=manifests/v2/base/rbac \
output:webhook:artifacts:config=manifests/v2/base/webhook

generate: controller-gen ## Generate apidoc, sdk and code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations.
Expand Down
7 changes: 5 additions & 2 deletions cmd/training-operator.v1/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ import (
const (
// EnvKubeflowNamespace is an environment variable for namespace when deployed on kubernetes
EnvKubeflowNamespace = "KUBEFLOW_NAMESPACE"

// webhookConfigurationName is passed to cert.ManageCerts through
// cert.Config.WebhookConfigurationName, where it becomes the Name of the
// validating webhook entry that the certificate setup is configured for.
webhookConfigurationName = "validator.training-operator.kubeflow.org"
)

var (
Expand Down Expand Up @@ -150,8 +152,9 @@ func main() {
certsReady := make(chan struct{})
defer close(certsReady)
certGenerationConfig := cert.Config{
WebhookSecretName: webhookSecretName,
WebhookServiceName: webhookServiceName,
WebhookSecretName: webhookSecretName,
WebhookServiceName: webhookServiceName,
WebhookConfigurationName: webhookConfigurationName,
}
if err = cert.ManageCerts(mgr, certGenerationConfig, certsReady); err != nil {
setupLog.Error(err, "Unable to set up cert rotation")
Expand Down
9 changes: 7 additions & 2 deletions cmd/training-operator.v2alpha1/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ import (
webhookv2 "github.com/kubeflow/training-operator/pkg/webhook.v2"
)

const (
// webhookConfigurationName is passed to cert.ManageCerts through
// cert.Config.WebhookConfigurationName, where it becomes the Name of the
// validating webhook entry that the certificate setup is configured for.
// NOTE(review): this is the same value that manifests/v2/base/webhook/patch.yaml
// writes into the ValidatingWebhookConfiguration's metadata.name; presumably
// the two must be kept in sync - confirm.
webhookConfigurationName = "validator.training-operator-v2.kubeflow.org"
)

var (
scheme = apiruntime.NewScheme()
setupLog = ctrl.Log.WithName("setup")
Expand Down Expand Up @@ -124,8 +128,9 @@ func main() {

certsReady := make(chan struct{})
if err = cert.ManageCerts(mgr, cert.Config{
WebhookSecretName: webhookSecretName,
WebhookServiceName: webhookServiceName,
WebhookSecretName: webhookSecretName,
WebhookServiceName: webhookServiceName,
WebhookConfigurationName: webhookConfigurationName,
}, certsReady); err != nil {
setupLog.Error(err, "unable to set up cert rotation")
os.Exit(1)
Expand Down
9 changes: 9 additions & 0 deletions manifests/v2/base/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Base for Training Operator V2: bundles the CRDs, RBAC, webhook and manager manifests.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
# The namespace is set here in the base rather than in the overlays, because the
# overlays also list the remote JobSet manifests in their resources.
namespace: kubeflow-system
resources:
- ./crds
- ./rbac
- ./webhook
- ./manager
2 changes: 2 additions & 0 deletions manifests/v2/base/manager/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Kustomize entry point for the manager directory; it only pulls in manager.yaml.
resources:
- manager.yaml
70 changes: 70 additions & 0 deletions manifests/v2/base/manager/manager.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
---
# Deployment that runs the Training Operator V2 manager container.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: training-operator-v2
  labels:
    app.kubernetes.io/name: training
    app.kubernetes.io/component: manager
    app.kubernetes.io/part-of: kubeflow
spec:
  selector:
    # Must match the pod template labels below.
    matchLabels:
      app.kubernetes.io/name: training
      app.kubernetes.io/component: manager
      app.kubernetes.io/part-of: kubeflow
  template:
    metadata:
      labels:
        app.kubernetes.io/name: training
        app.kubernetes.io/component: manager
        app.kubernetes.io/part-of: kubeflow
    spec:
      containers:
        - name: manager
          image: kubeflow/training-operator-v2
          env:
            # Exposes the pod's own namespace to the manager via the downward API.
            - name: MY_POD_NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
          volumeMounts:
            # Same path as certDir in pkg/cert/cert.go.
            - mountPath: /tmp/k8s-webhook-server/serving-certs
              name: cert
              readOnly: true
          livenessProbe:
            httpGet:
              path: /healthz
              port: 8081
            initialDelaySeconds: 15
            periodSeconds: 20
            timeoutSeconds: 3
          readinessProbe:
            httpGet:
              path: /readyz
              port: 8081
            initialDelaySeconds: 10
            periodSeconds: 15
            timeoutSeconds: 3
      serviceAccountName: training-operator-v2
      volumes:
        # Webhook serving certificate; the Secret itself is declared by the
        # standalone overlay's secretGenerator.
        - name: cert
          secret:
            defaultMode: 420
            secretName: training-operator-v2-webhook-cert
---
# Service in front of the manager pods: exposes the monitoring port and the
# webhook server port.
apiVersion: v1
kind: Service
metadata:
  name: training-operator-v2
spec:
  ports:
    - name: monitoring-port
      port: 8080
      targetPort: 8080
    - name: webhook-server
      port: 443
      protocol: TCP
      targetPort: 9443
  # Select on the full label set carried by the Deployment's pod template.
  # Matching on the component label alone would also pick up any other pod
  # labelled "manager" in the same namespace and send webhook traffic to it.
  selector:
    app.kubernetes.io/name: training
    app.kubernetes.io/component: manager
    app.kubernetes.io/part-of: kubeflow
4 changes: 4 additions & 0 deletions manifests/v2/base/rbac/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Kustomize entry point for the RBAC objects: ClusterRole, ClusterRoleBinding and ServiceAccount.
resources:
- role.yaml
- role_binding.yaml
- service_account.yaml
78 changes: 78 additions & 0 deletions manifests/v2/base/rbac/role.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
---
# Written by the Makefile `manifests` target (controller-gen with
# rbac:roleName=training-operator-v2) from the +kubebuilder:rbac markers in
# pkg/cert, pkg/controller.v2 and pkg/runtime.v2. Change the markers and
# regenerate rather than editing the rules by hand.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: training-operator-v2
rules:
- apiGroups:
  - ""
  resources:
  - secrets
  verbs:
  - get
  - list
  - update
  - watch
- apiGroups:
  - admissionregistration.k8s.io
  resources:
  - validatingwebhookconfigurations
  verbs:
  - get
  - list
  - update
  - watch
- apiGroups:
  - jobset.x-k8s.io
  resources:
  - jobsets
  verbs:
  - create
  - get
  - list
  - watch
- apiGroups:
  - kubeflow.org
  resources:
  - clustertrainingruntimes
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - kubeflow.org
  resources:
  - trainingruntimes
  verbs:
  - get
  - list
  - watch
- apiGroups:
  - kubeflow.org
  resources:
  - trainjobs
  verbs:
  - create
  - delete
  - get
  - list
  - patch
  - update
  - watch
- apiGroups:
  - kubeflow.org
  resources:
  - trainjobs/status
  verbs:
  - get
  - patch
  - update
- apiGroups:
  - scheduling.x-k8s.io
  resources:
  - podgroups
  verbs:
  - create
  - get
  - list
  - watch
12 changes: 12 additions & 0 deletions manifests/v2/base/rbac/role_binding.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
---
# Grants the training-operator-v2 ClusterRole to the training-operator-v2 ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: training-operator-v2
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: training-operator-v2
subjects:
  # NOTE(review): no namespace is given for the subject here; presumably it is
  # filled in by kustomize from the base kustomization's `namespace:` - confirm.
  - kind: ServiceAccount
    name: training-operator-v2
5 changes: 5 additions & 0 deletions manifests/v2/base/rbac/service_account.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
# Identity the manager Deployment runs as (referenced by its serviceAccountName).
apiVersion: v1
kind: ServiceAccount
metadata:
  name: training-operator-v2
10 changes: 10 additions & 0 deletions manifests/v2/base/webhook/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -1,2 +1,12 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - manifests.yaml
# patch.yaml is applied to the ValidatingWebhookConfiguration from manifests.yaml.
patches:
  - path: patch.yaml
    target:
      group: admissionregistration.k8s.io
      version: v1
      kind: ValidatingWebhookConfiguration
# Extra transformer configuration (field paths kustomize would not touch by default).
configurations:
  - kustomizeconfig.yaml
10 changes: 10 additions & 0 deletions manifests/v2/base/webhook/kustomizeconfig.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# This config teaches kustomize which fields to rewrite:
#   - namespace: also set webhooks[].clientConfig.service.namespace on
#     ValidatingWebhookConfiguration objects (creating the field if missing);
#   - varReference: where to look when substituting vars.
# It requires kustomize v2.1.0 or newer to work properly.
namespace:
  - kind: ValidatingWebhookConfiguration
    group: admissionregistration.k8s.io
    path: webhooks/clientConfig/service/namespace
    create: true

varReference:
  - path: metadata/annotations
12 changes: 12 additions & 0 deletions manifests/v2/base/webhook/patch.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# JSON patch for the generated ValidatingWebhookConfiguration.
# Points each webhook's clientConfig at the training-operator-v2 Service.
# NOTE(review): indices 0-2 assume manifests.yaml defines exactly three
# webhooks - re-check after regenerating it.
- op: replace
  path: /webhooks/0/clientConfig/service/name
  value: training-operator-v2
- op: replace
  path: /webhooks/1/clientConfig/service/name
  value: training-operator-v2
- op: replace
  path: /webhooks/2/clientConfig/service/name
  value: training-operator-v2
# Same name as webhookConfigurationName in cmd/training-operator.v2alpha1/main.go.
- op: replace
  path: /metadata/name
  value: validator.training-operator-v2.kubeflow.org
15 changes: 15 additions & 0 deletions manifests/v2/overlays/standalone/kustomization.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Standalone install: the kubeflow-system Namespace, the V2 base and the JobSet v0.6.0 release manifests.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
  - ../../base
  # TODO (andreyvelich): JobSet should support kubeflow-system namespace.
  - https://github.com/kubernetes-sigs/jobset/releases/download/v0.6.0/manifests.yaml
images:
  - name: kubeflow/training-operator-v2
    newTag: latest
# Declares the Secret that the manager Deployment mounts as its `cert` volume.
# The name-suffix hash is disabled so the Secret keeps exactly this name.
secretGenerator:
  - name: training-operator-v2-webhook-cert
    namespace: kubeflow-system
    options:
      disableNameSuffixHash: true
4 changes: 4 additions & 0 deletions manifests/v2/overlays/standalone/namespace.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Namespace that the V2 base kustomization assigns to its resources.
apiVersion: v1
kind: Namespace
metadata:
  name: kubeflow-system
12 changes: 6 additions & 6 deletions pkg/cert/cert.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,19 +27,19 @@ import (

const (
certDir = "/tmp/k8s-webhook-server/serving-certs"
vwcName = "validator.training-operator.kubeflow.org"
caName = "training-operator-ca"
caOrganization = "training-operator"
defaultOperatorNamespace = "kubeflow"
)

type Config struct {
WebhookServiceName string
WebhookSecretName string
WebhookServiceName string
WebhookSecretName string
WebhookConfigurationName string
}

// +kubebuilder:rbac:groups="",resources=secrets,verbs=get;list;watch;update
// +kubebuilder:rbac:groups="admissionregistration.k8s.io",resources=validatingwebhookconfigurations,verbs=get;list;watch;update
//+kubebuilder:rbac:groups="",resources=secrets,verbs=get;list;watch;update
//+kubebuilder:rbac:groups="admissionregistration.k8s.io",resources=validatingwebhookconfigurations,verbs=get;list;watch;update

// ManageCerts creates all certs for webhooks.
func ManageCerts(mgr ctrl.Manager, cfg Config, setupFinished chan struct{}) error {
Expand All @@ -61,7 +61,7 @@ func ManageCerts(mgr ctrl.Manager, cfg Config, setupFinished chan struct{}) erro
IsReady: setupFinished,
Webhooks: []cert.WebhookInfo{{
Type: cert.Validating,
Name: vwcName,
Name: cfg.WebhookConfigurationName,
}},
// When training-operator is running in the leader election mode,
// we expect webhook server will run in primary and secondary instance
Expand Down
3 changes: 3 additions & 0 deletions pkg/controller.v2/trainjob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ func NewTrainJobReconciler(client client.Client, recorder record.EventRecorder)
}
}

//+kubebuilder:rbac:groups=kubeflow.org,resources=trainjobs,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=kubeflow.org,resources=trainjobs/status,verbs=get;update;patch

func (r *TrainJobReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
var trainJob kubeflowv2.TrainJob
if err := r.client.Get(ctx, req.NamespacedName, &trainJob); err != nil {
Expand Down
3 changes: 3 additions & 0 deletions pkg/runtime.v2/core/core.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ import (
runtime "github.com/kubeflow/training-operator/pkg/runtime.v2"
)

//+kubebuilder:rbac:groups=kubeflow.org,resources=trainingruntimes,verbs=get;list;watch
//+kubebuilder:rbac:groups=kubeflow.org,resources=clustertrainingruntimes,verbs=get;list;watch

func New(ctx context.Context, client client.Client, indexer client.FieldIndexer) (map[string]runtime.Runtime, error) {
registry := NewRuntimeRegistry()
runtimes := make(map[string]runtime.Runtime, len(registry))
Expand Down
2 changes: 2 additions & 0 deletions pkg/runtime.v2/framework/plugins/coscheduling/coscheduling.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ var (

const Name = "CoScheduling"

//+kubebuilder:rbac:groups=scheduling.x-k8s.io,resources=podgroups,verbs=get;list;watch;create

func New(ctx context.Context, c client.Client, indexer client.FieldIndexer) (framework.Plugin, error) {
if err := indexer.IndexField(ctx, &kubeflowv2.TrainingRuntime{}, TrainingRuntimeContainerRuntimeClassKey,
IndexTrainingRuntimeContainerRuntimeClass); err != nil {
Expand Down
2 changes: 2 additions & 0 deletions pkg/runtime.v2/framework/plugins/jobset/jobset.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ var _ framework.ComponentBuilderPlugin = (*JobSet)(nil)

const Name = "JobSet"

//+kubebuilder:rbac:groups=jobset.x-k8s.io,resources=jobsets,verbs=get;list;watch;create

func New(ctx context.Context, c client.Client, _ client.FieldIndexer) (framework.Plugin, error) {
return &JobSet{
client: c,
Expand Down
2 changes: 1 addition & 1 deletion test/integration/framework/framework.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ func (f *Framework) Init() *rest.Config {
f.testEnv = &envtest.Environment{
CRDDirectoryPaths: []string{filepath.Join("..", "..", "..", "manifests", "v2", "base", "crds")},
WebhookInstallOptions: envtest.WebhookInstallOptions{
Paths: []string{filepath.Join("..", "..", "..", "manifests", "v2", "base", "webhook")},
Paths: []string{filepath.Join("..", "..", "..", "manifests", "v2", "base", "webhook", "manifests.yaml")},
},
ErrorIfCRDPathMissing: true,
}
Expand Down

0 comments on commit fd4d102

Please sign in to comment.