diff --git a/cmd/katib-controller/v1beta1/main.go b/cmd/katib-controller/v1beta1/main.go index b0177b7a95d..f83bb088cd3 100644 --- a/cmd/katib-controller/v1beta1/main.go +++ b/cmd/katib-controller/v1beta1/main.go @@ -48,6 +48,8 @@ func main() { var injectSecurityContext bool var enableGRPCProbeInSuggestion bool var trialResources trialutil.GvkListFlag + var enableLeaderElection bool + var leaderElectionID string flag.StringVar(&experimentSuggestionName, "experiment-suggestion-name", "default", "The implementation of suggestion interface in experiment controller (default)") @@ -56,6 +58,9 @@ func main() { flag.BoolVar(&enableGRPCProbeInSuggestion, "enable-grpc-probe-in-suggestion", true, "enable grpc probe in suggestions") flag.Var(&trialResources, "trial-resources", "The list of resources that can be used as trial template, in the form: Kind.version.group (e.g. TFJob.v1.kubeflow.org)") flag.IntVar(&webhookPort, "webhook-port", 8443, "The port number to be used for admission webhook server.") + // For leader election + flag.BoolVar(&enableLeaderElection, "enable-leader-election", false, "Enable leader election for katib-controller. Enabling this will ensure there is only one active katib-controller.") + flag.StringVar(&leaderElectionID, "leader-election-id", "3fbc96e9.katib.kubeflow.org", "The ID for leader election.") // TODO (andreyvelich): Currently it is not possible to set different webhook service name. // flag.StringVar(&serviceName, "webhook-service-name", "katib-controller", "The service name which will be used in webhook") @@ -95,6 +100,8 @@ func main() { // Create a new katib controller to provide shared dependencies and start components mgr, err := manager.New(cfg, manager.Options{ MetricsBindAddress: metricsAddr, + LeaderElection: enableLeaderElection, + LeaderElectionID: leaderElectionID, }) if err != nil { log.Error(err, "Failed to create the manager") diff --git a/docs/developer-guide.md b/docs/developer-guide.md index 7e637e9df89..73bb2f8b89b 100644 --- a/docs/developer-guide.md +++ b/docs/developer-guide.md @@ -56,14 +56,16 @@ make generate Below is a list of command-line flags accepted by Katib controller: -| Name | Type | Default | Description | -| ------------------------------- | ------------------------- | --------- | ---------------------------------------------------------------------------------------------------------------------- | -| enable-grpc-probe-in-suggestion | bool | true | Enable grpc probe in suggestions | -| experiment-suggestion-name | string | "default" | The implementation of suggestion interface in experiment controller | -| metrics-addr | string | ":8080" | The address the metric endpoint binds to | -| trial-resources | []schema.GroupVersionKind | null | The list of resources that can be used as trial template, in the form: Kind.version.group (e.g. TFJob.v1.kubeflow.org) | -| webhook-inject-securitycontext | bool | false | Inject the securityContext of container[0] in the sidecar | -| webhook-port | int | 8443 | The port number to be used for admission webhook server | +| Name | Type | Default | Description | +| ------------------------------- | ------------------------- | ----------------------------- | ---------------------------------------------------------------------------------------------------------------------- | +| enable-grpc-probe-in-suggestion | bool | true | Enable grpc probe in suggestions | +| experiment-suggestion-name | string | "default" | The implementation of suggestion interface in experiment controller | +| metrics-addr | string | ":8080" | The address the metric endpoint binds to | +| trial-resources | []schema.GroupVersionKind | null | The list of resources that can be used as trial template, in the form: Kind.version.group (e.g. TFJob.v1.kubeflow.org) | +| webhook-inject-securitycontext | bool | false | Inject the securityContext of container[0] in the sidecar | +| webhook-port | int | 8443 | The port number to be used for admission webhook server | +| enable-leader-election | bool | false | Enable leader election for katib-controller. Enabling this will ensure there is only one active katib-controller. | +| leader-election-id | string | "3fbc96e9.katib.kubeflow.org" | The ID for leader election. | ## Workflow design diff --git a/manifests/v1beta1/installs/katib-leader-election/kustomization.yaml b/manifests/v1beta1/installs/katib-leader-election/kustomization.yaml new file mode 100644 index 00000000000..d0bfa3dc0bf --- /dev/null +++ b/manifests/v1beta1/installs/katib-leader-election/kustomization.yaml @@ -0,0 +1,17 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: kubeflow +resources: + - ../katib-standalone + # rbac for leader-election + - leader-election-rbac.yaml +replicas: + - name: katib-controller + count: 2 +patchesJson6902: + - target: + group: apps + version: v1 + kind: Deployment + name: katib-controller + path: ./patches/controller.yaml diff --git a/manifests/v1beta1/installs/katib-leader-election/leader-election-rbac.yaml b/manifests/v1beta1/installs/katib-leader-election/leader-election-rbac.yaml new file mode 100644 index 00000000000..a0323a0116b --- /dev/null +++ b/manifests/v1beta1/installs/katib-leader-election/leader-election-rbac.yaml @@ -0,0 +1,26 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: leader-election + namespace: kubeflow +rules: + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - "*" +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: leader-election + namespace: kubeflow +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: leader-election +subjects: + - kind: ServiceAccount + name: katib-controller + namespace: kubeflow diff --git a/manifests/v1beta1/installs/katib-leader-election/patches/controller.yaml b/manifests/v1beta1/installs/katib-leader-election/patches/controller.yaml new file mode 100644 index 00000000000..9e2c9c1df6f --- /dev/null +++ b/manifests/v1beta1/installs/katib-leader-election/patches/controller.yaml @@ -0,0 +1,3 @@ +- op: add + path: /spec/template/spec/containers/0/args/- + value: "--enable-leader-election" diff --git a/test/e2e/v1beta1/scripts/setup-katib.sh b/test/e2e/v1beta1/scripts/setup-katib.sh index f4789164de4..42c94d3ce7d 100755 --- a/test/e2e/v1beta1/scripts/setup-katib.sh +++ b/test/e2e/v1beta1/scripts/setup-katib.sh @@ -93,20 +93,9 @@ cd "${GOPATH}/src/github.com/kubeflow/katib" make deploy # Wait until all Katib pods is running. -TIMEOUT=120 -PODNUM=$(kubectl get deploy -n kubeflow | grep -v NAME | wc -l) -# 1 Pod for the cert-generator Job -PODNUM=$((PODNUM + 1)) -until kubectl get pods -n kubeflow | grep -E 'Running|Completed' | [[ $(wc -l) -eq $PODNUM ]]; do - echo Pod Status $(kubectl get pods -n kubeflow | grep "1/1" | wc -l)/$PODNUM - sleep 10 - TIMEOUT=$((TIMEOUT - 1)) - if [[ $TIMEOUT -eq 0 ]]; then - echo "NG" - kubectl get pods -n kubeflow - exit 1 - fi -done +TIMEOUT=120s +kubectl wait --for=condition=complete --timeout=${TIMEOUT} -l katib.kubeflow.org/component=cert-generator -n kubeflow job +kubectl wait --for=condition=ready --timeout=${TIMEOUT} -l "katib.kubeflow.org/component in (controller,db-manager,mysql,ui)" -n kubeflow pod echo "All Katib components are running." echo "Katib deployments"