apply k8s leader election fix locally

knative · Jun 24, 2020 · c8412c5 · c8412c5
1 parent abd45a1
commit c8412c5
Show file tree

Hide file tree

Showing 2 changed files with 351 additions and 1 deletion.
diff --git a/pkg/leaderelection/context.go b/pkg/leaderelection/context.go
@@ -19,6 +19,8 @@ package leaderelection
 import (
 	"context"
 
+	"knative.dev/eventing/pkg/leaderelection/k8s"
+
 	"go.uber.org/zap"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/watch"
@@ -124,7 +126,7 @@ func (b *standardBuilder) BuildElector(ctx context.Context, adapter Adapter) (El
 		logger.Fatalw("Error creating lock", zap.Error(err))
 	}
 
-	return leaderelection.NewLeaderElector(leaderelection.LeaderElectionConfig{
+	return k8s.NewLeaderElector(leaderelection.LeaderElectionConfig{
 		Lock:          rl,
 		LeaseDuration: b.lec.LeaseDuration,
 		RenewDeadline: b.lec.RenewDeadline,

diff --git a/pkg/leaderelection/k8s/leaderelection.go b/pkg/leaderelection/k8s/leaderelection.go
@@ -0,0 +1,348 @@
+/*
+Copyright 2020 The Knative Authors
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+// This file is a copy of this one: https://github.com/kubernetes/kubernetes/blob/master/staging/src/k8s.io/client-go/tools/leaderelection/leaderelection.go
+// with this fix applied: https://github.com/kubernetes/kubernetes/pull/91942/files
+
+// Package k8s implements leader election of a set of endpoints.
+// It uses an annotation in the endpoints object to store the record of the
+// election state. This implementation does not guarantee that only one
+// client is acting as a leader (a.k.a. fencing).
+//
+// A client only acts on timestamps captured locally to infer the state of the
+// leader election. The client does not consider timestamps in the leader
+// election record to be accurate because these timestamps may not have been
+// produced by a local clock. The implementation does not depend on their
+// accuracy and only uses their change to indicate that another client has
+// renewed the leader lease. Thus the implementation is tolerant to arbitrary
+// clock skew, but is not tolerant to arbitrary clock skew rate.
+//
+// However the level of tolerance to skew rate can be configured by setting
+// RenewDeadline and LeaseDuration appropriately. The tolerance expressed as a
+// maximum tolerated ratio of time passed on the fastest node to time passed on
+// the slowest node can be approximately achieved with a configuration that sets
+// the same ratio of LeaseDuration to RenewDeadline. For example if a user wanted
+// to tolerate some nodes progressing forward in time twice as fast as other nodes,
+// the user could set LeaseDuration to 60 seconds and RenewDeadline to 30 seconds.
+//
+// While not required, some method of clock synchronization between nodes in the
+// cluster is highly recommended. It's important to keep in mind when configuring
+// this client that the tolerance to skew rate varies inversely to master
+// availability.
+//
+// Larger clusters often have a more lenient SLA for API latency. This should be
+// taken into account when configuring the client. The rate of leader transitions
+// should be monitored and RetryPeriod and LeaseDuration should be increased
+// until the rate is stable and acceptably low. It's important to keep in mind
+// when configuring this client that the tolerance to API latency varies inversely
+// to master availability.
+//
+// DISCLAIMER: this is an alpha API. This library will likely change significantly
+// or even be removed entirely in subsequent releases. Depend on this API at
+// your own risk.
+package k8s
+
+import (
+	"bytes"
+	"context"
+	"fmt"
+	"time"
+
+	"k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/clock"
+	"k8s.io/apimachinery/pkg/util/runtime"
+	"k8s.io/apimachinery/pkg/util/wait"
+	"k8s.io/client-go/tools/leaderelection"
+	rl "k8s.io/client-go/tools/leaderelection/resourcelock"
+	"k8s.io/klog"
+)
+
+const (
+	// JitterFactor is the jitter factor handed to wait.JitterUntil while
+	// trying to acquire the lease. NewLeaderElector also uses it to require
+	// that RenewDeadline is greater than RetryPeriod*JitterFactor.
+	JitterFactor = 1.2
+)
+
+// NewLeaderElector validates lec and, when it is acceptable, returns a
+// LeaderElector that uses it together with the real wall clock.
+//
+// The checks run in a fixed order and the first failing one determines the
+// returned error:
+//   - LeaseDuration must be greater than RenewDeadline
+//   - RenewDeadline must be greater than RetryPeriod*JitterFactor
+//   - LeaseDuration, RenewDeadline and RetryPeriod must each be at least 1
+//   - the OnStartedLeading and OnStoppedLeading callbacks must be set
+//   - Lock must be set
+func NewLeaderElector(lec leaderelection.LeaderElectionConfig) (*LeaderElector, error) {
+	switch {
+	case lec.LeaseDuration <= lec.RenewDeadline:
+		return nil, fmt.Errorf("leaseDuration must be greater than renewDeadline")
+	case lec.RenewDeadline <= time.Duration(JitterFactor*float64(lec.RetryPeriod)):
+		return nil, fmt.Errorf("renewDeadline must be greater than retryPeriod*JitterFactor")
+	case lec.LeaseDuration < 1:
+		return nil, fmt.Errorf("leaseDuration must be greater than zero")
+	case lec.RenewDeadline < 1:
+		return nil, fmt.Errorf("renewDeadline must be greater than zero")
+	case lec.RetryPeriod < 1:
+		return nil, fmt.Errorf("retryPeriod must be greater than zero")
+	case lec.Callbacks.OnStartedLeading == nil:
+		return nil, fmt.Errorf("OnStartedLeading callback must not be nil")
+	case lec.Callbacks.OnStoppedLeading == nil:
+		return nil, fmt.Errorf("OnStoppedLeading callback must not be nil")
+	case lec.Lock == nil:
+		return nil, fmt.Errorf("Lock must not be nil.")
+	}
+
+	return &LeaderElector{
+		config: lec,
+		clock:  clock.RealClock{},
+	}, nil
+}
+
+// LeaderCallbacks are callbacks that are triggered during certain
+// lifecycle events of the LeaderElector. These are invoked asynchronously.
+//
+// NOTE(review): nothing in the visible part of this file references this
+// type; LeaderElector stores the upstream leaderelection.LeaderElectionConfig
+// and reads the callbacks from its Callbacks field. Confirm whether this
+// local copy is still needed.
+//
+// possible future callbacks:
+//  * OnChallenge()
+type LeaderCallbacks struct {
+	// OnStartedLeading is called when a LeaderElector client starts leading
+	OnStartedLeading func(context.Context)
+	// OnStoppedLeading is called when a LeaderElector client stops leading
+	OnStoppedLeading func()
+	// OnNewLeader is called when the client observes a leader that is
+	// not the previously observed leader. This includes the first observed
+	// leader when the client starts.
+	OnNewLeader func(identity string)
+}
+
+// LeaderElector is a leader election client. Instances are created by
+// NewLeaderElector, which validates the configuration and installs the
+// real clock.
+type LeaderElector struct {
+	// config is the validated configuration supplied to NewLeaderElector:
+	// the lock, the timing parameters and the callbacks.
+	config leaderelection.LeaderElectionConfig
+	// internal bookkeeping
+	// observedRecord is the most recent election record this client read
+	// from, or successfully wrote to, the lock.
+	observedRecord rl.LeaderElectionRecord
+	// observedRawRecord is the raw form of the last record read from the
+	// lock; tryAcquireOrRenew compares it byte-wise to detect changes.
+	observedRawRecord []byte
+	// observedTime is the local clock reading taken when observedRecord
+	// was last updated.
+	observedTime time.Time
+	// used to implement OnNewLeader(), may lag slightly from the
+	// value observedRecord.HolderIdentity if the transition has
+	// not yet been reported.
+	reportedLeader string
+
+	// clock is wrapper around time to allow for less flaky testing
+	clock clock.Clock
+
+	// name is the name of the resource lock for debugging
+	// NOTE(review): not set or read anywhere in the visible code; Check
+	// reports le.config.Name instead.
+	name string
+}
+
+// Run starts the leader election loop. It blocks until ctx is done or the
+// lease can no longer be renewed.
+//
+// The deferred function runs on every return path, so OnStoppedLeading is
+// invoked even when the lease was never acquired.
+func (le *LeaderElector) Run(ctx context.Context) {
+	defer func() {
+		runtime.HandleCrash()
+		le.config.Callbacks.OnStoppedLeading()
+	}()
+	if !le.acquire(ctx) {
+		return // ctx signalled done
+	}
+	// The derived context is what OnStartedLeading receives; the deferred
+	// cancel ends it as soon as renew returns, i.e. when leadership is over.
+	ctx, cancel := context.WithCancel(ctx)
+	defer cancel()
+	// The callback runs in its own goroutine so that renewing the lease is
+	// not blocked by the leader's workload.
+	go le.config.Callbacks.OnStartedLeading(ctx)
+	le.renew(ctx)
+}
+
+// RunOrDie builds a LeaderElector from lec and runs it with ctx. It panics
+// with the validation error when NewLeaderElector rejects the config.
+func RunOrDie(ctx context.Context, lec leaderelection.LeaderElectionConfig) {
+	elector, validationErr := NewLeaderElector(lec)
+	if validationErr != nil {
+		panic(validationErr)
+	}
+	elector.Run(ctx)
+}
+
+// GetLeader reports the holder identity stored in the last observed election
+// record. The result is the empty string while no leader has been observed.
+func (le *LeaderElector) GetLeader() string {
+	holder := le.observedRecord.HolderIdentity
+	return holder
+}
+
+// IsLeader reports whether the holder in the last observed election record is
+// this client, i.e. matches the identity of the configured lock.
+func (le *LeaderElector) IsLeader() bool {
+	observedHolder := le.observedRecord.HolderIdentity
+	return observedHolder == le.config.Lock.Identity()
+}
+
+// acquire loops calling tryAcquireOrRenew and returns true immediately when tryAcquireOrRenew succeeds.
+// Returns false if ctx signals done.
+//
+// Every attempt, successful or not, is followed by maybeReportTransition so
+// that a newly observed holder is reported.
+func (le *LeaderElector) acquire(ctx context.Context) bool {
+	// Derive a private context so that a successful attempt can stop the
+	// retry loop below without cancelling the caller's ctx.
+	ctx, cancel := context.WithCancel(ctx)
+	defer cancel()
+	succeeded := false
+	desc := le.config.Lock.Describe()
+	klog.Infof("attempting to acquire leader lease  %v...", desc)
+	// Retry with period RetryPeriod and jitter factor JitterFactor until the
+	// derived context is done: either the caller cancelled or an attempt
+	// succeeded and called cancel().
+	wait.JitterUntil(func() {
+		succeeded = le.tryAcquireOrRenew()
+		le.maybeReportTransition()
+		if !succeeded {
+			klog.V(4).Infof("failed to acquire lease %v", desc)
+			return
+		}
+		le.config.Lock.RecordEvent("became leader")
+		klog.Infof("successfully acquired lease %v", desc)
+		cancel()
+	}, le.config.RetryPeriod, JitterFactor, true, ctx.Done())
+	// succeeded holds the outcome of the last attempt, so it is true only
+	// when that attempt acquired the lease.
+	return succeeded
+}
+
+// renew loops calling tryAcquireOrRenew and returns immediately when tryAcquireOrRenew fails or ctx signals done.
+//
+// Each round gets RenewDeadline to succeed: within that window
+// tryAcquireOrRenew is retried every RetryPeriod. Each attempt runs in its
+// own goroutine so that a call that hangs cannot keep this loop from
+// noticing that the deadline has passed.
+func (le *LeaderElector) renew(ctx context.Context) {
+	// Private context: cancel() below stops the outer loop after a failed
+	// round without touching the caller's ctx.
+	ctx, cancel := context.WithCancel(ctx)
+	defer cancel()
+	wait.Until(func() {
+		// Deadline for this renewal round.
+		timeoutCtx, timeoutCancel := context.WithTimeout(ctx, le.config.RenewDeadline)
+		defer timeoutCancel()
+		err := wait.PollImmediateUntil(le.config.RetryPeriod, func() (bool, error) {
+			// Buffered so the goroutine can deliver its result and exit
+			// even if the select below has already returned on timeout.
+			done := make(chan bool, 1)
+			go func() {
+				defer close(done)
+				done <- le.tryAcquireOrRenew()
+			}()
+
+			// NOTE(review): on timeout the attempt is abandoned, not
+			// stopped; it may still update the observed* fields while
+			// this goroutine continues — confirm this is acceptable.
+			select {
+			case <-timeoutCtx.Done():
+				return false, fmt.Errorf("failed to tryAcquireOrRenew %s", timeoutCtx.Err())
+			case result := <-done:
+				// false keeps polling; true ends the round successfully.
+				return result, nil
+			}
+		}, timeoutCtx.Done())
+
+		le.maybeReportTransition()
+		desc := le.config.Lock.Describe()
+		if err == nil {
+			klog.V(5).Infof("successfully renewed lease %v", desc)
+			return
+		}
+		// The round failed or timed out: record it and stop renewing.
+		le.config.Lock.RecordEvent("stopped leading")
+		klog.Infof("failed to renew lease %v: %v", desc, err)
+		cancel()
+	}, le.config.RetryPeriod, ctx.Done())
+
+	// if we hold the lease, give it up
+	if le.config.ReleaseOnCancel {
+		le.release()
+	}
+}
+
+// release gives up the lease when this client is the last observed holder.
+// It returns true when there was nothing to release or the lock was updated,
+// and false when updating the lock failed.
+func (le *LeaderElector) release() bool {
+	if !le.IsLeader() {
+		return true
+	}
+
+	// The record written back carries no holder identity; only the
+	// transition count and lease duration of the observed record are kept.
+	released := rl.LeaderElectionRecord{
+		LeaseDurationSeconds: le.observedRecord.LeaseDurationSeconds,
+		LeaderTransitions:    le.observedRecord.LeaderTransitions,
+	}
+	err := le.config.Lock.Update(released)
+	if err != nil {
+		klog.Errorf("Failed to release lock: %v", err)
+		return false
+	}
+
+	le.observedTime = le.clock.Now()
+	le.observedRecord = released
+	return true
+}
+
+// tryAcquireOrRenew tries to acquire a leader lease if it is not already acquired,
+// else it tries to renew the lease if it has already been acquired. Returns true
+// on success else returns false.
+//
+// All failures (reading, creating or updating the lock, or the lease still
+// being held by another client) are logged and reported as false.
+func (le *LeaderElector) tryAcquireOrRenew() bool {
+	// Take "now" from the elector's clock rather than the wall clock: it is
+	// compared with observedTime below, which also comes from le.clock, so
+	// both sides of the expiry check must share one time source.
+	now := metav1.NewTime(le.clock.Now())
+	leaderElectionRecord := rl.LeaderElectionRecord{
+		HolderIdentity:       le.config.Lock.Identity(),
+		LeaseDurationSeconds: int(le.config.LeaseDuration / time.Second),
+		RenewTime:            now,
+		AcquireTime:          now,
+	}
+
+	// 1. obtain or create the ElectionRecord
+	oldLeaderElectionRecord, oldLeaderElectionRawRecord, err := le.config.Lock.Get()
+	if err != nil {
+		if !errors.IsNotFound(err) {
+			klog.Errorf("error retrieving resource lock %v: %v", le.config.Lock.Describe(), err)
+			return false
+		}
+		// No record exists yet: creating it makes this client the holder.
+		if err = le.config.Lock.Create(leaderElectionRecord); err != nil {
+			klog.Errorf("error initially creating leader election record: %v", err)
+			return false
+		}
+		le.observedRecord = leaderElectionRecord
+		le.observedTime = le.clock.Now()
+		return true
+	}
+
+	// 2. Record obtained, check the Identity & Time
+	// Only a change in the raw record resets observedTime, so the expiry
+	// check below measures how long the record has been unchanged locally.
+	if !bytes.Equal(le.observedRawRecord, oldLeaderElectionRawRecord) {
+		le.observedRecord = *oldLeaderElectionRecord
+		le.observedRawRecord = oldLeaderElectionRawRecord
+		le.observedTime = le.clock.Now()
+	}
+	if len(oldLeaderElectionRecord.HolderIdentity) > 0 &&
+		le.observedTime.Add(le.config.LeaseDuration).After(now.Time) &&
+		!le.IsLeader() {
+		klog.V(4).Infof("lock is held by %v and has not yet expired", oldLeaderElectionRecord.HolderIdentity)
+		return false
+	}
+
+	// 3. We're going to try to update. The leaderElectionRecord is set to its default
+	// here. Let's correct it before updating.
+	if le.IsLeader() {
+		// Renewal: keep the original acquire time and transition count.
+		leaderElectionRecord.AcquireTime = oldLeaderElectionRecord.AcquireTime
+		leaderElectionRecord.LeaderTransitions = oldLeaderElectionRecord.LeaderTransitions
+	} else {
+		// Takeover: count a new leader transition.
+		leaderElectionRecord.LeaderTransitions = oldLeaderElectionRecord.LeaderTransitions + 1
+	}
+
+	// update the lock itself
+	if err = le.config.Lock.Update(leaderElectionRecord); err != nil {
+		klog.Errorf("Failed to update lock: %v", err)
+		return false
+	}
+
+	le.observedRecord = leaderElectionRecord
+	le.observedTime = le.clock.Now()
+	return true
+}
+
+// maybeReportTransition records the currently observed holder as the reported
+// leader when it differs from the one reported last, and then invokes the
+// optional OnNewLeader callback with it in a new goroutine.
+func (le *LeaderElector) maybeReportTransition() {
+	current := le.observedRecord.HolderIdentity
+	if current == le.reportedLeader {
+		return
+	}
+	le.reportedLeader = current
+	if onNewLeader := le.config.Callbacks.OnNewLeader; onNewLeader != nil {
+		go onNewLeader(current)
+	}
+}
+
+// Check reports an error when this client is the observed leader but its last
+// observation is older than LeaseDuration plus maxTolerableExpiredLease.
+// It returns nil otherwise, including whenever this client is not the leader.
+func (le *LeaderElector) Check(maxTolerableExpiredLease time.Duration) error {
+	// A hot standby is never considered unhealthy here.
+	if !le.IsLeader() {
+		return nil
+	}
+
+	// A leader that failed to renew should already have stopped; if it is
+	// still around this long after the lease ran out (e.g. because of a
+	// deadlock, see #70819), start reporting it as unhealthy.
+	sinceObserved := le.clock.Since(le.observedTime)
+	tolerated := le.config.LeaseDuration + maxTolerableExpiredLease
+	if sinceObserved <= tolerated {
+		return nil
+	}
+	return fmt.Errorf("failed election to renew leadership on lease %s", le.config.Name)
+}