feat(konnect): propagate 429's retry-after header value

Kong · May 15, 2023 · 9c4f619 · 9c4f619
1 parent 7b70f70
commit 9c4f619
Show file tree

Hide file tree

Showing 3 changed files with 90 additions and 19 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -109,6 +109,7 @@ Adding a new version? You'll need three changes:
 - Configuration updates to Konnect Runtime Group's Admin API now respect a backoff
   strategy that prevents KIC from exceeding API calls limits.
   [#3989](https://github.com/Kong/kubernetes-ingress-controller/pull/3989)
+  [#4015](https://github.com/Kong/kubernetes-ingress-controller/pull/4015)
 
 ### Fixed
 

diff --git a/internal/adminapi/backoff_strategy_konnect.go b/internal/adminapi/backoff_strategy_konnect.go
@@ -3,6 +3,7 @@ package adminapi
 import (
 	"bytes"
 	"fmt"
+	"net/http"
 	"strings"
 	"sync"
 	"time"
@@ -88,28 +89,25 @@ func (s *KonnectBackoffStrategy) RegisterUpdateFailure(err error, configHash []b
 	s.lock.Lock()
 	defer s.lock.Unlock()
 
-	if errs := deckerrors.ExtractAPIErrors(err); len(errs) > 0 {
-		_, hasClientError := lo.Find(errs, func(item *kong.APIError) bool {
-			return item.Code() >= 400 && item.Code() < 500
-		})
-
-		// We store the failed configuration hash only in case we receive a client error code [400, 500).
-		// It's because we don't want to repeatedly try sending the config that we know is faulty on our side.
-		// It only makes sense to retry when the config changes.
-		if hasClientError {
-			s.lastFailedConfigHash = configHash
-		} else {
-			s.lastFailedConfigHash = nil
-		}
+	apiErrs := deckerrors.ExtractAPIErrors(err)
+	tooManyRequestsErr, isTooManyRequests := lo.Find(apiErrs, func(err *kong.APIError) bool {
+		return err.Code() == http.StatusTooManyRequests
+	})
+	if isTooManyRequests {
+		s.handleTooManyRequests(tooManyRequestsErr)
+		return
 	}
 
-	// Backoff.Duration() call returns backoff time we need to wait until next attempt.
-	// It also increments the internal attempts counter so the next time we call it, the
-	// duration will be multiplied accordingly.
-	timeLeft := s.b.Duration()
+	isClientError := lo.ContainsBy(apiErrs, func(err *kong.APIError) bool {
+		return err.Code() >= 400 && err.Code() < 500
+	})
+	if isClientError {
+		s.handleGenericClientError(configHash)
+		return
+	}
 
-	// We're storing the exact point in time after which we'll be allowed to perform the next update attempt.
-	s.nextAttempt = s.clock.Now().Add(timeLeft)
+	// If it's neither of the specific cases above, we just increment the standard exponential backoff.
+	s.incrementExponentialBackoff()
 }
 
 func (s *KonnectBackoffStrategy) RegisterUpdateSuccess() {
@@ -121,6 +119,36 @@ func (s *KonnectBackoffStrategy) RegisterUpdateSuccess() {
 	s.lastFailedConfigHash = nil
 }
 
+func (s *KonnectBackoffStrategy) handleTooManyRequests(tooManyRequestsErr *kong.APIError) {
+	if details, ok := tooManyRequestsErr.Details().(kong.ErrTooManyRequestsDetails); ok && details.RetryAfter != 0 {
+		// In case we get 429 with details embedded, we just retry after the suggested Retry-After time.
+		s.nextAttempt = s.clock.Now().Add(details.RetryAfter)
+	} else {
+		// In case the details for 429 are missing, we retry after the standard exponential backoff time.
+		s.incrementExponentialBackoff()
+	}
+
+	// Despite whether we've got details or not, we prune the last failed config hash to not block update after the
+	// period we set up above.
+	s.lastFailedConfigHash = nil
+}
+
+func (s *KonnectBackoffStrategy) handleGenericClientError(configHash []byte) {
+	// We increment the standard exponential backoff time and store the faulty config hash to prevent pushing it again.
+	s.incrementExponentialBackoff()
+	s.lastFailedConfigHash = configHash
+}
+
+func (s *KonnectBackoffStrategy) incrementExponentialBackoff() {
+	// Backoff.Duration() call returns backoff time we need to wait until next attempt.
+	// It also increments the internal attempts counter so the next time we call it, the
+	// duration will be multiplied accordingly.
+	timeLeft := s.b.Duration()
+
+	// We're storing the exact point in time after which we'll be allowed to perform the next update attempt.
+	s.nextAttempt = s.clock.Now().Add(timeLeft)
+}
+
 func (s *KonnectBackoffStrategy) whyCannotUpdate(
 	timeLeft time.Duration,
 	isTheSameFaultyConfig bool,

diff --git a/internal/adminapi/backoff_strategy_konnect_test.go b/internal/adminapi/backoff_strategy_konnect_test.go
@@ -120,4 +120,46 @@ func TestKonnectBackoffStrategy(t *testing.T) {
 		assert.True(t, canUpdate, "should allow update for another hash")
 		assert.Empty(t, whyNot)
 	})
+
+	t.Run("too many requests code with no details embedded", func(t *testing.T) {
+		strategy := adminapi.NewKonnectBackoffStrategy(clock)
+		strategy.RegisterUpdateFailure(kong.NewAPIError(http.StatusTooManyRequests, ""), hashOne)
+
+		canUpdate, whyNot := strategy.CanUpdate(hashOne)
+		assert.False(t, canUpdate, "shouldn't allow update due to a standard backoff time")
+		assert.Equal(t, "next attempt allowed in 3s", whyNot)
+
+		clock.MoveBy(time.Second * 5)
+
+		canUpdate, whyNot = strategy.CanUpdate(hashOne)
+		assert.True(t, canUpdate, "should allow update for the same hash after standard backoff time")
+		assert.Empty(t, whyNot)
+
+		canUpdate, whyNot = strategy.CanUpdate(hashTwo)
+		assert.True(t, canUpdate, "should allow update for another hash")
+		assert.Empty(t, whyNot)
+	})
+
+	t.Run("too many requests code with details embedded", func(t *testing.T) {
+		strategy := adminapi.NewKonnectBackoffStrategy(clock)
+		tooManyRequestsAPIErr := kong.NewAPIError(http.StatusTooManyRequests, "")
+		const retryAfter = time.Minute
+		tooManyRequestsAPIErr.SetDetails(kong.ErrTooManyRequestsDetails{
+			RetryAfter: retryAfter,
+		})
+		strategy.RegisterUpdateFailure(tooManyRequestsAPIErr, hashOne)
+
+		canUpdate, whyNot := strategy.CanUpdate(hashOne)
+		assert.False(t, canUpdate, "shouldn't allow update due to the suggested retry-after backoff")
+		assert.Equal(t, "next attempt allowed in 1m0s", whyNot)
+
+		canUpdate, whyNot = strategy.CanUpdate(hashTwo)
+		assert.False(t, canUpdate, "shouldn't allow update due to the suggested retry-after backoff (different hash)")
+		assert.Equal(t, "next attempt allowed in 1m0s", whyNot)
+
+		clock.MoveBy(time.Minute + time.Second)
+		canUpdate, whyNot = strategy.CanUpdate(hashOne)
+		assert.True(t, canUpdate, "should allow update after the suggested retry-after backoff")
+		assert.Empty(t, whyNot)
+	})
 }