Skip to content

Commit

Permalink
feat(konnect): propagate 429's retry-after header value
Browse files Browse the repository at this point in the history
  • Loading branch information
czeslavo committed May 15, 2023
1 parent 7b70f70 commit 9c4f619
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 19 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ Adding a new version? You'll need three changes:
- Configuration updates to Konnect Runtime Group's Admin API now respect a backoff
strategy that prevents KIC from exceeding API calls limits.
[#3989](https://github.com/Kong/kubernetes-ingress-controller/pull/3989)
[#4015](https://github.com/Kong/kubernetes-ingress-controller/pull/4015)

### Fixed

Expand Down
66 changes: 47 additions & 19 deletions internal/adminapi/backoff_strategy_konnect.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package adminapi
import (
"bytes"
"fmt"
"net/http"
"strings"
"sync"
"time"
Expand Down Expand Up @@ -88,28 +89,25 @@ func (s *KonnectBackoffStrategy) RegisterUpdateFailure(err error, configHash []b
s.lock.Lock()
defer s.lock.Unlock()

if errs := deckerrors.ExtractAPIErrors(err); len(errs) > 0 {
_, hasClientError := lo.Find(errs, func(item *kong.APIError) bool {
return item.Code() >= 400 && item.Code() < 500
})

// We store the failed configuration hash only in case we receive a client error code [400, 500).
// It's because we don't want to repeatedly try sending the config that we know is faulty on our side.
// It only makes sense to retry when the config changes.
if hasClientError {
s.lastFailedConfigHash = configHash
} else {
s.lastFailedConfigHash = nil
}
apiErrs := deckerrors.ExtractAPIErrors(err)
tooManyRequestsErr, isTooManyRequests := lo.Find(apiErrs, func(err *kong.APIError) bool {
return err.Code() == http.StatusTooManyRequests
})
if isTooManyRequests {
s.handleTooManyRequests(tooManyRequestsErr)
return
}

// Backoff.Duration() call returns backoff time we need to wait until next attempt.
// It also increments the internal attempts counter so the next time we call it, the
// duration will be multiplied accordingly.
timeLeft := s.b.Duration()
isClientError := lo.ContainsBy(apiErrs, func(err *kong.APIError) bool {
return err.Code() >= 400 && err.Code() < 500
})
if isClientError {
s.handleGenericClientError(configHash)
return
}

// We're storing the exact point in time after which we'll be allowed to perform the next update attempt.
s.nextAttempt = s.clock.Now().Add(timeLeft)
// If it's neither of the specific cases above, we just increment the standard exponential backoff.
s.incrementExponentialBackoff()
}

func (s *KonnectBackoffStrategy) RegisterUpdateSuccess() {
Expand All @@ -121,6 +119,36 @@ func (s *KonnectBackoffStrategy) RegisterUpdateSuccess() {
s.lastFailedConfigHash = nil
}

func (s *KonnectBackoffStrategy) handleTooManyRequests(tooManyRequestsErr *kong.APIError) {
if details, ok := tooManyRequestsErr.Details().(kong.ErrTooManyRequestsDetails); ok && details.RetryAfter != 0 {
// In case we get 429 with details embedded, we just retry after the suggested Retry-After time.
s.nextAttempt = s.clock.Now().Add(details.RetryAfter)
} else {
// In case the details for 429 are missing, we retry after the standard exponential backoff time.
s.incrementExponentialBackoff()
}

// Despite whether we've got details or not, we prune the last failed config hash to not block update after the
// period we set up above.
s.lastFailedConfigHash = nil
}

func (s *KonnectBackoffStrategy) handleGenericClientError(configHash []byte) {
// We increment the standard exponential backoff time and store the faulty config hash to prevent pushing it again.
s.incrementExponentialBackoff()
s.lastFailedConfigHash = configHash
}

func (s *KonnectBackoffStrategy) incrementExponentialBackoff() {
// Backoff.Duration() call returns backoff time we need to wait until next attempt.
// It also increments the internal attempts counter so the next time we call it, the
// duration will be multiplied accordingly.
timeLeft := s.b.Duration()

// We're storing the exact point in time after which we'll be allowed to perform the next update attempt.
s.nextAttempt = s.clock.Now().Add(timeLeft)
}

func (s *KonnectBackoffStrategy) whyCannotUpdate(
timeLeft time.Duration,
isTheSameFaultyConfig bool,
Expand Down
42 changes: 42 additions & 0 deletions internal/adminapi/backoff_strategy_konnect_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,4 +120,46 @@ func TestKonnectBackoffStrategy(t *testing.T) {
assert.True(t, canUpdate, "should allow update for another hash")
assert.Empty(t, whyNot)
})

t.Run("too many requests code with no details embedded", func(t *testing.T) {
strategy := adminapi.NewKonnectBackoffStrategy(clock)
strategy.RegisterUpdateFailure(kong.NewAPIError(http.StatusTooManyRequests, ""), hashOne)

canUpdate, whyNot := strategy.CanUpdate(hashOne)
assert.False(t, canUpdate, "shouldn't allow update due to a standard backoff time")
assert.Equal(t, "next attempt allowed in 3s", whyNot)

clock.MoveBy(time.Second * 5)

canUpdate, whyNot = strategy.CanUpdate(hashOne)
assert.True(t, canUpdate, "should allow update for the same hash after standard backoff time")
assert.Empty(t, whyNot)

canUpdate, whyNot = strategy.CanUpdate(hashTwo)
assert.True(t, canUpdate, "should allow update for another hash")
assert.Empty(t, whyNot)
})

t.Run("too many requests code with details embedded", func(t *testing.T) {
strategy := adminapi.NewKonnectBackoffStrategy(clock)
tooManyRequestsAPIErr := kong.NewAPIError(http.StatusTooManyRequests, "")
const retryAfter = time.Minute
tooManyRequestsAPIErr.SetDetails(kong.ErrTooManyRequestsDetails{
RetryAfter: retryAfter,
})
strategy.RegisterUpdateFailure(tooManyRequestsAPIErr, hashOne)

canUpdate, whyNot := strategy.CanUpdate(hashOne)
assert.False(t, canUpdate, "shouldn't allow update due to the suggested retry-after backoff")
assert.Equal(t, "next attempt allowed in 1m0s", whyNot)

canUpdate, whyNot = strategy.CanUpdate(hashTwo)
assert.False(t, canUpdate, "shouldn't allow update due to the suggested retry-after backoff (different hash)")
assert.Equal(t, "next attempt allowed in 1m0s", whyNot)

clock.MoveBy(time.Minute + time.Second)
canUpdate, whyNot = strategy.CanUpdate(hashOne)
assert.True(t, canUpdate, "should allow update after the suggested retry-after backoff")
assert.Empty(t, whyNot)
})
}

0 comments on commit 9c4f619

Please sign in to comment.