Skip to content

Commit

Permalink
fix(k8s): automatic retry for failed API requests
Browse files Browse the repository at this point in the history
We now automatically retry failed Kubernetes API requests if the reason
for the failure matches certain conditions.

For example, timeouts or DNS-related errors will result in retries, but
not 404/not found errors (and so forth), which will be thrown without
retrying.

We can easily add more error codes and/or conditions to this logic if we
discover further error cases that should result in retries.
  • Loading branch information
thsig committed May 12, 2021
1 parent 9dd044a commit 423a7d7
Showing 1 changed file with 103 additions and 29 deletions.
132 changes: 103 additions & 29 deletions core/src/plugins/kubernetes/api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
// tslint:disable-next-line:no-unused
import { IncomingMessage } from "http"
import { Agent } from "https"
import httpStatusCodes from "http-status-codes"
import { ReadStream } from "tty"
import {
KubeConfig,
Expand Down Expand Up @@ -50,7 +51,7 @@ import {
} from "./types"
import { LogEntry } from "../../logger/log-entry"
import { kubectl } from "./kubectl"
import { urlJoin } from "../../util/string"
import { deline, urlJoin } from "../../util/string"
import { KubernetesProvider } from "./config"
import { StringMap } from "../../config/common"
import { PluginContext } from "../../plugin-context"
Expand Down Expand Up @@ -329,12 +330,14 @@ export class KubeApi {
// apply auth
await this.config.applyToRequest(requestOpts)

try {
log.silly(`${requestOpts.method.toUpperCase()} ${url}`)
return await request(requestOpts)
} catch (err) {
throw handleRequestPromiseError(err)
}
return await requestWithRetry(log, async () => {
try {
log.silly(`${requestOpts.method.toUpperCase()} ${url}`)
return await request(requestOpts)
} catch (err) {
throw handleRequestPromiseError(err)
}
})
}

/**
Expand Down Expand Up @@ -595,28 +598,30 @@ export class KubeApi {
target["defaultHeaders"] = { ...defaultHeaders, "content-type": "application/strategic-merge-patch+json" }
}

const output = target[name](...args)
target["defaultHeaders"] = defaultHeaders

if (typeof output.then === "function") {
return (
output
// return the result body directly if applicable
.then((res: any) => {
if (isPlainObject(res) && res.hasOwnProperty("body")) {
return res["body"]
} else {
return res
}
})
// the API errors are not properly formed Error objects
.catch((err: Error) => {
throw wrapError(err)
})
)
}

return output
return requestWithRetry(null, () => {
const output = target[name](...args)
target["defaultHeaders"] = defaultHeaders

if (typeof output.then === "function") {
return (
output
// return the result body directly if applicable
.then((res: any) => {
if (isPlainObject(res) && res.hasOwnProperty("body")) {
return res["body"]
} else {
return res
}
})
// the API errors are not properly formed Error objects
.catch((err: Error) => {
throw wrapError(err)
})
)
}

return output
})
}
},
})
Expand Down Expand Up @@ -861,3 +866,72 @@ function handleRequestPromiseError(err: Error) {
return wrapError(err)
}
}

/**
 * Helper function for retrying failed k8s API requests, using a linearly increasing backoff.
 *
 * Only retries the request when it fails with an error that matches certain status codes and/or error
 * message contents (see the `shouldRetry` helper for details).
 *
 * The rationale here is that some errors occur because of network issues, intermittent timeouts etc.
 * and should be retried automatically.
 *
 * The delay before the n-th retry (1-based) is `n * minTimeoutMs`, i.e. 2s, 4s, 6s, ... with the defaults.
 *
 * TODO: We don't have access to an enclosing log entry when this is called from the proxied API
 * methods (see the call to `Proxy` above), so the `log` argument is optional.
 *
 * @param log log entry used for debug output, or `null` to skip logging
 * @param req the request to perform; it is invoked once per attempt
 * @param opts `maxRetries`: number of retries allowed after the initial attempt (default 5, 0 disables
 *   retrying); `minTimeoutMs`: base delay in milliseconds between attempts (default 2000)
 * @returns the value resolved by the first successful attempt
 * @throws the error of the failing attempt, either immediately when it isn't retryable or once the
 *   retries are used up
 */
async function requestWithRetry<R>(
  log: LogEntry | null,
  req: () => Promise<R>,
  opts?: { maxRetries?: number; minTimeoutMs?: number }
): Promise<R> {
  // `??` rather than `||`, so that an explicit 0 is respected instead of being replaced by the default.
  const maxRetries = opts?.maxRetries ?? 5
  const minTimeoutMs = opts?.minTimeoutMs ?? 2000

  for (let usedRetries = 0; ; usedRetries++) {
    try {
      return await req()
    } catch (err) {
      if (!shouldRetry(err)) {
        // Not a transient failure (e.g. a 404), so retrying wouldn't help.
        throw err
      }

      if (usedRetries >= maxRetries) {
        if (log) {
          log.debug(`Kubernetes API: Maximum retry count exceeded, throwing error`)
        }
        throw err
      }

      const sleepMsec = minTimeoutMs + usedRetries * minTimeoutMs
      if (log) {
        // The counter is 1-based: the first retry is reported as #1.
        log.debug(
          deline`
            Kubernetes API: Request failed with error ${err.message}, sleeping for ${sleepMsec}ms and retrying
            (#${usedRetries + 1}/${maxRetries})
          `
        )
      }
      await sleep(sleepMsec)
    }
  }
}

/**
 * This helper determines whether an error thrown by a k8s API request should result in the request being retried.
 *
 * An error is considered retryable when its numeric `statusCode` is listed in `statusCodesForRetry`, or
 * when its `message` matches one of the patterns in `errorMessageRegexesForRetry`.
 *
 * Add more error codes / regexes / filters etc. here as needed.
 *
 * @param err the value caught from the failed request; may be anything, including `null` or `undefined`
 * @returns `true` if the request should be retried, `false` otherwise
 */
function shouldRetry(err: any): boolean {
  // Any value can be thrown, so guard against property access on null/undefined. Otherwise we'd
  // throw a TypeError from the caller's catch block and mask the original failure.
  if (err === null || err === undefined) {
    return false
  }

  const code = err.statusCode
  if (typeof code === "number" && statusCodesForRetry.includes(code)) {
    return true
  }

  // Only match against real strings, since `message` isn't guaranteed to be one on arbitrary values.
  const msg = typeof err.message === "string" ? err.message : ""
  return errorMessageRegexesForRetry.some((regex) => msg.match(regex) !== null)
}

/**
 * HTTP status codes for which a failed k8s API request is retried (see `shouldRetry`).
 * Listed in ascending numeric order; the list is only used for membership checks.
 */
const statusCodesForRetry: number[] = [
  httpStatusCodes.REQUEST_TIMEOUT, // 408
  httpStatusCodes.BAD_GATEWAY, // 502
  httpStatusCodes.SERVICE_UNAVAILABLE, // 503
  httpStatusCodes.GATEWAY_TIMEOUT, // 504
]

/**
 * Error message patterns for which a failed k8s API request is retried (see `shouldRetry`).
 * Both entries match `getaddrinfo` (DNS lookup) failures.
 */
const errorMessageRegexesForRetry = [
  /getaddrinfo ENOTFOUND/,
  /getaddrinfo EAI_AGAIN/,
]

0 comments on commit 423a7d7

Please sign in to comment.