-
Notifications
You must be signed in to change notification settings - Fork 4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix(logs): LogRetention resources fail with rate exceeded errors (#26858
) The LogRetention Custom Resource used to be able to handle server-side throttling, when a lot of requests to the CloudWatch Logs service are made at the same time. Handling of this error case got lost during the migration to SDK v3. If we have (read: a lot) `LogRetention` Custom Resources in a _single_ Stack, CloudFormation apparently applies some internal breaks to the amount of parallelism. For example it appears that resources are batched in smaller groups that need to be completed before the next group is provisioned. And within the groups there appears to be a ever so slight delay between individual resources. Together this is enough to avoid rate limiting in most circumstances. **Therefore, in practice this issues only occurs when multiple stacks are deployed in parallel.** To test this scenario, I have added support for `integ-runner` to deploy all stacks of a test case concurrently. Support for arbitrary command args already existed, but needed to explicitly include the `concurrency` option. I then create an integration test that deploys 3 stacks à 25 LogRetention resources. This triggers the error cases described in #26837. The fix itself is twofold: - Pass the `maxRetries` prop value to the SDK client to increase the number of attempts of the SDK internal throttling handling. But also enforce a minimum for these retries since they might catch additional retryable failures that our custom outer loop does not account for. - Explicitly catch `ThrottlingException` errors in the outer retry loop. Closes #26837 ---- *By submitting this pull request, I confirm that my contribution is made under the terms of the Apache-2.0 license*
- Loading branch information
Showing
22 changed files
with
9,771 additions
and
4 deletions.
There are no files selected for viewing
19 changes: 19 additions & 0 deletions
19
...n-retries.js.snapshot/LogRetentionIntegRetriesDefaultTestDeployAssert6D1A1A1C.assets.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
{ | ||
"version": "33.0.0", | ||
"files": { | ||
"21fbb51d7b23f6a6c262b46a9caee79d744a3ac019fd45422d988b96d44b2a22": { | ||
"source": { | ||
"path": "LogRetentionIntegRetriesDefaultTestDeployAssert6D1A1A1C.template.json", | ||
"packaging": "file" | ||
}, | ||
"destinations": { | ||
"current_account-current_region": { | ||
"bucketName": "cdk-hnb659fds-assets-${AWS::AccountId}-${AWS::Region}", | ||
"objectKey": "21fbb51d7b23f6a6c262b46a9caee79d744a3ac019fd45422d988b96d44b2a22.json", | ||
"assumeRoleArn": "arn:${AWS::Partition}:iam::${AWS::AccountId}:role/cdk-hnb659fds-file-publishing-role-${AWS::AccountId}-${AWS::Region}" | ||
} | ||
} | ||
} | ||
}, | ||
"dockerImages": {} | ||
} |
36 changes: 36 additions & 0 deletions
36
...retries.js.snapshot/LogRetentionIntegRetriesDefaultTestDeployAssert6D1A1A1C.template.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
{ | ||
"Parameters": { | ||
"BootstrapVersion": { | ||
"Type": "AWS::SSM::Parameter::Value<String>", | ||
"Default": "/cdk-bootstrap/hnb659fds/version", | ||
"Description": "Version of the CDK Bootstrap resources in this environment, automatically retrieved from SSM Parameter Store. [cdk:skip]" | ||
} | ||
}, | ||
"Rules": { | ||
"CheckBootstrapVersion": { | ||
"Assertions": [ | ||
{ | ||
"Assert": { | ||
"Fn::Not": [ | ||
{ | ||
"Fn::Contains": [ | ||
[ | ||
"1", | ||
"2", | ||
"3", | ||
"4", | ||
"5" | ||
], | ||
{ | ||
"Ref": "BootstrapVersion" | ||
} | ||
] | ||
} | ||
] | ||
}, | ||
"AssertDescription": "CDK bootstrap stack version 6 required. Please run 'cdk bootstrap' with a recent version of the CDK CLI." | ||
} | ||
] | ||
} | ||
} | ||
} |
14 changes: 14 additions & 0 deletions
14
...napshot/asset.a8515c042d9c942705087943220417be929ac44f968d8fcef2681681b400c0c0/index.d.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
interface LogRetentionEvent extends Omit<AWSLambda.CloudFormationCustomResourceEvent, 'ResourceProperties'> { | ||
ResourceProperties: { | ||
ServiceToken: string; | ||
LogGroupName: string; | ||
LogGroupRegion?: string; | ||
RetentionInDays?: string; | ||
SdkRetry?: { | ||
maxRetries?: string; | ||
}; | ||
RemovalPolicy?: string; | ||
}; | ||
} | ||
export declare function handler(event: LogRetentionEvent, context: AWSLambda.Context): Promise<void>; | ||
export {}; |
192 changes: 192 additions & 0 deletions
192
....snapshot/asset.a8515c042d9c942705087943220417be929ac44f968d8fcef2681681b400c0c0/index.js
Large diffs are not rendered by default.
Oops, something went wrong.
229 changes: 229 additions & 0 deletions
229
....snapshot/asset.a8515c042d9c942705087943220417be929ac44f968d8fcef2681681b400c0c0/index.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,229 @@ | ||
/* eslint-disable no-console */ | ||
// eslint-disable-next-line import/no-extraneous-dependencies | ||
import * as Logs from '@aws-sdk/client-cloudwatch-logs'; | ||
|
||
interface LogRetentionEvent extends Omit<AWSLambda.CloudFormationCustomResourceEvent, 'ResourceProperties'> { | ||
ResourceProperties: { | ||
ServiceToken: string; | ||
LogGroupName: string; | ||
LogGroupRegion?: string; | ||
RetentionInDays?: string; | ||
SdkRetry?: { | ||
maxRetries?: string; | ||
}; | ||
RemovalPolicy?: string | ||
}; | ||
} | ||
|
||
/** | ||
* Creates a log group and doesn't throw if it exists. | ||
*/ | ||
async function createLogGroupSafe(logGroupName: string, client: Logs.CloudWatchLogsClient, withDelay: (block: () => Promise<void>) => Promise<void>) { | ||
await withDelay(async () => { | ||
try { | ||
const params = { logGroupName }; | ||
const command = new Logs.CreateLogGroupCommand(params); | ||
await client.send(command); | ||
|
||
} catch (error: any) { | ||
if (error instanceof Logs.ResourceAlreadyExistsException || error.name === 'ResourceAlreadyExistsException') { | ||
// The log group is already created by the lambda execution | ||
return; | ||
} | ||
|
||
throw error; | ||
} | ||
}); | ||
} | ||
|
||
/** | ||
* Deletes a log group and doesn't throw if it does not exist. | ||
*/ | ||
async function deleteLogGroup(logGroupName: string, client: Logs.CloudWatchLogsClient, withDelay: (block: () => Promise<void>) => Promise<void>) { | ||
await withDelay(async () => { | ||
try { | ||
const params = { logGroupName }; | ||
const command = new Logs.DeleteLogGroupCommand(params); | ||
await client.send(command); | ||
|
||
} catch (error: any) { | ||
if (error instanceof Logs.ResourceNotFoundException || error.name === 'ResourceNotFoundException') { | ||
// The log group doesn't exist | ||
return; | ||
} | ||
|
||
throw error; | ||
} | ||
}); | ||
} | ||
|
||
/** | ||
* Puts or deletes a retention policy on a log group. | ||
*/ | ||
async function setRetentionPolicy( | ||
logGroupName: string, | ||
client: Logs.CloudWatchLogsClient, | ||
withDelay: (block: () => Promise<void>) => Promise<void>, | ||
retentionInDays?: number, | ||
) { | ||
|
||
await withDelay(async () => { | ||
if (!retentionInDays) { | ||
const params = { logGroupName }; | ||
const deleteCommand = new Logs.DeleteRetentionPolicyCommand(params); | ||
await client.send(deleteCommand); | ||
} else { | ||
const params = { logGroupName, retentionInDays }; | ||
const putCommand = new Logs.PutRetentionPolicyCommand(params); | ||
await client.send(putCommand); | ||
} | ||
}); | ||
} | ||
|
||
export async function handler(event: LogRetentionEvent, context: AWSLambda.Context) { | ||
try { | ||
console.log(JSON.stringify({ ...event, ResponseURL: '...' })); | ||
|
||
// The target log group | ||
const logGroupName = event.ResourceProperties.LogGroupName; | ||
|
||
// The region of the target log group | ||
const logGroupRegion = event.ResourceProperties.LogGroupRegion; | ||
|
||
// Parse to AWS SDK retry options | ||
const maxRetries = parseIntOptional(event.ResourceProperties.SdkRetry?.maxRetries) ?? 5; | ||
const withDelay = makeWithDelay(maxRetries); | ||
|
||
const sdkConfig: Logs.CloudWatchLogsClientConfig = { | ||
logger: console, | ||
region: logGroupRegion, | ||
maxAttempts: Math.max(5, maxRetries), // Use a minimum for SDK level retries, because it might include retryable failures that withDelay isn't checking for | ||
}; | ||
const client = new Logs.CloudWatchLogsClient(sdkConfig); | ||
|
||
if (event.RequestType === 'Create' || event.RequestType === 'Update') { | ||
// Act on the target log group | ||
await createLogGroupSafe(logGroupName, client, withDelay); | ||
await setRetentionPolicy(logGroupName, client, withDelay, parseIntOptional(event.ResourceProperties.RetentionInDays)); | ||
|
||
// Configure the Log Group for the Custom Resource function itself | ||
if (event.RequestType === 'Create') { | ||
const clientForCustomResourceFunction = new Logs.CloudWatchLogsClient({ | ||
logger: console, | ||
region: process.env.AWS_REGION, | ||
}); | ||
// Set a retention policy of 1 day on the logs of this very function. | ||
// Due to the async nature of the log group creation, the log group for this function might | ||
// still be not created yet at this point. Therefore we attempt to create it. | ||
// In case it is being created, createLogGroupSafe will handle the conflict. | ||
await createLogGroupSafe(`/aws/lambda/${context.functionName}`, clientForCustomResourceFunction, withDelay); | ||
// If createLogGroupSafe fails, the log group is not created even after multiple attempts. | ||
// In this case we have nothing to set the retention policy on but an exception will skip | ||
// the next line. | ||
await setRetentionPolicy(`/aws/lambda/${context.functionName}`, clientForCustomResourceFunction, withDelay, 1); | ||
} | ||
} | ||
|
||
// When the requestType is delete, delete the log group if the removal policy is delete | ||
if (event.RequestType === 'Delete' && event.ResourceProperties.RemovalPolicy === 'destroy') { | ||
await deleteLogGroup(logGroupName, client, withDelay); | ||
// else retain the log group | ||
} | ||
|
||
await respond('SUCCESS', 'OK', logGroupName); | ||
} catch (e: any) { | ||
console.log(e); | ||
await respond('FAILED', e.message, event.ResourceProperties.LogGroupName); | ||
} | ||
|
||
function respond(responseStatus: string, reason: string, physicalResourceId: string) { | ||
const responseBody = JSON.stringify({ | ||
Status: responseStatus, | ||
Reason: reason, | ||
PhysicalResourceId: physicalResourceId, | ||
StackId: event.StackId, | ||
RequestId: event.RequestId, | ||
LogicalResourceId: event.LogicalResourceId, | ||
Data: { | ||
// Add log group name as part of the response so that it's available via Fn::GetAtt | ||
LogGroupName: event.ResourceProperties.LogGroupName, | ||
}, | ||
}); | ||
|
||
console.log('Responding', responseBody); | ||
|
||
// eslint-disable-next-line @typescript-eslint/no-require-imports | ||
const parsedUrl = require('url').parse(event.ResponseURL); | ||
const requestOptions = { | ||
hostname: parsedUrl.hostname, | ||
path: parsedUrl.path, | ||
method: 'PUT', | ||
headers: { | ||
'content-type': '', | ||
'content-length': Buffer.byteLength(responseBody, 'utf8'), | ||
}, | ||
}; | ||
|
||
return new Promise((resolve, reject) => { | ||
try { | ||
// eslint-disable-next-line @typescript-eslint/no-require-imports | ||
const request = require('https').request(requestOptions, resolve); | ||
request.on('error', reject); | ||
request.write(responseBody); | ||
request.end(); | ||
} catch (e) { | ||
reject(e); | ||
} | ||
}); | ||
} | ||
} | ||
|
||
function parseIntOptional(value?: string, base = 10): number | undefined { | ||
if (value === undefined) { | ||
return undefined; | ||
} | ||
|
||
return parseInt(value, base); | ||
} | ||
|
||
function makeWithDelay( | ||
maxRetries: number, | ||
delayBase: number = 100, | ||
delayCap = 10 * 1000, // 10s | ||
): (block: () => Promise<void>) => Promise<void> { | ||
// If we try to update the log group, then due to the async nature of | ||
// Lambda logging there could be a race condition when the same log group is | ||
// already being created by the lambda execution. This can sometime result in | ||
// an error "OperationAbortedException: A conflicting operation is currently | ||
// in progress...Please try again." | ||
// To avoid an error, we do as requested and try again. | ||
|
||
return async (block: () => Promise<void>) => { | ||
let attempts = 0; | ||
do { | ||
try { | ||
return await block(); | ||
} catch (error: any) { | ||
if ( | ||
error instanceof Logs.OperationAbortedException | ||
|| error.name === 'OperationAbortedException' | ||
|| error.name === 'ThrottlingException' // There is no class to check with instanceof, see https://github.com/aws/aws-sdk-js-v3/issues/5140 | ||
) { | ||
if (attempts < maxRetries ) { | ||
attempts++; | ||
await new Promise(resolve => setTimeout(resolve, calculateDelay(attempts, delayBase, delayCap))); | ||
continue; | ||
} else { | ||
// The log group is still being changed by another execution but we are out of retries | ||
throw new Error('Out of attempts to change log group'); | ||
} | ||
} | ||
throw error; | ||
} | ||
} while (true); // exit happens on retry count check | ||
}; | ||
} | ||
|
||
function calculateDelay(attempt: number, base: number, cap: number): number { | ||
return Math.round(Math.random() * Math.min(cap, base * 2 ** attempt)); | ||
} |
32 changes: 32 additions & 0 deletions
32
.../integ.log-retention-retries.js.snapshot/aws-cdk-log-retention-integ-retries0.assets.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
{ | ||
"version": "33.0.0", | ||
"files": { | ||
"a8515c042d9c942705087943220417be929ac44f968d8fcef2681681b400c0c0": { | ||
"source": { | ||
"path": "asset.a8515c042d9c942705087943220417be929ac44f968d8fcef2681681b400c0c0", | ||
"packaging": "zip" | ||
}, | ||
"destinations": { | ||
"current_account-current_region": { | ||
"bucketName": "cdk-hnb659fds-assets-${AWS::AccountId}-${AWS::Region}", | ||
"objectKey": "a8515c042d9c942705087943220417be929ac44f968d8fcef2681681b400c0c0.zip", | ||
"assumeRoleArn": "arn:${AWS::Partition}:iam::${AWS::AccountId}:role/cdk-hnb659fds-file-publishing-role-${AWS::AccountId}-${AWS::Region}" | ||
} | ||
} | ||
}, | ||
"9990a29f03d0c5431a972aeb27fc605359cf0093ddd08bfbdf611189e8116726": { | ||
"source": { | ||
"path": "aws-cdk-log-retention-integ-retries0.template.json", | ||
"packaging": "file" | ||
}, | ||
"destinations": { | ||
"current_account-current_region": { | ||
"bucketName": "cdk-hnb659fds-assets-${AWS::AccountId}-${AWS::Region}", | ||
"objectKey": "9990a29f03d0c5431a972aeb27fc605359cf0093ddd08bfbdf611189e8116726.json", | ||
"assumeRoleArn": "arn:${AWS::Partition}:iam::${AWS::AccountId}:role/cdk-hnb659fds-file-publishing-role-${AWS::AccountId}-${AWS::Region}" | ||
} | ||
} | ||
} | ||
}, | ||
"dockerImages": {} | ||
} |
Oops, something went wrong.