Skip to content

Commit

Permalink
Raise an alarm when any stages are in a Failed state. (#6)
Browse files Browse the repository at this point in the history
Configure a lambda function to watch the pipeline and emit a metric for the number of failed stages, enabling consistent alarming on a 'failed pipeline'.
  • Loading branch information
sam-goodwin authored Dec 13, 2018
1 parent 7dff479 commit ce744c0
Show file tree
Hide file tree
Showing 7 changed files with 412 additions and 80 deletions.
1 change: 1 addition & 0 deletions lib/pipeline-watcher/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
export * from './watcher';
34 changes: 34 additions & 0 deletions lib/pipeline-watcher/watcher-handler.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import AWS = require('aws-sdk');

// Both bindings below are exported for tests.

// CodePipeline client used by the handler to query pipeline state.
export const codePipeline = new AWS.CodePipeline();

// Minimal logging facade used by the handler for its output.
export const logger = {
  /** Writes `line` to stdout exactly as given; no newline is appended. */
  log(line: string) {
    return process.stdout.write(line);
  }
};

/**
 * Lambda entry point that reports how many stages of a CodePipeline have failed.
 *
 * The pipeline name is read from the 'PIPELINE_NAME' environment variable. The
 * pipeline state is fetched and a single JSON log entry of the form
 * { "failedCount": <no. of failed stages> } is written through `logger`, so the
 * value can be aggregated asynchronously via metric filters.
 *
 * @throws Error when the 'PIPELINE_NAME' environment variable is not set.
 */
export async function handler() {
  const name = process.env.PIPELINE_NAME;
  if (!name) {
    throw new Error("Pipeline name expects environment variable: 'PIPELINE_NAME'");
  }

  const { stageStates } = await codePipeline.getPipelineState({ name }).promise();

  // A stage counts as failed only if it has a latest execution whose status is 'Failed'.
  // A missing stage list is treated as zero failed stages.
  let failedCount = 0;
  for (const stage of stageStates || []) {
    const execution = stage.latestExecution;
    if (execution !== undefined && execution.status === 'Failed') {
      failedCount++;
    }
  }

  logger.log(JSON.stringify({ failedCount }));
}
96 changes: 96 additions & 0 deletions lib/pipeline-watcher/watcher.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import cloudwatch = require('@aws-cdk/aws-cloudwatch');
import cpipeline = require('@aws-cdk/aws-codepipeline');
import events = require('@aws-cdk/aws-events');
import iam = require('@aws-cdk/aws-iam');
import lambda = require('@aws-cdk/aws-lambda');
import logs = require('@aws-cdk/aws-logs');
import cdk = require('@aws-cdk/cdk');
import fs = require('fs');
import path = require('path');

/**
* Properties for configuring a PipelineWatcher.
*/
export interface PipelineWatcherProps {
/**
* The CodePipeline to monitor for failed stages.
*/
pipeline: cpipeline.Pipeline;

/**
* Title used to identify the pipeline in the alarm description.
*
* The description is set to 'Pipeline <title> has failed stages'.
*
* @default pipeline's name
*/
title?: string;
}

/**
 * Construct that watches a CodePipeline and raises an alarm whenever any of its
 * stages is in a failed state.
 *
 * A Lambda function is triggered every minute. It calls GetPipelineState for the
 * provided pipeline, counts the failed stages and emits a JSON log entry
 * { failedCount: <number> }. A metric filter on the function's log group tracks
 * that value as a CloudWatch metric, and an alarm fires when the maximum value
 * within a single evaluation period is >= 1 (the metric period is not set here,
 * so the library default applies).
 */
export class PipelineWatcher extends cdk.Construct {
  /** Alarm that fires when the pipeline has at least one failed stage. */
  public readonly alarm: cloudwatch.Alarm;

  constructor(parent: cdk.Construct, name: string, props: PipelineWatcherProps) {
    super(parent, name);

    // The compiled handler is shipped as inline code, so its source is read from
    // disk while the construct is being instantiated.
    const handlerSource = fs.readFileSync(path.join(__dirname, 'watcher-handler.js')).toString('utf8');
    const poller = new lambda.Function(this, 'Poller', {
      handler: 'index.handler',
      runtime: lambda.Runtime.NodeJS810,
      code: lambda.Code.inline(handlerSource),
      environment: {
        PIPELINE_NAME: props.pipeline.pipelineName
      }
    });

    // See https://github.com/awslabs/aws-cdk/issues/1340 for exposing grants on the pipeline.
    const readPipelineState = new iam.PolicyStatement()
      .addResource(props.pipeline.pipelineArn)
      .addAction('codepipeline:GetPipelineState');
    poller.addToRolePolicy(readPipelineState);

    // Log group declared under the function's '/aws/lambda/<functionName>' name.
    // ex: arn:aws:logs:us-east-1:123456789012:log-group:my-log-group
    const pollerLogs = new logs.LogGroup(this, 'Logs', {
      logGroupName: `/aws/lambda/${poller.functionName}`,
      retentionDays: 731
    });

    // Invoke the poller once per minute.
    const schedule = new events.EventRule(this, 'Trigger', {
      scheduleExpression: 'rate(1 minute)',
      targets: [poller]
    });

    // Make the trigger's underlying resource depend on the log group's resource.
    // NOTE(review): presumably so the log group exists before the first invocation — confirm.
    const logsResource = pollerLogs.findChild('Resource') as cdk.Resource;
    const scheduleResource = schedule.findChild('Resource') as cdk.Resource;
    scheduleResource.addDependency(logsResource);

    const namespace = 'CDK/Delivlib';
    const failedStagesMetricName = `${props.pipeline.pipelineName}_FailedStages`;

    // Publish the 'failedCount' field of each matching log entry as the metric value.
    new logs.MetricFilter(this, 'MetricFilter', {
      filterPattern: logs.FilterPattern.exists('$.failedCount'),
      metricNamespace: namespace,
      metricName: failedStagesMetricName,
      metricValue: '$.failedCount',
      logGroup: pollerLogs
    });

    const failedStages = new cloudwatch.Metric({
      metricName: failedStagesMetricName,
      namespace,
      statistic: cloudwatch.Statistic.Maximum
    });

    this.alarm = new cloudwatch.Alarm(this, 'Alarm', {
      alarmDescription: `Pipeline ${props.title || props.pipeline.pipelineName} has failed stages`,
      metric: failedStages,
      threshold: 1,
      comparisonOperator: cloudwatch.ComparisonOperator.GreaterThanOrEqualToThreshold,
      evaluationPeriods: 1,
      treatMissingData: cloudwatch.TreatMissingData.Ignore, // We expect a steady stream of data points
    });
  }
}
26 changes: 6 additions & 20 deletions lib/pipeline.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import iam = require('@aws-cdk/aws-iam');
import sns = require('@aws-cdk/aws-sns');
import cdk = require('@aws-cdk/cdk');
import path = require('path');
import { PipelineWatcher } from './pipeline-watcher';
import publishing = require('./publishing');
import { IRepo } from './repo';
import { Testable, TestableProps } from './testable';
Expand Down Expand Up @@ -255,26 +256,11 @@ export class Pipeline extends cdk.Construct {
}));
}

private addFailureAlarm(title?: string) {
const pipelineFailureTopic = new sns.Topic(this, 'PipelineFailureTopic');

this.pipeline.onStateChange('PipelineFailureEvent', pipelineFailureTopic, {
eventPattern: { detail: { state: [ 'FAILED' ] } }
});

new cloudwatch.Alarm(this, 'PipelineFailureAlarm', {
alarmDescription: `Pipeline ${title || ''} Failed`,
metric: new cloudwatch.Metric({
metricName: 'NumberOfMessagesPublished',
namespace: 'SNS',
statistic: cloudwatch.Statistic.Sum,
dimensions: { TopicName: pipelineFailureTopic.topicName }
}),
threshold: 1,
comparisonOperator: cloudwatch.ComparisonOperator.GreaterThanOrEqualToThreshold,
evaluationPeriods: 1,
treatMissingData: cloudwatch.TreatMissingData.NotBreaching,
});
/**
 * Creates a PipelineWatcher for this pipeline and returns its alarm.
 *
 * @param title optional title forwarded to the watcher
 * @returns the alarm exposed by the newly created watcher
 */
private addFailureAlarm(title?: string): cloudwatch.Alarm {
  const watcher = new PipelineWatcher(this, 'PipelineWatcher', {
    pipeline: this.pipeline,
    title
  });
  return watcher.alarm;
}
}

Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
"build": "tsc && tslint --fix --project .",
"package": "/bin/bash ./package.sh",
"watch": "tsc -w",
"test": "/bin/bash ./test/run-test.sh",
"test": "/bin/bash ./test/run-test.sh && jest",
"cdk": "cdk",
"pipeline-update": "npm run build && cdk -a pipeline/delivlib.js deploy",
"pipeline-diff": "npm run build && cdk -a pipeline/delivlib.js diff"
Expand Down
Loading

0 comments on commit ce744c0

Please sign in to comment.