Skip to content

Commit

Permalink
Merge pull request #4 from salasberryfin/janitor-initial-version
Browse files Browse the repository at this point in the history
✨ Prepare janitor for initial version
  • Loading branch information
salasberryfin committed Apr 9, 2024
2 parents 6da317f + c7d8849 commit 68a6919
Show file tree
Hide file tree
Showing 11 changed files with 508 additions and 56 deletions.
31 changes: 22 additions & 9 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,22 +1,35 @@
# AWS Janitor

A GitHub Action to cleanup AWS resources that have exceeded a TTL.
A GitHub Action to clean up AWS resources.

It uses a mark and delete approach:
- The first time it runs, it describes resources and marks them for deletion.
- On the next execution, it deletes the previously marked resources.

The tag `aws-janitor/marked-for-deletion` is used as the deletion marker.

**Any resource that includes the tag key defined by `ignore-tag` will never be deleted.**

> By default the action will not perform the delete (i.e. it will be a dry-run). You need to explicitly set commit to `true`.
It supports cleaning up the following services:

- EKS Clusters
- Auto Scaling Groups
- Load Balancers
- Security Groups
- CloudFormation Stacks

It follows this strict order to avoid failures caused by inter-resource dependencies. Although intermittent failures may occur, they should be resolved in subsequent executions.

## Inputs

| Name | Required | Description |
| ----------------- | -------- | -------------------------------------------------------------------------------------- |
| regions | Y | A comma seperated list of regions to clean resources in. You can use * for all regions |
| allow-all-regions | N | Set to true if use * from regions. |
| ttl | Y | The duration that a resource can live for. For example, use 24h for 1 day. |
| commit | N | Whether to perform the delete. Defaults to `false` which is a dry run |
| Name | Required | Description |
| ----------------- | -------- | ------------------------------------------------------------------------------------------------- |
| regions | Y | A comma separated list of regions to clean resources in. You can use * for all regions |
| allow-all-regions | N | Set to true if you use * for regions. |
| commit | N | Whether to perform the delete. Defaults to `false` which is a dry run |
| ignore-tag | N | The name of the tag that indicates a resource should not be deleted. Defaults to `janitor-ignore` |

## Example Usage

Expand All @@ -30,12 +43,12 @@ jobs:
uses: rancher-sandbox/aws-janitor@v0.1.0
with:
regions: eu-west-1
ttl: 168h
ignore-tag: janitor-ignore
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
```
## Implementation Notes
It currently assumes that an instance of a service will have some form of creation date. This means that the implementation can be simpler as it doesn't need to adopt a "mark & sweep" pattern that requires saving state between runs of the action.
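The original implementation of the janitor avoided the mark and delete approach for simplicity and relied on each resource's creation date instead. That is not viable when supporting deletion of resources that do not have a creation date, so the janitor now marks resources on one run and deletes them on the next.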
11 changes: 6 additions & 5 deletions action.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: 'AWS Janitor'
author: 'Rancher Sandbox'
description: 'Clean-up AWS resources based on a TTL.'
description: 'Mark and clean AWS resources.'
inputs:
regions:
description: 'A comma separated list of regions to clean resources in. You can use * for all regions.'
Expand All @@ -9,16 +9,17 @@ inputs:
description: 'Set to true if you want to allow cleaning resources in all regions. If true then * must be used for regions.'
required: false
default: 'false'
ttl:
description: 'The duration that a resource can live for. For example, use 24h for 1 day.'
required: true
commit:
description: 'Should the action just report or do the actual delete.'
required: false
default: 'false'
ignore-tag:
description: 'The name of the tag that indicates a resource should not be deleted. Defaults to `janitor-ignore`'
required: false
default: 'janitor-ignore'
runs:
using: 'docker'
image: 'docker://ghcr.io/rancher-sandbox/aws-janitor:v0.1.0'
branding:
icon: 'delete'
color: 'blue'
color: 'blue'
35 changes: 23 additions & 12 deletions action/action.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@ import (
"github.com/aws/aws-sdk-go/aws/endpoints"
"github.com/aws/aws-sdk-go/aws/session"
"github.com/aws/aws-sdk-go/service/autoscaling"
"github.com/aws/aws-sdk-go/service/cloudformation"
"github.com/aws/aws-sdk-go/service/ec2"
"github.com/aws/aws-sdk-go/service/eks"
"github.com/aws/aws-sdk-go/service/elb"
)

type AwsJanitorAction interface {
Expand All @@ -26,17 +29,25 @@ type action struct {
commit bool
}

// Cleaner pairs an AWS service name with the routine that cleans up that
// service's resources. Cleaners are kept in a slice so that they run in a
// fixed, dependency-aware order.
type Cleaner struct {
	// Service is the AWS service name; it is used to resolve the regions to
	// visit and to label log and error messages.
	Service string
	// Run performs the cleanup for Service within the supplied scope.
	Run CleanupFunc
}

func (a *action) Cleanup(ctx context.Context, input *Input) error {

//NOTE: ordering matters here!
cleanupFuncs := map[string]CleanupFunc{
eks.ServiceName: a.cleanEKSClusters,
autoscaling.ServiceName: a.cleanASGs,
// use []Cleaner to keep the order
cleaners := []Cleaner{
{Service: eks.ServiceName, Run: a.cleanEKSClusters},
{Service: autoscaling.ServiceName, Run: a.cleanASGs},
{Service: elb.ServiceName, Run: a.cleanLoadBalancers},
{Service: ec2.ServiceName, Run: a.cleanSecurityGroups},
{Service: cloudformation.ServiceName, Run: a.cleanCfStacks},
}
inputRegions := strings.Split(input.Regions, ",")

for service, cleanupFunc := range cleanupFuncs {
regions := getServiceRegions(service, inputRegions)
for _, cleaner := range cleaners {
regions := getServiceRegions(cleaner.Service, inputRegions)

for _, region := range regions {
sess, err := session.NewSession(&aws.Config{
Expand All @@ -47,14 +58,14 @@ func (a *action) Cleanup(ctx context.Context, input *Input) error {
}

scope := &CleanupScope{
TTL: input.TTL,
Session: sess,
Commit: input.Commit,
Session: sess,
Commit: input.Commit,
IgnoreTag: input.IgnoreTag,
}

Log("Cleaning up resources for service %s in region %s", service, region)
if err := cleanupFunc(ctx, scope); err != nil {
return fmt.Errorf("failed running cleanup for service %s: %w", service, err)
Log("Cleaning up resources for service %s in region %s", cleaner.Service, region)
if err := cleaner.Run(ctx, scope); err != nil {
return fmt.Errorf("failed running cleanup for service %s: %w", cleaner.Service, err)
}
}
}
Expand Down
11 changes: 7 additions & 4 deletions action/cleanup.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,18 @@ package action

import (
"context"
"time"

"github.com/aws/aws-sdk-go/aws/session"
)

// DeletionTag is the tag key the janitor writes on a resource the first time
// it sees it; a resource that already carries this key is deleted on a later
// run.
const DeletionTag = "aws-janitor/marked-for-deletion"

type CleanupScope struct {
Session *session.Session
TTL time.Duration
Commit bool
Session *session.Session
Commit bool
IgnoreTag string
}

// CleanupFunc is the signature shared by every per-service cleanup routine.
// It receives the scope (session, commit flag and ignore tag) to operate in
// and returns an error when the cleanup for that service could not run.
type CleanupFunc func(ctx context.Context, input *CleanupScope) error
43 changes: 39 additions & 4 deletions action/cleanup_asg.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package action
import (
"context"
"fmt"
"time"

"github.com/aws/aws-sdk-go/aws"
"github.com/aws/aws-sdk-go/service/autoscaling"
Expand All @@ -15,12 +14,32 @@ func (a *action) cleanASGs(ctx context.Context, input *CleanupScope) error {
asgToDelete := []*autoscaling.Group{}
pageFunc := func(page *autoscaling.DescribeAutoScalingGroupsOutput, _ bool) bool {
for _, asg := range page.AutoScalingGroups {
maxAge := asg.CreatedTime.Add(input.TTL)
var ignore, markedForDeletion bool
for _, tag := range asg.Tags {
if *tag.Key == input.IgnoreTag {
ignore = true
} else if *tag.Key == DeletionTag {
markedForDeletion = true
}
}

if time.Now().Before(maxAge) {
LogDebug("asg %s has max age greater than now, skipping cleanup", *asg.AutoScalingGroupName)
if ignore {
LogDebug("asg %s has ignore tag, skipping cleanup", *asg.AutoScalingGroupName)
continue
}

if !markedForDeletion {
// NOTE: only mark for future deletion if we're not running in dry-mode
if a.commit {
LogDebug("asg %s does not have deletion tag, marking for future deletion and skipping cleanup", *asg.AutoScalingGroupName)
if err := a.markAsgForFutureDeletion(ctx, *asg.AutoScalingGroupName, client); err != nil {
LogError("failed to mark asg %s for future deletion: %s", *asg.AutoScalingGroupName, err.Error())
}
}
continue
}

LogDebug("adding asg %s to delete list", *asg.AutoScalingGroupName)
asgToDelete = append(asgToDelete, asg)
}

Expand Down Expand Up @@ -62,3 +81,19 @@ func (a *action) cleanASGs(ctx context.Context, input *CleanupScope) error {

return nil
}

// markAsgForFutureDeletion tags the named Auto Scaling group with DeletionTag
// so that a subsequent run of the janitor deletes it. The tag is created with
// PropagateAtLaunch enabled. Any error from the tagging call is returned
// unchanged.
func (a *action) markAsgForFutureDeletion(ctx context.Context, asgName string, client *autoscaling.AutoScaling) error {
	Log("Marking ASG %s for future deletion", asgName)

	marker := &autoscaling.Tag{
		Key:               aws.String(DeletionTag),
		PropagateAtLaunch: aws.Bool(true),
		ResourceId:        aws.String(asgName),
		ResourceType:      aws.String("auto-scaling-group"),
		Value:             aws.String("true"),
	}
	request := &autoscaling.CreateOrUpdateTagsInput{
		Tags: []*autoscaling.Tag{marker},
	}

	_, err := client.CreateOrUpdateTagsWithContext(ctx, request)
	return err
}
114 changes: 114 additions & 0 deletions action/cleanup_cf.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
package action

import (
"context"
"fmt"

"github.com/aws/aws-sdk-go/aws"
cf "github.com/aws/aws-sdk-go/service/cloudformation"
)

// cleanCfStacks applies the mark-and-delete cycle to every CloudFormation
// stack visible through input.Session.
//
// For each stack, in this order:
//   - a stack carrying the input.IgnoreTag key is never touched;
//   - a stack that is already deleted or being deleted is skipped;
//   - a stack without DeletionTag is tagged (only when a.commit is set) so
//     that a later run deletes it;
//   - a stack that already carries DeletionTag is deleted (only when a.commit
//     is set).
//
// Only a failure to list the stacks is returned. Failures to mark or delete an
// individual stack are logged and do not stop the run.
func (a *action) cleanCfStacks(ctx context.Context, input *CleanupScope) error {
	client := cf.New(input.Session)

	var stacksToDelete []*string
	pageFunc := func(page *cf.DescribeStacksOutput, _ bool) bool {
		for _, stack := range page.Stacks {
			name := aws.StringValue(stack.StackName)

			var ignore, markedForDeletion bool
			for _, tag := range stack.Tags {
				// Guard against malformed tags instead of panicking on a nil key.
				if tag == nil || tag.Key == nil {
					continue
				}
				if *tag.Key == input.IgnoreTag {
					ignore = true
				} else if *tag.Key == DeletionTag {
					markedForDeletion = true
				}
			}

			if ignore {
				LogDebug("cloudformation stack %s has ignore tag, skipping cleanup", name)
				continue
			}

			// Check the stack status before marking: a stack that is already
			// deleting cannot be updated, so trying to tag it would only fail.
			switch aws.StringValue(stack.StackStatus) {
			case cf.StackStatusDeleteComplete,
				cf.StackStatusDeleteInProgress:
				LogDebug("cloudformation stack %s is already deleted/deleting, skipping cleanup", name)
				continue
			}

			if !markedForDeletion {
				// NOTE: only mark for future deletion if we're not running in dry-mode
				if a.commit {
					LogDebug("cloudformation stack %s does not have deletion tag, marking for future deletion and skipping cleanup", name)
					if err := a.markCfStackForFutureDeletion(ctx, stack, client); err != nil {
						LogError("failed to mark cloudformation stack %s for future deletion: %s", name, err.Error())
					}
				}
				continue
			}

			LogDebug("adding cloudformation stack %s to delete list", name)
			stacksToDelete = append(stacksToDelete, stack.StackName)
		}

		// Always continue to the next page.
		return true
	}

	if err := client.DescribeStacksPagesWithContext(ctx, &cf.DescribeStacksInput{}, pageFunc); err != nil {
		return fmt.Errorf("failed getting list of cloudformation stacks: %w", err)
	}

	if len(stacksToDelete) == 0 {
		Log("no cloudformation stacks to delete")
		return nil
	}

	for _, stackName := range stacksToDelete {
		name := aws.StringValue(stackName)

		if !a.commit {
			LogDebug("skipping deletion of cloudformation stack %s as running in dry-mode", name)
			continue
		}

		// Deletion is best-effort: log the failure and move on to the next stack.
		if err := a.deleteCfStack(ctx, name, client); err != nil {
			LogError("failed to delete cloudformation stack %s: %s", name, err.Error())
		}
	}

	return nil
}

// markCfStackForFutureDeletion adds DeletionTag to the given CloudFormation
// stack so that a subsequent run of the janitor deletes it, then waits for the
// stack update to complete.
//
// The tag is applied through UpdateStack, reusing the previous template. Each
// existing parameter is passed back with UsePreviousValue so that the update
// changes only the tags; omitting the parameters would make CloudFormation
// fall back to template defaults (or reject the update when a parameter has no
// default).
//
// The stack's Tags field is updated in place to include the new tag. An error
// is returned when the update request or the wait fails.
func (a *action) markCfStackForFutureDeletion(ctx context.Context, stack *cf.Stack, client *cf.CloudFormation) error {
	Log("Marking CloudFormation stack %s for future deletion", *stack.StackName)

	// Left nil when the stack has no parameters, so the field is omitted from
	// the request just as it was before.
	var params []*cf.Parameter
	for _, p := range stack.Parameters {
		if p == nil || p.ParameterKey == nil {
			continue
		}
		params = append(params, &cf.Parameter{
			ParameterKey:     p.ParameterKey,
			UsePreviousValue: aws.Bool(true),
		})
	}

	stack.SetTags(append(stack.Tags, &cf.Tag{Key: aws.String(DeletionTag), Value: aws.String("true")}))

	LogDebug("Updating tags for cloudformation stack %s", *stack.StackName)

	if _, err := client.UpdateStackWithContext(ctx, &cf.UpdateStackInput{
		Capabilities:        stack.Capabilities,
		Parameters:          params,
		StackName:           stack.StackName,
		Tags:                stack.Tags,
		UsePreviousTemplate: aws.Bool(true),
	}); err != nil {
		return fmt.Errorf("failed to update cloudformation stack %s: %w", *stack.StackName, err)
	}

	if err := client.WaitUntilStackUpdateCompleteWithContext(ctx, &cf.DescribeStacksInput{StackName: stack.StackName}); err != nil {
		return fmt.Errorf("failed to wait for cloudformation stack %s to update: %w", *stack.StackName, err)
	}

	return nil
}

// deleteCfStack requests deletion of the named CloudFormation stack and then
// blocks until the deletion has completed. It returns a wrapped error when
// either the delete request or the wait fails.
func (a *action) deleteCfStack(ctx context.Context, stackName string, client *cf.CloudFormation) error {
	Log("Deleting CloudFormation stack %s", stackName)

	target := aws.String(stackName)

	_, deleteErr := client.DeleteStackWithContext(ctx, &cf.DeleteStackInput{StackName: target})
	if deleteErr != nil {
		return fmt.Errorf("failed to delete cloudformation stack %s: %w", stackName, deleteErr)
	}

	waitErr := client.WaitUntilStackDeleteCompleteWithContext(ctx, &cf.DescribeStacksInput{StackName: target})
	if waitErr != nil {
		return fmt.Errorf("failed to wait for cloudformation stack %s to delete: %w", stackName, waitErr)
	}

	return nil
}
Loading

0 comments on commit 68a6919

Please sign in to comment.