Skip to content
This repository has been archived by the owner on Feb 11, 2024. It is now read-only.

Commit

Permalink
feat: Add Grafana panels and alerts
Browse files Browse the repository at this point in the history
  • Loading branch information
Xavier Basty committed Jun 7, 2023
1 parent f5a2edb commit bc9e0a5
Show file tree
Hide file tree
Showing 27 changed files with 662 additions and 82 deletions.
5 changes: 2 additions & 3 deletions terraform/docdb/terraform.tf
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,11 @@ terraform {
required_providers {
aws = {
source = "hashicorp/aws"
version = "~> 4.51.0"
version = "~> 4.50"
}
random = {
source = "hashicorp/random"
version = "3.4.3"
version = "3.5"
}
}
}

3 changes: 2 additions & 1 deletion terraform/ecs/main.tf
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# Values shared across the ECS module.
locals {
# Name given to the ECS service (aws_ecs_service.app_service), derived from the module id.
service_name = "${module.this.id}-service"
# Soft file-descriptor limit: 2^18 = 262144.
file_descriptor_soft_limit = pow(2, 18)
# Hard limit is always twice the soft limit.
file_descriptor_hard_limit = local.file_descriptor_soft_limit * 2
}
Expand Down Expand Up @@ -111,7 +112,7 @@ resource "aws_ecs_task_definition" "app_task_definition" {

## Service
resource "aws_ecs_service" "app_service" {
name = "${module.this.id}-service"
name = local.service_name
cluster = aws_ecs_cluster.app_cluster.id
task_definition = aws_ecs_task_definition.app_task_definition.arn
launch_type = "FARGATE"
Expand Down
18 changes: 17 additions & 1 deletion terraform/ecs/outputs.tf
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
output "service_name" {
description = "The name of the service"
value = local.service_name
}

output "target_group_arn" {
description = "The ARN of the target group"
value = aws_lb_target_group.target_group.arn
}

output "load_balancer_arn" {
value = aws_lb.application_load_balancer.arn
description = "The ARN of the load balancer"
value = aws_lb.application_load_balancer.arn
}

output "load_balancer_arn_suffix" {
description = "The ARN suffix of the load balancer"
value = aws_lb.application_load_balancer.arn_suffix
}
12 changes: 9 additions & 3 deletions terraform/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,11 @@ data "github_release" "latest_release" {
################################################################################
# Networking

#tflint-ignore: terraform_module_version
module "vpc" {
source = "terraform-aws-modules/vpc/aws"
name = "${module.this.stage}-${module.this.name}"
source = "terraform-aws-modules/vpc/aws"
version = "4.0"

name = "${module.this.stage}-${module.this.name}"

cidr = "10.0.0.0/16"

Expand Down Expand Up @@ -103,4 +104,9 @@ module "monitoring" {
context = module.this.context

prometheus_workspace_id = aws_prometheus_workspace.prometheus.id

docdb_cluster_id = module.history_docdb.cluster_id
ecs_service_name = module.ecs.service_name
load_balancer_arn = module.ecs.load_balancer_arn
target_group_arn = module.ecs.target_group_arn
}
46 changes: 37 additions & 9 deletions terraform/monitoring/dashboard.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,28 @@ local grafana = import 'grafonnet-lib/grafana.libsonnet';
local panels = import 'panels/panels.libsonnet';

local dashboard = grafana.dashboard;
local row = grafana.row;
local layout = grafana.layout;

// Grafana data source references passed to every panel builder as `ds`.
// Each entry is a { type, uid } pair; the UIDs are supplied from outside as
// Jsonnet external variables.
local ds = {
cloudwatch: {
type: 'cloudwatch',
uid: std.extVar('cloudwatch_uid'),
},
prometheus: {
type: 'prometheus',
uid: std.extVar('prometheus_uid'),
},
};
// Shared values passed to every panel builder as `vars`.
local vars = {
namespace: 'History',
environment: std.extVar('environment'),
// External variables are strings, so the notification list arrives
// JSON-encoded and is decoded here.
notifications: std.parseJson(std.extVar('notifications')),

// Identifiers of the AWS resources being monitored. `ecs_service_name` is
// used as a CloudWatch `ServiceName` dimension by the app panels; the other
// three are presumably consumed the same way by the lb/db panels — confirm
// against those panel files.
ecs_service_name: std.extVar('ecs_service_name'),
load_balancer: std.extVar('load_balancer'),
target_group: std.extVar('target_group'),
docdb_cluster_id: std.extVar('docdb_cluster_id'),
};

////////////////////////////////////////////////////////////////////////////////
Expand All @@ -35,14 +49,28 @@ dashboard.new(
},
)
)
.addPanels(
grafana.layout.generate_grid([
panels.received_items(ds, vars) { gridPos: pos._2 },
panels.stored_items(ds, vars) { gridPos: pos._2 },

panels.get_queries(ds, vars) { gridPos: pos._2 },
panels.served_items(ds, vars) { gridPos: pos._2 },
.addPanels(layout.generate_grid([
row.new('Application'),
panels.app.cpu(ds, vars) { gridPos: pos._2 },
panels.app.memory(ds, vars) { gridPos: pos._2 },

panels.registrations(ds, vars) { gridPos: pos._1 },
])
)
row.new('History'),
panels.history.received_items(ds, vars) { gridPos: pos._2 },
panels.history.stored_items(ds, vars) { gridPos: pos._2 },
panels.history.get_queries(ds, vars) { gridPos: pos._2 },
panels.history.served_items(ds, vars) { gridPos: pos._2 },
panels.history.registrations(ds, vars) { gridPos: pos._1 },

row.new('Load Balancer'),
panels.lb.healthy_hosts(ds, vars) { gridPos: pos._2 },

row.new('Database'),
panels.db.cpu(ds, vars) { gridPos: pos._3 },
panels.db.available_memory(ds, vars) { gridPos: pos._3 },
panels.db.connections(ds, vars) { gridPos: pos._3 },

panels.db.low_mem_op_throttled(ds, vars) { gridPos: pos._3 },
panels.db.volume(ds, vars) { gridPos: pos._3 },
panels.db.buffer_cache_hit_ratio(ds, vars) { gridPos: pos._3 },
]))
27 changes: 27 additions & 0 deletions terraform/monitoring/dashboard.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Renders dashboard.jsonnet into dashboard JSON. Every key in `ext_str` is
# exposed to the Jsonnet program as a string external variable
# (read there with std.extVar).
data "jsonnet_file" "dashboard" {
source = "${path.module}/dashboard.jsonnet"

ext_str = {
dashboard_title = "${module.this.stage} - ${module.this.name}"
dashboard_uid = "${module.this.stage}-${module.this.name}"

# UIDs of the Grafana data sources the panels query.
prometheus_uid = grafana_data_source.prometheus.uid
cloudwatch_uid = grafana_data_source.cloudwatch.uid

environment = module.this.stage
# ext_str values must be strings, so the notification list is JSON-encoded
# here and decoded again inside the Jsonnet program.
notifications = jsonencode(local.notifications)

# Identifiers of the monitored AWS resources.
ecs_service_name = var.ecs_service_name
load_balancer = local.load_balancer
target_group = local.target_group
docdb_cluster_id = var.docdb_cluster_id
}
}

# Publishes the rendered Jsonnet output as the Grafana dashboard.
#
# JSON Dashboard. When exporting from Grafana make sure that all
# variables are replaced properly
resource "grafana_dashboard" "main" {
# NOTE(review): per the Grafana provider docs, `overwrite` lets this resource
# replace an existing dashboard with the same UID/title — confirm that is intended.
overwrite = true
message = "Updated by Terraform"
config_json = data.jsonnet_file.dashboard.rendered
}
35 changes: 13 additions & 22 deletions terraform/monitoring/main.tf
Original file line number Diff line number Diff line change
@@ -1,23 +1,14 @@
#locals {
# opsgenie_notification_channel = "NNOynGwVz"
# notifications = (var.environment == "prod" ? [{ uid : local.opsgenie_notification_channel }] : [])
#}

data "jsonnet_file" "dashboard" {
source = "${path.module}/dashboard.jsonnet"

ext_str = {
dashboard_title = "${module.this.stage} - ${module.this.name}"
dashboard_uid = "${module.this.stage}-${module.this.name}"

prometheus_uid = grafana_data_source.prometheus.uid
}
}

# JSON Dashboard. When exporting from Grafana make sure that all
# variables are replaced properly
resource "grafana_dashboard" "main" {
overwrite = true
message = "Updated by Terraform"
config_json = data.jsonnet_file.dashboard.rendered
# Derived values consumed by the dashboard definition in this module.
locals {
# UID of the notification channel attached to alerts (Opsgenie, going by the name).
opsgenie_notification_channel = "NNOynGwVz"
# Alerts are routed to the Opsgenie channel only when the environment is
# "prod"; every other environment gets an empty notification list.
# NOTE(review): the dashboard's `environment` ext var is fed from
# module.this.stage, while this condition reads var.environment — confirm
# both carry the same value in prod, otherwise prod alerts will not notify.
notifications = (
var.environment == "prod" ?
[{ uid = local.opsgenie_notification_channel }] :
[]
)

# Sixth ":"-separated field of the target group ARN, i.e. the resource part
# (expected shape: targetgroup/<name>/<id> — TODO confirm against a real ARN).
target_group = split(":", var.target_group_arn)[5]

# Turns the arn into the format expected by the Grafana provider e.g.
# net/prod-relay-load-balancer/e9a51c46020a0f85
# Implementation: drop the first "/"-separated segment of the ARN and join
# the following three segments back together.
load_balancer = join("/", slice(split("/", var.load_balancer_arn), 1, 4))
}
44 changes: 44 additions & 0 deletions terraform/monitoring/panels/app/cpu.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
local grafana = import '../../grafonnet-lib/grafana.libsonnet';
local defaults = import '../../grafonnet-lib/defaults.libsonnet';

local panels = grafana.panels;
local targets = grafana.targets;
local overrides = defaults.overrides;

{
  // Builds the 'CPU Utilization' timeseries panel for the ECS service.
  //
  // ds:   data source map; only `ds.cloudwatch` is read.
  // vars: shared values; reads `namespace`, `environment`, `notifications`
  //       and `ecs_service_name`.
  //
  // NOTE(review): AWS documents service-level AWS/ECS metrics under the
  // ClusterName + ServiceName dimension pair; confirm targets.cloudwatch does
  // not require an exact dimension match, since only ServiceName is given.
  new(ds, vars)::
    // One AWS/ECS CPUUtilization query; the two series differ only in the
    // statistic and in their display/reference names.
    local cpu_query(statistic, alias, refId) = targets.cloudwatch(
      alias = alias,
      datasource = ds.cloudwatch,
      namespace = 'AWS/ECS',
      metricName = 'CPUUtilization',
      dimensions = { ServiceName: vars.ecs_service_name },
      statistic = statistic,
      refId = refId,
    );

    // Alert definition attached to the panel.
    local cpu_alert = defaults.alerts.cpu(
      namespace = vars.namespace,
      env = vars.environment,
      title = 'ECS',
      notifications = vars.notifications,
    );

    local panel = panels.timeseries(
      title = 'CPU Utilization',
      datasource = ds.cloudwatch,
    );

    panel
    .configure(overrides.cpu(defaults.configuration.timeseries_resource))
    .setAlert(cpu_alert)
    .addTarget(cpu_query('Maximum', 'CPU (Max)', 'CPU_Max'))
    .addTarget(cpu_query('Average', 'CPU (Avg)', 'CPU_Avg')),
}
42 changes: 42 additions & 0 deletions terraform/monitoring/panels/app/memory.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
local grafana = import '../../grafonnet-lib/grafana.libsonnet';
local defaults = import '../../grafonnet-lib/defaults.libsonnet';

local panels = grafana.panels;
local targets = grafana.targets;

{
  // Builds the 'Memory Utilization' timeseries panel for the ECS service.
  //
  // ds:   data source map; only `ds.cloudwatch` is read.
  // vars: shared values; reads `namespace`, `environment`, `notifications`
  //       and `ecs_service_name`.
  //
  // NOTE(review): AWS documents service-level AWS/ECS metrics under the
  // ClusterName + ServiceName dimension pair; confirm targets.cloudwatch does
  // not require an exact dimension match, since only ServiceName is given.
  new(ds, vars)::
    // One AWS/ECS MemoryUtilization query; the two series differ only in the
    // statistic and in their display/reference names.
    local memory_query(statistic, alias, refId) = targets.cloudwatch(
      alias = alias,
      datasource = ds.cloudwatch,
      namespace = 'AWS/ECS',
      metricName = 'MemoryUtilization',
      dimensions = { ServiceName: vars.ecs_service_name },
      statistic = statistic,
      refId = refId,
    );

    // Alert definition attached to the panel.
    local memory_alert = defaults.alerts.memory(
      namespace = vars.namespace,
      env = vars.environment,
      notifications = vars.notifications,
    );

    local panel = panels.timeseries(
      title = 'Memory Utilization',
      datasource = ds.cloudwatch,
    );

    panel
    .configure(defaults.overrides.memory(defaults.configuration.timeseries_resource))
    .setAlert(memory_alert)
    .addTarget(memory_query('Maximum', 'Memory (Max)', 'Mem_Max'))
    .addTarget(memory_query('Average', 'Memory (Avg)', 'Mem_Avg')),
}
Loading

0 comments on commit bc9e0a5

Please sign in to comment.