Skip to content

Commit

Permalink
delius core microservices - Update ecs_monitoring.tf (#6770)
Browse files Browse the repository at this point in the history
* Update ecs_monitoring.tf

* correct metric namespaces

* correct mem metric

* Update ecs_monitoring.tf

* Update ecs_monitoring.tf

* Update ecs_monitoring.tf

* Update ecs_monitoring.tf

* Update ecs_monitoring.tf

* Update ecs_monitoring.tf

* Update ecs_monitoring.tf

* Update ecs_monitoring.tf

* Update ecs_monitoring.tf

* Update ecs_monitoring.tf

* Update ecs_monitoring.tf

* ldap ecs alarms

* ldap anomaly alarms

* threshold

* log error thresholds

* detection bands customisation

* include blanket alarm for 80%

* typo

* times

* customisable

* tuning

* not sum

* timings

* Update ecs_monitoring.tf
  • Loading branch information
georgepstaylor authored Jun 29, 2024
1 parent 2522497 commit d1e1e35
Show file tree
Hide file tree
Showing 5 changed files with 319 additions and 164 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3,48 +3,75 @@ locals {
cluster_name = split("/", var.ecs_cluster_arn)[1]
}
# Alarm for high CPU usage on the "openldap" ECS service.
# NOTE(review): this span is a rendered diff without +/- markers. Two variants
# of the alarm are interleaved below: a static-threshold variant (top-level
# namespace/metric_name/statistic/period, threshold = "80",
# GreaterThanThreshold, dimensions, tags) and an anomaly-detection variant
# (threshold_metric_id + metric_query blocks). Presumably the static variant is
# the removed side and the anomaly-detection variant is the added side --
# confirm against the actual file before copying anything from here.
resource "aws_cloudwatch_metric_alarm" "cpu_over_threshold" {
resource "aws_cloudwatch_metric_alarm" "ecs_cpu_over_threshold" {
alarm_name = "ldap-${var.env_name}-ecs-cpu-threshold"
# NOTE(review): the description still says "crosses a threshold"; in the
# anomaly-detection variant the comparison is against the band "ad1" rather
# than a fixed number -- consider rewording.
alarm_description = "Triggers alarm if ECS CPU crosses a threshold"
namespace = "AWS/ECS"
metric_name = "CPUUtilization"
statistic = "Average"
period = "60"
evaluation_periods = "5"
actions_enabled = true
# Both the ALARM and OK transitions notify the same SNS topic.
alarm_actions = [var.sns_topic_arn]
ok_actions = [var.sns_topic_arn]
threshold = "80"
# All 5 of the 5 evaluated datapoints must breach for the alarm to fire.
evaluation_periods = 5
datapoints_to_alarm = 5
# Compare the metric against the upper edge of the band returned by the
# metric_query whose id is "ad1".
threshold_metric_id = "ad1"
comparison_operator = "GreaterThanUpperThreshold"
treat_missing_data = "missing"
comparison_operator = "GreaterThanThreshold"

dimensions = {
ClusterName = local.cluster_name
ServiceName = local.cluster_name
# m1: per-minute average CPUUtilization for service "openldap" in the cluster
# named by local.cluster_name.
metric_query {
id = "m1"
return_data = true
metric {
namespace = "AWS/ECS"
metric_name = "CPUUtilization"
dimensions = {
ServiceName = "openldap"
ClusterName = local.cluster_name
}
period = 60
stat = "Average"
}
}

tags = var.tags
# ad1: anomaly-detection band computed from m1.
# NOTE(review): the second argument (50) is hard-coded here; per the CloudWatch
# metric math docs it controls the band width -- confirm 50 is the intended
# value for CPU.
metric_query {
id = "ad1"
label = "CPUUtilization (expected)"
return_data = true
expression = "ANOMALY_DETECTION_BAND(m1, 50)"
}
}

# Alarm for high memory usage on the "openldap" ECS service.
# NOTE(review): this span is a rendered diff without +/- markers. A
# static-threshold variant (top-level namespace/metric_name/statistic/period,
# threshold = "80", GreaterThanThreshold, dimensions) is interleaved with an
# anomaly-detection variant (threshold_metric_id + metric_query blocks).
# Presumably the static variant is the removed side -- confirm against the
# actual file.
resource "aws_cloudwatch_metric_alarm" "memory_over_threshold" {
alarm_name = "ldap-${var.env_name}-ecs-memory-threshold"
# NOTE(review): the description still says "crosses a threshold"; in the
# anomaly-detection variant the comparison is against the band "ad1" --
# consider rewording.
alarm_description = "Triggers alarm if ECS memory crosses a threshold"
namespace = "AWS/ECS"
metric_name = "MemoryUtilization"
statistic = "Average"
period = "60"
evaluation_periods = "5"
actions_enabled = true
# Both the ALARM and OK transitions notify the same SNS topic.
alarm_actions = [var.sns_topic_arn]
ok_actions = [var.sns_topic_arn]
threshold = "80"
# All 5 of the 5 evaluated datapoints must breach for the alarm to fire.
evaluation_periods = 5
datapoints_to_alarm = 5
# Compare the metric against the upper edge of the band returned by the
# metric_query whose id is "ad1".
threshold_metric_id = "ad1"
comparison_operator = "GreaterThanUpperThreshold"
treat_missing_data = "missing"
comparison_operator = "GreaterThanThreshold"

dimensions = {
ClusterName = local.cluster_name
ServiceName = local.cluster_name
# m1: per-minute average MemoryUtilization for service "openldap" in the
# cluster named by local.cluster_name.
metric_query {
id = "m1"
return_data = true
metric {
namespace = "AWS/ECS"
metric_name = "MemoryUtilization"
dimensions = {
ServiceName = "openldap"
ClusterName = local.cluster_name
}
period = 60
stat = "Average"
}
}

# ad1: anomaly-detection band computed from m1.
# NOTE(review): the second argument (20) is hard-coded here and differs from the
# CPU alarm's 50 -- confirm both values are intentional.
metric_query {
id = "ad1"
label = "MemoryUtilization (expected)"
return_data = true
expression = "ANOMALY_DETECTION_BAND(m1, 20)"
}
}

resource "aws_cloudwatch_log_metric_filter" "log_error_filter" {
Expand Down Expand Up @@ -90,33 +117,51 @@ resource "aws_cloudwatch_metric_alarm" "warning_error_volume" {
comparison_operator = "GreaterThanThreshold"
}

# Alarm that fires when the "openldap" service has fewer running tasks than
# desired. The returned expression e1 evaluates to 1 when m1 < m2 and 0
# otherwise; the alarm triggers when e1 >= 1 for 1 out of 1 evaluation periods.
# NOTE(review): this span is a rendered diff; the commented-out
# "log_error_warning_alarm" lines below appear inside this resource's body only
# because diff lines are interleaved -- presumably they were removed by this
# commit. Confirm against the actual file.
resource "aws_cloudwatch_metric_alarm" "ecs_running_tasks_less_than_desired" {
alarm_name = "ldap-${var.env_name}-running-tasks-lt-desired"
actions_enabled = true
# Both the ALARM and OK transitions notify the same SNS topic.
alarm_actions = [var.sns_topic_arn]
ok_actions = [var.sns_topic_arn]
evaluation_periods = 1
datapoints_to_alarm = 1
threshold = 1
comparison_operator = "GreaterThanOrEqualToThreshold"
treat_missing_data = "missing"

# resource "aws_cloudwatch_metric_alarm" "log_error_warning_alarm" {
# alarm_name = "ldap-${var.env_name}-logged-errors-warning"
# alarm_description = "Error messages were detected in the `ldap` logs."
# comparison_operator = "GreaterThanUpperThreshold"
# threshold_metric_id = "ad1"
# evaluation_periods = 2
# alarm_actions = [var.sns_topic_arn]
# ok_actions = [var.sns_topic_arn]
# actions_enabled = true
#
# metric_query {
# id = "ad1"
# expression = "ANOMALY_DETECTION_BAND(m1)"
# label = "${aws_cloudwatch_log_metric_filter.log_error_filter.metric_transformation.0.name} (expected)"
# return_data = true
# }
#
# metric_query {
# id = "m1"
# label = aws_cloudwatch_log_metric_filter.log_error_filter.metric_transformation.0.name
# return_data = true
# metric {
# namespace = aws_cloudwatch_log_metric_filter.log_error_filter.metric_transformation.0.namespace
# metric_name = aws_cloudwatch_log_metric_filter.log_error_filter.metric_transformation.0.name
# period = 300
# stat = "Sum"
# }
# }
# }
# e1: the only query with return_data = true, so it is the value the alarm
# evaluates. Yields 1 when running (m1) is below desired (m2), else 0.
metric_query {
id = "e1"
label = "Expression1"
return_data = true
expression = "IF(m1 < m2, 1, 0)"
}

# m1: minimum RunningTaskCount over each period (input to e1 only).
# NOTE(review): period = 30 is used for both inputs -- confirm these
# ECS/ContainerInsights metrics are published at a resolution that supports a
# 30-second period.
metric_query {
id = "m1"
return_data = false
metric {
namespace = "ECS/ContainerInsights"
metric_name = "RunningTaskCount"
dimensions = {
ServiceName = "openldap"
ClusterName = local.cluster_name
}
period = 30
stat = "Minimum"
}
}

# m2: maximum DesiredTaskCount over each period (input to e1 only).
metric_query {
id = "m2"
return_data = false
metric {
namespace = "ECS/ContainerInsights"
metric_name = "DesiredTaskCount"
dimensions = {
ServiceName = "openldap"
ClusterName = local.cluster_name
}
period = 30
stat = "Maximum"
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -89,14 +89,30 @@ module "pwm" {
"JAVA_OPTS" = "-Xmx${floor(var.delius_microservice_configs.pwm.container_memory * 0.75)}m -Xms${floor(var.delius_microservice_configs.pwm.container_memory * 0.25)}m"
}
container_vars_env_specific = try(var.delius_microservice_configs.pwm.container_vars_env_specific, {})
ignore_changes_service_task_definition = false
ignore_changes_service_task_definition = true

providers = {
aws.core-vpc = aws.core-vpc
aws.core-network-services = aws.core-network-services
}

log_error_pattern = "ERROR"
log_error_pattern = "ERROR"

log_error_threshold_config = {
warning = {
threshold = 10
period = 60
}
critical = {
threshold = 20
period = 180
}
}
ecs_monitoring_anomaly_detection_thresholds = {
memory = 5
cpu = 20
}

sns_topic_arn = aws_sns_topic.delius_core_alarms.arn
frontend_lb_arn_suffix = aws_lb.delius_core_ancillary.arn_suffix
enable_platform_backups = var.enable_platform_backups
Expand Down
Loading

0 comments on commit d1e1e35

Please sign in to comment.