diff --git a/terraform/environments/delius-core/modules/components/ldap/ecs_monitoring.tf b/terraform/environments/delius-core/modules/components/ldap/ecs_monitoring.tf
index 60e0ebeeac4..f183f24eda1 100644
--- a/terraform/environments/delius-core/modules/components/ldap/ecs_monitoring.tf
+++ b/terraform/environments/delius-core/modules/components/ldap/ecs_monitoring.tf
@@ -3,48 +3,75 @@ locals {
   cluster_name = split("/", var.ecs_cluster_arn)[1]
 }
 
 # Alarm for high CPU usage
-resource "aws_cloudwatch_metric_alarm" "cpu_over_threshold" {
+resource "aws_cloudwatch_metric_alarm" "ecs_cpu_over_threshold" {
   alarm_name        = "ldap-${var.env_name}-ecs-cpu-threshold"
   alarm_description = "Triggers alarm if ECS CPU crosses a threshold"
-  namespace           = "AWS/ECS"
-  metric_name         = "CPUUtilization"
-  statistic           = "Average"
-  period              = "60"
-  evaluation_periods  = "5"
+  actions_enabled     = true
   alarm_actions       = [var.sns_topic_arn]
   ok_actions          = [var.sns_topic_arn]
-  threshold           = "80"
+  evaluation_periods  = 5
+  datapoints_to_alarm = 5
+  threshold_metric_id = "ad1"
+  comparison_operator = "GreaterThanUpperThreshold"
   treat_missing_data  = "missing"
-  comparison_operator = "GreaterThanThreshold"
-  dimensions = {
-    ClusterName = local.cluster_name
-    ServiceName = local.cluster_name
+  metric_query {
+    id          = "m1"
+    return_data = true
+    metric {
+      namespace   = "AWS/ECS"
+      metric_name = "CPUUtilization"
+      dimensions = {
+        ServiceName = "openldap"
+        ClusterName = local.cluster_name
+      }
+      period = 60
+      stat   = "Average"
+    }
   }
-  tags = var.tags
+  metric_query {
+    id          = "ad1"
+    label       = "CPUUtilization (expected)"
+    return_data = true
+    expression  = "ANOMALY_DETECTION_BAND(m1, 50)"
+  }
 }
 
 # Alarm for high memory usage
 resource "aws_cloudwatch_metric_alarm" "memory_over_threshold" {
   alarm_name        = "ldap-${var.env_name}-ecs-memory-threshold"
   alarm_description = "Triggers alarm if ECS memory crosses a threshold"
-  namespace           = "AWS/ECS"
-  metric_name         = "MemoryUtilization"
-  statistic           = "Average"
-  period              = "60"
-  evaluation_periods  = "5"
+  actions_enabled     = true
   alarm_actions       = [var.sns_topic_arn]
   ok_actions          = [var.sns_topic_arn]
-  threshold           = "80"
+  evaluation_periods  = 5
+  datapoints_to_alarm = 5
+  threshold_metric_id = "ad1"
+  comparison_operator = "GreaterThanUpperThreshold"
   treat_missing_data  = "missing"
-  comparison_operator = "GreaterThanThreshold"
-  dimensions = {
-    ClusterName = local.cluster_name
-    ServiceName = local.cluster_name
+  metric_query {
+    id          = "m1"
+    return_data = true
+    metric {
+      namespace   = "AWS/ECS"
+      metric_name = "MemoryUtilization"
+      dimensions = {
+        ServiceName = "openldap"
+        ClusterName = local.cluster_name
+      }
+      period = 60
+      stat   = "Average"
+    }
   }
+  metric_query {
+    id          = "ad1"
+    label       = "MemoryUtilization (expected)"
+    return_data = true
+    expression  = "ANOMALY_DETECTION_BAND(m1, 20)"
+  }
 }
 
 resource "aws_cloudwatch_log_metric_filter" "log_error_filter" {
@@ -90,33 +117,51 @@ resource "aws_cloudwatch_metric_alarm" "warning_error_volume" {
   comparison_operator = "GreaterThanThreshold"
 }
 
+resource "aws_cloudwatch_metric_alarm" "ecs_running_tasks_less_than_desired" {
+  alarm_name          = "ldap-${var.env_name}-running-tasks-lt-desired"
+  actions_enabled     = true
+  alarm_actions       = [var.sns_topic_arn]
+  ok_actions          = [var.sns_topic_arn]
+  evaluation_periods  = 1
+  datapoints_to_alarm = 1
+  threshold           = 1
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  treat_missing_data  = "missing"
 
-# resource "aws_cloudwatch_metric_alarm" "log_error_warning_alarm" {
-#   alarm_name          = "ldap-${var.env_name}-logged-errors-warning"
-#   alarm_description   = "Error messages were detected in the `ldap` logs."
-#   comparison_operator = "GreaterThanUpperThreshold"
-#   threshold_metric_id = "ad1"
-#   evaluation_periods  = 2
-#   alarm_actions       = [var.sns_topic_arn]
-#   ok_actions          = [var.sns_topic_arn]
-#   actions_enabled     = true
-#
-#   metric_query {
-#     id          = "ad1"
-#     expression  = "ANOMALY_DETECTION_BAND(m1)"
-#     label       = "${aws_cloudwatch_log_metric_filter.log_error_filter.metric_transformation.0.name} (expected)"
-#     return_data = true
-#   }
-#
-#   metric_query {
-#     id          = "m1"
-#     label       = aws_cloudwatch_log_metric_filter.log_error_filter.metric_transformation.0.name
-#     return_data = true
-#     metric {
-#       namespace   = aws_cloudwatch_log_metric_filter.log_error_filter.metric_transformation.0.namespace
-#       metric_name = aws_cloudwatch_log_metric_filter.log_error_filter.metric_transformation.0.name
-#       period      = 300
-#       stat        = "Sum"
-#     }
-#   }
-# }
+  metric_query {
+    id          = "e1"
+    label       = "Expression1"
+    return_data = true
+    expression  = "IF(m1 < m2, 1, 0)"
+  }
+
+  metric_query {
+    id          = "m1"
+    return_data = false
+    metric {
+      namespace   = "ECS/ContainerInsights"
+      metric_name = "RunningTaskCount"
+      dimensions = {
+        ServiceName = "openldap"
+        ClusterName = local.cluster_name
+      }
+      period = 30
+      stat   = "Minimum"
+    }
+  }
+
+  metric_query {
+    id          = "m2"
+    return_data = false
+    metric {
+      namespace   = "ECS/ContainerInsights"
+      metric_name = "DesiredTaskCount"
+      dimensions = {
+        ServiceName = "openldap"
+        ClusterName = local.cluster_name
+      }
+      period = 30
+      stat   = "Maximum"
+    }
+  }
+}
diff --git a/terraform/environments/delius-core/modules/delius_environment/pwm.tf b/terraform/environments/delius-core/modules/delius_environment/pwm.tf
index a8eb3b05ece..282ea1352aa 100644
--- a/terraform/environments/delius-core/modules/delius_environment/pwm.tf
+++ b/terraform/environments/delius-core/modules/delius_environment/pwm.tf
@@ -89,14 +89,30 @@ module "pwm" {
     "JAVA_OPTS" = "-Xmx${floor(var.delius_microservice_configs.pwm.container_memory * 0.75)}m -Xms${floor(var.delius_microservice_configs.pwm.container_memory * 0.25)}m"
   }
   container_vars_env_specific = try(var.delius_microservice_configs.pwm.container_vars_env_specific, {})
-  ignore_changes_service_task_definition = false
+  ignore_changes_service_task_definition = true
 
   providers = {
     aws.core-vpc              = aws.core-vpc
     aws.core-network-services = aws.core-network-services
   }
 
-  log_error_pattern = "ERROR"
+  log_error_pattern = "ERROR"
+
+  log_error_threshold_config = {
+    warning = {
+      threshold = 10
+      period    = 60
+    }
+    critical = {
+      threshold = 20
+      period    = 180
+    }
+  }
+  ecs_monitoring_anomaly_detection_thresholds = {
+    memory = 5
+    cpu    = 20
+  }
+
   sns_topic_arn = aws_sns_topic.delius_core_alarms.arn
 
   frontend_lb_arn_suffix  = aws_lb.delius_core_ancillary.arn_suffix
   enable_platform_backups = var.enable_platform_backups
diff --git a/terraform/environments/delius-core/modules/helpers/delius_microservice/ecs_monitoring.tf b/terraform/environments/delius-core/modules/helpers/delius_microservice/ecs_monitoring.tf
index 42393820dde..ad57c5400fa 100644
--- a/terraform/environments/delius-core/modules/helpers/delius_microservice/ecs_monitoring.tf
+++ b/terraform/environments/delius-core/modules/helpers/delius_microservice/ecs_monitoring.tf
@@ -2,154 +2,168 @@ locals {
   cluster_name = split("/", var.ecs_cluster_arn)[1]
 }
 
+
 # Alarm for high CPU usage
-resource "aws_cloudwatch_metric_alarm" "ecs_cpu_over_threshold" {
-  alarm_name                = "${var.name}-${var.env_name}-ecs-cpu-threshold"
-  comparison_operator       = "GreaterThanUpperThreshold"
-  evaluation_periods        = "5"
-  threshold_metric_id       = "e1"
-  alarm_description         = "Triggers alarm if ECS CPU crosses a threshold"
-  insufficient_data_actions = []
-  alarm_actions             = [var.sns_topic_arn]
-  ok_actions                = [var.sns_topic_arn]
-  treat_missing_data        = "missing"
+resource "aws_cloudwatch_metric_alarm" "ecs_cpu_over_critical_threshold" {
+  alarm_name          = "${var.name}-${var.env_name}-ecs-cpu-critical-threshold"
+  alarm_description   = "Triggers alarm if ECS CPU crosses a critical threshold"
+  namespace           = "AWS/ECS"
+  metric_name         = "CPUUtilization"
+  statistic           = "Average"
+  period              = "60"
+  evaluation_periods  = "5"
+  alarm_actions       = [var.sns_topic_arn]
+  ok_actions          = [var.sns_topic_arn]
+  threshold           = "90"
+  treat_missing_data  = "missing"
+  comparison_operator = "GreaterThanThreshold"
 
-  metric_query {
-    id          = "e1"
-    expression  = "ANOMALY_DETECTION_BAND(m1)"
-    label       = "CpuUtilized (Expected)"
-    return_data = "true"
+  dimensions = {
+    ServiceName = var.name
+    ClusterName = local.cluster_name
   }
+}
+
+# Alarm for high memory usage
+resource "aws_cloudwatch_metric_alarm" "ecs_memory_over_critical_threshold" {
+  alarm_name          = "${var.name}-${var.env_name}-ecs-memory-critical-threshold"
+  alarm_description   = "Triggers alarm if ECS memory crosses a critical threshold"
+  namespace           = "AWS/ECS"
+  metric_name         = "MemoryUtilization"
+  statistic           = "Average"
+  period              = "60"
+  evaluation_periods  = "5"
+  alarm_actions       = [var.sns_topic_arn]
+  ok_actions          = [var.sns_topic_arn]
+  threshold           = "90"
+  treat_missing_data  = "missing"
+  comparison_operator = "GreaterThanThreshold"
+
+  dimensions = {
+    ServiceName = var.name
+    ClusterName = local.cluster_name
+  }
+
+}
+
+# Alarm for high CPU usage anomaly detection
+resource "aws_cloudwatch_metric_alarm" "ecs_cpu_over_threshold" {
+  alarm_name          = "${var.name}-${var.env_name}-ecs-cpu-threshold"
+  alarm_description   = "Triggers alarm if ECS CPU crosses a threshold"
+  actions_enabled     = true
+  alarm_actions       = [var.sns_topic_arn]
+  ok_actions          = [var.sns_topic_arn]
+  evaluation_periods  = 5
+  datapoints_to_alarm = 5
+  threshold_metric_id = "ad1"
+  comparison_operator = "GreaterThanUpperThreshold"
+  treat_missing_data  = "missing"
 
   metric_query {
     id          = "m1"
-    return_data = "true"
+    return_data = true
     metric {
-      metric_name = "CpuUtilized"
-      namespace   = "ECS/ContainerInsights"
-      period      = "60"
-      stat        = "Average"
-      unit        = "Count"
-
+      namespace   = "AWS/ECS"
+      metric_name = "CPUUtilization"
       dimensions = {
+        ServiceName = var.name
         ClusterName = local.cluster_name
       }
+      period = 60
+      stat   = "Average"
     }
   }
-}
-
-# Alarm for high memory usage
-resource "aws_cloudwatch_metric_alarm" "memory_over_threshold" {
-  alarm_name                = "${var.name}-${var.env_name}-ecs-memory-threshold"
-  comparison_operator       = "GreaterThanUpperThreshold"
-  evaluation_periods        = "5"
-  threshold_metric_id       = "e1"
-  alarm_description         = "Triggers alarm if ECS memory crosses a threshold"
-  insufficient_data_actions = []
-  alarm_actions             = [var.sns_topic_arn]
-  ok_actions                = [var.sns_topic_arn]
-  treat_missing_data        = "missing"
 
   metric_query {
-    id          = "e1"
-    expression  = "ANOMALY_DETECTION_BAND(m1)"
-    label       = "MemoryUtilized (Expected)"
-    return_data = "true"
+    id          = "ad1"
+    label       = "CPUUtilization (expected)"
+    return_data = true
+    expression  = "ANOMALY_DETECTION_BAND(m1, ${var.ecs_monitoring_anomaly_detection_thresholds.cpu})"
   }
+}
+
+# Alarm for high memory usage anomaly detection
+resource "aws_cloudwatch_metric_alarm" "ecs_memory_over_threshold" {
+  alarm_name          = "${var.name}-${var.env_name}-ecs-memory-threshold"
+  alarm_description   = "Triggers alarm if ECS memory crosses a threshold"
+  actions_enabled     = true
+  alarm_actions       = [var.sns_topic_arn]
+  ok_actions          = [var.sns_topic_arn]
+  evaluation_periods  = 5
+  datapoints_to_alarm = 5
+  threshold_metric_id = "ad1"
+  comparison_operator = "GreaterThanUpperThreshold"
+  treat_missing_data  = "missing"
 
   metric_query {
     id          = "m1"
-    return_data = "true"
+    return_data = true
     metric {
-      metric_name = "MemoryUtilized"
-      namespace   = "ECS/ContainerInsights"
-      period      = "60"
-      stat        = "Average"
-      unit        = "Count"
-
+      namespace   = "AWS/ECS"
+      metric_name = "MemoryUtilization"
      dimensions = {
+        ServiceName = var.name
         ClusterName = local.cluster_name
       }
+      period = 60
+      stat   = "Average"
     }
   }
-}
-
-// log metric filter for error logs in container that contain the phrase "Error in Helpdesk"
-resource "aws_cloudwatch_log_metric_filter" "error" {
-  name           = "${var.name}-${var.env_name}-application-error"
-  pattern        = "Error in Helpdesk"
-  log_group_name = aws_cloudwatch_log_group.ecs.name
-  metric_transformation {
-    name          = "ErrorCount"
-    namespace     = "${var.name}Metrics"
-    value         = "1"
-    default_value = "0"
+  metric_query {
+    id          = "ad1"
+    label       = "MemoryUtilization (expected)"
+    return_data = true
+    expression  = "ANOMALY_DETECTION_BAND(m1, ${var.ecs_monitoring_anomaly_detection_thresholds.memory})"
   }
 }
 
-resource "aws_cloudwatch_metric_alarm" "high_error_volume" {
-  alarm_name          = "${var.name}-${var.env_name}-high-error-count"
-  alarm_description   = "Triggers alarm if there are more than 5 errors in the last 5 minutes"
-  namespace           = "${var.name}Metrics"
-  metric_name         = "ErrorCount"
-  statistic           = "Sum"
-  period              = "300"
-  evaluation_periods  = "1"
-  alarm_actions       = [var.sns_topic_arn]
-  ok_actions          = [var.sns_topic_arn]
-  threshold           = "10"
-  treat_missing_data  = "missing"
-  comparison_operator = "GreaterThanThreshold"
-}
-
-
-resource "aws_cloudwatch_log_metric_filter" "log_error_filter" {
+resource "aws_cloudwatch_log_metric_filter" "ecs_log_error_filter" {
   count = var.log_error_pattern != "" ? 1 : 0
 
   log_group_name = aws_cloudwatch_log_group.ecs.name
   name           = "${var.name}-${var.env_name}-logged-errors"
   pattern        = var.log_error_pattern
   metric_transformation {
-    name          = "LoggedErrors"
+    name          = "${var.name}-${var.env_name}-logged-errors"
     namespace     = "${var.env_name}/${var.name}"
     value         = 1
     default_value = 0
   }
 }
 
-resource "aws_cloudwatch_metric_alarm" "log_error_warning_alarm" {
+resource "aws_cloudwatch_metric_alarm" "ecs_critical_error_volume" {
   count = var.log_error_pattern != "" ? 1 : 0
 
-  alarm_name          = "${var.name}-${var.env_name}-logged-errors-warning"
-  alarm_description   = "Error messages were detected in the `${var.name}` logs."
-  comparison_operator = "GreaterThanUpperThreshold"
-  threshold_metric_id = "ad1"
-  evaluation_periods  = 2
+  alarm_name          = "${var.name}-${var.env_name}-critical-error-count"
+  alarm_description   = "Critical alarm for log error threshold"
+  namespace           = "${var.env_name}/${var.name}"
+  metric_name         = "${var.name}-${var.env_name}-logged-errors"
+  statistic           = "Sum"
+  period              = var.log_error_threshold_config.critical.period
+  evaluation_periods  = "1"
   alarm_actions       = [var.sns_topic_arn]
   ok_actions          = [var.sns_topic_arn]
-  actions_enabled     = false # Disabled initially, while anomaly detection models are trained
-
-  metric_query {
-    id          = "ad1"
-    expression  = "ANOMALY_DETECTION_BAND(m1)"
-    label       = "${aws_cloudwatch_log_metric_filter.log_error_filter.0.metric_transformation.0.name} (expected)"
-    return_data = true
-  }
-
-  metric_query {
-    id          = "m1"
-    label       = aws_cloudwatch_log_metric_filter.log_error_filter.0.metric_transformation.0.name
-    return_data = true
-    metric {
-      namespace   = aws_cloudwatch_log_metric_filter.log_error_filter.0.metric_transformation.0.namespace
-      metric_name = aws_cloudwatch_log_metric_filter.log_error_filter.0.metric_transformation.0.name
-      period      = 300
-      stat        = "Sum"
-    }
-  }
+  threshold           = var.log_error_threshold_config.critical.threshold
+  treat_missing_data  = "missing"
+  comparison_operator = "GreaterThanThreshold"
 }
 
+resource "aws_cloudwatch_metric_alarm" "ecs_warning_error_volume" {
+  count = var.log_error_pattern != "" ? 1 : 0
+
+  alarm_name          = "${var.name}-${var.env_name}-warning-error-count"
+  alarm_description   = "Warning alarm for log error threshold"
+  namespace           = "${var.env_name}/${var.name}"
+  metric_name         = "${var.name}-${var.env_name}-logged-errors"
+  statistic           = "Sum"
+  period              = var.log_error_threshold_config.warning.period
+  evaluation_periods  = "1"
+  alarm_actions       = [var.sns_topic_arn]
+  ok_actions          = [var.sns_topic_arn]
+  threshold           = var.log_error_threshold_config.warning.threshold
+  treat_missing_data  = "missing"
+  comparison_operator = "GreaterThanThreshold"
+}
 
-resource "aws_cloudwatch_metric_alarm" "healthy_hosts_fatal_alarm" {
+resource "aws_cloudwatch_metric_alarm" "ecs_healthy_hosts_fatal_alarm" {
   alarm_name          = "${var.name}-${var.env_name}-healthy-hosts-fatal"
   alarm_description   = "All `${var.name}` instances stopped responding."
   namespace           = "AWS/ApplicationELB"
@@ -167,9 +181,8 @@
   }
 }
 
-
 # Response time alarms
-resource "aws_cloudwatch_metric_alarm" "response_time_critical_alarm" {
+resource "aws_cloudwatch_metric_alarm" "alb_response_time_critical_alarm" {
   alarm_name          = "${var.name}-${var.env_name}-response-time-critical"
   alarm_description   = "Average response time for the `${var.name}` service exceeded 5 seconds."
   namespace           = "AWS/ApplicationELB"
@@ -188,7 +201,7 @@ resource "aws_cloudwatch_metric_alarm" "response_time_critical_alarm" {
   }
 }
 # Response code alarms
-resource "aws_cloudwatch_metric_alarm" "response_code_5xx_warning_alarm" {
+resource "aws_cloudwatch_metric_alarm" "alb_response_code_5xx_warning_alarm" {
   alarm_name          = "${var.name}-${var.env_name}-5xx-response-warning"
   alarm_description   = "The `${var.name}` service responded with 5xx errors."
   namespace           = "AWS/ApplicationELB"
@@ -206,7 +219,7 @@ resource "aws_cloudwatch_metric_alarm" "response_code_5xx_warning_alarm" {
   }
 }
 
-resource "aws_cloudwatch_metric_alarm" "response_code_5xx_critical_alarm" {
+resource "aws_cloudwatch_metric_alarm" "alb_response_code_5xx_critical_alarm" {
   alarm_name          = "${var.name}-${var.env_name}-5xx-response-critical"
   alarm_description   = "The `${var.name}` service responded with 5xx errors at an elevated rate (over 10/minute)."
   namespace           = "AWS/ApplicationELB"
@@ -224,3 +237,51 @@
   }
 }
 
+resource "aws_cloudwatch_metric_alarm" "ecs_running_tasks_less_than_desired" {
+  alarm_name          = "${var.name}-${var.env_name}-running-tasks-lt-desired"
+  actions_enabled     = true
+  alarm_actions       = [var.sns_topic_arn]
+  ok_actions          = [var.sns_topic_arn]
+  evaluation_periods  = 1
+  datapoints_to_alarm = 1
+  threshold           = 1
+  comparison_operator = "GreaterThanOrEqualToThreshold"
+  treat_missing_data  = "ignore"
+
+  metric_query {
+    id          = "e1"
+    label       = "Expression1"
+    return_data = true
+    expression  = "IF(m1 < m2, 1, 0)"
+  }
+
+  metric_query {
+    id          = "m1"
+    return_data = false
+    metric {
+      namespace   = "ECS/ContainerInsights"
+      metric_name = "RunningTaskCount"
+      dimensions = {
+        ServiceName = var.name
+        ClusterName = local.cluster_name
+      }
+      period = var.ecs_monitoring_running_tasks_less_than_desired_period
+      stat   = "Minimum"
+    }
+  }
+
+  metric_query {
+    id          = "m2"
+    return_data = false
+    metric {
+      namespace   = "ECS/ContainerInsights"
+      metric_name = "DesiredTaskCount"
+      dimensions = {
+        ServiceName = var.name
+        ClusterName = local.cluster_name
+      }
+      period = var.ecs_monitoring_running_tasks_less_than_desired_period
+      stat   = "Maximum"
+    }
+  }
+}
diff --git a/terraform/environments/delius-core/modules/helpers/delius_microservice/templates/dashboard-ecs.json b/terraform/environments/delius-core/modules/helpers/delius_microservice/templates/dashboard-ecs.json
index 46a55650905..2c5a0110832 100644
--- a/terraform/environments/delius-core/modules/helpers/delius_microservice/templates/dashboard-ecs.json
+++ b/terraform/environments/delius-core/modules/helpers/delius_microservice/templates/dashboard-ecs.json
@@ -9,8 +9,8 @@
       "properties": {
         "metrics": [
           [
-            "ECS/ContainerInsights",
-            "CpuUtilized",
+            "AWS/ECS",
+            "CPUUtilization",
             "ServiceName",
             "${ecs_service_name}",
             "ClusterName",
@@ -46,8 +46,8 @@
       "properties": {
         "metrics": [
           [
-            "ECS/ContainerInsights",
-            "MemoryUtilized",
+            "AWS/ECS",
+            "MemoryUtilization",
             "ServiceName",
             "${ecs_service_name}",
             "ClusterName",
diff --git a/terraform/environments/delius-core/modules/helpers/delius_microservice/variables.tf b/terraform/environments/delius-core/modules/helpers/delius_microservice/variables.tf
index f26520c290d..493ccd2c28f 100644
--- a/terraform/environments/delius-core/modules/helpers/delius_microservice/variables.tf
+++ b/terraform/environments/delius-core/modules/helpers/delius_microservice/variables.tf
@@ -524,6 +524,39 @@ variable "log_error_pattern" {
   type        = string
 }
 
+variable "log_error_threshold_config" {
+  description = "Used by log error alarms"
+  type = map(object({
+    threshold = number
+    period    = number
+  }))
+  default = {
+    warning = {
+      threshold = 5
+      period    = 120
+    }
+    critical = {
+      threshold = 10
+      period    = 300
+    }
+  }
+}
+
+variable "ecs_monitoring_anomaly_detection_thresholds" {
+  description = "The threshold for the anomaly detection"
+  type        = map(number)
+  default = {
+    memory = 5
+    cpu    = 5
+  }
+}
+
+variable "ecs_monitoring_running_tasks_less_than_desired_period" {
+  description = "The period for the running tasks less than desired alarm"
+  type        = number
+  default     = 60
+}
+
 variable "sns_topic_arn" {
   description = "Used by alarms"
   type        = string