From 3df9b7a707e4bce503038c2d124c0da196c32dcb Mon Sep 17 00:00:00 2001 From: Dave Steinberg Date: Mon, 24 Apr 2023 17:14:58 +0000 Subject: [PATCH 1/4] add an autoscaling module for app-level scaling + setup a cpu-based policy as a starting point --- tf/envs/staging/.terraform.lock.hcl | 15 ++++ tf/envs/staging/main.tf | 35 +++++++++ tf/modules/app/main.tf | 4 ++ tf/modules/autoscale/autoscale.tf | 108 ++++++++++++++++++++++++++++ tf/modules/autoscale/variables.tf | 60 ++++++++++++++++ 5 files changed, 222 insertions(+) create mode 100644 tf/modules/autoscale/autoscale.tf create mode 100644 tf/modules/autoscale/variables.tf diff --git a/tf/envs/staging/.terraform.lock.hcl b/tf/envs/staging/.terraform.lock.hcl index 6a09c5055..96e914980 100644 --- a/tf/envs/staging/.terraform.lock.hcl +++ b/tf/envs/staging/.terraform.lock.hcl @@ -19,5 +19,20 @@ provider "registry.terraform.io/hashicorp/aws" { "h1:xstX5ub6MZ+PSrrZbB0ElhThX8N2ShQThR3m8nMZ928=", "h1:yubsPHpcKUBeoMPFi1mHAyqDaLCucjKmswHmBMMySC4=", "h1:zN2UtLLw3WTQlG9bnqjTozKstMMdy/hx9DQTHZOiGV8=", + "zh:092614f767995140cf444cad1a97fb569885db16cb1c1dc9ee56e801232bac29", + "zh:142e262fbb162c8a86493cfab4aadaf96a8572f1a3a6be444d465a4aee377dba", + "zh:1c58c8cb9934dc98a2dd9dc48a8a3d94a14c2c3f2bc0136410a9344938d4ecfb", + "zh:36efdf30cd52b92668cf6f912538c6e176b1a140a00e63ee0f753b85878c8b53", + "zh:4c631e367fd69692b57f85564de561733380e9674e146d3a7725b781ec5db944", + "zh:57ace91cb022ec944ad3af9272b78f48e7f71e9d1bf113ca56c6ce8deb4341fe", + "zh:7fc9581b530ebf28fda80c62c20c6fbbb936a878c24872349eb107b7f198e64c", + "zh:8280cd8f04c31af83f3e74f07704b258fbaa8bf1d70679d5ea2f0cbda2571de2", + "zh:8e6217a9443b651d4349d75bdc37af9298970d854bf515d8c305919b193e4a38", + "zh:9b12af85486a96aedd8d7984b0ff811a4b42e3d88dad1a3fb4c0b580d04fa425", + "zh:9c62bc4a9034a6caf15b8863da6f5a621b947d5fca161b4bd2f2e8e78eec8e3b", + "zh:9d0a45cd4a031d19ee14c0a15f25df6359dcd342ccf4e2ee4751b3ee496edb57", + "zh:ab47f4e300c46dc1757e2b8d8d749f34f044f219479106a00bf40572091a8999", + "zh:b55119290497dda96ab9ba3dca00d648808dc99d18960ad8aa875775bfaf95db", + "zh:df513941e6979f557edcac28d84bd91af9786104b0deba45b3b259a5ad215897", ] } diff --git a/tf/envs/staging/main.tf b/tf/envs/staging/main.tf index 9f56da883..c1dae47a7 100644 --- a/tf/envs/staging/main.tf +++ b/tf/envs/staging/main.tf @@ -58,3 +58,38 @@ module "app" { module "env_defns" { source = "../../modules/env_defns" } + +module "autoscaling" { + source = "../../modules/autoscale" + + env = module.envconfig.env + ecs_cluster = module.envconfig.ecs_cluster + service_name = module.app.service_name + + metrics = { + CPUUtilization = { + metric_name = "CPUUtilization" + adjustment_type = "ChangeInCapacity" + cooldown = 60 + datapoints_to_alarm = 1 + evaluation_periods = 1 + metric_aggregation_type = "Average" + period = 60 + statistic = "Average" + + down = { + comparison_operator = "LessThanThreshold" + metric_interval_upper_bound = 0 + scaling_adjustment = -1 + threshold = 40 + } + + up = { + comparison_operator = "GreaterThanOrEqualToThreshold" + metric_interval_lower_bound = 1 + scaling_adjustment = 1 + threshold = 70 + } + } + } +} diff --git a/tf/modules/app/main.tf b/tf/modules/app/main.tf index 3e039672a..604af0085 100644 --- a/tf/modules/app/main.tf +++ b/tf/modules/app/main.tf @@ -111,6 +111,10 @@ resource "aws_ecs_service" "api" { ] } } +output service_name { + description = "ECS service name" + value = aws_ecs_service.api.name +} data "aws_region" "current" {} diff --git a/tf/modules/autoscale/autoscale.tf b/tf/modules/autoscale/autoscale.tf new file mode 100644 index 000000000..eb47d8ec5 --- /dev/null +++ b/tf/modules/autoscale/autoscale.tf @@ -0,0 +1,108 @@ +# Originally inspired by: https://github.com/techservicesillinois/terraform-aws-ecs-service/blob/main/autoscale.tf + +data "aws_partition" "current" {} +data "aws_region" "current" {} +data "aws_caller_identity" "current" {} + +resource "aws_appautoscaling_target" "default" { + max_capacity = var.max_capacity + min_capacity = var.min_capacity + resource_id = format("service/%s/%s", var.ecs_cluster, var.service_name) + role_arn = format("arn:aws:iam::%s:role/aws-service-role/ecs.application-autoscaling.amazonaws.com/AWSServiceRoleForApplicationAutoScaling_ECSService", data.aws_caller_identity.current.account_id) + scalable_dimension = "ecs:service:DesiredCount" + service_namespace = "ecs" +} + +# Scale-down alarm for each metric. + +resource "aws_cloudwatch_metric_alarm" "down" { + for_each = var.metrics + + actions_enabled = each.value.actions_enabled + alarm_actions = [aws_appautoscaling_policy.down[each.key].arn] + alarm_description = format("scale-down alarm for %s on %s metric", var.service_name, each.key) + alarm_name = format("ecs-%s-%s-down", var.service_name, lower(each.key)) + comparison_operator = each.value.down.comparison_operator + datapoints_to_alarm = each.value.datapoints_to_alarm + evaluation_periods = each.value.evaluation_periods + metric_name = each.key + namespace = "AWS/ECS" + period = each.value.period + statistic = each.value.statistic + threshold = each.value.down.threshold + + dimensions = { + ClusterName = var.ecs_cluster + ServiceName = var.service_name + } +} + +# Scale-up alarm for each metric. + +resource "aws_cloudwatch_metric_alarm" "up" { + for_each = var.metrics + + actions_enabled = each.value.actions_enabled + alarm_actions = [aws_appautoscaling_policy.up[each.key].arn] + alarm_description = format("scale-up alarm for %s on %s metric", var.service_name, each.key) + alarm_name = format("ecs-%s-%s-up", var.service_name, lower(each.key)) + comparison_operator = each.value.up.comparison_operator + datapoints_to_alarm = each.value.datapoints_to_alarm + evaluation_periods = each.value.evaluation_periods + metric_name = each.key + namespace = "AWS/ECS" + period = each.value.period + statistic = each.value.statistic + threshold = each.value.up.threshold + + dimensions = { + ClusterName = var.ecs_cluster + ServiceName = var.service_name + } +} + +# Scale-down policy for each metric. + +resource "aws_appautoscaling_policy" "down" { + for_each = var.metrics + + name = format("ecs-%s-%s-down", var.service_name, lower(each.key)) + resource_id = aws_appautoscaling_target.default.resource_id + scalable_dimension = aws_appautoscaling_target.default.scalable_dimension + service_namespace = aws_appautoscaling_target.default.service_namespace + + step_scaling_policy_configuration { + adjustment_type = each.value.adjustment_type + cooldown = each.value.cooldown + metric_aggregation_type = each.value.metric_aggregation_type + + step_adjustment { + metric_interval_lower_bound = each.value.down.metric_interval_lower_bound + metric_interval_upper_bound = each.value.down.metric_interval_upper_bound + scaling_adjustment = each.value.down.scaling_adjustment + } + } +} + +# Scale-up policy for each metric. + +resource "aws_appautoscaling_policy" "up" { + for_each = var.metrics + + name = format("ecs-%s-%s-up", var.service_name, lower(each.key)) + resource_id = aws_appautoscaling_target.default.resource_id + scalable_dimension = aws_appautoscaling_target.default.scalable_dimension + service_namespace = aws_appautoscaling_target.default.service_namespace + + step_scaling_policy_configuration { + adjustment_type = each.value.adjustment_type + cooldown = each.value.cooldown + metric_aggregation_type = each.value.metric_aggregation_type + + step_adjustment { + metric_interval_lower_bound = each.value.up.metric_interval_lower_bound + metric_interval_upper_bound = each.value.up.metric_interval_upper_bound + scaling_adjustment = each.value.up.scaling_adjustment + } + } +} diff --git a/tf/modules/autoscale/variables.tf b/tf/modules/autoscale/variables.tf new file mode 100644 index 000000000..82d9f8682 --- /dev/null +++ b/tf/modules/autoscale/variables.tf @@ -0,0 +1,60 @@ +variable env { + type = string + description = "Environment name" +} + +variable ecs_cluster { + type = string +} + +variable service_name { + type = string +} + +variable min_capacity { + type = number + default = 1 +} + +variable max_capacity { + type = number + default = 1 +} + +variable "metrics" { + description = "Autoscaling metrics configuration" + type = map( + object({ + metric_name = string + actions_enabled = optional(bool, true) + adjustment_type = string + cooldown = optional(number, null) + datapoints_to_alarm = optional(number, null) + evaluation_periods = number + metric_aggregation_type = string + period = number + statistic = string + # TODO: Validate that either lower or upper bound are non-null. + down = object({ + comparison_operator = string + metric_interval_lower_bound = optional(number, null) + metric_interval_upper_bound = optional(number, null) + scaling_adjustment = number + threshold = number + }) + # TODO: Validate that either lower or upper bound are non-null. + up = object({ + comparison_operator = string + metric_interval_lower_bound = optional(number, null) + metric_interval_upper_bound = optional(number, null) + scaling_adjustment = number + threshold = number + }) + }) + ) + default = null + validation { + condition = var.metrics == null || try(length(var.metrics) > 0, true) + error_message = "The 'metrics' block must have one or more metrics" + } +} From a9a46790a948de74c324c1e2ee75710030537404 Mon Sep 17 00:00:00 2001 From: Dave Steinberg Date: Mon, 24 Apr 2023 19:41:44 +0000 Subject: [PATCH 2/4] switch to a target tracking policy type --- tf/envs/staging/main.tf | 27 ++----- tf/modules/autoscale/autoscale.tf | 125 ++++++++++-------------------- tf/modules/autoscale/variables.tf | 40 ++++------ 3 files changed, 60 insertions(+), 132 deletions(-) diff --git a/tf/envs/staging/main.tf b/tf/envs/staging/main.tf index c1dae47a7..16e51b83a 100644 --- a/tf/envs/staging/main.tf +++ b/tf/envs/staging/main.tf @@ -66,30 +66,15 @@ module "autoscaling" { ecs_cluster = module.envconfig.ecs_cluster service_name = module.app.service_name + min_capacity = 1 + max_capacity = 10 + metrics = { CPUUtilization = { - metric_name = "CPUUtilization" - adjustment_type = "ChangeInCapacity" - cooldown = 60 - datapoints_to_alarm = 1 - evaluation_periods = 1 - metric_aggregation_type = "Average" - period = 60 - statistic = "Average" - - down = { - comparison_operator = "LessThanThreshold" - metric_interval_upper_bound = 0 - scaling_adjustment = -1 - threshold = 40 + target = 60 + predefined_metric = { + type = "ECSServiceAverageCPUUtilization" } - - up = { - comparison_operator = "GreaterThanOrEqualToThreshold" - metric_interval_lower_bound = 1 - scaling_adjustment = 1 - threshold = 70 - } } } } diff --git a/tf/modules/autoscale/autoscale.tf b/tf/modules/autoscale/autoscale.tf index eb47d8ec5..8c95149fb 100644 --- a/tf/modules/autoscale/autoscale.tf +++ b/tf/modules/autoscale/autoscale.tf @@ -4,105 +4,58 @@ data "aws_partition" "current" {} data "aws_region" "current" {} data "aws_caller_identity" "current" {} +resource "aws_iam_role" "ecs-autoscale-role" { + name = format("%s-%s-app-scaling", var.env, var.service_name) + + assume_role_policy = < Date: Mon, 24 Apr 2023 20:53:29 +0000 Subject: [PATCH 3/4] tweak our variable formatting to support using predefined or custom metrics optionally --- tf/envs/staging/main.tf | 25 ++++++++++++++++++++++--- tf/modules/autoscale/autoscale.tf | 31 +++++++++++++++++++++++++++++-- tf/modules/autoscale/variables.tf | 9 +++++---- 3 files changed, 56 insertions(+), 9 deletions(-) diff --git a/tf/envs/staging/main.tf b/tf/envs/staging/main.tf index 16e51b83a..12a992be5 100644 --- a/tf/envs/staging/main.tf +++ b/tf/envs/staging/main.tf @@ -70,11 +70,30 @@ module "autoscaling" { max_capacity = 10 metrics = { - CPUUtilization = { + CPUAverage = { target = 60 - predefined_metric = { + predefined_metric = [{ type = "ECSServiceAverageCPUUtilization" - } + }] + } + MemoryAverage = { + target = 60 + predefined_metric = [{ + type = "ECSServiceAverageMemoryUtilization" + }] + } + CPUSpike = { + target = 85 + customized_metric = [{ + metric_name = "CPUUtilization" + namespace = "AWS/ECS" + statistic = "Maximum" + unit = "Percent" + dimensions = { + "ClusterName" = module.envconfig.ecs_cluster + "ServiceName" = module.app.service_name + } + }] } } } diff --git a/tf/modules/autoscale/autoscale.tf b/tf/modules/autoscale/autoscale.tf index 8c95149fb..08cef52e2 100644 --- a/tf/modules/autoscale/autoscale.tf +++ b/tf/modules/autoscale/autoscale.tf @@ -53,8 +53,35 @@ resource "aws_appautoscaling_policy" "default" { scale_in_cooldown = each.value.scale_in_cooldown scale_out_cooldown = each.value.scale_out_cooldown - predefined_metric_specification { - predefined_metric_type = each.value.predefined_metric.type + dynamic predefined_metric_specification { + for_each = each.value.predefined_metric + iterator = metric + + content { + predefined_metric_type = metric.value.type + } + } + + dynamic customized_metric_specification { + for_each = each.value.customized_metric + iterator = metric + + content { + metric_name = metric.value.metric_name + namespace = metric.value.namespace + statistic = metric.value.statistic + unit = metric.value.unit + + dynamic dimensions { + for_each = metric.value.dimensions + iterator = dim + + content { + name = dim.key + value = dim.value + } + } + } } } } diff --git a/tf/modules/autoscale/variables.tf b/tf/modules/autoscale/variables.tf index ba4e75fa3..db2a8bd66 100644 --- a/tf/modules/autoscale/variables.tf +++ b/tf/modules/autoscale/variables.tf @@ -30,16 +30,17 @@ variable "metrics" { scale_in_cooldown = optional(number, 120) scale_out_cooldown = optional(number, 120) - predefined_metric = optional(object({ + predefined_metric = optional(list(object({ type = string resource_label = optional(string, null) - }), null) - custom_metric_type = optional(object({ + })), []) + customized_metric = optional(list(object({ metric_name = string namespace = string statistic = optional(string, "Average") unit = optional(string, null) - }), null) + dimensions = optional(map(string), {}) + })), []) }) ) default = null From 7b862405b21d8c2f67a8348712cbec5ddab2667a Mon Sep 17 00:00:00 2001 From: Dave Steinberg Date: Tue, 25 Apr 2023 14:23:17 +0000 Subject: [PATCH 4/4] bump desired count / min capacity to 2 + add an ignore so TF doesn't undo the scaling --- tf/envs/staging/main.tf | 2 +- tf/modules/app/main.tf | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tf/envs/staging/main.tf b/tf/envs/staging/main.tf index 12a992be5..ebcecb06d 100644 --- a/tf/envs/staging/main.tf +++ b/tf/envs/staging/main.tf @@ -66,7 +66,7 @@ module "autoscaling" { ecs_cluster = module.envconfig.ecs_cluster service_name = module.app.service_name - min_capacity = 1 + min_capacity = 2 max_capacity = 10 metrics = { diff --git a/tf/modules/app/main.tf b/tf/modules/app/main.tf index 604af0085..4697c6ce5 100644 --- a/tf/modules/app/main.tf +++ b/tf/modules/app/main.tf @@ -78,7 +78,7 @@ resource "aws_ecs_service" "api" { name = "capp-api" cluster = var.ecs_cluster task_definition = aws_ecs_task_definition.api.arn - desired_count = 1 + desired_count = 2 health_check_grace_period_seconds = 30 enable_ecs_managed_tags = true propagate_tags = "SERVICE" @@ -107,7 +107,8 @@ resource "aws_ecs_service" "api" { # could explicitly set the strategy to be the default strategy, which would be acceptable. lifecycle { ignore_changes = [ - capacity_provider_strategy + capacity_provider_strategy, + desired_count ] } }