Skip to content
This repository has been archived by the owner on Jul 12, 2023. It is now read-only.

Commit

Permalink
Simplify alerts (#1535)
Browse files — browse the repository at this point in the history
* Reduced number of SLOs and alerts.

* Added toggles for fast and slow burn alerts.
  • Loading branch information
yuriatgoogle authored Jan 8, 2021
1 parent 1b0a580 commit fbf8aaa
Show file tree
Hide file tree
Showing 5 changed files with 53 additions and 45 deletions.
8 changes: 4 additions & 4 deletions terraform/alerting/module.availability-slo/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,9 @@ resource "google_monitoring_slo" "slo" {
resource "google_monitoring_alert_policy" "fast_burn" {
count = var.enabled ? 1 : 0
project = var.project
display_name = "FastErrorBudgetBurn-${var.service_name}"
display_name = "AvailabilityFastErrorBudgetBurn-${var.service_name}"
combiner = "AND"
enabled = var.enable_alert
enabled = var.enable_fast_burn_alert

conditions {
display_name = "Fast burn over last hour"
Expand Down Expand Up @@ -104,9 +104,9 @@ resource "google_monitoring_alert_policy" "fast_burn" {
resource "google_monitoring_alert_policy" "slow_burn" {
count = var.enabled ? 1 : 0
project = var.project
display_name = "SlowErrorBudgetBurn-${var.service_name}"
display_name = "AvailabilitySlowErrorBudgetBurn-${var.service_name}"
combiner = "AND"
enabled = var.enable_alert
enabled = var.enable_slow_burn_alert

conditions {
display_name = "Slow burn over last 6 hours"
Expand Down
9 changes: 7 additions & 2 deletions terraform/alerting/module.availability-slo/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,13 @@ variable "project" {
type = string
}

variable "enable_alert" {
variable "enable_fast_burn_alert" {
type = bool
description = "Whether to enable the alerts."
description = "Whether to enable the fast error budget burn alert."
}

variable "enable_slow_burn_alert" {
type = bool
description = "Whether to enable the slow error budget burn alert."
}

8 changes: 4 additions & 4 deletions terraform/alerting/module.latency-slo/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,9 @@ resource "google_monitoring_slo" "slo" {
resource "google_monitoring_alert_policy" "fast_burn" {
count = var.enabled ? 1 : 0
project = var.project
display_name = "FastLatencyBudgetBurn-${var.service_name}"
display_name = "LatencyFastLatencyBudgetBurn-${var.service_name}"
combiner = "AND"
enabled = var.enable_alert
enabled = var.enable_fast_burn_alert

conditions {
display_name = "Fast burn over last hour"
Expand Down Expand Up @@ -100,9 +100,9 @@ resource "google_monitoring_alert_policy" "fast_burn" {
resource "google_monitoring_alert_policy" "slow_burn" {
count = var.enabled ? 1 : 0
project = var.project
display_name = "SlowLatencyBudgetBurn-${var.service_name}"
display_name = "LatencySlowLatencyBudgetBurn-${var.service_name}"
combiner = "AND"
enabled = var.enable_alert
enabled = var.enable_slow_burn_alert

conditions {
display_name = "Slow burn over last 6 hours"
Expand Down
9 changes: 7 additions & 2 deletions terraform/alerting/module.latency-slo/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,13 @@ variable "threshold" {
description = "Latency SLO threshold (in ms)."
}

variable "enable_alert" {
variable "enable_fast_burn_alert" {
type = bool
description = "Whether to enable the alerts."
description = "Whether to enable the fast error budget burn alert."
}

variable "enable_slow_burn_alert" {
type = bool
description = "Whether to enable the slow error budget burn alert."
}

64 changes: 31 additions & 33 deletions terraform/alerting/slos.tf
Original file line number Diff line number Diff line change
Expand Up @@ -14,35 +14,31 @@

locals {
default_per_service_slo = {
enable_alert = false
availability_goal = 0.995
enable_latency_slo = false # disabled by default due to low request volume; use latency alert for those
latency_goal = 0.95
latency_threshold = 60000 # 60 seconds, in ms
enable_latency_alert = false
latency_alert_duration = 300000 # 5 minutes, in ms
enable_fast_burn_alert = false
enable_slow_burn_alert = false
availability_goal = 0.995
enable_availability_slo = false
enable_latency_slo = false # disabled by default due to low request volume; use latency alert for those
latency_goal = 0.95
latency_threshold = 60000 # 60 seconds, in ms
enable_latency_alert = false
latency_alert_duration = 300000 # 5 minutes, in ms

}
service_configs = {
adminapi = merge(local.default_per_service_slo,
{ enable_alert = true,
enable_latency_alert = true
{ enable_latency_alert = true
latency_threshold = 6000 })
apiserver = merge(local.default_per_service_slo,
{ enable_alert = true,
enable_latency_alert = true,
latency_threshold = 2000 })
appsync = local.default_per_service_slo
cleanup = local.default_per_service_slo
e2e-runner = local.default_per_service_slo
enx-redirect = merge(local.default_per_service_slo,
{ enable_alert = true,
enable_latency_alert = true,
latency_threshold = 2000 })
modeler = local.default_per_service_slo
{ enable_availability_slo = true,
enable_fast_burn_alert = true })
appsync = local.default_per_service_slo
cleanup = local.default_per_service_slo
e2e-runner = local.default_per_service_slo
enx-redirect = local.default_per_service_slo
modeler = local.default_per_service_slo
server = merge(local.default_per_service_slo,
{ enable_alert = true,
enable_latency_alert = true,
{ enable_latency_alert = true,
latency_threshold = 2000 })
}
}
Expand Down Expand Up @@ -77,15 +73,16 @@ module "availability-slos" {
source = "./module.availability-slo"

project = var.project
enabled = var.https-forwarding-rule != ""
notification_channels = google_monitoring_notification_channel.channels

for_each = merge(local.service_configs, var.slo_thresholds_overrides)

custom_service_id = each.key
service_name = each.key
goal = each.value.availability_goal
enable_alert = each.value.enable_alert
enabled = each.value.enable_availability_slo
custom_service_id = each.key
service_name = each.key
goal = each.value.availability_goal
enable_fast_burn_alert = each.value.enable_fast_burn_alert
enable_slow_burn_alert = each.value.enable_slow_burn_alert
}

module "latency-slos" {
Expand All @@ -97,10 +94,11 @@ module "latency-slos" {

for_each = merge(local.service_configs, var.slo_thresholds_overrides)

enabled = each.value.enable_latency_slo
custom_service_id = each.key
service_name = each.key
goal = each.value.latency_goal
threshold = each.value.latency_threshold
enable_alert = each.value.enable_alert
enabled = each.value.enable_latency_slo
custom_service_id = each.key
service_name = each.key
goal = each.value.latency_goal
threshold = each.value.latency_threshold
enable_fast_burn_alert = each.value.enable_fast_burn_alert
enable_slow_burn_alert = each.value.enable_slow_burn_alert
}

0 comments on commit fbf8aaa

Please sign in to comment.