From 4c9db5af60ee53f0c7b35020e8487fcc0f186268 Mon Sep 17 00:00:00 2001
From: Seth Vargo
Date: Fri, 19 Mar 2021 20:08:38 -0400
Subject: [PATCH] Switch to forward-progress alerting

This changes most of our background jobs to alert when forward progress
is not achieved, instead of alerting on specific failures. This required
increasing the frequency of the modeler and appsync workers because of
how Cloud Monitoring calculates windows.
---
 .../alerts/CloudSchedulerJobFailed.md         | 29 -----------
 .../playbooks/alerts/ForwardProgressFailed.md | 45 +++++++++++++++++
 pkg/controller/appsync/handle_sync.go         |  1 +
 pkg/controller/rotation/metrics.go            |  4 +-
 terraform/alerting/alerts.tf                  | 50 ++++++++-----------
 terraform/alerting/variables.tf               | 27 ++++++++++
 terraform/service_appsync.tf                  |  2 +-
 terraform/service_modeler.tf                  |  2 +-
 terraform/service_stats_puller.tf             |  2 +-
 9 files changed, 100 insertions(+), 62 deletions(-)
 delete mode 100644 docs/playbooks/alerts/CloudSchedulerJobFailed.md
 create mode 100644 docs/playbooks/alerts/ForwardProgressFailed.md

diff --git a/docs/playbooks/alerts/CloudSchedulerJobFailed.md b/docs/playbooks/alerts/CloudSchedulerJobFailed.md
deleted file mode 100644
index 0d8dd05c1..000000000
--- a/docs/playbooks/alerts/CloudSchedulerJobFailed.md
+++ /dev/null
@@ -1,29 +0,0 @@
-# CloudSchedulerJobFailed
-
-This is currently the main alert to track errors overall in the system. You should be familiar with each cron job scheduled below to easily identify the error message when this alert is issued.
-
-Cloud Scheduler is responsible for running the following jobs:
- * `appsync-worker` - Syncs the published list of mobile apps to the server's database.
- * `backupdatabase-worker` - Generates a database backup every interval. Currently configured to every 6 hours.
- * `cleanup-worker` - Runs the database cleanup. Currently configured to run every hour.
- * `e2e-default-workflow` - Runs the [End to End test](../../../../cmd/e2e-runner/main.go) every 10 minutes.
- * `e2e-enx-redirect-workflow` - Runs the End to End workflow using the `enx-redirect` service every 10 minutes.
- * `e2e-revise-workflow` - Runs the same end to end test to the revise endpoint every 10 minutes.
- * `modeler-worker` - Implements periodic statistical calculations. Currently configured to run once a day.
- * `realm-key-rotation-worker` - Job that implements periodic secret rotation. Scheduled to run twice an hour for realm certificates.
- * `rotation-worker` - Implements periodic secret rotation that runs every 5 minutes for tokens.
- * `stats-puller-worker` - Pulls statistics from the key server every 30 minutes.
-
-If Cloud Scheduler produces any ERROR level logs when running any of the above jobs, the system will send an alert notification. It indicates the job is failing for some reason.
-
-## Triage Steps
-
-Go to Logs Explorer, use the following filter:
-
-```
-resource.type="cloud_scheduler_job"
-severity=ERROR
-```
-
-See what the error message is. Depending on the error, you may also need
-to check the corresponding Cloud Run service's log.
diff --git a/docs/playbooks/alerts/ForwardProgressFailed.md b/docs/playbooks/alerts/ForwardProgressFailed.md
new file mode 100644
index 000000000..7a6db42e6
--- /dev/null
+++ b/docs/playbooks/alerts/ForwardProgressFailed.md
@@ -0,0 +1,45 @@
+# ForwardProgressFailed
+
+This alert fires when background jobs have not made forward progress in an acceptable amount of time. The alert will include the name of the job that is failing to make forward progress. The following jobs are invoked in the background:
+
+- `appsync-worker` - Syncs the published list of mobile apps to the server's database.
+
+- `backupdatabase-worker` - Generates a database backup every interval.
+
+- `cleanup-worker` - Performs a variety of cleanup tasks including purging old data, secrets, and keys.
+
+- `e2e-default-workflow` - Runs the [End to End test](../../../../cmd/e2e-runner/main.go).
+
+- `e2e-enx-redirect-workflow` - Runs the End to End workflow using the `enx-redirect` service.
+
+- `e2e-revise-workflow` - Runs the same end-to-end test against the revise endpoint.
+
+- `modeler-worker` - Implements periodic statistical calculations.
+
+- `realm-key-rotation-worker` - Rotates realm signing keys.
+
+- `rotation-worker` - Rotates system signing keys (primarily for tokens).
+
+- `stats-puller-worker` - Imports statistics from the key server.
+
+Each job runs on a different interval. Check your Terraform configuration to see how frequently a specific job runs.
+
+## Triage Steps
+
+When one of the jobs does not return success within a configured interval, this alert will fire. In most cases, this means the job has already failed 2+ times.
+
+To begin triage, locate the logs for the corresponding service name using the Logs Explorer:
+
+```text
+resource.type="cloud_run_revision"
+resource.labels.service_name="<service-name>"
+```
+
+For example, if the failing service was `appsync`:
+
+```text
+resource.type="cloud_run_revision"
+resource.labels.service_name="appsync"
+```
+
+Check for errors in the logs.
diff --git a/pkg/controller/appsync/handle_sync.go b/pkg/controller/appsync/handle_sync.go
index 8f7dc274e..60119c3be 100644
--- a/pkg/controller/appsync/handle_sync.go
+++ b/pkg/controller/appsync/handle_sync.go
@@ -63,6 +63,7 @@ func (c *Controller) HandleSync() http.Handler {
 		// If there are any errors, return them
 		if merr := c.syncApps(ctx, apps); merr != nil {
 			if errs := merr.WrappedErrors(); len(errs) > 0 {
+				logger.Errorw("failed to sync apps", "errors", errs)
 				c.h.RenderJSON(w, http.StatusInternalServerError, &AppSyncResult{
 					OK:     false,
 					Errors: project.ErrorsToStrings(errs),
diff --git a/pkg/controller/rotation/metrics.go b/pkg/controller/rotation/metrics.go
index 5f362d729..4e50a24b7 100644
--- a/pkg/controller/rotation/metrics.go
+++ b/pkg/controller/rotation/metrics.go
@@ -59,14 +59,14 @@ func init() {
 			Aggregation: view.Count(),
 		},
 		{
-			Name:        metricPrefix + "/token_success",
+			Name:        metricPrefix + "/token/success",
 			Description: "Number of token rotation successes",
 			TagKeys:     observability.CommonTagKeys(),
 			Measure:     mTokenSuccess,
 			Aggregation: view.Count(),
 		},
 		{
-			Name:        metricPrefix + "/verification_success",
+			Name:        metricPrefix + "/verification/success",
 			Description: "Number of verification rotation successes",
 			TagKeys:     observability.CommonTagKeys(),
 			Measure:     mVerificationSuccess,
diff --git a/terraform/alerting/alerts.tf b/terraform/alerting/alerts.tf
index 6ab1259c0..cd7c758e1 100644
--- a/terraform/alerting/alerts.tf
+++ b/terraform/alerting/alerts.tf
@@ -14,6 +14,8 @@ locals {
   playbook_prefix = "https://github.com/google/exposure-notifications-verification-server/blob/main/docs/playbooks/alerts"
 
+  custom_prefix = "custom.googleapis.com/opencensus/en-verification-server"
+
   p99_latency_thresholds = {
     adminapi = "5s"
   }
 
@@ -40,7 +42,7 @@ resource "google_monitoring_alert_policy" "E2ETestErrorRatioHigh" {
       duration = "600s"
       query    = <<-EOT
         fetch
-        generic_task :: custom.googleapis.com/opencensus/en-verification-server/e2e/request_count
+        generic_task :: ${local.custom_prefix}/e2e/request_count
         | { NOT_OK: filter metric.result == 'NOT_OK'
         | align
         ;
@@ -112,7 +114,7 @@ resource "google_monitoring_alert_policy" "rate_limited_count" {
       duration = "300s"
       query    = <<-EOT
         fetch
-        generic_task :: custom.googleapis.com/opencensus/en-verification-server/ratelimit/limitware/request_count
+        generic_task :: ${local.custom_prefix}/ratelimit/limitware/request_count
         | filter metric.result = "RATE_LIMITED"
         | align
         | window 1m
@@ -183,34 +185,26 @@ resource "google_monitoring_alert_policy" "StackdriverExportFailed" {
   ]
 }
 
-resource "google_monitoring_alert_policy" "CloudSchedulerJobFailed" {
+resource "google_monitoring_alert_policy" "ForwardProgressFailed" {
+  for_each = var.forward_progress_indicators
+
   project      = var.project
-  display_name = "CloudSchedulerJobFailed"
+  display_name = "ForwardProgressFailed"
   combiner     = "OR"
+
   conditions {
-    display_name = "Cloud Scheduler Job Error Ratio"
+    display_name = each.key
+
     condition_monitoring_query_language {
       duration = "0s"
-      # NOTE: The query below will be evaluated every 30s. It will look at the latest point that
-      # represents the total count of log entries for the past 10m (align delta (10m)),
-      # and fork it to two streams, one representing only ERROR logs, one representing ALL logs,
-      # and do an outer join with default value 0 for the first stream.
-      # Then it computes the first stream / second stream getting the ratio of ERROR logs over ALL logs,
-      # and finally group by. The alert will fire when the error rate was 100% for the last 10 mins.
-      query = <<-EOT
-        fetch cloud_scheduler_job
-        | metric 'logging.googleapis.com/log_entry_count'
-        | align delta(10m)
-        | { t_0: filter metric.severity == 'ERROR'
-          ; t_1: ident }
-        | outer_join [0]
-        | value
-            [t_0_value_log_entry_count_div:
-              div(t_0.value.log_entry_count, t_1.value.log_entry_count)]
-        | group_by [resource.job_id],
-            [t_0_value_log_entry_count_div_sum: sum(t_0_value_log_entry_count_div)]
-        | condition t_0_value_log_entry_count_div_sum >= 1
+      query    = <<-EOT
+        fetch generic_task
+        | metric '${local.custom_prefix}/${each.value.metric}'
+        | align delta_gauge(${each.value.window})
+        | group_by [], [val: aggregate(value.success)]
+        | absent_for ${each.value.window}
       EOT
+
       trigger {
         count = 1
       }
@@ -218,11 +212,11 @@ resource "google_monitoring_alert_policy" "CloudSchedulerJobFailed" {
   }
 
   documentation {
-    content   = "${local.playbook_prefix}/CloudSchedulerJobFailed.md"
+    content   = "${local.playbook_prefix}/ForwardProgressFailed.md"
     mime_type = "text/markdown"
   }
 
-  notification_channels = [for x in values(google_monitoring_notification_channel.non-paging) : x.id]
+  notification_channels = [for x in values(google_monitoring_notification_channel.paging) : x.id]
 
   depends_on = [
     null_resource.manual-step-to-enable-workspace,
@@ -239,7 +233,7 @@ resource "google_monitoring_alert_policy" "UpstreamUserRecreates" {
       duration = "600s"
      query    = <<-EOT
        fetch
-       generic_task :: custom.googleapis.com/opencensus/en-verification-server/user/upstream_user_recreate_count
+       generic_task :: ${local.custom_prefix}/user/upstream_user_recreate_count
        | align rate(5m)
        | every 1m
        | group_by [], [val: sum(value.upstream_user_recreate_count)]
@@ -271,7 +265,7 @@ resource "google_monitoring_alert_policy" "AuthenticatedSMSFailure" {
      duration = "60s"
      query    = <<-EOT
       fetch
-      generic_task :: custom.googleapis.com/opencensus/en-verification-server/api/issue/authenticated_sms_failure_count
+      generic_task :: ${local.custom_prefix}/api/issue/authenticated_sms_failure_count
      | align rate(5m)
      | every 1m
      | group_by [metric.realm], [val: sum(value.authenticated_sms_failure_count)]
diff --git a/terraform/alerting/variables.tf b/terraform/alerting/variables.tf
index e12aa69d4..7a3923a01 100644
--- a/terraform/alerting/variables.tf
+++ b/terraform/alerting/variables.tf
@@ -85,6 +85,33 @@ variable "alert_on_human_decrypted_value" {
   description = "Alert when a human accesses a secret. You must enable DATA_READ audit logs for Cloud KMS."
 }
 
+variable "forward_progress_indicators" {
+  type = map(object({
+    metric = string
+    window = string
+  }))
+
+  default = {
+    // appsync runs every 4h, alert after 2 failures
+    "appsync" = { metric = "appsync/success", window = "485m" },
+
+    // cleanup runs every 1h, alert after 4 failures
+    "cleanup" = { metric = "cleanup/success", window = "245m" },
+
+    // modeler runs every 4h, alert after 2 failures
+    "modeler" = { metric = "modeler/success", window = "485m" },
+
+    // realm-key-rotation runs every 15m, alert after 2 failures
+    "realm-key-rotation" = { metric = "rotation/verification/success", window = "35m" },
+
+    // rotation runs every 30m, alert after 2 failures
+    "rotation" = { metric = "rotation/token/success", window = "65m" },
+
+    // stats-puller runs every 15m, alert after 2 failures
+    "stats-puller" = { metric = "statspuller/success", window = "35m" }
+  }
+}
+
 terraform {
   required_version = ">= 0.14.2"
diff --git a/terraform/service_appsync.tf b/terraform/service_appsync.tf
index 5abf170c0..3b59a76e4 100644
--- a/terraform/service_appsync.tf
+++ b/terraform/service_appsync.tf
@@ -168,7 +168,7 @@ resource "google_cloud_run_service_iam_member" "appsync-invoker" {
 resource "google_cloud_scheduler_job" "appsync-worker" {
   name             = "appsync-worker"
   region           = var.cloudscheduler_location
-  schedule         = "0 */6 * * *"
+  schedule         = "0 */4 * * *"
   time_zone        = "America/Los_Angeles"
   attempt_deadline = "${google_cloud_run_service.appsync.template[0].spec[0].timeout_seconds + 60}s"
diff --git a/terraform/service_modeler.tf b/terraform/service_modeler.tf
index d0b4963ec..e3e344636 100644
--- a/terraform/service_modeler.tf
+++ b/terraform/service_modeler.tf
@@ -161,7 +161,7 @@ resource "google_cloud_run_service_iam_member" "modeler-invoker" {
 resource "google_cloud_scheduler_job" "modeler-worker" {
   name             = "modeler-worker"
   region           = var.cloudscheduler_location
-  schedule         = "0 */6 * * *"
+  schedule         = "0 */4 * * *"
   time_zone        = "America/Los_Angeles"
   attempt_deadline = "${google_cloud_run_service.modeler.template[0].spec[0].timeout_seconds + 60}s"
diff --git a/terraform/service_stats_puller.tf b/terraform/service_stats_puller.tf
index fe9ebc71d..04b43767e 100644
--- a/terraform/service_stats_puller.tf
+++ b/terraform/service_stats_puller.tf
@@ -171,7 +171,7 @@ resource "google_cloud_run_service_iam_member" "stats-puller-invoker" {
 resource "google_cloud_scheduler_job" "stats-puller-worker" {
   name             = "stats-puller-worker"
   region           = var.cloudscheduler_location
-  schedule         = "10,20,30 * * * *"
+  schedule         = "*/15 * * * *"
   time_zone        = "America/Los_Angeles"
   attempt_deadline = "${google_cloud_run_service.stats-puller.template[0].spec[0].timeout_seconds + 60}s"
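
A note for reviewers on how the new alert evaluates: the `for_each` creates one policy per entry in `var.forward_progress_indicators`, and each policy's only condition is that the job's success metric has been absent for the configured window. As a hand-expanded sketch (not actual Terraform output), the `appsync` entry renders to roughly:

```hcl
resource "google_monitoring_alert_policy" "ForwardProgressFailed" {
  # Rendered for each.key = "appsync" (metric = "appsync/success", window = "485m").
  project      = var.project
  display_name = "ForwardProgressFailed"
  combiner     = "OR"

  conditions {
    display_name = "appsync"

    condition_monitoring_query_language {
      duration = "0s"
      query    = <<-EOT
        fetch generic_task
        | metric 'custom.googleapis.com/opencensus/en-verification-server/appsync/success'
        | align delta_gauge(485m)
        | group_by [], [val: aggregate(value.success)]
        | absent_for 485m
      EOT

      trigger {
        count = 1
      }
    }
  }
}
```

Because the condition keys off metric absence rather than ERROR log entries, a job that silently stops being scheduled (and therefore logs nothing at all) still pages, which is the point of forward-progress alerting.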
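If a deployment needs different tolerances, the map can be overridden where the alerting module is instantiated. The module path and surrounding arguments below are illustrative (they depend on how your root module is wired up), and note that setting the variable replaces the entire default map, so list every indicator you still want covered:

```hcl
# Hypothetical root-module wiring; only forward_progress_indicators comes from this change.
module "alerting" {
  source = "./alerting"

  project = var.project

  forward_progress_indicators = {
    // appsync still runs every 4h, but tolerate 3 missed runs (3 * 4h + 5m of slack)
    "appsync" = { metric = "appsync/success", window = "725m" },

    // keep any remaining defaults you still want alerts for, e.g.:
    "cleanup" = { metric = "cleanup/success", window = "245m" },
  }
}
```

The window values follow the same convention as the defaults: the number of tolerated misses times the Cloud Scheduler interval, plus a few minutes of slack. Keeping those windows reasonable is why this patch also moves the appsync and modeler schedules from every 6 hours to every 4 hours and the stats-puller to every 15 minutes.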