This repository has been archived by the owner on Jul 12, 2023. It is now read-only.

Switch to forward-progress alerting #1929

Merged · 1 commit · Mar 20, 2021
29 changes: 0 additions & 29 deletions docs/playbooks/alerts/CloudSchedulerJobFailed.md

This file was deleted.

45 changes: 45 additions & 0 deletions docs/playbooks/alerts/ForwardProgressFailed.md
@@ -0,0 +1,45 @@
# ForwardProgressFailed

This alert fires when a background job has not made forward progress in an acceptable amount of time. The alert includes the name of the job that is failing to make forward progress. The background jobs are:

- `appsync-worker` - Syncs the published list of mobile apps to the server's database.

- `backupdatabase-worker` - Generates a database backup at each configured interval.

- `cleanup-worker` - Performs a variety of cleanup tasks including purging old data, secrets, and keys.

- `e2e-default-workflow` - Runs the [End to End test](../../../cmd/e2e-runner/main.go).

- `e2e-enx-redirect-workflow` - Runs the End to End workflow using the `enx-redirect` service.

- `e2e-revise-workflow` - Runs the same End to End test against the revise endpoint.

- `modeler-worker` - Performs periodic statistical calculations.

- `realm-key-rotation-worker` - Rotates realm signing keys.

- `rotation-worker` - Rotates system signing keys (primarily for tokens).

- `stats-puller-worker` - Imports statistics from the key server.

Each job runs on a different interval. Check your Terraform configuration to see how frequently a specific job runs.

## Triage Steps

When one of the jobs does not report success within its configured interval, this alert fires. In most cases, this means the job has already failed 2+ times.

To begin triage, locate the logs for the corresponding service name using the Logs Explorer:

```text
resource.type="cloud_run_revision"
resource.labels.service_name="<service>"
```

For example, if the failing service was `appsync`:

```text
resource.type="cloud_run_revision"
resource.labels.service_name="appsync"
```

Check for errors in the logs.
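
The same search can be run from the command line with `gcloud` (a sketch; the `--freshness` and `--limit` values are illustrative, and you should substitute your own project ID):

```text
gcloud logging read '
  resource.type="cloud_run_revision"
  resource.labels.service_name="appsync"
  severity>=ERROR
' --project=<project-id> --freshness=1d --limit=50
```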
1 change: 1 addition & 0 deletions pkg/controller/appsync/handle_sync.go
@@ -63,6 +63,7 @@ func (c *Controller) HandleSync() http.Handler {
// If there are any errors, return them
if merr := c.syncApps(ctx, apps); merr != nil {
if errs := merr.WrappedErrors(); len(errs) > 0 {
logger.Errorw("failed to sync apps", "errors", errs)
c.h.RenderJSON(w, http.StatusInternalServerError, &AppSyncResult{
OK: false,
Errors: project.ErrorsToStrings(errs),
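The added `logger.Errorw` call writes at ERROR severity, so failed app syncs should surface directly in the Logs Explorer query described in the playbook above.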
4 changes: 2 additions & 2 deletions pkg/controller/rotation/metrics.go
@@ -59,14 +59,14 @@ func init() {
Aggregation: view.Count(),
},
{
Name: metricPrefix + "/token_success",
Member Author: Had to rename these so we can use the same MQL query for `.success` (they are new metrics I just created in the last PR, so no prod impact).

Name: metricPrefix + "/token/success",
Description: "Number of token rotation successes",
TagKeys: observability.CommonTagKeys(),
Measure: mTokenSuccess,
Aggregation: view.Count(),
},
{
Name: metricPrefix + "/verification_success",
Name: metricPrefix + "/verification/success",
Description: "Number of verification rotation successes",
TagKeys: observability.CommonTagKeys(),
Measure: mVerificationSuccess,
50 changes: 22 additions & 28 deletions terraform/alerting/alerts.tf
@@ -14,6 +14,8 @@

locals {
playbook_prefix = "https://github.com/google/exposure-notifications-verification-server/blob/main/docs/playbooks/alerts"
custom_prefix = "custom.googleapis.com/opencensus/en-verification-server"

p99_latency_thresholds = {
adminapi = "5s"
}
@@ -40,7 +42,7 @@ resource "google_monitoring_alert_policy" "E2ETestErrorRatioHigh" {
duration = "600s"
query = <<-EOT
fetch
generic_task :: custom.googleapis.com/opencensus/en-verification-server/e2e/request_count
generic_task :: ${local.custom_prefix}/e2e/request_count
| {
NOT_OK: filter metric.result == 'NOT_OK' | align
;
@@ -112,7 +114,7 @@ resource "google_monitoring_alert_policy" "rate_limited_count" {
duration = "300s"
query = <<-EOT
fetch
generic_task :: custom.googleapis.com/opencensus/en-verification-server/ratelimit/limitware/request_count
generic_task :: ${local.custom_prefix}/ratelimit/limitware/request_count
| filter metric.result = "RATE_LIMITED"
| align
| window 1m
@@ -183,46 +185,38 @@ resource "google_monitoring_alert_policy" "StackdriverExportFailed" {
]
}

resource "google_monitoring_alert_policy" "CloudSchedulerJobFailed" {
resource "google_monitoring_alert_policy" "ForwardProgressFailed" {
for_each = var.forward_progress_indicators

project = var.project
display_name = "CloudSchedulerJobFailed"
display_name = "ForwardProgressFailed"
combiner = "OR"

conditions {
display_name = "Cloud Scheduler Job Error Ratio"
display_name = each.key

condition_monitoring_query_language {
duration = "0s"
# NOTE: The query below will be evaluated every 30s. It will look at the latest point that
# represents the total count of log entries for the past 10m (align delta (10m)),
# and fork it to two streams, one representing only ERROR logs, one representing ALL logs,
# and do an outer join with default value 0 for the first stream.
# Then it computes the first stream / second stream getting the ratio of ERROR logs over ALL logs,
# and finally group by. The alert will fire when the error rate was 100% for the last 10 mins.
query = <<-EOT
fetch cloud_scheduler_job
| metric 'logging.googleapis.com/log_entry_count'
| align delta(10m)
| { t_0: filter metric.severity == 'ERROR'
; t_1: ident }
| outer_join [0]
| value
[t_0_value_log_entry_count_div:
div(t_0.value.log_entry_count, t_1.value.log_entry_count)]
| group_by [resource.job_id],
[t_0_value_log_entry_count_div_sum: sum(t_0_value_log_entry_count_div)]
| condition t_0_value_log_entry_count_div_sum >= 1
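# NOTE: The query below fetches the job's success metric, aligns it as a
# delta gauge over the indicator's configured window, and alerts via
# absent_for when no success has been recorded for that window.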
query = <<-EOT
fetch generic_task
| metric '${local.custom_prefix}/${each.value.metric}'
| align delta_gauge(${each.value.window})
| group_by [], [val: aggregate(value.success)]
| absent_for ${each.value.window}
EOT

trigger {
count = 1
}
}
}

documentation {
content = "${local.playbook_prefix}/CloudSchedulerJobFailed.md"
content = "${local.playbook_prefix}/ForwardProgressFailed.md"
mime_type = "text/markdown"
}

notification_channels = [for x in values(google_monitoring_notification_channel.non-paging) : x.id]
notification_channels = [for x in values(google_monitoring_notification_channel.paging) : x.id]

depends_on = [
null_resource.manual-step-to-enable-workspace,
@@ -239,7 +233,7 @@ resource "google_monitoring_alert_policy" "UpstreamUserRecreates" {
duration = "600s"
query = <<-EOT
fetch
generic_task :: custom.googleapis.com/opencensus/en-verification-server/user/upstream_user_recreate_count
generic_task :: ${local.custom_prefix}/user/upstream_user_recreate_count
| align rate(5m)
| every 1m
| group_by [], [val: sum(value.upstream_user_recreate_count)]
@@ -271,7 +265,7 @@ resource "google_monitoring_alert_policy" "AuthenticatedSMSFailure" {
duration = "60s"
query = <<-EOT
fetch
generic_task :: custom.googleapis.com/opencensus/en-verification-server/api/issue/authenticated_sms_failure_count
generic_task :: ${local.custom_prefix}/api/issue/authenticated_sms_failure_count
| align rate(5m)
| every 1m
| group_by [metric.realm], [val: sum(value.authenticated_sms_failure_count)]
27 changes: 27 additions & 0 deletions terraform/alerting/variables.tf
@@ -85,6 +85,33 @@ variable "alert_on_human_decrypted_value" {
description = "Alert when a human accesses a secret. You must enable DATA_READ audit logs for Cloud KMS."
}

variable "forward_progress_indicators" {
type = map(object({
metric = string
window = string
}))

default = {
// appsync runs every 4h, alert after 2 failures
"appsync" = { metric = "appsync/success", window = "485m" },

// cleanup runs every 1h, alert after 4 failures
"cleanup" = { metric = "cleanup/success", window = "245m" },

// modeler runs every 4h, alert after 2 failures
"modeler" = { metric = "modeler/success", window = "485m" },

// realm-key-rotation runs every 15m, alert after 2 failures
"realm-key-rotation" = { metric = "rotation/verification/success", window = "35m" }

// rotation runs every 30m, alert after 2 failures
"rotation" = { metric = "rotation/token/success", window = "65m" }

// stats-puller runs every 15m, alert after 2 failures
"stats-puller" = { metric = "statspuller/success", window = "35m" }
}
}

terraform {
required_version = ">= 0.14.2"

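With these defaults, the alert query for the `cleanup` indicator, for example, interpolates to:

```text
fetch generic_task
| metric 'custom.googleapis.com/opencensus/en-verification-server/cleanup/success'
| align delta_gauge(245m)
| group_by [], [val: aggregate(value.success)]
| absent_for 245m
```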
2 changes: 1 addition & 1 deletion terraform/service_appsync.tf
@@ -168,7 +168,7 @@ resource "google_cloud_run_service_iam_member" "appsync-invoker" {
resource "google_cloud_scheduler_job" "appsync-worker" {
name = "appsync-worker"
region = var.cloudscheduler_location
schedule = "0 */6 * * *"
schedule = "0 */4 * * *"
time_zone = "America/Los_Angeles"
attempt_deadline = "${google_cloud_run_service.appsync.template[0].spec[0].timeout_seconds + 60}s"

2 changes: 1 addition & 1 deletion terraform/service_modeler.tf
@@ -161,7 +161,7 @@ resource "google_cloud_run_service_iam_member" "modeler-invoker" {
resource "google_cloud_scheduler_job" "modeler-worker" {
name = "modeler-worker"
region = var.cloudscheduler_location
schedule = "0 */6 * * *"
schedule = "0 */4 * * *"
time_zone = "America/Los_Angeles"
attempt_deadline = "${google_cloud_run_service.modeler.template[0].spec[0].timeout_seconds + 60}s"

2 changes: 1 addition & 1 deletion terraform/service_stats_puller.tf
@@ -171,7 +171,7 @@ resource "google_cloud_run_service_iam_member" "stats-puller-invoker" {
resource "google_cloud_scheduler_job" "stats-puller-worker" {
name = "stats-puller-worker"
region = var.cloudscheduler_location
schedule = "10,20,30 * * * *"
schedule = "*/15 * * * *"
time_zone = "America/Los_Angeles"
attempt_deadline = "${google_cloud_run_service.stats-puller.template[0].spec[0].timeout_seconds + 60}s"

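These schedule changes align the job cadence with the new alert windows: `0 */4 * * *` runs appsync and modeler every 4 hours (two missed runs plus slack fit the `485m` windows), and `*/15 * * * *` runs the stats puller every 15 minutes (matching the `35m` window).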