Skip to content
This repository has been archived by the owner on Jul 12, 2023. It is now read-only.

Commit

Permalink
Add monitoring and alerting for realm capacity (#645)
Browse files Browse the repository at this point in the history
* Add monitoring and alerting for realm capacity

* Utilize the remaining token count returned by Take

* Avoid db calls by using the remaining token count returned by Take
* Not recording capacity is not fatal
* Reverse the capacity logic to alert above 90% utilization

* Fix wording for alert display name

* record metrics for remaining and issued tokens
  • Loading branch information
femnad authored Sep 23, 2020
1 parent 9619745 commit 4a8b236
Show file tree
Hide file tree
Showing 4 changed files with 132 additions and 15 deletions.
19 changes: 18 additions & 1 deletion pkg/controller/issueapi/issue.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
package issueapi

import (
"context"
"errors"
"fmt"
"net/http"
Expand Down Expand Up @@ -202,7 +203,7 @@ func (c *Controller) HandleIssue() http.Handler {
return
}
key := fmt.Sprintf("realm:quota:%s", dig)
limit, _, reset, ok, err := c.limiter.Take(ctx, key)
limit, remaining, reset, ok, err := c.limiter.Take(ctx, key)
if err != nil {
logger.Errorw("failed to take from limiter", "error", err)
stats.Record(ctx, c.metrics.QuotaErrors.M(1))
Expand Down Expand Up @@ -254,6 +255,8 @@ func (c *Controller) HandleIssue() http.Handler {
return
}

c.recordCapacity(ctx, realm, remaining)

if request.Phone != "" && smsProvider != nil {
message := realm.BuildSMSText(code, longCode, c.config.GetENXRedirectDomain())
if err := smsProvider.SendSMS(ctx, request.Phone, message); err != nil {
Expand Down Expand Up @@ -296,3 +299,17 @@ func (c *Controller) getAuthorizationFromContext(r *http.Request) (*database.Aut

return authorizedApp, user, nil
}

// recordCapacity records the realm's token metrics after a successful Take
// from the limiter: the number of tokens remaining, the number of tokens
// issued, and the capacity utilization (issued / limit).
//
// ctx is the context the measurements are recorded against. realm supplies the
// abuse prevention settings; nothing is recorded when abuse prevention is
// disabled for the realm. remaining is the remaining token count reported by
// the limiter.
//
// Recording is best-effort: the function never fails the request.
func (c *Controller) recordCapacity(ctx context.Context, realm *database.Realm, remaining uint64) {
	if !realm.AbusePreventionEnabled {
		return
	}
	stats.Record(ctx, c.metrics.RealmTokenRemaining.M(int64(remaining)))

	limit := uint64(realm.AbusePreventionEffectiveLimit())

	// The limiter's view of the quota and the realm's effective limit are
	// obtained independently. If remaining exceeds the limit, an unsigned
	// subtraction would wrap around to a huge value and report a bogus
	// utilization, so treat that case as "nothing issued".
	var issued uint64
	if remaining < limit {
		issued = limit - remaining
	}
	stats.Record(ctx, c.metrics.RealmTokenIssued.M(int64(issued)))

	// A zero limit would make the ratio NaN or +Inf; skip the capacity
	// measurement rather than record a meaningless value.
	if limit == 0 {
		return
	}
	capacity := float64(issued) / float64(limit)
	stats.Record(ctx, c.metrics.RealmTokenCapacity.M(capacity))
}
67 changes: 53 additions & 14 deletions pkg/controller/issueapi/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,16 @@ var (
)

// Metrics holds the OpenCensus measures recorded by the issue API. Each field
// is declared exactly once; duplicate field names are a compile error in Go.
type Metrics struct {
	IssueAttempts   *stats.Int64Measure
	QuotaErrors     *stats.Int64Measure
	QuotaExceeded   *stats.Int64Measure
	CodesIssued     *stats.Int64Measure
	CodeIssueErrors *stats.Int64Measure
	SMSSent         *stats.Int64Measure
	SMSSendErrors   *stats.Int64Measure

	// RealmTokenIssued is the number of verification codes issued against
	// the realm's limit.
	RealmTokenIssued *stats.Int64Measure
	// RealmTokenRemaining is the remaining number of verification codes the
	// realm may issue.
	RealmTokenRemaining *stats.Int64Measure
	// RealmTokenCapacity is the capacity utilization for issuing
	// verification codes, recorded as issued / limit.
	RealmTokenCapacity *stats.Float64Measure
}

func registerMetrics() (*Metrics, error) {
Expand Down Expand Up @@ -116,13 +119,49 @@ func registerMetrics() (*Metrics, error) {
return nil, fmt.Errorf("stat view registration failure: %w", err)
}

mRealmTokenRemaining := stats.Int64(MetricPrefix+"/realm_token_remaining", "Remaining number of verification codes", stats.UnitDimensionless)
if err := view.Register(&view.View{
Name: MetricPrefix + "/realm_token_remaining_latest",
Description: "Latest realm remaining tokens",
TagKeys: []tag.Key{observability.RealmTagKey},
Measure: mRealmTokenRemaining,
Aggregation: view.LastValue(),
}); err != nil {
return nil, fmt.Errorf("stat view registration failure: %w", err)
}

mRealmTokenIssued := stats.Int64(MetricPrefix+"/realm_token_issued", "Total issued verification codes", stats.UnitDimensionless)
if err := view.Register(&view.View{
Name: MetricPrefix + "/realm_token_issued_latest",
Description: "Latest realm issued tokens",
TagKeys: []tag.Key{observability.RealmTagKey},
Measure: mRealmTokenIssued,
Aggregation: view.LastValue(),
}); err != nil {
return nil, fmt.Errorf("stat view registration failure: %w", err)
}

mRealmTokenCapacity := stats.Float64(MetricPrefix+"/realm_token_capacity", "Capacity utilization for issuing verification codes", stats.UnitDimensionless)
if err := view.Register(&view.View{
Name: MetricPrefix + "/realm_token_capacity_latest",
Description: "Latest realm token capacity utilization",
TagKeys: []tag.Key{observability.RealmTagKey},
Measure: mRealmTokenCapacity,
Aggregation: view.LastValue(),
}); err != nil {
return nil, fmt.Errorf("stat view registration failure: %w", err)
}

return &Metrics{
IssueAttempts: mIssueAttempts,
QuotaErrors: mQuotaErrors,
QuotaExceeded: mQuotaExceeded,
CodesIssued: mCodesIssued,
CodeIssueErrors: mCodesIssued,
SMSSent: mSMSSent,
SMSSendErrors: mSMSSendErrors,
IssueAttempts: mIssueAttempts,
QuotaErrors: mQuotaErrors,
QuotaExceeded: mQuotaExceeded,
CodesIssued: mCodesIssued,
CodeIssueErrors: mCodesIssued,
SMSSent: mSMSSent,
SMSSendErrors: mSMSSendErrors,
RealmTokenIssued: mRealmTokenIssued,
RealmTokenRemaining: mRealmTokenRemaining,
RealmTokenCapacity: mRealmTokenCapacity,
}, nil
}
13 changes: 13 additions & 0 deletions terraform/alerting/dashboards/verification-server.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,16 @@ gridLayout:
| align rate()
| every 1m
| [resource.job]
# Line chart of the latest realm token capacity utilization, max-aligned every minute and grouped by the realm metric label.
- title: /api/issue/realm_token_capacity_latest by label.realm [MAX]
xyChart:
chartOptions:
mode: COLOR
dataSets:
- plotType: LINE
timeSeriesQuery:
timeSeriesQueryLanguage: >
generic_task ::
custom.googleapis.com/opencensus/en-verification-server/api/issue/realm_token_capacity_latest
| align max()
| every 1m
| [metric.realm]
48 changes: 48 additions & 0 deletions terraform/alerting/monitoring.tf
Original file line number Diff line number Diff line change
Expand Up @@ -168,3 +168,51 @@ EOT
null_resource.manual-step-to-enable-workspace
]
}

# Alerts when a realm's verification code issuing capacity utilization
# (the realm_token_capacity_latest metric) stays above 90% for five minutes.
resource "google_monitoring_alert_policy" "realm_token_capacity" {
  project      = var.project
  display_name = "RealmTokenCapacityUtilizationAboveThreshold"
  combiner     = "OR"
  conditions {
    # Named after the metric being watched.
    display_name = "/realm_token_capacity_latest"
    condition_threshold {
      duration        = "300s"
      threshold_value = 0.9
      comparison      = "COMPARISON_GT"
      filter          = "metric.type=\"custom.googleapis.com/opencensus/en-verification-server/api/issue/realm_token_capacity_latest\" resource.type=\"generic_task\""

      aggregations {
        alignment_period = "60s"
        # The realm is a metric label on this custom metric (the dashboard
        # groups by metric.realm), not a label of the generic_task resource.
        group_by_fields = [
          "metric.label.realm",
        ]
        per_series_aligner = "ALIGN_MAX"
        # group_by_fields only takes effect together with a cross-series
        # reducer; take the highest utilization reported for each realm.
        cross_series_reducer = "REDUCE_MAX"
      }

      trigger {
        count = 1
      }
    }
  }

  documentation {
    content   = <<-EOT
      ## $${policy.display_name}
      [$${metric.label.realm}](https://$${metric.label.realm}) realm
      daily verification code issuing capacity utilized above 90%.
      View the metric here
      https://console.cloud.google.com/monitoring/dashboards/custom/${basename(google_monitoring_dashboard.verification-server.id)}?project=${var.project}
    EOT
    mime_type = "text/markdown"
  }

  notification_channels = [
    google_monitoring_notification_channel.email.id
  ]
  depends_on = [
    null_resource.manual-step-to-enable-workspace
  ]
}

0 comments on commit 4a8b236

Please sign in to comment.