Skip to content

Commit

Permalink
fix: track suppressed notifications metric for inhibit/silence
Browse files Browse the repository at this point in the history
Based on PR feedback:

https://github.com/prometheus/alertmanager/pull/3565/files#r1393068026

Signed-off-by: TJ Hoplock <t.hoplock@gmail.com>
  • Loading branch information
tjhop committed Nov 28, 2023
1 parent 2562373 commit 3169b8d
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 13 deletions.
34 changes: 24 additions & 10 deletions notify/notify.go
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ type Metrics struct {
numTotalFailedNotifications *prometheus.CounterVec
numNotificationRequestsTotal *prometheus.CounterVec
numNotificationRequestsFailedTotal *prometheus.CounterVec
numNotificationSuppressedTotal prometheus.Counter
numNotificationSuppressedTotal *prometheus.CounterVec
notificationLatencySeconds *prometheus.HistogramVec

ff featurecontrol.Flagger
Expand Down Expand Up @@ -285,11 +285,11 @@ func NewMetrics(r prometheus.Registerer, ff featurecontrol.Flagger) *Metrics {
Name: "notification_requests_failed_total",
Help: "The total number of failed notification requests.",
}, labels),
numNotificationSuppressedTotal: prometheus.NewCounter(prometheus.CounterOpts{
numNotificationSuppressedTotal: prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: "alertmanager",
Name: "notification_suppressed_total",
Help: "The total number of notifications suppressed for being outside of active time intervals or within muted time intervals.",
}),
}, []string{"reason"}),
notificationLatencySeconds: prometheus.NewHistogramVec(prometheus.HistogramOpts{
Namespace: "alertmanager",
Name: "notification_latency_seconds",
Expand Down Expand Up @@ -387,10 +387,10 @@ func (pb *PipelineBuilder) New(
rs := make(RoutingStage, len(receivers))

ms := NewGossipSettleStage(peer)
is := NewMuteStage(inhibitor)
is := NewMuteStage(inhibitor, pb.metrics)
tas := NewTimeActiveStage(intervener, pb.metrics)
tms := NewTimeMuteStage(intervener, pb.metrics)
ss := NewMuteStage(silencer)
ss := NewMuteStage(silencer, pb.metrics)

for name := range receivers {
st := createReceiverStage(name, receivers[name], wait, notificationLog, pb.metrics)
Expand Down Expand Up @@ -515,12 +515,13 @@ func (n *GossipSettleStage) Exec(ctx context.Context, _ log.Logger, alerts ...*t

// MuteStage filters alerts through a Muter.
type MuteStage struct {
muter types.Muter
muter types.Muter
metrics *Metrics
}

// NewMuteStage returns a new MuteStage.
func NewMuteStage(m types.Muter) *MuteStage {
return &MuteStage{muter: m}
func NewMuteStage(m types.Muter, metrics *Metrics) *MuteStage {
return &MuteStage{muter: m, metrics: metrics}
}

// Exec implements the Stage interface.
Expand All @@ -534,6 +535,19 @@ func (n *MuteStage) Exec(ctx context.Context, _ log.Logger, alerts ...*types.Ale
}
// TODO(fabxc): increment muted alerts counter if muted.
}

if len(filtered) > 0 {
var reason string
switch n.muter.(type) {
case *silence.Silencer:
reason = "silence"
case *inhibit.Inhibitor:
reason = "inhibition"
default:
}
n.metrics.numNotificationSuppressedTotal.WithLabelValues(reason).Add(float64(len(filtered)))
}

return ctx, filtered, nil
}

Expand Down Expand Up @@ -909,7 +923,7 @@ func (tms TimeMuteStage) Exec(ctx context.Context, l log.Logger, alerts ...*type

// If the current time is inside a mute time, all alerts are removed from the pipeline.
if muted {
tms.metrics.numNotificationSuppressedTotal.Add(float64(len(alerts)))
tms.metrics.numNotificationSuppressedTotal.WithLabelValues("mute_time_interval").Add(float64(len(alerts)))
level.Debug(l).Log("msg", "Notifications not sent, route is within mute time", "alerts", len(alerts))
return ctx, nil, nil
}
Expand Down Expand Up @@ -947,7 +961,7 @@ func (tas TimeActiveStage) Exec(ctx context.Context, l log.Logger, alerts ...*ty

// If the current time is not inside an active time, all alerts are removed from the pipeline
if !muted {
tas.metrics.numNotificationSuppressedTotal.Add(float64(len(alerts)))
tas.metrics.numNotificationSuppressedTotal.WithLabelValues("active_time_interval").Add(float64(len(alerts)))
level.Debug(l).Log("msg", "Notifications not sent, route is not within active time", "alerts", len(alerts))
return ctx, nil, nil
}
Expand Down
27 changes: 24 additions & 3 deletions notify/notify_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -633,7 +633,8 @@ func TestMuteStage(t *testing.T) {
return ok
})

stage := NewMuteStage(muter)
metrics := NewMetrics(prometheus.NewRegistry(), featurecontrol.NoopFlags{})
stage := NewMuteStage(muter, metrics)

in := []model.LabelSet{
{},
Expand Down Expand Up @@ -672,6 +673,10 @@ func TestMuteStage(t *testing.T) {
if !reflect.DeepEqual(got, out) {
t.Fatalf("Muting failed, expected: %v\ngot %v", out, got)
}
suppressed := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal))
if len(got) != suppressed {
t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", len(got), suppressed)
}
}

func TestMuteStageWithSilences(t *testing.T) {
Expand All @@ -687,9 +692,11 @@ func TestMuteStageWithSilences(t *testing.T) {
t.Fatal(err)
}

marker := types.NewMarker(prometheus.NewRegistry())
reg := prometheus.NewRegistry()
marker := types.NewMarker(reg)
silencer := silence.NewSilencer(silences, marker, log.NewNopLogger())
stage := NewMuteStage(silencer)
metrics := NewMetrics(reg, featurecontrol.NoopFlags{})
stage := NewMuteStage(silencer, metrics)

in := []model.LabelSet{
{},
Expand Down Expand Up @@ -732,8 +739,13 @@ func TestMuteStageWithSilences(t *testing.T) {
if !reflect.DeepEqual(got, out) {
t.Fatalf("Muting failed, expected: %v\ngot %v", out, got)
}
suppressed := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal))
if len(got) != suppressed {
t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", len(got), suppressed)
}

// Do it again to exercise the version tracking of silences.
metrics.numNotificationSuppressedTotal.Reset()
_, alerts, err = stage.Exec(context.Background(), log.NewNopLogger(), inAlerts...)
if err != nil {
t.Fatalf("Exec failed: %s", err)
Expand All @@ -747,12 +759,17 @@ func TestMuteStageWithSilences(t *testing.T) {
if !reflect.DeepEqual(got, out) {
t.Fatalf("Muting failed, expected: %v\ngot %v", out, got)
}
suppressed = int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal))
if len(got) != suppressed {
t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", len(got), suppressed)
}

// Expire the silence and verify that no alerts are silenced now.
if err := silences.Expire(silID); err != nil {
t.Fatal(err)
}

metrics.numNotificationSuppressedTotal.Reset()
_, alerts, err = stage.Exec(context.Background(), log.NewNopLogger(), inAlerts...)
if err != nil {
t.Fatalf("Exec failed: %s", err)
Expand All @@ -765,6 +782,10 @@ func TestMuteStageWithSilences(t *testing.T) {
if !reflect.DeepEqual(got, in) {
t.Fatalf("Unmuting failed, expected: %v\ngot %v", in, got)
}
suppressed = int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal))
if len(got) != suppressed {
t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", len(got), suppressed)
}
}

func TestTimeMuteStage(t *testing.T) {
Expand Down

0 comments on commit 3169b8d

Please sign in to comment.