Skip to content

Commit

Permalink
add status code label to the numTotalFailedNotifications metric (#3094)
Browse files Browse the repository at this point in the history
* add reason label to the numTotalFailedNotifications metric

Signed-off-by: Yijie Qin <qinyijie@amazon.com>
  • Loading branch information
qinxx108 authored Feb 3, 2023
1 parent 8f559aa commit 7923bc5
Show file tree
Hide file tree
Showing 4 changed files with 121 additions and 4 deletions.
14 changes: 11 additions & 3 deletions notify/notify.go
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,7 @@ func NewMetrics(r prometheus.Registerer) *Metrics {
Namespace: "alertmanager",
Name: "notifications_failed_total",
Help: "The total number of failed notifications.",
}, []string{"integration"}),
}, []string{"integration", "reason"}),
numNotificationRequestsTotal: prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: "alertmanager",
Name: "notification_requests_total",
Expand Down Expand Up @@ -293,10 +293,13 @@ func NewMetrics(r prometheus.Registerer) *Metrics {
"telegram",
} {
m.numNotifications.WithLabelValues(integration)
m.numTotalFailedNotifications.WithLabelValues(integration)
m.numNotificationRequestsTotal.WithLabelValues(integration)
m.numNotificationRequestsFailedTotal.WithLabelValues(integration)
m.notificationLatencySeconds.WithLabelValues(integration)

for _, reason := range possibleFailureReasonCategory {
m.numTotalFailedNotifications.WithLabelValues(integration, reason)
}
}
r.MustRegister(
m.numNotifications, m.numTotalFailedNotifications,
Expand Down Expand Up @@ -662,8 +665,13 @@ func NewRetryStage(i Integration, groupName string, metrics *Metrics) *RetryStag
func (r RetryStage) Exec(ctx context.Context, l log.Logger, alerts ...*types.Alert) (context.Context, []*types.Alert, error) {
r.metrics.numNotifications.WithLabelValues(r.integration.Name()).Inc()
ctx, alerts, err := r.exec(ctx, l, alerts...)

failureReason := DefaultReason.String()
if err != nil {
r.metrics.numTotalFailedNotifications.WithLabelValues(r.integration.Name()).Inc()
if e, ok := errors.Cause(err).(*ErrorWithReason); ok {
failureReason = e.Reason.String()
}
r.metrics.numTotalFailedNotifications.WithLabelValues(r.integration.Name(), failureReason).Inc()
}
return ctx, alerts, err
}
Expand Down
52 changes: 52 additions & 0 deletions notify/notify_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (

"github.com/go-kit/log"
"github.com/prometheus/client_golang/prometheus"
prom_testutil "github.com/prometheus/client_golang/prometheus/testutil"
"github.com/prometheus/common/model"
"github.com/stretchr/testify/require"
"gopkg.in/yaml.v2"
Expand Down Expand Up @@ -422,6 +423,57 @@ func TestRetryStageWithError(t *testing.T) {
require.NotNil(t, resctx)
}

func TestRetryStageWithErrorCode(t *testing.T) {
testcases := map[string]struct {
isNewErrorWithReason bool
reason Reason
reasonlabel string
expectedCount int
}{
"for clientError": {isNewErrorWithReason: true, reason: ClientErrorReason, reasonlabel: ClientErrorReason.String(), expectedCount: 1},
"for serverError": {isNewErrorWithReason: true, reason: ServerErrorReason, reasonlabel: ServerErrorReason.String(), expectedCount: 1},
"for unexpected code": {isNewErrorWithReason: false, reason: DefaultReason, reasonlabel: DefaultReason.String(), expectedCount: 1},
}
for _, testData := range testcases {
retry := false
testData := testData
i := Integration{
name: "test",
notifier: notifierFunc(func(ctx context.Context, alerts ...*types.Alert) (bool, error) {
if !testData.isNewErrorWithReason {
return retry, errors.New("fail to deliver notification")
}
return retry, NewErrorWithReason(testData.reason, errors.New("fail to deliver notification"))
}),
rs: sendResolved(false),
}
r := RetryStage{
integration: i,
metrics: NewMetrics(prometheus.NewRegistry()),
}

alerts := []*types.Alert{
{
Alert: model.Alert{
EndsAt: time.Now().Add(time.Hour),
},
},
}

ctx := context.Background()
ctx = WithFiringAlerts(ctx, []uint64{0})

// Notify with a non-recoverable error.
resctx, _, err := r.Exec(ctx, log.NewNopLogger(), alerts...)
counter := r.metrics.numTotalFailedNotifications

require.Equal(t, testData.expectedCount, int(prom_testutil.ToFloat64(counter.WithLabelValues(r.integration.Name(), testData.reasonlabel))))

require.NotNil(t, err)
require.NotNil(t, resctx)
}
}

func TestRetryStageNoResolved(t *testing.T) {
sent := []*types.Alert{}
i := Integration{
Expand Down
5 changes: 4 additions & 1 deletion notify/sns/sns.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,10 @@ func (n *Notifier) Notify(ctx context.Context, alert ...*types.Alert) (bool, err
publishOutput, err := client.Publish(publishInput)
if err != nil {
if e, ok := err.(awserr.RequestFailure); ok {
return n.retrier.Check(e.StatusCode(), strings.NewReader(e.Message()))
retryable, error := n.retrier.Check(e.StatusCode(), strings.NewReader(e.Message()))

reasonErr := notify.NewErrorWithReason(notify.GetFailureReasonFromStatusCode(e.StatusCode()), error)
return retryable, reasonErr
}
return true, err
}
Expand Down
54 changes: 54 additions & 0 deletions notify/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -245,3 +245,57 @@ func (r *Retrier) Check(statusCode int, body io.Reader) (bool, error) {
}
return retry, errors.New(s)
}

type ErrorWithReason struct {
Err error

Reason Reason
}

func NewErrorWithReason(reason Reason, err error) *ErrorWithReason {
return &ErrorWithReason{
Err: err,
Reason: reason,
}
}

func (e *ErrorWithReason) Error() string {
return e.Err.Error()
}

// Reason is the failure reason.
type Reason int

const (
DefaultReason Reason = iota
ClientErrorReason
ServerErrorReason
)

func (s Reason) String() string {
switch s {
case DefaultReason:
return "other"
case ClientErrorReason:
return "clientError"
case ServerErrorReason:
return "serverError"
default:
panic(fmt.Sprintf("unknown Reason: %d", s))
}
}

// possibleFailureReasonCategory is a list of possible failure reason.
var possibleFailureReasonCategory = []string{DefaultReason.String(), ClientErrorReason.String(), ServerErrorReason.String()}

// GetFailureReasonFromStatusCode returns the reason for the failure based on the status code provided.
func GetFailureReasonFromStatusCode(statusCode int) Reason {
if statusCode/100 == 4 {
return ClientErrorReason
}
if statusCode/100 == 5 {
return ServerErrorReason
}

return DefaultReason
}

0 comments on commit 7923bc5

Please sign in to comment.