Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(metrics): Add failure_reason label to ingress_controller_configuration_push_count metric #2965

Merged
merged 13 commits into from
Sep 27, 2022
Merged
12 changes: 12 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Table of Contents

- [2.7.0](#270)
- [2.6.0](#260)
- [2.5.0](#250)
- [2.4.2](#242)
Expand Down Expand Up @@ -51,6 +52,17 @@
- [0.0.5](#005)
- [0.0.4 and prior](#004-and-prior)

## [2.7.0]
rainest marked this conversation as resolved.
Show resolved Hide resolved

> Release date: TBD

### Added

- On configuration push to the data plane failure, `ingress_controller_configuration_push_count` Prometheus metric
is now reported with `success="false"` and `failure_reason="conflict|other"` labels, enabling distinguishing
configuration conflicts from other errors (e.g. transient network errors).
[#2965](https://github.com/Kong/kubernetes-ingress-controller/pull/2965)

## [2.6.0]

> Release date: 2022-09-14
Expand Down
7 changes: 3 additions & 4 deletions grafana.json
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,7 @@
"targets": [
{
"exemplar": true,
"expr": "ingress_controller_configuration_push_count{success=\"false\"}",
"expr": "sum by (failure_reason) (ingress_controller_configuration_push_count{success=\"false\"})",
"instant": false,
"interval": "",
"legendFormat": "",
Expand Down Expand Up @@ -375,6 +375,5 @@
"timezone": "",
"title": "Kong ingress controller",
"uid": "SaGwMOtnz",
"version": 1,
"weekStart": ""
}
"version": 2
}
22 changes: 18 additions & 4 deletions internal/dataplane/sendconfig/sendconfig.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"context"
"encoding/hex"
"encoding/json"
"errors"
"fmt"
"net/http"
"reflect"
Expand All @@ -16,6 +17,7 @@ import (
"github.com/kong/deck/file"
"github.com/kong/deck/state"
deckutils "github.com/kong/deck/utils"
"github.com/kong/go-kong/kong"
"github.com/prometheus/client_golang/prometheus"
"github.com/sirupsen/logrus"

Expand Down Expand Up @@ -85,8 +87,9 @@ func PerformUpdate(ctx context.Context,

if err != nil {
promMetrics.ConfigPushCount.With(prometheus.Labels{
metrics.SuccessKey: metrics.SuccessFalse,
metrics.ProtocolKey: metricsProtocol,
metrics.SuccessKey: metrics.SuccessFalse,
metrics.ProtocolKey: metricsProtocol,
metrics.FailureReasonKey: pushFailureReason(err),
}).Inc()
promMetrics.ConfigPushDuration.With(prometheus.Labels{
metrics.SuccessKey: metrics.SuccessFalse,
Expand All @@ -96,8 +99,9 @@ func PerformUpdate(ctx context.Context,
}

promMetrics.ConfigPushCount.With(prometheus.Labels{
metrics.SuccessKey: metrics.SuccessTrue,
metrics.ProtocolKey: metricsProtocol,
metrics.SuccessKey: metrics.SuccessTrue,
metrics.ProtocolKey: metricsProtocol,
metrics.FailureReasonKey: "",
}).Inc()
promMetrics.ConfigPushDuration.With(prometheus.Labels{
metrics.SuccessKey: metrics.SuccessTrue,
Expand Down Expand Up @@ -273,3 +277,13 @@ func hasSHAUpdateAlreadyBeenReported(latestUpdateSHA []byte) bool {
latestReportedSHA = latestUpdateSHA
return false
}

// pushFailureReason extracts config push failure reason from an error returned from onUpdateInMemoryMode or onUpdateDBMode.
func pushFailureReason(err error) string {
var apiErr *kong.APIError
if errors.As(err, &apiErr) && apiErr.Code() == http.StatusConflict {
return metrics.FailureReasonConflict
}

return metrics.FailureReasonOther
}
41 changes: 41 additions & 0 deletions internal/dataplane/sendconfig/sendconfig_test.go
Original file line number Diff line number Diff line change
@@ -1,13 +1,19 @@
package sendconfig

import (
"errors"
"fmt"
"net/http"
"reflect"
"testing"

"github.com/kong/deck/file"
"github.com/kong/go-kong/kong"
"github.com/sirupsen/logrus"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"github.com/kong/kubernetes-ingress-controller/v2/internal/metrics"
)

func Test_renderConfigWithCustomEntities(t *testing.T) {
Expand Down Expand Up @@ -128,3 +134,38 @@ func Test_updateReportingUtilities(t *testing.T) {
assert.True(t, hasSHAUpdateAlreadyBeenReported([]byte("yet-another-fake-sha")))
assert.True(t, hasSHAUpdateAlreadyBeenReported([]byte("yet-another-fake-sha")))
}

func Test_pushFailureReason(t *testing.T) {
apiConflictErr := kong.NewAPIError(http.StatusConflict, "conflict in configuration")

testCases := []struct {
name string
err error
expectedReason string
}{
{
name: "generic_error",
err: errors.New("some generic error"),
expectedReason: metrics.FailureReasonOther,
},
{
name: "api_conflict_error",
err: apiConflictErr,
expectedReason: metrics.FailureReasonConflict,
},
{
name: "api_conflict_error_wrapped",
err: fmt.Errorf("wrapped conflict api err: %w", apiConflictErr),
expectedReason: metrics.FailureReasonConflict,
},
}

for _, testCase := range testCases {
t.Run(testCase.name, func(t *testing.T) {
t.Parallel()

reason := pushFailureReason(testCase.err)
require.Equal(t, testCase.expectedReason, reason)
})
}
czeslavo marked this conversation as resolved.
Show resolved Hide resolved
}
20 changes: 17 additions & 3 deletions internal/metrics/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ type CtrlFuncMetrics struct {
const (
// SuccessTrue indicates that the operation was successful.
SuccessTrue string = "true"
// SuccessTrue indicates that the operation was not successful.
// SuccessFalse indicates that the operation was not successful.
SuccessFalse string = "false"

// SuccessKey defines the key of the metric label indicating success/failure of an operation.
Expand All @@ -36,6 +36,17 @@ const (
ProtocolKey string = "protocol"
)

const (
// FailureReasonConflict indicates that the config push failed due to configuration conflicts.
FailureReasonConflict string = "conflict"

// FailureReasonOther indicates that the config push failed due to other reasons.
FailureReasonOther string = "other"

// FailureReasonKey defines the key of the metric label indicating failure reason.
FailureReasonKey string = "failure_reason"
)

const (
MetricNameConfigPushCount = "ingress_controller_configuration_push_count"
MetricNameTranslationCount = "ingress_controller_translation_count"
Expand All @@ -52,9 +63,12 @@ func NewCtrlFuncMetrics() *CtrlFuncMetrics {
ProtocolKey + "` describes the configuration protocol (" + ProtocolDBLess + " or " +
ProtocolDeck + ") in use. `" +
SuccessKey + "` describes whether there were unrecoverable errors (`" +
SuccessFalse + "`) or not (`" + SuccessTrue + "`).",
SuccessFalse + "`) or not (`" + SuccessTrue + "`). `" +
FailureReasonKey + "` is populated in case of `" + SuccessKey +
"=\"" + SuccessFalse + "\"` " + "and describes the reason of failure (one of `" +
FailureReasonConflict + "`, `" + FailureReasonOther + "`).",
},
rainest marked this conversation as resolved.
Show resolved Hide resolved
[]string{SuccessKey, ProtocolKey},
[]string{SuccessKey, ProtocolKey, FailureReasonKey},
)

controllerMetrics.TranslationCount = prometheus.NewCounterVec(
Expand Down