From 834a7269809030ece75f5c112d3266bd54d64a83 Mon Sep 17 00:00:00 2001 From: Yuri Nikolic Date: Tue, 6 Feb 2024 19:36:03 +0100 Subject: [PATCH 1/2] Open circuit breakers on timeouts and per-instance limit errors only Signed-off-by: Yuri Nikolic --- cmd/mimir/config-descriptor.json | 2 +- cmd/mimir/help-all.txt.tmpl | 2 +- .../configuration-parameters/index.md | 2 +- pkg/ingester/client/circuitbreaker.go | 24 +++++++++++--- pkg/ingester/client/circuitbreaker_test.go | 32 ++++++++++++++++--- pkg/ingester/client/client.go | 2 +- 6 files changed, 51 insertions(+), 13 deletions(-) diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index 319896b36a3..4b44c4ce226 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -2267,7 +2267,7 @@ "required": false, "desc": "How long the circuit breaker will stay in the open state before allowing some requests", "fieldValue": null, - "fieldDefaultValue": 60000000000, + "fieldDefaultValue": 10000000000, "fieldFlag": "ingester.client.circuit-breaker.cooldown-period", "fieldType": "duration", "fieldCategory": "experimental" diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index b3048f026a4..af1689e2425 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -1274,7 +1274,7 @@ Usage of ./cmd/mimir/mimir: -ingester.client.backoff-retries int Number of times to backoff and retry before failing. (default 10) -ingester.client.circuit-breaker.cooldown-period duration - [experimental] How long the circuit breaker will stay in the open state before allowing some requests (default 1m0s) + [experimental] How long the circuit breaker will stay in the open state before allowing some requests (default 10s) -ingester.client.circuit-breaker.enabled [experimental] Enable circuit breaking when making requests to ingesters -ingester.client.circuit-breaker.failure-execution-threshold uint diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index d362e365c1b..12df1ab40c4 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -2356,7 +2356,7 @@ circuit_breaker: # (experimental) How long the circuit breaker will stay in the open state # before allowing some requests # CLI flag: -ingester.client.circuit-breaker.cooldown-period - [cooldown_period: | default = 1m] + [cooldown_period: | default = 10s] # (deprecated) If set to true, gRPC status codes will be reported in # "status_code" label of "cortex_ingester_client_request_duration_seconds" diff --git a/pkg/ingester/client/circuitbreaker.go b/pkg/ingester/client/circuitbreaker.go index cf8e01fbd49..9a06d413f72 100644 --- a/pkg/ingester/client/circuitbreaker.go +++ b/pkg/ingester/client/circuitbreaker.go @@ -10,10 +10,12 @@ import ( "github.com/failsafe-go/failsafe-go/circuitbreaker" "github.com/go-kit/log" "github.com/go-kit/log/level" + "github.com/grafana/dskit/grpcutil" "github.com/grafana/dskit/ring" "google.golang.org/grpc" "google.golang.org/grpc/codes" - "google.golang.org/grpc/status" + + "github.com/grafana/mimir/pkg/mimirpb" ) const ( @@ -97,9 +99,21 @@ func isFailure(err error) bool { return false } - // We only consider timeouts or the ingester being unavailable (returned when hitting - // per-instance limits) to be errors worthy of tripping the circuit breaker since these + // We only consider timeouts or ingester hitting a per-instance limit + // to be errors worthy of tripping the circuit breaker since these // are specific to a particular ingester, not a user or request. - code := status.Code(err) - return code == codes.Unavailable || code == codes.DeadlineExceeded + if stat, ok := grpcutil.ErrorToStatus(err); ok { + if stat.Code() == codes.DeadlineExceeded { + return true + } + + details := stat.Details() + if len(details) != 1 { + return false + } + if errDetails, ok := details[0].(*mimirpb.ErrorDetails); ok { + return errDetails.GetCause() == mimirpb.INSTANCE_LIMIT + } + } + return false } diff --git a/pkg/ingester/client/circuitbreaker_test.go b/pkg/ingester/client/circuitbreaker_test.go index b314b25129f..1ea9e281cea 100644 --- a/pkg/ingester/client/circuitbreaker_test.go +++ b/pkg/ingester/client/circuitbreaker_test.go @@ -10,6 +10,7 @@ import ( "time" "github.com/failsafe-go/failsafe-go/circuitbreaker" + "github.com/gogo/status" "github.com/grafana/dskit/grpcutil" "github.com/grafana/dskit/ring" "github.com/prometheus/client_golang/prometheus" @@ -17,7 +18,8 @@ import ( "github.com/stretchr/testify/require" "google.golang.org/grpc" "google.golang.org/grpc/codes" - "google.golang.org/grpc/status" + + "github.com/grafana/mimir/pkg/mimirpb" "github.com/grafana/mimir/pkg/util/test" ) @@ -44,11 +46,33 @@ func TestIsFailure(t *testing.T) { require.True(t, isFailure(fmt.Errorf("%w", err))) }) - t.Run("gRPC unavailable", func(t *testing.T) { - err := status.Error(codes.Unavailable, "broken!") + t.Run("gRPC unavailable with INSTANCE_LIMIT details", func(t *testing.T) { + err := perInstanceLimitError(t) require.True(t, isFailure(err)) require.True(t, isFailure(fmt.Errorf("%w", err))) }) + + t.Run("gRPC unavailable with SERVICE_UNAVAILABLE details is not a failure", func(t *testing.T) { + stat := status.New(codes.Unavailable, "broken!") + stat, err := stat.WithDetails(&mimirpb.ErrorDetails{Cause: mimirpb.SERVICE_UNAVAILABLE}) + require.NoError(t, err) + err = stat.Err() + require.False(t, isFailure(err)) + require.False(t, isFailure(fmt.Errorf("%w", err))) + }) + + t.Run("gRPC unavailable without details is not a failure", func(t *testing.T) { + err := status.Error(codes.Unavailable, "broken!") + require.False(t, isFailure(err)) + require.False(t, isFailure(fmt.Errorf("%w", err))) + }) +} + +func perInstanceLimitError(t *testing.T) error { + stat := status.New(codes.Unavailable, "broken!") + stat, err := stat.WithDetails(&mimirpb.ErrorDetails{Cause: mimirpb.INSTANCE_LIMIT}) + require.NoError(t, err) + return stat.Err() } func TestNewCircuitBreaker(t *testing.T) { @@ -59,7 +83,7 @@ func TestNewCircuitBreaker(t *testing.T) { // gRPC invoker that returns an error that will be treated as an error by the circuit breaker failure := func(currentCtx context.Context, currentMethod string, currentReq, currentRepl interface{}, currentConn *grpc.ClientConn, currentOpts ...grpc.CallOption) error { - return status.Error(codes.Unavailable, "failed") + return perInstanceLimitError(t) } conn := grpc.ClientConn{} diff --git a/pkg/ingester/client/client.go b/pkg/ingester/client/client.go index 979d7b948d5..03eb92ce69a 100644 --- a/pkg/ingester/client/client.go +++ b/pkg/ingester/client/client.go @@ -116,7 +116,7 @@ func (cfg *CircuitBreakerConfig) RegisterFlagsWithPrefix(prefix string, f *flag. f.UintVar(&cfg.FailureThreshold, prefix+".circuit-breaker.failure-threshold", 10, "Max percentage of requests that can fail over period before the circuit breaker opens") f.UintVar(&cfg.FailureExecutionThreshold, prefix+".circuit-breaker.failure-execution-threshold", 100, "How many requests must have been executed in period for the circuit breaker to be eligible to open for the rate of failures") f.DurationVar(&cfg.ThresholdingPeriod, prefix+".circuit-breaker.thresholding-period", time.Minute, "Moving window of time that the percentage of failed requests is computed over") - f.DurationVar(&cfg.CooldownPeriod, prefix+".circuit-breaker.cooldown-period", time.Minute, "How long the circuit breaker will stay in the open state before allowing some requests") + f.DurationVar(&cfg.CooldownPeriod, prefix+".circuit-breaker.cooldown-period", 10*time.Second, "How long the circuit breaker will stay in the open state before allowing some requests") } func (cfg *CircuitBreakerConfig) Validate() error { From 9c29fec2fa0f33b2c71d7969879d0e9c5ceef773 Mon Sep 17 00:00:00 2001 From: Yuri Nikolic Date: Wed, 7 Feb 2024 07:40:46 +0100 Subject: [PATCH 2/2] Update CHANGELOG Signed-off-by: Yuri Nikolic --- CHANGELOG.md | 1 + pkg/ingester/client/circuitbreaker_test.go | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0a477abc1d4..f3c6b4427f5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,6 +37,7 @@ * [CHANGE] Query-frontend: the default value of the CLI flag `-query-frontend.max-cache-freshness` (and its respective YAML configuration parameter) has been changed from `1m` to `10m`. #7161 * [CHANGE] Distributor: default the optimization `-distributor.write-requests-buffer-pooling-enabled` to `true`. #7165 * [CHANGE] Tracing: Move query information to span attributes instead of span logs. #7046 +* [CHANGE] Distributor: the default value of circuit breaker's CLI flag `-ingester.client.circuit-breaker.cooldown-period` has been changed from `1m` to `10s`. #7310 * [FEATURE] Introduce `-server.log-source-ips-full` option to log all IPs from `Forwarded`, `X-Real-IP`, `X-Forwarded-For` headers. #7250 * [FEATURE] Introduce `-tenant-federation.max-tenants` option to limit the max number of tenants allowed for requests when federation is enabled. #6959 * [FEATURE] Cardinality API: added a new `count_method` parameter which enables counting active label values. #7085 diff --git a/pkg/ingester/client/circuitbreaker_test.go b/pkg/ingester/client/circuitbreaker_test.go index 1ea9e281cea..efcb0181ccc 100644 --- a/pkg/ingester/client/circuitbreaker_test.go +++ b/pkg/ingester/client/circuitbreaker_test.go @@ -20,7 +20,6 @@ import ( "google.golang.org/grpc/codes" "github.com/grafana/mimir/pkg/mimirpb" - "github.com/grafana/mimir/pkg/util/test" )