diff --git a/CHANGELOG.md b/CHANGELOG.md index 8673e91635e..a77f0e24665 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -49,6 +49,8 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm - `go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp` does no longer depend on `go.opentelemetry.io/otel/exporters/otlp/otlpmetric`. (#4660) - Retry for `502 Bad Gateway` and `504 Gateway Timeout` HTTP statuses in `go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp`. (#4670) - Retry for `502 Bad Gateway` and `504 Gateway Timeout` HTTP statuses in `go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp`. (#4670) +- Retry for `RESOURCE_EXHAUSTED` only if RetryInfo is returned in `go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc`. (#4669) +- Retry for `RESOURCE_EXHAUSTED` only if RetryInfo is returned in `go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc`. (#4669) - Retry temporary HTTP request failures in `go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp`. (#4679) - Retry temporary HTTP request failures in `go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp`. (#4679) diff --git a/exporters/otlp/otlpmetric/otlpmetricgrpc/client.go b/exporters/otlp/otlpmetric/otlpmetricgrpc/client.go index fe1ab3709a7..16f9af12b66 100644 --- a/exporters/otlp/otlpmetric/otlpmetricgrpc/client.go +++ b/exporters/otlp/otlpmetric/otlpmetricgrpc/client.go @@ -176,28 +176,36 @@ func (c *client) exportContext(parent context.Context) (context.Context, context // duration to wait for if an explicit throttle time is included in err. func retryable(err error) (bool, time.Duration) { s := status.Convert(err) + return retryableGRPCStatus(s) +} + +func retryableGRPCStatus(s *status.Status) (bool, time.Duration) { switch s.Code() { case codes.Canceled, codes.DeadlineExceeded, - codes.ResourceExhausted, codes.Aborted, codes.OutOfRange, codes.Unavailable, codes.DataLoss: - return true, throttleDelay(s) + // Additionally, handle RetryInfo. + _, d := throttleDelay(s) + return true, d + case codes.ResourceExhausted: + // Retry only if the server signals that the recovery from resource exhaustion is possible. + return throttleDelay(s) } // Not a retry-able error. return false, 0 } -// throttleDelay returns a duration to wait for if an explicit throttle time -// is included in the response status. -func throttleDelay(s *status.Status) time.Duration { +// throttleDelay returns if the status is RetryInfo +// and the duration to wait for if an explicit throttle time is included. +func throttleDelay(s *status.Status) (bool, time.Duration) { for _, detail := range s.Details() { if t, ok := detail.(*errdetails.RetryInfo); ok { - return t.RetryDelay.AsDuration() + return true, t.RetryDelay.AsDuration() } } - return 0 + return false, 0 } diff --git a/exporters/otlp/otlpmetric/otlpmetricgrpc/client_test.go b/exporters/otlp/otlpmetric/otlpmetricgrpc/client_test.go index 1e1b3527d79..03f07d69991 100644 --- a/exporters/otlp/otlpmetric/otlpmetricgrpc/client_test.go +++ b/exporters/otlp/otlpmetric/otlpmetricgrpc/client_test.go @@ -33,15 +33,17 @@ import ( "go.opentelemetry.io/otel/sdk/metric/metricdata" ) -func TestThrottleDuration(t *testing.T) { +func TestThrottleDelay(t *testing.T) { c := codes.ResourceExhausted testcases := []struct { - status *status.Status - expected time.Duration + status *status.Status + wantOK bool + wantDuration time.Duration }{ { - status: status.New(c, "NoRetryInfo"), - expected: 0, + status: status.New(c, "NoRetryInfo"), + wantOK: false, + wantDuration: 0, }, { status: func() *status.Status { @@ -53,7 +55,8 @@ func TestThrottleDuration(t *testing.T) { require.NoError(t, err) return s }(), - expected: 15 * time.Millisecond, + wantOK: true, + wantDuration: 15 * time.Millisecond, }, { status: func() *status.Status { @@ -63,7 +66,8 @@ func TestThrottleDuration(t *testing.T) { require.NoError(t, err) return s }(), - expected: 0, + wantOK: false, + wantDuration: 0, }, { status: func() *status.Status { @@ -76,7 +80,8 @@ func TestThrottleDuration(t *testing.T) { require.NoError(t, err) return s }(), - expected: 13 * time.Minute, + wantOK: true, + wantDuration: 13 * time.Minute, }, { status: func() *status.Status { @@ -91,13 +96,16 @@ func TestThrottleDuration(t *testing.T) { require.NoError(t, err) return s }(), - expected: 13 * time.Minute, + wantOK: true, + wantDuration: 13 * time.Minute, }, } for _, tc := range testcases { t.Run(tc.status.Message(), func(t *testing.T) { - require.Equal(t, tc.expected, throttleDelay(tc.status)) + ok, d := throttleDelay(tc.status) + assert.Equal(t, tc.wantOK, ok) + assert.Equal(t, tc.wantDuration, d) }) } } @@ -112,7 +120,7 @@ func TestRetryable(t *testing.T) { codes.NotFound: false, codes.AlreadyExists: false, codes.PermissionDenied: false, - codes.ResourceExhausted: true, + codes.ResourceExhausted: false, codes.FailedPrecondition: false, codes.Aborted: true, codes.OutOfRange: true, @@ -129,6 +137,20 @@ func TestRetryable(t *testing.T) { } } +func TestRetryableGRPCStatusResourceExhaustedWithRetryInfo(t *testing.T) { + delay := 15 * time.Millisecond + s, err := status.New(codes.ResourceExhausted, "WithRetryInfo").WithDetails( + &errdetails.RetryInfo{ + RetryDelay: durationpb.New(delay), + }, + ) + require.NoError(t, err) + + ok, d := retryableGRPCStatus(s) + assert.True(t, ok) + assert.Equal(t, delay, d) +} + type clientShim struct { *client } diff --git a/exporters/otlp/otlptrace/otlptracegrpc/client.go b/exporters/otlp/otlptrace/otlptracegrpc/client.go index 86fb61a0dec..b4cc21d7a3c 100644 --- a/exporters/otlp/otlptrace/otlptracegrpc/client.go +++ b/exporters/otlp/otlptrace/otlptracegrpc/client.go @@ -260,30 +260,38 @@ func (c *client) exportContext(parent context.Context) (context.Context, context // duration to wait for if an explicit throttle time is included in err. func retryable(err error) (bool, time.Duration) { s := status.Convert(err) + return retryableGRPCStatus(s) +} + +func retryableGRPCStatus(s *status.Status) (bool, time.Duration) { switch s.Code() { case codes.Canceled, codes.DeadlineExceeded, - codes.ResourceExhausted, codes.Aborted, codes.OutOfRange, codes.Unavailable, codes.DataLoss: - return true, throttleDelay(s) + // Additionally handle RetryInfo. + _, d := throttleDelay(s) + return true, d + case codes.ResourceExhausted: + // Retry only if the server signals that the recovery from resource exhaustion is possible. + return throttleDelay(s) } // Not a retry-able error. return false, 0 } -// throttleDelay returns a duration to wait for if an explicit throttle time -// is included in the response status. -func throttleDelay(s *status.Status) time.Duration { +// throttleDelay returns of the status is RetryInfo +// and the its duration to wait for if an explicit throttle time. +func throttleDelay(s *status.Status) (bool, time.Duration) { for _, detail := range s.Details() { if t, ok := detail.(*errdetails.RetryInfo); ok { - return t.RetryDelay.AsDuration() + return true, t.RetryDelay.AsDuration() } } - return 0 + return false, 0 } // MarshalLog is the marshaling function used by the logging system to represent this Client. diff --git a/exporters/otlp/otlptrace/otlptracegrpc/client_unit_test.go b/exporters/otlp/otlptrace/otlptracegrpc/client_unit_test.go index 5c43df1e322..df27a208daf 100644 --- a/exporters/otlp/otlptrace/otlptracegrpc/client_unit_test.go +++ b/exporters/otlp/otlptrace/otlptracegrpc/client_unit_test.go @@ -27,19 +27,21 @@ import ( "google.golang.org/protobuf/types/known/durationpb" ) -func TestThrottleDuration(t *testing.T) { +func TestThrottleDelay(t *testing.T) { c := codes.ResourceExhausted testcases := []struct { - status *status.Status - expected time.Duration + status *status.Status + wantOK bool + wantDuration time.Duration }{ { - status: status.New(c, "no retry info"), - expected: 0, + status: status.New(c, "NoRetryInfo"), + wantOK: false, + wantDuration: 0, }, { status: func() *status.Status { - s, err := status.New(c, "single retry info").WithDetails( + s, err := status.New(c, "SingleRetryInfo").WithDetails( &errdetails.RetryInfo{ RetryDelay: durationpb.New(15 * time.Millisecond), }, @@ -47,21 +49,23 @@ func TestThrottleDuration(t *testing.T) { require.NoError(t, err) return s }(), - expected: 15 * time.Millisecond, + wantOK: true, + wantDuration: 15 * time.Millisecond, }, { status: func() *status.Status { - s, err := status.New(c, "error info").WithDetails( + s, err := status.New(c, "ErrorInfo").WithDetails( &errdetails.ErrorInfo{Reason: "no throttle detail"}, ) require.NoError(t, err) return s }(), - expected: 0, + wantOK: false, + wantDuration: 0, }, { status: func() *status.Status { - s, err := status.New(c, "error and retry info").WithDetails( + s, err := status.New(c, "ErrorAndRetryInfo").WithDetails( &errdetails.ErrorInfo{Reason: "with throttle detail"}, &errdetails.RetryInfo{ RetryDelay: durationpb.New(13 * time.Minute), @@ -70,11 +74,12 @@ func TestThrottleDuration(t *testing.T) { require.NoError(t, err) return s }(), - expected: 13 * time.Minute, + wantOK: true, + wantDuration: 13 * time.Minute, }, { status: func() *status.Status { - s, err := status.New(c, "double retry info").WithDetails( + s, err := status.New(c, "DoubleRetryInfo").WithDetails( &errdetails.RetryInfo{ RetryDelay: durationpb.New(13 * time.Minute), }, @@ -85,13 +90,16 @@ func TestThrottleDuration(t *testing.T) { require.NoError(t, err) return s }(), - expected: 13 * time.Minute, + wantOK: true, + wantDuration: 13 * time.Minute, }, } for _, tc := range testcases { t.Run(tc.status.Message(), func(t *testing.T) { - require.Equal(t, tc.expected, throttleDelay(tc.status)) + ok, d := throttleDelay(tc.status) + assert.Equal(t, tc.wantOK, ok) + assert.Equal(t, tc.wantDuration, d) }) } } @@ -106,7 +114,7 @@ func TestRetryable(t *testing.T) { codes.NotFound: false, codes.AlreadyExists: false, codes.PermissionDenied: false, - codes.ResourceExhausted: true, + codes.ResourceExhausted: false, codes.FailedPrecondition: false, codes.Aborted: true, codes.OutOfRange: true, @@ -123,6 +131,20 @@ func TestRetryable(t *testing.T) { } } +func TestRetryableGRPCStatusResourceExhaustedWithRetryInfo(t *testing.T) { + delay := 15 * time.Millisecond + s, err := status.New(codes.ResourceExhausted, "WithRetryInfo").WithDetails( + &errdetails.RetryInfo{ + RetryDelay: durationpb.New(delay), + }, + ) + require.NoError(t, err) + + ok, d := retryableGRPCStatus(s) + assert.True(t, ok) + assert.Equal(t, delay, d) +} + func TestUnstartedStop(t *testing.T) { client := NewClient() assert.ErrorIs(t, client.Stop(context.Background()), errAlreadyStopped)