From 2617e65701840a93fe821ba38f16a2e708ad09f6 Mon Sep 17 00:00:00 2001 From: Yimin Chen Date: Wed, 20 Jul 2022 20:39:45 -0700 Subject: [PATCH] Don't count PermissionDenied as ServiceFailure (#3119) --- common/rpc/interceptor/telemetry.go | 60 ++++++++++++----------------- 1 file changed, 24 insertions(+), 36 deletions(-) diff --git a/common/rpc/interceptor/telemetry.go b/common/rpc/interceptor/telemetry.go index 0bfabed0dc8..8272688b33e 100644 --- a/common/rpc/interceptor/telemetry.go +++ b/common/rpc/interceptor/telemetry.go @@ -235,52 +235,40 @@ func (ti *TelemetryInterceptor) handleError( scope.Tagged(metrics.ServiceErrorTypeTag(err)).IncCounter(metrics.ServiceErrorWithType) - if common.IsContextDeadlineExceededErr(err) { - scope.IncCounter(metrics.ServiceErrContextTimeoutCounter) - return - } - if common.IsContextCanceledErr(err) { - scope.IncCounter(metrics.ServiceErrContextCancelledCounter) + if common.IsContextDeadlineExceededErr(err) || common.IsContextCanceledErr(err) { return } switch err := err.(type) { + // we emit service_error_with_type metrics, no need to emit specific metric for these known error types. case *serviceerror.WorkflowNotReady, + *serviceerror.NamespaceInvalidState, + *serviceerror.InvalidArgument, + *serviceerror.NamespaceNotActive, + *serviceerror.WorkflowExecutionAlreadyStarted, + *serviceerror.NotFound, + *serviceerror.NamespaceNotFound, + *serviceerror.NamespaceAlreadyExists, + *serviceerror.QueryFailed, + *serviceerror.ClientVersionNotSupported, + *serviceerror.PermissionDenied, *serviceerrors.StickyWorkerUnavailable, - *serviceerror.NamespaceInvalidState: - // we emit service_errors_with_type metrics, no need to emit specific metric for this error type. - // TODO deprecate all metrics below - case *serviceerrors.ShardOwnershipLost: - scope.IncCounter(metrics.ServiceErrShardOwnershipLostCounter) - case *serviceerrors.TaskAlreadyStarted: - scope.IncCounter(metrics.ServiceErrTaskAlreadyStartedCounter) - case *serviceerror.InvalidArgument: - scope.IncCounter(metrics.ServiceErrInvalidArgumentCounter) - case *serviceerror.NamespaceNotActive: - scope.IncCounter(metrics.ServiceErrNamespaceNotActiveCounter) - case *serviceerror.WorkflowExecutionAlreadyStarted: - scope.IncCounter(metrics.ServiceErrExecutionAlreadyStartedCounter) - case *serviceerror.NotFound, *serviceerror.NamespaceNotFound: - scope.IncCounter(metrics.ServiceErrNotFoundCounter) + *serviceerrors.ShardOwnershipLost, + *serviceerrors.TaskAlreadyStarted, + *serviceerrors.RetryReplication: + // no-op + + // specific metric for resource exhausted error with throttle reason case *serviceerror.ResourceExhausted: scope.Tagged(metrics.ResourceExhaustedCauseTag(err.Cause)).IncCounter(metrics.ServiceErrResourceExhaustedCounter) - case *serviceerrors.RetryReplication: - scope.IncCounter(metrics.ServiceErrRetryTaskCounter) - case *serviceerror.NamespaceAlreadyExists: - scope.IncCounter(metrics.ServiceErrNamespaceAlreadyExistsCounter) - case *serviceerror.QueryFailed: - scope.IncCounter(metrics.ServiceErrQueryFailedCounter) - case *serviceerror.ClientVersionNotSupported: - scope.IncCounter(metrics.ServiceErrClientVersionNotSupportedCounter) - case *serviceerror.DataLoss: - scope.IncCounter(metrics.ServiceFailures) - ti.logger.Error("unavailable error, data loss", append(logTags, tag.Error(err))...) - case *serviceerror.Unavailable: - scope.IncCounter(metrics.ServiceFailures) - ti.logger.Error("unavailable error", append(logTags, tag.Error(err))...) + + // Any other errors are treated as ServiceFailures against SLA. + // Including below known errors and any other unknown errors. + // *serviceerror.DataLoss, + // *serviceerror.Unavailable: default: scope.IncCounter(metrics.ServiceFailures) - ti.logger.Error("uncategorized error", append(logTags, tag.Error(err))...) + ti.logger.Error("service failures", append(logTags, tag.Error(err))...) } }