Skip to content

Commit

Permalink
Don't count PermissionDenied as ServiceFailure (#3119)
Browse files Browse the repository at this point in the history
  • Loading branch information
yiminc authored Jul 21, 2022
1 parent ba585a9 commit 2617e65
Showing 1 changed file with 24 additions and 36 deletions.
60 changes: 24 additions & 36 deletions common/rpc/interceptor/telemetry.go
Original file line number Diff line number Diff line change
Expand Up @@ -235,52 +235,40 @@ func (ti *TelemetryInterceptor) handleError(

scope.Tagged(metrics.ServiceErrorTypeTag(err)).IncCounter(metrics.ServiceErrorWithType)

if common.IsContextDeadlineExceededErr(err) {
scope.IncCounter(metrics.ServiceErrContextTimeoutCounter)
return
}
if common.IsContextCanceledErr(err) {
scope.IncCounter(metrics.ServiceErrContextCancelledCounter)
if common.IsContextDeadlineExceededErr(err) || common.IsContextCanceledErr(err) {
return
}

switch err := err.(type) {
// we emit service_error_with_type metrics, no need to emit specific metric for these known error types.
case *serviceerror.WorkflowNotReady,
*serviceerror.NamespaceInvalidState,
*serviceerror.InvalidArgument,
*serviceerror.NamespaceNotActive,
*serviceerror.WorkflowExecutionAlreadyStarted,
*serviceerror.NotFound,
*serviceerror.NamespaceNotFound,
*serviceerror.NamespaceAlreadyExists,
*serviceerror.QueryFailed,
*serviceerror.ClientVersionNotSupported,
*serviceerror.PermissionDenied,
*serviceerrors.StickyWorkerUnavailable,
*serviceerror.NamespaceInvalidState:
// we emit service_errors_with_type metrics, no need to emit specific metric for this error type.
// TODO deprecate all metrics below
case *serviceerrors.ShardOwnershipLost:
scope.IncCounter(metrics.ServiceErrShardOwnershipLostCounter)
case *serviceerrors.TaskAlreadyStarted:
scope.IncCounter(metrics.ServiceErrTaskAlreadyStartedCounter)
case *serviceerror.InvalidArgument:
scope.IncCounter(metrics.ServiceErrInvalidArgumentCounter)
case *serviceerror.NamespaceNotActive:
scope.IncCounter(metrics.ServiceErrNamespaceNotActiveCounter)
case *serviceerror.WorkflowExecutionAlreadyStarted:
scope.IncCounter(metrics.ServiceErrExecutionAlreadyStartedCounter)
case *serviceerror.NotFound, *serviceerror.NamespaceNotFound:
scope.IncCounter(metrics.ServiceErrNotFoundCounter)
*serviceerrors.ShardOwnershipLost,
*serviceerrors.TaskAlreadyStarted,
*serviceerrors.RetryReplication:
// no-op

// specific metric for resource exhausted error with throttle reason
case *serviceerror.ResourceExhausted:
scope.Tagged(metrics.ResourceExhaustedCauseTag(err.Cause)).IncCounter(metrics.ServiceErrResourceExhaustedCounter)
case *serviceerrors.RetryReplication:
scope.IncCounter(metrics.ServiceErrRetryTaskCounter)
case *serviceerror.NamespaceAlreadyExists:
scope.IncCounter(metrics.ServiceErrNamespaceAlreadyExistsCounter)
case *serviceerror.QueryFailed:
scope.IncCounter(metrics.ServiceErrQueryFailedCounter)
case *serviceerror.ClientVersionNotSupported:
scope.IncCounter(metrics.ServiceErrClientVersionNotSupportedCounter)
case *serviceerror.DataLoss:
scope.IncCounter(metrics.ServiceFailures)
ti.logger.Error("unavailable error, data loss", append(logTags, tag.Error(err))...)
case *serviceerror.Unavailable:
scope.IncCounter(metrics.ServiceFailures)
ti.logger.Error("unavailable error", append(logTags, tag.Error(err))...)

// Any other errors are treated as ServiceFailures against SLA.
// Including below known errors and any other unknown errors.
// *serviceerror.DataLoss,
// *serviceerror.Unavailable:
default:
scope.IncCounter(metrics.ServiceFailures)
ti.logger.Error("uncategorized error", append(logTags, tag.Error(err))...)
ti.logger.Error("service failures", append(logTags, tag.Error(err))...)
}
}

Expand Down

0 comments on commit 2617e65

Please sign in to comment.