|
| 1 | +package interceptor |
| 2 | + |
| 3 | +//go:generate mockgen -package $GOPACKAGE -source $GOFILE -destination request_error_handler_mock.go |
| 4 | + |
| 5 | +import ( |
| 6 | + "errors" |
| 7 | + |
| 8 | + enumspb "go.temporal.io/api/enums/v1" |
| 9 | + "go.temporal.io/api/serviceerror" |
| 10 | + "go.temporal.io/server/common" |
| 11 | + "go.temporal.io/server/common/dynamicconfig" |
| 12 | + "go.temporal.io/server/common/log" |
| 13 | + "go.temporal.io/server/common/log/tag" |
| 14 | + "go.temporal.io/server/common/metrics" |
| 15 | + "go.temporal.io/server/common/namespace" |
| 16 | + "go.temporal.io/server/common/rpc/interceptor/logtags" |
| 17 | + serviceerrors "go.temporal.io/server/common/serviceerror" |
| 18 | + "go.temporal.io/server/common/tasktoken" |
| 19 | + "google.golang.org/grpc/codes" |
| 20 | +) |
| 21 | + |
| 22 | +type ( |
| 23 | + // ErrorHandler defines the interface for handling request errors |
| 24 | + ErrorHandler interface { |
| 25 | + HandleError( |
| 26 | + req any, |
| 27 | + fullMethod string, |
| 28 | + metricsHandler metrics.Handler, |
| 29 | + logTags []tag.Tag, |
| 30 | + err error, |
| 31 | + nsName namespace.Name, |
| 32 | + ) |
| 33 | + } |
| 34 | + |
| 35 | + // RequestErrorHandler handles error recording and logging for RPC interceptors |
| 36 | + RequestErrorHandler struct { |
| 37 | + logger log.Logger |
| 38 | + workflowTags *logtags.WorkflowTags |
| 39 | + logAllReqErrors dynamicconfig.BoolPropertyFnWithNamespaceFilter |
| 40 | + } |
| 41 | +) |
| 42 | + |
| 43 | +// NewRequestErrorHandler creates a new RequestErrorHandler |
| 44 | +func NewRequestErrorHandler( |
| 45 | + logger log.Logger, |
| 46 | + logAllReqErrors dynamicconfig.BoolPropertyFnWithNamespaceFilter, |
| 47 | +) *RequestErrorHandler { |
| 48 | + return &RequestErrorHandler{ |
| 49 | + logger: logger, |
| 50 | + workflowTags: logtags.NewWorkflowTags(tasktoken.NewSerializer(), logger), |
| 51 | + logAllReqErrors: logAllReqErrors, |
| 52 | + } |
| 53 | +} |
| 54 | + |
| 55 | +// HandleError handles error recording and logging |
| 56 | +func (eh *RequestErrorHandler) HandleError( |
| 57 | + req any, |
| 58 | + fullMethod string, |
| 59 | + metricsHandler metrics.Handler, |
| 60 | + logTags []tag.Tag, |
| 61 | + err error, |
| 62 | + nsName namespace.Name, |
| 63 | +) { |
| 64 | + statusCode := serviceerror.ToStatus(err).Code() |
| 65 | + if statusCode == codes.OK { |
| 66 | + return |
| 67 | + } |
| 68 | + |
| 69 | + isExpectedError := isExpectedErrorByStatusCode(statusCode) || isExpectedErrorByType(err) |
| 70 | + |
| 71 | + recordErrorMetrics(metricsHandler, err, isExpectedError) |
| 72 | + eh.logError(req, fullMethod, nsName, err, statusCode, isExpectedError, logTags) |
| 73 | +} |
| 74 | + |
| 75 | +func (eh *RequestErrorHandler) logError( |
| 76 | + req any, |
| 77 | + fullMethod string, |
| 78 | + nsName namespace.Name, |
| 79 | + err error, |
| 80 | + statusCode codes.Code, |
| 81 | + isExpectedError bool, |
| 82 | + logTags []tag.Tag, |
| 83 | +) { |
| 84 | + logAllErrors := nsName != "" && eh.logAllReqErrors(nsName.String()) |
| 85 | + // context errors may not be user errors, but still too noisy to log by default |
| 86 | + if !logAllErrors && (isExpectedError || |
| 87 | + common.IsContextDeadlineExceededErr(err) || |
| 88 | + common.IsContextCanceledErr(err) || |
| 89 | + common.IsResourceExhausted(err)) { |
| 90 | + return |
| 91 | + } |
| 92 | + |
| 93 | + logTags = append(logTags, tag.NewStringerTag("grpc_code", statusCode)) |
| 94 | + logTags = append(logTags, eh.workflowTags.Extract(req, fullMethod)...) |
| 95 | + |
| 96 | + eh.logger.Error("service failures", append(logTags, tag.Error(err))...) |
| 97 | +} |
| 98 | + |
| 99 | +func recordErrorMetrics(metricsHandler metrics.Handler, err error, isExpectedError bool) { |
| 100 | + metrics.ServiceErrorWithType.With(metricsHandler).Record(1, metrics.ServiceErrorTypeTag(err)) |
| 101 | + |
| 102 | + var resourceExhaustedErr *serviceerror.ResourceExhausted |
| 103 | + if errors.As(err, &resourceExhaustedErr) { |
| 104 | + metrics.ServiceErrResourceExhaustedCounter.With(metricsHandler).Record( |
| 105 | + 1, |
| 106 | + metrics.ResourceExhaustedCauseTag(resourceExhaustedErr.Cause), |
| 107 | + metrics.ResourceExhaustedScopeTag(resourceExhaustedErr.Scope), |
| 108 | + ) |
| 109 | + } |
| 110 | + |
| 111 | + if isExpectedError { |
| 112 | + return |
| 113 | + } |
| 114 | + |
| 115 | + metrics.ServiceFailures.With(metricsHandler).Record(1) |
| 116 | +} |
| 117 | + |
| 118 | +//nolint:revive // explicit cases in switch for documentation purposes |
| 119 | +func isExpectedErrorByStatusCode(statusCode codes.Code) bool { |
| 120 | + switch statusCode { |
| 121 | + case codes.Canceled, |
| 122 | + codes.InvalidArgument, |
| 123 | + codes.NotFound, |
| 124 | + codes.AlreadyExists, |
| 125 | + codes.PermissionDenied, |
| 126 | + codes.FailedPrecondition, |
| 127 | + codes.OutOfRange, |
| 128 | + codes.Unauthenticated: |
| 129 | + return true |
| 130 | + // We could just return false here, but making it explicit what codes are |
| 131 | + // considered (potentially) server errors. |
| 132 | + case codes.Unknown, |
| 133 | + codes.DeadlineExceeded, |
| 134 | + // the result for resource exhausted depends on the resource exhausted scope and |
| 135 | + // will be handled by isExpectedErrorByType() |
| 136 | + codes.ResourceExhausted, |
| 137 | + codes.Aborted, |
| 138 | + codes.Unimplemented, |
| 139 | + codes.Internal, |
| 140 | + codes.Unavailable, |
| 141 | + codes.DataLoss: |
| 142 | + return false |
| 143 | + default: |
| 144 | + return false |
| 145 | + } |
| 146 | +} |
| 147 | + |
| 148 | +func isExpectedErrorByType(err error) bool { |
| 149 | + // This is not a full list of service errors. |
| 150 | + // Only errors with status code that fails the isExpectedErrorByStatusCode() check |
| 151 | + // but are actually expected need to be explicitly handled here. |
| 152 | + // |
| 153 | + // Some of the errors listed below does not failed the isExpectedErrorByStatusCode() check |
| 154 | + // but are listed nonetheless. |
| 155 | + switch err := err.(type) { |
| 156 | + case *serviceerror.ResourceExhausted: |
| 157 | + return err.Scope == enumspb.RESOURCE_EXHAUSTED_SCOPE_NAMESPACE |
| 158 | + case *serviceerror.Canceled, |
| 159 | + *serviceerror.AlreadyExists, |
| 160 | + *serviceerror.CancellationAlreadyRequested, |
| 161 | + *serviceerror.FailedPrecondition, |
| 162 | + *serviceerror.NamespaceInvalidState, |
| 163 | + *serviceerror.NamespaceNotActive, |
| 164 | + *serviceerror.NamespaceNotFound, |
| 165 | + *serviceerror.NamespaceAlreadyExists, |
| 166 | + *serviceerror.InvalidArgument, |
| 167 | + *serviceerror.WorkflowExecutionAlreadyStarted, |
| 168 | + *serviceerror.WorkflowNotReady, |
| 169 | + *serviceerror.NotFound, |
| 170 | + *serviceerror.QueryFailed, |
| 171 | + *serviceerror.ClientVersionNotSupported, |
| 172 | + *serviceerror.ServerVersionNotSupported, |
| 173 | + *serviceerror.PermissionDenied, |
| 174 | + *serviceerror.NewerBuildExists, |
| 175 | + *serviceerrors.StickyWorkerUnavailable, |
| 176 | + *serviceerrors.TaskAlreadyStarted, |
| 177 | + *serviceerrors.RetryReplication, |
| 178 | + *serviceerrors.SyncState: |
| 179 | + return true |
| 180 | + default: |
| 181 | + return false |
| 182 | + } |
| 183 | +} |
0 commit comments