Skip to content

Commit 49e7be8

Browse files
authored
Emit Error Telemetry on Namespace Handover Failure (#8430)
## What changed? Errors that occur prior to the telemetry interceptor are not captured, meaning there is no log, the request isn't counted, and the error isn't counted. I've extracted the telemetry interceptors error handling into a shared obj that I've wired into the NamespaceHandover error handling for now. An alternative approach was to move the TelemetryInterceptor earlier in the chain, but this would have caused double counting on redirects, and adding another dimension to our metrics (pre/post redirect) here to address the double counting would cause other classes of issues As of now, other interceptors that are added prior to the TelemetryInterceptor are also at risk of this. We can wrap PreTelemetry errors and have a new interceptor handle these if we want to more generically address this issue - view my commit history for "Experimental" (which I reverted) to see an example of this. ## Why? To specifically track metrics/logs on NamespaceHandover failures. ## How did you test it? - [x] built - [ ] run locally and tested manually - [ ] covered by existing tests - [ ] added new unit test(s) - [ ] added new functional test(s) ## Potential risks Potential metrics loss if for some reason telemetry metric error handling is not wired correctly after the refactor.
1 parent b1330d8 commit 49e7be8

File tree

12 files changed

+355
-161
lines changed

12 files changed

+355
-161
lines changed

common/rpc/interceptor/namespace_handover.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ type (
3232
nsCacheRefreshInterval dynamicconfig.DurationPropertyFn
3333
metricsHandler metrics.Handler
3434
logger log.Logger
35+
requestErrorHandler ErrorHandler
3536
}
3637
)
3738

@@ -41,6 +42,7 @@ func NewNamespaceHandoverInterceptor(
4142
metricsHandler metrics.Handler,
4243
logger log.Logger,
4344
timeSource clock.TimeSource,
45+
requestErrorHandler ErrorHandler,
4446
) *NamespaceHandoverInterceptor {
4547

4648
return &NamespaceHandoverInterceptor{
@@ -50,6 +52,7 @@ func NewNamespaceHandoverInterceptor(
5052
metricsHandler: metricsHandler,
5153
logger: logger,
5254
timeSource: timeSource,
55+
requestErrorHandler: requestErrorHandler,
5356
}
5457
}
5558

@@ -78,6 +81,24 @@ func (i *NamespaceHandoverInterceptor) Intercept(
7881
}()
7982
waitTime, err := i.waitNamespaceHandoverUpdate(ctx, namespaceName, methodName)
8083
if err != nil {
84+
metricsHandler, logTags := CreateUnaryMetricsHandlerLogTags(
85+
i.metricsHandler,
86+
req,
87+
info.FullMethod,
88+
methodName,
89+
namespaceName,
90+
)
91+
// count the request as this will not be counted
92+
metrics.ServiceRequests.With(metricsHandler).Record(1)
93+
94+
i.requestErrorHandler.HandleError(
95+
req,
96+
info.FullMethod,
97+
metricsHandler,
98+
logTags,
99+
err,
100+
namespaceName,
101+
)
81102
return nil, err
82103
}
83104
}
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
package interceptor
2+
3+
//go:generate mockgen -package $GOPACKAGE -source $GOFILE -destination request_error_handler_mock.go
4+
5+
import (
6+
"errors"
7+
8+
enumspb "go.temporal.io/api/enums/v1"
9+
"go.temporal.io/api/serviceerror"
10+
"go.temporal.io/server/common"
11+
"go.temporal.io/server/common/dynamicconfig"
12+
"go.temporal.io/server/common/log"
13+
"go.temporal.io/server/common/log/tag"
14+
"go.temporal.io/server/common/metrics"
15+
"go.temporal.io/server/common/namespace"
16+
"go.temporal.io/server/common/rpc/interceptor/logtags"
17+
serviceerrors "go.temporal.io/server/common/serviceerror"
18+
"go.temporal.io/server/common/tasktoken"
19+
"google.golang.org/grpc/codes"
20+
)
21+
22+
type (
23+
// ErrorHandler defines the interface for handling request errors
24+
ErrorHandler interface {
25+
HandleError(
26+
req any,
27+
fullMethod string,
28+
metricsHandler metrics.Handler,
29+
logTags []tag.Tag,
30+
err error,
31+
nsName namespace.Name,
32+
)
33+
}
34+
35+
// RequestErrorHandler handles error recording and logging for RPC interceptors
36+
RequestErrorHandler struct {
37+
logger log.Logger
38+
workflowTags *logtags.WorkflowTags
39+
logAllReqErrors dynamicconfig.BoolPropertyFnWithNamespaceFilter
40+
}
41+
)
42+
43+
// NewRequestErrorHandler creates a new RequestErrorHandler
44+
func NewRequestErrorHandler(
45+
logger log.Logger,
46+
logAllReqErrors dynamicconfig.BoolPropertyFnWithNamespaceFilter,
47+
) *RequestErrorHandler {
48+
return &RequestErrorHandler{
49+
logger: logger,
50+
workflowTags: logtags.NewWorkflowTags(tasktoken.NewSerializer(), logger),
51+
logAllReqErrors: logAllReqErrors,
52+
}
53+
}
54+
55+
// HandleError handles error recording and logging
56+
func (eh *RequestErrorHandler) HandleError(
57+
req any,
58+
fullMethod string,
59+
metricsHandler metrics.Handler,
60+
logTags []tag.Tag,
61+
err error,
62+
nsName namespace.Name,
63+
) {
64+
statusCode := serviceerror.ToStatus(err).Code()
65+
if statusCode == codes.OK {
66+
return
67+
}
68+
69+
isExpectedError := isExpectedErrorByStatusCode(statusCode) || isExpectedErrorByType(err)
70+
71+
recordErrorMetrics(metricsHandler, err, isExpectedError)
72+
eh.logError(req, fullMethod, nsName, err, statusCode, isExpectedError, logTags)
73+
}
74+
75+
func (eh *RequestErrorHandler) logError(
76+
req any,
77+
fullMethod string,
78+
nsName namespace.Name,
79+
err error,
80+
statusCode codes.Code,
81+
isExpectedError bool,
82+
logTags []tag.Tag,
83+
) {
84+
logAllErrors := nsName != "" && eh.logAllReqErrors(nsName.String())
85+
// context errors may not be user errors, but still too noisy to log by default
86+
if !logAllErrors && (isExpectedError ||
87+
common.IsContextDeadlineExceededErr(err) ||
88+
common.IsContextCanceledErr(err) ||
89+
common.IsResourceExhausted(err)) {
90+
return
91+
}
92+
93+
logTags = append(logTags, tag.NewStringerTag("grpc_code", statusCode))
94+
logTags = append(logTags, eh.workflowTags.Extract(req, fullMethod)...)
95+
96+
eh.logger.Error("service failures", append(logTags, tag.Error(err))...)
97+
}
98+
99+
func recordErrorMetrics(metricsHandler metrics.Handler, err error, isExpectedError bool) {
100+
metrics.ServiceErrorWithType.With(metricsHandler).Record(1, metrics.ServiceErrorTypeTag(err))
101+
102+
var resourceExhaustedErr *serviceerror.ResourceExhausted
103+
if errors.As(err, &resourceExhaustedErr) {
104+
metrics.ServiceErrResourceExhaustedCounter.With(metricsHandler).Record(
105+
1,
106+
metrics.ResourceExhaustedCauseTag(resourceExhaustedErr.Cause),
107+
metrics.ResourceExhaustedScopeTag(resourceExhaustedErr.Scope),
108+
)
109+
}
110+
111+
if isExpectedError {
112+
return
113+
}
114+
115+
metrics.ServiceFailures.With(metricsHandler).Record(1)
116+
}
117+
118+
//nolint:revive // explicit cases in switch for documentation purposes
119+
func isExpectedErrorByStatusCode(statusCode codes.Code) bool {
120+
switch statusCode {
121+
case codes.Canceled,
122+
codes.InvalidArgument,
123+
codes.NotFound,
124+
codes.AlreadyExists,
125+
codes.PermissionDenied,
126+
codes.FailedPrecondition,
127+
codes.OutOfRange,
128+
codes.Unauthenticated:
129+
return true
130+
// We could just return false here, but making it explicit what codes are
131+
// considered (potentially) server errors.
132+
case codes.Unknown,
133+
codes.DeadlineExceeded,
134+
// the result for resource exhausted depends on the resource exhausted scope and
135+
// will be handled by isExpectedErrorByType()
136+
codes.ResourceExhausted,
137+
codes.Aborted,
138+
codes.Unimplemented,
139+
codes.Internal,
140+
codes.Unavailable,
141+
codes.DataLoss:
142+
return false
143+
default:
144+
return false
145+
}
146+
}
147+
148+
func isExpectedErrorByType(err error) bool {
149+
// This is not a full list of service errors.
150+
// Only errors with status code that fails the isExpectedErrorByStatusCode() check
151+
// but are actually expected need to be explicitly handled here.
152+
//
153+
// Some of the errors listed below does not failed the isExpectedErrorByStatusCode() check
154+
// but are listed nonetheless.
155+
switch err := err.(type) {
156+
case *serviceerror.ResourceExhausted:
157+
return err.Scope == enumspb.RESOURCE_EXHAUSTED_SCOPE_NAMESPACE
158+
case *serviceerror.Canceled,
159+
*serviceerror.AlreadyExists,
160+
*serviceerror.CancellationAlreadyRequested,
161+
*serviceerror.FailedPrecondition,
162+
*serviceerror.NamespaceInvalidState,
163+
*serviceerror.NamespaceNotActive,
164+
*serviceerror.NamespaceNotFound,
165+
*serviceerror.NamespaceAlreadyExists,
166+
*serviceerror.InvalidArgument,
167+
*serviceerror.WorkflowExecutionAlreadyStarted,
168+
*serviceerror.WorkflowNotReady,
169+
*serviceerror.NotFound,
170+
*serviceerror.QueryFailed,
171+
*serviceerror.ClientVersionNotSupported,
172+
*serviceerror.ServerVersionNotSupported,
173+
*serviceerror.PermissionDenied,
174+
*serviceerror.NewerBuildExists,
175+
*serviceerrors.StickyWorkerUnavailable,
176+
*serviceerrors.TaskAlreadyStarted,
177+
*serviceerrors.RetryReplication,
178+
*serviceerrors.SyncState:
179+
return true
180+
default:
181+
return false
182+
}
183+
}

common/rpc/interceptor/request_error_handler_mock.go

Lines changed: 55 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)