Skip to content

Commit 3882bb6

Browse files
authored
Capture task processing panic (#3779)
1 parent e91760f commit 3882bb6

13 files changed

+116
-15
lines changed

common/util.go

+6
Original file line numberDiff line numberDiff line change
@@ -371,6 +371,12 @@ func IsResourceExhausted(err error) bool {
371371
return false
372372
}
373373

374+
// IsInternalError checks if the error is an internal error.
375+
func IsInternalError(err error) bool {
376+
var internalErr *serviceerror.Internal
377+
return errors.As(err, &internalErr)
378+
}
379+
374380
// WorkflowIDToHistoryShard is used to map namespaceID-workflowID pair to a shardID.
375381
func WorkflowIDToHistoryShard(
376382
namespaceID string,

service/history/archival_queue_task_executor_test.go

+1
Original file line numberDiff line numberDiff line change
@@ -509,6 +509,7 @@ func TestArchivalQueueTaskExecutor(t *testing.T) {
509509
queues.NewNoopPriorityAssigner(),
510510
timeSource,
511511
namespaceRegistry,
512+
mockMetadata,
512513
nil,
513514
metrics.NoopMetricsHandler,
514515
)

service/history/queues/executable.go

+55-8
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ import (
3636
"go.temporal.io/server/common"
3737
"go.temporal.io/server/common/backoff"
3838
"go.temporal.io/server/common/clock"
39+
"go.temporal.io/server/common/cluster"
3940
"go.temporal.io/server/common/headers"
4041
"go.temporal.io/server/common/log"
4142
"go.temporal.io/server/common/log/tag"
@@ -110,6 +111,7 @@ type (
110111
priorityAssigner PriorityAssigner
111112
timeSource clock.TimeSource
112113
namespaceRegistry namespace.Registry
114+
clusterMetadata cluster.Metadata
113115

114116
readerID int32
115117
loadTime time.Time
@@ -135,6 +137,7 @@ func NewExecutable(
135137
priorityAssigner PriorityAssigner,
136138
timeSource clock.TimeSource,
137139
namespaceRegistry namespace.Registry,
140+
clusterMetadata cluster.Metadata,
138141
logger log.Logger,
139142
metricsHandler metrics.Handler,
140143
) Executable {
@@ -148,6 +151,7 @@ func NewExecutable(
148151
priorityAssigner: priorityAssigner,
149152
timeSource: timeSource,
150153
namespaceRegistry: namespaceRegistry,
154+
clusterMetadata: clusterMetadata,
151155
readerID: readerID,
152156
loadTime: util.MaxTime(timeSource.Now(), task.GetKey().FireTime),
153157
logger: log.NewLazyLogger(
@@ -163,15 +167,29 @@ func NewExecutable(
163167
return executable
164168
}
165169

166-
func (e *executableImpl) Execute() error {
170+
func (e *executableImpl) Execute() (retErr error) {
167171
if e.State() == ctasks.TaskStateCancelled {
168172
return nil
169173
}
170174

171-
ctx := metrics.AddMetricsContext(context.Background())
172-
namespace, _ := e.namespaceRegistry.GetNamespaceName(namespace.ID(e.GetNamespaceID()))
175+
namespaceName, _ := e.namespaceRegistry.GetNamespaceName(namespace.ID(e.GetNamespaceID()))
176+
ctx := headers.SetCallerInfo(
177+
metrics.AddMetricsContext(context.Background()),
178+
headers.NewBackgroundCallerInfo(namespaceName.String()),
179+
)
180+
181+
var panicErr error
182+
defer func() {
183+
if panicErr != nil {
184+
retErr = panicErr
185+
186+
// we need to guess the metrics tags here as we don't know which execution logic
187+
// is actually used which is upto the executor implementation
188+
e.taggedMetricsHandler = e.metricsHandler.WithTags(e.estimateTaskMetricTag()...)
189+
}
190+
}()
173191

174-
ctx = headers.SetCallerInfo(ctx, headers.NewBackgroundCallerInfo(namespace.String()))
192+
defer log.CapturePanic(e.logger, &panicErr)
175193

176194
startTime := e.timeSource.Now()
177195

@@ -296,6 +314,12 @@ func (e *executableImpl) IsRetryableError(err error) bool {
296314
return false
297315
}
298316

317+
// Internal error is non-retryable and usually means unexpected error has happened,
318+
// e.g. unknown task, corrupted state, panic etc.
319+
if common.IsInternalError(err) {
320+
return false
321+
}
322+
299323
// ErrTaskRetry means mutable state is not ready for standby task processing
300324
// there's no point for retrying the task immediately which will hold the worker corouinte
301325
// TODO: change ErrTaskRetry to a better name
@@ -424,6 +448,10 @@ func (e *executableImpl) shouldResubmitOnNack(attempt int, err error) bool {
424448
return false
425449
}
426450

451+
if common.IsInternalError(err) {
452+
return false
453+
}
454+
427455
return err != consts.ErrTaskRetry &&
428456
err != consts.ErrDependencyTaskNotCompleted &&
429457
err != consts.ErrNamespaceHandover
@@ -433,13 +461,14 @@ func (e *executableImpl) rescheduleTime(
433461
err error,
434462
attempt int,
435463
) time.Time {
436-
// elapsedTime (the first parameter in ComputeNextDelay) is not relevant here
464+
// elapsedTime, the first parameter in ComputeNextDelay is not relevant here
437465
// since reschedule policy has no expiration interval.
438466

439-
if err == consts.ErrTaskRetry || err == consts.ErrNamespaceHandover {
467+
if err == consts.ErrTaskRetry ||
468+
err == consts.ErrNamespaceHandover ||
469+
common.IsInternalError(err) {
440470
// using a different reschedule policy to slow down retry
441-
// as the error means mutable state or namespace is not ready to handle the task,
442-
// need to wait for replication.
471+
// as immediate retry typically won't resolve the issue.
443472
return e.timeSource.Now().Add(taskNotReadyReschedulePolicy.ComputeNextDelay(0, attempt))
444473
}
445474

@@ -469,3 +498,21 @@ func (e *executableImpl) updatePriority() {
469498
e.lowestPriority = e.priority
470499
}
471500
}
501+
502+
func (e *executableImpl) estimateTaskMetricTag() []metrics.Tag {
503+
namespaceTag := metrics.NamespaceUnknownTag()
504+
isActive := true
505+
506+
namespace, err := e.namespaceRegistry.GetNamespaceByID(namespace.ID(e.GetNamespaceID()))
507+
if err == nil {
508+
namespaceTag = metrics.NamespaceTag(namespace.Name().String())
509+
isActive = namespace.ActiveInCluster(e.clusterMetadata.GetCurrentClusterName())
510+
}
511+
512+
taskType := getTaskTypeTagValue(e.Task, isActive)
513+
return []metrics.Tag{
514+
namespaceTag,
515+
metrics.TaskTypeTag(taskType),
516+
metrics.OperationTag(taskType), // for backward compatibility
517+
}
518+
}

service/history/queues/executable_test.go

+17
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ import (
3737
"go.temporal.io/api/serviceerror"
3838

3939
"go.temporal.io/server/common/clock"
40+
"go.temporal.io/server/common/cluster"
4041
"go.temporal.io/server/common/definition"
4142
"go.temporal.io/server/common/log"
4243
"go.temporal.io/server/common/metrics"
@@ -57,6 +58,7 @@ type (
5758
mockScheduler *MockScheduler
5859
mockRescheduler *MockRescheduler
5960
mockNamespaceRegistry *namespace.MockRegistry
61+
mockClusterMetadata *cluster.MockMetadata
6062

6163
timeSource *clock.EventTimeSource
6264
}
@@ -75,8 +77,11 @@ func (s *executableSuite) SetupTest() {
7577
s.mockScheduler = NewMockScheduler(s.controller)
7678
s.mockRescheduler = NewMockRescheduler(s.controller)
7779
s.mockNamespaceRegistry = namespace.NewMockRegistry(s.controller)
80+
s.mockClusterMetadata = cluster.NewMockMetadata(s.controller)
7881

7982
s.mockNamespaceRegistry.EXPECT().GetNamespaceName(gomock.Any()).Return(tests.Namespace, nil).AnyTimes()
83+
s.mockNamespaceRegistry.EXPECT().GetNamespaceByID(gomock.Any()).Return(tests.GlobalNamespaceEntry, nil).AnyTimes()
84+
s.mockClusterMetadata.EXPECT().GetCurrentClusterName().Return(cluster.TestCurrentClusterName).AnyTimes()
8085

8186
s.timeSource = clock.NewEventTimeSource()
8287
}
@@ -108,6 +113,17 @@ func (s *executableSuite) TestExecute_UserLatency() {
108113
s.Equal(time.Duration(expectedUserLatency), executable.(*executableImpl).userLatency)
109114
}
110115

116+
func (s *executableSuite) TestExecute_CapturePanic() {
117+
executable := s.newTestExecutable()
118+
119+
s.mockExecutor.EXPECT().Execute(gomock.Any(), executable).DoAndReturn(
120+
func(_ context.Context, _ Executable) ([]metrics.Tag, bool, error) {
121+
panic("test panic during execution")
122+
},
123+
)
124+
s.Error(executable.Execute())
125+
}
126+
111127
func (s *executableSuite) TestHandleErr_EntityNotExists() {
112128
executable := s.newTestExecutable()
113129

@@ -219,6 +235,7 @@ func (s *executableSuite) newTestExecutable() Executable {
219235
NewNoopPriorityAssigner(),
220236
s.timeSource,
221237
s.mockNamespaceRegistry,
238+
s.mockClusterMetadata,
222239
log.NewTestLogger(),
223240
metrics.NoopMetricsHandler,
224241
)

service/history/queues/metrics.go

+24
Original file line numberDiff line numberDiff line change
@@ -148,3 +148,27 @@ func GetArchivalTaskTypeTagValue(
148148
return ""
149149
}
150150
}
151+
152+
func getTaskTypeTagValue(
153+
task tasks.Task,
154+
isActive bool,
155+
) string {
156+
switch task.GetCategory() {
157+
case tasks.CategoryTransfer:
158+
if isActive {
159+
return GetActiveTransferTaskTypeTagValue(task)
160+
}
161+
return GetStandbyTransferTaskTypeTagValue(task)
162+
case tasks.CategoryTimer:
163+
if isActive {
164+
return GetActiveTimerTaskTypeTagValue(task)
165+
}
166+
return GetStandbyTimerTaskTypeTagValue(task)
167+
case tasks.CategoryVisibility:
168+
return GetVisibilityTaskTypeTagValue(task)
169+
case tasks.CategoryArchival:
170+
return GetArchivalTaskTypeTagValue(task)
171+
default:
172+
return task.GetType().String()
173+
}
174+
}

service/history/queues/queue_base.go

+1
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,7 @@ func newQueueBase(
164164
priorityAssigner,
165165
timeSource,
166166
shard.GetNamespaceRegistry(),
167+
shard.GetClusterMetadata(),
167168
logger,
168169
metricsHandler,
169170
)

service/history/queues/reader_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ func (s *readerSuite) SetupTest() {
7777
s.metricsHandler = metrics.NoopMetricsHandler
7878

7979
s.executableInitializer = func(readerID int32, t tasks.Task) Executable {
80-
return NewExecutable(readerID, t, nil, nil, nil, NewNoopPriorityAssigner(), clock.NewRealTimeSource(), nil, nil, metrics.NoopMetricsHandler)
80+
return NewExecutable(readerID, t, nil, nil, nil, NewNoopPriorityAssigner(), clock.NewRealTimeSource(), nil, nil, nil, metrics.NoopMetricsHandler)
8181
}
8282
s.monitor = newMonitor(tasks.CategoryTypeScheduled, &MonitorOptions{
8383
PendingTasksCriticalCount: dynamicconfig.GetIntPropertyFn(1000),

service/history/queues/slice_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ func (s *sliceSuite) SetupTest() {
6969
s.controller = gomock.NewController(s.T())
7070

7171
s.executableInitializer = func(readerID int32, t tasks.Task) Executable {
72-
return NewExecutable(readerID, t, nil, nil, nil, NewNoopPriorityAssigner(), clock.NewRealTimeSource(), nil, nil, metrics.NoopMetricsHandler)
72+
return NewExecutable(readerID, t, nil, nil, nil, NewNoopPriorityAssigner(), clock.NewRealTimeSource(), nil, nil, nil, metrics.NoopMetricsHandler)
7373
}
7474
s.monitor = newMonitor(tasks.CategoryTypeScheduled, &MonitorOptions{
7575
PendingTasksCriticalCount: dynamicconfig.GetIntPropertyFn(1000),

service/history/timerQueueActiveTaskExecutor_test.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -1479,7 +1479,8 @@ func (s *timerQueueActiveTaskExecutorSuite) newTaskExecutable(
14791479
nil,
14801480
queues.NewNoopPriorityAssigner(),
14811481
s.mockShard.GetTimeSource(),
1482-
nil,
1482+
s.mockNamespaceCache,
1483+
s.mockClusterMetadata,
14831484
nil,
14841485
metrics.NoopMetricsHandler,
14851486
)

service/history/timerQueueStandbyTaskExecutor_test.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -1542,7 +1542,8 @@ func (s *timerQueueStandbyTaskExecutorSuite) newTaskExecutable(
15421542
nil,
15431543
queues.NewNoopPriorityAssigner(),
15441544
s.mockShard.GetTimeSource(),
1545-
nil,
1545+
s.mockNamespaceCache,
1546+
s.mockClusterMetadata,
15461547
nil,
15471548
metrics.NoopMetricsHandler,
15481549
)

service/history/transferQueueActiveTaskExecutor_test.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -2802,7 +2802,8 @@ func (s *transferQueueActiveTaskExecutorSuite) newTaskExecutable(
28022802
nil,
28032803
queues.NewNoopPriorityAssigner(),
28042804
s.mockShard.GetTimeSource(),
2805-
nil,
2805+
s.mockNamespaceCache,
2806+
s.mockClusterMetadata,
28062807
nil,
28072808
metrics.NoopMetricsHandler,
28082809
)

service/history/transferQueueStandbyTaskExecutor_test.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -1300,7 +1300,8 @@ func (s *transferQueueStandbyTaskExecutorSuite) newTaskExecutable(
13001300
nil,
13011301
queues.NewNoopPriorityAssigner(),
13021302
s.mockShard.GetTimeSource(),
1303-
nil,
1303+
s.mockNamespaceCache,
1304+
s.mockClusterMetadata,
13041305
nil,
13051306
metrics.NoopMetricsHandler,
13061307
)

service/history/visibilityQueueTaskExecutor_test.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -638,7 +638,8 @@ func (s *visibilityQueueTaskExecutorSuite) newTaskExecutable(
638638
nil,
639639
queues.NewNoopPriorityAssigner(),
640640
s.mockShard.GetTimeSource(),
641-
nil,
641+
s.mockShard.GetNamespaceRegistry(),
642+
s.mockShard.GetClusterMetadata(),
642643
nil,
643644
metrics.NoopMetricsHandler,
644645
)

0 commit comments

Comments
 (0)