@@ -94,7 +94,7 @@ func newTask(
94
94
historyClient : historyClient ,
95
95
adminClient : adminClient ,
96
96
97
- metricsHandler : metricsHandler ,
97
+ metricsHandler : metricsHandler . WithTags ( metrics . OperationTag ( metrics . ExecutionsScavengerScope )) ,
98
98
logger : logger ,
99
99
scavenger : scavenger ,
100
100
@@ -112,12 +112,15 @@ func (t *task) Run() executor.TaskStatus {
112
112
))
113
113
114
114
iter := collection .NewPagingIteratorWithToken (t .getPaginationFn (), t .paginationToken )
115
+ var retryTask bool
115
116
for iter .HasNext () {
116
117
_ = t .rateLimiter .Wait (t .ctx )
117
118
record , err := iter .Next ()
118
119
if err != nil {
120
+ t .metricsHandler .Counter (metrics .ScavengerValidationSkipsCount .GetMetricName ()).Record (1 )
121
+ // continue validation process and retry after all workflow records has been iterated.
119
122
t .logger .Error ("unable to paginate concrete execution" , tag .ShardID (t .shardID ), tag .Error (err ))
120
- return executor . TaskStatusDefer
123
+ retryTask = true
121
124
}
122
125
123
126
mutableState := & MutableState {WorkflowMutableState : record }
@@ -130,10 +133,21 @@ func (t *task) Run() executor.TaskStatus {
130
133
)
131
134
err = t .handleFailures (mutableState , results )
132
135
if err != nil {
133
- t .logger .Error ("unable to process failure result" , tag .ShardID (t .shardID ), tag .Error (err ))
134
- return executor .TaskStatusDefer
136
+ // continue validation process and retry after all workflow records has been iterated.
137
+ executionInfo := mutableState .GetExecutionInfo ()
138
+ t .metricsHandler .Counter (metrics .ScavengerValidationSkipsCount .GetMetricName ()).Record (1 )
139
+ t .logger .Error ("unable to process failure result" ,
140
+ tag .ShardID (t .shardID ),
141
+ tag .Error (err ),
142
+ tag .WorkflowNamespaceID (executionInfo .GetNamespaceId ()),
143
+ tag .WorkflowID (executionInfo .GetWorkflowId ()),
144
+ tag .WorkflowRunID (mutableState .GetExecutionState ().GetRunId ()))
145
+ retryTask = true
135
146
}
136
147
}
148
+ if retryTask {
149
+ return executor .TaskStatusDefer
150
+ }
137
151
return executor .TaskStatusDone
138
152
}
139
153
@@ -167,6 +181,11 @@ func (t *task) validate(
167
181
results = append (results , validationResults ... )
168
182
}
169
183
184
+ // Fail fast if the mutable is corrupted, no need to validate history.
185
+ if len (results ) > 0 {
186
+ return results
187
+ }
188
+
170
189
if validationResults , err := NewHistoryEventIDValidator (
171
190
t .shardID ,
172
191
t .executionManager ,
@@ -254,15 +273,14 @@ func printValidationResult(
254
273
metricsHandler metrics.MetricsHandler ,
255
274
logger log.Logger ,
256
275
) {
257
- handler := metricsHandler .WithTags (metrics .OperationTag (metrics .ExecutionsScavengerScope ), metrics .FailureTag ("" ))
258
- handler .Counter (metrics .ScavengerValidationRequestsCount .GetMetricName ()).Record (1 )
276
+ metricsHandler .Counter (metrics .ScavengerValidationRequestsCount .GetMetricName ()).Record (1 )
259
277
if len (results ) == 0 {
260
278
return
261
279
}
262
280
263
- handler .Counter (metrics .ScavengerValidationFailuresCount .GetMetricName ()).Record (1 )
281
+ metricsHandler .Counter (metrics .ScavengerValidationFailuresCount .GetMetricName ()).Record (1 )
264
282
for _ , result := range results {
265
- handler .Counter (metrics .ScavengerValidationFailuresCount .GetMetricName ()).Record (1 , metrics .FailureTag (result .failureType ))
283
+ metricsHandler .Counter (metrics .ScavengerValidationFailuresCount .GetMetricName ()).Record (1 , metrics .FailureTag (result .failureType ))
266
284
logger .Info (
267
285
"validation failed for execution." ,
268
286
tag .WorkflowNamespaceID (mutableState .GetExecutionInfo ().GetNamespaceId ()),
0 commit comments