@@ -43,6 +43,7 @@ import (
43
43
"go.temporal.io/server/common/collection"
44
44
"go.temporal.io/server/common/convert"
45
45
"go.temporal.io/server/common/definition"
46
+ "go.temporal.io/server/common/dynamicconfig"
46
47
"go.temporal.io/server/common/log"
47
48
"go.temporal.io/server/common/log/tag"
48
49
"go.temporal.io/server/common/metrics"
74
75
metricsHandler metrics.MetricsHandler
75
76
logger log.Logger
76
77
retryPolicy backoff.RetryPolicy
77
- pageSize int
78
+ namespaceRegistry namespace.Registry
79
+ pageSize dynamicconfig.IntPropertyFn
80
+ maxSkipTaskCount dynamicconfig.IntPropertyFn
78
81
79
82
sync.Mutex
80
83
// largest replication task ID generated
86
89
87
90
var (
// errUnknownReplicationTask is an internal service error carrying the message "unknown replication task".
88
91
errUnknownReplicationTask = serviceerror .NewInternal ("unknown replication task" )
89
- emptyReplicationTasks = []* replicationspb.ReplicationTask {}
90
92
)
91
93
92
94
func NewAckManager (
@@ -112,7 +114,9 @@ func NewAckManager(
112
114
metricsHandler : shard .GetMetricsHandler ().WithTags (metrics .OperationTag (metrics .ReplicatorQueueProcessorScope )),
113
115
logger : log .With (logger , tag .ComponentReplicatorQueue ),
114
116
retryPolicy : retryPolicy ,
115
- pageSize : config .ReplicatorProcessorFetchTasksBatchSize (),
117
+ namespaceRegistry : shard .GetNamespaceRegistry (),
118
+ pageSize : config .ReplicatorProcessorFetchTasksBatchSize ,
119
+ maxSkipTaskCount : config .ReplicatorProcessorMaxSkipTaskCount ,
116
120
117
121
maxTaskID : nil ,
118
122
sanityCheckTime : time.Time {},
@@ -224,9 +228,9 @@ func (p *ackMgrImpl) GetTasks(
224
228
minTaskID , maxTaskID := p .taskIDsRange (queryMessageID )
225
229
replicationTasks , lastTaskID , err := p .getTasks (
226
230
ctx ,
231
+ pollingCluster ,
227
232
minTaskID ,
228
233
maxTaskID ,
229
- p .pageSize ,
230
234
)
231
235
if err != nil {
232
236
return nil , err
@@ -255,49 +259,65 @@ func (p *ackMgrImpl) GetTasks(
255
259
256
260
func (p * ackMgrImpl ) getTasks (
257
261
ctx context.Context ,
262
+ pollingCluster string ,
258
263
minTaskID int64 ,
259
264
maxTaskID int64 ,
260
- batchSize int ,
261
265
) ([]* replicationspb.ReplicationTask , int64 , error ) {
262
266
if minTaskID > maxTaskID {
263
- return nil , 0 , serviceerror .NewUnavailable ("min task ID < max task ID, probably due to shard re-balancing" )
267
+ return nil , 0 , serviceerror .NewUnavailable ("min task ID > max task ID, probably due to shard re-balancing" )
264
268
} else if minTaskID == maxTaskID {
265
- return [] * replicationspb. ReplicationTask {} , maxTaskID , nil
269
+ return nil , maxTaskID , nil
266
270
}
267
271
268
- replicationTasks := make ([]* replicationspb.ReplicationTask , 0 , batchSize )
269
- iter := collection .NewPagingIterator (p .getPaginationFn (ctx , minTaskID , maxTaskID , batchSize ))
270
- for iter .HasNext () && len (replicationTasks ) < batchSize {
272
+ replicationTasks := make ([]* replicationspb.ReplicationTask , 0 , p .pageSize ())
273
+ skippedTaskCount := 0
274
+ lastTaskID := maxTaskID // If no tasks are returned, then it means there are no tasks bellow maxTaskID.
275
+ iter := collection .NewPagingIterator (p .getReplicationTasksFn (ctx , minTaskID , maxTaskID , p .pageSize ()))
276
+ // iter.HasNext() should be the last check to avoid extra page read in case if replicationTasks is already full.
277
+ for len (replicationTasks ) < p .pageSize () && skippedTaskCount <= p .maxSkipTaskCount () && iter .HasNext () {
271
278
task , err := iter .Next ()
272
279
if err != nil {
273
- p .logger .Error ("replication task reader encounter error, return earlier" , tag .Error (err ))
274
- if len (replicationTasks ) == 0 {
275
- return nil , 0 , err
276
- } else {
277
- return replicationTasks , replicationTasks [len (replicationTasks )- 1 ].GetSourceTaskId (), nil
278
- }
280
+ return p .swallowPartialResultsError (replicationTasks , lastTaskID , err )
279
281
}
280
282
281
- if replicationTask , err := p .toReplicationTask (ctx , task ); err != nil {
282
- p .logger .Error ("replication task reader encounter error, return earlier" , tag .Error (err ))
283
- if len (replicationTasks ) == 0 {
284
- return nil , 0 , err
285
- } else {
286
- return replicationTasks , replicationTasks [len (replicationTasks )- 1 ].GetSourceTaskId (), nil
283
+ // If, for any reason, task is skipped:
284
+ // - lastTaskID needs to be updated because this task should not be read next time,
285
+ // - skippedTaskCount needs to be incremented to prevent timeout on caller side (too many tasks are skipped).
286
+ // If error has occurred though, lastTaskID shouldn't be updated, and next time task needs to be read again.
287
+
288
+ ns , err := p .namespaceRegistry .GetNamespaceByID (namespace .ID (task .GetNamespaceID ()))
289
+ if err != nil {
290
+ if _ , isNotFound := err .(* serviceerror.NamespaceNotFound ); ! isNotFound {
291
+ return p .swallowPartialResultsError (replicationTasks , lastTaskID , err )
287
292
}
288
- } else if replicationTask != nil {
289
- replicationTasks = append (replicationTasks , replicationTask )
293
+ // Namespace doesn't exist on this cluster (i.e. deleted). It is safe to skip the task.
294
+ lastTaskID = task .GetTaskID ()
295
+ skippedTaskCount ++
296
+ continue
297
+ }
298
+ // If namespace doesn't exist on polling cluster, there is no reason to send the task.
299
+ if ! ns .IsOnCluster (pollingCluster ) {
300
+ lastTaskID = task .GetTaskID ()
301
+ skippedTaskCount ++
302
+ continue
290
303
}
291
- }
292
304
293
- if len (replicationTasks ) == 0 {
294
- return emptyReplicationTasks , maxTaskID , nil
295
- } else {
296
- return replicationTasks , replicationTasks [len (replicationTasks )- 1 ].GetSourceTaskId (), nil
305
+ replicationTask , err := p .toReplicationTask (ctx , task )
306
+ if err != nil {
307
+ return p .swallowPartialResultsError (replicationTasks , lastTaskID , err )
308
+ } else if replicationTask == nil {
309
+ lastTaskID = task .GetTaskID ()
310
+ skippedTaskCount ++
311
+ continue
312
+ }
313
+ lastTaskID = task .GetTaskID ()
314
+ replicationTasks = append (replicationTasks , replicationTask )
297
315
}
316
+
317
+ return replicationTasks , lastTaskID , nil
298
318
}
299
319
300
- func (p * ackMgrImpl ) getPaginationFn (
320
+ func (p * ackMgrImpl ) getReplicationTasksFn (
301
321
ctx context.Context ,
302
322
minTaskID int64 ,
303
323
maxTaskID int64 ,
@@ -319,6 +339,19 @@ func (p *ackMgrImpl) getPaginationFn(
319
339
}
320
340
}
321
341
342
+ func (p * ackMgrImpl ) swallowPartialResultsError (
343
+ replicationTasks []* replicationspb.ReplicationTask ,
344
+ lastTaskID int64 ,
345
+ err error ,
346
+ ) ([]* replicationspb.ReplicationTask , int64 , error ) {
347
+
348
+ p .logger .Error ("Replication tasks reader encountered error, return earlier." , tag .Error (err ), tag .Value (len (replicationTasks )))
349
+ if len (replicationTasks ) == 0 {
350
+ return nil , 0 , err
351
+ }
352
+ return replicationTasks , lastTaskID , nil
353
+ }
354
+
322
355
func (p * ackMgrImpl ) taskIDsRange (
323
356
lastReadMessageID int64 ,
324
357
) (minTaskID int64 , maxTaskID int64 ) {
0 commit comments