@@ -172,10 +172,10 @@ var (
172
172
)
173
173
174
174
const (
175
- logWarnImmediateTaskLevelDiff = 3000000 // 3 million
176
- logWarnScheduledTaskLevelDiff = time .Duration (30 * time .Minute )
177
- historySizeLogThreshold = 10 * 1024 * 1024
178
- minContextTimeout = 2 * time .Second
175
+ logWarnImmediateTaskLag = 3000000 // 3 million
176
+ logWarnScheduledTaskLag = time .Duration (30 * time .Minute )
177
+ historySizeLogThreshold = 10 * 1024 * 1024
178
+ minContextTimeout = 2 * time .Second
179
179
)
180
180
181
181
func (s * ContextImpl ) String () string {
@@ -1276,58 +1276,40 @@ func (s *ContextImpl) updateShardInfoLocked() error {
1276
1276
return nil
1277
1277
}
1278
1278
1279
- // TODO: Instead of having separate metric definition for each task category, we should
1280
- // use one metrics (or two, one for immedidate task, one for scheduled task),
1281
- // and add tags indicating the task category.
1282
1279
func (s * ContextImpl ) emitShardInfoMetricsLogsLocked () {
1283
- currentCluster := s .GetClusterMetadata ().GetCurrentClusterName ()
1284
- clusterInfo := s .GetClusterMetadata ().GetAllClusterInfo ()
1285
-
1286
- minTransferLevel := s .getQueueClusterAckLevelLocked (tasks .CategoryTransfer , currentCluster ) // s.shardInfo.ClusterTransferAckLevel[currentCluster]
1287
- maxTransferLevel := minTransferLevel
1288
- minTimerLevel := s .getQueueClusterAckLevelLocked (tasks .CategoryTimer , currentCluster )
1289
- maxTimerLevel := minTimerLevel
1290
- for clusterName , info := range clusterInfo {
1291
- if ! info .Enabled {
1292
- continue
1293
- }
1280
+ handler := s .GetMetricsHandler ().WithTags (metrics .OperationTag (metrics .ShardInfoScope ))
1281
+ logWarnLagExceeded := false
1294
1282
1295
- clusterTransferLevel := s .getQueueClusterAckLevelLocked (tasks .CategoryTransfer , clusterName )
1296
- if clusterTransferLevel .CompareTo (minTransferLevel ) < 0 {
1297
- minTransferLevel = clusterTransferLevel
1298
- }
1299
- if clusterTransferLevel .CompareTo (maxTransferLevel ) > 0 {
1300
- maxTransferLevel = clusterTransferLevel
1283
+ for categoryID := range s .shardInfo .QueueAckLevels {
1284
+ category , ok := tasks .GetCategoryByID (categoryID )
1285
+ if ! ok {
1286
+ continue
1301
1287
}
1302
1288
1303
- clusterTimerLevel := s .getQueueClusterAckLevelLocked (tasks .CategoryTimer , clusterName )
1304
- if clusterTimerLevel .CompareTo (minTimerLevel ) < 0 {
1305
- minTimerLevel = clusterTimerLevel
1306
- }
1307
- if clusterTimerLevel .CompareTo (maxTimerLevel ) > 0 {
1308
- maxTimerLevel = clusterTimerLevel
1289
+ switch category .Type () {
1290
+ case tasks .CategoryTypeImmediate :
1291
+ lag := s .immediateTaskExclusiveMaxReadLevel - s .getQueueAckLevelLocked (category ).TaskID - 1
1292
+ if lag > logWarnImmediateTaskLag {
1293
+ logWarnLagExceeded = true
1294
+ }
1295
+ handler .Histogram (
1296
+ metrics .ShardInfoImmediateQueueLagHistogram .GetMetricName (),
1297
+ metrics .ShardInfoImmediateQueueLagHistogram .GetMetricUnit (),
1298
+ ).Record (lag , metrics .TaskCategoryTag (category .Name ()))
1299
+ case tasks .CategoryTypeScheduled :
1300
+ lag := s .scheduledTaskMaxReadLevel .Sub (s .getQueueAckLevelLocked (category ).FireTime )
1301
+ if lag > logWarnScheduledTaskLag {
1302
+ logWarnLagExceeded = true
1303
+ }
1304
+ handler .Timer (
1305
+ metrics .ShardInfoScheduledQueueLagTimer .GetMetricName (),
1306
+ ).Record (lag , metrics .TaskCategoryTag (category .Name ()))
1307
+ default :
1308
+ s .contextTaggedLogger .Error ("Unknown task category type" , tag .NewStringTag ("task-category" , category .Type ().String ()))
1309
1309
}
1310
1310
}
1311
1311
1312
- diffTransferLevel := maxTransferLevel .TaskID - minTransferLevel .TaskID
1313
- diffTimerLevel := maxTimerLevel .FireTime .Sub (minTimerLevel .FireTime )
1314
-
1315
- replicationLag := s .immediateTaskExclusiveMaxReadLevel - s .getQueueAckLevelLocked (tasks .CategoryReplication ).TaskID - 1
1316
- transferLag := s .immediateTaskExclusiveMaxReadLevel - s .getQueueAckLevelLocked (tasks .CategoryTransfer ).TaskID - 1
1317
- timerLag := s .timeSource .Now ().Sub (s .getQueueAckLevelLocked (tasks .CategoryTimer ).FireTime )
1318
- visibilityLag := s .immediateTaskExclusiveMaxReadLevel - s .getQueueAckLevelLocked (tasks .CategoryVisibility ).TaskID - 1
1319
-
1320
- transferFailoverInProgress := len (s .shardInfo .FailoverLevels [tasks .CategoryTransfer ])
1321
- timerFailoverInProgress := len (s .shardInfo .FailoverLevels [tasks .CategoryTimer ])
1322
-
1323
- if s .config .EmitShardDiffLog () &&
1324
- (logWarnImmediateTaskLevelDiff < diffTransferLevel ||
1325
- logWarnScheduledTaskLevelDiff < diffTimerLevel ||
1326
- logWarnImmediateTaskLevelDiff < transferLag ||
1327
- logWarnScheduledTaskLevelDiff < timerLag ||
1328
- logWarnImmediateTaskLevelDiff < visibilityLag ||
1329
- logWarnImmediateTaskLevelDiff < replicationLag ) {
1330
-
1312
+ if logWarnLagExceeded && s .config .EmitShardLagLog () {
1331
1313
ackLevelTags := make ([]tag.Tag , 0 , len (s .shardInfo .QueueAckLevels ))
1332
1314
for categoryID , ackLevel := range s .shardInfo .QueueAckLevels {
1333
1315
category , ok := tasks .GetCategoryByID (categoryID )
@@ -1336,22 +1318,20 @@ func (s *ContextImpl) emitShardInfoMetricsLogsLocked() {
1336
1318
}
1337
1319
ackLevelTags = append (ackLevelTags , tag .ShardQueueAcks (category .Name (), ackLevel ))
1338
1320
}
1339
- s .contextTaggedLogger .Warn ("Shard ack levels diff exceeds warn threshold." , ackLevelTags ... )
1321
+ s .contextTaggedLogger .Warn ("Shard ack levels lag exceeds warn threshold." , ackLevelTags ... )
1340
1322
}
1341
1323
1342
- handler := s .GetMetricsHandler ().WithTags (metrics .OperationTag (metrics .ShardInfoScope ))
1343
- handler .Histogram (metrics .ShardInfoTransferDiffHistogram .GetMetricName (), metrics .ShardInfoTransferDiffHistogram .GetMetricUnit ()).Record (diffTransferLevel )
1344
- handler .Timer (metrics .ShardInfoTimerDiffTimer .GetMetricName ()).Record (diffTimerLevel )
1324
+ // Following metrics are double-emitted for backward compatibility so that old dashboard/alert won't break
1325
+ // TODO: remove in 1.21 release
1326
+ replicationLag := s .immediateTaskExclusiveMaxReadLevel - s .getQueueAckLevelLocked (tasks .CategoryReplication ).TaskID - 1
1327
+ transferLag := s .immediateTaskExclusiveMaxReadLevel - s .getQueueAckLevelLocked (tasks .CategoryTransfer ).TaskID - 1
1328
+ visibilityLag := s .immediateTaskExclusiveMaxReadLevel - s .getQueueAckLevelLocked (tasks .CategoryVisibility ).TaskID - 1
1329
+ timerLag := s .timeSource .Now ().Sub (s .getQueueAckLevelLocked (tasks .CategoryTimer ).FireTime )
1345
1330
1346
1331
handler .Histogram (metrics .ShardInfoReplicationLagHistogram .GetMetricName (), metrics .ShardInfoReplicationLagHistogram .GetMetricUnit ()).Record (replicationLag )
1347
1332
handler .Histogram (metrics .ShardInfoTransferLagHistogram .GetMetricName (), metrics .ShardInfoTransferLagHistogram .GetMetricUnit ()).Record (transferLag )
1348
1333
handler .Histogram (metrics .ShardInfoVisibilityLagHistogram .GetMetricName (), metrics .ShardInfoVisibilityLagHistogram .GetMetricUnit ()).Record (visibilityLag )
1349
1334
handler .Timer (metrics .ShardInfoTimerLagTimer .GetMetricName ()).Record (timerLag )
1350
-
1351
- handler .Histogram (metrics .ShardInfoTransferFailoverInProgressHistogram .GetMetricName (), metrics .ShardInfoTransferFailoverInProgressHistogram .GetMetricUnit ()).
1352
- Record (int64 (transferFailoverInProgress ))
1353
- handler .Histogram (metrics .ShardInfoTimerFailoverInProgressHistogram .GetMetricName (), metrics .ShardInfoTimerFailoverInProgressHistogram .GetMetricUnit ()).
1354
- Record (int64 (timerFailoverInProgress ))
1355
1335
}
1356
1336
1357
1337
func (s * ContextImpl ) allocateTaskIDAndTimestampLocked (
0 commit comments