Skip to content

Commit 9a4f19e

Browse files
authored
Mark state transition & completion metrics with namespace activeness (#3831)
1 parent 6e732e7 commit 9a4f19e

13 files changed

+264
-93
lines changed

common/metrics/tags.go

+21-9
Original file line numberDiff line numberDiff line change
@@ -41,15 +41,16 @@ const (
4141
buildPlatformTag = "build_platform"
4242
goVersionTag = "go_version"
4343

44-
instance = "instance"
45-
namespace = "namespace"
46-
targetCluster = "target_cluster"
47-
taskQueue = "taskqueue"
48-
workflowType = "workflowType"
49-
activityType = "activityType"
50-
commandType = "commandType"
51-
serviceName = "service_name"
52-
actionType = "action_type"
44+
instance = "instance"
45+
namespace = "namespace"
46+
namespaceState = "namespace_state"
47+
targetCluster = "target_cluster"
48+
taskQueue = "taskqueue"
49+
workflowType = "workflowType"
50+
activityType = "activityType"
51+
commandType = "commandType"
52+
serviceName = "service_name"
53+
actionType = "action_type"
5354

5455
namespaceAllValue = "all"
5556
unknownValue = "_unknown_"
@@ -103,6 +104,17 @@ func NamespaceUnknownTag() Tag {
103104
return namespaceUnknownTag
104105
}
105106

107+
// NamespaceStateTag returns a new namespace state tag.
108+
func NamespaceStateTag(value string) Tag {
109+
if len(value) == 0 {
110+
value = unknownValue
111+
}
112+
return &tagImpl{
113+
key: namespaceState,
114+
value: value,
115+
}
116+
}
117+
106118
var taskQueueUnknownTag = &tagImpl{key: taskQueue, value: unknownValue}
107119

108120
// TaskQueueUnknownTag returns a new taskqueue:unknown tag-value

service/history/historyEngine2_test.go

+2
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,8 @@ func (s *engine2Suite) SetupTest() {
153153
s.mockNamespaceCache.EXPECT().GetNamespaceByID(tests.ParentNamespaceID).Return(tests.GlobalParentNamespaceEntry, nil).AnyTimes()
154154
s.mockNamespaceCache.EXPECT().GetNamespace(tests.ChildNamespace).Return(tests.GlobalChildNamespaceEntry, nil).AnyTimes()
155155
s.mockEventsCache.EXPECT().PutEvent(gomock.Any(), gomock.Any()).AnyTimes()
156+
s.mockClusterMetadata.EXPECT().GetClusterID().Return(tests.Version).AnyTimes()
157+
s.mockClusterMetadata.EXPECT().IsVersionFromSameCluster(tests.Version, tests.Version).Return(true).AnyTimes()
156158
s.mockClusterMetadata.EXPECT().IsGlobalNamespaceEnabled().Return(false).AnyTimes()
157159
s.mockClusterMetadata.EXPECT().GetCurrentClusterName().Return(cluster.TestCurrentClusterName).AnyTimes()
158160
s.mockClusterMetadata.EXPECT().ClusterNameForFailoverVersion(false, common.EmptyVersion).Return(cluster.TestCurrentClusterName).AnyTimes()

service/history/ndc/workflow_resetter.go

+3
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ import (
3535
enumspb "go.temporal.io/api/enums/v1"
3636
historypb "go.temporal.io/api/history/v1"
3737
"go.temporal.io/api/serviceerror"
38+
3839
persistencespb "go.temporal.io/server/api/persistence/v1"
3940
"go.temporal.io/server/common"
4041
"go.temporal.io/server/common/cluster"
@@ -364,8 +365,10 @@ func (r *workflowResetterImpl) persistToDB(
364365
if currentWorkflowSizeDiff, resetWorkflowSizeDiff, err := r.transaction.UpdateWorkflowExecution(
365366
ctx,
366367
persistence.UpdateWorkflowModeUpdateCurrent,
368+
currentWorkflow.GetMutableState().GetCurrentVersion(),
367369
currentWorkflowMutation,
368370
currentWorkflowEventsSeq,
371+
workflow.MutableStateFailoverVersion(resetWorkflow.GetMutableState()),
369372
resetWorkflowSnapshot,
370373
resetWorkflowEventsSeq,
371374
); err != nil {

service/history/ndc/workflow_resetter_test.go

+23-2
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ import (
3737
enumspb "go.temporal.io/api/enums/v1"
3838
historypb "go.temporal.io/api/history/v1"
3939
taskqueuepb "go.temporal.io/api/taskqueue/v1"
40+
41+
historyspb "go.temporal.io/server/api/history/v1"
4042
persistencespb "go.temporal.io/server/api/persistence/v1"
4143
"go.temporal.io/server/common"
4244
"go.temporal.io/server/common/collection"
@@ -47,6 +49,7 @@ import (
4749
"go.temporal.io/server/common/namespace"
4850
"go.temporal.io/server/common/payloads"
4951
"go.temporal.io/server/common/persistence"
52+
"go.temporal.io/server/common/persistence/versionhistory"
5053
"go.temporal.io/server/common/primitives/timestamp"
5154
"go.temporal.io/server/service/history/consts"
5255
"go.temporal.io/server/service/history/shard"
@@ -139,10 +142,18 @@ func (s *workflowResetterSuite) TestPersistToDB_CurrentTerminated() {
139142
currentWorkflow.EXPECT().GetMutableState().Return(currentMutableState).AnyTimes()
140143
currentWorkflow.EXPECT().GetReleaseFn().Return(currentReleaseFn).AnyTimes()
141144

145+
currentMutableState.EXPECT().GetCurrentVersion().Return(int64(0)).AnyTimes()
142146
currentEventsSize := int64(2333)
143147
currentNewEventsSize := int64(3444)
144148
currentMutation := &persistence.WorkflowMutation{
145-
ExecutionInfo: &persistencespb.WorkflowExecutionInfo{},
149+
ExecutionInfo: &persistencespb.WorkflowExecutionInfo{
150+
VersionHistories: versionhistory.NewVersionHistories(&historyspb.VersionHistory{
151+
BranchToken: []byte{1, 2, 3},
152+
Items: []*historyspb.VersionHistoryItem{
153+
{EventId: 234, Version: 0},
154+
},
155+
}),
156+
},
146157
}
147158
currentEventsSeq := []*persistence.WorkflowEvents{{
148159
NamespaceID: s.namespaceID.String(),
@@ -165,10 +176,18 @@ func (s *workflowResetterSuite) TestPersistToDB_CurrentTerminated() {
165176
resetWorkflow.EXPECT().GetMutableState().Return(resetMutableState).AnyTimes()
166177
resetWorkflow.EXPECT().GetReleaseFn().Return(tarGetReleaseFn).AnyTimes()
167178

179+
resetMutableState.EXPECT().GetCurrentVersion().Return(int64(0)).AnyTimes()
168180
resetEventsSize := int64(1444)
169181
resetNewEventsSize := int64(4321)
170182
resetSnapshot := &persistence.WorkflowSnapshot{
171-
ExecutionInfo: &persistencespb.WorkflowExecutionInfo{},
183+
ExecutionInfo: &persistencespb.WorkflowExecutionInfo{
184+
VersionHistories: versionhistory.NewVersionHistories(&historyspb.VersionHistory{
185+
BranchToken: []byte{1, 2, 3},
186+
Items: []*historyspb.VersionHistoryItem{
187+
{EventId: 123, Version: 0},
188+
},
189+
}),
190+
},
172191
}
173192
resetEventsSeq := []*persistence.WorkflowEvents{{
174193
NamespaceID: s.namespaceID.String(),
@@ -189,8 +208,10 @@ func (s *workflowResetterSuite) TestPersistToDB_CurrentTerminated() {
189208
s.mockTransaction.EXPECT().UpdateWorkflowExecution(
190209
gomock.Any(),
191210
persistence.UpdateWorkflowModeUpdateCurrent,
211+
int64(0),
192212
currentMutation,
193213
currentEventsSeq,
214+
convert.Int64Ptr(0),
194215
resetSnapshot,
195216
resetEventsSeq,
196217
).Return(currentNewEventsSize, resetNewEventsSize, nil)

service/history/timerQueueActiveTaskExecutor_test.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ import (
5454
"go.temporal.io/server/common/persistence"
5555
"go.temporal.io/server/common/persistence/versionhistory"
5656
"go.temporal.io/server/common/primitives/timestamp"
57-
deletemanager "go.temporal.io/server/service/history/deletemanager"
57+
"go.temporal.io/server/service/history/deletemanager"
5858
"go.temporal.io/server/service/history/events"
5959
"go.temporal.io/server/service/history/queues"
6060
"go.temporal.io/server/service/history/shard"
@@ -154,7 +154,8 @@ func (s *timerQueueActiveTaskExecutorSuite) SetupTest() {
154154
// ack manager will use the namespace information
155155
s.mockNamespaceCache.EXPECT().GetNamespaceByID(gomock.Any()).Return(tests.GlobalNamespaceEntry, nil).AnyTimes()
156156
s.mockNamespaceCache.EXPECT().GetNamespaceName(gomock.Any()).Return(tests.Namespace, nil).AnyTimes()
157-
s.mockClusterMetadata.EXPECT().GetClusterID().Return(cluster.TestCurrentClusterInitialFailoverVersion).AnyTimes()
157+
s.mockClusterMetadata.EXPECT().GetClusterID().Return(tests.Version).AnyTimes()
158+
s.mockClusterMetadata.EXPECT().IsVersionFromSameCluster(tests.Version, tests.Version).Return(true).AnyTimes()
158159
s.mockClusterMetadata.EXPECT().GetCurrentClusterName().Return(cluster.TestCurrentClusterName).AnyTimes()
159160
s.mockClusterMetadata.EXPECT().GetAllClusterInfo().Return(cluster.TestAllClusterInfo).AnyTimes()
160161
s.mockClusterMetadata.EXPECT().IsGlobalNamespaceEnabled().Return(true).AnyTimes()

service/history/timerQueueStandbyTaskExecutor_test.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ import (
5858
"go.temporal.io/server/common/primitives/timestamp"
5959
"go.temporal.io/server/common/xdc"
6060
"go.temporal.io/server/service/history/consts"
61-
deletemanager "go.temporal.io/server/service/history/deletemanager"
61+
"go.temporal.io/server/service/history/deletemanager"
6262
"go.temporal.io/server/service/history/events"
6363
"go.temporal.io/server/service/history/queues"
6464
"go.temporal.io/server/service/history/shard"
@@ -159,7 +159,8 @@ func (s *timerQueueStandbyTaskExecutorSuite) SetupTest() {
159159
s.mockMatchingClient = s.mockShard.Resource.MatchingClient
160160
s.mockNamespaceCache.EXPECT().GetNamespaceByID(gomock.Any()).Return(s.namespaceEntry, nil).AnyTimes()
161161
s.mockNamespaceCache.EXPECT().GetNamespaceName(gomock.Any()).Return(s.namespaceEntry.Name(), nil).AnyTimes()
162-
s.mockClusterMetadata.EXPECT().GetClusterID().Return(cluster.TestCurrentClusterInitialFailoverVersion).AnyTimes()
162+
s.mockClusterMetadata.EXPECT().GetClusterID().Return(tests.Version).AnyTimes()
163+
s.mockClusterMetadata.EXPECT().IsVersionFromSameCluster(tests.Version, tests.Version).Return(true).AnyTimes()
163164
s.mockClusterMetadata.EXPECT().GetCurrentClusterName().Return(cluster.TestCurrentClusterName).AnyTimes()
164165
s.mockClusterMetadata.EXPECT().GetAllClusterInfo().Return(cluster.TestAllClusterInfo).AnyTimes()
165166
s.mockClusterMetadata.EXPECT().IsGlobalNamespaceEnabled().Return(true).AnyTimes()

service/history/transferQueueActiveTaskExecutor_test.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ import (
7171
"go.temporal.io/server/common/searchattribute"
7272
"go.temporal.io/server/service/history/configs"
7373
"go.temporal.io/server/service/history/consts"
74-
deletemanager "go.temporal.io/server/service/history/deletemanager"
74+
"go.temporal.io/server/service/history/deletemanager"
7575
"go.temporal.io/server/service/history/events"
7676
"go.temporal.io/server/service/history/queues"
7777
"go.temporal.io/server/service/history/shard"
@@ -199,7 +199,8 @@ func (s *transferQueueActiveTaskExecutorSuite) SetupTest() {
199199
s.mockNamespaceCache.EXPECT().GetNamespaceByID(tests.ChildNamespaceID).Return(tests.GlobalChildNamespaceEntry, nil).AnyTimes()
200200
s.mockNamespaceCache.EXPECT().GetNamespace(tests.ChildNamespace).Return(tests.GlobalChildNamespaceEntry, nil).AnyTimes()
201201
s.mockNamespaceCache.EXPECT().GetNamespaceByID(tests.MissedNamespaceID).Return(nil, serviceerror.NewNamespaceNotFound(tests.MissedNamespaceID.String())).AnyTimes()
202-
s.mockClusterMetadata.EXPECT().GetClusterID().Return(cluster.TestCurrentClusterInitialFailoverVersion).AnyTimes()
202+
s.mockClusterMetadata.EXPECT().GetClusterID().Return(tests.Version).AnyTimes()
203+
s.mockClusterMetadata.EXPECT().IsVersionFromSameCluster(tests.Version, tests.Version).Return(true).AnyTimes()
203204
s.mockClusterMetadata.EXPECT().GetCurrentClusterName().Return(cluster.TestCurrentClusterName).AnyTimes()
204205
s.mockClusterMetadata.EXPECT().GetAllClusterInfo().Return(cluster.TestAllClusterInfo).AnyTimes()
205206
s.mockClusterMetadata.EXPECT().IsGlobalNamespaceEnabled().Return(true).AnyTimes()

service/history/workflow/context.go

+79-23
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ import (
4040
persistencespb "go.temporal.io/server/api/persistence/v1"
4141
"go.temporal.io/server/common"
4242
"go.temporal.io/server/common/clock"
43+
"go.temporal.io/server/common/cluster"
44+
"go.temporal.io/server/common/convert"
4345
"go.temporal.io/server/common/definition"
4446
"go.temporal.io/server/common/locks"
4547
"go.temporal.io/server/common/log"
@@ -147,13 +149,14 @@ type (
147149

148150
type (
149151
ContextImpl struct {
150-
shard shard.Context
151-
workflowKey definition.WorkflowKey
152-
logger log.Logger
153-
metricsHandler metrics.Handler
154-
timeSource clock.TimeSource
155-
config *configs.Config
156-
transaction Transaction
152+
shard shard.Context
153+
workflowKey definition.WorkflowKey
154+
logger log.Logger
155+
metricsHandler metrics.Handler
156+
clusterMetadata cluster.Metadata
157+
timeSource clock.TimeSource
158+
config *configs.Config
159+
transaction Transaction
157160

158161
mutex locks.PriorityMutex
159162
MutableState MutableState
@@ -169,14 +172,15 @@ func NewContext(
169172
logger log.Logger,
170173
) *ContextImpl {
171174
return &ContextImpl{
172-
shard: shard,
173-
workflowKey: workflowKey,
174-
logger: logger,
175-
metricsHandler: shard.GetMetricsHandler().WithTags(metrics.OperationTag(metrics.WorkflowContextScope)),
176-
timeSource: shard.GetTimeSource(),
177-
config: shard.GetConfig(),
178-
mutex: locks.NewPriorityMutex(),
179-
transaction: NewTransaction(shard),
175+
shard: shard,
176+
workflowKey: workflowKey,
177+
logger: logger,
178+
metricsHandler: shard.GetMetricsHandler().WithTags(metrics.OperationTag(metrics.WorkflowContextScope)),
179+
clusterMetadata: shard.GetClusterMetadata(),
180+
timeSource: shard.GetTimeSource(),
181+
config: shard.GetConfig(),
182+
mutex: locks.NewPriorityMutex(),
183+
transaction: NewTransaction(shard),
180184
stats: &persistencespb.ExecutionStats{
181185
HistorySize: 0,
182186
},
@@ -349,6 +353,7 @@ func (c *ContextImpl) CreateWorkflowExecution(
349353
resp, err := createWorkflowExecution(
350354
ctx,
351355
c.shard,
356+
newMutableState.GetCurrentVersion(),
352357
createRequest,
353358
)
354359
if err != nil {
@@ -361,7 +366,7 @@ func (c *ContextImpl) CreateWorkflowExecution(
361366
return err
362367
}
363368
NotifyWorkflowSnapshotTasks(engine, newWorkflow)
364-
emitStateTransitionCount(c.metricsHandler, newMutableState)
369+
emitStateTransitionCount(c.metricsHandler, c.clusterMetadata, newMutableState)
365370

366371
return nil
367372
}
@@ -451,10 +456,13 @@ func (c *ContextImpl) ConflictResolveWorkflowExecution(
451456
if resetWorkflowSizeDiff, newWorkflowSizeDiff, currentWorkflowSizeDiff, err := c.transaction.ConflictResolveWorkflowExecution(
452457
ctx,
453458
conflictResolveMode,
459+
resetMutableState.GetCurrentVersion(),
454460
resetWorkflow,
455461
resetWorkflowEventsSeq,
462+
MutableStateFailoverVersion(newMutableState),
456463
newWorkflow,
457464
newWorkflowEventsSeq,
465+
MutableStateFailoverVersion(currentMutableState),
458466
currentWorkflow,
459467
currentWorkflowEventsSeq,
460468
); err != nil {
@@ -469,9 +477,9 @@ func (c *ContextImpl) ConflictResolveWorkflowExecution(
469477
}
470478
}
471479

472-
emitStateTransitionCount(c.metricsHandler, resetMutableState)
473-
emitStateTransitionCount(c.metricsHandler, newMutableState)
474-
emitStateTransitionCount(c.metricsHandler, currentMutableState)
480+
emitStateTransitionCount(c.metricsHandler, c.clusterMetadata, resetMutableState)
481+
emitStateTransitionCount(c.metricsHandler, c.clusterMetadata, newMutableState)
482+
emitStateTransitionCount(c.metricsHandler, c.clusterMetadata, currentMutableState)
475483

476484
return nil
477485
}
@@ -628,8 +636,10 @@ func (c *ContextImpl) UpdateWorkflowExecutionWithNew(
628636
if currentWorkflowSizeDiff, newWorkflowSizeDiff, err := c.transaction.UpdateWorkflowExecution(
629637
ctx,
630638
updateMode,
639+
c.MutableState.GetCurrentVersion(),
631640
currentWorkflow,
632641
currentWorkflowEventsSeq,
642+
MutableStateFailoverVersion(newMutableState),
633643
newWorkflow,
634644
newWorkflowEventsSeq,
635645
); err != nil {
@@ -641,8 +651,8 @@ func (c *ContextImpl) UpdateWorkflowExecutionWithNew(
641651
}
642652
}
643653

644-
emitStateTransitionCount(c.metricsHandler, c.MutableState)
645-
emitStateTransitionCount(c.metricsHandler, newMutableState)
654+
emitStateTransitionCount(c.metricsHandler, c.clusterMetadata, c.MutableState)
655+
emitStateTransitionCount(c.metricsHandler, c.clusterMetadata, newMutableState)
646656

647657
// finally emit session stats
648658
namespace := c.GetNamespace()
@@ -922,12 +932,58 @@ func (c *ContextImpl) enforceSizeCheck(
922932

923933
func emitStateTransitionCount(
924934
metricsHandler metrics.Handler,
935+
clusterMetadata cluster.Metadata,
925936
mutableState MutableState,
926937
) {
927938
if mutableState == nil {
928939
return
929940
}
930941

931-
metricsHandler.Histogram(metrics.StateTransitionCount.GetMetricName(), metrics.StateTransitionCount.GetMetricUnit()).
932-
Record(mutableState.GetExecutionInfo().StateTransitionCount, metrics.NamespaceTag(mutableState.GetNamespaceEntry().Name().String()))
942+
namespaceEntry := mutableState.GetNamespaceEntry()
943+
metricsHandler.Histogram(
944+
metrics.StateTransitionCount.GetMetricName(),
945+
metrics.StateTransitionCount.GetMetricUnit(),
946+
).Record(
947+
mutableState.GetExecutionInfo().StateTransitionCount,
948+
metrics.NamespaceTag(namespaceEntry.Name().String()),
949+
metrics.NamespaceStateTag(namespaceState(clusterMetadata, convert.Int64Ptr(mutableState.GetCurrentVersion()))),
950+
)
951+
}
952+
953+
const (
954+
namespaceStateActive = "active"
955+
namespaceStatePassive = "passive"
956+
namespaceStateUnknown = "_unknown_"
957+
)
958+
959+
func namespaceState(
960+
clusterMetadata cluster.Metadata,
961+
mutableStateCurrentVersion *int64,
962+
) string {
963+
964+
if mutableStateCurrentVersion == nil {
965+
return namespaceStateUnknown
966+
}
967+
968+
// default value, need to special handle
969+
if *mutableStateCurrentVersion == 0 {
970+
return namespaceStateActive
971+
}
972+
973+
if clusterMetadata.IsVersionFromSameCluster(
974+
clusterMetadata.GetClusterID(),
975+
*mutableStateCurrentVersion,
976+
) {
977+
return namespaceStateActive
978+
}
979+
return namespaceStatePassive
980+
}
981+
982+
func MutableStateFailoverVersion(
983+
mutableState MutableState,
984+
) *int64 {
985+
if mutableState == nil {
986+
return nil
987+
}
988+
return convert.Int64Ptr(mutableState.GetCurrentVersion())
933989
}

service/history/workflow/metrics.go

+2
Original file line numberDiff line numberDiff line change
@@ -83,12 +83,14 @@ func emitMutableStateStatus(
8383
func emitWorkflowCompletionStats(
8484
metricsHandler metrics.Handler,
8585
namespace namespace.Name,
86+
namespaceState string,
8687
taskQueue string,
8788
status enumspb.WorkflowExecutionStatus,
8889
) {
8990
handler := metricsHandler.WithTags(
9091
metrics.OperationTag(metrics.WorkflowCompletionStatsScope),
9192
metrics.NamespaceTag(namespace.String()),
93+
metrics.NamespaceStateTag(namespaceState),
9294
metrics.TaskQueueTag(taskQueue),
9395
)
9496

0 commit comments

Comments
 (0)