Skip to content

Commit a4943bb

Browse files
authored
Add metrics + max buffer size to schedule workflow (#3872)
1 parent c7d7a89 commit a4943bb

File tree

2 files changed

+58
-20
lines changed

2 files changed

+58
-20
lines changed

common/metrics/metric_defs.go

+13
Original file line numberDiff line numberDiff line change
@@ -1311,6 +1311,12 @@ const (
13111311
TaskTypeTimerStandbyTaskDeleteHistoryEvent = "TimerStandbyTaskDeleteHistoryEvent"
13121312
)
13131313

1314+
// Schedule action types
1315+
const (
1316+
ScheduleActionTypeTag = "schedule_action"
1317+
ScheduleActionStartWorkflow = "start_workflow"
1318+
)
1319+
13141320
var (
13151321
ServiceRequests = NewCounterDef("service_requests")
13161322
ServicePendingRequests = NewGaugeDef("service_pending_requests")
@@ -1714,6 +1720,13 @@ var (
17141720
NamespaceReplicationEnqueueDLQCount = NewCounterDef("namespace_replication_dlq_enqueue_requests")
17151721
ParentClosePolicyProcessorSuccess = NewCounterDef("parent_close_policy_processor_requests")
17161722
ParentClosePolicyProcessorFailures = NewCounterDef("parent_close_policy_processor_errors")
1723+
ScheduleMissedCatchupWindow = NewCounterDef("schedule_missed_catchup_window")
1724+
ScheduleRateLimited = NewCounterDef("schedule_rate_limited")
1725+
ScheduleBufferOverruns = NewCounterDef("schedule_buffer_overruns")
1726+
ScheduleActionSuccess = NewCounterDef("schedule_action_success")
1727+
ScheduleActionErrors = NewCounterDef("schedule_action_errors")
1728+
ScheduleCancelWorkflowErrors = NewCounterDef("schedule_cancel_workflow_errors")
1729+
ScheduleTerminateWorkflowErrors = NewCounterDef("schedule_terminate_workflow_errors")
17171730

17181731
// Replication
17191732
NamespaceReplicationTaskAckLevelGauge = NewGaugeDef("namespace_replication_task_ack_level")

service/worker/scheduler/workflow.go

+45-20
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,10 @@ package scheduler
2727
import (
2828
"errors"
2929
"fmt"
30+
"strings"
3031
"time"
3132

33+
"github.com/gogo/protobuf/jsonpb"
3234
"github.com/gogo/protobuf/proto"
3335
"github.com/google/uuid"
3436
"golang.org/x/exp/slices"
@@ -38,11 +40,13 @@ import (
3840
schedpb "go.temporal.io/api/schedule/v1"
3941
workflowpb "go.temporal.io/api/workflow/v1"
4042
"go.temporal.io/api/workflowservice/v1"
43+
sdkclient "go.temporal.io/sdk/client"
4144
sdklog "go.temporal.io/sdk/log"
4245
"go.temporal.io/sdk/temporal"
4346
"go.temporal.io/sdk/workflow"
4447

4548
schedspb "go.temporal.io/server/api/schedule/v1"
49+
"go.temporal.io/server/common/metrics"
4650
"go.temporal.io/server/common/payload"
4751
"go.temporal.io/server/common/primitives/timestamp"
4852
"go.temporal.io/server/common/searchattribute"
@@ -81,9 +85,10 @@ type (
8185
scheduler struct {
8286
schedspb.StartScheduleArgs
8387

84-
ctx workflow.Context
85-
a *activities
86-
logger sdklog.Logger
88+
ctx workflow.Context
89+
a *activities
90+
logger sdklog.Logger
91+
metrics sdkclient.MetricsHandler
8792

8893
cspec *CompiledSpec
8994

@@ -111,6 +116,9 @@ type (
111116
RecentActionCountForList int // The number of recent actual action results to include in List (search attr).
112117
IterationsBeforeContinueAsNew int
113118
SleepWhilePaused bool // If true, don't set timers while paused/out of actions
119+
// MaxBufferSize limits the number of buffered starts. This also limits the number of
120+
// workflows that can be backfilled at once (since they all have to fit in the buffer).
121+
MaxBufferSize int
114122
}
115123
)
116124

@@ -141,6 +149,7 @@ var (
141149
RecentActionCountForList: 5,
142150
IterationsBeforeContinueAsNew: 500,
143151
SleepWhilePaused: true,
152+
MaxBufferSize: 1000,
144153
}
145154

146155
errUpdateConflict = errors.New("conflicting concurrent update")
@@ -151,14 +160,13 @@ func SchedulerWorkflow(ctx workflow.Context, args *schedspb.StartScheduleArgs) e
151160
StartScheduleArgs: *args,
152161
ctx: ctx,
153162
a: nil,
154-
logger: sdklog.With(workflow.GetLogger(ctx), "schedule-id", args.State.ScheduleId),
163+
logger: sdklog.With(workflow.GetLogger(ctx), "wf-namespace", args.State.Namespace, "schedule-id", args.State.ScheduleId),
164+
metrics: workflow.GetMetricsHandler(ctx).WithTags(map[string]string{"namespace": args.State.Namespace}),
155165
}
156166
return scheduler.run()
157167
}
158168

159169
func (s *scheduler) run() error {
160-
s.logger.Info("Schedule starting", "schedule", s.Schedule)
161-
162170
s.updateTweakables()
163171
s.ensureFields()
164172
s.compileSpec()
@@ -171,7 +179,13 @@ func (s *scheduler) run() error {
171179
}
172180

173181
if s.State.LastProcessedTime == nil {
174-
s.logger.Debug("Initializing internal state")
182+
// log these as json since it's more readable than the Go representation
183+
var m jsonpb.Marshaler
184+
var specJson, policiesJson strings.Builder
185+
_ = m.Marshal(&specJson, s.Schedule.Spec)
186+
_ = m.Marshal(&policiesJson, s.Schedule.Policies)
187+
s.logger.Info("Starting schedule", "spec", specJson.String(), "policies", policiesJson.String())
188+
175189
s.State.LastProcessedTime = timestamp.TimePtr(s.now())
176190
s.State.ConflictToken = InitialConflictToken
177191
s.Info.CreateTime = s.State.LastProcessedTime
@@ -218,7 +232,7 @@ func (s *scheduler) run() error {
218232

219233
// Any watcher activities will get cancelled automatically if running.
220234

221-
s.logger.Info("Schedule doing continue-as-new")
235+
s.logger.Debug("Schedule doing continue-as-new")
222236
return workflow.NewContinueAsNewError(s.ctx, WorkflowType, &s.StartScheduleArgs)
223237
}
224238

@@ -287,7 +301,7 @@ func (s *scheduler) now() time.Time {
287301
}
288302

289303
func (s *scheduler) processPatch(patch *schedpb.SchedulePatch) {
290-
s.logger.Info("Schedule patch", "patch", patch.String())
304+
s.logger.Debug("Schedule patch", "patch", patch.String())
291305

292306
if trigger := patch.TriggerImmediately; trigger != nil {
293307
now := s.now()
@@ -320,7 +334,7 @@ func (s *scheduler) processTimeRange(
320334
overlapPolicy enumspb.ScheduleOverlapPolicy,
321335
manual bool,
322336
) time.Duration {
323-
s.logger.Debug("processTimeRange", "t1", t1, "t2", t2, "overlapPolicy", overlapPolicy, "manual", manual)
337+
s.logger.Debug("processTimeRange", "t1", t1, "t2", t2, "overlap-policy", overlapPolicy, "manual", manual)
324338

325339
if s.cspec == nil {
326340
return invalidDuration
@@ -343,6 +357,7 @@ func (s *scheduler) processTimeRange(
343357
}
344358
if !manual && t2.Sub(t1) > catchupWindow {
345359
s.logger.Warn("Schedule missed catchup window", "now", t2, "time", t1)
360+
s.metrics.Counter(metrics.ScheduleMissedCatchupWindow.GetMetricName()).Inc(1)
346361
s.Info.MissedCatchupWindow++
347362
continue
348363
}
@@ -410,7 +425,7 @@ func (s *scheduler) sleep(nextSleep time.Duration) {
410425
sel.AddFuture(s.watchingFuture, s.wfWatcherReturned)
411426
}
412427

413-
s.logger.Debug("sleeping", "nextSleep", nextSleep, "watching", s.watchingFuture != nil)
428+
s.logger.Debug("sleeping", "next-sleep", nextSleep, "watching", s.watchingFuture != nil)
414429
sel.Select(s.ctx)
415430
for sel.HasPending() {
416431
sel.Select(s.ctx)
@@ -456,10 +471,10 @@ func (s *scheduler) processWatcherResult(id string, f workflow.Future) {
456471
s.Schedule.State.Paused = true
457472
if res.Status == enumspb.WORKFLOW_EXECUTION_STATUS_FAILED {
458473
s.Schedule.State.Notes = fmt.Sprintf("paused due to workflow failure: %s: %s", id, res.GetFailure().GetMessage())
459-
s.logger.Info("paused due to workflow failure", "workflow", id, "message", res.GetFailure().GetMessage())
474+
s.logger.Debug("paused due to workflow failure", "workflow", id, "message", res.GetFailure().GetMessage())
460475
} else if res.Status == enumspb.WORKFLOW_EXECUTION_STATUS_TIMED_OUT {
461476
s.Schedule.State.Notes = fmt.Sprintf("paused due to workflow timeout: %s", id)
462-
s.logger.Info("paused due to workflow timeout", "workflow", id)
477+
s.logger.Debug("paused due to workflow timeout", "workflow", id)
463478
}
464479
s.incSeqNo()
465480
}
@@ -473,7 +488,7 @@ func (s *scheduler) processWatcherResult(id string, f workflow.Future) {
473488
s.State.ContinuedFailure = res.GetFailure()
474489
}
475490

476-
s.logger.Info("started workflow finished", "workflow", id, "status", res.Status, "pause-after-failure", pauseOnFailure)
491+
s.logger.Debug("started workflow finished", "workflow", id, "status", res.Status, "pause-after-failure", pauseOnFailure)
477492
}
478493

479494
func (s *scheduler) processUpdate(req *schedspb.FullUpdateRequest) {
@@ -482,7 +497,7 @@ func (s *scheduler) processUpdate(req *schedspb.FullUpdateRequest) {
482497
return
483498
}
484499

485-
s.logger.Info("Schedule update", "new-schedule", req.Schedule.String())
500+
s.logger.Debug("Schedule update", "new-schedule", req.Schedule.String())
486501

487502
s.Schedule.Spec = req.Schedule.GetSpec()
488503
s.Schedule.Action = req.Schedule.GetAction()
@@ -672,7 +687,12 @@ func (s *scheduler) resolveOverlapPolicy(overlapPolicy enumspb.ScheduleOverlapPo
672687
}
673688

674689
func (s *scheduler) addStart(nominalTime, actualTime time.Time, overlapPolicy enumspb.ScheduleOverlapPolicy, manual bool) {
675-
s.logger.Debug("addStart", "nominal", nominalTime, "actual", actualTime, "overlapPolicy", overlapPolicy, "manual", manual)
690+
s.logger.Debug("addStart", "start-time", nominalTime, "actual-start-time", actualTime, "overlap-policy", overlapPolicy, "manual", manual)
691+
if s.tweakables.MaxBufferSize > 0 && len(s.State.BufferedStarts) >= s.tweakables.MaxBufferSize {
692+
s.logger.Warn("Buffer too large", "start-time", nominalTime, "overlap-policy", overlapPolicy, "manual", manual)
693+
s.metrics.Counter(metrics.ScheduleBufferOverruns.GetMetricName()).Inc(1)
694+
return
695+
}
676696
s.State.BufferedStarts = append(s.State.BufferedStarts, &schedspb.BufferedStart{
677697
NominalTime: timestamp.TimePtr(nominalTime),
678698
ActualTime: timestamp.TimePtr(actualTime),
@@ -688,7 +708,7 @@ func (s *scheduler) addStart(nominalTime, actualTime time.Time, overlapPolicy en
688708
//
689709
//nolint:revive
690710
func (s *scheduler) processBuffer() bool {
691-
s.logger.Debug("processBuffer", "buffer", len(s.State.BufferedStarts), "running", len(s.Info.RunningWorkflows), "needRefresh", s.State.NeedRefresh)
711+
s.logger.Debug("processBuffer", "buffer", len(s.State.BufferedStarts), "running", len(s.Info.RunningWorkflows), "need-refresh", s.State.NeedRefresh)
692712

693713
// TODO: consider doing this always and removing needRefresh? we only end up here without
694714
// needRefresh in the case of update, or patch without an immediate run, so it's not much
@@ -727,14 +747,18 @@ func (s *scheduler) processBuffer() bool {
727747
continue
728748
}
729749
result, err := s.startWorkflow(start, req)
750+
metricsWithTag := s.metrics.WithTags(map[string]string{
751+
metrics.ScheduleActionTypeTag: metrics.ScheduleActionStartWorkflow})
730752
if err != nil {
731753
s.logger.Error("Failed to start workflow", "error", err)
754+
metricsWithTag.Counter(metrics.ScheduleActionErrors.GetMetricName()).Inc(1)
732755
// TODO: we could put this back in the buffer and retry (after a delay) up until
733756
// the catchup window. of course, it's unlikely that this workflow would be making
734757
// progress while we're unable to start a new one, so maybe it's not that valuable.
735758
tryAgain = true
736759
continue
737760
}
761+
metricsWithTag.Counter(metrics.ScheduleActionSuccess.GetMetricName()).Inc(1)
738762
s.recordAction(result)
739763
}
740764

@@ -826,9 +850,8 @@ func (s *scheduler) startWorkflow(
826850

827851
var appErr *temporal.ApplicationError
828852
var details rateLimitedDetails
829-
if errors.As(err, &appErr) &&
830-
appErr.Type() == rateLimitedErrorType &&
831-
appErr.Details(&details) == nil {
853+
if errors.As(err, &appErr) && appErr.Type() == rateLimitedErrorType && appErr.Details(&details) == nil {
854+
s.metrics.Counter(metrics.ScheduleRateLimited.GetMetricName()).Inc(1)
832855
workflow.Sleep(s.ctx, details.Delay)
833856
req.CompletedRateLimitSleep = true // only use rate limiter once
834857
continue
@@ -920,6 +943,7 @@ func (s *scheduler) cancelWorkflow(ex *commonpb.WorkflowExecution) {
920943
err := workflow.ExecuteLocalActivity(ctx, s.a.CancelWorkflow, areq).Get(s.ctx, nil)
921944
if err != nil {
922945
s.logger.Error("cancel workflow failed", "workflow", ex.WorkflowId, "error", err)
946+
s.metrics.Counter(metrics.ScheduleCancelWorkflowErrors.GetMetricName()).Inc(1)
923947
}
924948
// Note: the local activity has completed (or failed) here but the workflow might take time
925949
// to close since a cancel is only a request.
@@ -937,6 +961,7 @@ func (s *scheduler) terminateWorkflow(ex *commonpb.WorkflowExecution) {
937961
err := workflow.ExecuteLocalActivity(ctx, s.a.TerminateWorkflow, areq).Get(s.ctx, nil)
938962
if err != nil {
939963
s.logger.Error("terminate workflow failed", "workflow", ex.WorkflowId, "error", err)
964+
s.metrics.Counter(metrics.ScheduleTerminateWorkflowErrors.GetMetricName()).Inc(1)
940965
}
941966
// Note: the local activity has completed (or failed) here but we'll still wait until we
942967
// observe the workflow close (with a watcher) to start the next one.

0 commit comments

Comments
 (0)