@@ -27,8 +27,10 @@ package scheduler
27
27
import (
28
28
"errors"
29
29
"fmt"
30
+ "strings"
30
31
"time"
31
32
33
+ "github.com/gogo/protobuf/jsonpb"
32
34
"github.com/gogo/protobuf/proto"
33
35
"github.com/google/uuid"
34
36
"golang.org/x/exp/slices"
@@ -38,11 +40,13 @@ import (
38
40
schedpb "go.temporal.io/api/schedule/v1"
39
41
workflowpb "go.temporal.io/api/workflow/v1"
40
42
"go.temporal.io/api/workflowservice/v1"
43
+ sdkclient "go.temporal.io/sdk/client"
41
44
sdklog "go.temporal.io/sdk/log"
42
45
"go.temporal.io/sdk/temporal"
43
46
"go.temporal.io/sdk/workflow"
44
47
45
48
schedspb "go.temporal.io/server/api/schedule/v1"
49
+ "go.temporal.io/server/common/metrics"
46
50
"go.temporal.io/server/common/payload"
47
51
"go.temporal.io/server/common/primitives/timestamp"
48
52
"go.temporal.io/server/common/searchattribute"
@@ -81,9 +85,10 @@ type (
81
85
scheduler struct {
82
86
schedspb.StartScheduleArgs
83
87
84
- ctx workflow.Context
85
- a * activities
86
- logger sdklog.Logger
88
+ ctx workflow.Context
89
+ a * activities
90
+ logger sdklog.Logger
91
+ metrics sdkclient.MetricsHandler
87
92
88
93
cspec * CompiledSpec
89
94
@@ -111,6 +116,9 @@ type (
111
116
RecentActionCountForList int // The number of recent actual action results to include in List (search attr).
112
117
IterationsBeforeContinueAsNew int
113
118
SleepWhilePaused bool // If true, don't set timers while paused/out of actions
119
+ // MaxBufferSize limits the number of buffered starts. This also limits the number of
120
+ // workflows that can be backfilled at once (since they all have to fit in the buffer).
121
+ MaxBufferSize int
114
122
}
115
123
)
116
124
@@ -141,6 +149,7 @@ var (
141
149
RecentActionCountForList : 5 ,
142
150
IterationsBeforeContinueAsNew : 500 ,
143
151
SleepWhilePaused : true ,
152
+ MaxBufferSize : 1000 ,
144
153
}
145
154
146
155
errUpdateConflict = errors .New ("conflicting concurrent update" )
@@ -151,14 +160,13 @@ func SchedulerWorkflow(ctx workflow.Context, args *schedspb.StartScheduleArgs) e
151
160
StartScheduleArgs : * args ,
152
161
ctx : ctx ,
153
162
a : nil ,
154
- logger : sdklog .With (workflow .GetLogger (ctx ), "schedule-id" , args .State .ScheduleId ),
163
+ logger : sdklog .With (workflow .GetLogger (ctx ), "wf-namespace" , args .State .Namespace , "schedule-id" , args .State .ScheduleId ),
164
+ metrics : workflow .GetMetricsHandler (ctx ).WithTags (map [string ]string {"namespace" : args .State .Namespace }),
155
165
}
156
166
return scheduler .run ()
157
167
}
158
168
159
169
func (s * scheduler ) run () error {
160
- s .logger .Info ("Schedule starting" , "schedule" , s .Schedule )
161
-
162
170
s .updateTweakables ()
163
171
s .ensureFields ()
164
172
s .compileSpec ()
@@ -171,7 +179,13 @@ func (s *scheduler) run() error {
171
179
}
172
180
173
181
if s .State .LastProcessedTime == nil {
174
- s .logger .Debug ("Initializing internal state" )
182
+ // log these as json since it's more readable than the Go representation
183
+ var m jsonpb.Marshaler
184
+ var specJson , policiesJson strings.Builder
185
+ _ = m .Marshal (& specJson , s .Schedule .Spec )
186
+ _ = m .Marshal (& policiesJson , s .Schedule .Policies )
187
+ s .logger .Info ("Starting schedule" , "spec" , specJson .String (), "policies" , policiesJson .String ())
188
+
175
189
s .State .LastProcessedTime = timestamp .TimePtr (s .now ())
176
190
s .State .ConflictToken = InitialConflictToken
177
191
s .Info .CreateTime = s .State .LastProcessedTime
@@ -218,7 +232,7 @@ func (s *scheduler) run() error {
218
232
219
233
// Any watcher activities will get cancelled automatically if running.
220
234
221
- s .logger .Info ("Schedule doing continue-as-new" )
235
+ s .logger .Debug ("Schedule doing continue-as-new" )
222
236
return workflow .NewContinueAsNewError (s .ctx , WorkflowType , & s .StartScheduleArgs )
223
237
}
224
238
@@ -287,7 +301,7 @@ func (s *scheduler) now() time.Time {
287
301
}
288
302
289
303
func (s * scheduler ) processPatch (patch * schedpb.SchedulePatch ) {
290
- s .logger .Info ("Schedule patch" , "patch" , patch .String ())
304
+ s .logger .Debug ("Schedule patch" , "patch" , patch .String ())
291
305
292
306
if trigger := patch .TriggerImmediately ; trigger != nil {
293
307
now := s .now ()
@@ -320,7 +334,7 @@ func (s *scheduler) processTimeRange(
320
334
overlapPolicy enumspb.ScheduleOverlapPolicy ,
321
335
manual bool ,
322
336
) time.Duration {
323
- s .logger .Debug ("processTimeRange" , "t1" , t1 , "t2" , t2 , "overlapPolicy " , overlapPolicy , "manual" , manual )
337
+ s .logger .Debug ("processTimeRange" , "t1" , t1 , "t2" , t2 , "overlap-policy " , overlapPolicy , "manual" , manual )
324
338
325
339
if s .cspec == nil {
326
340
return invalidDuration
@@ -343,6 +357,7 @@ func (s *scheduler) processTimeRange(
343
357
}
344
358
if ! manual && t2 .Sub (t1 ) > catchupWindow {
345
359
s .logger .Warn ("Schedule missed catchup window" , "now" , t2 , "time" , t1 )
360
+ s .metrics .Counter (metrics .ScheduleMissedCatchupWindow .GetMetricName ()).Inc (1 )
346
361
s .Info .MissedCatchupWindow ++
347
362
continue
348
363
}
@@ -410,7 +425,7 @@ func (s *scheduler) sleep(nextSleep time.Duration) {
410
425
sel .AddFuture (s .watchingFuture , s .wfWatcherReturned )
411
426
}
412
427
413
- s .logger .Debug ("sleeping" , "nextSleep " , nextSleep , "watching" , s .watchingFuture != nil )
428
+ s .logger .Debug ("sleeping" , "next-sleep " , nextSleep , "watching" , s .watchingFuture != nil )
414
429
sel .Select (s .ctx )
415
430
for sel .HasPending () {
416
431
sel .Select (s .ctx )
@@ -456,10 +471,10 @@ func (s *scheduler) processWatcherResult(id string, f workflow.Future) {
456
471
s .Schedule .State .Paused = true
457
472
if res .Status == enumspb .WORKFLOW_EXECUTION_STATUS_FAILED {
458
473
s .Schedule .State .Notes = fmt .Sprintf ("paused due to workflow failure: %s: %s" , id , res .GetFailure ().GetMessage ())
459
- s .logger .Info ("paused due to workflow failure" , "workflow" , id , "message" , res .GetFailure ().GetMessage ())
474
+ s .logger .Debug ("paused due to workflow failure" , "workflow" , id , "message" , res .GetFailure ().GetMessage ())
460
475
} else if res .Status == enumspb .WORKFLOW_EXECUTION_STATUS_TIMED_OUT {
461
476
s .Schedule .State .Notes = fmt .Sprintf ("paused due to workflow timeout: %s" , id )
462
- s .logger .Info ("paused due to workflow timeout" , "workflow" , id )
477
+ s .logger .Debug ("paused due to workflow timeout" , "workflow" , id )
463
478
}
464
479
s .incSeqNo ()
465
480
}
@@ -473,7 +488,7 @@ func (s *scheduler) processWatcherResult(id string, f workflow.Future) {
473
488
s .State .ContinuedFailure = res .GetFailure ()
474
489
}
475
490
476
- s .logger .Info ("started workflow finished" , "workflow" , id , "status" , res .Status , "pause-after-failure" , pauseOnFailure )
491
+ s .logger .Debug ("started workflow finished" , "workflow" , id , "status" , res .Status , "pause-after-failure" , pauseOnFailure )
477
492
}
478
493
479
494
func (s * scheduler ) processUpdate (req * schedspb.FullUpdateRequest ) {
@@ -482,7 +497,7 @@ func (s *scheduler) processUpdate(req *schedspb.FullUpdateRequest) {
482
497
return
483
498
}
484
499
485
- s .logger .Info ("Schedule update" , "new-schedule" , req .Schedule .String ())
500
+ s .logger .Debug ("Schedule update" , "new-schedule" , req .Schedule .String ())
486
501
487
502
s .Schedule .Spec = req .Schedule .GetSpec ()
488
503
s .Schedule .Action = req .Schedule .GetAction ()
@@ -672,7 +687,12 @@ func (s *scheduler) resolveOverlapPolicy(overlapPolicy enumspb.ScheduleOverlapPo
672
687
}
673
688
674
689
func (s * scheduler ) addStart (nominalTime , actualTime time.Time , overlapPolicy enumspb.ScheduleOverlapPolicy , manual bool ) {
675
- s .logger .Debug ("addStart" , "nominal" , nominalTime , "actual" , actualTime , "overlapPolicy" , overlapPolicy , "manual" , manual )
690
+ s .logger .Debug ("addStart" , "start-time" , nominalTime , "actual-start-time" , actualTime , "overlap-policy" , overlapPolicy , "manual" , manual )
691
+ if s .tweakables .MaxBufferSize > 0 && len (s .State .BufferedStarts ) >= s .tweakables .MaxBufferSize {
692
+ s .logger .Warn ("Buffer too large" , "start-time" , nominalTime , "overlap-policy" , overlapPolicy , "manual" , manual )
693
+ s .metrics .Counter (metrics .ScheduleBufferOverruns .GetMetricName ()).Inc (1 )
694
+ return
695
+ }
676
696
s .State .BufferedStarts = append (s .State .BufferedStarts , & schedspb.BufferedStart {
677
697
NominalTime : timestamp .TimePtr (nominalTime ),
678
698
ActualTime : timestamp .TimePtr (actualTime ),
@@ -688,7 +708,7 @@ func (s *scheduler) addStart(nominalTime, actualTime time.Time, overlapPolicy en
688
708
//
689
709
//nolint:revive
690
710
func (s * scheduler ) processBuffer () bool {
691
- s .logger .Debug ("processBuffer" , "buffer" , len (s .State .BufferedStarts ), "running" , len (s .Info .RunningWorkflows ), "needRefresh " , s .State .NeedRefresh )
711
+ s .logger .Debug ("processBuffer" , "buffer" , len (s .State .BufferedStarts ), "running" , len (s .Info .RunningWorkflows ), "need-refresh " , s .State .NeedRefresh )
692
712
693
713
// TODO: consider doing this always and removing needRefresh? we only end up here without
694
714
// needRefresh in the case of update, or patch without an immediate run, so it's not much
@@ -727,14 +747,18 @@ func (s *scheduler) processBuffer() bool {
727
747
continue
728
748
}
729
749
result , err := s .startWorkflow (start , req )
750
+ metricsWithTag := s .metrics .WithTags (map [string ]string {
751
+ metrics .ScheduleActionTypeTag : metrics .ScheduleActionStartWorkflow })
730
752
if err != nil {
731
753
s .logger .Error ("Failed to start workflow" , "error" , err )
754
+ metricsWithTag .Counter (metrics .ScheduleActionErrors .GetMetricName ()).Inc (1 )
732
755
// TODO: we could put this back in the buffer and retry (after a delay) up until
733
756
// the catchup window. of course, it's unlikely that this workflow would be making
734
757
// progress while we're unable to start a new one, so maybe it's not that valuable.
735
758
tryAgain = true
736
759
continue
737
760
}
761
+ metricsWithTag .Counter (metrics .ScheduleActionSuccess .GetMetricName ()).Inc (1 )
738
762
s .recordAction (result )
739
763
}
740
764
@@ -826,9 +850,8 @@ func (s *scheduler) startWorkflow(
826
850
827
851
var appErr * temporal.ApplicationError
828
852
var details rateLimitedDetails
829
- if errors .As (err , & appErr ) &&
830
- appErr .Type () == rateLimitedErrorType &&
831
- appErr .Details (& details ) == nil {
853
+ if errors .As (err , & appErr ) && appErr .Type () == rateLimitedErrorType && appErr .Details (& details ) == nil {
854
+ s .metrics .Counter (metrics .ScheduleRateLimited .GetMetricName ()).Inc (1 )
832
855
workflow .Sleep (s .ctx , details .Delay )
833
856
req .CompletedRateLimitSleep = true // only use rate limiter once
834
857
continue
@@ -920,6 +943,7 @@ func (s *scheduler) cancelWorkflow(ex *commonpb.WorkflowExecution) {
920
943
err := workflow .ExecuteLocalActivity (ctx , s .a .CancelWorkflow , areq ).Get (s .ctx , nil )
921
944
if err != nil {
922
945
s .logger .Error ("cancel workflow failed" , "workflow" , ex .WorkflowId , "error" , err )
946
+ s .metrics .Counter (metrics .ScheduleCancelWorkflowErrors .GetMetricName ()).Inc (1 )
923
947
}
924
948
// Note: the local activity has completed (or failed) here but the workflow might take time
925
949
// to close since a cancel is only a request.
@@ -937,6 +961,7 @@ func (s *scheduler) terminateWorkflow(ex *commonpb.WorkflowExecution) {
937
961
err := workflow .ExecuteLocalActivity (ctx , s .a .TerminateWorkflow , areq ).Get (s .ctx , nil )
938
962
if err != nil {
939
963
s .logger .Error ("terminate workflow failed" , "workflow" , ex .WorkflowId , "error" , err )
964
+ s .metrics .Counter (metrics .ScheduleTerminateWorkflowErrors .GetMetricName ()).Inc (1 )
940
965
}
941
966
// Note: the local activity has completed (or failed) here but we'll still wait until we
942
967
// observe the workflow close (with a watcher) to start the next one.
0 commit comments