Skip to content

Commit 34e256a

Browse files
Test that ShardOwnershipLostErrors are never retried (#3625)
1 parent c94d2bf commit 34e256a

File tree

2 files changed

+68
-8
lines changed

2 files changed

+68
-8
lines changed

service/history/shard/context_impl.go

+10-5
Original file line numberDiff line numberDiff line change
@@ -129,8 +129,9 @@ type (
129129
scheduledTaskMaxReadLevelMap map[string]time.Time // cluster -> scheduledTaskMaxReadLevel
130130

131131
// exist only in memory
132-
remoteClusterInfos map[string]*remoteClusterInfo
133-
handoverNamespaces map[string]*namespaceHandOverInfo // keyed on namespace name
132+
remoteClusterInfos map[string]*remoteClusterInfo
133+
handoverNamespaces map[string]*namespaceHandOverInfo // keyed on namespace name
134+
acquireShardRetryPolicy backoff.RetryPolicy
134135
}
135136

136137
remoteClusterInfo struct {
@@ -1621,7 +1622,9 @@ func (s *ContextImpl) transition(request contextRequest) error {
16211622
// Cancel lifecycle context as soon as we know we're shutting down
16221623
s.lifecycleCancel()
16231624
// This will cause the controller to remove this shard from the map and then call s.finishStop()
1624-
go s.closeCallback(s)
1625+
if s.closeCallback != nil {
1626+
go s.closeCallback(s)
1627+
}
16251628
}
16261629

16271630
setStateStopped := func() {
@@ -1889,8 +1892,10 @@ func (s *ContextImpl) acquireShard() {
18891892
// lifecycleCtx. The persistence operations called here use lifecycleCtx as their context,
18901893
// so if we were blocked in any of them, they should return immediately with a context
18911894
// canceled error.
1892-
policy := backoff.NewExponentialRetryPolicy(1 * time.Second).
1893-
WithExpirationInterval(5 * time.Minute)
1895+
policy := s.acquireShardRetryPolicy
1896+
if policy == nil {
1897+
policy = backoff.NewExponentialRetryPolicy(1 * time.Second).WithExpirationInterval(5 * time.Minute)
1898+
}
18941899

18951900
// Remember this value across attempts
18961901
ownershipChanged := false

service/history/shard/context_test.go

+58-3
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ package shard
2727
import (
2828
"context"
2929
"errors"
30+
"fmt"
3031
"math/rand"
3132
"testing"
3233
"time"
@@ -36,6 +37,7 @@ import (
3637
"github.com/stretchr/testify/suite"
3738

3839
persistencespb "go.temporal.io/server/api/persistence/v1"
40+
"go.temporal.io/server/common/backoff"
3941
"go.temporal.io/server/common/clock"
4042
"go.temporal.io/server/common/cluster"
4143
"go.temporal.io/server/common/convert"
@@ -53,7 +55,7 @@ type (
5355
*require.Assertions
5456

5557
controller *gomock.Controller
56-
mockShard Context
58+
mockShard *ContextTest
5759
mockClusterMetadata *cluster.MockMetadata
5860
mockShardManager *persistence.MockShardManager
5961
mockExecutionManager *persistence.MockExecutionManager
@@ -157,7 +159,7 @@ func (s *contextSuite) TestTimerMaxReadLevelInitialization() {
157159
)
158160

159161
// clear shardInfo and load from persistence
160-
shardContextImpl := s.mockShard.(*ContextTest)
162+
shardContextImpl := s.mockShard
161163
shardContextImpl.shardInfo = nil
162164
err := shardContextImpl.loadShardMetadata(convert.BoolPtr(false))
163165
s.NoError(err)
@@ -211,7 +213,7 @@ func (s *contextSuite) TestTimerMaxReadLevelUpdate_SingleProcessor() {
211213

212214
// update in single processor mode
213215
s.mockShard.UpdateScheduledQueueExclusiveHighReadWatermark(cluster.TestCurrentClusterName, true)
214-
scheduledTaskMaxReadLevelMap := s.mockShard.(*ContextTest).scheduledTaskMaxReadLevelMap
216+
scheduledTaskMaxReadLevelMap := s.mockShard.scheduledTaskMaxReadLevelMap
215217
s.Len(scheduledTaskMaxReadLevelMap, 2)
216218
s.True(scheduledTaskMaxReadLevelMap[cluster.TestCurrentClusterName].After(now))
217219
s.True(scheduledTaskMaxReadLevelMap[cluster.TestAlternativeClusterName].After(now))
@@ -365,3 +367,56 @@ func (s *contextSuite) TestDeleteWorkflowExecution_ErrorAndContinue_Success() {
365367
s.NoError(err)
366368
s.Equal(tasks.DeleteWorkflowExecutionStageCurrent|tasks.DeleteWorkflowExecutionStageMutableState|tasks.DeleteWorkflowExecutionStageVisibility|tasks.DeleteWorkflowExecutionStageHistory, stage)
367369
}
370+
371+
// TestAcquireShardOwnershipLostErrorIsNotRetried checks that acquireShard
// does not retry after UpdateShard returns a ShardOwnershipLostError: the mock
// expects exactly one UpdateShard call even though the injected retry policy
// allows up to 5 attempts, and the shard must end in contextStateStopping.
func (s *contextSuite) TestAcquireShardOwnershipLostErrorIsNotRetried() {
372+
s.mockShard.state = contextStateAcquiring
373+
// Inject a retry policy with a 1ns initial interval and a cap of 5 attempts
// (presumably so that any retries would not slow the test down).
s.mockShard.acquireShardRetryPolicy = backoff.NewExponentialRetryPolicy(time.Nanosecond).
374+
WithMaximumAttempts(5)
375+
// Times(1): a second UpdateShard call would fail the mock expectation.
s.mockShardManager.EXPECT().UpdateShard(gomock.Any(), gomock.Any()).
376+
Return(&persistence.ShardOwnershipLostError{}).Times(1)
377+
378+
s.mockShard.acquireShard()
379+
380+
s.Assert().Equal(contextStateStopping, s.mockShard.state)
381+
}
382+
383+
// TestAcquireShardNonOwnershipLostErrorIsRetried checks that a generic error
// from UpdateShard is retried by acquireShard. Every UpdateShard call fails,
// so once the injected retry policy is exhausted the shard must end in
// contextStateStopping.
func (s *contextSuite) TestAcquireShardNonOwnershipLostErrorIsRetried() {
384+
s.mockShard.state = contextStateAcquiring
385+
s.mockShard.acquireShardRetryPolicy = backoff.NewExponentialRetryPolicy(time.Nanosecond).
386+
WithMaximumAttempts(5)
387+
// TODO: make this 5 times instead of 6 when retry policy is fixed
388+
// The expected call count (6) is one more than the policy's maximum attempts (5);
// the TODO above records that this is expected to change.
s.mockShardManager.EXPECT().UpdateShard(gomock.Any(), gomock.Any()).
389+
Return(fmt.Errorf("temp error")).Times(6)
390+
391+
s.mockShard.acquireShard()
392+
393+
s.Assert().Equal(contextStateStopping, s.mockShard.state)
394+
}
395+
396+
// TestAcquireShardEventuallySucceeds checks that acquireShard recovers from
// transient failures: UpdateShard is set up to fail three times and then
// succeed once, which stays within the 5-attempt retry policy, and the shard
// must end in contextStateAcquired.
func (s *contextSuite) TestAcquireShardEventuallySucceeds() {
397+
s.mockShard.state = contextStateAcquiring
398+
s.mockShard.acquireShardRetryPolicy = backoff.NewExponentialRetryPolicy(time.Nanosecond).
399+
WithMaximumAttempts(5)
400+
// Three failing UpdateShard calls ...
s.mockShardManager.EXPECT().UpdateShard(gomock.Any(), gomock.Any()).
401+
Return(fmt.Errorf("temp error")).Times(3)
402+
// ... followed by a single successful one.
s.mockShardManager.EXPECT().UpdateShard(gomock.Any(), gomock.Any()).
403+
Return(nil).Times(1)
404+
// The history engine is expected to be notified of new tasks at least once.
s.mockHistoryEngine.EXPECT().NotifyNewTasks(gomock.Any(), gomock.Any()).MinTimes(1)
405+
406+
s.mockShard.acquireShard()
407+
408+
s.Assert().Equal(contextStateAcquired, s.mockShard.state)
409+
}
410+
411+
// TestAcquireShardNoError checks the happy path of acquireShard: UpdateShard
// succeeds on its single expected call, the history engine is notified of new
// tasks at least once, and the shard must end in contextStateAcquired.
func (s *contextSuite) TestAcquireShardNoError() {
412+
s.mockShard.state = contextStateAcquiring
413+
s.mockShard.acquireShardRetryPolicy = backoff.NewExponentialRetryPolicy(time.Nanosecond).
414+
WithMaximumAttempts(5)
415+
// Times(1): no retry is expected when the first UpdateShard call succeeds.
s.mockShardManager.EXPECT().UpdateShard(gomock.Any(), gomock.Any()).
416+
Return(nil).Times(1)
417+
s.mockHistoryEngine.EXPECT().NotifyNewTasks(gomock.Any(), gomock.Any()).MinTimes(1)
418+
419+
s.mockShard.acquireShard()
420+
421+
s.Assert().Equal(contextStateAcquired, s.mockShard.state)
422+
}

0 commit comments

Comments
 (0)