Skip to content

Commit 2b761b4

Browse files
authored
Remove internal bulk processor retries (#3739)
1 parent c6a89c0 commit 2b761b4

File tree

9 files changed

+42
-101
lines changed

9 files changed

+42
-101
lines changed

common/metrics/metric_defs.go

-1
Original file line numberDiff line numberDiff line change
@@ -1682,7 +1682,6 @@ var (
16821682
BatcherOperationFailures = NewCounterDef("batcher_operation_errors")
16831683
ElasticsearchBulkProcessorRequests = NewCounterDef("elasticsearch_bulk_processor_requests")
16841684
ElasticsearchBulkProcessorQueuedRequests = NewDimensionlessHistogramDef("elasticsearch_bulk_processor_queued_requests")
1685-
ElasticsearchBulkProcessorRetries = NewCounterDef("elasticsearch_bulk_processor_retries")
16861685
ElasticsearchBulkProcessorFailures = NewCounterDef("elasticsearch_bulk_processor_errors")
16871686
ElasticsearchBulkProcessorCorruptedData = NewCounterDef("elasticsearch_bulk_processor_corrupted_data")
16881687
ElasticsearchBulkProcessorDuplicateRequest = NewCounterDef("elasticsearch_bulk_processor_duplicate_request")

common/persistence/visibility/store/elasticsearch/client/bulk_processor.go

-1
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,6 @@ type (
5252
BulkActions int
5353
BulkSize int
5454
FlushInterval time.Duration
55-
Backoff elastic.Backoff
5655
BeforeFunc elastic.BulkBeforeFunc
5756
AfterFunc elastic.BulkAfterFunc
5857
}

common/persistence/visibility/store/elasticsearch/client/client_test.go

-8
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@ import (
2929
"testing"
3030

3131
"github.com/stretchr/testify/assert"
32-
"github.com/stretchr/testify/require"
3332
enumspb "go.temporal.io/api/enums/v1"
3433
)
3534

@@ -74,10 +73,3 @@ func Test_BuildPutMappingBody(t *testing.T) {
7473
assert.Equal(test.expected, fmt.Sprintf("%v", buildMappingBody(test.input)))
7574
}
7675
}
77-
78-
func TestIsResponseRetryable(t *testing.T) {
79-
status := []int{408, 429, 500, 503, 507}
80-
for _, code := range status {
81-
require.True(t, IsRetryableStatus(code))
82-
}
83-
}

common/persistence/visibility/store/elasticsearch/client/client_v7.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -225,9 +225,10 @@ func (c *clientImpl) RunBulkProcessor(ctx context.Context, p *BulkProcessorParam
225225
BulkActions(p.BulkActions).
226226
BulkSize(p.BulkSize).
227227
FlushInterval(p.FlushInterval).
228-
Backoff(p.Backoff).
229228
Before(p.BeforeFunc).
230229
After(p.AfterFunc).
230+
// Disable the built-in retry logic because the visibility task processor has its own.
231+
RetryItemStatusCodes().
231232
Do(ctx)
232233

233234
return newBulkProcessor(esBulkProcessor), err

common/persistence/visibility/store/elasticsearch/client/errors.go

-57
This file was deleted.

common/persistence/visibility/store/elasticsearch/processor.go

+17-27
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ package elasticsearch
3030
import (
3131
"context"
3232
"encoding/json"
33+
"errors"
3334
"fmt"
3435
"strings"
3536
"sync/atomic"
@@ -93,10 +94,7 @@ type (
9394
var _ Processor = (*processorImpl)(nil)
9495

9596
const (
96-
// retry configs for es bulk processor
97-
esProcessorInitialRetryInterval = 200 * time.Millisecond
98-
esProcessorMaxRetryInterval = 20 * time.Second
99-
visibilityProcessorName = "visibility-processor"
97+
visibilityProcessorName = "visibility-processor"
10098
)
10199

102100
// NewProcessor creates a new processorImpl
@@ -119,7 +117,6 @@ func NewProcessor(
119117
BulkActions: cfg.ESProcessorBulkActions(),
120118
BulkSize: cfg.ESProcessorBulkSize(),
121119
FlushInterval: cfg.ESProcessorFlushInterval(),
122-
Backoff: elastic.NewExponentialBackoff(esProcessorInitialRetryInterval, esProcessorMaxRetryInterval),
123120
},
124121
}
125122
p.bulkProcessorParameters.AfterFunc = p.bulkAfterAction
@@ -220,25 +217,26 @@ func (p *processorImpl) bulkBeforeAction(_ int64, requests []elastic.BulkableReq
220217
func (p *processorImpl) bulkAfterAction(_ int64, requests []elastic.BulkableRequest, response *elastic.BulkResponse, err error) {
221218
if err != nil {
222219
const logFirstNRequests = 5
223-
httpStatus := client.HttpStatus(err)
224-
isRetryable := client.IsRetryableStatus(httpStatus)
220+
var httpStatus int
221+
var esErr *elastic.Error
222+
if errors.As(err, &esErr) {
223+
httpStatus = esErr.Status
224+
}
225+
225226
var logRequests strings.Builder
226227
for i, request := range requests {
227228
if i < logFirstNRequests {
228229
logRequests.WriteString(request.String())
229230
logRequests.WriteRune('\n')
230231
}
231232
p.metricsHandler.Counter(metrics.ElasticsearchBulkProcessorFailures.GetMetricName()).Record(1, metrics.HttpStatusTag(httpStatus))
232-
233-
if !isRetryable {
234-
visibilityTaskKey := p.extractVisibilityTaskKey(request)
235-
if visibilityTaskKey == "" {
236-
continue
237-
}
238-
p.notifyResult(visibilityTaskKey, false)
233+
visibilityTaskKey := p.extractVisibilityTaskKey(request)
234+
if visibilityTaskKey == "" {
235+
continue
239236
}
237+
p.notifyResult(visibilityTaskKey, false)
240238
}
241-
p.logger.Error("Unable to commit bulk ES request.", tag.Error(err), tag.IsRetryable(isRetryable), tag.RequestCount(len(requests)), tag.ESRequest(logRequests.String()))
239+
p.logger.Error("Unable to commit bulk ES request.", tag.Error(err), tag.RequestCount(len(requests)), tag.ESRequest(logRequests.String()))
242240
return
243241
}
244242

@@ -262,10 +260,7 @@ func (p *processorImpl) bulkAfterAction(_ int64, requests []elastic.BulkableRequ
262260
continue
263261
}
264262

265-
switch {
266-
case isSuccess(responseItem):
267-
p.notifyResult(visibilityTaskKey, true)
268-
case !client.IsRetryableStatus(responseItem.Status):
263+
if !isSuccess(responseItem) {
269264
p.logger.Error("ES request failed.",
270265
tag.ESResponseStatus(responseItem.Status),
271266
tag.ESResponseError(extractErrorReason(responseItem)),
@@ -274,15 +269,10 @@ func (p *processorImpl) bulkAfterAction(_ int64, requests []elastic.BulkableRequ
274269
tag.ESRequest(request.String()))
275270
p.metricsHandler.Counter(metrics.ElasticsearchBulkProcessorFailures.GetMetricName()).Record(1, metrics.HttpStatusTag(responseItem.Status))
276271
p.notifyResult(visibilityTaskKey, false)
277-
default: // bulk processor will retry
278-
p.logger.Warn("ES request retried.",
279-
tag.ESResponseStatus(responseItem.Status),
280-
tag.ESResponseError(extractErrorReason(responseItem)),
281-
tag.Key(visibilityTaskKey),
282-
tag.ESDocID(docID),
283-
tag.ESRequest(request.String()))
284-
p.metricsHandler.Counter(metrics.ElasticsearchBulkProcessorRetries.GetMetricName()).Record(1, metrics.HttpStatusTag(responseItem.Status))
272+
continue
285273
}
274+
275+
p.notifyResult(visibilityTaskKey, true)
286276
}
287277

288278
// Record how many documents are waiting to be flushed to Elasticsearch after this bulk is committed.

common/persistence/visibility/store/elasticsearch/processor_test.go

-1
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,6 @@ func (s *processorSuite) TestNewESProcessorAndStartStop() {
113113
s.Equal(config.ESProcessorBulkActions(), input.BulkActions)
114114
s.Equal(config.ESProcessorBulkSize(), input.BulkSize)
115115
s.Equal(config.ESProcessorFlushInterval(), input.FlushInterval)
116-
s.NotNil(input.Backoff)
117116
s.NotNil(input.AfterFunc)
118117

119118
bulkProcessor := client.NewMockBulkProcessor(s.controller)

common/persistence/visibility/store/elasticsearch/visibility_store.go

+5-4
Original file line numberDiff line numberDiff line change
@@ -254,14 +254,15 @@ func (s *visibilityStore) addBulkRequestAndWait(
254254
return &persistence.TimeoutError{Msg: fmt.Sprintf("visibility task %s timed out waiting for ACK after %v", visibilityTaskKey, s.processorAckTimeout())}
255255
}
256256
// Returns non-retryable Internal error here because these errors are unexpected.
257-
// Visibility task processor retries all errors though, therefore new request will be generated for the same task.
257+
// The visibility task processor retries all errors though, therefore a new request will be generated for the same visibility task.
258258
return serviceerror.NewInternal(fmt.Sprintf("visibility task %s received error %v", visibilityTaskKey, err))
259259
}
260260

261261
if !ack {
262-
// Returns non-retryable Internal error here because NACK from bulk processor means that this request can't be processed.
263-
// Visibility task processor retries all errors though, therefore new request will be generated for the same task.
264-
return serviceerror.NewInternal(fmt.Sprintf("visibility task %s received NACK", visibilityTaskKey))
262+
// Returns retryable Unavailable error here because NACK from bulk processor
263+
// means that this request wasn't processed successfully and needs to be retried.
264+
// The visibility task processor retries all errors anyway, therefore a new request will be generated for the same visibility task.
265+
return serviceerror.NewUnavailable(fmt.Sprintf("visibility task %s received NACK", visibilityTaskKey))
265266
}
266267
return nil
267268
}

service/worker/addsearchattributes/workflow.go

+18-1
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,10 @@ import (
2828
"context"
2929
"errors"
3030
"fmt"
31+
"net/http"
3132
"time"
3233

34+
"github.com/olivere/elastic/v7"
3335
enumspb "go.temporal.io/api/enums/v1"
3436
"go.temporal.io/sdk/temporal"
3537
"go.temporal.io/sdk/workflow"
@@ -140,7 +142,8 @@ func (a *activities) AddESMappingFieldActivity(ctx context.Context, params Workf
140142
_, err := a.esClient.PutMapping(ctx, params.IndexName, params.CustomAttributesToAdd)
141143
if err != nil {
142144
a.metricsHandler.Counter(metrics.AddSearchAttributesFailuresCount.GetMetricName()).Record(1)
143-
if esclient.IsRetryableError(err) {
145+
146+
if a.isRetryableError(err) {
144147
a.logger.Error("Unable to update Elasticsearch mapping (retryable error).", tag.ESIndex(params.IndexName), tag.Error(err))
145148
return fmt.Errorf("%w: %v", ErrUnableToUpdateESMapping, err)
146149
}
@@ -152,6 +155,20 @@ func (a *activities) AddESMappingFieldActivity(ctx context.Context, params Workf
152155
return nil
153156
}
154157

158+
// isRetryableError reports whether err should be treated as retryable.
// Errors that are not (and do not wrap) an *elastic.Error are considered
// retryable. For an *elastic.Error the decision is based on its HTTP status:
// 400, 401, 403, 404 and 409 are non-retryable; every other status is retryable.
// The receiver is not used by the body.
func (a *activities) isRetryableError(err error) bool {
159+
var esErr *elastic.Error
160+
// No *elastic.Error in the chain, so there is no status to inspect: retry.
if !errors.As(err, &esErr) {
161+
return true
162+
}
163+
164+
switch esErr.Status {
165+
// These client-side statuses are reported as non-retryable.
case http.StatusBadRequest, http.StatusUnauthorized, http.StatusForbidden, http.StatusNotFound, http.StatusConflict:
166+
return false
167+
// Any status not listed above is reported as retryable.
default:
168+
return true
169+
}
170+
}
171+
155172
func (a *activities) WaitForYellowStatusActivity(ctx context.Context, indexName string) error {
156173
if a.esClient == nil {
157174
a.logger.Info("Elasticsearch client is not configured. Skipping Elasticsearch status check.")

0 commit comments

Comments
 (0)