Skip to content

Commit eea26e9

Browse files
authored
[agent] Do not increment data loss counters on the first client batch (#2028)
* [agent] Do not increment data loss counters on the first client batch Signed-off-by: Yuri Shkuro <ys@uber.com> * simplify tests; Signed-off-by: Yuri Shkuro <ys@uber.com>
1 parent 886b965 commit eea26e9

File tree

2 files changed

+42
-23
lines changed

2 files changed

+42
-23
lines changed

cmd/agent/app/reporter/client_metrics.go

+12-9
Original file line numberDiff line numberDiff line change
@@ -195,21 +195,24 @@ func (s *lastReceivedClientStats) update(
195195
// That makes the metrics slightly off in time, but accurate in aggregate.
196196
return
197197
}
198+
// do not update counters on the first batch, because it may cause a huge spike in totals
199+
// if the client has been running for a while already, but the agent just started.
200+
if s.batchSeqNo > 0 {
201+
metrics.BatchesSent.Inc(batchSeqNo - s.batchSeqNo)
202+
if stats != nil {
203+
metrics.FailedToEmitSpans.Inc(stats.FailedToEmitSpans - s.failedToEmitSpans)
204+
metrics.TooLargeDroppedSpans.Inc(stats.TooLargeDroppedSpans - s.tooLargeDroppedSpans)
205+
metrics.FullQueueDroppedSpans.Inc(stats.FullQueueDroppedSpans - s.fullQueueDroppedSpans)
206+
}
207+
}
198208

199-
metrics.BatchesSent.Inc(batchSeqNo - s.batchSeqNo)
200-
209+
s.lastUpdated = time.Now()
210+
s.batchSeqNo = batchSeqNo
201211
if stats != nil {
202-
metrics.FailedToEmitSpans.Inc(stats.FailedToEmitSpans - s.failedToEmitSpans)
203-
metrics.TooLargeDroppedSpans.Inc(stats.TooLargeDroppedSpans - s.tooLargeDroppedSpans)
204-
metrics.FullQueueDroppedSpans.Inc(stats.FullQueueDroppedSpans - s.fullQueueDroppedSpans)
205-
206212
s.failedToEmitSpans = stats.FailedToEmitSpans
207213
s.tooLargeDroppedSpans = stats.TooLargeDroppedSpans
208214
s.fullQueueDroppedSpans = stats.FullQueueDroppedSpans
209215
}
210-
211-
s.lastUpdated = time.Now()
212-
s.batchSeqNo = batchSeqNo
213216
}
214217

215218
func clientUUID(batch *jaeger.Batch) string {

cmd/agent/app/reporter/client_metrics_test.go

+30-14
Original file line numberDiff line numberDiff line change
@@ -104,40 +104,58 @@ func TestClientMetricsReporter_Jaeger(t *testing.T) {
104104
clientUUID: &clientUUID,
105105
seqNo: nPtr(100),
106106
expLog: clientUUID,
107-
runExpire: true,
107+
stats: &jaeger.ClientStats{
108+
FullQueueDroppedSpans: 10,
109+
TooLargeDroppedSpans: 10,
110+
FailedToEmitSpans: 10,
111+
},
112+
runExpire: true,
113+
// first batch cannot increment counters, only capture the baseline
108114
expCounters: []metricstest.ExpectedMetric{
109-
{Name: prefix + "batches_sent", Value: 100},
115+
{Name: prefix + "batches_sent", Value: 0},
116+
{Name: prefix + "spans_dropped", Tags: tag("cause", "full-queue"), Value: 0},
117+
{Name: prefix + "spans_dropped", Tags: tag("cause", "too-large"), Value: 0},
118+
{Name: prefix + "spans_dropped", Tags: tag("cause", "send-failure"), Value: 0},
110119
},
111120
expGauges: []metricstest.ExpectedMetric{
112121
{Name: prefix + "connected_clients", Value: 1},
113122
},
114123
},
115124
{
116125
clientUUID: &clientUUID,
117-
seqNo: nPtr(101),
126+
seqNo: nPtr(105),
127+
stats: &jaeger.ClientStats{
128+
FullQueueDroppedSpans: 15,
129+
TooLargeDroppedSpans: 15,
130+
FailedToEmitSpans: 15,
131+
},
118132
expCounters: []metricstest.ExpectedMetric{
119-
{Name: prefix + "batches_sent", Value: 101},
133+
{Name: prefix + "batches_sent", Value: 5},
134+
{Name: prefix + "spans_dropped", Tags: tag("cause", "full-queue"), Value: 5},
135+
{Name: prefix + "spans_dropped", Tags: tag("cause", "too-large"), Value: 5},
136+
{Name: prefix + "spans_dropped", Tags: tag("cause", "send-failure"), Value: 5},
120137
},
121138
},
122139
{
123140
clientUUID: &clientUUID,
124141
seqNo: nPtr(90), // out of order batch will be ignored
125142
expCounters: []metricstest.ExpectedMetric{
126-
{Name: prefix + "batches_sent", Value: 101}, // unchanged!
143+
{Name: prefix + "batches_sent", Value: 5}, // unchanged!
127144
},
128145
},
129146
{
130147
clientUUID: &clientUUID,
131148
seqNo: nPtr(110),
149+
// use different stats values to test the correct assignments
132150
stats: &jaeger.ClientStats{
133-
FullQueueDroppedSpans: 5,
134-
TooLargeDroppedSpans: 6,
135-
FailedToEmitSpans: 7,
151+
FullQueueDroppedSpans: 17,
152+
TooLargeDroppedSpans: 18,
153+
FailedToEmitSpans: 19,
136154
}, expCounters: []metricstest.ExpectedMetric{
137-
{Name: prefix + "batches_sent", Value: 110},
138-
{Name: prefix + "spans_dropped", Tags: tag("cause", "full-queue"), Value: 5},
139-
{Name: prefix + "spans_dropped", Tags: tag("cause", "too-large"), Value: 6},
140-
{Name: prefix + "spans_dropped", Tags: tag("cause", "send-failure"), Value: 7},
155+
{Name: prefix + "batches_sent", Value: 10},
156+
{Name: prefix + "spans_dropped", Tags: tag("cause", "full-queue"), Value: 7},
157+
{Name: prefix + "spans_dropped", Tags: tag("cause", "too-large"), Value: 8},
158+
{Name: prefix + "spans_dropped", Tags: tag("cause", "send-failure"), Value: 9},
141159
},
142160
},
143161
}
@@ -219,8 +237,6 @@ func TestClientMetricsReporter_Expire(t *testing.T) {
219237
err := tr.r.EmitBatch(batch)
220238
assert.NoError(t, err)
221239
assert.Len(t, tr.mr.Spans(), 1)
222-
tr.mb.AssertCounterMetrics(t,
223-
metricstest.ExpectedMetric{Name: "client_stats.batches_sent", Value: 1})
224240

225241
// here we test that a connected-client gauge is updated to 1 by the auto-scheduled expire loop,
226242
// and then reset to 0 once the client entry expires.

0 commit comments

Comments
 (0)