Skip to content

Commit efdc00c

Browse files
Add proxy metric representing failed connection attempts to downstream clusters
1 parent 23f1a1d commit efdc00c

File tree

5 files changed

+133
-5
lines changed

5 files changed

+133
-5
lines changed

integration-tests/metrics_test.go

+2
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,8 @@ func checkMetrics(
283283
require.Contains(t, lines, fmt.Sprintf("%v 0", getPrometheusName(prefix, metrics.FailedWritesOnTarget)))
284284
require.Contains(t, lines, fmt.Sprintf("%v 0", getPrometheusName(prefix, metrics.FailedReadsTarget)))
285285
require.Contains(t, lines, fmt.Sprintf("%v 0", getPrometheusName(prefix, metrics.FailedReadsOrigin)))
286+
require.Contains(t, lines, fmt.Sprintf("%v 0", getPrometheusName(prefix, metrics.FailedConnectionsOrigin)))
287+
require.Contains(t, lines, fmt.Sprintf("%v 0", getPrometheusName(prefix, metrics.FailedConnectionsTarget)))
286288

287289
require.Contains(t, lines, fmt.Sprintf("%v 0", getPrometheusName(prefix, metrics.InFlightWrites)))
288290
require.Contains(t, lines, fmt.Sprintf("%v 0", getPrometheusName(prefix, metrics.InFlightReadsOrigin)))

integration-tests/runner_test.go

+92
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,24 @@ package integration_tests
33
import (
44
"context"
55
"fmt"
6+
"github.com/datastax/go-cassandra-native-protocol/message"
7+
"github.com/datastax/go-cassandra-native-protocol/primitive"
8+
"github.com/datastax/zdm-proxy/integration-tests/client"
69
"github.com/datastax/zdm-proxy/integration-tests/setup"
710
"github.com/datastax/zdm-proxy/integration-tests/utils"
811
"github.com/datastax/zdm-proxy/proxy/pkg/config"
912
"github.com/datastax/zdm-proxy/proxy/pkg/health"
1013
"github.com/datastax/zdm-proxy/proxy/pkg/httpzdmproxy"
1114
"github.com/datastax/zdm-proxy/proxy/pkg/metrics"
1215
"github.com/datastax/zdm-proxy/proxy/pkg/runner"
16+
"github.com/datastax/zdm-proxy/proxy/pkg/zdmproxy"
17+
"github.com/jpillora/backoff"
18+
log "github.com/sirupsen/logrus"
1319
"github.com/stretchr/testify/require"
1420
"net/http"
21+
"strings"
1522
"sync"
23+
"sync/atomic"
1624
"testing"
1725
"time"
1826
)
@@ -42,6 +50,10 @@ func TestWithHttpHandlers(t *testing.T) {
4250
t.Run("testHttpEndpointsWithUnavailableNode", func(t *testing.T) {
4351
testHttpEndpointsWithUnavailableNode(t, metricsHandler, readinessHandler)
4452
})
53+
54+
t.Run("testMetricsWithUnavailableNode", func(t *testing.T) {
55+
testMetricsWithUnavailableNode(t, metricsHandler)
56+
})
4557
}
4658

4759
func testHttpEndpointsWithProxyNotInitialized(
@@ -137,6 +149,86 @@ func testHttpEndpointsWithProxyInitialized(
137149
require.Equal(t, health.UP, report.Status)
138150
}
139151

152+
func testMetricsWithUnavailableNode(
153+
t *testing.T, metricsHandler *httpzdmproxy.HandlerWithFallback) {
154+
155+
simulacronSetup, err := setup.NewSimulacronTestSetupWithSession(t, false, false)
156+
require.Nil(t, err)
157+
defer simulacronSetup.Cleanup()
158+
159+
conf := setup.NewTestConfig(simulacronSetup.Origin.GetInitialContactPoint(), simulacronSetup.Target.GetInitialContactPoint())
160+
modifyConfForHealthTests(conf, 2)
161+
162+
waitGroup := &sync.WaitGroup{}
163+
ctx, cancelFunc := context.WithCancel(context.Background())
164+
165+
defer waitGroup.Wait()
166+
defer cancelFunc()
167+
168+
srv := httpzdmproxy.StartHttpServer(fmt.Sprintf("%s:%d", conf.MetricsAddress, conf.MetricsPort), waitGroup)
169+
defer func(srv *http.Server, ctx context.Context) {
170+
err := srv.Shutdown(ctx)
171+
if err != nil {
172+
log.Error("Failed to shutdown metrics server:", err.Error())
173+
}
174+
}(srv, ctx)
175+
176+
b := &backoff.Backoff{
177+
Factor: 2,
178+
Jitter: false,
179+
Min: 100 * time.Millisecond,
180+
Max: 500 * time.Millisecond,
181+
}
182+
proxy := atomic.Value{}
183+
waitGroup.Add(1)
184+
go func() {
185+
defer waitGroup.Done()
186+
p, err := zdmproxy.RunWithRetries(conf, ctx, b)
187+
if err == nil {
188+
metricsHandler.SetHandler(p.GetMetricHandler().GetHttpHandler())
189+
proxy.Store(&p)
190+
<-ctx.Done()
191+
p.Shutdown()
192+
}
193+
}()
194+
195+
httpAddr := fmt.Sprintf("%s:%d", conf.MetricsAddress, conf.MetricsPort)
196+
197+
// check that metrics endpoint has been initialized
198+
utils.RequireWithRetries(t, func() (err error, fatal bool) {
199+
fatal = false
200+
err = utils.CheckMetricsEndpointResult(httpAddr, true)
201+
return
202+
}, 10, 100*time.Millisecond)
203+
204+
// stop origin cluster
205+
err = simulacronSetup.Origin.DisableConnectionListener()
206+
require.Nil(t, err, "failed to disable origin connection listener: %v", err)
207+
err = simulacronSetup.Origin.DropAllConnections()
208+
require.Nil(t, err, "failed to drop origin connections: %v", err)
209+
210+
// send a request
211+
testClient, err := client.NewTestClient(context.Background(), "127.0.0.1:14002")
212+
require.Nil(t, err)
213+
queryMsg := &message.Query{
214+
Query: "SELECT * FROM table1",
215+
}
216+
_, _, _ = testClient.SendMessage(context.Background(), primitive.ProtocolVersion4, queryMsg)
217+
218+
utils.RequireWithRetries(t, func() (err error, fatal bool) {
219+
// expect connection failure to origin cluster
220+
statusCode, rspStr, err := utils.GetMetrics(httpAddr)
221+
require.Nil(t, err)
222+
require.Equal(t, http.StatusOK, statusCode)
223+
if !strings.Contains(rspStr, fmt.Sprintf("%v 1", getPrometheusName("zdm", metrics.FailedConnectionsOrigin))) {
224+
err = fmt.Errorf("did not observe failed connection attempts")
225+
} else {
226+
err = nil
227+
}
228+
return
229+
}, 10, 500*time.Millisecond)
230+
}
231+
140232
func testHttpEndpointsWithUnavailableNode(
141233
t *testing.T, metricsHandler *httpzdmproxy.HandlerWithFallback, healthHandler *httpzdmproxy.HandlerWithFallback) {
142234

proxy/pkg/metrics/proxy_metrics.go

+25-5
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,10 @@ const (
1010
failedRequestsClusterTarget = "target"
1111
failedRequestsClusterBoth = "both"
1212

13+
failedConnectionsName = "proxy_failed_connections_total"
14+
failedConnectionsDescription = "Running total of failed requests due to inability to connect to given cluster"
15+
failedConnectionsClusterLabel = "cluster"
16+
1317
failedReadsName = "proxy_failed_reads_total"
1418
failedReadsDescription = "Running total of failed reads"
1519
failedReadsClusterLabel = "cluster"
@@ -28,6 +32,20 @@ const (
2832
)
2933

3034
var (
35+
FailedConnectionsOrigin = NewMetricWithLabels(
36+
failedConnectionsName,
37+
failedConnectionsDescription,
38+
map[string]string{
39+
failedConnectionsClusterLabel: failedRequestsClusterOrigin,
40+
},
41+
)
42+
FailedConnectionsTarget = NewMetricWithLabels(
43+
failedConnectionsName,
44+
failedConnectionsDescription,
45+
map[string]string{
46+
failedConnectionsClusterLabel: failedRequestsClusterTarget,
47+
},
48+
)
3149
FailedReadsOrigin = NewMetricWithLabels(
3250
failedReadsName,
3351
failedReadsDescription,
@@ -124,11 +142,13 @@ var (
124142
)
125143

126144
type ProxyMetrics struct {
127-
FailedReadsOrigin Counter
128-
FailedReadsTarget Counter
129-
FailedWritesOnOrigin Counter
130-
FailedWritesOnTarget Counter
131-
FailedWritesOnBoth Counter
145+
FailedConnectionsOrigin Counter
146+
FailedConnectionsTarget Counter
147+
FailedReadsOrigin Counter
148+
FailedReadsTarget Counter
149+
FailedWritesOnOrigin Counter
150+
FailedWritesOnTarget Counter
151+
FailedWritesOnBoth Counter
132152

133153
PSCacheSize GaugeFunc
134154
PSCacheMissCount Counter

proxy/pkg/zdmproxy/clienthandler.go

+2
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,7 @@ func NewClientHandler(
209209
clientHandlerContext, clientHandlerCancelFunc, respChannel, readScheduler, writeScheduler, requestsDoneCtx,
210210
false, nil, handshakeDone, originFrameProcessor, originCCProtoVer)
211211
if err != nil {
212+
metricHandler.GetProxyMetrics().FailedConnectionsOrigin.Add(1)
212213
clientHandlerCancelFunc()
213214
return nil, err
214215
}
@@ -218,6 +219,7 @@ func NewClientHandler(
218219
clientHandlerContext, clientHandlerCancelFunc, respChannel, readScheduler, writeScheduler, requestsDoneCtx,
219220
false, nil, handshakeDone, targetFrameProcessor, targetCCProtoVer)
220221
if err != nil {
222+
metricHandler.GetProxyMetrics().FailedConnectionsTarget.Add(1)
221223
clientHandlerCancelFunc()
222224
return nil, err
223225
}

proxy/pkg/zdmproxy/proxy.go

+12
Original file line numberDiff line numberDiff line change
@@ -687,6 +687,16 @@ func sleepWithContext(d time.Duration, ctx context.Context, reconnectCh chan boo
687687
}
688688

689689
func (p *ZdmProxy) CreateProxyMetrics(metricFactory metrics.MetricFactory) (*metrics.ProxyMetrics, error) {
690+
failedConnectionsOrigin, err := metricFactory.GetOrCreateCounter(metrics.FailedConnectionsOrigin)
691+
if err != nil {
692+
return nil, err
693+
}
694+
695+
failedConnectionsTarget, err := metricFactory.GetOrCreateCounter(metrics.FailedConnectionsTarget)
696+
if err != nil {
697+
return nil, err
698+
}
699+
690700
failedReadsOrigin, err := metricFactory.GetOrCreateCounter(metrics.FailedReadsOrigin)
691701
if err != nil {
692702
return nil, err
@@ -760,6 +770,8 @@ func (p *ZdmProxy) CreateProxyMetrics(metricFactory metrics.MetricFactory) (*met
760770
}
761771

762772
proxyMetrics := &metrics.ProxyMetrics{
773+
FailedConnectionsOrigin: failedConnectionsOrigin,
774+
FailedConnectionsTarget: failedConnectionsTarget,
763775
FailedReadsOrigin: failedReadsOrigin,
764776
FailedReadsTarget: failedReadsTarget,
765777
FailedWritesOnOrigin: failedWritesOnOrigin,

0 commit comments

Comments
 (0)