Skip to content

Commit 9cab121

Browse files
authored
feat: e2e metrics reporting (#9776)
1 parent 233b387 commit 9cab121

File tree

16 files changed

+216
-7
lines changed

16 files changed

+216
-7
lines changed

.github/workflows/ci.yml

+1
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,7 @@ jobs:
208208
uses: ./.github/ensure-tester-with-images
209209
env:
210210
USERNAME: ${{ needs.configure.outputs.username }}
211+
PULL_REQUEST: ${{ github.event.pull_request.number }}
211212
with:
212213
runner_type: ${{ steps.runner_type.outputs.type }}
213214
builder_type: builder-x86

yarn-project/end-to-end/package.json

+3
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
"formatting": "run -T prettier --check ./src \"!src/web/main.js\" && run -T eslint ./src",
1717
"formatting:fix": "run -T eslint --fix ./src && run -T prettier -w ./src",
1818
"test": "LOG_LEVEL=${LOG_LEVEL:-verbose} DEBUG_COLORS=1 NODE_NO_WARNINGS=1 node --experimental-vm-modules ../node_modules/.bin/jest --testTimeout=300000 --forceExit",
19+
"test:with-alerts": "./scripts/test-with-alerts.sh",
1920
"test:profile": "LOG_LEVEL=${LOG_LEVEL:-verbose} DEBUG_COLORS=1 NODE_NO_WARNINGS=1 0x --output-dir \"flame_graph/{pid}.0x\" -- node --experimental-vm-modules ../node_modules/jest/bin/jest.js --runInBand --testTimeout=300000 --forceExit",
2021
"serve:flames": "python3 -m http.server --directory \"flame_graph\" 8000",
2122
"test:debug": "LOG_LEVEL=${LOG_LEVEL:-verbose} DEBUG_COLORS=1 NODE_NO_WARNINGS=1 node --inspect --experimental-vm-modules ../node_modules/.bin/jest --testTimeout=300000 --forceExit",
@@ -99,10 +100,12 @@
99100
"0x": "^5.7.0",
100101
"@jest/globals": "^29.5.0",
101102
"@types/jest": "^29.5.0",
103+
"@types/js-yaml": "^4.0.9",
102104
"@types/lodash.chunk": "^4.2.9",
103105
"concurrently": "^7.6.0",
104106
"jest": "^29.5.0",
105107
"jest-extended": "^4.0.2",
108+
"js-yaml": "^4.1.0",
106109
"ts-node": "^10.9.1",
107110
"typescript": "^5.0.4"
108111
},

yarn-project/end-to-end/scripts/e2e_test.sh

+2
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ fi
5050
# Check if the test uses docker compose
5151
if [ "$(echo "$test_config" | yq e '.use_compose // false' -)" = "true" ]; then
5252
$(dirname "$0")/e2e_compose_test.sh "$test_path" "$@" || [ "$ignore_failures" = "true" ]
53+
elif [ "$(echo "$test_config" | yq e '.with_alerts // false' -)" = "true" ]; then
54+
$(dirname "$0")/e2e_test_with_alerts.sh "$test_path" "$@" || [ "$ignore_failures" = "true" ]
5355
else
5456
# Set environment variables
5557
while IFS='=' read -r key value; do

yarn-project/end-to-end/scripts/e2e_test_config.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -85,9 +85,9 @@ tests:
8585
e2e_token_contract: {}
8686
e2e_p2p_gossip:
8787
test_path: 'e2e_p2p/gossip_network.test.ts'
88+
with_alerts: true
8889
e2e_p2p_upgrade_governance_proposer:
8990
test_path: 'e2e_p2p/upgrade_governance_proposer.test.ts'
90-
# https://github.com/AztecProtocol/aztec-packages/issues/9843
9191
e2e_p2p_rediscovery:
9292
test_path: 'e2e_p2p/rediscovery.test.ts'
9393
e2e_p2p_reqresp:
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
#!/bin/bash
## Run an end to end test with alerts

# This runs an end to end test alongside the otel-lgtm stack (otel-collector, grafana, prometheus, tempo and loki),
# then checks the collected metrics against the set of alerts defined in the alerts.yaml file.
# Note: these tests must run with METRICS enabled

# Usage: ./e2e_test_with_alerts.sh <test-name> <...extra-args>
# Example: ./e2e_test_with_alerts.sh gossip_network

set -e

test_path=$1
# Drop the test path from the positional parameters so that "$@" below only carries the
# extra args; without this the test path is handed to the test container twice.
if [ $# -gt 0 ]; then
  shift
fi

echo "Running otel stack"
CONTAINER_ID=$(docker run -d -p 3000:3000 -p 4317:4317 -p 4318:4318 --rm grafana/otel-lgtm)

# Stop the otel stack however the script ends. The container is started with --rm, so it
# may already be gone by the time this runs; a failed stop is therefore not an error.
cleanup() {
  docker stop "$CONTAINER_ID" > /dev/null 2>&1 || true
}
trap cleanup EXIT
# Turn signals into a real exit so the script does not carry on after being interrupted;
# the EXIT trap above then performs the cleanup.
trap 'exit 130' INT
trap 'exit 143' TERM

echo "Waiting for LGTM stack to be ready..."
timeout=90
while [ "$timeout" -gt 0 ]; do
  if docker logs "$CONTAINER_ID" 2>&1 | grep -q "The OpenTelemetry collector and the Grafana LGTM stack are up and running"; then
    echo "LGTM stack is ready!"
    break
  fi
  sleep 1
  timeout=$((timeout - 1))
done

# The loop only reaches 0 when the ready message was never seen (a successful check breaks first)
if [ "$timeout" -eq 0 ]; then
  echo "Timeout waiting for LGTM stack to be ready"
  exit 1
fi

## Run the existing e2e test, passing the extra args through
# $env_args is intentionally unquoted so that it can expand to several docker arguments
docker run \
  --network host \
  -e HARDWARE_CONCURRENCY="$HARDWARE_CONCURRENCY" \
  -e FAKE_PROOFS="$FAKE_PROOFS" \
  -e METRICS_PORT="4318" \
  -e COLLECT_METRICS="true" \
  -e PULL_REQUEST="$PULL_REQUEST" \
  $env_args \
  --rm "aztecprotocol/end-to-end:$AZTEC_DOCKER_TAG" \
  "$test_path" "$@" || [ "$ignore_failures" = "true" ]


echo "Running alert checker..."
docker run --network host --rm "aztecprotocol/end-to-end:$AZTEC_DOCKER_TAG" quality_of_service/alert_checker.test.ts

yarn-project/end-to-end/src/e2e_p2p/rediscovery.test.ts

+5
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import { sleep } from '@aztec/aztec.js';
33

44
import fs from 'fs';
55

6+
import { shouldCollectMetrics } from '../fixtures/fixtures.js';
67
import { type NodeContext, createNode, createNodes } from '../fixtures/setup_p2p_test.js';
78
import { P2PNetworkTest, WAIT_FOR_TX_TIMEOUT } from './p2p_network.js';
89
import { createPXEServiceAndSubmitTransactions } from './shared.js';
@@ -23,6 +24,8 @@ describe('e2e_p2p_rediscovery', () => {
2324
testName: 'e2e_p2p_rediscovery',
2425
numberOfNodes: NUM_NODES,
2526
basePort: BOOT_NODE_UDP_PORT,
27+
// To collect metrics - run in aztec-packages `docker compose --profile metrics up` and set COLLECT_METRICS=true
28+
metricsPort: shouldCollectMetrics(),
2629
});
2730
await t.applyBaseSnapshots();
2831
await t.setup();
@@ -48,6 +51,8 @@ describe('e2e_p2p_rediscovery', () => {
4851
NUM_NODES,
4952
BOOT_NODE_UDP_PORT,
5053
DATA_DIR,
54+
// To collect metrics - run in aztec-packages `docker compose --profile metrics up` and set COLLECT_METRICS=true
55+
shouldCollectMetrics(),
5156
);
5257

5358
// wait a bit for peers to discover each other

yarn-project/end-to-end/src/e2e_p2p/reex.test.ts

+6
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import { BlockProposal, SignatureDomainSeperator, getHashedSignaturePayload } fr
77
import { beforeAll, describe, it, jest } from '@jest/globals';
88
import fs from 'fs';
99

10+
import { shouldCollectMetrics } from '../fixtures/fixtures.js';
1011
import { createNodes } from '../fixtures/setup_p2p_test.js';
1112
import { P2PNetworkTest } from './p2p_network.js';
1213
import { submitComplexTxsTo } from './shared.js';
@@ -28,6 +29,8 @@ describe('e2e_p2p_reex', () => {
2829
testName: 'e2e_p2p_reex',
2930
numberOfNodes: NUM_NODES,
3031
basePort: BOOT_NODE_UDP_PORT,
32+
// To collect metrics - run in aztec-packages `docker compose --profile metrics up` and set COLLECT_METRICS=true
33+
metricsPort: shouldCollectMetrics(),
3134
});
3235

3336
t.logger.verbose('Setup account');
@@ -66,6 +69,9 @@ describe('e2e_p2p_reex', () => {
6669
t.bootstrapNodeEnr,
6770
NUM_NODES,
6871
BOOT_NODE_UDP_PORT,
72+
DATA_DIR,
73+
// To collect metrics - run in aztec-packages `docker compose --profile metrics up` and set COLLECT_METRICS=true
74+
shouldCollectMetrics(),
6975
);
7076

7177
// Hook into the node and intercept re-execution logic, ensuring that it was in fact called

yarn-project/end-to-end/src/e2e_p2p/reqresp.test.ts

+4
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import { jest } from '@jest/globals';
66
import fs from 'fs';
77
import { getContract } from 'viem';
88

9+
import { shouldCollectMetrics } from '../fixtures/fixtures.js';
910
import { type NodeContext, createNodes } from '../fixtures/setup_p2p_test.js';
1011
import { P2PNetworkTest, WAIT_FOR_TX_TIMEOUT } from './p2p_network.js';
1112
import { createPXEServiceAndSubmitTransactions } from './shared.js';
@@ -26,6 +27,8 @@ describe('e2e_p2p_reqresp_tx', () => {
2627
testName: 'e2e_p2p_reqresp_tx',
2728
numberOfNodes: NUM_NODES,
2829
basePort: BOOT_NODE_UDP_PORT,
30+
// To collect metrics - run in aztec-packages `docker compose --profile metrics up` and set COLLECT_METRICS=true
31+
metricsPort: shouldCollectMetrics(),
2932
});
3033
await t.applyBaseSnapshots();
3134
await t.setup();
@@ -67,6 +70,7 @@ describe('e2e_p2p_reqresp_tx', () => {
6770
NUM_NODES,
6871
BOOT_NODE_UDP_PORT,
6972
DATA_DIR,
73+
shouldCollectMetrics(),
7074
);
7175

7276
// wait a bit for peers to discover each other

yarn-project/end-to-end/src/e2e_p2p/upgrade_governance_proposer.test.ts

+4
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ import {
1212
import fs from 'fs';
1313
import { getAddress, getContract } from 'viem';
1414

15+
import { shouldCollectMetrics } from '../fixtures/fixtures.js';
1516
import { createNodes } from '../fixtures/setup_p2p_test.js';
1617
import { P2PNetworkTest } from './p2p_network.js';
1718

@@ -36,6 +37,8 @@ describe('e2e_p2p_governance_proposer', () => {
3637
testName: 'e2e_p2p_gerousia',
3738
numberOfNodes: NUM_NODES,
3839
basePort: BOOT_NODE_UDP_PORT,
40+
// To collect metrics - run in aztec-packages `docker compose --profile metrics up` and set COLLECT_METRICS=true
41+
metricsPort: shouldCollectMetrics(),
3942
});
4043
await t.applyBaseSnapshots();
4144
await t.setup();
@@ -132,6 +135,7 @@ describe('e2e_p2p_governance_proposer', () => {
132135
NUM_NODES,
133136
BOOT_NODE_UDP_PORT,
134137
DATA_DIR,
138+
shouldCollectMetrics(),
135139
);
136140

137141
await sleep(4000);
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
import { type DebugLogger, createDebugLogger } from '@aztec/aztec.js';
2+
import { fileURLToPath } from '@aztec/foundation/url';
3+
4+
import * as fs from 'fs';
5+
import * as yaml from 'js-yaml';
6+
import { dirname, join } from 'path';
7+
/** Prometheus instant-query endpoint, reached through Grafana's datasource proxy on localhost:3000. */
const GRAFANA_ENDPOINT = 'http://localhost:3000/api/datasources/proxy/uid/prometheus/api/v1/query';

/**
 * A single alert rule as it appears under the `alerts` key of the alerts YAML file.
 * Only `alert` and `expr` are read by the checker in this file.
 */
interface AlertConfig {
  /** Name of the alert, used in log messages. */
  alert: string;
  /** Query expression evaluated for this alert. */
  expr: string;
  /** Optional: the bundled alerts.yaml entries do not set it. */
  for?: string;
  /** Labels attached to the alert (e.g. `severity`). */
  labels: Record<string, string>;
  /** Optional: the bundled alerts.yaml entries do not set it. */
  annotations?: Record<string, string>;
}
// ES modules do not provide __dirname, so derive it (and __filename) from this module's URL
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

/**
 * Loads the alert definitions from a YAML file.
 * @param filePath - Path of the YAML file, relative to this module's directory.
 * @returns The entries found under the file's top-level `alerts` key.
 * @throws If the file cannot be read or parsed, or if it has no top-level `alerts` list.
 */
function loadAlertsConfig(filePath: string): AlertConfig[] {
  const absolutePath = join(__dirname, filePath);
  const parsed = yaml.load(fs.readFileSync(absolutePath, 'utf8')) as { alerts?: unknown } | null | undefined;

  // Fail here with a clear message instead of letting a missing or misspelled `alerts` key
  // surface later as an opaque "not iterable" error when the alerts are looped over.
  const alerts = parsed?.alerts;
  if (!Array.isArray(alerts)) {
    throw new Error(`Expected a top-level "alerts" list in ${absolutePath}`);
  }
  return alerts as AlertConfig[];
}
26+
/**
 * Evaluates a query expression against the Grafana endpoint and returns a single sample value.
 * @param expr - The query expression; it is URL-encoded into the `query` parameter.
 * @returns The value of the first series in the result parsed as a float, or 0 when the result is empty.
 * @throws If the HTTP request is not ok, or the response body does not have the expected shape.
 */
async function queryGrafana(expr: string): Promise<number> {
  // Create base64 encoded credentials for basic auth
  const credentials = Buffer.from('admin:admin').toString('base64');

  const response = await fetch(`${GRAFANA_ENDPOINT}?query=${encodeURIComponent(expr)}`, {
    headers: {
      Authorization: `Basic ${credentials}`,
    },
  });

  if (!response.ok) {
    throw new Error(`Failed to fetch data from Grafana: ${response.statusText}`);
  }

  // The body is untrusted JSON: describe only the fields read below and verify them before use
  const body = (await response.json()) as {
    data?: { result?: { value?: [unknown, string] }[] };
  } | null;

  const result = body?.data?.result;
  if (!Array.isArray(result)) {
    throw new Error(`Unexpected response from Grafana: ${JSON.stringify(body)}`);
  }

  // An empty result means no series matched the expression
  const first = result[0];
  if (first === undefined) {
    return 0;
  }

  // Each series carries a [timestamp, value] pair; the value is encoded as a string
  const sample = first.value?.[1];
  if (sample === undefined) {
    throw new Error(`Unexpected result entry from Grafana: ${JSON.stringify(first)}`);
  }
  return parseFloat(sample);
}
46+
/**
 * Evaluates the expression of every alert and fails if any of them yields a value above zero.
 * All alerts are evaluated (one after another) before failing, so every triggered alert gets logged.
 * @param alerts - The alert definitions to evaluate.
 * @param logger - Logger used to report each alert's value and outcome.
 * @throws If at least one alert was triggered.
 */
async function checkAlerts(alerts: AlertConfig[], logger: DebugLogger) {
  let anyTriggered = false;

  for (const rule of alerts) {
    logger.info(`Checking alert: ${JSON.stringify(rule)}`);

    const value = await queryGrafana(rule.expr);
    logger.info(`Metric value: ${value}`);

    // Anything that is not strictly positive counts as a pass
    const passed = !(value > 0);
    if (passed) {
      logger.info(`Alert ${rule.alert} passed.`);
      continue;
    }

    logger.error(`Alert ${rule.alert} triggered! Value: ${value}`);
    anyTriggered = true;
  }

  // A single triggered alert is enough to fail the test
  if (anyTriggered) {
    throw new Error('Test failed due to triggered alert');
  }
}
69+
/**
 * Loads the alerts from alerts.yaml and checks each of them.
 * @param logger - Logger used to report progress and failures.
 * @throws Rethrows any loading or checking failure after logging it.
 */
async function runAlertChecker(logger: DebugLogger) {
  try {
    // Loading happens inside the try so that a broken config file is logged like any other failure
    const alerts = loadAlertsConfig('alerts.yaml');
    await checkAlerts(alerts, logger);
    logger.info('All alerts passed.');
  } catch (error) {
    logger.error(error instanceof Error ? error.message : String(error));
    // Rethrow instead of calling process.exit(1): this runs inside a jest test, where a
    // rejected promise fails the test and is reported normally, whereas exiting the
    // process tears down the test run abruptly.
    throw error;
  }
}
81+
// Wrapped in a jest suite so that the checker runs through the existing end to end test framework
describe('Alert Checker', () => {
  const logger = createDebugLogger('aztec:alert-checker');

  // Handing the promise back to jest makes the test wait for the checker to finish
  it('should check alerts', () => runAlertChecker(logger));
});
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
## A set of alerts for the quality of service of the sequencer. These are checked in certain e2e tests.
2+
3+
## In end to end tests, an alert with severity `page` will cause the test to fail.
4+
## An alert with severity `warning` will write a message to the PR.
5+
6+
alerts:
7+
- alert: SequencerTimeToCollectAttestations
8+
expr: aztec_sequencer_time_to_collect_attestations > 2500
9+
labels:
10+
severity: page

yarn-project/sequencer-client/src/sequencer/metrics.ts

+19
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ export class SequencerMetrics {
2121
private currentBlockNumber: Gauge;
2222
private currentBlockSize: Gauge;
2323

24+
private timeToCollectAttestations: Gauge;
25+
2426
constructor(client: TelemetryClient, getState: SequencerStateCallback, name = 'Sequencer') {
2527
const meter = client.getMeter(name);
2628
this.tracer = client.getTracer(name);
@@ -60,9 +62,26 @@ export class SequencerMetrics {
6062
description: 'Current block number',
6163
});
6264

65+
this.timeToCollectAttestations = meter.createGauge(Metrics.SEQUENCER_TIME_TO_COLLECT_ATTESTATIONS, {
66+
description: 'The time spent collecting attestations from committee members',
67+
});
68+
6369
this.setCurrentBlock(0, 0);
6470
}
6571

/**
 * Starts timing the collection of attestations.
 * @returns A function that, when called, records the milliseconds elapsed since this method was called.
 */
startCollectingAttestationsTimer(): () => void {
  const startTime = Date.now();
  // An arrow function already captures `this` lexically, so no explicit bind is needed
  return () => {
    this.recordTimeToCollectAttestations(Date.now() - startTime);
  };
}

/**
 * Records a measurement on the time-to-collect-attestations gauge.
 * @param time - The duration to record; the timer above passes milliseconds.
 */
recordTimeToCollectAttestations(time: number) {
  this.timeToCollectAttestations.record(time);
}
84+
6685
recordCancelledBlock() {
6786
this.blockCounter.add(1, {
6887
[Attributes.STATUS]: 'cancelled',

yarn-project/sequencer-client/src/sequencer/sequencer.ts

+5-3
Original file line numberDiff line numberDiff line change
@@ -633,11 +633,13 @@ export class Sequencer {
633633
const txHashes = validTxs.map(tx => tx.getTxHash());
634634

635635
this.isFlushing = false;
636-
this.log.info('Collecting attestations');
636+
this.log.verbose('Collecting attestations');
637+
const stopCollectingAttestationsTimer = this.metrics.startCollectingAttestationsTimer();
637638
const attestations = await this.collectAttestations(block, txHashes);
638-
this.log.info('Attestations collected');
639+
this.log.verbose('Attestations collected');
640+
stopCollectingAttestationsTimer();
641+
this.log.verbose('Collecting proof quotes');
639642

640-
this.log.info('Collecting proof quotes');
641643
const proofQuote = await this.createProofClaimForPreviousEpoch(newGlobalVariables.slotNumber.toBigInt());
642644
this.log.info(proofQuote ? `Using proof quote ${inspect(proofQuote.payload)}` : 'No proof quote available');
643645

yarn-project/telemetry-client/src/metrics.ts

+1
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ export const SEQUENCER_BLOCK_COUNT = 'aztec.sequencer.block.count';
5353
export const SEQUENCER_CURRENT_STATE = 'aztec.sequencer.current.state';
5454
export const SEQUENCER_CURRENT_BLOCK_NUMBER = 'aztec.sequencer.current.block_number';
5555
export const SEQUENCER_CURRENT_BLOCK_SIZE = 'aztec.sequencer.current.block_size';
56+
export const SEQUENCER_TIME_TO_COLLECT_ATTESTATIONS = 'aztec.sequencer.time_to_collect_attestations';
5657

5758
export const L1_PUBLISHER_GAS_PRICE = 'aztec.l1_publisher.gas_price';
5859
export const L1_PUBLISHER_TX_COUNT = 'aztec.l1_publisher.tx_count';

yarn-project/telemetry-client/src/prom_otel_adapter.ts

+7-3
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ interface IGauge<Labels extends LabelsGeneric = NoLabels> {
2828
set: NoLabels extends Labels ? (value: number) => void : (labels: Labels, value: number) => void;
2929

3030
collect?(): void;
31-
addCollect(fn: CollectFn<Labels>): void;
31+
addCollect(collectFn: CollectFn<Labels>): void;
3232
}
3333

3434
interface IHistogram<Labels extends LabelsGeneric = NoLabels> {
@@ -101,8 +101,12 @@ export class OtelGauge<Labels extends LabelsGeneric = NoLabels> implements IGaug
101101
this.gauge.addCallback(this.handleObservation.bind(this));
102102
}
103103

104-
addCollect(fn: CollectFn<Labels>): void {
105-
this.collectFns.push(fn);
104+
/**
105+
* Add a collect callback
106+
* @param collectFn - Callback function
107+
*/
108+
addCollect(collectFn: CollectFn<Labels>): void {
109+
this.collectFns.push(collectFn);
106110
}
107111

108112
handleObservation(result: any): void {

0 commit comments

Comments
 (0)