Skip to content

Commit f9810cc

Browse files
authored
feat: persistence in helm chart for validator and boot node (#10543)
chore: give validators/boot-nodes 100Gi in network configs
feat: allow metrics to be instantly flushed
chore: flush archiver metrics on startup
feat: allow making range queries to prometheus in tests
1 parent e065e05 commit f9810cc

File tree

17 files changed

+120
-18
lines changed

17 files changed

+120
-18
lines changed

spartan/aztec-network/templates/boot-node.yaml

+13
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,14 @@ spec:
1111
matchLabels:
1212
{{- include "aztec-network.selectorLabels" . | nindent 6 }}
1313
app: boot-node
14+
volumeClaimTemplates:
15+
- metadata:
16+
name: boot-node-data
17+
spec:
18+
accessModes: [ "ReadWriteOnce" ]
19+
resources:
20+
requests:
21+
storage: {{ .Values.bootNode.storageSize }}
1422
template:
1523
metadata:
1624
labels:
@@ -119,6 +127,8 @@ spec:
119127
mountPath: /shared/p2p
120128
- name: config
121129
mountPath: /shared/config
130+
- name: boot-node-data
131+
mountPath: {{ .Values.bootNode.dataDir }}
122132
{{- if .Values.bootNode.deployContracts }}
123133
- name: scripts-output
124134
mountPath: /shared/contracts
@@ -182,6 +192,9 @@ spec:
182192
emptyDir: {}
183193
- name: config
184194
emptyDir: {}
195+
- name: boot-node-data
196+
persistentVolumeClaim:
197+
claimName: boot-node-data
185198
{{- if .Values.bootNode.deployContracts }}
186199
- name: scripts
187200
configMap:

spartan/aztec-network/templates/validator.yaml

+13-1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,14 @@ spec:
1212
matchLabels:
1313
{{- include "aztec-network.selectorLabels" . | nindent 6 }}
1414
app: validator
15+
volumeClaimTemplates:
16+
- metadata:
17+
name: validator-data
18+
spec:
19+
accessModes: [ "ReadWriteOnce" ]
20+
resources:
21+
requests:
22+
storage: {{ .Values.validator.storageSize }}
1523
template:
1624
metadata:
1725
labels:
@@ -53,7 +61,6 @@ spec:
5361
{{- end }}
5462
5563
if [ "{{ .Values.validator.dynamicBootNode }}" = "true" ]; then
56-
# Get the list of pod IPs for the validator service
5764
echo "{{ include "aztec-network.pxeUrl" . }}" > /shared/pxe/pxe_url
5865
else
5966
until curl --silent --head --fail "${BOOT_NODE_HOST}/status" > /dev/null; do
@@ -136,6 +143,8 @@ spec:
136143
mountPath: /shared/p2p
137144
- name: config
138145
mountPath: /shared/config
146+
- name: validator-data
147+
mountPath: {{ .Values.validator.dataDir }}
139148
env:
140149
- name: POD_IP
141150
valueFrom:
@@ -197,6 +206,9 @@ spec:
197206
emptyDir: {}
198207
- name: config
199208
emptyDir: {}
209+
- name: validator-data
210+
persistentVolumeClaim:
211+
claimName: validator-data
200212
---
201213
# If this is not a public network, create a headless service for StatefulSet DNS entries
202214
{{ if not .Values.network.public }}

spartan/aztec-network/values.yaml

+5-1
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,9 @@ bootNode:
7171
outboxAddress: ""
7272
feeJuiceAddress: ""
7373
feeJuicePortalAddress: ""
74-
storage: "8Gi"
74+
stakingAssetAddress: ""
75+
storageSize: "1Gi"
76+
dataDir: "/data"
7577

7678
validator:
7779
# If true, the validator will use its peers to serve as the boot node.
@@ -108,6 +110,8 @@ validator:
108110
requests:
109111
memory: "2Gi"
110112
cpu: "200m"
113+
storageSize: "1Gi"
114+
dataDir: "/data"
111115

112116
proverNode:
113117
externalHost: ""

spartan/aztec-network/values/4-validators-with-metrics.yaml

+2
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ validator:
2121
- 0x90F79bf6EB2c4f870365E785982E1f101E93b906
2222
validator:
2323
disabled: false
24+
sequencer:
25+
enforceTimeTable: false
2426

2527
bootNode:
2628
validator:

spartan/aztec-network/values/exp-1.yaml

+2
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ images:
1414
pullPolicy: Always
1515

1616
validator:
17+
storageSize: "100Gi"
1718
replicas: 48
1819
validatorKeys:
1920
- 0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80
@@ -124,6 +125,7 @@ validator:
124125

125126
bootNode:
126127
peerIdPrivateKey: 080212200ba8451c6d62b03c4441f0a466c0bce7a3a595f2cf50a055ded3305c77aa3af0
128+
storageSize: "100Gi"
127129
validator:
128130
disabled: true
129131

spartan/aztec-network/values/rc-1.yaml

+2
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ telemetry:
1515
otelCollectorEndpoint: http://35.197.100.168:4318
1616

1717
validator:
18+
storageSize: "100Gi"
1819
replicas: 48
1920
validatorKeys:
2021
- 0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80
@@ -125,6 +126,7 @@ bootNode:
125126
peerIdPrivateKey: 080212200ba8451c6d62b03c4441f0a466c0bce7a3a595f2cf50a055ded3305c77aa3af0
126127
validator:
127128
disabled: true
129+
storageSize: "100Gi"
128130

129131
proverAgent:
130132
replicas: 8

spartan/aztec-network/values/rc-2.yaml

+2
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ telemetry:
1616

1717
validator:
1818
replicas: 48
19+
storageSize: "100Gi"
1920
validatorKeys:
2021
- 0xac0974bec39a17e36ba4a6b4d238ff944bacb478cbed5efcae784d7bf4f2ff80
2122
- 0x59c6995e998f97a5a0044966f0945389dc9e86dae88c7a8412f4603b6b78690d
@@ -122,6 +123,7 @@ validator:
122123
disabled: false
123124

124125
bootNode:
126+
storageSize: "100Gi"
125127
peerIdPrivateKey: 080212200ba8451c6d62b03c4441f0a466c0bce7a3a595f2cf50a055ded3305c77aa3af0
126128
validator:
127129
disabled: true

yarn-project/archiver/src/archiver/archiver.ts

+1-1
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ export class Archiver implements ArchiveSource {
174174
pollingIntervalMs: config.archiverPollingIntervalMS ?? 10_000,
175175
batchSize: config.archiverBatchSize ?? 100,
176176
},
177-
new ArchiverInstrumentation(telemetry, () => archiverStore.estimateSize()),
177+
await ArchiverInstrumentation.new(telemetry, () => archiverStore.estimateSize()),
178178
{ l1StartBlock, l1GenesisTime, epochDuration, slotDuration, ethereumSlotDuration },
179179
);
180180
await archiver.start(blockUntilSynced);

yarn-project/archiver/src/archiver/instrumentation.ts

+18-1
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,14 @@ export class ArchiverInstrumentation {
1818
private blockHeight: Gauge;
1919
private blockSize: Gauge;
2020
private syncDuration: Histogram;
21+
private l1BlocksSynced: UpDownCounter;
2122
private proofsSubmittedDelay: Histogram;
2223
private proofsSubmittedCount: UpDownCounter;
2324
private dbMetrics: LmdbMetrics;
2425

2526
private log = createLogger('archiver:instrumentation');
2627

27-
constructor(private telemetry: TelemetryClient, lmdbStats?: LmdbStatsCallback) {
28+
private constructor(private telemetry: TelemetryClient, lmdbStats?: LmdbStatsCallback) {
2829
const meter = telemetry.getMeter('Archiver');
2930
this.blockHeight = meter.createGauge(Metrics.ARCHIVER_BLOCK_HEIGHT, {
3031
description: 'The height of the latest block processed by the archiver',
@@ -59,6 +60,11 @@ export class ArchiverInstrumentation {
5960
},
6061
});
6162

63+
this.l1BlocksSynced = meter.createUpDownCounter(Metrics.ARCHIVER_L1_BLOCKS_SYNCED, {
64+
description: 'Number of blocks synced from L1',
65+
valueType: ValueType.INT,
66+
});
67+
6268
this.dbMetrics = new LmdbMetrics(
6369
meter,
6470
{
@@ -77,13 +83,24 @@ export class ArchiverInstrumentation {
7783
);
7884
}
7985

86+
public static async new(telemetry: TelemetryClient, lmdbStats?: LmdbStatsCallback) {
87+
const instance = new ArchiverInstrumentation(telemetry, lmdbStats);
88+
89+
instance.l1BlocksSynced.add(0);
90+
91+
await instance.telemetry.flush();
92+
93+
return instance;
94+
}
95+
8096
public isEnabled(): boolean {
8197
return this.telemetry.isEnabled();
8298
}
8399

84100
public processNewBlocks(syncTimePerBlock: number, blocks: L2Block[]) {
85101
this.syncDuration.record(Math.ceil(syncTimePerBlock));
86102
this.blockHeight.record(Math.max(...blocks.map(b => b.number)));
103+
this.l1BlocksSynced.add(blocks.length);
87104
for (const block of blocks) {
88105
this.blockSize.record(block.body.txEffects.length);
89106
}

yarn-project/end-to-end/scripts/network_test.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -180,5 +180,5 @@ docker run --rm --network=host \
180180
-e GRAFANA_PASSWORD=$GRAFANA_PASSWORD \
181181
-e DEBUG=${DEBUG:-""} \
182182
-e LOG_JSON=1 \
183-
-e LOG_LEVEL=verbose \
183+
-e LOG_LEVEL=${LOG_LEVEL:-"verbose"} \
184184
aztecprotocol/end-to-end:$AZTEC_DOCKER_TAG $TEST

yarn-project/end-to-end/src/quality_of_service/alert_checker.ts

+26-4
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@ import * as yaml from 'js-yaml';
66
export interface AlertConfig {
77
alert: string;
88
expr: string;
9+
start?: number;
10+
end?: number;
11+
step?: number;
912
for: string;
1013
labels: Record<string, string>;
1114
annotations: Record<string, string>;
@@ -18,7 +21,7 @@ export interface AlertCheckerConfig {
1821

1922
// This config is good if you're running the otel-lgtm stack locally
2023
const DEFAULT_CONFIG: AlertCheckerConfig = {
21-
grafanaEndpoint: 'http://localhost:3000/api/datasources/proxy/uid/prometheus/api/v1/query',
24+
grafanaEndpoint: 'http://localhost:3000/api/datasources/proxy/uid/prometheus/api/v1',
2225
grafanaCredentials: 'admin:admin',
2326
};
2427

@@ -41,10 +44,29 @@ export class AlertChecker {
4144
return data.alerts;
4245
}
4346

44-
private async queryGrafana(expr: string): Promise<number> {
47+
private async queryGrafana({ expr, start, end, step }: AlertConfig): Promise<number> {
4548
const credentials = Buffer.from(this.config.grafanaCredentials).toString('base64');
4649

47-
const response = await fetch(`${this.config.grafanaEndpoint}?query=${encodeURIComponent(expr)}`, {
50+
let query = `query=${encodeURIComponent(expr)}`;
51+
let action = 'query';
52+
53+
if (start) {
54+
action = 'query_range';
55+
query += `&start=${start}`;
56+
}
57+
58+
if (end) {
59+
query += `&end=${end}`;
60+
}
61+
62+
if (step) {
63+
query += `&step=${step}`;
64+
}
65+
66+
const urlString = `${this.config.grafanaEndpoint}/${action}?${query}`;
67+
this.logger.debug(`Querying Grafana: ${urlString}`);
68+
69+
const response = await fetch(urlString, {
4870
headers: {
4971
Authorization: `Basic ${credentials}`,
5072
},
@@ -65,7 +87,7 @@ export class AlertChecker {
6587
for (const alert of alerts) {
6688
this.logger.info(`Checking alert: ${JSON.stringify(alert)}`);
6789

68-
const metricValue = await this.queryGrafana(alert.expr);
90+
const metricValue = await this.queryGrafana(alert);
6991
this.logger.info(`Metric value: ${metricValue}`);
7092
if (metricValue > 0) {
7193
this.logger.error(`Alert ${alert.alert} triggered! Value: ${metricValue}`);

yarn-project/end-to-end/src/spartan/gating-passive.test.ts

+15-7
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,14 @@ const qosAlerts: AlertConfig[] = [
2626
for: '10m',
2727
annotations: {},
2828
},
29+
{
30+
// Checks that we are not syncing from scratch each time we reboot
31+
alert: 'ArchiverL1BlocksSynced',
32+
expr: 'rate(aztec_archiver_l1_blocks_synced[1m]) > 0.5',
33+
labels: { severity: 'error' },
34+
for: '10m',
35+
annotations: {},
36+
},
2937
];
3038

3139
const config = setupEnvironment(process.env);
@@ -52,6 +60,12 @@ describe('a test that passively observes the network in the presence of network
5260
const MAX_MISSED_SLOT_PERCENT = 0.6;
5361

5462
afterAll(async () => {
63+
await startPortForward({
64+
resource: `svc/metrics-grafana`,
65+
namespace: 'metrics',
66+
containerPort: config.CONTAINER_METRICS_PORT,
67+
hostPort: config.HOST_METRICS_PORT,
68+
});
5569
await runAlertCheck(config, qosAlerts, debugLogger);
5670
});
5771

@@ -69,12 +83,6 @@ describe('a test that passively observes the network in the presence of network
6983
hostPort: HOST_ETHEREUM_PORT,
7084
});
7185

72-
await startPortForward({
73-
resource: `svc/metrics-grafana`,
74-
namespace: 'metrics',
75-
containerPort: config.CONTAINER_METRICS_PORT,
76-
hostPort: config.HOST_METRICS_PORT,
77-
});
7886
const client = await createCompatibleClient(PXE_URL, debugLogger);
7987
const ethCheatCodes = new EthCheatCodes(ETHEREUM_HOST);
8088
const rollupCheatCodes = new RollupCheatCodes(
@@ -93,7 +101,7 @@ describe('a test that passively observes the network in the presence of network
93101
// note, don't forget that normally an epoch doesn't need epochDuration worth of blocks,
94102
// but here we do double duty:
95103
// we want a handful of blocks, and we want to pass the epoch boundary
96-
await awaitL2BlockNumber(rollupCheatCodes, epochDuration, 60 * 5, debugLogger);
104+
await awaitL2BlockNumber(rollupCheatCodes, epochDuration, 60 * 6, debugLogger);
97105

98106
let deploymentOutput: string = '';
99107
deploymentOutput = await applyNetworkShaping({

yarn-project/end-to-end/src/spartan/utils.ts

+1-1
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ const k8sLocalConfigSchema = z.object({
2323
HOST_METRICS_PORT: z.coerce.number().min(1, 'HOST_METRICS_PORT env variable must be set'),
2424
CONTAINER_METRICS_PORT: z.coerce.number().default(80),
2525
GRAFANA_PASSWORD: z.string().min(1, 'GRAFANA_PASSWORD env variable must be set'),
26-
METRICS_API_PATH: z.string().default('/api/datasources/proxy/uid/spartan-metrics-prometheus/api/v1/query'),
26+
METRICS_API_PATH: z.string().default('/api/datasources/proxy/uid/spartan-metrics-prometheus/api/v1'),
2727
SPARTAN_DIR: z.string().min(1, 'SPARTAN_DIR env variable must be set'),
2828
K8S: z.literal('local'),
2929
});

yarn-project/telemetry-client/src/metrics.ts

+1
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ export const MEMPOOL_PROVER_QUOTE_COUNT = 'aztec.mempool.prover_quote_count';
3636
export const MEMPOOL_PROVER_QUOTE_SIZE = 'aztec.mempool.prover_quote_size';
3737

3838
export const ARCHIVER_SYNC_DURATION = 'aztec.archiver.sync_duration';
39+
export const ARCHIVER_L1_BLOCKS_SYNCED = 'aztec.archiver.l1_blocks_synced';
3940
export const ARCHIVER_BLOCK_HEIGHT = 'aztec.archiver.block_height';
4041
export const ARCHIVER_BLOCK_SIZE = 'aztec.archiver.block_size';
4142
export const ARCHIVER_ROLLUP_PROOF_DELAY = 'aztec.archiver.rollup_proof_delay';

yarn-project/telemetry-client/src/noop.ts

+4
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@ export class NoopTelemetryClient implements TelemetryClient {
1515
return Promise.resolve();
1616
}
1717

18+
flush(): Promise<void> {
19+
return Promise.resolve();
20+
}
21+
1822
isEnabled() {
1923
return false;
2024
}

yarn-project/telemetry-client/src/otel.ts

+8
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,14 @@ export class OpenTelemetryClient implements TelemetryClient {
9595
return true;
9696
}
9797

98+
public async flush() {
99+
await Promise.all([
100+
this.meterProvider.forceFlush(),
101+
this.loggerProvider.forceFlush(),
102+
this.traceProvider instanceof NodeTracerProvider ? this.traceProvider.forceFlush() : Promise.resolve(),
103+
]);
104+
}
105+
98106
public async stop() {
99107
const flushAndShutdown = async (provider: { forceFlush: () => Promise<void>; shutdown: () => Promise<void> }) => {
100108
await provider.forceFlush();

yarn-project/telemetry-client/src/telemetry.ts

+6-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ import {
1818
import * as Attributes from './attributes.js';
1919
import * as Metrics from './metrics.js';
2020

21-
export { ValueType, Span } from '@opentelemetry/api';
21+
export { Span, ValueType } from '@opentelemetry/api';
2222

2323
type ValuesOf<T> = T extends Record<string, infer U> ? U : never;
2424

@@ -115,6 +115,11 @@ export interface TelemetryClient {
115115
* Stops the telemetry client.
116116
*/
117117
stop(): Promise<void>;
118+
119+
/**
120+
* Flushes the telemetry client.
121+
*/
122+
flush(): Promise<void>;
118123
}
119124

120125
/** Objects that adhere to this interface can use @trackSpan */

0 commit comments

Comments
 (0)