From 06be9fbfe1c91e9b0f1ce90f1e75c89a598aded9 Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Thu, 12 Dec 2024 19:19:35 -0300 Subject: [PATCH 01/37] feat: add metrics to proxy feat: add request latency and count metrics feat: add grafana dashboard feat: add metrics endpoint to proxy config chore: update changelog docs: update README chore: add prometheus.yml --- CHANGELOG.md | 3 +- Cargo.lock | 2 + bin/tx-prover/Cargo.toml | 2 + bin/tx-prover/README.md | 20 + bin/tx-prover/prometheus.yml | 11 + bin/tx-prover/proxy_grafana_dashboard.json | 864 +++++++++++++++++++++ bin/tx-prover/src/commands/mod.rs | 6 + bin/tx-prover/src/commands/proxy.rs | 8 + bin/tx-prover/src/proxy/metrics.rs | 49 ++ bin/tx-prover/src/proxy/mod.rs | 59 +- 10 files changed, 1018 insertions(+), 6 deletions(-) create mode 100644 bin/tx-prover/prometheus.yml create mode 100644 bin/tx-prover/proxy_grafana_dashboard.json create mode 100644 bin/tx-prover/src/proxy/metrics.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index f9c94b0fa..51bcbedc4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,6 @@ ### Changes -- Added tracing to the `miden-tx-prover` CLI (#1014). - Added health check endpoints to the prover service (#1006). - Implemented serialization for `AccountHeader` (#996). - Updated Pingora crates to 0.4 and added polling time to the configuration file (#997). @@ -19,6 +18,8 @@ - [BREAKING] Added `miden::note::get_script_hash` procedure (#995). - [BREAKING] Refactor error messages in `miden-lib` and `miden-tx` and use `thiserror` 2.0 (#1005). - Removed workers list from the proxy configuration file (#1018). +- Added tracing to the `miden-tx-prover` CLI (#1014). +- Added metrics to the `miden-tx-prover` proxy (#1017). 
## 0.6.2 (2024-11-20) diff --git a/Cargo.lock b/Cargo.lock index ae5a66c47..c597e0abe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2018,6 +2018,7 @@ dependencies = [ "clap 4.5.21", "figment", "getrandom", + "lazy_static", "miden-lib", "miden-objects", "miden-tx", @@ -2032,6 +2033,7 @@ dependencies = [ "pingora-core", "pingora-limits", "pingora-proxy", + "prometheus", "prost", "prost-build", "protox", diff --git a/bin/tx-prover/Cargo.toml b/bin/tx-prover/Cargo.toml index 174b3c8eb..21fdcbdf2 100644 --- a/bin/tx-prover/Cargo.toml +++ b/bin/tx-prover/Cargo.toml @@ -54,6 +54,8 @@ figment = { version = "0.10", features = ["toml", "env"] } miden-lib = { workspace = true, default-features = false } miden-objects = { workspace = true, default-features = false } miden-tx = { workspace = true, default-features = false } +lazy_static = "1.5" +prometheus = "0.13" prost = { version = "0.13", default-features = false, features = ["derive"] } reqwest = { version = "0.11" } serde = { version = "1.0", features = ["derive"] } diff --git a/bin/tx-prover/README.md b/bin/tx-prover/README.md index 7dba23387..9f3a695a1 100644 --- a/bin/tx-prover/README.md +++ b/bin/tx-prover/README.md @@ -120,6 +120,26 @@ Then access the Jaeger UI at `http://localhost:16686/`. If Docker is not an option, Jaeger can also be set up directly on your machine or hosted in the cloud. See the [Jaeger documentation](https://www.jaegertracing.io/docs/) for alternative installation methods. +## Metrics + +The proxy includes a service that provides metrics for [Prometheus](https://prometheus.io/docs/introduction/overview/). This service is always enabled and uses the host and port defined in the configuration file. The metrics are available at the `/metrics` endpoint. + +To consume and display the metrics, you can use Prometheus and Grafana. The simplest way to install Prometheus and Grafana is by using Docker containers. 
To do so, run: + +```bash +docker run \ + -d \ + -p 9090:9090 \ + -v /path/to/prometheus.yml:/etc/prometheus/prometheus.yml \ + prom/prometheus + +docker run -d -p 3000:3000 --name grafana grafana/grafana-enterprise:latest +``` + +A Prometheus configuration file (`prometheus.yml`) is provided in this repository; you will need to modify the `scrape_configs` section to include the host and port of the proxy service. + +Then, to add the new Prometheus collector as a datasource for Grafana, you can [follow this tutorial](https://grafana.com/docs/grafana-cloud/connect-externally-hosted/existing-datasource/). A Grafana dashboard under the name `proxy_grafana_dashboard.json` is provided; see this [link](https://grafana.com/docs/grafana/latest/dashboards/build-dashboards/import-dashboards/) to import it. + ## Features Description of this crate's feature: diff --git a/bin/tx-prover/prometheus.yml b/bin/tx-prover/prometheus.yml new file mode 100644 index 000000000..e96926acf --- /dev/null +++ b/bin/tx-prover/prometheus.yml @@ -0,0 +1,11 @@ +global: + scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute. + evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute. + # scrape_timeout is set to the global default (10s). 
+ +# A scrape configuration containing exactly one endpoint to scrape: +scrape_configs: + - job_name: "proxy" + # Here you need to specify the address of the Prometheus service endpoint in the proxy + static_configs: + - targets: ["127.0.0.1:6192"] diff --git a/bin/tx-prover/proxy_grafana_dashboard.json b/bin/tx-prover/proxy_grafana_dashboard.json new file mode 100644 index 000000000..8f15d5ce9 --- /dev/null +++ b/bin/tx-prover/proxy_grafana_dashboard.json @@ -0,0 +1,864 @@ +{ + "__inputs": [ + { + "name": "DS_PROXY", + "label": "proxy", + "description": "Dashboard of the proxy service", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "panel", + "id": "barchart", + "name": "Bar chart", + "version": "" + }, + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "11.4.0" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROXY}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + 
"w": 6, + "x": 0, + "y": 0 + }, + "id": 7, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "increase(request_count[1m])", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROXY}" + } + } + ], + "title": "Requests/m", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROXY}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 0 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "worker_request_count", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROXY}" + } + } + ], + "title": "Total request count per worker", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": 
"${DS_PROXY}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "worker_availability", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROXY}" + } + } + ], + "title": "Worker availability", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROXY}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 0 + }, + "id": 4, + 
"options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "request_failure_count", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROXY}" + } + } + ], + "title": "Request failure count", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROXY}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 7, + "x": 0, + "y": 7 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "disableTextWrap": false, + "editorMode": 
"builder", + "expr": "increase(request_count[1m])", + "fullMetaSearch": false, + "includeNullMetadata": true, + "interval": "", + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROXY}" + } + } + ], + "title": "Requests/m", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROXY}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 15 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 7, + "y": 7 + }, + "id": 3, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "queue_size", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROXY}" + } + } + ], + "title": "Queue size", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROXY}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + 
"scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 7, + "x": 13, + "y": 7 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROXY}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "worker_utilization", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROXY}" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "queue_size", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "C", + "useBackend": false + } + ], + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROXY}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 4, + "x": 20, + "y": 7 + }, + "id": 8, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + 
"lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "rate_limit_violations", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROXY}" + } + } + ], + "title": "Rate limit violations", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROXY}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 1, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 15 + }, + "id": 9, + "options": { + "barRadius": 0, + "barWidth": 0.97, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "orientation": "auto", + "showValue": "auto", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "exemplar": false, + "expr": "histogram_quantile(0.95, sum by(le) (rate(request_latency_bucket[$__rate_interval])))", + 
"fullMetaSearch": false, + "includeNullMetadata": false, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROXY}" + } + } + ], + "title": "Request latency", + "type": "barchart" + } + ], + "refresh": "5s", + "schemaVersion": 40, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "2024-12-19T14:36:56.711Z", + "to": "2024-12-19T14:38:02.492Z" + }, + "timepicker": {}, + "timezone": "browser", + "title": "PROXY", + "uid": "be7bobzl5fr40f", + "version": 23, + "weekStart": "" +} diff --git a/bin/tx-prover/src/commands/mod.rs b/bin/tx-prover/src/commands/mod.rs index 8d9f494f5..68c23e824 100644 --- a/bin/tx-prover/src/commands/mod.rs +++ b/bin/tx-prover/src/commands/mod.rs @@ -42,6 +42,10 @@ pub struct ProxyConfig { pub available_workers_polling_time_ms: u64, /// Health check interval in seconds. pub health_check_interval_secs: u64, + /// Prometheus metrics host. + pub prometheus_host: String, + /// Prometheus metrics port. 
+ pub prometheus_port: u16, } impl Default for ProxyConfig { @@ -56,6 +60,8 @@ impl Default for ProxyConfig { max_req_per_sec: 5, available_workers_polling_time_ms: 20, health_check_interval_secs: 1, + prometheus_host: "0.0.0.0".into(), + prometheus_port: 6192, } } } diff --git a/bin/tx-prover/src/commands/proxy.rs b/bin/tx-prover/src/commands/proxy.rs index b0a9edd31..01015fc33 100644 --- a/bin/tx-prover/src/commands/proxy.rs +++ b/bin/tx-prover/src/commands/proxy.rs @@ -65,6 +65,14 @@ impl StartProxy { http_server_options.h2c = true; logic.server_options = Some(http_server_options); + // Enable Prometheus metrics + let mut prometheus_service_http = + pingora::services::listening::Service::prometheus_http_service(); + prometheus_service_http.add_tcp( + format!("{}:{}", proxy_config.prometheus_host, proxy_config.prometheus_port).as_str(), + ); + + server.add_service(prometheus_service_http); server.add_service(health_check_service); server.add_service(lb); tokio::task::spawn_blocking(|| server.run_forever()) diff --git a/bin/tx-prover/src/proxy/metrics.rs b/bin/tx-prover/src/proxy/metrics.rs new file mode 100644 index 000000000..a6dbdbcba --- /dev/null +++ b/bin/tx-prover/src/proxy/metrics.rs @@ -0,0 +1,49 @@ +use lazy_static::lazy_static; +use prometheus::{ + register_histogram, register_int_counter, register_int_counter_vec, register_int_gauge, + Histogram, IntCounter, IntCounterVec, IntGauge, +}; + +lazy_static! 
{ + // Queue Metrics + pub static ref QUEUE_SIZE: IntGauge = + register_int_gauge!("queue_size", "Number of requests in the queue").unwrap(); + pub static ref QUEUE_LATENCY: Histogram = + register_histogram!("queue_latency", "Time requests spend in the queue", vec![ + 0.1, 0.5, 1.0, 2.0, 5.0, 10.0 + ]).unwrap(); + pub static ref QUEUE_DROP_COUNT: IntCounter = + register_int_counter!("queue_drop_count", "Number of requests dropped due to a full queue").unwrap(); + + // Worker Metrics + pub static ref WORKER_COUNT: IntGauge = + register_int_gauge!("worker_count", "Number of workers").unwrap(); + pub static ref WORKER_UNHEALTHY: IntCounter = + register_int_counter!("worker_unhealthy", "Number of unhealthy workers").unwrap(); + pub static ref WORKER_UTILIZATION: IntGauge = + register_int_gauge!("worker_utilization", "Number of requests being processed by workers").unwrap(); + pub static ref WORKER_REQUEST_COUNT: IntCounterVec = + register_int_counter_vec!( + "worker_request_count", + "Number of requests processed by each worker", + &["worker_id"] + ).unwrap(); + + // Request Metrics + pub static ref REQUEST_FAILURE_COUNT: IntCounter = + register_int_counter!("request_failure_count", "Number of failed requests").unwrap(); + pub static ref REQUEST_RETRIES: IntCounter = + register_int_counter!("request_retries", "Number of request retries").unwrap(); + pub static ref REQUEST_COUNT: IntCounter = + register_int_counter!("request_count", "Number of requests processed").unwrap(); + pub static ref REQUEST_LATENCY: Histogram = + register_histogram!("request_latency", "Time requests take to process", vec![ + 0.1, 0.5, 1.0, 2.0, 5.0, 10.0 + ]).unwrap(); + + // Rate Limiting Metrics + pub static ref RATE_LIMITED_REQUESTS: IntCounter = + register_int_counter!("rate_limited_requests", "Number of requests blocked due to rate limiting").unwrap(); + pub static ref RATE_LIMIT_VIOLATIONS: IntCounter = + register_int_counter!("rate_limit_violations", "Number of rate limit violations by 
clients").unwrap(); +} diff --git a/bin/tx-prover/src/proxy/mod.rs b/bin/tx-prover/src/proxy/mod.rs index 49ec0d079..defd43bc4 100644 --- a/bin/tx-prover/src/proxy/mod.rs +++ b/bin/tx-prover/src/proxy/mod.rs @@ -1,7 +1,18 @@ -use std::{collections::VecDeque, future::Future, pin::Pin, sync::Arc, time::Duration}; +use std::{ + collections::VecDeque, + future::Future, + pin::Pin, + sync::Arc, + time::{Duration, Instant}, +}; use async_trait::async_trait; use bytes::Bytes; +use metrics::{ + QUEUE_DROP_COUNT, QUEUE_LATENCY, QUEUE_SIZE, RATE_LIMITED_REQUESTS, RATE_LIMIT_VIOLATIONS, + REQUEST_COUNT, REQUEST_FAILURE_COUNT, REQUEST_LATENCY, REQUEST_RETRIES, WORKER_COUNT, + WORKER_REQUEST_COUNT, WORKER_UNHEALTHY, WORKER_UTILIZATION, +}; use once_cell::sync::Lazy; use pingora::{ http::ResponseHeader, @@ -31,6 +42,7 @@ use crate::{ }, }; +mod metrics; mod worker; /// Localhost address @@ -87,8 +99,10 @@ impl LoadBalancerState { /// If no worker is available, it will return None. pub async fn pop_available_worker(&self) -> Option { let mut available_workers = self.workers.write().await; + WORKER_COUNT.set(available_workers.len() as i64); available_workers.iter_mut().find(|w| w.is_available()).map(|w| { w.set_availability(false); + WORKER_UTILIZATION.inc(); w.clone() }) } @@ -100,7 +114,9 @@ impl LoadBalancerState { let mut available_workers = self.workers.write().await; if let Some(w) = available_workers.iter_mut().find(|w| *w == &worker) { w.set_availability(true); + WORKER_UTILIZATION.dec(); } + WORKER_COUNT.set(available_workers.len() as i64); } /// Updates the list of available workers based on the given action ("add" or "remove"). 
@@ -154,6 +170,7 @@ impl LoadBalancerState { } info!("Workers updated: {:?}", workers); + WORKER_COUNT.set(workers.len() as i64); Ok(()) } @@ -237,6 +254,7 @@ impl LoadBalancerState { if worker.is_healthy().await { healthy_workers.push(worker.clone()); } else { + WORKER_UNHEALTHY.inc(); warn!("Worker {} is not healthy", worker.address()); } } @@ -254,12 +272,13 @@ static RATE_LIMITER: Lazy = Lazy::new(|| Rate::new(Duration::from_secs(1)) /// Request queue holds the list of requests that are waiting to be processed by the workers. /// It is used to keep track of the order of the requests to then assign them to the workers. pub struct RequestQueue { - queue: RwLock>, + queue: RwLock>, } impl RequestQueue { /// Create a new empty request queue pub fn new() -> Self { + QUEUE_SIZE.set(0); Self { queue: RwLock::new(VecDeque::new()) } } @@ -270,20 +289,28 @@ impl RequestQueue { /// Enqueue a request pub async fn enqueue(&self, request_id: Uuid) { + QUEUE_SIZE.inc(); let mut queue = self.queue.write().await; - queue.push_back(request_id); + queue.push_back((request_id, Instant::now())); } /// Dequeue a request pub async fn dequeue(&self) -> Option { let mut queue = self.queue.write().await; - queue.pop_front() + // If the queue was empty, the queue size does not change + if let Some((request_id, queued_time)) = queue.pop_front() { + QUEUE_SIZE.dec(); + QUEUE_LATENCY.observe(queued_time.elapsed().as_secs_f64()); + Some(request_id) + } else { + None + } } /// Peek at the first request in the queue pub async fn peek(&self) -> Option { let queue = self.queue.read().await; - queue.front().copied() + queue.front().copied().map(|(request_id, _)| request_id) } } @@ -306,6 +333,8 @@ pub struct RequestContext { worker: Option, /// Parent span for the request parent_span: Span, + /// Time when the request was created + created_at: Instant, } impl RequestContext { @@ -317,11 +346,13 @@ impl RequestContext { request_id, worker: None, parent_span: info_span!(target: MIDEN_TX_PROVER, 
"proxy:new_request", request_id = request_id.to_string()), + created_at: Instant::now(), } } /// Set the worker that will process the request fn set_worker(&mut self, worker: Worker) { + WORKER_REQUEST_COUNT.with_label_values(&[&worker.address()]).inc(); self.worker = Some(worker); } } @@ -374,6 +405,9 @@ impl ProxyHttp for LoadBalancer { where Self::CTX: Send + Sync, { + // Increment the request count + REQUEST_COUNT.inc(); + // Extract the client address early let client_addr = match session.client_addr() { Some(addr) => addr.to_string(), @@ -402,6 +436,13 @@ impl ProxyHttp for LoadBalancer { // Rate limit the request if curr_window_requests > self.0.max_req_per_sec { + RATE_LIMITED_REQUESTS.inc(); + + // Only count a violation the first time in a given window + if curr_window_requests == self.0.max_req_per_sec + 1 { + RATE_LIMIT_VIOLATIONS.inc(); + } + return create_too_many_requests_response(session, self.0.max_req_per_sec).await; }; @@ -412,6 +453,7 @@ impl ProxyHttp for LoadBalancer { // Check if the queue is full if queue_len >= self.0.max_queue_items { + QUEUE_DROP_COUNT.inc(); return create_queue_full_response(session).await; } @@ -519,6 +561,7 @@ impl ProxyHttp for LoadBalancer { if ctx.tries > self.0.max_retries_per_request { return e; } + REQUEST_RETRIES.inc(); ctx.tries += 1; e.set_retry(true); e @@ -534,6 +577,7 @@ impl ProxyHttp for LoadBalancer { Self::CTX: Send + Sync, { if let Some(e) = e { + REQUEST_FAILURE_COUNT.inc(); error!("Error: {:?}", e); } @@ -541,6 +585,8 @@ impl ProxyHttp for LoadBalancer { if let Some(worker) = ctx.worker.take() { self.0.add_available_worker(worker).await; } + + REQUEST_LATENCY.observe(ctx.created_at.elapsed().as_secs_f64()); } // The following methods are a copy of the default implementation defined in the trait, but @@ -725,6 +771,9 @@ impl BackgroundService for LoadBalancerState { // Update the worker list with healthy workers *workers = healthy_workers; + // Update the worker count metric + 
WORKER_COUNT.set(workers.len() as i64); + // Sleep for the defined interval before the next health check sleep(self.health_check_frequency).await; } From ff098acfb5b2232b010c9e3d748a5bbb66f34287 Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Thu, 19 Dec 2024 18:14:49 -0300 Subject: [PATCH 02/37] review: update RequestContext documentation --- bin/tx-prover/src/proxy/mod.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bin/tx-prover/src/proxy/mod.rs b/bin/tx-prover/src/proxy/mod.rs index defd43bc4..6035be60d 100644 --- a/bin/tx-prover/src/proxy/mod.rs +++ b/bin/tx-prover/src/proxy/mod.rs @@ -321,8 +321,10 @@ static QUEUE: Lazy = Lazy::new(RequestQueue::new); // ================================================================================================ /// Custom context for the request/response lifecycle +/// /// We use this context to keep track of the number of tries for a request, the unique ID for the -/// request, and the worker that will process the request. +/// request, the worker that will process the request, a span that will be used for traces along +/// the transaction execution, and a timer to track how long the request took. 
#[derive(Debug)] pub struct RequestContext { /// Number of tries for the request From 5f6096ef9c4cac52ae43072454fea1a7594b263e Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Fri, 20 Dec 2024 11:30:00 -0300 Subject: [PATCH 03/37] review: replace lazy_static with LazyLock --- Cargo.lock | 1 - bin/tx-prover/Cargo.toml | 1 - bin/tx-prover/src/proxy/metrics.rs | 106 ++++++++++++++++++----------- 3 files changed, 65 insertions(+), 43 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c597e0abe..849bff638 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2018,7 +2018,6 @@ dependencies = [ "clap 4.5.21", "figment", "getrandom", - "lazy_static", "miden-lib", "miden-objects", "miden-tx", diff --git a/bin/tx-prover/Cargo.toml b/bin/tx-prover/Cargo.toml index 21fdcbdf2..5ee1a67e8 100644 --- a/bin/tx-prover/Cargo.toml +++ b/bin/tx-prover/Cargo.toml @@ -54,7 +54,6 @@ figment = { version = "0.10", features = ["toml", "env"] } miden-lib = { workspace = true, default-features = false } miden-objects = { workspace = true, default-features = false } miden-tx = { workspace = true, default-features = false } -lazy_static = "1.5" prometheus = "0.13" prost = { version = "0.13", default-features = false, features = ["derive"] } reqwest = { version = "0.11" } diff --git a/bin/tx-prover/src/proxy/metrics.rs b/bin/tx-prover/src/proxy/metrics.rs index a6dbdbcba..d5a4424f7 100644 --- a/bin/tx-prover/src/proxy/metrics.rs +++ b/bin/tx-prover/src/proxy/metrics.rs @@ -1,49 +1,73 @@ -use lazy_static::lazy_static; +use std::sync::LazyLock; + use prometheus::{ register_histogram, register_int_counter, register_int_counter_vec, register_int_gauge, Histogram, IntCounter, IntCounterVec, IntGauge, }; -lazy_static! 
{ - // Queue Metrics - pub static ref QUEUE_SIZE: IntGauge = - register_int_gauge!("queue_size", "Number of requests in the queue").unwrap(); - pub static ref QUEUE_LATENCY: Histogram = - register_histogram!("queue_latency", "Time requests spend in the queue", vec![ - 0.1, 0.5, 1.0, 2.0, 5.0, 10.0 - ]).unwrap(); - pub static ref QUEUE_DROP_COUNT: IntCounter = - register_int_counter!("queue_drop_count", "Number of requests dropped due to a full queue").unwrap(); +// Queue Metrics +pub static QUEUE_SIZE: LazyLock = + LazyLock::new(|| register_int_gauge!("queue_size", "Number of requests in the queue").unwrap()); +pub static QUEUE_LATENCY: LazyLock = LazyLock::new(|| { + register_histogram!( + "queue_latency", + "Time requests spend in the queue", + vec![0.1, 0.5, 1.0, 2.0, 5.0, 10.0] + ) + .unwrap() +}); +pub static QUEUE_DROP_COUNT: LazyLock = LazyLock::new(|| { + register_int_counter!("queue_drop_count", "Number of requests dropped due to a full queue") + .unwrap() +}); - // Worker Metrics - pub static ref WORKER_COUNT: IntGauge = - register_int_gauge!("worker_count", "Number of workers").unwrap(); - pub static ref WORKER_UNHEALTHY: IntCounter = - register_int_counter!("worker_unhealthy", "Number of unhealthy workers").unwrap(); - pub static ref WORKER_UTILIZATION: IntGauge = - register_int_gauge!("worker_utilization", "Number of requests being processed by workers").unwrap(); - pub static ref WORKER_REQUEST_COUNT: IntCounterVec = - register_int_counter_vec!( - "worker_request_count", - "Number of requests processed by each worker", - &["worker_id"] - ).unwrap(); +// Worker Metrics +pub static WORKER_COUNT: LazyLock = + LazyLock::new(|| register_int_gauge!("worker_count", "Number of workers").unwrap()); +pub static WORKER_UNHEALTHY: LazyLock = LazyLock::new(|| { + register_int_counter!("worker_unhealthy", "Number of unhealthy workers").unwrap() +}); +pub static WORKER_UTILIZATION: LazyLock = LazyLock::new(|| { + register_int_gauge!("worker_utilization", "Number of 
requests being processed by workers") + .unwrap() +}); +pub static WORKER_REQUEST_COUNT: LazyLock = LazyLock::new(|| { + register_int_counter_vec!( + "worker_request_count", + "Number of requests processed by each worker", + &["worker_id"] + ) + .unwrap() +}); - // Request Metrics - pub static ref REQUEST_FAILURE_COUNT: IntCounter = - register_int_counter!("request_failure_count", "Number of failed requests").unwrap(); - pub static ref REQUEST_RETRIES: IntCounter = - register_int_counter!("request_retries", "Number of request retries").unwrap(); - pub static ref REQUEST_COUNT: IntCounter = - register_int_counter!("request_count", "Number of requests processed").unwrap(); - pub static ref REQUEST_LATENCY: Histogram = - register_histogram!("request_latency", "Time requests take to process", vec![ - 0.1, 0.5, 1.0, 2.0, 5.0, 10.0 - ]).unwrap(); +// Request Metrics +pub static REQUEST_FAILURE_COUNT: LazyLock = LazyLock::new(|| { + register_int_counter!("request_failure_count", "Number of failed requests").unwrap() +}); +pub static REQUEST_RETRIES: LazyLock = LazyLock::new(|| { + register_int_counter!("request_retries", "Number of request retries").unwrap() +}); +pub static REQUEST_COUNT: LazyLock = LazyLock::new(|| { + register_int_counter!("request_count", "Number of requests processed").unwrap() +}); +pub static REQUEST_LATENCY: LazyLock = LazyLock::new(|| { + register_histogram!( + "request_latency", + "Time requests take to process", + vec![0.1, 0.5, 1.0, 2.0, 5.0, 10.0] + ) + .unwrap() +}); - // Rate Limiting Metrics - pub static ref RATE_LIMITED_REQUESTS: IntCounter = - register_int_counter!("rate_limited_requests", "Number of requests blocked due to rate limiting").unwrap(); - pub static ref RATE_LIMIT_VIOLATIONS: IntCounter = - register_int_counter!("rate_limit_violations", "Number of rate limit violations by clients").unwrap(); -} +// Rate Limiting Metrics +pub static RATE_LIMITED_REQUESTS: LazyLock = LazyLock::new(|| { + register_int_counter!( + 
"rate_limited_requests", + "Number of requests blocked due to rate limiting" + ) + .unwrap() +}); +pub static RATE_LIMIT_VIOLATIONS: LazyLock = LazyLock::new(|| { + register_int_counter!("rate_limit_violations", "Number of rate limit violations by clients") + .unwrap() +}); From c8506a4134240b156abc63acd6d9bf94e649225c Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Mon, 23 Dec 2024 16:19:51 -0300 Subject: [PATCH 04/37] review: use always localhost for metrics host --- bin/tx-prover/README.md | 4 +++- bin/tx-prover/src/commands/mod.rs | 3 --- bin/tx-prover/src/commands/proxy.rs | 5 ++--- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/bin/tx-prover/README.md b/bin/tx-prover/README.md index 9f3a695a1..9a5fc6882 100644 --- a/bin/tx-prover/README.md +++ b/bin/tx-prover/README.md @@ -61,6 +61,8 @@ max_retries_per_request = 1 max_req_per_sec = 5 # Interval to check the health of the workers health_check_interval_secs = 1 +# Port of the metrics server +prometheus_port = 6192 ``` Then, to start the proxy service, you will need to run: @@ -122,7 +124,7 @@ If Docker is not an option, Jaeger can also be set up directly on your machine o ## Metrics -The proxy includes a service that provides metrics for [Prometheus](https://prometheus.io/docs/introduction/overview/). This service is always enabled and uses the host and port defined in the configuration file. The metrics are available at the `/metrics` endpoint. +The proxy includes a service that provides metrics for [Prometheus](https://prometheus.io/docs/introduction/overview/). This service is always enabled and uses the port defined in the configuration file. The metrics are available at the `/metrics` endpoint. To consume and display the metrics, you can use Prometheus and Grafana. The simplest way to install Prometheus and Grafana is by using Docker containers. 
To do so, run: diff --git a/bin/tx-prover/src/commands/mod.rs b/bin/tx-prover/src/commands/mod.rs index 68c23e824..caff3dc44 100644 --- a/bin/tx-prover/src/commands/mod.rs +++ b/bin/tx-prover/src/commands/mod.rs @@ -42,8 +42,6 @@ pub struct ProxyConfig { pub available_workers_polling_time_ms: u64, /// Health check interval in seconds. pub health_check_interval_secs: u64, - /// Prometheus metrics host. - pub prometheus_host: String, /// Prometheus metrics port. pub prometheus_port: u16, } @@ -60,7 +58,6 @@ impl Default for ProxyConfig { max_req_per_sec: 5, available_workers_polling_time_ms: 20, health_check_interval_secs: 1, - prometheus_host: "0.0.0.0".into(), prometheus_port: 6192, } } diff --git a/bin/tx-prover/src/commands/proxy.rs b/bin/tx-prover/src/commands/proxy.rs index 01015fc33..8880b9285 100644 --- a/bin/tx-prover/src/commands/proxy.rs +++ b/bin/tx-prover/src/commands/proxy.rs @@ -68,9 +68,8 @@ impl StartProxy { // Enable Prometheus metrics let mut prometheus_service_http = pingora::services::listening::Service::prometheus_http_service(); - prometheus_service_http.add_tcp( - format!("{}:{}", proxy_config.prometheus_host, proxy_config.prometheus_port).as_str(), - ); + prometheus_service_http + .add_tcp(format!("{}:{}", "127.0.0.1", proxy_config.prometheus_port).as_str()); server.add_service(prometheus_service_http); server.add_service(health_check_service); From 00a76b2e06491e73c27c8048ca48b5a8a219f1b1 Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Fri, 27 Dec 2024 17:36:48 -0300 Subject: [PATCH 05/37] review: improve separators in metrics definition --- bin/tx-prover/src/proxy/metrics.rs | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/bin/tx-prover/src/proxy/metrics.rs b/bin/tx-prover/src/proxy/metrics.rs index d5a4424f7..d731d4608 100644 --- a/bin/tx-prover/src/proxy/metrics.rs +++ b/bin/tx-prover/src/proxy/metrics.rs @@ -5,7 +5,9 @@ use prometheus::{ Histogram, IntCounter, IntCounterVec, IntGauge, }; -// Queue 
Metrics +// QUEUE METRICS +// ================================================================================================ + pub static QUEUE_SIZE: LazyLock = LazyLock::new(|| register_int_gauge!("queue_size", "Number of requests in the queue").unwrap()); pub static QUEUE_LATENCY: LazyLock = LazyLock::new(|| { @@ -21,7 +23,9 @@ pub static QUEUE_DROP_COUNT: LazyLock = LazyLock::new(|| { .unwrap() }); -// Worker Metrics +// WORKER METRICS +// ================================================================================================ + pub static WORKER_COUNT: LazyLock = LazyLock::new(|| register_int_gauge!("worker_count", "Number of workers").unwrap()); pub static WORKER_UNHEALTHY: LazyLock = LazyLock::new(|| { @@ -40,7 +44,9 @@ pub static WORKER_REQUEST_COUNT: LazyLock = LazyLock::new(|| { .unwrap() }); -// Request Metrics +// REQUEST METRICS +// ================================================================================================ + pub static REQUEST_FAILURE_COUNT: LazyLock = LazyLock::new(|| { register_int_counter!("request_failure_count", "Number of failed requests").unwrap() }); @@ -59,7 +65,9 @@ pub static REQUEST_LATENCY: LazyLock = LazyLock::new(|| { .unwrap() }); -// Rate Limiting Metrics +// RATE LIMITING METRICS +// ================================================================================================ + pub static RATE_LIMITED_REQUESTS: LazyLock = LazyLock::new(|| { register_int_counter!( "rate_limited_requests", From e486ffd2ca040e732793ea8ac3db4c53724efbea Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Fri, 27 Dec 2024 17:50:01 -0300 Subject: [PATCH 06/37] review: re-add metrics host configuration and default to localhost --- bin/tx-prover/src/commands/mod.rs | 3 +++ bin/tx-prover/src/commands/proxy.rs | 5 +++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/bin/tx-prover/src/commands/mod.rs b/bin/tx-prover/src/commands/mod.rs index caff3dc44..84954ed59 100644 --- a/bin/tx-prover/src/commands/mod.rs +++ 
b/bin/tx-prover/src/commands/mod.rs @@ -42,6 +42,8 @@ pub struct ProxyConfig { pub available_workers_polling_time_ms: u64, /// Health check interval in seconds. pub health_check_interval_secs: u64, + /// Prometheus metrics host. + pub prometheus_host: String, /// Prometheus metrics port. pub prometheus_port: u16, } @@ -58,6 +60,7 @@ impl Default for ProxyConfig { max_req_per_sec: 5, available_workers_polling_time_ms: 20, health_check_interval_secs: 1, + prometheus_host: "127.0.0.1".into(), prometheus_port: 6192, } } diff --git a/bin/tx-prover/src/commands/proxy.rs b/bin/tx-prover/src/commands/proxy.rs index 8880b9285..01015fc33 100644 --- a/bin/tx-prover/src/commands/proxy.rs +++ b/bin/tx-prover/src/commands/proxy.rs @@ -68,8 +68,9 @@ impl StartProxy { // Enable Prometheus metrics let mut prometheus_service_http = pingora::services::listening::Service::prometheus_http_service(); - prometheus_service_http - .add_tcp(format!("{}:{}", "127.0.0.1", proxy_config.prometheus_port).as_str()); + prometheus_service_http.add_tcp( + format!("{}:{}", proxy_config.prometheus_host, proxy_config.prometheus_port).as_str(), + ); server.add_service(prometheus_service_http); server.add_service(health_check_service); From ae669d2b0ccadf1032dad8f79dd888126a1b8a8e Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Fri, 27 Dec 2024 18:17:06 -0300 Subject: [PATCH 07/37] review: update prometheus.yml --- bin/tx-prover/prometheus.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bin/tx-prover/prometheus.yml b/bin/tx-prover/prometheus.yml index e96926acf..c35b2aa44 100644 --- a/bin/tx-prover/prometheus.yml +++ b/bin/tx-prover/prometheus.yml @@ -5,7 +5,11 @@ global: # A scrape configuration containing exactly one endpoint to scrape: scrape_configs: + # The job name is a label that is used to group targets in the Prometheus UI. + # It can be any string. 
- job_name: "proxy" # Here you need to specify the address of the Prometheus service endpoint in the proxy + # We use the default port for Prometheus, but it need to be changed if you use a different host + # or port. static_configs: - targets: ["127.0.0.1:6192"] From 9c2836e7983579739c5fd47bf9a22bcfc4ff73df Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Fri, 27 Dec 2024 18:17:36 -0300 Subject: [PATCH 08/37] review: add information about grafana dashboard creation and export to readme --- bin/tx-prover/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/tx-prover/README.md b/bin/tx-prover/README.md index 9a5fc6882..2aec10067 100644 --- a/bin/tx-prover/README.md +++ b/bin/tx-prover/README.md @@ -140,7 +140,7 @@ docker run -d -p 3000:3000 --name grafana grafana/grafana-enterprise:latest A prometheus configuration file is provided in this repository, you will need to modify the `scrape_configs` section to include the host and port of the proxy service. -Then, to add the new Prometheus collector as a datasource for Grafana, you can [follow this tutorial](https://grafana.com/docs/grafana-cloud/connect-externally-hosted/existing-datasource/). A Grafana dashboard under the name `proxy_grafana_dashboard.json` is provided, see this [link](https://grafana.com/docs/grafana/latest/dashboards/build-dashboards/import-dashboards/) to import it. +Then, to add the new Prometheus collector as a datasource for Grafana, you can [follow this tutorial](https://grafana.com/docs/grafana-cloud/connect-externally-hosted/existing-datasource/). A Grafana dashboard under the name `proxy_grafana_dashboard.json` is provided, see this [link](https://grafana.com/docs/grafana/latest/dashboards/build-dashboards/import-dashboards/) to import it. 
Otherwise, you can [create your own dashboard](https://grafana.com/docs/grafana/latest/getting-started/build-first-dashboard/) using the metrics provided by the proxy and export it by following this [link](https://grafana.com/docs/grafana/latest/dashboards/share-dashboards-panels/#export-a-dashboard-as-json). ## Features From 01ddba2615889bdb3715174a46e5f375b308277a Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Fri, 27 Dec 2024 18:21:31 -0300 Subject: [PATCH 09/37] review: add documentation on local installation of prometheus and grafana --- bin/tx-prover/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bin/tx-prover/README.md b/bin/tx-prover/README.md index 2aec10067..403f90dcd 100644 --- a/bin/tx-prover/README.md +++ b/bin/tx-prover/README.md @@ -138,6 +138,8 @@ docker run \ docker run -d -p 3000:3000 --name grafana grafana/grafana-enterprise:latest ``` +In case that Docker is not an option, Prometheus and Grafana can also be set up directly on your machine or hosted in the cloud. See the [Prometheus documentation](https://prometheus.io/docs/prometheus/latest/getting_started/) and [Grafana documentation](https://grafana.com/docs/grafana/latest/setup-grafana/) for alternative installation methods. + A prometheus configuration file is provided in this repository, you will need to modify the `scrape_configs` section to include the host and port of the proxy service. Then, to add the new Prometheus collector as a datasource for Grafana, you can [follow this tutorial](https://grafana.com/docs/grafana-cloud/connect-externally-hosted/existing-datasource/). A Grafana dashboard under the name `proxy_grafana_dashboard.json` is provided, see this [link](https://grafana.com/docs/grafana/latest/dashboards/build-dashboards/import-dashboards/) to import it. 
Otherwise, you can [create your own dashboard](https://grafana.com/docs/grafana/latest/getting-started/build-first-dashboard/) using the metrics provided by the proxy and export it by following this [link](https://grafana.com/docs/grafana/latest/dashboards/share-dashboards-panels/#export-a-dashboard-as-json). From dd8a92801b44742a5083b425606282c205765fdd Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Mon, 30 Dec 2024 17:23:32 -0300 Subject: [PATCH 10/37] review: add units to histograms --- bin/tx-prover/src/proxy/metrics.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/tx-prover/src/proxy/metrics.rs b/bin/tx-prover/src/proxy/metrics.rs index d731d4608..1f3c7da37 100644 --- a/bin/tx-prover/src/proxy/metrics.rs +++ b/bin/tx-prover/src/proxy/metrics.rs @@ -13,7 +13,7 @@ pub static QUEUE_SIZE: LazyLock = pub static QUEUE_LATENCY: LazyLock = LazyLock::new(|| { register_histogram!( "queue_latency", - "Time requests spend in the queue", + "Time (in seconds) requests spend in the queue", vec![0.1, 0.5, 1.0, 2.0, 5.0, 10.0] ) .unwrap() @@ -59,7 +59,7 @@ pub static REQUEST_COUNT: LazyLock = LazyLock::new(|| { pub static REQUEST_LATENCY: LazyLock = LazyLock::new(|| { register_histogram!( "request_latency", - "Time requests take to process", + "Time (in seconds) requests take to process", vec![0.1, 0.5, 1.0, 2.0, 5.0, 10.0] ) .unwrap() From 1729d6c7cebc26f86e3b45de1fe54574054494a7 Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Mon, 30 Dec 2024 17:24:21 -0300 Subject: [PATCH 11/37] review: rename the tag in prometheus.yml to tx_prover --- bin/tx-prover/prometheus.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/tx-prover/prometheus.yml b/bin/tx-prover/prometheus.yml index c35b2aa44..f4889694e 100644 --- a/bin/tx-prover/prometheus.yml +++ b/bin/tx-prover/prometheus.yml @@ -7,7 +7,7 @@ global: scrape_configs: # The job name is a label that is used to group targets in the Prometheus UI. # It can be any string. 
- - job_name: "proxy" + - job_name: "tx_prover" # Here you need to specify the address of the Prometheus service endpoint in the proxy # We use the default port for Prometheus, but it need to be changed if you use a different host # or port. From 308612ca54b9bd03021226f533c564ff56a0da1f Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Mon, 30 Dec 2024 17:30:43 -0300 Subject: [PATCH 12/37] review: fix WORKER_COUNT metric update --- bin/tx-prover/src/proxy/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/tx-prover/src/proxy/mod.rs b/bin/tx-prover/src/proxy/mod.rs index 6035be60d..43588c596 100644 --- a/bin/tx-prover/src/proxy/mod.rs +++ b/bin/tx-prover/src/proxy/mod.rs @@ -80,6 +80,8 @@ impl LoadBalancerState { workers.push(Worker::new(worker, connection_timeout, total_timeout).await?); } + WORKER_COUNT.set(workers.len() as i64); + Ok(Self { workers: Arc::new(RwLock::new(workers)), timeout_secs: total_timeout, @@ -99,7 +101,6 @@ impl LoadBalancerState { /// If no worker is available, it will return None. pub async fn pop_available_worker(&self) -> Option { let mut available_workers = self.workers.write().await; - WORKER_COUNT.set(available_workers.len() as i64); available_workers.iter_mut().find(|w| w.is_available()).map(|w| { w.set_availability(false); WORKER_UTILIZATION.inc(); @@ -116,7 +117,6 @@ impl LoadBalancerState { w.set_availability(true); WORKER_UTILIZATION.dec(); } - WORKER_COUNT.set(available_workers.len() as i64); } /// Updates the list of available workers based on the given action ("add" or "remove"). 
From 5e12d9713e4499245716e21bd848c644d33157cb Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Mon, 30 Dec 2024 17:41:21 -0300 Subject: [PATCH 13/37] review: fix WORKER_UTILIZATION updates --- bin/tx-prover/src/proxy/mod.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/bin/tx-prover/src/proxy/mod.rs b/bin/tx-prover/src/proxy/mod.rs index 43588c596..c84137020 100644 --- a/bin/tx-prover/src/proxy/mod.rs +++ b/bin/tx-prover/src/proxy/mod.rs @@ -115,8 +115,13 @@ impl LoadBalancerState { let mut available_workers = self.workers.write().await; if let Some(w) = available_workers.iter_mut().find(|w| *w == &worker) { w.set_availability(true); - WORKER_UTILIZATION.dec(); } + + // If the worker is not in the list it means but this method was called for a worker that + // was removed from the list either manually or because it was unhealthy. + // Either way when the worker get a job assigned the value of `WORKER_UTILIZATION` was + // increased so we need to decrease it here. + WORKER_UTILIZATION.dec(); } /// Updates the list of available workers based on the given action ("add" or "remove"). From 89c6933d2035ff5a6b0ccf6921838d793ecc8b3a Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Mon, 30 Dec 2024 17:54:38 -0300 Subject: [PATCH 14/37] review: update RequestQueue docs --- bin/tx-prover/src/proxy/mod.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/tx-prover/src/proxy/mod.rs b/bin/tx-prover/src/proxy/mod.rs index c84137020..b98d17469 100644 --- a/bin/tx-prover/src/proxy/mod.rs +++ b/bin/tx-prover/src/proxy/mod.rs @@ -274,7 +274,8 @@ static RATE_LIMITER: Lazy = Lazy::new(|| Rate::new(Duration::from_secs(1)) // REQUEST QUEUE // ================================================================================================ -/// Request queue holds the list of requests that are waiting to be processed by the workers. 
+/// Request queue holds the list of requests that are waiting to be processed by the workers and +/// the time they were enqueued. /// It is used to keep track of the order of the requests to then assign them to the workers. pub struct RequestQueue { queue: RwLock>, From ac33f91a3c9671765db52195c6c0ad8abc448b9a Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Mon, 30 Dec 2024 18:00:47 -0300 Subject: [PATCH 15/37] review: move WORKER_UNHEALTHY update logic --- bin/tx-prover/src/proxy/mod.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bin/tx-prover/src/proxy/mod.rs b/bin/tx-prover/src/proxy/mod.rs index b98d17469..7330cf6d3 100644 --- a/bin/tx-prover/src/proxy/mod.rs +++ b/bin/tx-prover/src/proxy/mod.rs @@ -259,7 +259,6 @@ impl LoadBalancerState { if worker.is_healthy().await { healthy_workers.push(worker.clone()); } else { - WORKER_UNHEALTHY.inc(); warn!("Worker {} is not healthy", worker.address()); } } @@ -772,6 +771,7 @@ impl BackgroundService for LoadBalancerState { let _guard = span.enter(); let mut workers = self.workers.write().await; + let initial_workers_len = workers.len(); // Perform health checks on workers and retain healthy ones let healthy_workers = self.check_workers_health(workers.iter_mut()).await; @@ -779,8 +779,10 @@ impl BackgroundService for LoadBalancerState { // Update the worker list with healthy workers *workers = healthy_workers; - // Update the worker count metric + // Update the worker count and worker unhealhy count metrics WORKER_COUNT.set(workers.len() as i64); + let unhealthy_workers = initial_workers_len - workers.len(); + WORKER_UNHEALTHY.inc_by(unhealthy_workers as u64); // Sleep for the defined interval before the next health check sleep(self.health_check_frequency).await; From 021923c60bf3d56075417a1d7b384318eb23f86b Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Mon, 30 Dec 2024 18:03:53 -0300 Subject: [PATCH 16/37] review: fix WORKER_UTILIZATION desc --- 
bin/tx-prover/src/proxy/metrics.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/bin/tx-prover/src/proxy/metrics.rs b/bin/tx-prover/src/proxy/metrics.rs index 1f3c7da37..5522f0703 100644 --- a/bin/tx-prover/src/proxy/metrics.rs +++ b/bin/tx-prover/src/proxy/metrics.rs @@ -31,10 +31,8 @@ pub static WORKER_COUNT: LazyLock = pub static WORKER_UNHEALTHY: LazyLock = LazyLock::new(|| { register_int_counter!("worker_unhealthy", "Number of unhealthy workers").unwrap() }); -pub static WORKER_UTILIZATION: LazyLock = LazyLock::new(|| { - register_int_gauge!("worker_utilization", "Number of requests being processed by workers") - .unwrap() -}); +pub static WORKER_UTILIZATION: LazyLock = + LazyLock::new(|| register_int_gauge!("worker_utilization", "Number of busy workers").unwrap()); pub static WORKER_REQUEST_COUNT: LazyLock = LazyLock::new(|| { register_int_counter_vec!( "worker_request_count", From 56b2621f5c20100b703f4aa1e22321713a191dea Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Mon, 30 Dec 2024 18:06:02 -0300 Subject: [PATCH 17/37] review: rename WORKER_UTILIZATION to WORKER_BUSY --- bin/tx-prover/proxy_grafana_dashboard.json | 2 +- bin/tx-prover/src/proxy/metrics.rs | 4 ++-- bin/tx-prover/src/proxy/mod.rs | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/bin/tx-prover/proxy_grafana_dashboard.json b/bin/tx-prover/proxy_grafana_dashboard.json index 8f15d5ce9..6a5d7e39d 100644 --- a/bin/tx-prover/proxy_grafana_dashboard.json +++ b/bin/tx-prover/proxy_grafana_dashboard.json @@ -643,7 +643,7 @@ }, "disableTextWrap": false, "editorMode": "builder", - "expr": "worker_utilization", + "expr": "worker_busy", "fullMetaSearch": false, "hide": false, "includeNullMetadata": true, diff --git a/bin/tx-prover/src/proxy/metrics.rs b/bin/tx-prover/src/proxy/metrics.rs index 5522f0703..b761f5342 100644 --- a/bin/tx-prover/src/proxy/metrics.rs +++ b/bin/tx-prover/src/proxy/metrics.rs @@ -31,8 +31,8 @@ pub static WORKER_COUNT: 
LazyLock = pub static WORKER_UNHEALTHY: LazyLock = LazyLock::new(|| { register_int_counter!("worker_unhealthy", "Number of unhealthy workers").unwrap() }); -pub static WORKER_UTILIZATION: LazyLock = - LazyLock::new(|| register_int_gauge!("worker_utilization", "Number of busy workers").unwrap()); +pub static WORKER_BUSY: LazyLock = + LazyLock::new(|| register_int_gauge!("worker_busy", "Number of busy workers").unwrap()); pub static WORKER_REQUEST_COUNT: LazyLock = LazyLock::new(|| { register_int_counter_vec!( "worker_request_count", diff --git a/bin/tx-prover/src/proxy/mod.rs b/bin/tx-prover/src/proxy/mod.rs index 7330cf6d3..5e855135b 100644 --- a/bin/tx-prover/src/proxy/mod.rs +++ b/bin/tx-prover/src/proxy/mod.rs @@ -10,8 +10,8 @@ use async_trait::async_trait; use bytes::Bytes; use metrics::{ QUEUE_DROP_COUNT, QUEUE_LATENCY, QUEUE_SIZE, RATE_LIMITED_REQUESTS, RATE_LIMIT_VIOLATIONS, - REQUEST_COUNT, REQUEST_FAILURE_COUNT, REQUEST_LATENCY, REQUEST_RETRIES, WORKER_COUNT, - WORKER_REQUEST_COUNT, WORKER_UNHEALTHY, WORKER_UTILIZATION, + REQUEST_COUNT, REQUEST_FAILURE_COUNT, REQUEST_LATENCY, REQUEST_RETRIES, WORKER_BUSY, + WORKER_COUNT, WORKER_REQUEST_COUNT, WORKER_UNHEALTHY, }; use once_cell::sync::Lazy; use pingora::{ @@ -103,7 +103,7 @@ impl LoadBalancerState { let mut available_workers = self.workers.write().await; available_workers.iter_mut().find(|w| w.is_available()).map(|w| { w.set_availability(false); - WORKER_UTILIZATION.inc(); + WORKER_BUSY.inc(); w.clone() }) } @@ -119,9 +119,9 @@ impl LoadBalancerState { // If the worker is not in the list it means but this method was called for a worker that // was removed from the list either manually or because it was unhealthy. - // Either way when the worker get a job assigned the value of `WORKER_UTILIZATION` was + // Either way when the worker get a job assigned the value of `WORKER_BUSY` was // increased so we need to decrease it here. 
- WORKER_UTILIZATION.dec(); + WORKER_BUSY.dec(); } /// Updates the list of available workers based on the given action ("add" or "remove"). From 64bb910acc110d96ded0bed164a1198e81a898e3 Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Mon, 30 Dec 2024 18:12:14 -0300 Subject: [PATCH 18/37] review: add metrics host to readme --- bin/tx-prover/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/tx-prover/README.md b/bin/tx-prover/README.md index 403f90dcd..f37f20ed8 100644 --- a/bin/tx-prover/README.md +++ b/bin/tx-prover/README.md @@ -124,7 +124,7 @@ If Docker is not an option, Jaeger can also be set up directly on your machine o ## Metrics -The proxy includes a service that provides metrics for [Prometheus](https://prometheus.io/docs/introduction/overview/). This service is always enabled and uses the port defined in the configuration file. The metrics are available at the `/metrics` endpoint. +The proxy includes a service that provides metrics export for [Prometheus](https://prometheus.io/docs/introduction/overview/). This service is always enabled and uses the host and port defined in the configuration file. The metrics are available at the `/metrics` endpoint. To consume and display the metrics, you can use Prometheus and Grafana. The simplest way to install Prometheus and Grafana is by using Docker containers. 
To do so, run: From 0ab9b38a5e6c736d4a4fe4908e2eb8c96b3ddafb Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Mon, 30 Dec 2024 18:16:13 -0300 Subject: [PATCH 19/37] review: mention config file by name --- bin/tx-prover/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/tx-prover/README.md b/bin/tx-prover/README.md index f37f20ed8..260dfe53b 100644 --- a/bin/tx-prover/README.md +++ b/bin/tx-prover/README.md @@ -124,7 +124,7 @@ If Docker is not an option, Jaeger can also be set up directly on your machine o ## Metrics -The proxy includes a service that provides metrics export for [Prometheus](https://prometheus.io/docs/introduction/overview/). This service is always enabled and uses the host and port defined in the configuration file. The metrics are available at the `/metrics` endpoint. +The proxy includes a service that provides metrics export for [Prometheus](https://prometheus.io/docs/introduction/overview/). This service is always enabled and uses the host and port defined in the `miden-tx-prover.toml` file. The metrics are available at the `/metrics` endpoint. To consume and display the metrics, you can use Prometheus and Grafana. The simplest way to install Prometheus and Grafana is by using Docker containers. To do so, run: From edb6092eabfbf2a949cba665289c969843607a8d Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Mon, 30 Dec 2024 18:40:04 -0300 Subject: [PATCH 20/37] review: update doc about prometheus and grafana --- bin/tx-prover/README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bin/tx-prover/README.md b/bin/tx-prover/README.md index 260dfe53b..ffebd7ee2 100644 --- a/bin/tx-prover/README.md +++ b/bin/tx-prover/README.md @@ -124,9 +124,11 @@ If Docker is not an option, Jaeger can also be set up directly on your machine o ## Metrics -The proxy includes a service that provides metrics export for [Prometheus](https://prometheus.io/docs/introduction/overview/). 
This service is always enabled and uses the host and port defined in the `miden-tx-prover.toml` file. The metrics are available at the `/metrics` endpoint. +The proxy includes a service that exposes metrics to be consumed by [Prometheus](https://prometheus.io/docs/introduction/overview/). This service is always enabled and uses the host and port defined in the `miden-tx-prover.toml` file. -To consume and display the metrics, you can use Prometheus and Grafana. The simplest way to install Prometheus and Grafana is by using Docker containers. To do so, run: +The metrics architecture works by having the proxy expose metrics at an endpoint (`/metrics`) in a format Prometheus can read. Prometheus periodically scrapes this endpoint, adds timestamps to the metrics, and stores them in its time-series database. Grafana then queries Prometheus to retrieve and visualize these metrics, allowing you to create dashboards and set up alerts based on the stored data. + +The simplest way to install Prometheus and Grafana is by using Docker containers. 
To do so, run: ```bash docker run \ From 80a02e7f1e94aceff8eedc7acd58191a5c79f029 Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Mon, 30 Dec 2024 18:41:43 -0300 Subject: [PATCH 21/37] review: add missing config field to readme example --- bin/tx-prover/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bin/tx-prover/README.md b/bin/tx-prover/README.md index ffebd7ee2..f1e574b3d 100644 --- a/bin/tx-prover/README.md +++ b/bin/tx-prover/README.md @@ -59,6 +59,8 @@ max_queue_items = 10 max_retries_per_request = 1 # Maximum amount of requests that a given IP address can make per second max_req_per_sec = 5 +# Time to wait before checking the availability of workers +available_workers_polling_time_ms = 20 # Interval to check the health of the workers health_check_interval_secs = 1 # Port of the metrics server From 411b02378d4549e5fda109c1de42db544eb0ee8b Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Mon, 30 Dec 2024 20:37:04 -0300 Subject: [PATCH 22/37] docs: improve prometheus.yml docs --- bin/tx-prover/prometheus.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/tx-prover/prometheus.yml b/bin/tx-prover/prometheus.yml index f4889694e..321418278 100644 --- a/bin/tx-prover/prometheus.yml +++ b/bin/tx-prover/prometheus.yml @@ -10,6 +10,7 @@ scrape_configs: - job_name: "tx_prover" # Here you need to specify the address of the Prometheus service endpoint in the proxy # We use the default port for Prometheus, but it need to be changed if you use a different host - # or port. + # or port. In case of using Prometheus in a docker container, you can use the + # `host.docker.internal` address to access the host machine. 
static_configs: - targets: ["127.0.0.1:6192"] From 6d9c2f5206c9d96ce8ac977f0a2ee5c2a90dd281 Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Mon, 30 Dec 2024 20:44:07 -0300 Subject: [PATCH 23/37] chore: add safety comment to metrics --- bin/tx-prover/src/proxy/metrics.rs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/bin/tx-prover/src/proxy/metrics.rs b/bin/tx-prover/src/proxy/metrics.rs index b761f5342..69dcb3fce 100644 --- a/bin/tx-prover/src/proxy/metrics.rs +++ b/bin/tx-prover/src/proxy/metrics.rs @@ -5,6 +5,13 @@ use prometheus::{ Histogram, IntCounter, IntCounterVec, IntGauge, }; +// SAFETY: The `unwrap` calls here are safe because: +// 1. The metrics being registered (gauges, counters, histograms) use hardcoded names and +// descriptions, which are guaranteed not to conflict within the application. +// 2. Registration errors occur only if there is a naming conflict, which is not possible in this +// context due to controlled metric definitions. +// 3. Any changes to metric names or types should be carefully reviewed to avoid conflicts. 
+ // QUEUE METRICS // ================================================================================================ From 2b9fc91e0feaa8e755531b6124bed160bf23baa6 Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Mon, 30 Dec 2024 20:44:56 -0300 Subject: [PATCH 24/37] chore: initialize metrics on load balancer instance --- bin/tx-prover/src/proxy/mod.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bin/tx-prover/src/proxy/mod.rs b/bin/tx-prover/src/proxy/mod.rs index 5e855135b..cf74c58fc 100644 --- a/bin/tx-prover/src/proxy/mod.rs +++ b/bin/tx-prover/src/proxy/mod.rs @@ -81,6 +81,9 @@ impl LoadBalancerState { } WORKER_COUNT.set(workers.len() as i64); + RATE_LIMIT_VIOLATIONS.reset(); + RATE_LIMITED_REQUESTS.reset(); + REQUEST_RETRIES.reset(); Ok(Self { workers: Arc::new(RwLock::new(workers)), From 749d5f5e5b763bc27a62810fa0e5c17d9d6b166b Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Mon, 30 Dec 2024 20:46:38 -0300 Subject: [PATCH 25/37] chore: update dashboard --- bin/tx-prover/proxy_grafana_dashboard.json | 759 +++++++++++++++------ 1 file changed, 535 insertions(+), 224 deletions(-) diff --git a/bin/tx-prover/proxy_grafana_dashboard.json b/bin/tx-prover/proxy_grafana_dashboard.json index 6a5d7e39d..bde1e143c 100644 --- a/bin/tx-prover/proxy_grafana_dashboard.json +++ b/bin/tx-prover/proxy_grafana_dashboard.json @@ -1,9 +1,9 @@ { "__inputs": [ { - "name": "DS_PROXY", - "label": "proxy", - "description": "Dashboard of the proxy service", + "name": "DS_TX_PROVER", + "label": "tx_prover", + "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" @@ -11,18 +11,6 @@ ], "__elements": {}, "__requires": [ - { - "type": "panel", - "id": "barchart", - "name": "Bar chart", - "version": "" - }, - { - "type": "panel", - "id": "gauge", - "name": "Gauge", - "version": "" - }, { "type": "grafana", "id": "grafana", @@ -73,7 +61,7 @@ { "datasource": { "type": "prometheus", - "uid": "${DS_PROXY}" + "uid": "${DS_TX_PROVER}" }, 
"fieldConfig": { "defaults": { @@ -87,23 +75,20 @@ { "color": "green", "value": null - }, - { - "color": "red", - "value": 80 } ] - } + }, + "unit": "requests" }, "overrides": [] }, "gridPos": { - "h": 7, - "w": 6, + "h": 8, + "w": 4, "x": 0, "y": 0 }, - "id": 7, + "id": 16, "options": { "colorMode": "value", "graphMode": "area", @@ -124,33 +109,62 @@ "pluginVersion": "11.4.0", "targets": [ { - "disableTextWrap": false, - "editorMode": "builder", - "expr": "increase(request_count[1m])", - "fullMetaSearch": false, - "includeNullMetadata": true, + "editorMode": "code", + "expr": "request_count", "legendFormat": "__auto", "range": true, "refId": "A", - "useBackend": false, "datasource": { "type": "prometheus", - "uid": "${DS_PROXY}" + "uid": "${DS_TX_PROVER}" } } ], - "title": "Requests/m", + "title": "Total requests handled", "type": "stat" }, { "datasource": { "type": "prometheus", - "uid": "${DS_PROXY}" + "uid": "${DS_TX_PROVER}" }, "fieldConfig": { "defaults": { "color": { - "mode": "thresholds" + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } }, "mappings": [], "thresholds": { @@ -170,54 +184,68 @@ "overrides": [] }, "gridPos": { - "h": 7, - "w": 6, - "x": 6, + "h": 8, + "w": 7, + "x": 4, "y": 0 }, - "id": 5, + "id": 1, "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - 
"reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true + "tooltip": { + "mode": "single", + "sort": "none" + } }, "pluginVersion": "11.4.0", "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, "disableTextWrap": false, - "editorMode": "builder", - "expr": "worker_request_count", + "editorMode": "code", + "expr": "worker_count", "fullMetaSearch": false, + "hide": false, "includeNullMetadata": true, - "legendFormat": "__auto", + "instant": false, + "legendFormat": "Total workers", "range": true, - "refId": "A", - "useBackend": false, + "refId": "C", + "useBackend": false + }, + { "datasource": { "type": "prometheus", - "uid": "${DS_PROXY}" - } + "uid": "${DS_TX_PROVER}" + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "worker_busy", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Busy workers", + "range": true, + "refId": "B", + "useBackend": false } ], - "title": "Total request count per worker", - "type": "stat" + "title": "Workers", + "type": "timeseries" }, { "datasource": { "type": "prometheus", - "uid": "${DS_PROXY}" + "uid": "${DS_TX_PROVER}" }, "fieldConfig": { "defaults": { @@ -267,7 +295,7 @@ }, { "color": "red", - "value": 80 + "value": 15 } ] } @@ -275,12 +303,12 @@ "overrides": [] }, "gridPos": { - "h": 7, + "h": 8, "w": 6, - "x": 12, + "x": 11, "y": 0 }, - "id": 2, + "id": 3, "options": { "legend": { "calcs": [], @@ -296,33 +324,66 @@ "pluginVersion": "11.4.0", "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, "disableTextWrap": false, - "editorMode": "builder", - "expr": "worker_availability", + "editorMode": "code", + "expr": "queue_size", "fullMetaSearch": false, "includeNullMetadata": 
true, - "legendFormat": "__auto", + "legendFormat": "Queue size", "range": true, "refId": "A", - "useBackend": false, - "datasource": { - "type": "prometheus", - "uid": "${DS_PROXY}" - } + "useBackend": false } ], - "title": "Worker availability", + "title": "Queue", "type": "timeseries" }, { "datasource": { "type": "prometheus", - "uid": "${DS_PROXY}" + "uid": "${DS_TX_PROVER}" }, "fieldConfig": { "defaults": { "color": { - "mode": "thresholds" + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } }, "mappings": [], "thresholds": { @@ -337,58 +398,66 @@ "value": 80 } ] - } + }, + "unit": "s" }, "overrides": [] }, "gridPos": { - "h": 7, - "w": 6, - "x": 18, + "h": 8, + "w": 7, + "x": 17, "y": 0 }, - "id": 4, + "id": 11, "options": { - "minVizHeight": 75, - "minVizWidth": 75, - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, - "showThresholdLabels": false, - "showThresholdMarkers": true, - "sizing": "auto" + "tooltip": { + "mode": "single", + "sort": "none" + } }, "pluginVersion": "11.4.0", "targets": [ { - "disableTextWrap": false, - "editorMode": "builder", - "expr": "request_failure_count", - "fullMetaSearch": false, - "includeNullMetadata": true, - "legendFormat": "__auto", + "editorMode": "code", + "expr": 
"rate(request_latency_sum[1m]) / rate(request_latency_count[1m])", + "legendFormat": "Average request latency", "range": true, "refId": "A", - "useBackend": false, "datasource": { "type": "prometheus", - "uid": "${DS_PROXY}" + "uid": "${DS_TX_PROVER}" } + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "editorMode": "code", + "expr": "rate(queue_latency_sum[1m]) / rate(queue_latency_count[1m])", + "hide": false, + "instant": false, + "legendFormat": "Average queue latency", + "range": true, + "refId": "B" } ], - "title": "Request failure count", - "type": "gauge" + "title": "Latency", + "type": "timeseries" }, { "datasource": { "type": "prometheus", - "uid": "${DS_PROXY}" + "uid": "${DS_TX_PROVER}" }, + "description": "Rate of requests dropped due to a full queue", "fieldConfig": { "defaults": { "color": { @@ -440,17 +509,18 @@ "value": 80 } ] - } + }, + "unit": "reqps" }, "overrides": [] }, "gridPos": { "h": 8, - "w": 7, + "w": 8, "x": 0, - "y": 7 + "y": 8 }, - "id": 6, + "id": 14, "options": { "legend": { "calcs": [], @@ -466,34 +536,63 @@ "pluginVersion": "11.4.0", "targets": [ { - "disableTextWrap": false, - "editorMode": "builder", - "expr": "increase(request_count[1m])", - "fullMetaSearch": false, - "includeNullMetadata": true, - "interval": "", - "legendFormat": "__auto", + "editorMode": "code", + "expr": "rate(queue_drop_count[1m])", + "legendFormat": "Queue drop rate", "range": true, "refId": "A", - "useBackend": false, "datasource": { "type": "prometheus", - "uid": "${DS_PROXY}" + "uid": "${DS_TX_PROVER}" } } ], - "title": "Requests/m", + "title": "Queue drop rate", "type": "timeseries" }, { "datasource": { "type": "prometheus", - "uid": "${DS_PROXY}" + "uid": "${DS_TX_PROVER}" }, "fieldConfig": { "defaults": { "color": { - "mode": "thresholds" + "fixedColor": "red", + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + 
"axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } }, "mappings": [], "thresholds": { @@ -505,60 +604,98 @@ }, { "color": "red", - "value": 15 + "value": 80 } ] - } + }, + "unit": "reqpm" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Total requests" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Failed requests" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] }, "gridPos": { "h": 8, - "w": 6, - "x": 7, - "y": 7 + "w": 9, + "x": 8, + "y": 8 }, - "id": 3, + "id": 10, "options": { - "minVizHeight": 75, - "minVizWidth": 75, - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, - "showThresholdLabels": false, - "showThresholdMarkers": true, - "sizing": "auto" + "tooltip": { + "mode": "single", + "sort": "none" + } }, "pluginVersion": "11.4.0", "targets": [ { - "disableTextWrap": false, - "editorMode": "builder", - "expr": "queue_size", - "fullMetaSearch": false, - "includeNullMetadata": true, - "legendFormat": "__auto", + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "editorMode": "code", + "expr": "sum(rate(request_count[1m]))", + "hide": false, + "instant": false, + "legendFormat": "Total requests", + "range": 
true, + "refId": "B" + }, + { + "editorMode": "code", + "expr": "sum(rate(request_failure_count[1m]))", + "legendFormat": "Failed requests", "range": true, "refId": "A", - "useBackend": false, "datasource": { "type": "prometheus", - "uid": "${DS_PROXY}" + "uid": "${DS_TX_PROVER}" } } ], - "title": "Queue size", - "type": "gauge" + "title": "Requests", + "type": "timeseries" }, { "datasource": { "type": "prometheus", - "uid": "${DS_PROXY}" + "uid": "${DS_TX_PROVER}" }, "fieldConfig": { "defaults": { @@ -611,17 +748,18 @@ "value": 80 } ] - } + }, + "unit": "reqps" }, "overrides": [] }, "gridPos": { "h": 8, "w": 7, - "x": 13, - "y": 7 + "x": 17, + "y": 8 }, - "id": 1, + "id": 12, "options": { "legend": { "calcs": [], @@ -637,51 +775,225 @@ "pluginVersion": "11.4.0", "targets": [ { + "editorMode": "code", + "expr": "rate(worker_request_count[1m])", + "legendFormat": "{{worker_id}}", + "range": true, + "refId": "A", "datasource": { "type": "prometheus", - "uid": "${DS_PROXY}" + "uid": "${DS_TX_PROVER}" + } + } + ], + "title": "Requests per worker", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "worker_busy", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "B", - "useBackend": false + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": 
"auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 16 + }, + "id": 13, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ { + "editorMode": "code", + "expr": "(1 - rate(request_failure_count[1m]) / rate(request_count[1m])) * 100", + "legendFormat": "Success rate over time", + "range": true, + "refId": "A", "datasource": { "type": "prometheus", - "uid": "${DS_PROXY}" + "uid": "${DS_TX_PROVER}" + } + } + ], + "title": "Success rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "red", + "mode": "fixed" }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "queue_size", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 5, + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + 
"mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": "" + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 7, + "x": 8, + "y": 16 + }, + "id": 15, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "worker_unhealthy", + "legendFormat": "Number of unhealty workers", "range": true, - "refId": "C", - "useBackend": false + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + } } ], + "title": "Unhealthy workers", "type": "timeseries" }, { "datasource": { "type": "prometheus", - "uid": "${DS_PROXY}" + "uid": "${DS_TX_PROVER}" }, "fieldConfig": { "defaults": { "color": { - "mode": "thresholds" + "mode": "continuous-RdYlGr" }, "mappings": [], "thresholds": { @@ -696,15 +1008,16 @@ "value": 80 } ] - } + }, + "unit": "reqps" }, "overrides": [] }, "gridPos": { "h": 8, "w": 4, - "x": 20, - "y": 7 + "x": 15, + "y": 16 }, "id": 8, "options": { @@ -727,33 +1040,33 @@ "pluginVersion": "11.4.0", "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, "disableTextWrap": false, - "editorMode": "builder", - "expr": "rate_limit_violations", + "editorMode": "code", + "expr": "rate(rate_limited_requests[1m])", "fullMetaSearch": false, "includeNullMetadata": true, "legendFormat": "__auto", "range": true, "refId": "A", - "useBackend": false, - "datasource": { - "type": "prometheus", - "uid": "${DS_PROXY}" - } + "useBackend": false } ], - "title": "Rate limit violations", + "title": "Rate-Limited Requests", "type": "stat" }, { "datasource": { "type": "prometheus", - "uid": "${DS_PROXY}" + "uid": "${DS_TX_PROVER}" }, "fieldConfig": { "defaults": { "color": { - "mode": "palette-classic" + 
"mode": "continuous-YlRd" }, "custom": { "axisBorderShow": false, @@ -761,17 +1074,29 @@ "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", - "fillOpacity": 80, + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, + "insertNulls": false, + "lineInterpolation": "linear", "lineWidth": 1, + "pointSize": 5, "scaleDistribution": { "type": "linear" }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, "thresholdsStyle": { "mode": "off" } @@ -790,59 +1115,45 @@ } ] }, - "unit": "none" + "unit": "reqpm" }, "overrides": [] }, "gridPos": { "h": 8, - "w": 12, - "x": 0, - "y": 15 + "w": 5, + "x": 19, + "y": 16 }, - "id": 9, + "id": 17, "options": { - "barRadius": 0, - "barWidth": 0.97, - "fullHighlight": false, - "groupWidth": 0.7, "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, - "orientation": "auto", - "showValue": "auto", - "stacking": "none", "tooltip": { "mode": "single", "sort": "none" - }, - "xTickLabelRotation": 0, - "xTickLabelSpacing": 0 + } }, "pluginVersion": "11.4.0", "targets": [ { - "disableTextWrap": false, - "editorMode": "builder", - "exemplar": false, - "expr": "histogram_quantile(0.95, sum by(le) (rate(request_latency_bucket[$__rate_interval])))", - "fullMetaSearch": false, - "includeNullMetadata": false, + "editorMode": "code", + "expr": "rate(request_retries[1m])", "legendFormat": "__auto", "range": true, "refId": "A", - "useBackend": false, "datasource": { "type": "prometheus", - "uid": "${DS_PROXY}" + "uid": "${DS_TX_PROVER}" } } ], - "title": "Request latency", - "type": "barchart" + "title": "Request retry rate", + "type": "timeseries" } ], "refresh": "5s", @@ -852,13 +1163,13 @@ "list": [] }, "time": { - "from": "2024-12-19T14:36:56.711Z", - "to": "2024-12-19T14:38:02.492Z" + "from": "now-5m", + "to": "now" }, 
"timepicker": {}, "timezone": "browser", - "title": "PROXY", + "title": "tx_prover", "uid": "be7bobzl5fr40f", - "version": 23, + "version": 24, "weekStart": "" -} +} \ No newline at end of file From 44fe4f0eafc9ad5456bd04fa47c62d91ba1b8cf0 Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Mon, 30 Dec 2024 20:51:06 -0300 Subject: [PATCH 26/37] review: change unhealthy workers metric description --- bin/tx-prover/src/proxy/metrics.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bin/tx-prover/src/proxy/metrics.rs b/bin/tx-prover/src/proxy/metrics.rs index 69dcb3fce..3adb59350 100644 --- a/bin/tx-prover/src/proxy/metrics.rs +++ b/bin/tx-prover/src/proxy/metrics.rs @@ -36,7 +36,11 @@ pub static QUEUE_DROP_COUNT: LazyLock = LazyLock::new(|| { pub static WORKER_COUNT: LazyLock = LazyLock::new(|| register_int_gauge!("worker_count", "Number of workers").unwrap()); pub static WORKER_UNHEALTHY: LazyLock = LazyLock::new(|| { - register_int_counter!("worker_unhealthy", "Number of unhealthy workers").unwrap() + register_int_counter!( + "worker_unhealthy", + "Number of times that workers were registered as unhealthy" + ) + .unwrap() }); pub static WORKER_BUSY: LazyLock = LazyLock::new(|| register_int_gauge!("worker_busy", "Number of busy workers").unwrap()); From 370eb99977b1f39bc169eb3e513f2930054c6b52 Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Thu, 2 Jan 2025 12:11:48 -0300 Subject: [PATCH 27/37] review: add missing field to README example --- bin/tx-prover/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bin/tx-prover/README.md b/bin/tx-prover/README.md index f1e574b3d..63dae47da 100644 --- a/bin/tx-prover/README.md +++ b/bin/tx-prover/README.md @@ -63,6 +63,8 @@ max_req_per_sec = 5 available_workers_polling_time_ms = 20 # Interval to check the health of the workers health_check_interval_secs = 1 +# Host of the metrics server +prometheus_host = "127.0.0.1" # Port of the metrics server prometheus_port = 6192 ``` From 
8066e4651dd87578672ce244a533f14bdece0855 Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Thu, 2 Jan 2025 12:14:46 -0300 Subject: [PATCH 28/37] review: rewrite Grafana introduction on readme --- bin/tx-prover/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/tx-prover/README.md b/bin/tx-prover/README.md index 63dae47da..820b3173a 100644 --- a/bin/tx-prover/README.md +++ b/bin/tx-prover/README.md @@ -130,7 +130,7 @@ If Docker is not an option, Jaeger can also be set up directly on your machine o The proxy includes a service that exposes metrics to be consumed by [Prometheus](https://prometheus.io/docs/introduction/overview/). This service is always enabled and uses the host and port defined in the `miden-tx-prover.toml` file. -The metrics architecture works by having the proxy expose metrics at an endpoint (`/metrics`) in a format Prometheus can read. Prometheus periodically scrapes this endpoint, adds timestamps to the metrics, and stores them in its time-series database. Grafana then queries Prometheus to retrieve and visualize these metrics, allowing you to create dashboards and set up alerts based on the stored data. +The metrics architecture works by having the proxy expose metrics at an endpoint (`/metrics`) in a format Prometheus can read. Prometheus periodically scrapes this endpoint, adds timestamps to the metrics, and stores them in its time-series database. Then, we can use tools like Grafana to query Prometheus and visualize these metrics in configurable dashboards. The simplest way to install Prometheus and Grafana is by using Docker containers. 
To do so, run: From 429b28d13e8d20361ed4e03a8888587fb0fe036d Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Thu, 2 Jan 2025 12:18:03 -0300 Subject: [PATCH 29/37] review: change WORKER_COUNT metric description --- bin/tx-prover/src/proxy/metrics.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/tx-prover/src/proxy/metrics.rs b/bin/tx-prover/src/proxy/metrics.rs index 3adb59350..8a02e6509 100644 --- a/bin/tx-prover/src/proxy/metrics.rs +++ b/bin/tx-prover/src/proxy/metrics.rs @@ -34,7 +34,7 @@ pub static QUEUE_DROP_COUNT: LazyLock = LazyLock::new(|| { // ================================================================================================ pub static WORKER_COUNT: LazyLock = - LazyLock::new(|| register_int_gauge!("worker_count", "Number of workers").unwrap()); + LazyLock::new(|| register_int_gauge!("worker_count", "Total number of workers").unwrap()); pub static WORKER_UNHEALTHY: LazyLock = LazyLock::new(|| { register_int_counter!( "worker_unhealthy", From 82dbe5e6f07c0dce5cb4ee43a1c65bdb7d0e2a62 Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Thu, 2 Jan 2025 12:23:21 -0300 Subject: [PATCH 30/37] review: do not count update worker requests for metrics --- bin/tx-prover/src/proxy/mod.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/tx-prover/src/proxy/mod.rs b/bin/tx-prover/src/proxy/mod.rs index cf74c58fc..82a9aa02d 100644 --- a/bin/tx-prover/src/proxy/mod.rs +++ b/bin/tx-prover/src/proxy/mod.rs @@ -415,9 +415,6 @@ impl ProxyHttp for LoadBalancer { where Self::CTX: Send + Sync, { - // Increment the request count - REQUEST_COUNT.inc(); - // Extract the client address early let client_addr = match session.client_addr() { Some(addr) => addr.to_string(), @@ -439,6 +436,9 @@ impl ProxyHttp for LoadBalancer { } } + // Increment the request count + REQUEST_COUNT.inc(); + let user_id = Some(client_addr); // Retrieve the current window requests From 7e4a3c546c39cb00354999d0ce5f05ba4e22361e Mon Sep 17 
00:00:00 2001 From: SantiagoPittella Date: Thu, 2 Jan 2025 15:45:26 -0300 Subject: [PATCH 31/37] review: move busy workers metric update --- bin/tx-prover/src/proxy/mod.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/bin/tx-prover/src/proxy/mod.rs b/bin/tx-prover/src/proxy/mod.rs index 82a9aa02d..df0bb0e60 100644 --- a/bin/tx-prover/src/proxy/mod.rs +++ b/bin/tx-prover/src/proxy/mod.rs @@ -119,12 +119,6 @@ impl LoadBalancerState { if let Some(w) = available_workers.iter_mut().find(|w| *w == &worker) { w.set_availability(true); } - - // If the worker is not in the list it means but this method was called for a worker that - // was removed from the list either manually or because it was unhealthy. - // Either way when the worker get a job assigned the value of `WORKER_BUSY` was - // increased so we need to decrease it here. - WORKER_BUSY.dec(); } /// Updates the list of available workers based on the given action ("add" or "remove"). @@ -188,6 +182,11 @@ impl LoadBalancerState { self.workers.read().await.len() } + /// Get the number of busy workers. + pub async fn num_busy_workers(&self) -> usize { + self.workers.read().await.iter().filter(|w| !w.is_available()).count() + } + /// Handles the update workers request. /// /// # Behavior @@ -597,6 +596,9 @@ impl ProxyHttp for LoadBalancer { } REQUEST_LATENCY.observe(ctx.created_at.elapsed().as_secs_f64()); + + // Update the number of busy workers + WORKER_BUSY.set(self.0.num_busy_workers().await as i64); } // The following methods are a copy of the default implementation defined in the trait, but From bc577dc33f85480e6997a690fe72728cdcebd3a7 Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Thu, 2 Jan 2025 15:54:08 -0300 Subject: [PATCH 32/37] review: update grafana dashboard --- ... 
tx_prover_service_grafana_dashboard.json} | 602 ++++++++++-------- 1 file changed, 331 insertions(+), 271 deletions(-) rename bin/tx-prover/{proxy_grafana_dashboard.json => tx_prover_service_grafana_dashboard.json} (86%) diff --git a/bin/tx-prover/proxy_grafana_dashboard.json b/bin/tx-prover/tx_prover_service_grafana_dashboard.json similarity index 86% rename from bin/tx-prover/proxy_grafana_dashboard.json rename to bin/tx-prover/tx_prover_service_grafana_dashboard.json index bde1e143c..9a634ed67 100644 --- a/bin/tx-prover/proxy_grafana_dashboard.json +++ b/bin/tx-prover/tx_prover_service_grafana_dashboard.json @@ -17,6 +17,12 @@ "name": "Grafana", "version": "11.4.0" }, + { + "type": "panel", + "id": "heatmap", + "name": "Heatmap", + "version": "" + }, { "type": "datasource", "id": "prometheus", @@ -58,6 +64,19 @@ "id": null, "links": [], "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 18, + "panels": [], + "title": "Requests", + "type": "row" + }, { "datasource": { "type": "prometheus", @@ -86,7 +105,7 @@ "h": 8, "w": 4, "x": 0, - "y": 0 + "y": 1 }, "id": 16, "options": { @@ -131,6 +150,7 @@ "fieldConfig": { "defaults": { "color": { + "fixedColor": "red", "mode": "palette-classic" }, "custom": { @@ -179,17 +199,64 @@ "value": 80 } ] - } + }, + "unit": "reqpm" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Total requests" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Failed requests" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Accepted requests" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + } + ] }, "gridPos": { "h": 8, - "w": 7, + "w": 9, "x": 4, - "y": 0 + 
"y": 1 }, - "id": 1, + "id": 10, "options": { "legend": { "calcs": [], @@ -209,37 +276,40 @@ "type": "prometheus", "uid": "${DS_TX_PROVER}" }, - "disableTextWrap": false, "editorMode": "code", - "expr": "worker_count", - "fullMetaSearch": false, + "expr": "sum(rate(request_count[1m]))", "hide": false, - "includeNullMetadata": true, "instant": false, - "legendFormat": "Total workers", + "legendFormat": "Total requests", "range": true, - "refId": "C", - "useBackend": false + "refId": "B" + }, + { + "editorMode": "code", + "expr": "sum(rate(request_failure_count[1m]))", + "legendFormat": "Failed requests", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + } }, { "datasource": { "type": "prometheus", "uid": "${DS_TX_PROVER}" }, - "disableTextWrap": false, "editorMode": "code", - "expr": "worker_busy", - "fullMetaSearch": false, + "expr": "sum(rate(request_count[1m])) - sum(rate(rate_limited_requests[1m])) - sum(rate(queue_drop_count[1m]))", "hide": false, - "includeNullMetadata": true, "instant": false, - "legendFormat": "Busy workers", + "legendFormat": "Accepted requests", "range": true, - "refId": "B", - "useBackend": false + "refId": "C" } ], - "title": "Workers", + "title": "Requests", "type": "timeseries" }, { @@ -295,20 +365,52 @@ }, { "color": "red", - "value": 15 + "value": 80 } ] - } + }, + "unit": "reqps" }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Rate limited" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Queue full" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + } + ] + } + ] }, "gridPos": { "h": 8, - "w": 6, - "x": 11, - "y": 0 + "w": 4, + "x": 13, + "y": 1 }, - "id": 3, + "id": 8, "options": { "legend": { "calcs": [], @@ -330,16 +432,29 @@ }, "disableTextWrap": false, 
"editorMode": "code", - "expr": "queue_size", + "expr": "rate(rate_limited_requests[1m])", "fullMetaSearch": false, "includeNullMetadata": true, - "legendFormat": "Queue size", + "legendFormat": "Rate limited", "range": true, "refId": "A", "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "editorMode": "code", + "expr": "rate(queue_drop_count[1m])", + "hide": false, + "instant": false, + "legendFormat": "Queue full", + "range": true, + "refId": "B" } ], - "title": "Queue", + "title": "Rejected requests", "type": "timeseries" }, { @@ -350,7 +465,7 @@ "fieldConfig": { "defaults": { "color": { - "mode": "palette-classic" + "mode": "continuous-YlRd" }, "custom": { "axisBorderShow": false, @@ -399,17 +514,17 @@ } ] }, - "unit": "s" + "unit": "reqpm" }, "overrides": [] }, "gridPos": { "h": 8, - "w": 7, + "w": 5, "x": 17, - "y": 0 + "y": 1 }, - "id": 11, + "id": 17, "options": { "legend": { "calcs": [], @@ -426,30 +541,17 @@ "targets": [ { "editorMode": "code", - "expr": "rate(request_latency_sum[1m]) / rate(request_latency_count[1m])", - "legendFormat": "Average request latency", + "expr": "rate(request_retries[1m])", + "legendFormat": "Retry rate", "range": true, "refId": "A", "datasource": { "type": "prometheus", "uid": "${DS_TX_PROVER}" } - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_TX_PROVER}" - }, - "editorMode": "code", - "expr": "rate(queue_latency_sum[1m]) / rate(queue_latency_count[1m])", - "hide": false, - "instant": false, - "legendFormat": "Average queue latency", - "range": true, - "refId": "B" } ], - "title": "Latency", + "title": "Request retry rate", "type": "timeseries" }, { @@ -457,7 +559,6 @@ "type": "prometheus", "uid": "${DS_TX_PROVER}" }, - "description": "Rate of requests dropped due to a full queue", "fieldConfig": { "defaults": { "color": { @@ -510,7 +611,7 @@ } ] }, - "unit": "reqps" + "unit": "percent" }, "overrides": [] }, @@ -518,9 +619,9 @@ "h": 8, "w": 8, "x": 0, - 
"y": 8 + "y": 9 }, - "id": 14, + "id": 13, "options": { "legend": { "calcs": [], @@ -537,8 +638,8 @@ "targets": [ { "editorMode": "code", - "expr": "rate(queue_drop_count[1m])", - "legendFormat": "Queue drop rate", + "expr": "(1 - rate(request_failure_count[1m]) / rate(request_count[1m])) * 100", + "legendFormat": "Success rate over time", "range": true, "refId": "A", "datasource": { @@ -547,7 +648,7 @@ } } ], - "title": "Queue drop rate", + "title": "Success rate", "type": "timeseries" }, { @@ -558,7 +659,6 @@ "fieldConfig": { "defaults": { "color": { - "fixedColor": "red", "mode": "palette-classic" }, "custom": { @@ -608,48 +708,17 @@ } ] }, - "unit": "reqpm" + "unit": "s" }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Total requests" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "green", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Failed requests" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "red", - "mode": "fixed" - } - } - ] - } - ] + "overrides": [] }, "gridPos": { "h": 8, - "w": 9, + "w": 7, "x": 8, - "y": 8 + "y": 9 }, - "id": 10, + "id": 11, "options": { "legend": { "calcs": [], @@ -664,34 +733,47 @@ }, "pluginVersion": "11.4.0", "targets": [ + { + "editorMode": "code", + "expr": "rate(request_latency_sum[1m]) / rate(request_latency_count[1m])", + "legendFormat": "Average request latency", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + } + }, { "datasource": { "type": "prometheus", "uid": "${DS_TX_PROVER}" }, "editorMode": "code", - "expr": "sum(rate(request_count[1m]))", + "expr": "rate(queue_latency_sum[1m]) / rate(queue_latency_count[1m])", "hide": false, "instant": false, - "legendFormat": "Total requests", + "legendFormat": "Average queue latency", "range": true, "refId": "B" - }, - { - "editorMode": "code", - "expr": "sum(rate(request_failure_count[1m]))", - "legendFormat": 
"Failed requests", - "range": true, - "refId": "A", - "datasource": { - "type": "prometheus", - "uid": "${DS_TX_PROVER}" - } } ], - "title": "Requests", + "title": "Latency", "type": "timeseries" }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 19, + "panels": [], + "title": "Workers", + "type": "row" + }, { "datasource": { "type": "prometheus", @@ -748,18 +830,37 @@ "value": 80 } ] - }, - "unit": "reqps" + } }, - "overrides": [] + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Unhealthy workers" + }, + "properties": [ + { + "id": "custom.axisColorMode", + "value": "text" + }, + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + } + ] }, "gridPos": { "h": 8, "w": 7, - "x": 17, - "y": 8 + "x": 0, + "y": 18 }, - "id": 12, + "id": 1, "options": { "legend": { "calcs": [], @@ -775,18 +876,54 @@ "pluginVersion": "11.4.0", "targets": [ { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "disableTextWrap": false, "editorMode": "code", - "expr": "rate(worker_request_count[1m])", - "legendFormat": "{{worker_id}}", + "expr": "worker_count", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Total workers", "range": true, - "refId": "A", + "refId": "C", + "useBackend": false + }, + { "datasource": { "type": "prometheus", "uid": "${DS_TX_PROVER}" - } + }, + "disableTextWrap": false, + "editorMode": "code", + "expr": "worker_busy", + "fullMetaSearch": false, + "hide": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "Busy workers", + "range": true, + "refId": "B", + "useBackend": false + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "editorMode": "code", + "expr": "worker_unhealthy", + "hide": false, + "instant": false, + "legendFormat": "Unhealthy workers", + "range": true, + "refId": "A" } ], - "title": "Requests per 
worker", + "title": "Workers", "type": "timeseries" }, { @@ -796,85 +933,66 @@ }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, "scaleDistribution": { "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" + } }, "overrides": [] }, "gridPos": { "h": 8, - "w": 8, - "x": 0, - "y": 16 + "w": 7, + "x": 7, + "y": 18 }, - "id": 13, + "id": 12, "options": { + "calculate": false, + "cellGap": 1, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "mode": "scheme", + "reverse": true, + "scale": "exponential", + "scheme": "BrBG", + "steps": 64 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "show": true + }, + "rowsFrame": { + "layout": "auto" }, "tooltip": { "mode": "single", - "sort": "none" + "showColorScale": false, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false } }, "pluginVersion": "11.4.0", "targets": [ { "editorMode": "code", - "expr": "(1 - rate(request_failure_count[1m]) / rate(request_count[1m])) * 100", - "legendFormat": "Success rate over time", + "expr": "rate(worker_request_count[1m])", + "legendFormat": "{{worker_id}}", "range": true, 
"refId": "A", "datasource": { @@ -883,8 +1001,21 @@ } } ], - "title": "Success rate", - "type": "timeseries" + "title": "Requests per worker", + "type": "heatmap" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 20, + "panels": [], + "title": "Queue", + "type": "row" }, { "datasource": { @@ -894,8 +1025,7 @@ "fieldConfig": { "defaults": { "color": { - "fixedColor": "red", - "mode": "fixed" + "mode": "palette-classic" }, "custom": { "axisBorderShow": false, @@ -903,7 +1033,6 @@ "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", - "axisSoftMax": 5, "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", @@ -941,21 +1070,20 @@ }, { "color": "red", - "value": "" + "value": 15 } ] - }, - "unit": "none" + } }, "overrides": [] }, "gridPos": { "h": 8, - "w": 7, - "x": 8, - "y": 16 + "w": 6, + "x": 0, + "y": 27 }, - "id": 15, + "id": 3, "options": { "legend": { "calcs": [], @@ -969,75 +1097,6 @@ } }, "pluginVersion": "11.4.0", - "targets": [ - { - "editorMode": "code", - "expr": "worker_unhealthy", - "legendFormat": "Number of unhealty workers", - "range": true, - "refId": "A", - "datasource": { - "type": "prometheus", - "uid": "${DS_TX_PROVER}" - } - } - ], - "title": "Unhealthy workers", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_TX_PROVER}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "continuous-RdYlGr" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "reqps" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 4, - "x": 15, - "y": 16 - }, - "id": 8, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - 
"showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.4.0", "targets": [ { "datasource": { @@ -1046,27 +1105,28 @@ }, "disableTextWrap": false, "editorMode": "code", - "expr": "rate(rate_limited_requests[1m])", + "expr": "queue_size", "fullMetaSearch": false, "includeNullMetadata": true, - "legendFormat": "__auto", + "legendFormat": "Queue size", "range": true, "refId": "A", "useBackend": false } ], - "title": "Rate-Limited Requests", - "type": "stat" + "title": "Queue", + "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_TX_PROVER}" }, + "description": "Rate of requests dropped due to a full queue", "fieldConfig": { "defaults": { "color": { - "mode": "continuous-YlRd" + "mode": "palette-classic" }, "custom": { "axisBorderShow": false, @@ -1115,17 +1175,17 @@ } ] }, - "unit": "reqpm" + "unit": "reqps" }, "overrides": [] }, "gridPos": { "h": 8, - "w": 5, - "x": 19, - "y": 16 + "w": 8, + "x": 6, + "y": 27 }, - "id": 17, + "id": 14, "options": { "legend": { "calcs": [], @@ -1142,8 +1202,8 @@ "targets": [ { "editorMode": "code", - "expr": "rate(request_retries[1m])", - "legendFormat": "__auto", + "expr": "rate(queue_drop_count[1m])", + "legendFormat": "Queue drop rate", "range": true, "refId": "A", "datasource": { @@ -1152,7 +1212,7 @@ } } ], - "title": "Request retry rate", + "title": "Queue drop rate", "type": "timeseries" } ], @@ -1163,13 +1223,13 @@ "list": [] }, "time": { - "from": "now-5m", - "to": "now" + "from": "2024-12-30T23:21:01.558Z", + "to": "2024-12-30T23:21:58.639Z" }, "timepicker": {}, "timezone": "browser", "title": "tx_prover", "uid": "be7bobzl5fr40f", - "version": 24, + "version": 28, "weekStart": "" } \ No newline at end of file From ce0892d259e6b8fbd6de30c4a4c4aa5e485dd087 Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Thu, 2 Jan 2025 20:27:14 -0300 Subject: [PATCH 33/37] review: update dashboard --- .../tx_prover_service_grafana_dashboard.json | 1440 ++++++++--------- 
1 file changed, 635 insertions(+), 805 deletions(-) diff --git a/bin/tx-prover/tx_prover_service_grafana_dashboard.json b/bin/tx-prover/tx_prover_service_grafana_dashboard.json index 9a634ed67..7816b1742 100644 --- a/bin/tx-prover/tx_prover_service_grafana_dashboard.json +++ b/bin/tx-prover/tx_prover_service_grafana_dashboard.json @@ -17,12 +17,6 @@ "name": "Grafana", "version": "11.4.0" }, - { - "type": "panel", - "id": "heatmap", - "name": "Heatmap", - "version": "" - }, { "type": "datasource", "id": "prometheus", @@ -31,8 +25,8 @@ }, { "type": "panel", - "id": "stat", - "name": "Stat", + "id": "table", + "name": "Table", "version": "" }, { @@ -65,7 +59,7 @@ "links": [], "panels": [ { - "collapsed": false, + "collapsed": true, "gridPos": { "h": 1, "w": 24, @@ -73,693 +67,634 @@ "y": 0 }, "id": 18, - "panels": [], - "title": "Requests", - "type": "row" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_TX_PROVER}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "requests" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 4, - "x": 0, - "y": 1 - }, - "id": 16, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.4.0", - "targets": [ + "panels": [ { - "editorMode": "code", - "expr": "request_count", - "legendFormat": "__auto", - "range": true, - "refId": "A", "datasource": { "type": "prometheus", "uid": "${DS_TX_PROVER}" - } - } - ], - "title": "Total requests handled", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_TX_PROVER}" - }, - "fieldConfig": { - 
"defaults": { - "color": { - "fixedColor": "red", - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "reqpm" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Total requests" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "blue", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Failed requests" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "red", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Accepted requests" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "green", - "mode": "fixed" + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + 
"showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" } - } - ] - } - ] - }, - "gridPos": { - "h": 8, - "w": 9, - "x": 4, - "y": 1 - }, - "id": 10, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_TX_PROVER}" - }, - "editorMode": "code", - "expr": "sum(rate(request_count[1m]))", - "hide": false, - "instant": false, - "legendFormat": "Total requests", - "range": true, - "refId": "B" - }, - { - "editorMode": "code", - "expr": "sum(rate(request_failure_count[1m]))", - "legendFormat": "Failed requests", - "range": true, - "refId": "A", - "datasource": { - "type": "prometheus", - "uid": "${DS_TX_PROVER}" - } - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_TX_PROVER}" - }, - "editorMode": "code", - "expr": "sum(rate(request_count[1m])) - sum(rate(rate_limited_requests[1m])) - sum(rate(queue_drop_count[1m]))", - "hide": false, - "instant": false, - "legendFormat": "Accepted requests", - "range": true, - "refId": "C" - } - ], - "title": "Requests", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_TX_PROVER}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": 
false, - "stacking": { - "group": "A", - "mode": "none" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "requests" }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ + "overrides": [ { - "color": "green", - "value": null + "matcher": { + "id": "byName", + "options": "Rate limited" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" + } + } + ] }, { - "color": "red", - "value": 80 + "matcher": { + "id": "byName", + "options": "Dropped by full queue" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + } + ] } - ] - }, - "unit": "reqps" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Rate limited" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "orange", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "Queue full" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "purple", - "mode": "fixed" - } - } - ] - } - ] - }, - "gridPos": { - "h": 8, - "w": 4, - "x": 13, - "y": 1 - }, - "id": 8, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_TX_PROVER}" - }, - "disableTextWrap": false, - "editorMode": "code", - "expr": "rate(rate_limited_requests[1m])", - "fullMetaSearch": false, - "includeNullMetadata": true, - "legendFormat": "Rate limited", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_TX_PROVER}" - }, - "editorMode": "code", - "expr": "rate(queue_drop_count[1m])", - "hide": false, - "instant": 
false, - "legendFormat": "Queue full", - "range": true, - "refId": "B" - } - ], - "title": "Rejected requests", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_TX_PROVER}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "continuous-YlRd" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "reqpm" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 5, - "x": 17, - "y": 1 - }, - "id": 17, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "11.4.0", - "targets": [ - { - "editorMode": "code", - "expr": "rate(request_retries[1m])", - "legendFormat": "Retry rate", - "range": true, - "refId": "A", - "datasource": { - "type": "prometheus", - "uid": "${DS_TX_PROVER}" - } - } - ], - "title": "Request retry rate", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_TX_PROVER}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": 
"auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 9 - }, - "id": 13, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "11.4.0", - "targets": [ - { - "editorMode": "code", - "expr": "(1 - rate(request_failure_count[1m]) / rate(request_count[1m])) * 100", - "legendFormat": "Success rate over time", - "range": true, - "refId": "A", - "datasource": { - "type": "prometheus", - "uid": "${DS_TX_PROVER}" - } - } - ], - "title": "Success rate", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_TX_PROVER}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false + ] + }, + "gridPos": { + "h": 8, + "w": 9, + "x": 0, + "y": 1 + }, + "id": 16, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": 
true }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "rate(request_count[1m])", + "legendFormat": "Total requests", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + } }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "editorMode": "code", + "expr": "rate(rate_limited_requests[1m])", + "hide": false, + "instant": false, + "legendFormat": "Rate limited", + "range": true, + "refId": "B" }, - "thresholdsStyle": { - "mode": "off" + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "editorMode": "code", + "expr": "rate(queue_drop_count[1m])", + "hide": false, + "instant": false, + "legendFormat": "Dropped by full queue", + "range": true, + "refId": "C" } + ], + "title": "Total requests handled", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "red", + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, 
+ "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqpm" + }, + "overrides": [ { - "color": "green", - "value": null + "matcher": { + "id": "byName", + "options": "Total requests" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] }, { - "color": "red", - "value": 80 + "matcher": { + "id": "byName", + "options": "Failed requests" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Accepted requests" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] } ] }, - "unit": "s" + "gridPos": { + "h": 8, + "w": 9, + "x": 9, + "y": 1 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "editorMode": "code", + "expr": "sum(rate(request_count[1m]))", + "hide": false, + "instant": false, + "legendFormat": "Total requests", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "editorMode": "code", + "expr": "sum(rate(request_count[1m])) - sum(rate(rate_limited_requests[1m])) - sum(rate(queue_drop_count[1m]))", + "hide": false, + "instant": false, + "legendFormat": "Accepted requests", + "range": true, + "refId": "C" + }, + { + "editorMode": "code", + "expr": "sum(rate(request_failure_count[1m]))", + "legendFormat": "Failed requests", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + } + } + ], + 
"title": "Requests", + "type": "timeseries" }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 7, - "x": 8, - "y": 9 - }, - "id": 11, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlRd" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqpm" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 5, + "x": 18, + "y": 1 + }, + "id": 17, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "rate(request_retries[1m])", + "legendFormat": "Retry rate", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + } + } + ], + "title": "Request retry rate", + "type": "timeseries" }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "11.4.0", - "targets": [ { - "editorMode": "code", - "expr": "rate(request_latency_sum[1m]) / 
rate(request_latency_count[1m])", - "legendFormat": "Average request latency", - "range": true, - "refId": "A", "datasource": { "type": "prometheus", "uid": "${DS_TX_PROVER}" - } + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 25 + }, + "id": 13, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "(1 - rate(request_failure_count[1m]) / rate(request_count[1m])) * 100", + "legendFormat": "Success rate over time", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + } + } + ], + "title": "Success rate", + "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "${DS_TX_PROVER}" }, - "editorMode": "code", - "expr": "rate(queue_latency_sum[1m]) / rate(queue_latency_count[1m])", - "hide": false, - "instant": false, - "legendFormat": "Average queue latency", - "range": true, - "refId": "B" + 
"fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 7, + "x": 8, + "y": 25 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "rate(request_latency_sum[1m]) / rate(request_latency_count[1m])", + "legendFormat": "Average request latency", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + } + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "editorMode": "code", + "expr": "rate(queue_latency_sum[1m]) / rate(queue_latency_count[1m])", + "hide": false, + "instant": false, + "legendFormat": "Average queue latency", + "range": true, + "refId": "B" + } + ], + "title": "Latency", + "type": "timeseries" } ], - "title": "Latency", - "type": "timeseries" + "title": "Requests", + "type": "row" }, { "collapsed": false, @@ -767,7 +702,7 @@ "h": 1, "w": 24, "x": 0, - "y": 17 + "y": 1 }, "id": 19, 
"panels": [], @@ -858,7 +793,7 @@ "h": 8, "w": 7, "x": 0, - "y": 18 + "y": 2 }, "id": 1, "options": { @@ -933,15 +868,29 @@ }, "fieldConfig": { "defaults": { + "color": { + "mode": "thresholds" + }, "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false + "align": "auto", + "cellOptions": { + "type": "auto" }, - "scaleDistribution": { - "type": "linear" - } + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] } }, "overrides": [] @@ -950,42 +899,21 @@ "h": 8, "w": 7, "x": 7, - "y": 18 + "y": 2 }, "id": 12, "options": { - "calculate": false, - "cellGap": 1, - "color": { - "exponent": 0.5, - "fill": "dark-orange", - "mode": "scheme", - "reverse": true, - "scale": "exponential", - "scheme": "BrBG", - "steps": 64 - }, - "exemplars": { - "color": "rgba(255,0,255,0.7)" - }, - "filterValues": { - "le": 1e-9 - }, - "legend": { - "show": true - }, - "rowsFrame": { - "layout": "auto" - }, - "tooltip": { - "mode": "single", - "showColorScale": false, - "yHistogram": false + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false }, - "yAxis": { - "axisPlacement": "left", - "reverse": false - } + "frameIndex": 0, + "showHeader": true }, "pluginVersion": "11.4.0", "targets": [ @@ -1002,7 +930,7 @@ } ], "title": "Requests per worker", - "type": "heatmap" + "type": "table" }, { "collapsed": false, @@ -1010,7 +938,7 @@ "h": 1, "w": 24, "x": 0, - "y": 26 + "y": 10 }, "id": 20, "panels": [], @@ -1081,7 +1009,7 @@ "h": 8, "w": 6, "x": 0, - "y": 27 + "y": 11 }, "id": 3, "options": { @@ -1114,105 +1042,7 @@ "useBackend": false } ], - "title": "Queue", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_TX_PROVER}" - }, - "description": "Rate of requests dropped due to a full queue", - "fieldConfig": { - "defaults": { - "color": { - 
"mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "reqps" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 6, - "y": 27 - }, - "id": 14, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "11.4.0", - "targets": [ - { - "editorMode": "code", - "expr": "rate(queue_drop_count[1m])", - "legendFormat": "Queue drop rate", - "range": true, - "refId": "A", - "datasource": { - "type": "prometheus", - "uid": "${DS_TX_PROVER}" - } - } - ], - "title": "Queue drop rate", + "title": "Queue size", "type": "timeseries" } ], @@ -1223,13 +1053,13 @@ "list": [] }, "time": { - "from": "2024-12-30T23:21:01.558Z", - "to": "2024-12-30T23:21:58.639Z" + "from": "now-3h", + "to": "now" }, "timepicker": {}, "timezone": "browser", "title": "tx_prover", "uid": "be7bobzl5fr40f", - "version": 28, + "version": 36, "weekStart": "" } \ No newline at end of file From 07c999e5b83eea17a211da5d3b4372f308d0d7ef Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Thu, 2 Jan 2025 21:33:56 -0300 Subject: [PATCH 34/37] review: move metric update to create response function 
--- bin/tx-prover/src/main.rs | 2 +- bin/tx-prover/src/proxy/mod.rs | 5 ++--- bin/tx-prover/src/utils.rs | 6 ++++++ 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/bin/tx-prover/src/main.rs b/bin/tx-prover/src/main.rs index a9ba6a088..1bbb3592e 100644 --- a/bin/tx-prover/src/main.rs +++ b/bin/tx-prover/src/main.rs @@ -1,6 +1,6 @@ pub mod api; pub mod commands; -mod proxy; +pub mod proxy; mod utils; use commands::Cli; use utils::setup_tracing; diff --git a/bin/tx-prover/src/proxy/mod.rs b/bin/tx-prover/src/proxy/mod.rs index df0bb0e60..0d57e00d2 100644 --- a/bin/tx-prover/src/proxy/mod.rs +++ b/bin/tx-prover/src/proxy/mod.rs @@ -9,7 +9,7 @@ use std::{ use async_trait::async_trait; use bytes::Bytes; use metrics::{ - QUEUE_DROP_COUNT, QUEUE_LATENCY, QUEUE_SIZE, RATE_LIMITED_REQUESTS, RATE_LIMIT_VIOLATIONS, + QUEUE_LATENCY, QUEUE_SIZE, RATE_LIMITED_REQUESTS, RATE_LIMIT_VIOLATIONS, REQUEST_COUNT, REQUEST_FAILURE_COUNT, REQUEST_LATENCY, REQUEST_RETRIES, WORKER_BUSY, WORKER_COUNT, WORKER_REQUEST_COUNT, WORKER_UNHEALTHY, }; @@ -42,7 +42,7 @@ use crate::{ }, }; -mod metrics; +pub mod metrics; mod worker; /// Localhost address @@ -462,7 +462,6 @@ impl ProxyHttp for LoadBalancer { // Check if the queue is full if queue_len >= self.0.max_queue_items { - QUEUE_DROP_COUNT.inc(); return create_queue_full_response(session).await; } diff --git a/bin/tx-prover/src/utils.rs b/bin/tx-prover/src/utils.rs index 5e6c3e2bc..815801093 100644 --- a/bin/tx-prover/src/utils.rs +++ b/bin/tx-prover/src/utils.rs @@ -16,6 +16,8 @@ use tonic::transport::Channel; use tonic_health::pb::health_client::HealthClient; use tracing_subscriber::{layer::SubscriberExt, Registry}; +use crate::proxy::metrics::QUEUE_DROP_COUNT; + pub const MIDEN_TX_PROVER: &str = "miden-tx-prover"; const RESOURCE_EXHAUSTED_CODE: u16 = 8; @@ -110,6 +112,10 @@ pub(crate) async fn create_queue_full_response( error.set_cause("Too many requests in the queue"); session.write_response_header(Box::new(header), 
false).await?; + + // Increment the queue drop count metric + QUEUE_DROP_COUNT.inc(); + Err(error) } From 5295fd22ca8fb0eabe7f51e22dc8db0533318984 Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Thu, 2 Jan 2025 21:36:58 -0300 Subject: [PATCH 35/37] chore: address lint errors --- bin/tx-prover/src/proxy/mod.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/bin/tx-prover/src/proxy/mod.rs b/bin/tx-prover/src/proxy/mod.rs index 0d57e00d2..3fe749672 100644 --- a/bin/tx-prover/src/proxy/mod.rs +++ b/bin/tx-prover/src/proxy/mod.rs @@ -9,9 +9,9 @@ use std::{ use async_trait::async_trait; use bytes::Bytes; use metrics::{ - QUEUE_LATENCY, QUEUE_SIZE, RATE_LIMITED_REQUESTS, RATE_LIMIT_VIOLATIONS, - REQUEST_COUNT, REQUEST_FAILURE_COUNT, REQUEST_LATENCY, REQUEST_RETRIES, WORKER_BUSY, - WORKER_COUNT, WORKER_REQUEST_COUNT, WORKER_UNHEALTHY, + QUEUE_LATENCY, QUEUE_SIZE, RATE_LIMITED_REQUESTS, RATE_LIMIT_VIOLATIONS, REQUEST_COUNT, + REQUEST_FAILURE_COUNT, REQUEST_LATENCY, REQUEST_RETRIES, WORKER_BUSY, WORKER_COUNT, + WORKER_REQUEST_COUNT, WORKER_UNHEALTHY, }; use once_cell::sync::Lazy; use pingora::{ @@ -284,12 +284,14 @@ pub struct RequestQueue { impl RequestQueue { /// Create a new empty request queue + #[allow(clippy::new_without_default)] pub fn new() -> Self { QUEUE_SIZE.set(0); Self { queue: RwLock::new(VecDeque::new()) } } /// Get the length of the queue + #[allow(clippy::len_without_is_empty)] pub async fn len(&self) -> usize { self.queue.read().await.len() } From 61b7074372b1e7aae99e09cf10b34cb21e3d00a0 Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Fri, 3 Jan 2025 09:32:29 -0300 Subject: [PATCH 36/37] review: rename grafana dashboard file --- ...over_service_grafana_dashboard.json => grafana_dashboard.json} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename bin/tx-prover/{tx_prover_service_grafana_dashboard.json => grafana_dashboard.json} (100%) diff --git a/bin/tx-prover/tx_prover_service_grafana_dashboard.json 
b/bin/tx-prover/grafana_dashboard.json similarity index 100% rename from bin/tx-prover/tx_prover_service_grafana_dashboard.json rename to bin/tx-prover/grafana_dashboard.json From d9c1d4043a624756491cca8589f034d440216a2f Mon Sep 17 00:00:00 2001 From: SantiagoPittella Date: Fri, 3 Jan 2025 09:39:48 -0300 Subject: [PATCH 37/37] review: update dashboard --- bin/tx-prover/grafana_dashboard.json | 1303 ++++++++++++++------------ 1 file changed, 678 insertions(+), 625 deletions(-) diff --git a/bin/tx-prover/grafana_dashboard.json b/bin/tx-prover/grafana_dashboard.json index 7816b1742..ba6791a3a 100644 --- a/bin/tx-prover/grafana_dashboard.json +++ b/bin/tx-prover/grafana_dashboard.json @@ -59,7 +59,7 @@ "links": [], "panels": [ { - "collapsed": true, + "collapsed": false, "gridPos": { "h": 1, "w": 24, @@ -67,634 +67,622 @@ "y": 0 }, "id": 18, - "panels": [ + "panels": [], + "title": "Requests", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "red", + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqpm" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Total 
requests" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Failed requests" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "red", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Accepted requests" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "green", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 9, + "x": 0, + "y": 1 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ { "datasource": { "type": "prometheus", "uid": "${DS_TX_PROVER}" }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" + "editorMode": "code", + "expr": "sum(rate(request_count[1m]))", + "hide": false, + "instant": false, + "legendFormat": "Total requests", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "editorMode": "code", + "expr": "sum(rate(request_count[1m])) - sum(rate(rate_limited_requests[1m])) - sum(rate(queue_drop_count[1m]))", + "hide": false, + "instant": false, + 
"legendFormat": "Accepted requests", + "range": true, + "refId": "C" + }, + { + "editorMode": "code", + "expr": "sum(rate(request_failure_count[1m]))", + "legendFormat": "Failed requests", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + } + } + ], + "title": "Requests", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "reqpm" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Rate limited requests" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "orange", + "mode": "fixed" } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "requests" + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "Queue overflow requests" }, - "overrides": [ + "properties": [ { - "matcher": { - "id": "byName", - "options": "Rate limited" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "orange", - "mode": "fixed" - } - } - ] + "id": "color", + "value": { + "fixedColor": "purple", + "mode": 
"fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 9, + "x": 9, + "y": 1 + }, + "id": 16, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "editorMode": "code", + "expr": "rate(rate_limited_requests[1m])", + "hide": false, + "instant": false, + "legendFormat": "Rate limited requests", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "editorMode": "code", + "expr": "rate(queue_drop_count[1m])", + "hide": false, + "instant": false, + "legendFormat": "Queue overflow requests", + "range": true, + "refId": "C" + } + ], + "title": "Rejected requests", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlRd" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null }, { - "matcher": { - "id": "byName", - "options": "Dropped by full queue" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "purple", - "mode": "fixed" - } - } - ] 
+ "color": "red", + "value": 80 } ] }, - "gridPos": { - "h": 8, - "w": 9, - "x": 0, - "y": 1 + "unit": "reqpm" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 5, + "x": 18, + "y": 1 + }, + "id": 17, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "rate(request_retries[1m])", + "legendFormat": "Retry rate", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + } + } + ], + "title": "Request retry rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" }, - "id": 16, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" }, - "tooltip": { - "mode": "single", - "sort": "none" + "thresholdsStyle": { + "mode": "off" } }, - "pluginVersion": "11.4.0", - "targets": [ - { - "editorMode": "code", - "expr": "rate(request_count[1m])", - "legendFormat": "Total requests", - "range": true, - "refId": "A", - "datasource": { - "type": "prometheus", - "uid": "${DS_TX_PROVER}" - } + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + 
"color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 9 + }, + "id": 13, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "editorMode": "code", + "expr": "(1 - rate(request_failure_count[1m]) / rate(request_count[1m])) * 100", + "legendFormat": "Success rate over time", + "range": true, + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + } + } + ], + "title": "Success rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_TX_PROVER}" - }, - "editorMode": "code", - "expr": "rate(rate_limited_requests[1m])", - "hide": false, - "instant": false, - "legendFormat": "Rate limited", - "range": true, - "refId": "B" + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_TX_PROVER}" - }, - "editorMode": "code", - "expr": "rate(queue_drop_count[1m])", - "hide": false, - "instant": false, - "legendFormat": "Dropped by full queue", - "range": true, - "refId": "C" + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + 
"thresholdsStyle": { + "mode": "off" } - ], - "title": "Total requests handled", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_TX_PROVER}" }, - "fieldConfig": { - "defaults": { - "color": { - "fixedColor": "red", - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "reqpm" - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Total requests" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "blue", - "mode": "fixed" - } - } - ] - }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ { - "matcher": { - "id": "byName", - "options": "Failed requests" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "red", - "mode": "fixed" - } - } - ] + "color": "green", + "value": null }, { - "matcher": { - "id": "byName", - "options": "Accepted requests" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "green", - "mode": "fixed" - } - } - ] + "color": "red", + "value": 80 } ] }, - "gridPos": { - "h": 8, - "w": 9, - "x": 9, - "y": 1 - }, - "id": 10, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - 
"tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "11.4.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${DS_TX_PROVER}" - }, - "editorMode": "code", - "expr": "sum(rate(request_count[1m]))", - "hide": false, - "instant": false, - "legendFormat": "Total requests", - "range": true, - "refId": "B" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_TX_PROVER}" - }, - "editorMode": "code", - "expr": "sum(rate(request_count[1m])) - sum(rate(rate_limited_requests[1m])) - sum(rate(queue_drop_count[1m]))", - "hide": false, - "instant": false, - "legendFormat": "Accepted requests", - "range": true, - "refId": "C" - }, - { - "editorMode": "code", - "expr": "sum(rate(request_failure_count[1m]))", - "legendFormat": "Failed requests", - "range": true, - "refId": "A", - "datasource": { - "type": "prometheus", - "uid": "${DS_TX_PROVER}" - } - } - ], - "title": "Requests", - "type": "timeseries" + "unit": "s" }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_TX_PROVER}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "continuous-YlRd" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "reqpm" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - 
"w": 5, - "x": 18, - "y": 1 - }, - "id": 17, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "11.4.0", - "targets": [ - { - "editorMode": "code", - "expr": "rate(request_retries[1m])", - "legendFormat": "Retry rate", - "range": true, - "refId": "A", - "datasource": { - "type": "prometheus", - "uid": "${DS_TX_PROVER}" - } - } - ], - "title": "Request retry rate", - "type": "timeseries" + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 7, + "x": 8, + "y": 9 + }, + "id": 11, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ { + "editorMode": "code", + "expr": "rate(request_latency_sum[1m]) / rate(request_latency_count[1m])", + "legendFormat": "Average request latency", + "range": true, + "refId": "A", "datasource": { "type": "prometheus", "uid": "${DS_TX_PROVER}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "percent" - }, 
- "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 25 - }, - "id": 13, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "11.4.0", - "targets": [ - { - "editorMode": "code", - "expr": "(1 - rate(request_failure_count[1m]) / rate(request_count[1m])) * 100", - "legendFormat": "Success rate over time", - "range": true, - "refId": "A", - "datasource": { - "type": "prometheus", - "uid": "${DS_TX_PROVER}" - } - } - ], - "title": "Success rate", - "type": "timeseries" + } }, { "datasource": { "type": "prometheus", "uid": "${DS_TX_PROVER}" }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "barWidthFactor": 0.6, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - }, - "unit": "s" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 7, - "x": 8, - "y": 25 - }, - "id": 11, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "11.4.0", - "targets": [ - { - "editorMode": "code", - "expr": "rate(request_latency_sum[1m]) / 
rate(request_latency_count[1m])", - "legendFormat": "Average request latency", - "range": true, - "refId": "A", - "datasource": { - "type": "prometheus", - "uid": "${DS_TX_PROVER}" - } - }, - { - "datasource": { - "type": "prometheus", - "uid": "${DS_TX_PROVER}" - }, - "editorMode": "code", - "expr": "rate(queue_latency_sum[1m]) / rate(queue_latency_count[1m])", - "hide": false, - "instant": false, - "legendFormat": "Average queue latency", - "range": true, - "refId": "B" - } - ], - "title": "Latency", - "type": "timeseries" + "editorMode": "code", + "expr": "rate(queue_latency_sum[1m]) / rate(queue_latency_count[1m])", + "hide": false, + "instant": false, + "legendFormat": "Average queue latency", + "range": true, + "refId": "B" } ], - "title": "Requests", - "type": "row" + "title": "Latency", + "type": "timeseries" }, { "collapsed": false, @@ -702,7 +690,7 @@ "h": 1, "w": 24, "x": 0, - "y": 1 + "y": 17 }, "id": 19, "panels": [], @@ -767,33 +755,13 @@ ] } }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "Unhealthy workers" - }, - "properties": [ - { - "id": "custom.axisColorMode", - "value": "text" - }, - { - "id": "color", - "value": { - "fixedColor": "red", - "mode": "fixed" - } - } - ] - } - ] + "overrides": [] }, "gridPos": { "h": 8, "w": 7, "x": 0, - "y": 2 + "y": 18 }, "id": 1, "options": { @@ -843,22 +811,107 @@ "range": true, "refId": "B", "useBackend": false + } + ], + "title": "Workers", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "red", + "mode": "fixed" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMax": 10, + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + 
"insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 7, + "x": 7, + "y": 18 + }, + "id": 21, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "${DS_TX_PROVER}" - }, "editorMode": "code", "expr": "worker_unhealthy", - "hide": false, - "instant": false, "legendFormat": "Unhealthy workers", "range": true, - "refId": "A" + "refId": "A", + "datasource": { + "type": "prometheus", + "uid": "${DS_TX_PROVER}" + } } ], - "title": "Workers", + "title": "Unhealthy workers", "type": "timeseries" }, { @@ -898,8 +951,8 @@ "gridPos": { "h": 8, "w": 7, - "x": 7, - "y": 2 + "x": 14, + "y": 18 }, "id": 12, "options": { @@ -938,7 +991,7 @@ "h": 1, "w": 24, "x": 0, - "y": 10 + "y": 26 }, "id": 20, "panels": [], @@ -1009,7 +1062,7 @@ "h": 8, "w": 6, "x": 0, - "y": 11 + "y": 27 }, "id": 3, "options": { @@ -1053,13 +1106,13 @@ "list": [] }, "time": { - "from": "now-3h", + "from": "now-2d", "to": "now" }, "timepicker": {}, "timezone": "browser", "title": "tx_prover", "uid": "be7bobzl5fr40f", - "version": 36, + "version": 40, "weekStart": "" } \ No newline at end of file