Skip to content

Commit e21069d

Browse files
authored
feat: metrics via terraform (#10594)
Also, fix the prod deployment so that it uses the correct service endpoints for the "scalable" Loki config. I have run the metrics deployment workflow, and the Loki datasource now works as expected: <img width="1603" alt="Screenshot 2024-12-10 at 14 11 17" src="https://github.com/user-attachments/assets/2ca0adb5-1dd0-480b-a16c-39e622e922c3"> fix #10191, fix #10439
1 parent 9eaa527 commit e21069d

File tree

9 files changed

+286
-157
lines changed

9 files changed

+286
-157
lines changed

.github/workflows/metrics-deploy.yml

+143
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
name: Aztec Metrics Stack Deployment
2+
3+
on:
4+
workflow_call:
5+
inputs:
6+
namespace:
7+
description: The namespace to deploy to, e.g. metrics
8+
required: true
9+
type: string
10+
default: metrics
11+
values_file:
12+
description: The values file to use, e.g. prod.yaml
13+
required: true
14+
type: string
15+
default: "prod.yaml"
16+
respect_tf_lock:
17+
description: Whether to respect the Terraform lock
18+
required: false
19+
type: string
20+
default: "true"
21+
run_terraform_destroy:
22+
description: Whether to run terraform destroy before deploying
23+
required: false
24+
type: string
25+
default: "false"
26+
ref:
27+
description: The branch name to deploy from
28+
required: false
29+
type: string
30+
default: "master"
31+
secrets:
32+
GCP_SA_KEY:
33+
required: true
34+
workflow_dispatch:
35+
inputs:
36+
namespace:
37+
description: The namespace to deploy to, e.g. metrics
38+
required: true
39+
default: metrics
40+
values_file:
41+
description: The values file to use, e.g. prod.yaml
42+
required: true
43+
default: "prod.yaml"
44+
respect_tf_lock:
45+
description: Whether to respect the Terraform lock
46+
required: false
47+
default: "true"
48+
run_terraform_destroy:
49+
description: Whether to run terraform destroy before deploying
50+
required: false
51+
default: "false"
52+
ref:
53+
description: The branch name to deploy from
54+
required: false
55+
default: "master"
56+
57+
jobs:
58+
metrics_deployment:
59+
# This job will run on Ubuntu
60+
runs-on: ubuntu-latest
61+
concurrency:
62+
group: deploy-${{ github.ref }} # Only one job per branch
63+
cancel-in-progress: false # Allow previous deployment to complete to avoid corruption
64+
65+
# Set up environment variables from the workflow inputs and fixed cluster settings
66+
env:
67+
NAMESPACE: ${{ inputs.namespace }}
68+
VALUES_FILE: ${{ inputs.values_file }}
69+
CHART_PATH: ./spartan/metrics
70+
CLUSTER_NAME: aztec-gke
71+
REGION: us-west1-a
72+
TF_STATE_BUCKET: aztec-terraform
73+
GKE_CLUSTER_CONTEXT: gke_testnet-440309_us-west1-a_aztec-gke
74+
75+
steps:
76+
- name: Checkout code
77+
uses: actions/checkout@v3
78+
with:
79+
ref: ${{ inputs.ref }}
80+
81+
- name: Authenticate to Google Cloud
82+
uses: google-github-actions/auth@v2
83+
with:
84+
credentials_json: ${{ secrets.GCP_SA_KEY }}
85+
86+
- name: Set up Cloud SDK
87+
uses: google-github-actions/setup-gcloud@v2
88+
89+
- name: Install GKE Auth Plugin
90+
run: |
91+
gcloud components install gke-gcloud-auth-plugin --quiet
92+
93+
- name: Configure kubectl with GKE cluster
94+
run: |
95+
gcloud container clusters get-credentials ${{ env.CLUSTER_NAME }} --region ${{ env.REGION }}
96+
97+
- name: Ensure Terraform state bucket exists
98+
run: |
99+
if ! gsutil ls gs://${{ env.TF_STATE_BUCKET }} >/dev/null 2>&1; then
100+
echo "Creating GCS bucket for Terraform state..."
101+
gsutil mb -l us-east4 gs://${{ env.TF_STATE_BUCKET }}
102+
gsutil versioning set on gs://${{ env.TF_STATE_BUCKET }}
103+
else
104+
echo "Terraform state bucket already exists"
105+
fi
106+
107+
- name: Setup Terraform
108+
uses: hashicorp/setup-terraform@v2
109+
with:
110+
terraform_version: "1.5.0" # Specify your desired version
111+
112+
- name: Terraform Init
113+
working-directory: ./spartan/terraform/deploy-metrics
114+
run: |
115+
terraform init \
116+
-backend-config="bucket=${{ env.TF_STATE_BUCKET }}" \
117+
-backend-config="prefix=metrics-deploy/${{ env.REGION }}/${{ env.CLUSTER_NAME }}/${{ env.NAMESPACE }}/terraform.tfstate"
118+
119+
- name: Terraform Destroy
120+
working-directory: ./spartan/terraform/deploy-metrics
121+
if: ${{ inputs.run_terraform_destroy == 'true' }}
122+
# Destroy fails if the resources are already destroyed, so we continue on error
123+
continue-on-error: true
124+
run: |
125+
terraform destroy -auto-approve \
126+
-var="RELEASE_NAME=${{ env.NAMESPACE }}" \
127+
-var="VALUES_FILE=${{ env.VALUES_FILE }}" \
128+
-var="GKE_CLUSTER_CONTEXT=${{ env.GKE_CLUSTER_CONTEXT }}" \
129+
-lock=${{ inputs.respect_tf_lock }}
130+
131+
- name: Terraform Plan
132+
working-directory: ./spartan/terraform/deploy-metrics
133+
run: |
134+
terraform plan \
135+
-var="RELEASE_NAME=${{ env.NAMESPACE }}" \
136+
-var="VALUES_FILE=${{ env.VALUES_FILE }}" \
137+
-var="GKE_CLUSTER_CONTEXT=${{ env.GKE_CLUSTER_CONTEXT }}" \
138+
-out=tfplan \
139+
-lock=${{ inputs.respect_tf_lock }}
140+
141+
- name: Terraform Apply
142+
working-directory: ./spartan/terraform/deploy-metrics
143+
run: terraform apply -lock=${{ inputs.respect_tf_lock }} -auto-approve tfplan

.github/workflows/metrics-deploys.yml

-129
This file was deleted.

spartan/metrics/values.yaml

+1-15
Original file line numberDiff line numberDiff line change
@@ -31,20 +31,6 @@ opentelemetry-collector:
3131
kubernetesAttributes:
3232
enabled: true
3333
config:
34-
exporters:
35-
# debug:
36-
# verbosity: detailed
37-
otlphttp/logs:
38-
endpoint: http://metrics-loki.metrics:3100/otlp
39-
otlp/tempo:
40-
endpoint: http://metrics-tempo.metrics:4317
41-
tls:
42-
insecure: true
43-
prometheus:
44-
endpoint: ${env:MY_POD_IP}:8889
45-
metric_expiration: 5m
46-
resource_to_telemetry_conversion:
47-
enabled: true
4834
extensions:
4935
health_check:
5036
endpoint: ${env:MY_POD_IP}:13133
@@ -91,7 +77,7 @@ opentelemetry-collector:
9177
# - debug
9278

9379
# Enable and configure the Loki subchart
94-
# https://artifacthub.io/packages/helm/grafana/loki-simple-scalable
80+
# https://artifacthub.io/packages/helm/grafana/loki
9581
# loki:
9682
# Nothing set here, because we need to use values from the values directory;
9783
# otherwise, things don't get overridden correctly.

spartan/metrics/values/kind.yaml

+34
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,20 @@
1+
opentelemetry-collector:
2+
config:
3+
exporters:
4+
# debug:
5+
# verbosity: detailed
6+
otlphttp/logs:
7+
endpoint: http://metrics-loki.metrics:3100/otlp
8+
otlp/tempo:
9+
endpoint: http://metrics-tempo.metrics:4317
10+
tls:
11+
insecure: true
12+
prometheus:
13+
endpoint: ${env:MY_POD_IP}:8889
14+
metric_expiration: 5m
15+
resource_to_telemetry_conversion:
16+
enabled: true
17+
118
loki:
219
deploymentMode: SingleBinary
320
loki:
@@ -23,3 +40,20 @@ loki:
2340
replicas: 0
2441
write:
2542
replicas: 0
43+
44+
grafana:
45+
datasources:
46+
datasources.yaml:
47+
apiVersion: 1
48+
datasources:
49+
- name: Loki
50+
type: loki
51+
url: http://metrics-loki.metrics:3100
52+
- name: Tempo
53+
type: tempo
54+
url: http://metrics-tempo.metrics:3100
55+
- name: Prometheus
56+
type: prometheus
57+
uid: spartan-metrics-prometheus
58+
isDefault: true
59+
url: http://metrics-prometheus-server.metrics:80

0 commit comments

Comments
 (0)