Skip to content

Commit

Permalink
Merge pull request #587 from dashpay/fix/alarm-dimensions
Browse files Browse the repository at this point in the history
fix: alarm dimensions
  • Loading branch information
strophy authored Oct 18, 2023
2 parents 1fc9066 + 232fb52 commit f20461d
Show file tree
Hide file tree
Showing 4 changed files with 85 additions and 117 deletions.
1 change: 0 additions & 1 deletion ansible/deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,6 @@
- eternal_terminal
- cwagent


- name: Configure tcpdump
hosts: all
become: true
Expand Down
51 changes: 0 additions & 51 deletions ansible/files/cloudwatch.json

This file was deleted.

33 changes: 32 additions & 1 deletion ansible/roles/cwagent/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,35 @@
name: christiangda.amazon_cloudwatch_agent
vars:
cwa_agent_mode: "ec2"
cwa_conf_json_file_content: "{{ lookup('file', 'cloudwatch.json') | from_json }}"
cwa_conf_json_file_content:
agent:
metrics_collection_interval: 300
logfile: "/opt/aws/amazon-cloudwatch-agent/logs/amazon-cloudwatch-agent.log"
debug: false
logs:
logs_collected:
files:
collect_list:
- file_path: "/var/log/message"
log_group_name: "/var/log/message"
log_stream_name: "{instance_id}"
metrics:
append_dimensions:
AutoScalingGroupName: "${aws:AutoScalingGroupName}"
ImageId: "${aws:ImageId}"
InstanceId: "${aws:InstanceId}"
InstanceType: "${aws:InstanceType}"
metrics_collected:
disk:
measurement:
- used_percent
metrics_collection_interval: 300
resources: "{{ ['/'] + (['/dash/elastic/data'] if inventory_hostname in groups['logs_nodes'] else []) }}"
mem:
measurement:
- mem_used_percent
metrics_collection_interval: 300
swap:
measurement:
- swap_used_percent
metrics_collection_interval: 300
117 changes: 53 additions & 64 deletions terraform/aws/monitoring.tf
Original file line number Diff line number Diff line change
@@ -1,125 +1,114 @@
# Locals describing every EC2 instance that the CloudWatch alarm resources iterate over.
# NOTE(review): this text is a rendered diff without +/- markers. instance_ids and
# instance_hostnames look like the pre-change form and instance_data like the
# post-change form (the alarm resources reference both) — confirm against the commit.
locals {
# Flat list of instance IDs, concatenated group by group in a fixed order.
instance_ids = concat(
aws_instance.web.*.id,
aws_instance.dashd_wallet.*.id,
aws_instance.seed_node.*.id,
aws_instance.miner.*.id,
aws_instance.masternode_amd.*.id,
aws_instance.masternode_arm.*.id,
aws_instance.hp_masternode_amd.*.id,
aws_instance.hp_masternode_arm.*.id,
aws_instance.vpn.*.id,
aws_instance.mixer.*.id,
aws_instance.logs.*.id,
)
# Hostname tags concatenated in the same group order as instance_ids, so a given
# index refers to the same instance in both lists.
instance_hostnames = concat(
aws_instance.web.*.tags.Hostname,
aws_instance.dashd_wallet.*.tags.Hostname,
aws_instance.seed_node.*.tags.Hostname,
aws_instance.miner.*.tags.Hostname,
aws_instance.masternode_amd.*.tags.Hostname,
aws_instance.masternode_arm.*.tags.Hostname,
aws_instance.hp_masternode_amd.*.tags.Hostname,
aws_instance.hp_masternode_arm.*.tags.Hostname,
aws_instance.vpn.*.tags.Hostname,
aws_instance.mixer.*.tags.Hostname,
aws_instance.logs.*.tags.Hostname,
)
# One object per instance (same group order as above) bundling the Hostname tag
# with the instance ID, AMI and instance type, so alarms can read all of them
# from a single indexed element.
instance_data = [
for instance in concat(
aws_instance.web,
aws_instance.dashd_wallet,
aws_instance.seed_node,
aws_instance.miner,
aws_instance.masternode_amd,
aws_instance.masternode_arm,
aws_instance.hp_masternode_amd,
aws_instance.hp_masternode_arm,
aws_instance.vpn,
aws_instance.mixer,
aws_instance.logs,
) : {
Hostname = instance.tags.Hostname
InstanceId = instance.id
ImageId = instance.ami
InstanceType = instance.instance_type
}
]
}

# Per-instance CloudWatch alarm on the AWS/EC2 CPUUtilization metric.
# NOTE(review): rendered diff without +/- markers — where an attribute appears twice
# in a row, the pair looks like the before/after of one changed line; confirm
# against the commit before treating this as a literal resource body.
resource "aws_cloudwatch_metric_alarm" "cpu_monitoring" {

# One alarm per instance when var.monitoring_cpu_enabled is true, none otherwise.
count = var.monitoring_cpu_enabled ? length(local.instance_ids) : 0
count = var.monitoring_cpu_enabled ? length(local.instance_data) : 0

# Alarm name is "<workspace>-<hostname>-cpu-monitoring".
alarm_name = "${terraform.workspace}-${local.instance_hostnames[count.index]}-cpu-monitoring"
alarm_name = "${terraform.workspace}-${local.instance_data[count.index].Hostname}-cpu-monitoring"
# Compares the Average statistic against the threshold with >=, over 2 evaluation periods.
comparison_operator = "GreaterThanOrEqualToThreshold"
evaluation_periods = "2"
evaluation_periods = 2
metric_name = "CPUUtilization"
namespace = "AWS/EC2"
period = "360"
period = 300
statistic = "Average"
threshold = "60"
threshold = 60

# Each action list holds var.monitoring_sns_arn only when that string is longer
# than one character; otherwise the list is empty.
insufficient_data_actions = length(var.monitoring_sns_arn) > 1 ? [var.monitoring_sns_arn] : []
alarm_actions = length(var.monitoring_sns_arn) > 1 ? [var.monitoring_sns_arn] : []
ok_actions = length(var.monitoring_sns_arn) > 1 ? [var.monitoring_sns_arn] : []
alarm_actions = length(var.monitoring_sns_arn) > 1 ? [var.monitoring_sns_arn] : []

# The metric is selected by the instance ID at this alarm's index.
dimensions = {
InstanceId = local.instance_ids[count.index]
InstanceId = local.instance_data[count.index].InstanceId
}

alarm_description = "This alarm monitors ec2 cpu utilization"
}

# Per-instance CloudWatch alarm on the CWAgent mem_used_percent metric.
# NOTE(review): rendered diff without +/- markers — where an attribute appears twice
# in a row, the pair looks like the before/after of one changed line; confirm
# against the commit before treating this as a literal resource body.
resource "aws_cloudwatch_metric_alarm" "memory_monitoring" {

# One alarm per instance when var.monitoring_mem_enabled is true, none otherwise.
count = var.monitoring_mem_enabled ? length(local.instance_ids) : 0
count = var.monitoring_mem_enabled ? length(local.instance_data) : 0

# Alarm name is "<workspace>-<hostname>-memory-monitoring".
alarm_name = "${terraform.workspace}-${local.instance_hostnames[count.index]}-memory-monitoring"
alarm_name = "${terraform.workspace}-${local.instance_data[count.index].Hostname}-memory-monitoring"
# Compares the Average statistic against the threshold with >=, over 2 evaluation periods.
comparison_operator = "GreaterThanOrEqualToThreshold"
evaluation_periods = "2"
evaluation_periods = 2
metric_name = "mem_used_percent"
namespace = "CWAgent"
period = "360"
period = 300
statistic = "Average"
threshold = "85"
threshold = 85

# Each action list holds var.monitoring_sns_arn only when that string is longer
# than one character; otherwise the list is empty.
insufficient_data_actions = length(var.monitoring_sns_arn) > 1 ? [var.monitoring_sns_arn] : []
alarm_actions = length(var.monitoring_sns_arn) > 1 ? [var.monitoring_sns_arn] : []
ok_actions = length(var.monitoring_sns_arn) > 1 ? [var.monitoring_sns_arn] : []
alarm_actions = length(var.monitoring_sns_arn) > 1 ? [var.monitoring_sns_arn] : []

# NOTE(review): the cwagent role config in this same commit appends
# AutoScalingGroupName, ImageId, InstanceId and InstanceType dimensions to agent
# metrics, and the disk alarm lists ImageId/InstanceType explicitly, while this
# alarm matches on InstanceId only — confirm the alarm still finds its metric.
dimensions = {
InstanceId = local.instance_ids[count.index]
InstanceId = local.instance_data[count.index].InstanceId
}

alarm_description = "This alarm monitors ec2 memory utilization"
}

# Per-instance CloudWatch alarm on the CWAgent swap_used_percent metric.
# NOTE(review): rendered diff without +/- markers — where an attribute appears twice
# in a row, the pair looks like the before/after of one changed line; confirm
# against the commit before treating this as a literal resource body.
resource "aws_cloudwatch_metric_alarm" "swap_monitoring" {

# One alarm per instance when var.monitoring_swap_enabled is true, none otherwise.
count = var.monitoring_swap_enabled ? length(local.instance_ids) : 0
count = var.monitoring_swap_enabled ? length(local.instance_data) : 0

# Alarm name is "<workspace>-<hostname>-swap-monitoring".
alarm_name = "${terraform.workspace}-${local.instance_hostnames[count.index]}-swap-monitoring"
alarm_name = "${terraform.workspace}-${local.instance_data[count.index].Hostname}-swap-monitoring"
# Compares the Average statistic against the threshold with >=, over 2 evaluation periods.
comparison_operator = "GreaterThanOrEqualToThreshold"
evaluation_periods = "2"
evaluation_periods = 2
metric_name = "swap_used_percent"
namespace = "CWAgent"
period = "360"
period = 300
statistic = "Average"
threshold = "60"
threshold = 60

# Each action list holds var.monitoring_sns_arn only when that string is longer
# than one character; otherwise the list is empty.
insufficient_data_actions = length(var.monitoring_sns_arn) > 1 ? [var.monitoring_sns_arn] : []
alarm_actions = length(var.monitoring_sns_arn) > 1 ? [var.monitoring_sns_arn] : []
ok_actions = length(var.monitoring_sns_arn) > 1 ? [var.monitoring_sns_arn] : []
alarm_actions = length(var.monitoring_sns_arn) > 1 ? [var.monitoring_sns_arn] : []

# NOTE(review): the cwagent role config in this same commit appends
# AutoScalingGroupName, ImageId, InstanceId and InstanceType dimensions to agent
# metrics, and the disk alarm lists ImageId/InstanceType explicitly, while this
# alarm matches on InstanceId only — confirm the alarm still finds its metric.
dimensions = {
InstanceId = local.instance_ids[count.index]
InstanceId = local.instance_data[count.index].InstanceId
}

alarm_description = "This alarm monitors ec2 swap utilization"
}

resource "aws_cloudwatch_metric_alarm" "diskspace_monitoring" {

count = var.monitoring_disk_enabled ? length(local.instance_ids) : 0
count = var.monitoring_disk_enabled ? length(local.instance_data) : 0

alarm_name = "${terraform.workspace}-${local.instance_hostnames[count.index]}-diskspace-monitoring"
alarm_name = "${terraform.workspace}-${local.instance_data[count.index].Hostname}-diskspace-monitoring"
comparison_operator = "GreaterThanOrEqualToThreshold"
evaluation_periods = "2"
evaluation_periods = 2
metric_name = "disk_used_percent"
namespace = "CWAgent"
period = "360"
period = 300
statistic = "Average"
threshold = "80"
threshold = 80

insufficient_data_actions = length(var.monitoring_sns_arn) > 1 ? [var.monitoring_sns_arn] : []
alarm_actions = length(var.monitoring_sns_arn) > 1 ? [var.monitoring_sns_arn] : []
ok_actions = length(var.monitoring_sns_arn) > 1 ? [var.monitoring_sns_arn] : []
alarm_actions = length(var.monitoring_sns_arn) > 1 ? [var.monitoring_sns_arn] : []

dimensions = {
InstanceId = local.instance_ids[count.index]
MountPath = "/"
Filesystem = "/dev/nvme0n1p1"
ImageId = local.instance_data[count.index].ImageId
InstanceId = local.instance_data[count.index].InstanceId
InstanceType = local.instance_data[count.index].InstanceType
device = strcontains(local.instance_data[count.index].Hostname, "logs") ? "nvme1n1p1" : "nvme0n1p1"
fstype = "ext4"
path = strcontains(local.instance_data[count.index].Hostname, "logs") ? "/dash/elastic/data" : "/"
}

alarm_description = "This alarm monitors ec2 disk utilization"
Expand Down

0 comments on commit f20461d

Please sign in to comment.