https://docs.datadoghq.com/ja/getting_started/monitors/
https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor
Using ECS/RDS metrics, send a Slack notification when CPU and memory utilization cross their thresholds.
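All of the files below reference `var.environment` and assume the Datadog Terraform provider is already configured. A minimal sketch of that prerequisite setup is shown here for context; the file name provider.tf and the variable names datadog_api_key / datadog_app_key are assumptions for illustration, not part of the original configuration.
- provider.tf (sketch)

terraform {
  required_providers {
    datadog = {
      source = "DataDog/datadog"
    }
  }
}

# API and application keys for the Datadog provider (variable names are assumptions;
# the keys can also be supplied via the DD_API_KEY / DD_APP_KEY environment variables)
provider "datadog" {
  api_key = var.datadog_api_key
  app_key = var.datadog_app_key
}

variable "datadog_api_key" {
  type      = string
  sensitive = true
}

variable "datadog_app_key" {
  type      = string
  sensitive = true
}

# Environment name interpolated into queries and tags as ${var.environment}
variable "environment" {
  type = string
}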
- ecs_alert.tf
resource "datadog_monitor" "ecs_cpu_alert" {
  name               = "ecs_cpu_alert"
  type               = "metric alert"
  query              = "avg(last_5m):avg:aws.ecs.service.cpuutilization{clustername:hoge-${var.environment}} by {clustername} > 80"
  escalation_message = "ECS/Fargate CPU usage has exceeded 80%"
  notify_no_data     = false
  notify_audit       = false
  timeout_h          = 1
  include_tags       = true

  monitor_thresholds {
    warning  = 40
    critical = 80
  }

  message = <<-EOT
    @slack-alert-hoge
    {{#is_alert}}
    @slack-alert-hoge
    {{/is_alert}}
    {{#is_recovery}}
    @slack-alert-hoge
    {{/is_recovery}}
  EOT

  tags = [
    "product:hoge",
    "service:hoge",
    "env:${var.environment}"
  ]
}

resource "datadog_monitor" "ecs_memory_alert" {
  name               = "ecs_memory_alert"
  type               = "metric alert"
  query              = "avg(last_5m):avg:aws.ecs.service.memory_utilization{clustername:hoge-${var.environment}} by {servicename} > 80"
  escalation_message = "ECS/Fargate Memory usage has exceeded 80%"
  notify_no_data     = false
  notify_audit       = false
  timeout_h          = 1
  include_tags       = true

  monitor_thresholds {
    warning  = 70
    critical = 80
  }

  message = <<-EOT
    @slack-alert-hoge
    {{#is_alert}}
    @slack-alert-hoge
    {{/is_alert}}
    {{#is_recovery}}
    @slack-alert-hoge
    {{/is_recovery}}
  EOT

  tags = [
    "product:hoge",
    "service:hoge",
    "env:${var.environment}"
  ]
}
- rds_alert.tf
resource "datadog_monitor" "rds_cpu_alert" {
  name               = "rds_cpu_alert"
  type               = "metric alert"
  query              = "avg(last_5m):avg:aws.rds.cpuutilization{dbname:hoge_${var.environment}} by {dbname} > 80"
  escalation_message = "RDS CPU usage for hoge_${var.environment} instance has exceeded 80%"
  notify_no_data     = false
  notify_audit       = false
  timeout_h          = 1
  include_tags       = true

  monitor_thresholds {
    warning  = 70
    critical = 80
  }

  message = <<-EOT
    @slack-dev-alert-hoge
    {{#is_alert}}
    @slack-alert-hoge
    {{/is_alert}}
    {{#is_recovery}}
    @slack-alert-hoge
    {{/is_recovery}}
  EOT

  tags = [
    "product:hoge",
    "service:hoge",
    "env:${var.environment}"
  ]
}
- status_code_alert.tf
resource "datadog_monitor" "alb_5xx_alert" {
  name               = "alb_5xx_alert"
  type               = "metric alert"
  query              = "sum(last_5m):avg:aws.applicationelb.httpcode_target_5xx{name:hoge} by {loadbalancer}.as_count() > 5"
  escalation_message = "ALB hoge has more than 5 5xx errors"
  notify_no_data     = false
  notify_audit       = false
  timeout_h          = 1
  include_tags       = true

  monitor_thresholds {
    warning  = 3
    critical = 5
  }

  message = <<-EOT
    @slack-alert-hoge
    {{#is_alert}}
    @slack-alert-hoge
    {{/is_alert}}
    {{#is_recovery}}
    @slack-alert-hoge
    {{/is_recovery}}
  EOT

  tags = [
    "product:hoge",
    "service:hoge",
    "env:${var.environment}"
  ]
}
- event_log_alert.tf
resource "datadog_monitor" "event_log_alert" {
  name           = "event_log_alert"
  type           = "event-v2 alert"
  query          = "events(\"status:(error OR warn OR failed) AND hoge\").rollup(\"count\").last(\"5m\") > 0"
  notify_no_data = false
  notify_audit   = false
  timeout_h      = 1

  monitor_thresholds {
    critical = 1
  }

  message = <<-EOT
    @slack-alert-hoge
    {{#is_alert}}
    @slack-alert-hoge
    {{/is_alert}}
    {{#is_recovery}}
    @slack-alert-hoge
    {{/is_recovery}}
  EOT

  tags = [
    "product:hoge",
    "service:hoge",
    "env:${var.environment}"
  ]
}
- cloudfront_5xx_error_rate_alert.tf
resource "datadog_monitor" "cloudfront_5xx_error_rate_alert" {
  name               = "[hoge]cloudfront_5xx_error_rate_alert"
  type               = "metric alert"
  query              = "avg(last_5m):avg:aws.cloudfront.5xx_error_rate{distributionid:hoge OR distributionid:hoge OR distributionid:hoge} by {distributionid,aws_account} > 50"
  escalation_message = "CloudFront distributions have more than 50% 5xx error rate"
  notify_no_data     = false
  notify_audit       = false
  timeout_h          = 1
  include_tags       = true

  monitor_thresholds {
    warning  = 30
    critical = 50
  }

  message = <<-EOT
    @slack-alert-hoge
    {{#is_alert}}
    @slack-alert-hoge
    {{/is_alert}}
    {{#is_recovery}}
    @slack-alert-hoge
    {{/is_recovery}}
  EOT

  tags = [
    "product:hoge",
    "service:hoge",
    "env:${var.environment}"
  ]
}
For the event log monitor, specify `event-v2 alert` as the type. When targeting multiple resources in a single query, use OR (as in the CloudFront example above).
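The @slack-alert-hoge handles used in the messages only resolve if the Datadog Slack integration knows about that channel. The channel mapping can also be managed in Terraform with the datadog_integration_slack_channel resource; the account name and channel name below are assumptions for illustration.

resource "datadog_integration_slack_channel" "alert_hoge" {
  # Slack account (workspace) name as registered in the Datadog Slack integration (assumption)
  account_name = "hoge"
  channel_name = "#alert-hoge"

  # Control what is included in the posted alert message
  display {
    message  = true
    notified = true
    snapshot = true
    tags     = true
  }
}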
Born in 1989, SRE at Findy. Has worked as an infrastructure engineer on everything from hosting to large-scale ad tech. Currently responsible for improving service reliability, DevOps, availability, latency, performance, monitoring, observability, incident response, building infrastructure on AWS, providing Docker development environments, IaC, evaluating new technologies, refactoring, security hardening, and operating the analytics platform. As a sole proprietor, handles server maintenance for several companies and SRE/infrastructure consulting for startups, and coaches beginners in infrastructure through MENTA/TechBull, where he also launched and runs the community as a community manager. In the past, contributed to the OSS vulnerability scanner Vuls, and is a self-described evangelist, technical PR person, technical supporter, and contributor.