https://docs.datadoghq.com/ja/getting_started/monitors/
https://registry.terraform.io/providers/DataDog/datadog/latest/docs/resources/monitor
Using ECS/RDS metrics, send a Slack notification when CPU and memory utilization cross their thresholds.
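All of the files below reference `var.environment` and assume the Datadog Terraform provider is already configured. A minimal sketch of that prerequisite setup is shown here for context; the file name provider.tf and the variable names datadog_api_key / datadog_app_key are assumptions for illustration, not part of the original configuration.
- provider.tf (sketch)

terraform {
  required_providers {
    datadog = {
      source = "DataDog/datadog"
    }
  }
}

# API and application keys for the Datadog provider (variable names are assumptions;
# the keys can also be supplied via the DD_API_KEY / DD_APP_KEY environment variables)
provider "datadog" {
  api_key = var.datadog_api_key
  app_key = var.datadog_app_key
}

variable "datadog_api_key" {
  type      = string
  sensitive = true
}

variable "datadog_app_key" {
  type      = string
  sensitive = true
}

# Environment name interpolated into queries and tags as ${var.environment}
variable "environment" {
  type = string
}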
- ecs_alert.tf
resource "datadog_monitor" "ecs_cpu_alert" {
  name               = "ecs_cpu_alert"
  type               = "metric alert"
  query              = "avg(last_5m):avg:aws.ecs.service.cpuutilization{clustername:hoge-${var.environment}} by {clustername} > 80"
  escalation_message = "ECS/Fargate CPU usage has exceeded 80%"
  notify_no_data     = false
  notify_audit       = false
  timeout_h          = 1
  include_tags       = true

  monitor_thresholds {
    warning  = 40
    critical = 80
  }

  message = <<-EOT
    @slack-alert-hoge
    {{#is_alert}}
    @slack-alert-hoge
    {{/is_alert}}
    {{#is_recovery}}
    @slack-alert-hoge
    {{/is_recovery}}
  EOT

  tags = [
    "product:hoge",
    "service:hoge",
    "env:${var.environment}"
  ]
}

resource "datadog_monitor" "ecs_memory_alert" {
  name               = "ecs_memory_alert"
  type               = "metric alert"
  query              = "avg(last_5m):avg:aws.ecs.service.memory_utilization{clustername:hoge-${var.environment}} by {servicename} > 80"
  escalation_message = "ECS/Fargate Memory usage has exceeded 80%"
  notify_no_data     = false
  notify_audit       = false
  timeout_h          = 1
  include_tags       = true

  monitor_thresholds {
    warning  = 70
    critical = 80
  }

  message = <<-EOT
    @slack-alert-hoge
    {{#is_alert}}
    @slack-alert-hoge
    {{/is_alert}}
    {{#is_recovery}}
    @slack-alert-hoge
    {{/is_recovery}}
  EOT

  tags = [
    "product:hoge",
    "service:hoge",
    "env:${var.environment}"
  ]
}
- rds_alert.tf
resource "datadog_monitor" "rds_cpu_alert" {
  name               = "rds_cpu_alert"
  type               = "metric alert"
  query              = "avg(last_5m):avg:aws.rds.cpuutilization{dbname:hoge_${var.environment}} by {dbname} > 80"
  escalation_message = "RDS CPU usage for hoge_${var.environment} instance has exceeded 80%"
  notify_no_data     = false
  notify_audit       = false
  timeout_h          = 1
  include_tags       = true

  monitor_thresholds {
    warning  = 70
    critical = 80
  }

  message = <<-EOT
    @slack-dev-alert-hoge
    {{#is_alert}}
    @slack-alert-hoge
    {{/is_alert}}
    {{#is_recovery}}
    @slack-alert-hoge
    {{/is_recovery}}
  EOT

  tags = [
    "product:hoge",
    "service:hoge",
    "env:${var.environment}"
  ]
}
- status_code_alert.tf
resource "datadog_monitor" "alb_5xx_alert" {
  name               = "alb_5xx_alert"
  type               = "metric alert"
  query              = "sum(last_5m):avg:aws.applicationelb.httpcode_target_5xx{name:hoge} by {loadbalancer}.as_count() > 5"
  escalation_message = "ALB hoge has more than 5 5xx errors"
  notify_no_data     = false
  notify_audit       = false
  timeout_h          = 1
  include_tags       = true

  monitor_thresholds {
    warning  = 3
    critical = 5
  }

  message = <<-EOT
    @slack-alert-hoge
    {{#is_alert}}
    @slack-alert-hoge
    {{/is_alert}}
    {{#is_recovery}}
    @slack-alert-hoge
    {{/is_recovery}}
  EOT

  tags = [
    "product:hoge",
    "service:hoge",
    "env:${var.environment}"
  ]
}
- event_log_alert.tf
resource "datadog_monitor" "event_log_alert" {
  name           = "event_log_alert"
  type           = "event-v2 alert"
  query          = "events(\"status:(error OR warn OR failed) AND hoge\").rollup(\"count\").last(\"5m\") > 0"
  notify_no_data = false
  notify_audit   = false
  timeout_h      = 1

  monitor_thresholds {
    critical = 1
  }

  message = <<-EOT
    @slack-alert-hoge
    {{#is_alert}}
    @slack-alert-hoge
    {{/is_alert}}
    {{#is_recovery}}
    @slack-alert-hoge
    {{/is_recovery}}
  EOT

  tags = [
    "product:hoge",
    "service:hoge",
    "env:${var.environment}"
  ]
}
- cloudfront_5xx_error_rate_alert.tf
resource "datadog_monitor" "cloudfront_5xx_error_rate_alert" {
  name               = "[hoge]cloudfront_5xx_error_rate_alert"
  type               = "metric alert"
  query              = "avg(last_5m):avg:aws.cloudfront.5xx_error_rate{distributionid:hoge OR distributionid:hoge OR distributionid:hoge} by {distributionid,aws_account} > 50"
  escalation_message = "CloudFront distributions have more than 50% 5xx error rate"
  notify_no_data     = false
  notify_audit       = false
  timeout_h          = 1
  include_tags       = true

  monitor_thresholds {
    warning  = 30
    critical = 50
  }

  message = <<-EOT
    @slack-alert-hoge
    {{#is_alert}}
    @slack-alert-hoge
    {{/is_alert}}
    {{#is_recovery}}
    @slack-alert-hoge
    {{/is_recovery}}
  EOT

  tags = [
    "product:hoge",
    "service:hoge",
    "env:${var.environment}"
  ]
}
For the event log monitor, specify `event-v2 alert` as the type. When targeting multiple resources in a single query, use OR (as in the CloudFront example above).
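The @slack-alert-hoge handles used in the messages only resolve if the Datadog Slack integration knows about that channel. The channel mapping can also be managed in Terraform with the datadog_integration_slack_channel resource; the account name and channel name below are assumptions for illustration.

resource "datadog_integration_slack_channel" "alert_hoge" {
  # Slack account (workspace) name as registered in the Datadog Slack integration (assumption)
  account_name = "hoge"
  channel_name = "#alert-hoge"

  # Control what is included in the posted alert message
  display {
    message  = true
    notified = true
    snapshot = true
    tags     = true
  }
}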
Born in 1989, SRE at Findy. Has worked as an infrastructure engineer on everything from hosting to large-scale ad tech. Currently responsible for improving service reliability, DevOps, availability, latency, performance, monitoring, observability, incident response, building infrastructure on AWS, providing Docker development environments, IaC, evaluating new technologies, refactoring, security hardening, and operating the analytics platform. As a sole proprietor, handles server maintenance for several companies and SRE/infrastructure consulting for startups, and coaches beginners in infrastructure through MENTA/TechBull, where he also launched and runs the community as a community manager. In the past, contributed to the OSS vulnerability scanner Vuls, and is a self-described evangelist, technical PR person, technical supporter, and contributor.