From e656468b0a06fdf1418d111c76aead6b7aca2568 Mon Sep 17 00:00:00 2001 From: ian-flores Date: Thu, 26 Feb 2026 16:36:09 -0800 Subject: [PATCH] refine CloudWatch alert thresholds and severity labels - Add severity: warning to all alerts missing it (CPU utilization, free storage, freeable memory, database connections, ALB/NLB targets, ALB 5XX errors, ALB response latency) - Delete NAT Gateway alerts (port allocation errors, packets dropped) - Delete RDS Read Latency High alert - Reduce ALB/NLB Unhealthy Targets window from 10m to 5m - Lower RDS Database Connections threshold from 500 to 80; unpause alert - Note why RDS Free Storage and Freeable Memory remain as absolute byte thresholds (CloudWatch lacks AllocatedStorage/total RAM as metrics) --- .../src/ptd/grafana_alerts/loadbalancer.yaml | 12 +- .../src/ptd/grafana_alerts/natgateway.yaml | 135 +----------------- python-pulumi/src/ptd/grafana_alerts/rds.yaml | 79 +++------- 3 files changed, 32 insertions(+), 194 deletions(-) diff --git a/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml b/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml index 19e77ec..8d6c216 100644 --- a/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml @@ -79,6 +79,7 @@ groups: summary: High 5XX errors on ALB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}} labels: opsgenie: "1" + severity: warning isPaused: false - uid: alb_unhealthy_targets title: ALB Unhealthy Targets @@ -128,12 +129,13 @@ groups: type: threshold noDataState: NoData execErrState: Error - for: 10m + for: 5m annotations: - description: Application Load Balancer has unhealthy targets for over 10 minutes, indicating backend service health issues. The extended window reduces false positives during rolling deployments. + description: Application Load Balancer has unhealthy targets for over 5 minutes, indicating backend service health issues. 
summary: Unhealthy targets on ALB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}} labels: opsgenie: "1" + severity: warning isPaused: false - uid: nlb_unhealthy_targets title: NLB Unhealthy Targets @@ -183,12 +185,13 @@ groups: type: threshold noDataState: NoData execErrState: Error - for: 10m + for: 5m annotations: - description: Network Load Balancer has unhealthy targets for over 10 minutes, indicating backend service health issues. The extended window reduces false positives during rolling deployments and node drains. + description: Network Load Balancer has unhealthy targets for over 5 minutes, indicating backend service health issues. summary: Unhealthy targets on NLB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}} labels: opsgenie: "1" + severity: warning isPaused: false - uid: alb_response_latency_high title: ALB Response Latency High @@ -244,4 +247,5 @@ groups: summary: High response latency on ALB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}} labels: opsgenie: "1" + severity: warning isPaused: false diff --git a/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml b/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml index 0baf178..f06cfa5 100644 --- a/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml @@ -1,133 +1,10 @@ -# To delete these alerts, simply removing the configMap that uses this method will not work. -# Replace file contents with the following and apply in order to delete the alerts -# (repeat the deleteRules entry for each uid listed below): -# apiVersion: 1 -# deleteRules: -# - orgId: 1 -# uid: nat_gateway_port_allocation_errors -# - orgId: 1 -# uid: nat_gateway_packets_dropped +# These alerts have been deleted. The deleteRules entries below will remove them from Grafana +# on the next provisioning run. 
# # See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/ -# -# Note: alert annotations reference {{$labels.cluster}}. For CloudWatch-sourced metrics, -# this label is injected by the prometheus.relabel.default block in grafana_alloy.py. -# If Alloy is not running or relabeling is misconfigured, the label will be absent and -# the annotation will render as "in cluster " (blank). apiVersion: 1 -groups: +deleteRules: + - orgId: 1 + uid: nat_gateway_port_allocation_errors - orgId: 1 - name: NATGateway - folder: Posit Alerts - interval: 5m - rules: - - uid: nat_gateway_port_allocation_errors - title: NAT Gateway Port Allocation Errors - condition: B - data: - - refId: A - relativeTimeRange: - from: 600 - to: 0 - datasourceUid: mimir - model: - editorMode: code - expr: aws_natgateway_error_port_allocation_sum{job="integrations/cloudwatch"} - instant: true - intervalMs: 1000 - legendFormat: __auto - maxDataPoints: 43200 - range: false - refId: A - - refId: B - relativeTimeRange: - from: 600 - to: 0 - datasourceUid: __expr__ - model: - conditions: - - evaluator: - params: - - 0 - type: gt - operator: - type: and - query: - params: - - A - reducer: - params: [] - type: last - type: query - datasource: - type: __expr__ - uid: __expr__ - expression: A - intervalMs: 1000 - maxDataPoints: 43200 - refId: B - type: threshold - noDataState: NoData # Using NoData to avoid spurious pages during scrape outages (Alloy restart, credential rotation, brief partitions). Add a separate "CloudWatch scrape down" alert to cover the outage case independently. - execErrState: Error - for: 5m - annotations: - description: NAT Gateway is experiencing port allocation errors, which means outbound network connectivity is failing. This is a critical issue that requires immediate attention. 
- summary: Port allocation errors on NAT Gateway {{$labels.dimension_NatGatewayId}} in cluster {{$labels.cluster}} - labels: - opsgenie: "1" - isPaused: false - - uid: nat_gateway_packets_dropped - title: NAT Gateway Packets Dropped - condition: B - data: - - refId: A - relativeTimeRange: - from: 600 - to: 0 - datasourceUid: mimir - model: - editorMode: code - expr: aws_natgateway_packets_drop_count_sum{job="integrations/cloudwatch"} - instant: true - intervalMs: 1000 - legendFormat: __auto - maxDataPoints: 43200 - range: false - refId: A - - refId: B - relativeTimeRange: - from: 600 - to: 0 - datasourceUid: __expr__ - model: - conditions: - - evaluator: - params: - - 100 # Calibrated as a conservative baseline; high-throughput gateways may see this normally. Adjust per environment. - type: gt - operator: - type: and - query: - params: - - A - reducer: - params: [] - type: last - type: query - datasource: - type: __expr__ - uid: __expr__ - expression: A - intervalMs: 1000 - maxDataPoints: 43200 - refId: B - type: threshold - noDataState: NoData - execErrState: Error - for: 5m - annotations: - description: NAT Gateway has dropped more than 100 packets for over 5 minutes, indicating potential network issues. 
- summary: High packet drop rate on NAT Gateway {{$labels.dimension_NatGatewayId}} in cluster {{$labels.cluster}} - labels: - opsgenie: "1" - isPaused: false + uid: nat_gateway_packets_dropped diff --git a/python-pulumi/src/ptd/grafana_alerts/rds.yaml b/python-pulumi/src/ptd/grafana_alerts/rds.yaml index 6b94f53..c09e58a 100644 --- a/python-pulumi/src/ptd/grafana_alerts/rds.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/rds.yaml @@ -10,8 +10,6 @@ # - orgId: 1 # uid: rds_freeable_memory_low # - orgId: 1 -# uid: rds_read_latency_high -# - orgId: 1 # uid: rds_database_connections_high # # See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/ @@ -21,6 +19,9 @@ # If Alloy is not running or relabeling is misconfigured, the label will be absent and # the annotation will render as "in cluster " (blank). apiVersion: 1 +deleteRules: + - orgId: 1 + uid: rds_read_latency_high groups: - orgId: 1 name: RDS @@ -81,6 +82,7 @@ groups: summary: High CPU utilization on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}} labels: opsgenie: "1" + severity: warning isPaused: false - uid: rds_free_storage_low title: RDS Free Storage Low @@ -136,6 +138,11 @@ groups: summary: Low free storage on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}} labels: opsgenie: "1" + severity: warning + # Note: threshold is absolute (5 GiB) rather than percentage-based. CloudWatch does not + # expose AllocatedStorage as a time-series metric for RDS (it is an instance attribute), + # so computing a usage percentage is not feasible without a separate exporter or recording + # rule. The 5 GiB threshold is calibrated to PTD's default 100 GiB allocation. 
isPaused: false - uid: rds_freeable_memory_low title: RDS Freeable Memory Low @@ -191,63 +198,13 @@ groups: summary: Low freeable memory on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}} labels: opsgenie: "1" + severity: warning instance_size_dependent: "true" # Silence this label for known-small instance classes + # Note: threshold is absolute (512 MiB) rather than percentage-based. CloudWatch does not + # expose total instance RAM as a metric for RDS — it varies by instance type. PTD's default + # instance (db.t3.small, 2 GiB) would fire constantly at a 90%-used threshold under normal + # Postgres buffer cache load, making percentage-based alerting impractical here. isPaused: true # Paused until a Grafana silence or Alertmanager inhibit rule is configured for the instance_size_dependent label - - uid: rds_read_latency_high - title: RDS Read Latency High - condition: B - data: - - refId: A - relativeTimeRange: - from: 600 - to: 0 - datasourceUid: mimir - model: - editorMode: code - expr: aws_rds_read_latency_average{job="integrations/cloudwatch"} - instant: true - intervalMs: 1000 - legendFormat: __auto - maxDataPoints: 43200 - range: false - refId: A - - refId: B - relativeTimeRange: - from: 600 - to: 0 - datasourceUid: __expr__ - model: - conditions: - - evaluator: - params: - - 0.1 # 100ms in seconds - type: gt - operator: - type: and - query: - params: - - A - reducer: - params: [] - type: last - type: query - datasource: - type: __expr__ - uid: __expr__ - expression: A - intervalMs: 1000 - maxDataPoints: 43200 - refId: B - type: threshold - noDataState: NoData # Performance metric; silent suppression on scrape outage is acceptable - execErrState: Error - for: 10m - annotations: - description: RDS instance read latency is above 100ms for more than 10 minutes. 
- summary: High read latency on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}} - labels: - opsgenie: "1" - isPaused: false - uid: rds_database_connections_high title: RDS Database Connections High condition: B @@ -232,7 +232,7 @@ groups: conditions: - evaluator: params: - - 500 # Calibrated for db.r5.large (max ~4000). For small instances (db.t3.small max ~36) this alert will never fire; adjust per instance class. + - 80 # NOTE(review): absolute count, not a share of max_connections. Confirm 80 is reachable on PTD's default db.t3.small (previously estimated at ~36 max connections); otherwise this now-unpaused alert can never fire there. type: gt operator: type: and @@ -255,9 +255,9 @@ groups: execErrState: Error for: 5m annotations: - description: RDS instance has more than 500 active database connections for more than 5 minutes. Note: this threshold is calibrated for db.r5.large (max ~4000 connections); it will never fire for small instances (e.g. db.t3.small max ~36). Adjust per instance class. + description: RDS instance has more than 80 active database connections for more than 5 minutes. summary: High database connections on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}} labels: opsgenie: "1" - instance_size_dependent: "true" # Silence this label for known-small instance classes - isPaused: true # Paused until a Grafana silence or Alertmanager inhibit rule is configured for the instance_size_dependent label + severity: warning + isPaused: false