From e656468b0a06fdf1418d111c76aead6b7aca2568 Mon Sep 17 00:00:00 2001
From: ian-flores <iflores.siaca@posit.co>
Date: Thu, 26 Feb 2026 16:36:09 -0800
Subject: [PATCH] refine CloudWatch alert thresholds and severity labels

- Add the `severity: warning` label to all alerts missing it (RDS CPU
  utilization, free storage, freeable memory, database connections;
  ALB/NLB unhealthy targets, ALB 5XX errors, ALB response latency)
- Delete NAT Gateway alerts (port allocation errors, packets dropped)
  via deleteRules so Grafana removes the provisioned rules
- Delete RDS Read Latency High alert (also via deleteRules)
- Reduce ALB/NLB Unhealthy Targets window from 10m to 5m
- Lower RDS Database Connections threshold from 500 to 80; unpause the
  alert and drop its instance_size_dependent label
- Note why RDS Free Storage and Freeable Memory remain as absolute byte
  thresholds (CloudWatch lacks AllocatedStorage/total RAM as metrics)
---
 .../src/ptd/grafana_alerts/loadbalancer.yaml  |  12 +-
 .../src/ptd/grafana_alerts/natgateway.yaml    | 135 +-----------------
 python-pulumi/src/ptd/grafana_alerts/rds.yaml |  79 +++-------
 3 files changed, 32 insertions(+), 194 deletions(-)

diff --git a/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml b/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml
index 19e77ec..8d6c216 100644
--- a/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml
+++ b/python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml
@@ -79,6 +79,7 @@ groups:
             summary: High 5XX errors on ALB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}}
           labels:
             opsgenie: "1"
+            severity: warning
           isPaused: false
         - uid: alb_unhealthy_targets
           title: ALB Unhealthy Targets
@@ -128,12 +129,13 @@ groups:
                 type: threshold
           noDataState: NoData
           execErrState: Error
-          for: 10m
+          for: 5m
           annotations:
-            description: Application Load Balancer has unhealthy targets for over 10 minutes, indicating backend service health issues. The extended window reduces false positives during rolling deployments.
+            description: Application Load Balancer has unhealthy targets for over 5 minutes, indicating backend service health issues.
             summary: Unhealthy targets on ALB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}}
           labels:
             opsgenie: "1"
+            severity: warning
           isPaused: false
         - uid: nlb_unhealthy_targets
           title: NLB Unhealthy Targets
@@ -183,12 +185,13 @@ groups:
                 type: threshold
           noDataState: NoData
           execErrState: Error
-          for: 10m
+          for: 5m
           annotations:
-            description: Network Load Balancer has unhealthy targets for over 10 minutes, indicating backend service health issues. The extended window reduces false positives during rolling deployments and node drains.
+            description: Network Load Balancer has unhealthy targets for over 5 minutes, indicating backend service health issues.
             summary: Unhealthy targets on NLB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}}
           labels:
             opsgenie: "1"
+            severity: warning
           isPaused: false
         - uid: alb_response_latency_high
           title: ALB Response Latency High
@@ -244,4 +247,5 @@ groups:
             summary: High response latency on ALB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}}
           labels:
             opsgenie: "1"
+            severity: warning
           isPaused: false
diff --git a/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml b/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml
index 0baf178..f06cfa5 100644
--- a/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml
+++ b/python-pulumi/src/ptd/grafana_alerts/natgateway.yaml
@@ -1,133 +1,10 @@
-# To delete these alerts, simply removing the configMap that uses this method will not work.
-# Replace file contents with the following and apply in order to delete the alerts
-# (repeat the deleteRules entry for each uid listed below):
-# apiVersion: 1
-# deleteRules:
-#   - orgId: 1
-#     uid: nat_gateway_port_allocation_errors
-#   - orgId: 1
-#     uid: nat_gateway_packets_dropped
+# The NAT Gateway alerts have been deleted. The deleteRules entries below remove the
+# previously provisioned rules from Grafana on the next provisioning run.
 #
 # See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/
-#
-# Note: alert annotations reference {{$labels.cluster}}. For CloudWatch-sourced metrics,
-# this label is injected by the prometheus.relabel.default block in grafana_alloy.py.
-# If Alloy is not running or relabeling is misconfigured, the label will be absent and
-# the annotation will render as "in cluster " (blank).
 apiVersion: 1
-groups:
+deleteRules:
+    - orgId: 1
+      uid: nat_gateway_port_allocation_errors
     - orgId: 1
-      name: NATGateway
-      folder: Posit Alerts
-      interval: 5m
-      rules:
-        - uid: nat_gateway_port_allocation_errors
-          title: NAT Gateway Port Allocation Errors
-          condition: B
-          data:
-            - refId: A
-              relativeTimeRange:
-                from: 600
-                to: 0
-              datasourceUid: mimir
-              model:
-                editorMode: code
-                expr: aws_natgateway_error_port_allocation_sum{job="integrations/cloudwatch"}
-                instant: true
-                intervalMs: 1000
-                legendFormat: __auto
-                maxDataPoints: 43200
-                range: false
-                refId: A
-            - refId: B
-              relativeTimeRange:
-                from: 600
-                to: 0
-              datasourceUid: __expr__
-              model:
-                conditions:
-                    - evaluator:
-                        params:
-                            - 0
-                        type: gt
-                      operator:
-                        type: and
-                      query:
-                        params:
-                            - A
-                      reducer:
-                        params: []
-                        type: last
-                      type: query
-                datasource:
-                    type: __expr__
-                    uid: __expr__
-                expression: A
-                intervalMs: 1000
-                maxDataPoints: 43200
-                refId: B
-                type: threshold
-          noDataState: NoData  # Using NoData to avoid spurious pages during scrape outages (Alloy restart, credential rotation, brief partitions). Add a separate "CloudWatch scrape down" alert to cover the outage case independently.
-          execErrState: Error
-          for: 5m
-          annotations:
-            description: NAT Gateway is experiencing port allocation errors, which means outbound network connectivity is failing. This is a critical issue that requires immediate attention.
-            summary: Port allocation errors on NAT Gateway {{$labels.dimension_NatGatewayId}} in cluster {{$labels.cluster}}
-          labels:
-            opsgenie: "1"
-          isPaused: false
-        - uid: nat_gateway_packets_dropped
-          title: NAT Gateway Packets Dropped
-          condition: B
-          data:
-            - refId: A
-              relativeTimeRange:
-                from: 600
-                to: 0
-              datasourceUid: mimir
-              model:
-                editorMode: code
-                expr: aws_natgateway_packets_drop_count_sum{job="integrations/cloudwatch"}
-                instant: true
-                intervalMs: 1000
-                legendFormat: __auto
-                maxDataPoints: 43200
-                range: false
-                refId: A
-            - refId: B
-              relativeTimeRange:
-                from: 600
-                to: 0
-              datasourceUid: __expr__
-              model:
-                conditions:
-                    - evaluator:
-                        params:
-                            - 100  # Calibrated as a conservative baseline; high-throughput gateways may see this normally. Adjust per environment.
-                        type: gt
-                      operator:
-                        type: and
-                      query:
-                        params:
-                            - A
-                      reducer:
-                        params: []
-                        type: last
-                      type: query
-                datasource:
-                    type: __expr__
-                    uid: __expr__
-                expression: A
-                intervalMs: 1000
-                maxDataPoints: 43200
-                refId: B
-                type: threshold
-          noDataState: NoData
-          execErrState: Error
-          for: 5m
-          annotations:
-            description: NAT Gateway has dropped more than 100 packets for over 5 minutes, indicating potential network issues.
-            summary: High packet drop rate on NAT Gateway {{$labels.dimension_NatGatewayId}} in cluster {{$labels.cluster}}
-          labels:
-            opsgenie: "1"
-          isPaused: false
+      uid: nat_gateway_packets_dropped
diff --git a/python-pulumi/src/ptd/grafana_alerts/rds.yaml b/python-pulumi/src/ptd/grafana_alerts/rds.yaml
index 6b94f53..c09e58a 100644
--- a/python-pulumi/src/ptd/grafana_alerts/rds.yaml
+++ b/python-pulumi/src/ptd/grafana_alerts/rds.yaml
@@ -10,8 +10,6 @@
 #   - orgId: 1
 #     uid: rds_freeable_memory_low
 #   - orgId: 1
-#     uid: rds_read_latency_high
-#   - orgId: 1
 #     uid: rds_database_connections_high
 #
 # See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/
@@ -21,6 +19,9 @@
 # If Alloy is not running or relabeling is misconfigured, the label will be absent and
 # the annotation will render as "in cluster " (blank).
 apiVersion: 1
+deleteRules:
+    - orgId: 1
+      uid: rds_read_latency_high
 groups:
     - orgId: 1
       name: RDS
@@ -81,6 +82,7 @@ groups:
             summary: High CPU utilization on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}}
           labels:
             opsgenie: "1"
+            severity: warning
           isPaused: false
         - uid: rds_free_storage_low
           title: RDS Free Storage Low
@@ -136,6 +138,11 @@ groups:
             summary: Low free storage on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}}
           labels:
             opsgenie: "1"
+            severity: warning
+            # Note: threshold is absolute (5 GiB) rather than percentage-based. CloudWatch does not
+            # expose AllocatedStorage as a time-series metric for RDS (it is an instance attribute),
+            # so computing a usage percentage is not feasible without a separate exporter or recording
+            # rule. The 5 GiB threshold is calibrated to PTD's default 100 GiB allocation.
           isPaused: false
         - uid: rds_freeable_memory_low
           title: RDS Freeable Memory Low
@@ -191,63 +198,13 @@ groups:
             summary: Low freeable memory on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}}
           labels:
             opsgenie: "1"
+            severity: warning
             instance_size_dependent: "true"  # Silence this label for known-small instance classes
+            # Note: threshold is absolute (512 MiB) rather than percentage-based. CloudWatch does not
+            # expose total instance RAM as a metric for RDS — it varies by instance type. On PTD's default
+            # instance (db.t3.small, 2 GiB), a 90%-used threshold would fire constantly under normal
+            # Postgres buffer cache load, making percentage-based alerting impractical here.
           isPaused: true  # Paused until a Grafana silence or Alertmanager inhibit rule is configured for the instance_size_dependent label
-        - uid: rds_read_latency_high
-          title: RDS Read Latency High
-          condition: B
-          data:
-            - refId: A
-              relativeTimeRange:
-                from: 600
-                to: 0
-              datasourceUid: mimir
-              model:
-                editorMode: code
-                expr: aws_rds_read_latency_average{job="integrations/cloudwatch"}
-                instant: true
-                intervalMs: 1000
-                legendFormat: __auto
-                maxDataPoints: 43200
-                range: false
-                refId: A
-            - refId: B
-              relativeTimeRange:
-                from: 600
-                to: 0
-              datasourceUid: __expr__
-              model:
-                conditions:
-                    - evaluator:
-                        params:
-                            - 0.1  # 100ms in seconds
-                        type: gt
-                      operator:
-                        type: and
-                      query:
-                        params:
-                            - A
-                      reducer:
-                        params: []
-                        type: last
-                      type: query
-                datasource:
-                    type: __expr__
-                    uid: __expr__
-                expression: A
-                intervalMs: 1000
-                maxDataPoints: 43200
-                refId: B
-                type: threshold
-          noDataState: NoData  # Performance metric; silent suppression on scrape outage is acceptable
-          execErrState: Error
-          for: 10m
-          annotations:
-            description: RDS instance read latency is above 100ms for more than 10 minutes.
-            summary: High read latency on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}}
-          labels:
-            opsgenie: "1"
-          isPaused: false
         - uid: rds_database_connections_high
           title: RDS Database Connections High
           condition: B
@@ -275,7 +232,7 @@ groups:
                 conditions:
                     - evaluator:
                         params:
-                            - 500  # Calibrated for db.r5.large (max ~4000). For small instances (db.t3.small max ~36) this alert will never fire; adjust per instance class.
+                            - 80
                         type: gt
                       operator:
                         type: and
@@ -298,9 +255,9 @@ groups:
           execErrState: Error
           for: 5m
           annotations:
-            description: RDS instance has more than 500 active database connections for more than 5 minutes. Note: this threshold is calibrated for db.r5.large (max ~4000 connections); it will never fire for small instances (e.g. db.t3.small max ~36). Adjust per instance class.
+            description: RDS instance has more than 80 active database connections for more than 5 minutes.
             summary: High database connections on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}}
           labels:
             opsgenie: "1"
-            instance_size_dependent: "true"  # Silence this label for known-small instance classes
-          isPaused: true  # Paused until a Grafana silence or Alertmanager inhibit rule is configured for the instance_size_dependent label
+            severity: warning
+          isPaused: false