posit-dev · ian-flores · Feb 26, 2026 · Feb 23, 2026 · Feb 23, 2026 · Feb 23, 2026
@@ -1123,13 +1123,18 @@ func SqsActions() []Action {
 
 // EventsActions returns EventBridge-related actions
 func EventsActions() []Action {
+	// SupportsManagedByCondition (the third value in each entry below) is false
+	// for all resource-level actions because EventBridge rules may not carry the
+	// posit.team/managed-by tag (e.g., rules created by Karpenter). Requiring the
+	// tag is a chicken-and-egg problem: untagged rules could never be tagged,
+	// described, or managed. Account-scoping via ResourceConditionLimitingActions is sufficient.
 	return []Action{
 		{"events:ListRules", false, false},
-		{"events:ListTagsForResource", true, true},
-		{"events:*Rule", true, true},
-		{"events:*Targets", true, true},
-		{"events:*tag*", true, true},
-		{"events:PutRule", true, true},
+		{"events:ListTagsForResource", true, false},
+		{"events:*Rule", true, false},
+		{"events:*Targets", true, false},
+		{"events:*tag*", true, false},
+		{"events:PutRule", true, false},
 	}
 }
 

@@ -46,7 +46,7 @@ groups:
                         type: and
                       query:
                         params:
-                            - C
+                            - A
                       reducer:
                         params: []
                         type: last
@@ -107,7 +107,7 @@ groups:
                         type: and
                       query:
                         params:
-                            - C
+                            - A
                       reducer:
                         params: []
                         type: last
@@ -164,7 +164,7 @@ groups:
                         type: and
                       query:
                         params:
-                            - C
+                            - A
                       reducer:
                         params: []
                         type: last

@@ -0,0 +1,247 @@
+# Simply removing the ConfigMap that provisions these alerts will not delete them.
+# To delete the alerts, replace this file's contents with the following and apply it
+# (the deleteRules list needs one entry per rule uid defined in this file):
+# apiVersion: 1
+# deleteRules:
+#   - orgId: 1
+#     uid: alb_target_5xx_errors_high
+#   - orgId: 1
+#     uid: alb_unhealthy_targets
+#   - orgId: 1
+#     uid: nlb_unhealthy_targets
+#   - orgId: 1
+#     uid: alb_response_latency_high
+#
+# See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/
+#
+# Note: the summary annotations reference {{$labels.cluster}}. For CloudWatch-sourced metrics,
+# this label is injected by the prometheus.relabel.default block in grafana_alloy.py.
+# If Alloy is not running or relabeling is misconfigured, the label will be absent and
+# the summary will end with "in cluster " followed by an empty cluster name.
+apiVersion: 1
+groups:
+    - orgId: 1
+      name: LoadBalancer
+      folder: Posit Alerts
+      interval: 5m
+      rules:
+        - uid: alb_target_5xx_errors_high
+          title: ALB Target 5XX Errors High
+          condition: B
+          data:
+            - refId: A
+              relativeTimeRange:
+                from: 600
+                to: 0
+              datasourceUid: mimir
+              model:
+                editorMode: code
+                expr: aws_applicationelb_httpcode_target_5_xx_count_sum{job="integrations/cloudwatch"}
+                instant: true
+                intervalMs: 1000
+                legendFormat: __auto
+                maxDataPoints: 43200
+                range: false
+                refId: A
+            - refId: B
+              relativeTimeRange:
+                from: 600
+                to: 0
+              datasourceUid: __expr__
+              model:
+                conditions:
+                    - evaluator:
+                        params:
+                            - 10  # Absolute count, not an error rate. High-traffic ALBs may never breach this; low-traffic ALBs may page on transient spikes. Consider a rate-based alert if traffic volume varies significantly across environments.
+                        type: gt
+                      operator:
+                        type: and
+                      query:
+                        params:
+                            - A
+                      reducer:
+                        params: []
+                        type: last
+                      type: query
+                datasource:
+                    type: __expr__
+                    uid: __expr__
+                expression: A
+                intervalMs: 1000
+                maxDataPoints: 43200
+                refId: B
+                type: threshold
+          noDataState: NoData
+          execErrState: Error
+          for: 5m
+          annotations:
+            description: Application Load Balancer has more than 10 target 5XX errors for over 5 minutes, indicating backend service failures.
+            summary: High 5XX errors on ALB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}}
+          labels:
+            opsgenie: "1"
+          isPaused: false
+        - uid: alb_unhealthy_targets
+          title: ALB Unhealthy Targets
+          condition: B
+          data:
+            - refId: A
+              relativeTimeRange:
+                from: 600
+                to: 0
+              datasourceUid: mimir
+              model:
+                editorMode: code
+                expr: aws_applicationelb_un_healthy_host_count_average{job="integrations/cloudwatch"}
+                instant: true
+                intervalMs: 1000
+                legendFormat: __auto
+                maxDataPoints: 43200
+                range: false
+                refId: A
+            - refId: B
+              relativeTimeRange:
+                from: 600
+                to: 0
+              datasourceUid: __expr__
+              model:
+                conditions:
+                    - evaluator:
+                        params:
+                            - 0
+                        type: gt
+                      operator:
+                        type: and
+                      query:
+                        params:
+                            - A
+                      reducer:
+                        params: []
+                        type: last
+                      type: query
+                datasource:
+                    type: __expr__
+                    uid: __expr__
+                expression: A
+                intervalMs: 1000
+                maxDataPoints: 43200
+                refId: B
+                type: threshold
+          noDataState: NoData
+          execErrState: Error
+          for: 10m
+          annotations:
+            description: Application Load Balancer has unhealthy targets for over 10 minutes, indicating backend service health issues. The extended window reduces false positives during rolling deployments.
+            summary: Unhealthy targets on ALB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}}
+          labels:
+            opsgenie: "1"
+          isPaused: false
+        - uid: nlb_unhealthy_targets
+          title: NLB Unhealthy Targets
+          condition: B
+          data:
+            - refId: A
+              relativeTimeRange:
+                from: 600
+                to: 0
+              datasourceUid: mimir
+              model:
+                editorMode: code
+                expr: aws_networkelb_un_healthy_host_count_average{job="integrations/cloudwatch"}
+                instant: true
+                intervalMs: 1000
+                legendFormat: __auto
+                maxDataPoints: 43200
+                range: false
+                refId: A
+            - refId: B
+              relativeTimeRange:
+                from: 600
+                to: 0
+              datasourceUid: __expr__
+              model:
+                conditions:
+                    - evaluator:
+                        params:
+                            - 0
+                        type: gt
+                      operator:
+                        type: and
+                      query:
+                        params:
+                            - A
+                      reducer:
+                        params: []
+                        type: last
+                      type: query
+                datasource:
+                    type: __expr__
+                    uid: __expr__
+                expression: A
+                intervalMs: 1000
+                maxDataPoints: 43200
+                refId: B
+                type: threshold
+          noDataState: NoData
+          execErrState: Error
+          for: 10m
+          annotations:
+            description: Network Load Balancer has unhealthy targets for over 10 minutes, indicating backend service health issues. The extended window reduces false positives during rolling deployments and node drains.
+            summary: Unhealthy targets on NLB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}}
+          labels:
+            opsgenie: "1"
+          isPaused: false
+        - uid: alb_response_latency_high
+          title: ALB Response Latency High
+          condition: B
+          data:
+            - refId: A
+              relativeTimeRange:
+                from: 600
+                to: 0
+              datasourceUid: mimir
+              model:
+                editorMode: code
+                expr: aws_applicationelb_target_response_time_average{job="integrations/cloudwatch"}
+                instant: true
+                intervalMs: 1000
+                legendFormat: __auto
+                maxDataPoints: 43200
+                range: false
+                refId: A
+            - refId: B
+              relativeTimeRange:
+                from: 600
+                to: 0
+              datasourceUid: __expr__
+              model:
+                conditions:
+                    - evaluator:
+                        params:
+                            - 2  # 2 seconds
+                        type: gt
+                      operator:
+                        type: and
+                      query:
+                        params:
+                            - A
+                      reducer:
+                        params: []
+                        type: last
+                      type: query
+                datasource:
+                    type: __expr__
+                    uid: __expr__
+                expression: A
+                intervalMs: 1000
+                maxDataPoints: 43200
+                refId: B
+                type: threshold
+          noDataState: NoData
+          execErrState: Error
+          for: 10m
+          annotations:
+            description: Application Load Balancer target response time is above 2 seconds for more than 10 minutes, indicating performance degradation.
+            summary: High response latency on ALB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}}
+          labels:
+            opsgenie: "1"
+          isPaused: false