Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
4e13829
Add CloudWatch alerting for RDS, NAT Gateway, and Load Balancers
ian-flores Feb 23, 2026
a172905
Tag Load Balancers with PTD workload identifiers
ian-flores Feb 23, 2026
5f84b5f
Address code review findings for alerting implementation
ian-flores Feb 23, 2026
0034b7d
Address review findings (job 75)
ian-flores Feb 23, 2026
7e4ff77
Address review findings (job 78)
ian-flores Feb 23, 2026
7cdc43e
Address review findings (job 80)
ian-flores Feb 23, 2026
e3734ab
Address review findings (job 85)
ian-flores Feb 23, 2026
82b3a8b
Address review findings (job 87)
ian-flores Feb 23, 2026
3376e6e
Address review findings (job 91)
ian-flores Feb 23, 2026
d0379bf
Address review findings (job 93)
ian-flores Feb 23, 2026
b98959b
Address review findings (job 96)
ian-flores Feb 23, 2026
c4370f7
Address review findings (job 99)
ian-flores Feb 23, 2026
2a74ad7
Address review findings (job 101)
ian-flores Feb 23, 2026
b1754b7
Address review findings (job 102)
ian-flores Feb 23, 2026
064612b
Address review findings (job 104)
ian-flores Feb 23, 2026
7302197
Address review findings (job 106)
ian-flores Feb 23, 2026
7deed4e
Address review findings (job 108)
ian-flores Feb 23, 2026
c493b87
Address review findings (job 111)
ian-flores Feb 23, 2026
12ae185
Address review findings (job 113)
ian-flores Feb 23, 2026
64688cf
Address review findings (job 116)
ian-flores Feb 23, 2026
90cd258
Address review findings (job 120)
ian-flores Feb 23, 2026
3fed805
Address review findings (job 122)
ian-flores Feb 23, 2026
29be3c3
Address review findings (job 123)
ian-flores Feb 23, 2026
64bd4e2
Address review findings (job 126)
ian-flores Feb 23, 2026
be4d986
Address review findings (job 127)
ian-flores Feb 23, 2026
b7a6b57
Fix EventBridge IAM permissions chicken-and-egg problem
ian-flores Feb 25, 2026
fade18d
Fix Python lint and formatting errors
ian-flores Feb 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions lib/aws/iam.go
Original file line number Diff line number Diff line change
Expand Up @@ -1123,13 +1123,18 @@ func SqsActions() []Action {

// EventsActions returns the EventBridge ("events:") actions.
//
// SupportsManagedByCondition is false for all resource-level actions because
// EventBridge rules may not have the posit.team/managed-by tag (e.g., rules
// created by Karpenter). Requiring the tag creates a chicken-and-egg problem
// where you can't tag, describe, or manage rules that lack the tag.
// Account-scoping via ResourceConditionLimitingActions is sufficient.
func EventsActions() []Action {
	return []Action{
		// ListRules is the only entry with both flags off.
		{"events:ListRules", false, false},
		// Each resource-level pattern appears exactly once, with the
		// managed-by condition disabled as described above.
		{"events:ListTagsForResource", true, false},
		{"events:*Rule", true, false},
		{"events:*Targets", true, false},
		{"events:*tag*", true, false},
		{"events:PutRule", true, false},
	}
}

Expand Down
6 changes: 3 additions & 3 deletions python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ groups:
type: and
query:
params:
- C
- A
reducer:
params: []
type: last
Expand Down Expand Up @@ -107,7 +107,7 @@ groups:
type: and
query:
params:
- C
- A
reducer:
params: []
type: last
Expand Down Expand Up @@ -164,7 +164,7 @@ groups:
type: and
query:
params:
- C
- A
reducer:
params: []
type: last
Expand Down
247 changes: 247 additions & 0 deletions python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,247 @@
# Removing the ConfigMap that provisions these alerts does not delete them from Grafana.
# To delete them, replace this file's contents with the following and apply it
# (one deleteRules entry per rule uid, as listed below):
# apiVersion: 1
# deleteRules:
# - orgId: 1
# uid: alb_target_5xx_errors_high
# - orgId: 1
# uid: alb_unhealthy_targets
# - orgId: 1
# uid: nlb_unhealthy_targets
# - orgId: 1
# uid: alb_response_latency_high
#
# See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/
#
# Note: alert annotations reference {{$labels.cluster}}. For CloudWatch-sourced metrics,
# this label is injected by the prometheus.relabel.default block in grafana_alloy.py.
# If Alloy is not running or relabeling is misconfigured, the label will be absent and
# the annotation will render as "in cluster " (blank).
# Alert provisioning document: a single rule group named "LoadBalancer" in org 1,
# placed in the "Posit Alerts" folder, with a group interval of 5m.
apiVersion: 1
groups:
- orgId: 1
name: LoadBalancer
folder: Posit Alerts
# The rules below use "for" durations of 5m and 10m against this 5m interval.
interval: 5m
rules:
# Rule alb_target_5xx_errors_high: condition B holds when the last value of query A
# (ALB target 5XX count, sum) is greater than 10; it must hold for 5m.
- uid: alb_target_5xx_errors_high
title: ALB Target 5XX Errors High
condition: B
data:
# Query A: instant query against the mimir datasource over the last 600 seconds.
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: mimir
model:
editorMode: code
expr: aws_applicationelb_httpcode_target_5_xx_count_sum{job="integrations/cloudwatch"}
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
# Expression B: threshold on A (type: threshold, expression: A), evaluator "gt".
- refId: B
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 10 # Absolute count, not an error rate. High-traffic ALBs may never breach this; low-traffic ALBs may page on transient spikes. Consider a rate-based alert if traffic volume varies significantly across environments.
type: gt
operator:
type: and
query:
params:
- A
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: B
type: threshold
# NOTE(review): CloudWatch may publish no datapoints for this metric while there are
# no 5XX responses, which would leave this rule in NoData during healthy periods —
# confirm that noDataState: NoData is the intended behaviour here.
noDataState: NoData
execErrState: Error
for: 5m
annotations:
description: Application Load Balancer has more than 10 target 5XX errors for over 5 minutes, indicating backend service failures.
summary: High 5XX errors on ALB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}}
labels:
# presumably used by the notification policy to route to Opsgenie — verify against the contact-point configuration
opsgenie: "1"
isPaused: false
# Rule alb_unhealthy_targets: condition B holds when the last value of query A
# (ALB unhealthy host count, average) is greater than 0; it must hold for 10m.
- uid: alb_unhealthy_targets
title: ALB Unhealthy Targets
condition: B
data:
# Query A: instant query against the mimir datasource over the last 600 seconds.
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: mimir
model:
editorMode: code
expr: aws_applicationelb_un_healthy_host_count_average{job="integrations/cloudwatch"}
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
# Expression B: threshold on A (type: threshold, expression: A), evaluator "gt" 0.
- refId: B
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 0
type: gt
operator:
type: and
query:
params:
- A
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: B
type: threshold
# Missing data and evaluation errors are reported as their own states.
noDataState: NoData
execErrState: Error
# 10m pending period; the description below explains it is meant to ride out rolling deployments.
for: 10m
annotations:
description: Application Load Balancer has unhealthy targets for over 10 minutes, indicating backend service health issues. The extended window reduces false positives during rolling deployments.
summary: Unhealthy targets on ALB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}}
labels:
# presumably used by the notification policy to route to Opsgenie — verify against the contact-point configuration
opsgenie: "1"
isPaused: false
# Rule nlb_unhealthy_targets: condition B holds when the last value of query A
# (NLB unhealthy host count, average) is greater than 0; it must hold for 10m.
- uid: nlb_unhealthy_targets
title: NLB Unhealthy Targets
condition: B
data:
# Query A: instant query against the mimir datasource over the last 600 seconds.
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: mimir
model:
editorMode: code
expr: aws_networkelb_un_healthy_host_count_average{job="integrations/cloudwatch"}
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
# Expression B: threshold on A (type: threshold, expression: A), evaluator "gt" 0.
- refId: B
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 0
type: gt
operator:
type: and
query:
params:
- A
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: B
type: threshold
# Missing data and evaluation errors are reported as their own states.
noDataState: NoData
execErrState: Error
# 10m pending period; the description below explains it is meant to ride out rolling deployments and node drains.
for: 10m
annotations:
description: Network Load Balancer has unhealthy targets for over 10 minutes, indicating backend service health issues. The extended window reduces false positives during rolling deployments and node drains.
summary: Unhealthy targets on NLB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}}
labels:
# presumably used by the notification policy to route to Opsgenie — verify against the contact-point configuration
opsgenie: "1"
isPaused: false
# Rule alb_response_latency_high: condition B holds when the last value of query A
# (ALB target response time, average) is greater than 2; it must hold for 10m.
- uid: alb_response_latency_high
title: ALB Response Latency High
condition: B
data:
# Query A: instant query against the mimir datasource over the last 600 seconds.
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: mimir
model:
editorMode: code
expr: aws_applicationelb_target_response_time_average{job="integrations/cloudwatch"}
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
# Expression B: threshold on A (type: threshold, expression: A), evaluator "gt" 2.
- refId: B
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 2 # 2 seconds
type: gt
operator:
type: and
query:
params:
- A
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: B
type: threshold
# NOTE(review): if this metric has no datapoints while the ALB receives no traffic,
# the rule would report NoData for idle load balancers — confirm that is intended.
noDataState: NoData
execErrState: Error
for: 10m
annotations:
description: Application Load Balancer target response time is above 2 seconds for more than 10 minutes, indicating performance degradation.
summary: High response latency on ALB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}}
labels:
# presumably used by the notification policy to route to Opsgenie — verify against the contact-point configuration
opsgenie: "1"
isPaused: false
Loading
Loading