Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions python-pulumi/src/ptd/grafana_alerts/loadbalancer.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ groups:
summary: High 5XX errors on ALB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}}
labels:
opsgenie: "1"
severity: warning
isPaused: false
- uid: alb_unhealthy_targets
title: ALB Unhealthy Targets
Expand Down Expand Up @@ -128,12 +129,13 @@ groups:
type: threshold
noDataState: NoData
execErrState: Error
for: 10m
for: 5m
annotations:
description: Application Load Balancer has unhealthy targets for over 10 minutes, indicating backend service health issues. The extended window reduces false positives during rolling deployments.
description: Application Load Balancer has unhealthy targets for over 5 minutes, indicating backend service health issues.
summary: Unhealthy targets on ALB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}}
labels:
opsgenie: "1"
severity: warning
isPaused: false
- uid: nlb_unhealthy_targets
title: NLB Unhealthy Targets
Expand Down Expand Up @@ -183,12 +185,13 @@ groups:
type: threshold
noDataState: NoData
execErrState: Error
for: 10m
for: 5m
annotations:
description: Network Load Balancer has unhealthy targets for over 10 minutes, indicating backend service health issues. The extended window reduces false positives during rolling deployments and node drains.
description: Network Load Balancer has unhealthy targets for over 5 minutes, indicating backend service health issues.
summary: Unhealthy targets on NLB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}}
labels:
opsgenie: "1"
severity: warning
isPaused: false
- uid: alb_response_latency_high
title: ALB Response Latency High
Expand Down Expand Up @@ -244,4 +247,5 @@ groups:
summary: High response latency on ALB {{$labels.dimension_LoadBalancer}} in cluster {{$labels.cluster}}
labels:
opsgenie: "1"
severity: warning
isPaused: false
135 changes: 6 additions & 129 deletions python-pulumi/src/ptd/grafana_alerts/natgateway.yaml
Original file line number Diff line number Diff line change
@@ -1,133 +1,10 @@
# Removing the ConfigMap that provisions these alerts does not delete them from Grafana.
# To delete them, replace this file's contents with the following and apply it
# (repeat the deleteRules entry for each uid listed below):
# apiVersion: 1
# deleteRules:
# - orgId: 1
# uid: nat_gateway_port_allocation_errors
# - orgId: 1
# uid: nat_gateway_packets_dropped
# These alerts have been deleted. The deleteRules entries below will remove them from Grafana
# on the next provisioning run.
#
# See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/
#
# Note: alert annotations reference {{$labels.cluster}}. For CloudWatch-sourced metrics,
# this label is injected by the prometheus.relabel.default block in grafana_alloy.py.
# If Alloy is not running or relabeling is misconfigured, the label will be absent and
# the annotation will render as "in cluster " (blank).
apiVersion: 1
groups:
deleteRules:
- orgId: 1
uid: nat_gateway_port_allocation_errors
- orgId: 1
name: NATGateway
folder: Posit Alerts
interval: 5m
rules:
- uid: nat_gateway_port_allocation_errors
title: NAT Gateway Port Allocation Errors
condition: B
data:
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: mimir
model:
editorMode: code
expr: aws_natgateway_error_port_allocation_sum{job="integrations/cloudwatch"}
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
- refId: B
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 0
type: gt
operator:
type: and
query:
params:
- A
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: B
type: threshold
noDataState: NoData # Using NoData to avoid spurious pages during scrape outages (Alloy restart, credential rotation, brief partitions). Add a separate "CloudWatch scrape down" alert to cover the outage case independently.
execErrState: Error
for: 5m
annotations:
description: NAT Gateway is experiencing port allocation errors, which means outbound network connectivity is failing. This is a critical issue that requires immediate attention.
summary: Port allocation errors on NAT Gateway {{$labels.dimension_NatGatewayId}} in cluster {{$labels.cluster}}
labels:
opsgenie: "1"
isPaused: false
- uid: nat_gateway_packets_dropped
title: NAT Gateway Packets Dropped
condition: B
data:
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: mimir
model:
editorMode: code
expr: aws_natgateway_packets_drop_count_sum{job="integrations/cloudwatch"}
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
- refId: B
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 100 # Calibrated as a conservative baseline; high-throughput gateways may see this normally. Adjust per environment.
type: gt
operator:
type: and
query:
params:
- A
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: B
type: threshold
noDataState: NoData
execErrState: Error
for: 5m
annotations:
description: NAT Gateway has dropped more than 100 packets for over 5 minutes, indicating potential network issues.
summary: High packet drop rate on NAT Gateway {{$labels.dimension_NatGatewayId}} in cluster {{$labels.cluster}}
labels:
opsgenie: "1"
isPaused: false
uid: nat_gateway_packets_dropped
79 changes: 18 additions & 61 deletions python-pulumi/src/ptd/grafana_alerts/rds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@
# - orgId: 1
# uid: rds_freeable_memory_low
# - orgId: 1
# uid: rds_read_latency_high
# - orgId: 1
# uid: rds_database_connections_high
#
# See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/
Expand All @@ -21,6 +19,9 @@
# If Alloy is not running or relabeling is misconfigured, the label will be absent and
# the annotation will render as "in cluster " (blank).
apiVersion: 1
deleteRules:
- orgId: 1
uid: rds_read_latency_high
groups:
- orgId: 1
name: RDS
Expand Down Expand Up @@ -81,6 +82,7 @@ groups:
summary: High CPU utilization on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}}
labels:
opsgenie: "1"
severity: warning
isPaused: false
- uid: rds_free_storage_low
title: RDS Free Storage Low
Expand Down Expand Up @@ -136,6 +138,11 @@ groups:
summary: Low free storage on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}}
labels:
opsgenie: "1"
severity: warning
# Note: threshold is absolute (5 GiB) rather than percentage-based. CloudWatch does not
# expose AllocatedStorage as a time-series metric for RDS (it is an instance attribute),
# so computing a usage percentage is not feasible without a separate exporter or recording
# rule. The 5 GiB threshold is calibrated to PTD's default 100 GiB allocation.
isPaused: false
- uid: rds_freeable_memory_low
title: RDS Freeable Memory Low
Expand Down Expand Up @@ -191,63 +198,13 @@ groups:
summary: Low freeable memory on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}}
labels:
opsgenie: "1"
severity: warning
instance_size_dependent: "true" # Silence alerts carrying this label for known-small instance classes
# Note: threshold is absolute (512 MiB) rather than percentage-based. CloudWatch does not
# expose total instance RAM as a metric for RDS — it varies by instance type. On PTD's default
# instance (db.t3.small, 2 GiB), a 90%-used threshold would fire constantly under normal
# Postgres buffer cache load, making percentage-based alerting impractical here.
isPaused: true # Paused until a Grafana silence or Alertmanager inhibit rule is configured for the instance_size_dependent label
- uid: rds_read_latency_high
title: RDS Read Latency High
condition: B
data:
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: mimir
model:
editorMode: code
expr: aws_rds_read_latency_average{job="integrations/cloudwatch"}
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
- refId: B
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 0.1 # 100ms in seconds
type: gt
operator:
type: and
query:
params:
- A
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: B
type: threshold
noDataState: NoData # Performance metric; silent suppression on scrape outage is acceptable
execErrState: Error
for: 10m
annotations:
description: RDS instance read latency is above 100ms for more than 10 minutes.
summary: High read latency on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}}
labels:
opsgenie: "1"
isPaused: false
- uid: rds_database_connections_high
title: RDS Database Connections High
condition: B
Expand Down Expand Up @@ -275,7 +232,7 @@ groups:
conditions:
- evaluator:
params:
- 500 # Calibrated for db.r5.large (max ~4000). For small instances (db.t3.small max ~36) this alert will never fire; adjust per instance class.
- 80
type: gt
operator:
type: and
Expand All @@ -298,9 +255,9 @@ groups:
execErrState: Error
for: 5m
annotations:
description: RDS instance has more than 500 active database connections for more than 5 minutes. Note: this threshold is calibrated for db.r5.large (max ~4000 connections); it will never fire for small instances (e.g. db.t3.small max ~36). Adjust per instance class.
description: RDS instance has more than 80 active database connections for more than 5 minutes.
summary: High database connections on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}}
labels:
opsgenie: "1"
instance_size_dependent: "true" # Silence alerts carrying this label for known-small instance classes
isPaused: true # Paused until a Grafana silence or Alertmanager inhibit rule is configured for the instance_size_dependent label
severity: warning
isPaused: false
Loading