Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion python-pulumi/src/ptd/grafana_alerts/applications.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ groups:
execErrState: Error
for: 5m
annotations:
summary: "🔴 CRITICAL: Loki WAL Disk Full"
summary: "🟡 WARNING: Loki WAL Disk Full"
description: |
Loki ingester experiencing WAL disk full failures

Expand Down
114 changes: 21 additions & 93 deletions python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,23 @@
# apiVersion: 1
# deleteRules:
# - orgId: 1
# uid: fsx_capacity
# uid: fsx_capacity_warning
# - orgId: 1
# uid: fsx_capacity_critical
#
# See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/
apiVersion: 1
deleteRules:
- orgId: 1
uid: fsx_capacity
groups:
- orgId: 1
name: Cloudwatch
folder: Posit Alerts
interval: 5m
rules:
- uid: fsx_capacity
title: FSx Capacity
- uid: fsx_capacity_warning
title: FSx Capacity Warning
condition: C
data:
- refId: A
Expand Down Expand Up @@ -82,84 +87,9 @@ groups:
labels:
opsgenie: "1"
isPaused: false
- uid: ec2_network_out_high
title: EC2 Network Out High
condition: B
data:
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: mimir
model:
editorMode: code
# Network out threshold: 314572800 bytes/s (~300 MiB/s)
# Based on analysis of Loki->S3 traffic patterns from issue #2347
# Instance-aware thresholds: Using the same threshold for all instances
# To set different thresholds by instance type, use:
# avg_over_time(aws_ec2_network_out_average{job="integrations/cloudwatch", dimension_InstanceType="t3.xlarge"}[5m]) > 157286400 or
# avg_over_time(aws_ec2_network_out_average{job="integrations/cloudwatch", dimension_InstanceType="m5.2xlarge"}[5m]) > 314572800
expr: avg_over_time(aws_ec2_network_out_average{job="integrations/cloudwatch"}[5m])
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
- refId: B
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 3.145728e+08 # ~ 300 MiB/s
type: gt
operator:
type: and
query:
params:
- A
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: B
type: threshold
noDataState: NoData
execErrState: Error
for: 5m
annotations:
summary: "🟡 WARNING: EC2 Network Out High"
description: |
EC2 instance has unusually high outbound network traffic

─── WHERE ───────────────────────────
Tenant: {{ $labels.tenant_name }}
Cluster: {{ $labels.cluster }}
Resource: {{ $labels.dimension_InstanceId }}
Region: {{ $labels.region }}

─── DETAILS ─────────────────────────
Metric: Network Out
Current: > 300 MiB/s
Threshold: 300 MiB/s
Duration: 5 minutes

labels:
opsgenie: "1"
isPaused: false
- uid: ec2_network_packets_out_high
title: EC2 Network Packets Out High
condition: B
- uid: fsx_capacity_critical
title: FSx Capacity Critical
condition: C
data:
- refId: A
relativeTimeRange:
Expand All @@ -168,16 +98,14 @@ groups:
datasourceUid: mimir
model:
editorMode: code
# Network packets out threshold: 400000 packets/s
# High packet rate can indicate network bottlenecks or unusual traffic patterns
expr: avg_over_time(aws_ec2_network_packets_out_average{job="integrations/cloudwatch"}[5m])
expr: aws_fsx_used_storage_capacity_average{job="integrations/cloudwatch", dimension_DataType!="Snapshot", dimension_VolumeId!=""} / aws_fsx_storage_capacity_average{job="integrations/cloudwatch", dimension_VolumeId!=""}
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
- refId: B
- refId: C
relativeTimeRange:
from: 600
to: 0
Expand All @@ -186,7 +114,7 @@ groups:
conditions:
- evaluator:
params:
- 400000
- 0.9
type: gt
operator:
type: and
Expand All @@ -203,26 +131,26 @@ groups:
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: B
refId: C
type: threshold
noDataState: NoData
execErrState: Error
for: 5m
annotations:
summary: "🟡 WARNING: EC2 Network Packets Out High"
summary: "🔴 CRITICAL: FSx Storage Capacity Critical"
description: |
EC2 instance has unusually high packet transmission rate
FSx file system storage capacity is critically low

─── WHERE ───────────────────────────
Tenant: {{ $labels.tenant_name }}
Cluster: {{ $labels.cluster }}
Resource: {{ $labels.dimension_InstanceId }}
Resource: {{ $labels.dimension_FileSystemId }}
Region: {{ $labels.region }}

─── DETAILS ─────────────────────────
Metric: Network Packets Out
Current: > 400,000 packets/s
Threshold: 400,000 packets/s
Metric: Storage Capacity
Current: > 90% used
Threshold: 90%
Duration: 5 minutes

labels:
Expand Down
Loading
Loading