diff --git a/python-pulumi/src/ptd/grafana_alerts/applications.yaml b/python-pulumi/src/ptd/grafana_alerts/applications.yaml index 7f36bb7..db1a4bf 100644 --- a/python-pulumi/src/ptd/grafana_alerts/applications.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/applications.yaml @@ -63,7 +63,7 @@ groups: execErrState: Error for: 5m annotations: - summary: "🔴 CRITICAL: Loki WAL Disk Full" + summary: "🟡 WARNING: Loki WAL Disk Full" description: | Loki ingester experiencing WAL disk full failures diff --git a/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml b/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml index 076c603..d29c724 100644 --- a/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml @@ -3,18 +3,23 @@ # apiVersion: 1 # deleteRules: # - orgId: 1 -# uid: fsx_capacity +# uid: fsx_capacity_warning +# - orgId: 1 +# uid: fsx_capacity_critical # # See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/ apiVersion: 1 +deleteRules: + - orgId: 1 + uid: fsx_capacity groups: - orgId: 1 name: Cloudwatch folder: Posit Alerts interval: 5m rules: - - uid: fsx_capacity - title: FSx Capacity + - uid: fsx_capacity_warning + title: FSx Capacity Warning condition: C data: - refId: A @@ -82,84 +87,9 @@ groups: labels: opsgenie: "1" isPaused: false - - uid: ec2_network_out_high - title: EC2 Network Out High - condition: B - data: - - refId: A - relativeTimeRange: - from: 600 - to: 0 - datasourceUid: mimir - model: - editorMode: code - # Network out threshold: 314572800 bytes/s (~300 MiB/s) - # Based on analysis of Loki->S3 traffic patterns from issue #2347 - # Instance-aware thresholds: Using the same threshold for all instances - # To set different thresholds by instance type, use: - # avg_over_time(aws_ec2_network_out_average{job="integrations/cloudwatch", dimension_InstanceType="t3.xlarge"}[5m]) > 157286400 or - # avg_over_time(aws_ec2_network_out_average{job="integrations/cloudwatch", dimension_InstanceType="m5.2xlarge"}[5m]) > 314572800 - expr: avg_over_time(aws_ec2_network_out_average{job="integrations/cloudwatch"}[5m]) - instant: true - intervalMs: 1000 - legendFormat: __auto - maxDataPoints: 43200 - range: false - refId: A - - refId: B - relativeTimeRange: - from: 600 - to: 0 - datasourceUid: __expr__ - model: - conditions: - - evaluator: - params: - - 3.145728e+08 # ~ 300 MiB/s - type: gt - operator: - type: and - query: - params: - - A - reducer: - params: [] - type: last - type: query - datasource: - type: __expr__ - uid: __expr__ - expression: A - intervalMs: 1000 - maxDataPoints: 43200 - refId: B - type: threshold - noDataState: NoData - execErrState: Error - for: 5m - annotations: - summary: "🟡 WARNING: EC2 Network Out High" - description: | - EC2 instance has unusually high outbound network traffic - - ─── WHERE ─────────────────────────── - Tenant: {{ $labels.tenant_name }} - Cluster: {{ $labels.cluster }} - Resource: {{ $labels.dimension_InstanceId }} - Region: {{ $labels.region }} - - ─── DETAILS ───────────────────────── - Metric: Network Out - Current: > 300 MiB/s - Threshold: 300 MiB/s - Duration: 5 minutes - - labels: - opsgenie: "1" - isPaused: false - - uid: ec2_network_packets_out_high - title: EC2 Network Packets Out High - condition: B + - uid: fsx_capacity_critical + title: FSx Capacity Critical + condition: C data: - refId: A relativeTimeRange: @@ -168,16 +98,14 @@ groups: datasourceUid: mimir model: editorMode: code - # Network packets out threshold: 400000 packets/s - # High packet rate can indicate network bottlenecks or unusual traffic patterns - expr: avg_over_time(aws_ec2_network_packets_out_average{job="integrations/cloudwatch"}[5m]) + expr: aws_fsx_used_storage_capacity_average{job="integrations/cloudwatch", dimension_DataType!="Snapshot", dimension_VolumeId!=""} / aws_fsx_storage_capacity_average{job="integrations/cloudwatch", dimension_VolumeId!=""} instant: true intervalMs: 1000 legendFormat: __auto maxDataPoints: 43200 range: false refId: A - - refId: B + - refId: C relativeTimeRange: from: 600 to: 0 @@ -186,7 +114,7 @@ groups: conditions: - evaluator: params: - - 400000 + - 0.9 type: gt operator: type: and @@ -203,26 +131,26 @@ groups: expression: A intervalMs: 1000 maxDataPoints: 43200 - refId: B + refId: C type: threshold noDataState: NoData execErrState: Error for: 5m annotations: - summary: "🟡 WARNING: EC2 Network Packets Out High" + summary: "🔴 CRITICAL: FSx Storage Capacity Critical" description: | - EC2 instance has unusually high packet transmission rate + FSx file system storage capacity is critically low ─── WHERE ─────────────────────────── Tenant: {{ $labels.tenant_name }} Cluster: {{ $labels.cluster }} - Resource: {{ $labels.dimension_InstanceId }} + Resource: {{ $labels.dimension_FileSystemId }} Region: {{ $labels.region }} ─── DETAILS ───────────────────────── - Metric: Network Packets Out - Current: > 400,000 packets/s - Threshold: 400,000 packets/s + Metric: Storage Capacity + Current: > 90% used + Threshold: 90% Duration: 5 minutes labels: diff --git a/python-pulumi/src/ptd/grafana_alerts/pods.yaml b/python-pulumi/src/ptd/grafana_alerts/pods.yaml index 5ffda98..1da8db8 100644 --- a/python-pulumi/src/ptd/grafana_alerts/pods.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/pods.yaml @@ -4,8 +4,6 @@ # deleteRules: # - orgId: 1 # uid: crash_loop_backoff -# - orgId: 1 -# uid: pod_error # # See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/ # @@ -116,7 +114,7 @@ groups: execErrState: Error for: 5m annotations: - summary: "🔴 CRITICAL: Container Crash-Looping" + summary: "🟡 WARNING: Container Crash-Looping" description: | Container keeps crashing and restarting @@ -131,313 +129,6 @@ groups: Status: CrashLoopBackOff Duration: 5 minutes - labels: - opsgenie: "1" - isPaused: false - - uid: pod_error - title: Pod Error - condition: C - data: - - refId: A - relativeTimeRange: - from: 600 - to: 0 - datasourceUid: mimir - model: - datasource: - type: prometheus - uid: mimir - disableTextWrap: false - editorMode: code - expr: count by(cluster, namespace, pod, reason) (kube_pod_container_status_terminated_reason{namespace=~"posit-team|posit-team-system|alloy|mimir|loki|grafana",reason!="Completed"} * on(cluster,pod) group_left(label_launcher_instance_id) kube_pod_labels{label_launcher_instance_id=""}) - fullMetaSearch: false - includeNullMetadata: true - instant: true - intervalMs: 1000 - legendFormat: __auto - maxDataPoints: 43200 - range: false - refId: A - useBackend: false - - refId: B - relativeTimeRange: - from: 600 - to: 0 - datasourceUid: __expr__ - model: - conditions: - - evaluator: - params: [] - type: gt - operator: - type: and - query: - params: - - B - reducer: - params: [] - type: last - type: query - datasource: - type: __expr__ - uid: __expr__ - expression: A - intervalMs: 1000 - maxDataPoints: 43200 - reducer: count - refId: B - settings: - mode: dropNN - type: reduce - - refId: C - relativeTimeRange: - from: 600 - to: 0 - datasourceUid: __expr__ - model: - conditions: - - evaluator: - params: - - 0 - type: gt - operator: - type: and - query: - params: - - C - reducer: - params: [] - type: last - type: query - datasource: - type: __expr__ - uid: __expr__ - expression: B - intervalMs: 1000 - maxDataPoints: 43200 - refId: C - type: threshold - noDataState: OK - execErrState: Error - for: 5m - annotations: - summary: "🟡 WARNING: Pod Error" - description: | - Pod container terminated with an error - - ─── WHERE ─────────────────────────── - Tenant: {{ $labels.tenant_name }} - Cluster: {{ $labels.cluster }} - Namespace: {{ $labels.namespace }} - Pod: {{ $labels.pod }} - - ─── DETAILS ───────────────────────── - Reason: {{ $labels.reason }} - Duration: 5 minutes - - labels: - opsgenie: "1" - isPaused: false - - uid: PodNotHealthy - title: Pod Not Healthy - condition: B - data: - - refId: A - relativeTimeRange: - from: 600 - to: 0 - datasourceUid: mimir - model: - datasource: - type: prometheus - uid: mimir - disableTextWrap: false - editorMode: code - expr: sum by (cluster, namespace, pod, phase) (kube_pod_status_phase{namespace=~"posit-team|posit-team-system|alloy|mimir|loki|grafana",phase=~"Pending|Unknown|Failed"} * on(cluster,pod) group_left(label_launcher_instance_id) kube_pod_labels{label_launcher_instance_id=""}) > 0 - fullMetaSearch: false - includeNullMetadata: true - instant: true - intervalMs: 60000 - legendFormat: __auto - maxDataPoints: 43200 - range: false - refId: A - useBackend: false - - refId: B - relativeTimeRange: - from: 600 - to: 0 - datasourceUid: __expr__ - model: - conditions: - - evaluator: - params: - - 0 - - 0 - type: gt - operator: - type: and - query: - params: [] - reducer: - params: [] - type: avg - type: query - datasource: - name: Expression - type: __expr__ - uid: __expr__ - expression: A - hide: false - reducer: last - refId: B - type: reduce - - refId: C - datasourceUid: __expr__ - model: - conditions: - - evaluator: - params: - - 0 - - 0 - type: gt - operator: - type: and - query: - params: [] - reducer: - params: [] - type: avg - type: query - datasource: - name: Expression - type: __expr__ - uid: __expr__ - expression: B - hide: false - refId: C - type: threshold - noDataState: OK - execErrState: Error - for: 15m - annotations: - summary: "🟡 WARNING: Pod Not Healthy" - description: | - Pod has been in a non-running state - - ─── WHERE ─────────────────────────── - Tenant: {{ $labels.tenant_name }} - Cluster: {{ $labels.cluster }} - Namespace: {{ $labels.namespace }} - Pod: {{ $labels.pod }} - - ─── DETAILS ───────────────────────── - Phase: {{ $labels.phase }} - Duration: 15 minutes - - labels: - opsgenie: "1" - isPaused: false - - uid: PodRestarts - title: Pod Restarts - condition: C - data: - - refId: A - relativeTimeRange: - from: 600 - to: 0 - datasourceUid: mimir - model: - datasource: - type: prometheus - uid: mimir - disableTextWrap: false - editorMode: code - expr: avg_over_time(increase(kube_pod_container_status_restarts_total{namespace=~"posit-team|posit-team-system|alloy|mimir|loki|grafana"}[15m])[15m:1m]) > 5 - fullMetaSearch: false - includeNullMetadata: true - instant: false - intervalMs: 60000 - legendFormat: __auto - maxDataPoints: 43200 - range: true - refId: A - useBackend: false - - refId: B - relativeTimeRange: - from: 600 - to: 0 - datasourceUid: __expr__ - model: - conditions: - - evaluator: - params: - - 0 - - 0 - type: gt - operator: - type: and - query: - params: [] - reducer: - params: [] - type: avg - type: query - datasource: - name: Expression - type: __expr__ - uid: __expr__ - expression: A - hide: false - intervalMs: 1000 - maxDataPoints: 43200 - reducer: last - refId: B - type: reduce - - refId: C - datasourceUid: __expr__ - model: - conditions: - - evaluator: - params: - - 0 - - 0 - type: gt - operator: - type: and - query: - params: [] - reducer: - params: [] - type: avg - type: query - datasource: - name: Expression - type: __expr__ - uid: __expr__ - expression: B - hide: false - intervalMs: 1000 - maxDataPoints: 43200 - refId: C - type: threshold - noDataState: OK - execErrState: Error - for: 15m - annotations: - summary: "🟡 WARNING: Pod Restarts" - description: | - Pod has restarted excessively - - ─── WHERE ─────────────────────────── - Tenant: {{ $labels.tenant_name }} - Cluster: {{ $labels.cluster }} - Namespace: {{ $labels.namespace }} - Pod: {{ $labels.pod }} - - ─── DETAILS ───────────────────────── - Issue: > 5 restarts in 15 minutes - labels: opsgenie: "1" isPaused: false diff --git a/python-pulumi/src/ptd/grafana_alerts/rds.yaml b/python-pulumi/src/ptd/grafana_alerts/rds.yaml index c09e58a..63c4c49 100644 --- a/python-pulumi/src/ptd/grafana_alerts/rds.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/rds.yaml @@ -6,7 +6,9 @@ # - orgId: 1 # uid: rds_cpu_utilization_high # - orgId: 1 -# uid: rds_free_storage_low +# uid: rds_free_storage_low_warning +# - orgId: 1 +# uid: rds_free_storage_low_critical # - orgId: 1 # uid: rds_freeable_memory_low # - orgId: 1 @@ -22,6 +24,8 @@ apiVersion: 1 deleteRules: - orgId: 1 uid: rds_read_latency_high + - orgId: 1 + uid: rds_free_storage_low groups: - orgId: 1 name: RDS @@ -74,18 +78,101 @@ groups: maxDataPoints: 43200 refId: B type: threshold - noDataState: NoData # Performance metric; silent suppression on scrape outage is acceptable + noDataState: NoData execErrState: Error for: 10m annotations: - description: RDS instance CPU utilization is above 80% for more than 10 minutes. - summary: High CPU utilization on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}} + summary: "🟡 WARNING: RDS CPU Utilization High" + description: | + RDS instance CPU utilization is elevated + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Resource: {{ $labels.dimension_DBInstanceIdentifier }} + Region: {{ $labels.region }} + + ─── DETAILS ───────────────────────── + Metric: CPU Utilization + Current: > 80% + Threshold: 80% + Duration: 10 minutes + + labels: + opsgenie: "1" + isPaused: false + - uid: rds_free_storage_low_warning + title: RDS Free Storage Low (Warning) + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: aws_rds_free_storage_space_average{job="integrations/cloudwatch"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + # 10 GiB in bytes + - 10737418240 + type: lt + operator: + type: and + query: + params: + - A + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: Alerting + execErrState: Error + for: 5m + annotations: + summary: "🟡 WARNING: RDS Free Storage Low" + description: | + RDS instance storage capacity is running low + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Resource: {{ $labels.dimension_DBInstanceIdentifier }} + Region: {{ $labels.region }} + + ─── DETAILS ───────────────────────── + Metric: Free Storage Space + Current: < 10 GiB free + Threshold: 10 GiB + Duration: 5 minutes + labels: opsgenie: "1" - severity: warning isPaused: false - - uid: rds_free_storage_low - title: RDS Free Storage Low + - uid: rds_free_storage_low_critical + title: RDS Free Storage Low (Critical) condition: B data: - refId: A @@ -111,7 +198,8 @@ groups: conditions: - evaluator: params: - - 5368709120 # 5 GiB in bytes; calibrated for mid-size instances (100–500 GiB). Adjust for larger (e.g. 1 TiB) or smaller instances. + # 5 GiB in bytes + - 5368709120 type: lt operator: type: and @@ -130,19 +218,28 @@ groups: maxDataPoints: 43200 refId: B type: threshold - noDataState: Alerting # Storage exhaustion is latent; alert even when scraping stops so we don't silently miss a full disk + noDataState: Alerting execErrState: Error for: 5m annotations: - description: RDS instance has less than 5 GiB of free storage space remaining. Note: on new cluster deployments where CloudWatch scraping has not yet initialized, noDataState=Alerting may produce a false positive after the for:5m window; this is expected during provisioning. - summary: Low free storage on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}} + summary: "🔴 CRITICAL: RDS Free Storage Critical" + description: | + RDS instance storage capacity is critically low + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Resource: {{ $labels.dimension_DBInstanceIdentifier }} + Region: {{ $labels.region }} + + ─── DETAILS ───────────────────────── + Metric: Free Storage Space + Current: < 5 GiB free + Threshold: 5 GiB + Duration: 5 minutes + labels: opsgenie: "1" - severity: warning - # Note: threshold is absolute (5 GiB) rather than percentage-based. CloudWatch does not - # expose AllocatedStorage as a time-series metric for RDS (it is an instance attribute), - # so computing a usage percentage is not feasible without a separate exporter or recording - # rule. The 5 GiB threshold is calibrated to PTD's default 100 GiB allocation. isPaused: false - uid: rds_freeable_memory_low title: RDS Freeable Memory Low @@ -171,7 +268,8 @@ groups: conditions: - evaluator: params: - - 536870912 # 512 MiB in bytes; calibrated for db.r5.large (~16 GiB RAM). Adjust for other instance classes. + # 200 MiB in bytes + - 209715200 type: lt operator: type: and @@ -190,21 +288,29 @@ groups: maxDataPoints: 43200 refId: B type: threshold - noDataState: Alerting # Memory exhaustion is latent; alert even when scraping stops so we don't silently miss an OOM condition + noDataState: Alerting execErrState: Error for: 10m annotations: - description: RDS instance has less than 512 MiB of freeable memory remaining for more than 10 minutes. Note: this threshold is calibrated for db.r5.large (~16 GiB RAM); it will fire continuously for small instances (e.g. db.t3.micro, db.t3.small). Adjust per instance class. - summary: Low freeable memory on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}} + summary: "🟡 WARNING: RDS Freeable Memory Low" + description: | + RDS instance freeable memory is low + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Resource: {{ $labels.dimension_DBInstanceIdentifier }} + Region: {{ $labels.region }} + + ─── DETAILS ───────────────────────── + Metric: Freeable Memory + Current: < 200 MiB free + Threshold: 200 MiB + Duration: 10 minutes + labels: opsgenie: "1" - severity: warning - instance_size_dependent: "true" # Silence this label for known-small instance classes - # Note: threshold is absolute (512 MiB) rather than percentage-based. CloudWatch does not - # expose total instance RAM as a metric for RDS — it varies by instance type. PTD's default - # instance (db.t3.small, 2 GiB) would fire constantly at a 90%-used threshold under normal - # Postgres buffer cache load, making percentage-based alerting impractical here. - isPaused: true # Paused until a Grafana silence or Alertmanager inhibit rule is configured for the instance_size_dependent label + isPaused: false - uid: rds_database_connections_high title: RDS Database Connections High condition: B @@ -251,13 +357,26 @@ groups: maxDataPoints: 43200 refId: B type: threshold - noDataState: NoData # Performance metric; silent suppression on scrape outage is acceptable + noDataState: NoData execErrState: Error for: 5m annotations: - description: RDS instance has more than 80 active database connections for more than 5 minutes. - summary: High database connections on RDS instance {{$labels.dimension_DBInstanceIdentifier}} in cluster {{$labels.cluster}} + summary: "🟡 WARNING: RDS Database Connections High" + description: | + RDS instance has high number of database connections + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Resource: {{ $labels.dimension_DBInstanceIdentifier }} + Region: {{ $labels.region }} + + ─── DETAILS ───────────────────────── + Metric: Database Connections + Current: > 80 connections + Threshold: 80 + Duration: 5 minutes + labels: opsgenie: "1" - severity: warning isPaused: false