Skip to content
1 change: 1 addition & 0 deletions python-pulumi/src/ptd/azure_roles.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# Azure built-in role definition IDs (GUIDs) used when creating role assignments.
# These are fixed, globally well-known values published by Microsoft; they are the
# same in every tenant/subscription. See the "Azure built-in roles" documentation
# to verify or look up additional IDs.
ACR_PULL_ROLE_DEFINITION_ID = "7f951dda-4ed3-4680-a7ca-43fe172d538d"
CONTRIBUTOR_ROLE_DEFINITION_ID = "b24988ac-6180-42a0-ab88-20f7382dd24c"
DNS_ZONE_CONTRIBUTOR_ROLE_DEFINITION_ID = "befefa01-2a29-4197-83a8-272ff33ce314"
MONITORING_READER_ROLE_DEFINITION_ID = "43d0d8ad-25c7-4714-9337-8ba259a9fe05"
NETWORK_CONTRIBUTOR_ROLE_DEFINITION_ID = "4d97b98b-1d4f-4787-a291-c67834d212e7"
READER_ROLE_DEFINITION_ID = "acdd72a7-3385-48ef-bd42-f606fba81ae7"
STORAGE_BLOB_DATA_CONTRIBUTOR_ROLE_DEFINITION_ID = "ba92f5b4-2d11-453d-a403-e96b0029c9fe"
Expand Down
193 changes: 193 additions & 0 deletions python-pulumi/src/ptd/grafana_alerts/azure_loadbalancer.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
# NOTE: deleting the ConfigMap that provisions these alerts will NOT remove them
# from Grafana. To delete the alerts, replace this file's contents with the
# following and apply it (repeat the deleteRules entry for each uid listed below):
# apiVersion: 1
# deleteRules:
# - orgId: 1
# uid: azure_lb_health_probe_down
# - orgId: 1
# uid: azure_lb_data_path_down
# - orgId: 1
# uid: azure_lb_snat_port_exhaustion
#
# See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/
#
# Note: alert annotations reference {{$labels.cluster}}. For Azure Monitor-sourced metrics,
# this label is injected by the prometheus.relabel.default block in grafana_alloy.py.
# If Alloy is not running or relabeling is misconfigured, the label will be absent and
# the annotation will render as "in cluster " (blank).
apiVersion: 1
groups:
- orgId: 1
name: Azure Load Balancer
folder: Posit Alerts
interval: 5m
rules:
- uid: azure_lb_health_probe_down
title: Azure Load Balancer Health Probe Down
condition: B
data:
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: mimir
model:
editorMode: code
expr: azure_microsoft_network_loadbalancers_dipavailability{job="integrations/azure"}
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
- refId: B
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 100
type: lt
operator:
type: and
query:
params:
- A
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: B
type: threshold
noDataState: NoData
execErrState: Error
for: 5m
annotations:
description: Azure Load Balancer backend health probe availability is below 100% for over 5 minutes, indicating unhealthy backend instances. This is a critical issue that requires immediate attention.
summary: Health probe down on Azure Load Balancer {{$labels.resource}} in cluster {{$labels.cluster}}
labels:
opsgenie: "1"
isPaused: false
- uid: azure_lb_data_path_down
title: Azure Load Balancer Data Path Down
condition: B
data:
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: mimir
model:
editorMode: code
expr: azure_microsoft_network_loadbalancers_vipavailability{job="integrations/azure"}
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
- refId: B
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 100
type: lt
operator:
type: and
query:
params:
- A
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: B
type: threshold
noDataState: NoData
execErrState: Error
for: 5m
annotations:
description: Azure Load Balancer data path availability is below 100% for over 5 minutes, indicating the load balancer frontend is not responding to health probes. This is a critical issue that requires immediate attention.
summary: Data path down on Azure Load Balancer {{$labels.resource}} in cluster {{$labels.cluster}}
labels:
opsgenie: "1"
isPaused: false
- uid: azure_lb_snat_port_exhaustion
title: Azure Load Balancer SNAT Port Exhaustion
condition: B
data:
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: mimir
model:
editorMode: code
expr: |
(azure_microsoft_network_loadbalancers_usedsnatports{job="integrations/azure"}
/
azure_microsoft_network_loadbalancers_allocatedsnatports{job="integrations/azure"}) * 100
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
- refId: B
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 80
type: gt
operator:
type: and
query:
params:
- A
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: B
type: threshold
noDataState: NoData
execErrState: Error
for: 5m
annotations:
description: Azure Load Balancer is using more than 80% of allocated SNAT ports for over 5 minutes. SNAT port exhaustion can cause outbound connection failures and may require increasing the number of backend instances or using a NAT Gateway.
summary: SNAT port exhaustion on Azure Load Balancer {{$labels.resource}} in cluster {{$labels.cluster}}
labels:
opsgenie: "1"
isPaused: false
190 changes: 190 additions & 0 deletions python-pulumi/src/ptd/grafana_alerts/azure_netapp.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
# NOTE: deleting the ConfigMap that provisions these alerts will NOT remove them
# from Grafana. To delete the alerts, replace this file's contents with the
# following and apply it (repeat the deleteRules entry for each uid listed below):
# apiVersion: 1
# deleteRules:
# - orgId: 1
# uid: azure_netapp_capacity_high
# - orgId: 1
# uid: azure_netapp_read_latency_high
# - orgId: 1
# uid: azure_netapp_write_latency_high
#
# See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/
#
# Note: alert annotations reference {{$labels.cluster}}. For Azure Monitor-sourced metrics,
# this label is injected by the prometheus.relabel.default block in grafana_alloy.py.
# If Alloy is not running or relabeling is misconfigured, the label will be absent and
# the annotation will render as "in cluster " (blank).
apiVersion: 1
groups:
- orgId: 1
name: Azure NetApp Files
folder: Posit Alerts
interval: 5m
rules:
- uid: azure_netapp_capacity_high
title: Azure NetApp Files Capacity High
condition: B
data:
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: mimir
model:
editorMode: code
expr: azure_microsoft_netapp_netappaccounts_capacitypools_volumes_volumeconsumedsizepercentage{job="integrations/azure"}
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
- refId: B
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 80
type: gt
operator:
type: and
query:
params:
- A
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: B
type: threshold
noDataState: Alerting # Storage exhaustion is latent; alert even when scraping stops so we don't silently miss a full volume
execErrState: Error
for: 10m
annotations:
description: Azure NetApp Files volume has more than 80% capacity utilization for more than 10 minutes. Note that on new cluster deployments where Azure Monitor scraping has not yet initialized, noDataState=Alerting may produce a false positive after the for:10m window; this is expected during provisioning.
summary: High capacity utilization on Azure NetApp Files volume {{$labels.resource}} in cluster {{$labels.cluster}}
labels:
opsgenie: "1"
isPaused: false
- uid: azure_netapp_read_latency_high
title: Azure NetApp Files Read Latency High
condition: B
data:
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: mimir
model:
editorMode: code
expr: azure_microsoft_netapp_netappaccounts_capacitypools_volumes_averagereadlatency{job="integrations/azure"}
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
- refId: B
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 10 # 10ms threshold; Azure NetApp Files typically has sub-millisecond latency
type: gt
operator:
type: and
query:
params:
- A
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: B
type: threshold
noDataState: NoData # Performance metric; silent suppression on scrape outage is acceptable
execErrState: Error
for: 10m
annotations:
description: Azure NetApp Files volume read latency is above 10ms for more than 10 minutes, indicating potential performance degradation.
summary: High read latency on Azure NetApp Files volume {{$labels.resource}} in cluster {{$labels.cluster}}
labels:
opsgenie: "1"
isPaused: false
- uid: azure_netapp_write_latency_high
title: Azure NetApp Files Write Latency High
condition: B
data:
- refId: A
relativeTimeRange:
from: 600
to: 0
datasourceUid: mimir
model:
editorMode: code
expr: azure_microsoft_netapp_netappaccounts_capacitypools_volumes_averagewritelatency{job="integrations/azure"}
instant: true
intervalMs: 1000
legendFormat: __auto
maxDataPoints: 43200
range: false
refId: A
- refId: B
relativeTimeRange:
from: 600
to: 0
datasourceUid: __expr__
model:
conditions:
- evaluator:
params:
- 10 # 10ms threshold; Azure NetApp Files typically has sub-millisecond latency
type: gt
operator:
type: and
query:
params:
- A
reducer:
params: []
type: last
type: query
datasource:
type: __expr__
uid: __expr__
expression: A
intervalMs: 1000
maxDataPoints: 43200
refId: B
type: threshold
noDataState: NoData # Performance metric; silent suppression on scrape outage is acceptable
execErrState: Error
for: 10m
annotations:
description: Azure NetApp Files volume write latency is above 10ms for more than 10 minutes, indicating potential performance degradation.
summary: High write latency on Azure NetApp Files volume {{$labels.resource}} in cluster {{$labels.cluster}}
labels:
opsgenie: "1"
isPaused: false
Loading
Loading