diff --git a/python-pulumi/src/ptd/azure_roles.py b/python-pulumi/src/ptd/azure_roles.py index 24ce971..c9e6870 100644 --- a/python-pulumi/src/ptd/azure_roles.py +++ b/python-pulumi/src/ptd/azure_roles.py @@ -2,6 +2,7 @@ ACR_PULL_ROLE_DEFINITION_ID = "7f951dda-4ed3-4680-a7ca-43fe172d538d" CONTRIBUTOR_ROLE_DEFINITION_ID = "b24988ac-6180-42a0-ab88-20f7382dd24c" DNS_ZONE_CONTRIBUTOR_ROLE_DEFINITION_ID = "befefa01-2a29-4197-83a8-272ff33ce314" +MONITORING_READER_ROLE_DEFINITION_ID = "43d0d8ad-25c7-4714-9337-8ba259a9fe05" NETWORK_CONTRIBUTOR_ROLE_DEFINITION_ID = "4d97b98b-1d4f-4787-a291-c67834d212e7" READER_ROLE_DEFINITION_ID = "acdd72a7-3385-48ef-bd42-f606fba81ae7" STORAGE_BLOB_DATA_CONTRIBUTOR_ROLE_DEFINITION_ID = "ba92f5b4-2d11-453d-a403-e96b0029c9fe" diff --git a/python-pulumi/src/ptd/grafana_alerts/azure_loadbalancer.yaml b/python-pulumi/src/ptd/grafana_alerts/azure_loadbalancer.yaml new file mode 100644 index 0000000..e5b9ed9 --- /dev/null +++ b/python-pulumi/src/ptd/grafana_alerts/azure_loadbalancer.yaml @@ -0,0 +1,193 @@ +# To delete these alerts, simply removing the configMap that uses this method will not work. +# Replace file contents with the following and apply in order to delete the alerts +# (repeat the deleteRules entry for each uid listed below): +# apiVersion: 1 +# deleteRules: +# - orgId: 1 +# uid: azure_lb_health_probe_down +# - orgId: 1 +# uid: azure_lb_data_path_down +# - orgId: 1 +# uid: azure_lb_snat_port_exhaustion +# +# See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/ +# +# Note: alert annotations reference {{$labels.cluster}}. For Azure Monitor-sourced metrics, +# this label is injected by the prometheus.relabel.default block in grafana_alloy.py. +# If Alloy is not running or relabeling is misconfigured, the label will be absent and +# the annotation will render as "in cluster " (blank). 
+apiVersion: 1 +groups: + - orgId: 1 + name: Azure Load Balancer + folder: Posit Alerts + interval: 5m + rules: + - uid: azure_lb_health_probe_down + title: Azure Load Balancer Health Probe Down + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: azure_microsoft_network_loadbalancers_dipavailability{job="integrations/azure"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 100 + type: lt + operator: + type: and + query: + params: + - A + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData + execErrState: Error + for: 5m + annotations: + description: Azure Load Balancer backend health probe availability is below 100% for over 5 minutes, indicating unhealthy backend instances. This is a critical issue that requires immediate attention. 
+ summary: Health probe down on Azure Load Balancer {{$labels.resource}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + isPaused: false + - uid: azure_lb_data_path_down + title: Azure Load Balancer Data Path Down + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: azure_microsoft_network_loadbalancers_vipavailability{job="integrations/azure"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 100 + type: lt + operator: + type: and + query: + params: + - A + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData + execErrState: Error + for: 5m + annotations: + description: Azure Load Balancer data path availability is below 100% for over 5 minutes, indicating the load balancer frontend is not responding to health probes. This is a critical issue that requires immediate attention. 
+ summary: Data path down on Azure Load Balancer {{$labels.resource}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + isPaused: false + - uid: azure_lb_snat_port_exhaustion + title: Azure Load Balancer SNAT Port Exhaustion + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: | + (azure_microsoft_network_loadbalancers_usedsnatports{job="integrations/azure"} + / + azure_microsoft_network_loadbalancers_allocatedsnatports{job="integrations/azure"}) * 100 + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 80 + type: gt + operator: + type: and + query: + params: + - A + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData + execErrState: Error + for: 5m + annotations: + description: Azure Load Balancer is using more than 80% of allocated SNAT ports for over 5 minutes. SNAT port exhaustion can cause outbound connection failures and may require increasing the number of backend instances or using a NAT Gateway. + summary: SNAT port exhaustion on Azure Load Balancer {{$labels.resource}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + isPaused: false diff --git a/python-pulumi/src/ptd/grafana_alerts/azure_netapp.yaml b/python-pulumi/src/ptd/grafana_alerts/azure_netapp.yaml new file mode 100644 index 0000000..da75dcc --- /dev/null +++ b/python-pulumi/src/ptd/grafana_alerts/azure_netapp.yaml @@ -0,0 +1,190 @@ +# To delete these alerts, simply removing the configMap that uses this method will not work. 
+# Replace file contents with the following and apply in order to delete the alerts +# (repeat the deleteRules entry for each uid listed below): +# apiVersion: 1 +# deleteRules: +# - orgId: 1 +# uid: azure_netapp_capacity_high +# - orgId: 1 +# uid: azure_netapp_read_latency_high +# - orgId: 1 +# uid: azure_netapp_write_latency_high +# +# See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/ +# +# Note: alert annotations reference {{$labels.cluster}}. For Azure Monitor-sourced metrics, +# this label is injected by the prometheus.relabel.default block in grafana_alloy.py. +# If Alloy is not running or relabeling is misconfigured, the label will be absent and +# the annotation will render as "in cluster " (blank). +apiVersion: 1 +groups: + - orgId: 1 + name: Azure NetApp Files + folder: Posit Alerts + interval: 5m + rules: + - uid: azure_netapp_capacity_high + title: Azure NetApp Files Capacity High + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: azure_microsoft_netapp_netappaccounts_capacitypools_volumes_volumeconsumedsizepercentage{job="integrations/azure"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 80 + type: gt + operator: + type: and + query: + params: + - A + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: Alerting # Storage exhaustion is latent; alert even when scraping stops so we don't silently miss a full volume + execErrState: Error + for: 10m + annotations: + description: Azure NetApp Files volume has more than 80% capacity utilization for more than 10 minutes. 
Note - on new cluster deployments where Azure Monitor scraping has not yet initialized, noDataState=Alerting may produce a false positive after the for:10m window; this is expected during provisioning. + summary: High capacity utilization on Azure NetApp Files volume {{$labels.resource}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + isPaused: false + - uid: azure_netapp_read_latency_high + title: Azure NetApp Files Read Latency High + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: azure_microsoft_netapp_netappaccounts_capacitypools_volumes_averagereadlatency{job="integrations/azure"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 10 # 10ms threshold; Azure NetApp Files typically has sub-millisecond latency + type: gt + operator: + type: and + query: + params: + - A + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData # Performance metric; silent suppression on scrape outage is acceptable + execErrState: Error + for: 10m + annotations: + description: Azure NetApp Files volume read latency is above 10ms for more than 10 minutes, indicating potential performance degradation. 
+ summary: High read latency on Azure NetApp Files volume {{$labels.resource}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + isPaused: false + - uid: azure_netapp_write_latency_high + title: Azure NetApp Files Write Latency High + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: azure_microsoft_netapp_netappaccounts_capacitypools_volumes_averagewritelatency{job="integrations/azure"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 10 # 10ms threshold; Azure NetApp Files typically has sub-millisecond latency + type: gt + operator: + type: and + query: + params: + - A + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData # Performance metric; silent suppression on scrape outage is acceptable + execErrState: Error + for: 10m + annotations: + description: Azure NetApp Files volume write latency is above 10ms for more than 10 minutes, indicating potential performance degradation. + summary: High write latency on Azure NetApp Files volume {{$labels.resource}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + isPaused: false diff --git a/python-pulumi/src/ptd/grafana_alerts/azure_postgres.yaml b/python-pulumi/src/ptd/grafana_alerts/azure_postgres.yaml new file mode 100644 index 0000000..61cce0b --- /dev/null +++ b/python-pulumi/src/ptd/grafana_alerts/azure_postgres.yaml @@ -0,0 +1,362 @@ +# To delete these alerts, simply removing the configMap that uses this method will not work. 
+# Replace file contents with the following and apply in order to delete the alerts +# (repeat the deleteRules entry for each uid listed below): +# apiVersion: 1 +# deleteRules: +# - orgId: 1 +# uid: azure_postgres_cpu_high +# - orgId: 1 +# uid: azure_postgres_storage_high +# - orgId: 1 +# uid: azure_postgres_memory_high +# - orgId: 1 +# uid: azure_postgres_connections_high +# - orgId: 1 +# uid: azure_postgres_failed_connections +# - orgId: 1 +# uid: azure_postgres_deadlocks +# +# See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/ +# +# Note: alert annotations reference {{$labels.cluster}}. For Azure Monitor-sourced metrics, +# this label is injected by the prometheus.relabel.default block in grafana_alloy.py. +# If Alloy is not running or relabeling is misconfigured, the label will be absent and +# the annotation will render as "in cluster " (blank). +apiVersion: 1 +groups: + - orgId: 1 + name: Azure PostgreSQL + folder: Posit Alerts + interval: 5m + rules: + - uid: azure_postgres_cpu_high + title: Azure PostgreSQL CPU High + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: azure_microsoft_dbforpostgresql_flexibleservers_cpu_percent{job="integrations/azure"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 80 + type: gt + operator: + type: and + query: + params: + - A + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData # Performance metric; silent suppression on scrape outage is acceptable + execErrState: Error + for: 10m + annotations: + description: Azure PostgreSQL Flexible Server 
CPU utilization is above 80% for more than 10 minutes. + summary: High CPU utilization on Azure PostgreSQL server {{$labels.resource}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + isPaused: false + - uid: azure_postgres_storage_high + title: Azure PostgreSQL Storage High + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: azure_microsoft_dbforpostgresql_flexibleservers_storage_percent{job="integrations/azure"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 80 + type: gt + operator: + type: and + query: + params: + - A + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: Alerting # Storage exhaustion is latent; alert even when scraping stops so we don't silently miss a full disk + execErrState: Error + for: 5m + annotations: + description: Azure PostgreSQL Flexible Server has more than 80% storage utilization for more than 5 minutes. Note - on new cluster deployments where Azure Monitor scraping has not yet initialized, noDataState=Alerting may produce a false positive after the for:5m window; this is expected during provisioning. 
+ summary: High storage utilization on Azure PostgreSQL server {{$labels.resource}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + isPaused: false + - uid: azure_postgres_memory_high + title: Azure PostgreSQL Memory High + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: azure_microsoft_dbforpostgresql_flexibleservers_memory_percent{job="integrations/azure"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 80 + type: gt + operator: + type: and + query: + params: + - A + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: Alerting # Memory exhaustion is latent; alert even when scraping stops so we don't silently miss an OOM condition + execErrState: Error + for: 10m + annotations: + description: Azure PostgreSQL Flexible Server has more than 80% memory utilization for more than 10 minutes. 
+ summary: High memory utilization on Azure PostgreSQL server {{$labels.resource}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + isPaused: false + - uid: azure_postgres_connections_high + title: Azure PostgreSQL Connections High + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: azure_microsoft_dbforpostgresql_flexibleservers_active_connections{job="integrations/azure"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 500 # Calibrated for mid-size SKUs (e.g., Standard_D4s_v3). Adjust per SKU; smaller SKUs may have lower connection limits. + type: gt + operator: + type: and + query: + params: + - A + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData # Performance metric; silent suppression on scrape outage is acceptable + execErrState: Error + for: 5m + annotations: + description: Azure PostgreSQL Flexible Server has more than 500 active database connections for more than 5 minutes. Note - this threshold is calibrated for mid-size SKUs; it may need adjustment for smaller or larger instance sizes. 
+ summary: High database connections on Azure PostgreSQL server {{$labels.resource}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + instance_size_dependent: "true" # Silence this label for known-small instance classes + isPaused: true # Paused until a Grafana silence or Alertmanager inhibit rule is configured for the instance_size_dependent label + - uid: azure_postgres_failed_connections + title: Azure PostgreSQL Failed Connections + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: azure_microsoft_dbforpostgresql_flexibleservers_connections_failed{job="integrations/azure"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 10 + type: gt + operator: + type: and + query: + params: + - A + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData + execErrState: Error + for: 5m + annotations: + description: Azure PostgreSQL Flexible Server has more than 10 failed connection attempts for over 5 minutes, indicating potential authentication or connectivity issues. 
+ summary: Failed connections on Azure PostgreSQL server {{$labels.resource}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + isPaused: false + - uid: azure_postgres_deadlocks + title: Azure PostgreSQL Deadlocks + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: azure_microsoft_dbforpostgresql_flexibleservers_deadlocks{job="integrations/azure"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + type: gt + operator: + type: and + query: + params: + - A + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData + execErrState: Error + for: 5m + annotations: + description: Azure PostgreSQL Flexible Server is experiencing deadlocks, indicating potential application-level locking issues that may require investigation. + summary: Deadlocks detected on Azure PostgreSQL server {{$labels.resource}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + isPaused: false diff --git a/python-pulumi/src/ptd/grafana_alerts/azure_storage.yaml b/python-pulumi/src/ptd/grafana_alerts/azure_storage.yaml new file mode 100644 index 0000000..d2e9381 --- /dev/null +++ b/python-pulumi/src/ptd/grafana_alerts/azure_storage.yaml @@ -0,0 +1,133 @@ +# To delete these alerts, simply removing the configMap that uses this method will not work. 
+# Replace file contents with the following and apply in order to delete the alerts +# (repeat the deleteRules entry for each uid listed below): +# apiVersion: 1 +# deleteRules: +# - orgId: 1 +# uid: azure_storage_availability_low +# - orgId: 1 +# uid: azure_storage_latency_high +# +# See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/ +# +# Note: alert annotations reference {{$labels.cluster}}. For Azure Monitor-sourced metrics, +# this label is injected by the prometheus.relabel.default block in grafana_alloy.py. +# If Alloy is not running or relabeling is misconfigured, the label will be absent and +# the annotation will render as "in cluster " (blank). +apiVersion: 1 +groups: + - orgId: 1 + name: Azure Storage + folder: Posit Alerts + interval: 5m + rules: + - uid: azure_storage_availability_low + title: Azure Storage Availability Low + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: azure_microsoft_storage_storageaccounts_availability{job="integrations/azure"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 99.9 + type: lt + operator: + type: and + query: + params: + - A + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData + execErrState: Error + for: 5m + annotations: + description: Azure Storage Account availability is below 99.9% for over 5 minutes, indicating potential service degradation or outages. 
+ summary: Low availability on Azure Storage Account {{$labels.resource}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + isPaused: false + - uid: azure_storage_latency_high + title: Azure Storage Latency High + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: azure_microsoft_storage_storageaccounts_successe2elatency{job="integrations/azure"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 1000 # 1000ms = 1 second + type: gt + operator: + type: and + query: + params: + - A + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData # Performance metric; silent suppression on scrape outage is acceptable + execErrState: Error + for: 10m + annotations: + description: Azure Storage Account end-to-end latency is above 1000ms for more than 10 minutes, indicating performance degradation. 
+ summary: High latency on Azure Storage Account {{$labels.resource}} in cluster {{$labels.cluster}} + labels: + opsgenie: "1" + isPaused: false diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py index 7d97f0c..b1338fb 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py @@ -2500,7 +2500,7 @@ def _create_alert_configmaps(self, ns: k8s.core.v1.Namespace): alerts_dir = ptd.paths.alerts() for alert_file in sorted(alerts_dir.glob("*.yaml")): - alert_name = alert_file.stem + alert_name = alert_file.stem.replace("_", "-") with open(alert_file) as f: alert_yaml = f.read() diff --git a/python-pulumi/src/ptd/pulumi_resources/azure_workload_helm.py b/python-pulumi/src/ptd/pulumi_resources/azure_workload_helm.py index df6fab8..6660454 100644 --- a/python-pulumi/src/ptd/pulumi_resources/azure_workload_helm.py +++ b/python-pulumi/src/ptd/pulumi_resources/azure_workload_helm.py @@ -9,6 +9,7 @@ import ptd.azure_roles import ptd.azure_workload +import ptd.paths from ptd import azure_sdk from ptd.pulumi_resources.grafana_alloy import AlloyConfig @@ -280,6 +281,8 @@ def _define_mimir(self, release: str, version: str): ) def _define_alloy(self, release: str, version: str): + alloy_identity = self._define_alloy_monitoring_identity(release) + namespace = kubernetes.core.v1.Namespace( f"{self.workload.compound_name}-{release}-alloy-ns", metadata=kubernetes.meta.v1.ObjectMetaArgs( @@ -325,58 +328,69 @@ def _define_alloy(self, release: str, version: str): "chart": "alloy", "targetNamespace": ALLOY_NAMESPACE, "version": version, - "valuesContent": yaml.dump( - { - "serviceAccount": { - "create": True, - "name": str(ptd.Roles.ALLOY), - }, - "controller": { - "volumes": { - "extra": [ - { - "name": "mimir-auth", - "secret": { - "secretName": "mimir-auth", - "items": [ - { - "key": "password", - "path": "password", - } - ], - }, - } 
- ] - } - }, - "alloy": { - "clustering": {"enabled": True}, - "extraPorts": [ - { - "name": "faro", - "port": 12347, - "targetPort": 12347, - "protocol": "TCP", - } - ], - "mounts": { - "extra": [ + "valuesContent": alloy_identity.client_id.apply( + lambda client_id: yaml.dump( + { + "serviceAccount": { + "create": True, + "name": str(ptd.Roles.ALLOY), + "annotations": { + "azure.workload.identity/client-id": client_id, + }, + "labels": { + "azure.workload.identity/use": "true", + }, + }, + "controller": { + "podLabels": { + "azure.workload.identity/use": "true", + }, + "volumes": { + "extra": [ + { + "name": "mimir-auth", + "secret": { + "secretName": "mimir-auth", + "items": [ + { + "key": "password", + "path": "password", + } + ], + }, + } + ] + }, + }, + "alloy": { + "clustering": {"enabled": True}, + "extraPorts": [ { - "name": "mimir-auth", - "mountPath": "/etc/mimir/", - "readOnly": True, + "name": "faro", + "port": 12347, + "targetPort": 12347, + "protocol": "TCP", } ], - "varlog": True, + "mounts": { + "extra": [ + { + "name": "mimir-auth", + "mountPath": "/etc/mimir/", + "readOnly": True, + } + ], + "varlog": True, + }, + "configMap": {"create": False, "name": "alloy-config", "key": "config.alloy"}, }, - "configMap": {"create": False, "name": "alloy-config", "key": "config.alloy"}, - }, - "ingress": { - "enabled": True, - "faroPort": 12347, - "hosts": [f"faro.{self.workload.cfg.domain}"], - }, - } + "ingress": { + "enabled": True, + "faroPort": 12347, + "hosts": [f"faro.{self.workload.cfg.domain}"], + }, + } + ) ), }, opts=pulumi.ResourceOptions(depends_on=[namespace], provider=self.kube_providers[release]), @@ -416,6 +430,38 @@ def _define_blob_storage_managed_identity( return identity + def _define_alloy_monitoring_identity(self, release: str) -> azure.managedidentity.UserAssignedIdentity: + identity = azure.managedidentity.UserAssignedIdentity( + resource_name=f"id-{self.workload.compound_name}-{release}-alloy", + 
resource_group_name=self.workload.resource_group_name, + location=self.workload.cfg.region, + tags=self.workload.required_tags, + opts=pulumi.ResourceOptions(parent=self), + ) + + azure.authorization.RoleAssignment( + f"{self.workload.compound_name}-{release}-alloy-monitoring-reader", + scope=f"/subscriptions/{self.workload.cfg.subscription_id}/resourceGroups/{self.workload.resource_group_name}", + principal_id=identity.principal_id, + role_definition_id=f"/providers/Microsoft.Authorization/roleDefinitions/{ptd.azure_roles.MONITORING_READER_ROLE_DEFINITION_ID}", + principal_type=azure.authorization.PrincipalType.SERVICE_PRINCIPAL, + opts=pulumi.ResourceOptions(parent=identity), + ) + + oidc_issuer_url = self.workload.cluster_oidc_issuer_url(release) + azure.managedidentity.FederatedIdentityCredential( + resource_name=f"fedid-{self.workload.compound_name}-{release}-alloy", + resource_name_=identity.name, + federated_identity_credential_resource_name=f"fedid-{self.workload.compound_name}-{release}-alloy", + resource_group_name=self.workload.resource_group_name, + subject=f"system:serviceaccount:{ALLOY_NAMESPACE}:{ptd.Roles.ALLOY!s}", + issuer=oidc_issuer_url, + audiences=["api://AzureADTokenExchange"], + opts=pulumi.ResourceOptions(parent=identity), + ) + + return identity + def _define_external_dns(self, release: str, version: str): identity = azure.managedidentity.UserAssignedIdentity( resource_name=f"id-{self.workload.compound_name}-{release}-external-dns", @@ -631,11 +677,9 @@ def _define_grafana_secret(self) -> None: db_url = pulumi.Output.secret(f"postgres://{role}:{pw}@{fqdn}/{database}") - kubernetes.core.v1.Namespace( + ns = kubernetes.core.v1.Namespace( f"{self.workload.compound_name}-{release}-grafana-ns", - metadata={ - "name": "grafana", - }, + metadata={"name": GRAFANA_NAMESPACE}, opts=pulumi.ResourceOptions(parent=self, providers=[self.kube_providers[release]]), ) @@ -646,7 +690,7 @@ def _define_grafana_secret(self) -> None: "namespace": 
GRAFANA_NAMESPACE, }, data={"PTD_DATABASE_URL": db_url.apply(lambda url: base64.b64encode(url.encode()).decode())}, - opts=pulumi.ResourceOptions(parent=self, providers=[self.kube_providers[release]]), + opts=pulumi.ResourceOptions(parent=self, providers=[self.kube_providers[release]], depends_on=[ns]), ) def _define_kube_state_metrics(self, release: str, version: str): diff --git a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py index fa9235c..5891f95 100644 --- a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py +++ b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py @@ -222,6 +222,99 @@ def _define_blackbox_config() -> str: many_spaces = re.compile(r"\s+") return many_spaces.sub(" ", cfg).strip() + def _define_azure_monitor_config(self) -> str: + """Generate Azure Monitor exporter configuration for Azure. Returns empty string for non-Azure.""" + if self.cloud_provider != "azure": + return "" + + subscription_id = self.workload.cfg.subscription_id + resource_group_name = self.workload.resource_group_name + + # Base exporters that are always included + config = textwrap.dedent(f""" + prometheus.exporter.azure "postgres" {{ + subscriptions = ["{subscription_id}"] + resource_type = "Microsoft.DBforPostgreSQL/flexibleServers" + resource_graph_query_filter = "where resourceGroup == '{resource_group_name}'" + metrics = ["cpu_percent", "memory_percent", "storage_percent", "active_connections", "connections_failed", "deadlocks"] + included_dimensions = ["*"] + }} + + prometheus.scrape "azure_postgres" {{ + targets = prometheus.exporter.azure.postgres.targets + forward_to = [prometheus.relabel.default.receiver] + clustering {{ + enabled = true + }} + }} + + prometheus.exporter.azure "netapp" {{ + subscriptions = ["{subscription_id}"] + resource_type = "Microsoft.NetApp/netAppAccounts/capacityPools/volumes" + resource_graph_query_filter = "where resourceGroup == '{resource_group_name}'" + metrics 
= ["VolumeConsumedSizePercentage", "VolumeLogicalSize", "AverageReadLatency", "AverageWriteLatency", "ReadIops", "WriteIops"] + }} + + prometheus.scrape "azure_netapp" {{ + targets = prometheus.exporter.azure.netapp.targets + forward_to = [prometheus.relabel.default.receiver] + clustering {{ + enabled = true + }} + }} + + prometheus.exporter.azure "loadbalancer" {{ + subscriptions = ["{subscription_id}"] + resource_type = "Microsoft.Network/loadBalancers" + resource_graph_query_filter = "where resourceGroup == '{resource_group_name}'" + metrics = ["DipAvailability", "VipAvailability", "UsedSnatPorts", "AllocatedSnatPorts", "SnatConnectionCount"] + }} + + prometheus.scrape "azure_loadbalancer" {{ + targets = prometheus.exporter.azure.loadbalancer.targets + forward_to = [prometheus.relabel.default.receiver] + clustering {{ + enabled = true + }} + }} + + prometheus.exporter.azure "storage" {{ + subscriptions = ["{subscription_id}"] + resource_type = "Microsoft.Storage/storageAccounts" + resource_graph_query_filter = "where resourceGroup == '{resource_group_name}'" + metrics = ["Availability", "SuccessE2ELatency", "UsedCapacity", "Transactions"] + }} + + prometheus.scrape "azure_storage" {{ + targets = prometheus.exporter.azure.storage.targets + forward_to = [prometheus.relabel.default.receiver] + clustering {{ + enabled = true + }} + }} + """) + + # Conditionally add NAT Gateway exporter if public_subnet_cidr is configured + if isinstance(self.workload, ptd.azure_workload.AzureWorkload) and self.workload.cfg.network.public_subnet_cidr: + config += textwrap.dedent(f""" + prometheus.exporter.azure "natgateway" {{ + subscriptions = ["{subscription_id}"] + resource_type = "Microsoft.Network/natGateways" + resource_graph_query_filter = "where resourceGroup == '{resource_group_name}'" + metrics = ["PacketCount", "ByteCount", "DroppedPackets", "TotalConnectionCount", "SNATConnectionCount"] + }} + + prometheus.scrape "azure_natgateway" {{ + targets = 
prometheus.exporter.azure.natgateway.targets + forward_to = [prometheus.relabel.default.receiver] + clustering {{ + enabled = true + }} + }} + """) + + return config + def _define_cloudwatch_config(self) -> str: """Generate CloudWatch exporter configuration for AWS. Returns empty string for non-AWS.""" if self.cloud_provider != "aws": @@ -442,6 +535,9 @@ def _define_config_map( # Generate CloudWatch exporter configuration for AWS cloudwatch_config = self._define_cloudwatch_config() + # Generate Azure Monitor exporter configuration for Azure + azure_monitor_config = self._define_azure_monitor_config() + # Generate system log scraping configuration system_logs_config = "" if self.should_scrape_system_logs: @@ -578,6 +674,8 @@ def _define_config_map( {cloudwatch_config} + {azure_monitor_config} + prometheus.relabel "default" {{ forward_to = [ prometheus.remote_write.control_room.receiver, diff --git a/python-pulumi/tests/test_azure_alert_files.py b/python-pulumi/tests/test_azure_alert_files.py new file mode 100644 index 0000000..e5122f8 --- /dev/null +++ b/python-pulumi/tests/test_azure_alert_files.py @@ -0,0 +1,108 @@ +"""Tests for Azure Monitor alert YAML files. + +Note: These tests verify that Azure alert YAML files exist and have basic structure. +The YAML files use Grafana's alert provisioning format which may not be fully compatible +with strict yaml.safe_load() parsing (e.g., descriptions with colons may not be quoted). 
+""" + +import pathlib + + +class TestAzureAlertFiles: + """Tests for Azure Monitor alert YAML files.""" + + def test_azure_postgres_yaml_exists(self) -> None: + """Test that azure_postgres.yaml exists.""" + yaml_path = pathlib.Path(__file__).parent.parent / "src" / "ptd" / "grafana_alerts" / "azure_postgres.yaml" + assert yaml_path.exists(), f"azure_postgres.yaml not found at {yaml_path}" + + # Verify basic structure + text = yaml_path.read_text() + assert "apiVersion: 1" in text + assert "groups:" in text + assert "Azure PostgreSQL" in text + + def test_azure_netapp_yaml_exists(self) -> None: + """Test that azure_netapp.yaml exists.""" + yaml_path = pathlib.Path(__file__).parent.parent / "src" / "ptd" / "grafana_alerts" / "azure_netapp.yaml" + assert yaml_path.exists(), f"azure_netapp.yaml not found at {yaml_path}" + + # Verify basic structure + text = yaml_path.read_text() + assert "apiVersion: 1" in text + assert "groups:" in text + + def test_azure_loadbalancer_yaml_exists(self) -> None: + """Test that azure_loadbalancer.yaml exists.""" + yaml_path = pathlib.Path(__file__).parent.parent / "src" / "ptd" / "grafana_alerts" / "azure_loadbalancer.yaml" + assert yaml_path.exists(), f"azure_loadbalancer.yaml not found at {yaml_path}" + + # Verify basic structure + text = yaml_path.read_text() + assert "apiVersion: 1" in text + assert "groups:" in text + + def test_azure_storage_yaml_exists(self) -> None: + """Test that azure_storage.yaml exists.""" + yaml_path = pathlib.Path(__file__).parent.parent / "src" / "ptd" / "grafana_alerts" / "azure_storage.yaml" + assert yaml_path.exists(), f"azure_storage.yaml not found at {yaml_path}" + + # Verify basic structure + text = yaml_path.read_text() + assert "apiVersion: 1" in text + assert "groups:" in text + + def test_azure_alert_files_have_grafana_structure(self) -> None: + """Test that all Azure alert files have Grafana provisioning structure.""" + yaml_files = [ + "azure_postgres.yaml", + "azure_netapp.yaml", + 
"azure_loadbalancer.yaml", + "azure_storage.yaml", + ] + + for yaml_file in yaml_files: + yaml_path = pathlib.Path(__file__).parent.parent / "src" / "ptd" / "grafana_alerts" / yaml_file + text = yaml_path.read_text() + + # Verify Grafana alert provisioning structure + assert "apiVersion: 1" in text, f"{yaml_file} missing apiVersion" + assert "groups:" in text, f"{yaml_file} missing groups" + assert "rules:" in text, f"{yaml_file} missing rules" + assert "uid:" in text, f"{yaml_file} missing rule UIDs" + assert "datasourceUid: mimir" in text, f"{yaml_file} not using mimir datasource" + assert "opsgenie:" in text, f"{yaml_file} missing opsgenie labels" + + def test_azure_postgres_has_expected_alerts(self) -> None: + """Test that azure_postgres.yaml contains expected alert rules.""" + yaml_path = pathlib.Path(__file__).parent.parent / "src" / "ptd" / "grafana_alerts" / "azure_postgres.yaml" + text = yaml_path.read_text() + + # Verify expected PostgreSQL alert rules exist + expected_rules = [ + "azure_postgres_cpu_high", + "azure_postgres_storage_high", + "azure_postgres_memory_high", + "azure_postgres_connections_high", + "azure_postgres_failed_connections", + "azure_postgres_deadlocks", + ] + + for expected_rule in expected_rules: + assert expected_rule in text, f"Expected rule {expected_rule} not found in azure_postgres.yaml" + + def test_azure_alerts_query_azure_monitor_metrics(self) -> None: + """Test that Azure alerts query Azure Monitor metrics.""" + yaml_files = { + "azure_postgres.yaml": "azure_microsoft_dbforpostgresql_flexibleservers", + "azure_netapp.yaml": "azure_microsoft_netapp_netappaccounts", + "azure_loadbalancer.yaml": "azure_microsoft_network_loadbalancers", + "azure_storage.yaml": "azure_microsoft_storage_storageaccounts", + } + + for yaml_file, expected_metric_prefix in yaml_files.items(): + yaml_path = pathlib.Path(__file__).parent.parent / "src" / "ptd" / "grafana_alerts" / yaml_file + text = yaml_path.read_text() + + # Verify the file queries 
Azure Monitor metrics (lowercased resource type) + assert expected_metric_prefix in text, f"{yaml_file} should query {expected_metric_prefix} metrics" diff --git a/python-pulumi/tests/test_azure_alloy_identity.py b/python-pulumi/tests/test_azure_alloy_identity.py new file mode 100644 index 0000000..92f23be --- /dev/null +++ b/python-pulumi/tests/test_azure_alloy_identity.py @@ -0,0 +1,37 @@ +"""Tests for Azure Alloy monitoring identity creation.""" + +import inspect + +import ptd.pulumi_resources.azure_workload_helm + + +class TestAzureAlloyMonitoringIdentity: + """Tests for the _define_alloy_monitoring_identity method on AzureWorkloadHelm.""" + + def test_define_alloy_monitoring_identity_method_exists(self) -> None: + """Test that AzureWorkloadHelm has _define_alloy_monitoring_identity method.""" + assert hasattr( + ptd.pulumi_resources.azure_workload_helm.AzureWorkloadHelm, + "_define_alloy_monitoring_identity", + ), "AzureWorkloadHelm should have _define_alloy_monitoring_identity method" + + def test_define_alloy_monitoring_identity_is_callable(self) -> None: + """Test that _define_alloy_monitoring_identity is callable.""" + method = ptd.pulumi_resources.azure_workload_helm.AzureWorkloadHelm._define_alloy_monitoring_identity # noqa: SLF001 + assert callable(method), "_define_alloy_monitoring_identity should be callable" + + def test_define_alloy_monitoring_identity_signature(self) -> None: + """Test that _define_alloy_monitoring_identity has expected signature.""" + method = ptd.pulumi_resources.azure_workload_helm.AzureWorkloadHelm._define_alloy_monitoring_identity # noqa: SLF001 + sig = inspect.signature(method) + + # Verify it takes self and release parameters + params = list(sig.parameters.keys()) + assert "self" in params, "_define_alloy_monitoring_identity should have self parameter" + assert "release" in params, "_define_alloy_monitoring_identity should have release parameter" + + # Verify release parameter is a string + release_param = 
sig.parameters["release"] + assert release_param.annotation in {str, inspect.Parameter.empty}, ( + "_define_alloy_monitoring_identity release parameter should be a string" + ) diff --git a/python-pulumi/tests/test_grafana_alloy.py b/python-pulumi/tests/test_grafana_alloy.py index 2b28fc3..d36dff2 100644 --- a/python-pulumi/tests/test_grafana_alloy.py +++ b/python-pulumi/tests/test_grafana_alloy.py @@ -641,3 +641,168 @@ def test_invalid_compound_name_raises_value_error(self) -> None: alloy = _make_alloy_for_cloudwatch("aws", compound_name="bad{name}") with pytest.raises(ValueError, match="unsafe for Alloy River config"): alloy._define_cloudwatch_config() # noqa: SLF001 + + +def _make_alloy_for_azure_monitor( + subscription_id: str = "test-subscription-id", + resource_group_name: str = "test-rg", + public_subnet_cidr: str | None = None, +) -> AlloyConfig: + """Helper to create an AlloyConfig instance with mocked Azure workload attributes.""" + import ptd.azure_workload + + alloy = AlloyConfig.__new__(AlloyConfig) + + # Create mock Azure workload - use spec to ensure isinstance checks work + mock_workload = Mock(spec=ptd.azure_workload.AzureWorkload) + mock_cfg = Mock() + mock_cfg.subscription_id = subscription_id + mock_network = Mock() + mock_network.public_subnet_cidr = public_subnet_cidr + mock_cfg.network = mock_network + mock_workload.cfg = mock_cfg + mock_workload.resource_group_name = resource_group_name + + # Mock cloud_provider + mock_cloud_provider = Mock() + mock_cloud_provider.name = "Azure" + mock_workload.cloud_provider = mock_cloud_provider + + alloy.workload = mock_workload + alloy.cloud_provider = "azure" + alloy.region = "eastus" + + return alloy + + +class TestDefineAzureMonitorConfig: + """Tests for _define_azure_monitor_config method.""" + + def test_azure_contains_postgres_exporter(self) -> None: + alloy = _make_alloy_for_azure_monitor() + result = alloy._define_azure_monitor_config() # noqa: SLF001 + assert 'prometheus.exporter.azure 
"postgres"' in result + assert "Microsoft.DBforPostgreSQL/flexibleServers" in result + + def test_azure_contains_netapp_exporter(self) -> None: + alloy = _make_alloy_for_azure_monitor() + result = alloy._define_azure_monitor_config() # noqa: SLF001 + assert 'prometheus.exporter.azure "netapp"' in result + assert "Microsoft.NetApp/netAppAccounts/capacityPools/volumes" in result + + def test_azure_contains_loadbalancer_exporter(self) -> None: + alloy = _make_alloy_for_azure_monitor() + result = alloy._define_azure_monitor_config() # noqa: SLF001 + assert 'prometheus.exporter.azure "loadbalancer"' in result + assert "Microsoft.Network/loadBalancers" in result + + def test_azure_contains_storage_exporter(self) -> None: + alloy = _make_alloy_for_azure_monitor() + result = alloy._define_azure_monitor_config() # noqa: SLF001 + assert 'prometheus.exporter.azure "storage"' in result + assert "Microsoft.Storage/storageAccounts" in result + + def test_azure_subscription_id_interpolated(self) -> None: + alloy = _make_alloy_for_azure_monitor(subscription_id="custom-subscription-id") + result = alloy._define_azure_monitor_config() # noqa: SLF001 + assert 'subscriptions = ["custom-subscription-id"]' in result + + def test_azure_resource_group_name_interpolated(self) -> None: + alloy = _make_alloy_for_azure_monitor(resource_group_name="custom-rg-name") + result = alloy._define_azure_monitor_config() # noqa: SLF001 + assert "where resourceGroup == 'custom-rg-name'" in result + + def test_azure_contains_all_postgres_metrics(self) -> None: + alloy = _make_alloy_for_azure_monitor() + result = alloy._define_azure_monitor_config() # noqa: SLF001 + postgres_metrics = [ + "cpu_percent", + "memory_percent", + "storage_percent", + "active_connections", + "connections_failed", + "deadlocks", + ] + for metric in postgres_metrics: + assert metric in result + + def test_azure_contains_all_netapp_metrics(self) -> None: + alloy = _make_alloy_for_azure_monitor() + result = 
alloy._define_azure_monitor_config() # noqa: SLF001 + netapp_metrics = [ + "VolumeConsumedSizePercentage", + "VolumeLogicalSize", + "AverageReadLatency", + "AverageWriteLatency", + "ReadIops", + "WriteIops", + ] + for metric in netapp_metrics: + assert metric in result + + def test_azure_contains_all_loadbalancer_metrics(self) -> None: + alloy = _make_alloy_for_azure_monitor() + result = alloy._define_azure_monitor_config() # noqa: SLF001 + lb_metrics = [ + "DipAvailability", + "VipAvailability", + "UsedSnatPorts", + "AllocatedSnatPorts", + "SnatConnectionCount", + ] + for metric in lb_metrics: + assert metric in result + + def test_azure_contains_all_storage_metrics(self) -> None: + alloy = _make_alloy_for_azure_monitor() + result = alloy._define_azure_monitor_config() # noqa: SLF001 + storage_metrics = ["Availability", "SuccessE2ELatency", "UsedCapacity", "Transactions"] + for metric in storage_metrics: + assert metric in result + + def test_azure_includes_scrape_blocks(self) -> None: + alloy = _make_alloy_for_azure_monitor() + result = alloy._define_azure_monitor_config() # noqa: SLF001 + # Each exporter should have a corresponding scrape block + assert 'prometheus.scrape "azure_postgres"' in result + assert 'prometheus.scrape "azure_netapp"' in result + assert 'prometheus.scrape "azure_loadbalancer"' in result + assert 'prometheus.scrape "azure_storage"' in result + + def test_azure_scrape_blocks_forward_to_relabel(self) -> None: + alloy = _make_alloy_for_azure_monitor() + result = alloy._define_azure_monitor_config() # noqa: SLF001 + # All scrape blocks should forward to the default relabel receiver + assert result.count("forward_to = [prometheus.relabel.default.receiver]") >= 4 + + def test_azure_scrape_blocks_enable_clustering(self) -> None: + alloy = _make_alloy_for_azure_monitor() + result = alloy._define_azure_monitor_config() # noqa: SLF001 + # All scrape blocks should enable clustering + assert result.count("enabled = true") >= 4 + + def 
test_azure_natgateway_included_when_public_subnet_configured(self) -> None: + alloy = _make_alloy_for_azure_monitor(public_subnet_cidr="10.0.100.0/24") + result = alloy._define_azure_monitor_config() # noqa: SLF001 + assert 'prometheus.exporter.azure "natgateway"' in result + assert "Microsoft.Network/natGateways" in result + assert 'prometheus.scrape "azure_natgateway"' in result + + def test_azure_natgateway_excluded_when_no_public_subnet(self) -> None: + alloy = _make_alloy_for_azure_monitor(public_subnet_cidr=None) + result = alloy._define_azure_monitor_config() # noqa: SLF001 + assert 'prometheus.exporter.azure "natgateway"' not in result + assert "Microsoft.Network/natGateways" not in result + assert 'prometheus.scrape "azure_natgateway"' not in result + + def test_aws_workload_returns_empty_string(self) -> None: + # Create an AWS workload instead of Azure + alloy = _make_alloy_for_cloudwatch("aws") + result = alloy._define_azure_monitor_config() # noqa: SLF001 + assert result == "" + + def test_azure_config_not_empty_for_azure_workload(self) -> None: + alloy = _make_alloy_for_azure_monitor() + result = alloy._define_azure_monitor_config() # noqa: SLF001 + assert result != "" + assert len(result) > 100 # Should be a substantial config block