From 0aaea2f7af8424541867b60656b2c34981de42d3 Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Wed, 25 Feb 2026 11:53:06 -0800 Subject: [PATCH 01/19] docs: update monitoring doc with alert format --- docs/guides/monitoring.md | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/docs/guides/monitoring.md b/docs/guides/monitoring.md index a320a53..f38ecee 100644 --- a/docs/guides/monitoring.md +++ b/docs/guides/monitoring.md @@ -532,6 +532,42 @@ PTD deploys a set of Grafana alerts to the control room for centralized monitori All alerts are configured to send notifications to OpsGenie when triggered. +### Alert Format + +Alerts use a standardized format for consistency across all alert types: + +``` +[🔴 CRITICAL | 🟡 WARNING]: [Title] + +[Description] + +─── WHERE ─────────────────────────── +Tenant: [tenant name] (Note: The organization or group that a workload cluster is provisioned for) +Cluster: [cluster name] +Component: [affected component] + +─── DETAILS ───────────────────────── +[Key]: [Value] +[Key]: [Value] +... + +📖 [runbook link] +📊 [dashboard link] +``` + +**Severity levels:** +- 🔴 **CRITICAL** — Immediate action required +- 🟡 **WARNING** — Investigate soon + +**Alert types and their WHERE/DETAILS fields:** + +| Type | WHERE | DETAILS | +|------|-------|---------| +| Health Check | Tenant, Cluster, Product | Endpoint, Status, Response Time, Down Since | +| Kubernetes | Tenant, Cluster, Namespace, Pod/Node | Varies by alert (restarts, replicas, conditions) | +| Cloud (AWS) | Tenant, Cluster, Resource, Region | Metric, Current, Threshold, Duration | +| Cloud (Azure) | Tenant, Cluster, Resource, Location | Metric, Current, Threshold, Duration | + ### Application Alerts | Alert | Threshold | Duration | Description | From 8d8b8a0bfe8b0cca1000c4f48cc6e8707093795f Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Wed, 25 Feb 2026 15:10:30 -0800 Subject: [PATCH 02/19] adding new field for friendly tenant name --- python-pulumi/src/ptd/__init__.py | 1 + python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/python-pulumi/src/ptd/__init__.py b/python-pulumi/src/ptd/__init__.py index 2ab3dda..325ebaf 100644 --- a/python-pulumi/src/ptd/__init__.py +++ b/python-pulumi/src/ptd/__init__.py @@ -315,6 +315,7 @@ class WorkloadConfig: network_trust: NetworkTrust sites: typing.Mapping[str, SiteConfig] true_name: str + tenant_name: str | None = dataclasses.field(default=None, kw_only=True) # Human-readable name for the tenant @property def domain(self) -> str: diff --git a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py index 4a75bec..1cb3ec5 100644 --- a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py +++ b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py @@ -220,6 +220,9 @@ def _define_config_map( account_id = self.workload.cfg.account_id cluster_name = self.workload.eks_cluster_name(self.release) + # Use tenant_name if set, otherwise fall back to compound_name + tenant_name = self.workload.cfg.tenant_name or self.workload.compound_name + # Generate CloudWatch exporter configuration for AWS cloudwatch_config = "" if self.cloud_provider == "aws": @@ -635,6 +638,7 @@ def _define_config_map( external_labels = {{ data = "true", + tenant_name = "{tenant_name}", }} }} """ From 4f166f61d21702d2c436dd36790e12b9d8285be7 Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Wed, 25 Feb 2026 15:11:19 -0800 Subject: [PATCH 03/19] updating format for existing k8s alerts --- .../src/ptd/grafana_alerts/nodes.yaml | 51 ++++++++- .../src/ptd/grafana_alerts/pods.yaml | 103 ++++++++++++++++-- 2 files changed, 138 insertions(+), 16 deletions(-) diff --git a/python-pulumi/src/ptd/grafana_alerts/nodes.yaml b/python-pulumi/src/ptd/grafana_alerts/nodes.yaml index dde33dc..697bdf8 100644 --- a/python-pulumi/src/ptd/grafana_alerts/nodes.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/nodes.yaml @@ -55,8 +55,21 @@ groups: execErrState: Error for: 15m annotations: - summary: Node not in ready state (instance {{ $labels.instance }}) - description: "Node {{ $labels.node }} in cluster {{ $labels.cluster }} has been unready for 15 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: "🔴 CRITICAL: Node Not Ready" + description: | + Kubernetes node is not accepting workloads + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Node: {{ $labels.node }} + + ─── DETAILS ───────────────────────── + Condition: NotReady + Duration: 15 minutes + + 📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/node-not-ready.md + 📊 https://grafana.example.com/d/nodes?var-cluster={{ $labels.cluster }} labels: opsgenie: "1" isPaused: false @@ -110,8 +123,21 @@ groups: execErrState: Error for: 15m annotations: - summary: Kubernetes Node memory pressure (instance {{ $labels.instance }}) - description: "Node {{ $labels.node }} in cluster {{ $labels.cluster }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: "🟡 WARNING: Node Memory Pressure" + description: | + Node is running low on available memory + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Node: {{ $labels.node }} + + ─── DETAILS ───────────────────────── + Condition: MemoryPressure + Duration: 15 minutes + + 📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/node-memory-pressure.md + 📊 https://grafana.example.com/d/nodes?var-cluster={{ $labels.cluster }} labels: opsgenie: "1" isPaused: false @@ -165,8 +191,21 @@ groups: execErrState: Error for: 15m annotations: - summary: Kubernetes Node disk pressure (instance {{ $labels.instance }}) - description: "Node {{ $labels.node }} in cluster {{ $labels.cluster }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: "🟡 WARNING: Node Disk Pressure" + description: | + Node is running low on available disk space + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Node: {{ $labels.node }} + + ─── DETAILS ───────────────────────── + Condition: DiskPressure + Duration: 15 minutes + + 📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/node-disk-pressure.md + 📊 https://grafana.example.com/d/nodes?var-cluster={{ $labels.cluster }} labels: opsgenie: "1" isPaused: false diff --git a/python-pulumi/src/ptd/grafana_alerts/pods.yaml b/python-pulumi/src/ptd/grafana_alerts/pods.yaml index 1422ab7..2efff1c 100644 --- a/python-pulumi/src/ptd/grafana_alerts/pods.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/pods.yaml @@ -96,7 +96,21 @@ groups: execErrState: Error for: 5m annotations: - summary: Container {{ $labels.container }} in {{ $labels.cluster }} is in CrashLoopBackoff state. \n VALUE = {{ $value }}\n LABELS = {{ $labels }} + summary: "🔴 CRITICAL: Container Crash-Looping" + description: | + Container keeps crashing and restarting + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Container: {{ $labels.container }} + + ─── DETAILS ───────────────────────── + Status: CrashLoopBackOff + Duration: 5 minutes + + 📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/crash-loop-backoff.md + 📊 https://grafana.example.com/d/pods?var-cluster={{ $labels.cluster }} labels: opsgenie: "1" isPaused: false @@ -187,7 +201,21 @@ groups: execErrState: Error for: 5m annotations: - summary: Pod {{ $labels.pod }} in cluster {{ $labels.cluster }} is in a {{ $labels.reason }} state. + summary: "🟡 WARNING: Pod Error" + description: | + Pod container terminated with an error + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Pod: {{ $labels.pod }} + + ─── DETAILS ───────────────────────── + Reason: {{ $labels.reason }} + Duration: 5 minutes + + 📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/pod-error.md + 📊 https://grafana.example.com/d/pods?var-cluster={{ $labels.cluster }} labels: opsgenie: "1" isPaused: false @@ -274,8 +302,21 @@ groups: execErrState: Error for: 15m annotations: - summary: Workload Cluster Pod not healthy (pod {{ $labels.pod }}) - description: "Cluster: {{ $labels.cluster }}, Pod: {{ $labels.pod }} has been in a non-running state for longer than 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: "🟡 WARNING: Pod Not Healthy" + description: | + Pod has been in a non-running state + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Pod: {{ $labels.pod }} + + ─── DETAILS ───────────────────────── + Phase: {{ $labels.phase }} + Duration: 15 minutes + + 📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/pod-not-healthy.md + 📊 https://grafana.example.com/d/pods?var-cluster={{ $labels.cluster }} labels: opsgenie: "1" isPaused: false @@ -366,8 +407,22 @@ groups: execErrState: Error for: 15m annotations: - summary: Workload Cluster Pod {{ $labels.pod }} has more than 5 restarts in 15 minutes. - description: "Cluster: {{ $labels.cluster }}, Pod: {{ $labels.pod }} has restarted {{$value}} times in 15 minutes. \n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: "🟡 WARNING: Pod Restarts" + description: | + Pod has restarted excessively + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Namespace: {{ $labels.namespace }} + Pod: {{ $labels.pod }} + + ─── DETAILS ───────────────────────── + Restarts: {{ $value }} in 15 minutes + Threshold: > 5 restarts + + 📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/pod-restarts.md + 📊 https://grafana.example.com/d/pods?var-cluster={{ $labels.cluster }} labels: opsgenie: "1" isPaused: false @@ -451,8 +506,22 @@ groups: execErrState: Error for: 15m annotations: - summary: Deployment replicas mismatch (instance {{ $labels.instance }}) - description: "Cluster: {{ $labels.cluster }}, Deployment: {{ $labels.deployment }} has mismatched desired and available replicas.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: "🟡 WARNING: Deployment Unhealthy" + description: | + Deployment has fewer replicas than expected + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Namespace: {{ $labels.namespace }} + Deployment: {{ $labels.deployment }} + + ─── DETAILS ───────────────────────── + Status: Replicas mismatch + Duration: 15 minutes + + 📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/deployment-replica-mismatch.md + 📊 https://grafana.example.com/d/deployments?var-cluster={{ $labels.cluster }} labels: opsgenie: "1" isPaused: false @@ -536,8 +605,22 @@ groups: execErrState: Error for: 15m annotations: - summary: Stateful Set replicas mismatch (instance {{ $labels.instance }}) - description: "Cluster: {{ $labels.cluster }}, Stateful Set: {{ $labels.statefulset }} does not match the desired number of replicas.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: "🟡 WARNING: StatefulSet Unhealthy" + description: | + StatefulSet has fewer replicas than expected + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Namespace: {{ $labels.namespace }} + StatefulSet: {{ $labels.statefulset }} + + ─── DETAILS ───────────────────────── + Status: Replicas mismatch + Duration: 15 minutes + + 📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/statefulset-replica-mismatch.md + 📊 https://grafana.example.com/d/statefulsets?var-cluster={{ $labels.cluster }} labels: opsgenie: "1" isPaused: false From 935888012bbc516605b11ad8036fc3d6c951db1c Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Wed, 25 Feb 2026 15:11:30 -0800 Subject: [PATCH 04/19] updating format for existing healthcheck alerts --- .../src/ptd/grafana_alerts/healthchecks.yaml | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/python-pulumi/src/ptd/grafana_alerts/healthchecks.yaml b/python-pulumi/src/ptd/grafana_alerts/healthchecks.yaml index 3924d39..9609afc 100644 --- a/python-pulumi/src/ptd/grafana_alerts/healthchecks.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/healthchecks.yaml @@ -68,8 +68,24 @@ groups: execErrState: Error for: 5m annotations: - description: Calls the health check for each component of each PTD site and errors on non-200 response. - summary: "Health check failed! \nCluster: {{ $labels.cluster }} \nSite: {{ $labels.ptd_site }}\nComponent: {{ $labels.ptd_component }}\nCheck Type: {{ $labels.check_type }}\nURL: {{ $labels.health_check_url }}" + summary: "🔴 CRITICAL: Health Check Failed" + description: | + Product health check returning non-200 response + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Site: {{ $labels.ptd_site }} + Product: {{ $labels.ptd_component }} + + ─── DETAILS ───────────────────────── + Check Type: {{ $labels.check_type }} + Endpoint: {{ $labels.health_check_url }} + Status: Non-200 response + Duration: 5 minutes + + 📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/health-check-failed.md + 📊 https://grafana.example.com/d/healthchecks?var-cluster={{ $labels.cluster }} labels: opsgenie: "1" isPaused: false From f41c5273906da67a58d862f6826497c57d8fa30d Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Wed, 25 Feb 2026 15:22:54 -0800 Subject: [PATCH 05/19] updating format for cloud and system alerts --- .../src/ptd/grafana_alerts/applications.yaml | 19 +++++- .../src/ptd/grafana_alerts/cloudwatch.yaml | 60 +++++++++++++++++-- .../src/ptd/grafana_alerts/mimir.yaml | 17 +++++- 3 files changed, 86 insertions(+), 10 deletions(-) diff --git a/python-pulumi/src/ptd/grafana_alerts/applications.yaml b/python-pulumi/src/ptd/grafana_alerts/applications.yaml index 2e089d6..0e3c089 100644 --- a/python-pulumi/src/ptd/grafana_alerts/applications.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/applications.yaml @@ -63,8 +63,23 @@ groups: execErrState: Error for: 5m annotations: - description: Loki ingester has experienced WAL disk full failures. This indicates storage issues with the Loki WAL directory in cluster {{ $labels.cluster }}. - summary: Loki WAL disk full failures detected in {{ $labels.cluster }} + summary: "🔴 CRITICAL: Loki WAL Disk Full" + description: | + Loki ingester experiencing WAL disk full failures + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Namespace: {{ $labels.namespace }} + Pod: {{ $labels.pod }} + + ─── DETAILS ───────────────────────── + Component: Loki Ingester + Issue: WAL disk full + Duration: 5 minutes + + 📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/loki-wal-disk-full.md + 📊 https://grafana.example.com/d/loki?var-cluster={{ $labels.cluster }} labels: opsgenie: "1" isPaused: false diff --git a/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml b/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml index d28856f..bae42e0 100644 --- a/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml @@ -63,8 +63,24 @@ groups: execErrState: Error for: 5m annotations: - description: Evaluates FSx instance usage to determine if it is greater than 80% allocated. - summary: FSx instance {{$labels.dimension_FileSystemId}} has less than 20% of capacity remaining. + summary: "🟡 WARNING: FSx Storage Capacity Low" + description: | + FSx file system is running low on storage capacity + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Resource: {{ $labels.dimension_FileSystemId }} + Region: {{ $labels.region }} + + ─── DETAILS ───────────────────────── + Metric: Storage Capacity + Current: > 80% used + Threshold: 80% + Duration: 5 minutes + + 📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/fsx-capacity-low.md + 📊 https://grafana.example.com/d/cloudwatch?var-cluster={{ $labels.cluster }} labels: opsgenie: "1" isPaused: false @@ -124,8 +140,24 @@ groups: execErrState: Error for: 5m annotations: - description: EC2 instance has high network outbound traffic (over 300 MiB/s) for more than 5 minutes. - summary: High network outbound traffic on EC2 instance {{$labels.dimension_InstanceId}} in cluster {{$labels.cluster}} + summary: "🟡 WARNING: EC2 Network Out High" + description: | + EC2 instance has unusually high outbound network traffic + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Resource: {{ $labels.dimension_InstanceId }} + Region: {{ $labels.region }} + + ─── DETAILS ───────────────────────── + Metric: Network Out + Current: > 300 MiB/s + Threshold: 300 MiB/s + Duration: 5 minutes + + 📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/ec2-network-high.md + 📊 https://grafana.example.com/d/cloudwatch?var-cluster={{ $labels.cluster }} labels: opsgenie: "1" isPaused: false @@ -181,8 +213,24 @@ groups: execErrState: Error for: 5m annotations: - description: EC2 instance has an unusually high packet transmission rate (over 400,000 packets/s) for more than 5 minutes, which could indicate abnormal network activity. - summary: High network packet rate on EC2 instance {{$labels.dimension_InstanceId}} in cluster {{$labels.cluster}} + summary: "🟡 WARNING: EC2 Network Packets Out High" + description: | + EC2 instance has unusually high packet transmission rate + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + Resource: {{ $labels.dimension_InstanceId }} + Region: {{ $labels.region }} + + ─── DETAILS ───────────────────────── + Metric: Network Packets Out + Current: > 400,000 packets/s + Threshold: 400,000 packets/s + Duration: 5 minutes + + 📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/ec2-network-high.md + 📊 https://grafana.example.com/d/cloudwatch?var-cluster={{ $labels.cluster }} labels: opsgenie: "1" isPaused: false diff --git a/python-pulumi/src/ptd/grafana_alerts/mimir.yaml b/python-pulumi/src/ptd/grafana_alerts/mimir.yaml index 71da183..1e966a2 100644 --- a/python-pulumi/src/ptd/grafana_alerts/mimir.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/mimir.yaml @@ -91,8 +91,21 @@ groups: execErrState: Error for: 10m annotations: - description: No metrics have been received from workload cluster {{ $labels.cluster }} for 10 minutes. This could indicate Alloy is not running, network issues between the workload and control room, or the workload cluster is down. - summary: "Workload metrics silent!\nCluster: {{ $labels.cluster }}" + summary: "🔴 CRITICAL: Workload Metrics Silent" + description: | + No metrics received from workload cluster + + ─── WHERE ─────────────────────────── + Tenant: {{ $labels.tenant_name }} + Cluster: {{ $labels.cluster }} + + ─── DETAILS ───────────────────────── + Component: Metrics Pipeline + Issue: No metrics received + Duration: 10 minutes + + 📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/workload-metrics-silent.md + 📊 https://grafana.example.com/d/mimir?var-cluster={{ $labels.cluster }} labels: opsgenie: "1" isPaused: false From 245ffa68398a7cd20303abf47d5a93437774fc00 Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Thu, 26 Feb 2026 08:49:35 -0800 Subject: [PATCH 06/19] add tenant name field to blackbox healthcheck alerts --- python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py index 12a8a09..fd585ae 100644 --- a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py +++ b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py @@ -127,6 +127,7 @@ def _is_fqdn_health_check_enabled(self, site_dict: dict[str, typing.Any] | None) def _define_blackbox_targets(self) -> str: output = "" + tenant_name = self.workload.cfg.tenant_name or self.workload.compound_name for site_name, site_config in self.workload.cfg.sites.items(): # Parse site YAML once for this site @@ -150,6 +151,7 @@ def _define_blackbox_targets(self) -> str: address = {internal_address} module = "{component.module_name}" labels = {{ + "tenant_name" = "{tenant_name}", "ptd_site" = "{site_name}", "ptd_component" = "{lower_name}", "check_type" = "internal", @@ -168,6 +170,7 @@ def _define_blackbox_targets(self) -> str: address = {fqdn_address} module = "{component.module_name}" labels = {{ + "tenant_name" = "{tenant_name}", "ptd_site" = "{site_name}", "ptd_component" = "{lower_name}", "check_type" = "fqdn", From 523b491e5521ffe55ffd6da6e498f213a0cdc78b Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Thu, 26 Feb 2026 08:54:44 -0800 Subject: [PATCH 07/19] remove labels output and send tags only --- python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py index a7023af..c5baf67 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py @@ -1982,6 +1982,7 @@ def with_grafana( "settings": { "apiKey": '${{ "{" }}POSIT_OPSGENIE_KEY{{ "}" }}', # ${POSIT_OPSGENIE_KEY} in the resulting configMap, "apiUrl": "https://api.opsgenie.com/v2/alerts", + "sendTagsAs": "tags", }, } ], From d331bf0ebce3b4d06c49664bd6c312f002b612f8 Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Thu, 26 Feb 2026 11:35:25 -0800 Subject: [PATCH 08/19] fix tenant_label on metrics --- python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py index fd585ae..ead1744 100644 --- a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py +++ b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py @@ -574,6 +574,12 @@ def _define_config_map( target_label = "cluster" replacement = "{cluster_name}" }} + + rule {{ + action = "replace" + target_label = "tenant_name" + replacement = "{tenant_name}" + }} }} prometheus.remote_write "control_room" {{ From 27fb7526370c961012ab85d80bd69355ac3aac9e Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Thu, 26 Feb 2026 14:14:34 -0800 Subject: [PATCH 09/19] update alert template fields --- python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py index c5baf67..ee2fa86 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py @@ -1983,6 +1983,8 @@ def with_grafana( "apiKey": '${{ "{" }}POSIT_OPSGENIE_KEY{{ "}" }}', # ${POSIT_OPSGENIE_KEY} in the resulting configMap, "apiUrl": "https://api.opsgenie.com/v2/alerts", "sendTagsAs": "tags", + "message": "{{ .CommonAnnotations.summary }}", + "description": "{{ .CommonAnnotations.description }}", }, } ], From dac790c1d132950ded95010a0cd17ca104f4983f Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Thu, 26 Feb 2026 14:17:58 -0800 Subject: [PATCH 10/19] fix field format, escape characters --- python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py index ee2fa86..320570d 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py @@ -1983,8 +1983,8 @@ def with_grafana( "apiKey": '${{ "{" }}POSIT_OPSGENIE_KEY{{ "}" }}', # ${POSIT_OPSGENIE_KEY} in the resulting configMap, "apiUrl": "https://api.opsgenie.com/v2/alerts", "sendTagsAs": "tags", - "message": "{{ .CommonAnnotations.summary }}", - "description": "{{ .CommonAnnotations.description }}", + "message": '{{ "{{" }} .CommonAnnotations.summary {{ "}}" }}', + "description": '{{ "{{" }} .CommonAnnotations.description {{ "}}" }}', }, } ], From 0637b306bb51968ebcbacc940ccee41209df6f3b Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Thu, 26 Feb 2026 15:03:42 -0800 Subject: [PATCH 11/19] add namespace and pod, and fix formatting issues --- .../src/ptd/grafana_alerts/applications.yaml | 2 -- .../src/ptd/grafana_alerts/cloudwatch.yaml | 6 ----- .../src/ptd/grafana_alerts/healthchecks.yaml | 2 -- .../src/ptd/grafana_alerts/mimir.yaml | 2 -- .../src/ptd/grafana_alerts/nodes.yaml | 6 ----- .../src/ptd/grafana_alerts/pods.yaml | 25 ++++++------------- 6 files changed, 8 insertions(+), 35 deletions(-) diff --git a/python-pulumi/src/ptd/grafana_alerts/applications.yaml b/python-pulumi/src/ptd/grafana_alerts/applications.yaml index 0e3c089..7f36bb7 100644 --- a/python-pulumi/src/ptd/grafana_alerts/applications.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/applications.yaml @@ -78,8 +78,6 @@ groups: Issue: WAL disk full Duration: 5 minutes - 📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/loki-wal-disk-full.md - 📊 https://grafana.example.com/d/loki?var-cluster={{ $labels.cluster }} labels: opsgenie: "1" isPaused: false diff --git a/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml b/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml index a2af562..076c603 100644 --- a/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml @@ -79,8 +79,6 @@ groups: Threshold: 80% Duration: 5 minutes - 📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/fsx-capacity-low.md - 📊 https://grafana.example.com/d/cloudwatch?var-cluster={{ $labels.cluster }} labels: opsgenie: "1" isPaused: false @@ -156,8 +154,6 @@ groups: Threshold: 300 MiB/s Duration: 5 minutes - 📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/ec2-network-high.md - 📊 https://grafana.example.com/d/cloudwatch?var-cluster={{ $labels.cluster }} labels: opsgenie: "1" isPaused: false @@ -229,8 +225,6 @@ groups: Threshold: 400,000 packets/s Duration: 5 minutes - 📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/ec2-network-high.md - 📊 https://grafana.example.com/d/cloudwatch?var-cluster={{ $labels.cluster }} labels: opsgenie: "1" isPaused: false diff --git a/python-pulumi/src/ptd/grafana_alerts/healthchecks.yaml b/python-pulumi/src/ptd/grafana_alerts/healthchecks.yaml index 9609afc..0922c8b 100644 --- a/python-pulumi/src/ptd/grafana_alerts/healthchecks.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/healthchecks.yaml @@ -84,8 +84,6 @@ groups: Status: Non-200 response Duration: 5 minutes - 📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/health-check-failed.md - 📊 https://grafana.example.com/d/healthchecks?var-cluster={{ $labels.cluster }} labels: opsgenie: "1" isPaused: false diff --git a/python-pulumi/src/ptd/grafana_alerts/mimir.yaml b/python-pulumi/src/ptd/grafana_alerts/mimir.yaml index 1e966a2..b5f6779 100644 --- a/python-pulumi/src/ptd/grafana_alerts/mimir.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/mimir.yaml @@ -104,8 +104,6 @@ groups: Issue: No metrics received Duration: 10 minutes - 📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/workload-metrics-silent.md - 📊 https://grafana.example.com/d/mimir?var-cluster={{ $labels.cluster }} labels: opsgenie: "1" isPaused: false diff --git a/python-pulumi/src/ptd/grafana_alerts/nodes.yaml b/python-pulumi/src/ptd/grafana_alerts/nodes.yaml index 697bdf8..bb6f91d 100644 --- a/python-pulumi/src/ptd/grafana_alerts/nodes.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/nodes.yaml @@ -68,8 +68,6 @@ groups: Condition: NotReady Duration: 15 minutes - 📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/node-not-ready.md - 📊 https://grafana.example.com/d/nodes?var-cluster={{ $labels.cluster }} labels: opsgenie: "1" isPaused: false @@ -136,8 +134,6 @@ groups: Condition: MemoryPressure Duration: 15 minutes - 📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/node-memory-pressure.md - 📊 https://grafana.example.com/d/nodes?var-cluster={{ $labels.cluster }} labels: opsgenie: "1" isPaused: false @@ -204,8 +200,6 @@ groups: Condition: DiskPressure Duration: 15 minutes - 📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/node-disk-pressure.md - 📊 https://grafana.example.com/d/nodes?var-cluster={{ $labels.cluster }} labels: opsgenie: "1" isPaused: false diff --git a/python-pulumi/src/ptd/grafana_alerts/pods.yaml b/python-pulumi/src/ptd/grafana_alerts/pods.yaml index 2efff1c..5019c50 100644 --- a/python-pulumi/src/ptd/grafana_alerts/pods.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/pods.yaml @@ -29,7 +29,7 @@ groups: type: prometheus uid: mimir editorMode: code - expr: count by (cluster, container)(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"}) + expr: count by (cluster, namespace, pod, container)(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"}) instant: true intervalMs: 1000 legendFormat: __auto @@ -103,14 +103,14 @@ groups: ─── WHERE ─────────────────────────── Tenant: {{ $labels.tenant_name }} Cluster: {{ $labels.cluster }} + Namespace: {{ $labels.namespace }} + Pod: {{ $labels.pod }} Container: {{ $labels.container }} ─── DETAILS ───────────────────────── Status: CrashLoopBackOff Duration: 5 minutes - 📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/crash-loop-backoff.md - 📊 https://grafana.example.com/d/pods?var-cluster={{ $labels.cluster }} labels: opsgenie: "1" isPaused: false @@ -129,7 +129,7 @@ groups: uid: mimir disableTextWrap: false editorMode: code - expr: count by(cluster, pod, reason) (kube_pod_container_status_terminated_reason{reason!="Completed"} * on(cluster,pod) group_left(label_launcher_instance_id) kube_pod_labels{label_launcher_instance_id=""}) + expr: count by(cluster, namespace, pod, reason) (kube_pod_container_status_terminated_reason{reason!="Completed"} * on(cluster,pod) group_left(label_launcher_instance_id) kube_pod_labels{label_launcher_instance_id=""}) fullMetaSearch: false includeNullMetadata: true instant: true @@ -208,14 +208,13 @@ groups: ─── WHERE ─────────────────────────── Tenant: {{ $labels.tenant_name }} Cluster: {{ $labels.cluster }} + Namespace: {{ $labels.namespace }} Pod: {{ $labels.pod }} ─── DETAILS ───────────────────────── Reason: {{ $labels.reason }} Duration: 5 minutes - 📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/pod-error.md - 📊 https://grafana.example.com/d/pods?var-cluster={{ $labels.cluster }} labels: opsgenie: "1" isPaused: false @@ -234,7 +233,7 @@ groups: uid: mimir disableTextWrap: false editorMode: code - expr: sum by (cluster, pod, phase) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"} * on(cluster,pod) group_left(label_launcher_instance_id) kube_pod_labels{label_launcher_instance_id=""}) > 0 + expr: sum by (cluster, namespace, pod, phase) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"} * on(cluster,pod) group_left(label_launcher_instance_id) kube_pod_labels{label_launcher_instance_id=""}) > 0 fullMetaSearch: false includeNullMetadata: true instant: true @@ -309,14 +308,13 @@ groups: ─── WHERE ─────────────────────────── Tenant: {{ $labels.tenant_name }} Cluster: {{ $labels.cluster }} + Namespace: {{ $labels.namespace }} Pod: {{ $labels.pod }} ─── DETAILS ───────────────────────── Phase: {{ $labels.phase }} Duration: 15 minutes - 📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/pod-not-healthy.md - 📊 https://grafana.example.com/d/pods?var-cluster={{ $labels.cluster }} labels: opsgenie: "1" isPaused: false @@ -418,11 +416,8 @@ groups: Pod: {{ $labels.pod }} ─── DETAILS ───────────────────────── - Restarts: {{ $value }} in 15 minutes - Threshold: > 5 restarts + Issue: > 5 restarts in 15 minutes - 📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/pod-restarts.md - 📊 https://grafana.example.com/d/pods?var-cluster={{ $labels.cluster }} labels: opsgenie: "1" isPaused: false @@ -520,8 +515,6 @@ groups: Status: Replicas mismatch Duration: 15 minutes - 📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/deployment-replica-mismatch.md - 📊 https://grafana.example.com/d/deployments?var-cluster={{ $labels.cluster }} labels: opsgenie: "1" isPaused: false @@ -619,8 +612,6 @@ groups: Status: Replicas mismatch Duration: 15 minutes - 📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/statefulset-replica-mismatch.md - 📊 https://grafana.example.com/d/statefulsets?var-cluster={{ $labels.cluster }} labels: opsgenie: "1" isPaused: false From a6de747baad5b6f1cf674971b60eb1d52223febb Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Fri, 27 Feb 2026 14:08:09 -0800 Subject: [PATCH 12/19] fix comment characters in alloy config --- .../src/ptd/pulumi_resources/grafana_alloy.py | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py index ead1744..07fd538 100644 --- a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py +++ b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py @@ -337,8 +337,8 @@ def _define_cloudwatch_config(self) -> str: type = "AWS/NATGateway" regions = ["{self.region}"] - # NAT Gateways inherit VPC tags including posit.team/true-name - # (see python-pulumi/src/ptd/pulumi_resources/aws_vpc.py:607-616) + // NAT Gateways inherit VPC tags including posit.team/true-name + // (see python-pulumi/src/ptd/pulumi_resources/aws_vpc.py:607-616) search_tags = {{ "posit.team/true-name" = "{self.workload.cfg.true_name}", }} @@ -360,12 +360,12 @@ def _define_cloudwatch_config(self) -> str: type = "AWS/ApplicationELB" regions = ["{self.region}"] - # ALBs are tagged at creation time via aws_workload_helm.py. - # LBs provisioned before this tag was added won't be discovered - # until the cluster is redeployed. - # FIXME: To tag existing ALBs without redeploying, use the AWS CLI: - # aws elbv2 add-tags --resource-arns \ - # --tags Key=posit.team/true-name,Value= + // ALBs are tagged at creation time via aws_workload_helm.py. + // LBs provisioned before this tag was added won't be discovered + // until the cluster is redeployed. + // FIXME: To tag existing ALBs without redeploying, use the AWS CLI: + // aws elbv2 add-tags --resource-arns + // --tags Key=posit.team/true-name,Value= search_tags = {{ "posit.team/true-name" = "{self.workload.cfg.true_name}", }} @@ -393,12 +393,12 @@ def _define_cloudwatch_config(self) -> str: type = "AWS/NetworkELB" regions = ["{self.region}"] - # NLBs are tagged at creation time via traefik.py. - # LBs provisioned before this tag was added won't be discovered - # until the cluster is redeployed. - # FIXME: To tag existing NLBs without redeploying, use the AWS CLI: - # aws elbv2 add-tags --resource-arns \ - # --tags Key=posit.team/true-name,Value= + // NLBs are tagged at creation time via traefik.py. + // LBs provisioned before this tag was added won't be discovered + // until the cluster is redeployed. + // FIXME: To tag existing NLBs without redeploying, use the AWS CLI: + // aws elbv2 add-tags --resource-arns + // --tags Key=posit.team/true-name,Value= search_tags = {{ "posit.team/true-name" = "{self.workload.cfg.true_name}", }} From 90cc64063dc8ea3b80c9e04ffc70b35ea49286b2 Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Fri, 27 Feb 2026 15:41:20 -0800 Subject: [PATCH 13/19] fix pound sign breaking alloy --- .../src/ptd/pulumi_resources/grafana_alloy.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py index 07fd538..912ca92 100644 --- a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py +++ b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py @@ -267,12 +267,12 @@ def _define_cloudwatch_config(self) -> str: period = "5m" }} - # TODO: Remove ["Sum"] from statistics once all Grafana dashboards have - # been updated to query aws_rds_database_connections_average. - # Collecting both Sum and Average during migration. Average is the - # target metric (aws_rds_database_connections_average); Sum - # (aws_rds_database_connections_sum) is kept temporarily for existing - # dashboards. NOTE: Keeping Sum doubles the CloudWatch API cost for this metric. + // TODO: Remove ["Sum"] from statistics once all Grafana dashboards have + // been updated to query aws_rds_database_connections_average. + // Collecting both Sum and Average during migration. Average is the + // target metric (aws_rds_database_connections_average). Sum + // (aws_rds_database_connections_sum) is kept temporarily for existing + // dashboards. NOTE: Keeping Sum doubles the CloudWatch API cost for this metric. metric {{ name = "DatabaseConnections" statistics = ["Average", "Sum"] @@ -297,14 +297,14 @@ def _define_cloudwatch_config(self) -> str: period = "5m" }} - # Collected for dashboard visibility; no alert rules defined + // Collected for dashboard visibility, no alert rules defined metric {{ name = "WriteLatency" statistics = ["Average"] period = "5m" }} - # Collected for dashboard visibility; no alert rules defined + // Collected for dashboard visibility, no alert rules defined metric {{ name = "Deadlocks" statistics = ["Sum"] From bbde877ef2b952087fd24ead93319f6613a7c713 Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Tue, 3 Mar 2026 14:06:01 -0800 Subject: [PATCH 14/19] update example config yaml --- examples/workload/ptd.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/workload/ptd.yaml b/examples/workload/ptd.yaml index c04a275..4a867c0 100644 --- a/examples/workload/ptd.yaml +++ b/examples/workload/ptd.yaml @@ -11,6 +11,9 @@ spec: # AWS account ID where the workload will be deployed account_id: "123456789012" + # Human-readable tenant name for alerts (defaults to compound_name if not set) + tenant_name: "Example Analytics Team" + # Control room that manages this workload control_room_account_id: "123456789012" control_room_cluster_name: control-room-production From 2c4a0e9ff2db84d46bbec8b7a74e828a318d4d0f Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Tue, 3 Mar 2026 15:16:38 -0800 Subject: [PATCH 15/19] update grouping to not combine healthchecks --- python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py index 9f260fc..1ddf635 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py @@ -1991,7 +1991,7 @@ def with_grafana( { "orgId": 1, "receiver": "posit-opsgenie", - "group_by": ["alertname", "cluster"], + "group_by": ["alertname", "cluster", "ptd_component"], "matchers": ["opsgenie = 1"], "group_wait": "30s", "group_interval": "5m", From 844ae6ec80b6a8251f06f4cd045c53341b9d2a40 Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Wed, 4 Mar 2026 15:45:04 -0800 Subject: [PATCH 16/19] healthcheck format and dedupe by type --- .../ptd/pulumi_resources/aws_eks_cluster.py | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py index 1ddf635..b44f690 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py @@ -1963,6 +1963,21 @@ def with_grafana( ), values={ "alerting": { + # Custom notification templates for clean alert formatting. + # These templates output ONLY our formatted content without + # Grafana's default prefix (Firing, Value, Labels, Annotations). + "templates.yaml": { + "apiVersion": 1, + "templates": [ + { + "orgId": 1, + "name": "ptd_templates", + # Template outputs just the description annotation for each alert. + # This avoids Grafana's default verbose format. + "template": '{{ "{{" }} define "ptd.description" {{ "}}" }}{{ "{{" }} range .Alerts {{ "}}" }}{{ "{{" }} .Annotations.description {{ "}}" }}{{ "{{" }} end {{ "}}" }}{{ "{{" }} end {{ "}}" }}', + } + ], + }, "contactpoints.yaml": { "apiVersion": 1, "contactPoints": [ @@ -1978,7 +1993,8 @@ def with_grafana( "apiUrl": "https://api.opsgenie.com/v2/alerts", "sendTagsAs": "tags", "message": '{{ "{{" }} .CommonAnnotations.summary {{ "}}" }}', - "description": '{{ "{{" }} .CommonAnnotations.description {{ "}}" }}', + # Use custom template for clean description without label dumps + "description": '{{ "{{" }} template "ptd.description" . {{ "}}" }}', }, } ], @@ -1991,7 +2007,10 @@ def with_grafana( { "orgId": 1, "receiver": "posit-opsgenie", - "group_by": ["alertname", "cluster", "ptd_component"], + # health_check_url ensures each health check endpoint gets its own alert + # (internal vs fqdn checks are separate). This label is empty for + # non-healthcheck alerts so it won't affect their grouping. + "group_by": ["alertname", "cluster", "ptd_component", "health_check_url"], "matchers": ["opsgenie = 1"], "group_wait": "30s", "group_interval": "5m", From 76cd3d8f001fa637b739d1e9158d3d184190e1cd Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Wed, 4 Mar 2026 15:48:01 -0800 Subject: [PATCH 17/19] add back source and silence links --- python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py index b44f690..7d97f0c 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py @@ -1972,9 +1972,9 @@ def with_grafana( { "orgId": 1, "name": "ptd_templates", - # Template outputs just the description annotation for each alert. - # This avoids Grafana's default verbose format. - "template": '{{ "{{" }} define "ptd.description" {{ "}}" }}{{ "{{" }} range .Alerts {{ "}}" }}{{ "{{" }} .Annotations.description {{ "}}" }}{{ "{{" }} end {{ "}}" }}{{ "{{" }} end {{ "}}" }}', + # Template outputs description annotation + Source/Silence links for each alert. + # This avoids Grafana's default verbose format while keeping useful links. + "template": '{{ "{{" }} define "ptd.description" {{ "}}" }}{{ "{{" }} range .Alerts {{ "}}" }}{{ "{{" }} .Annotations.description {{ "}}" }}\n\nSource: {{ "{{" }} .GeneratorURL {{ "}}" }}\nSilence: {{ "{{" }} .SilenceURL {{ "}}" }}{{ "{{" }} end {{ "}}" }}{{ "{{" }} end {{ "}}" }}', } ], }, From 758147e9e91d40e9693efd4af091da22e85c0653 Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Thu, 5 Mar 2026 11:16:32 -0800 Subject: [PATCH 18/19] missed a query update to include container in appropriate alerts --- python-pulumi/src/ptd/grafana_alerts/pods.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-pulumi/src/ptd/grafana_alerts/pods.yaml b/python-pulumi/src/ptd/grafana_alerts/pods.yaml index 70fe8c7..5ffda98 100644 --- a/python-pulumi/src/ptd/grafana_alerts/pods.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/pods.yaml @@ -49,7 +49,7 @@ groups: type: prometheus uid: mimir editorMode: code - expr: count by (cluster, namespace, pod)(kube_pod_container_status_waiting_reason{namespace=~"posit-team|posit-team-system|alloy|mimir|loki|grafana",reason="CrashLoopBackOff"}) + expr: count by (cluster, namespace, pod, container)(kube_pod_container_status_waiting_reason{namespace=~"posit-team|posit-team-system|alloy|mimir|loki|grafana",reason="CrashLoopBackOff"}) instant: true intervalMs: 1000 legendFormat: __auto From 175c995e37a477c9380ffcc3b029bebc215d6f98 Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Thu, 5 Mar 2026 11:21:21 -0800 Subject: [PATCH 19/19] this should have been an external label for consistency --- .../src/ptd/pulumi_resources/grafana_alloy.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py index 866b728..fa9235c 100644 --- a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py +++ b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py @@ -589,15 +589,12 @@ def _define_config_map( target_label = "cluster" replacement = "{cluster_name}" }} - - rule {{ - action = "replace" - target_label = "tenant_name" - replacement = "{tenant_name}" - }} }} prometheus.remote_write "control_room" {{ + external_labels = {{ + tenant_name = "{tenant_name}", + }} endpoint {{ url = "{control_room_url}" basic_auth {{ @@ -611,6 +608,9 @@ def _define_config_map( }} prometheus.remote_write "workload" {{ + external_labels = {{ + tenant_name = "{tenant_name}", + }} endpoint {{ url = "{workload_url}" }}