From 0aaea2f7af8424541867b60656b2c34981de42d3 Mon Sep 17 00:00:00 2001
From: Anna Williamson <annamdove@gmail.com>
Date: Wed, 25 Feb 2026 11:53:06 -0800
Subject: [PATCH 01/19] docs: update monitoring doc with alert format

---
 docs/guides/monitoring.md | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/docs/guides/monitoring.md b/docs/guides/monitoring.md
index a320a53..f38ecee 100644
--- a/docs/guides/monitoring.md
+++ b/docs/guides/monitoring.md
@@ -532,6 +532,42 @@ PTD deploys a set of Grafana alerts to the control room for centralized monitori
 
 All alerts are configured to send notifications to OpsGenie when triggered.
 
+### Alert Format
+
+Alerts use a standardized format for consistency across all alert types:
+
+```
+[🔴 CRITICAL | 🟡 WARNING]: [Title]
+
+[Description]
+
+─── WHERE ───────────────────────────
+Tenant:      [tenant name] (the organization or group the workload cluster is provisioned for)
+Cluster:     [cluster name]
+Component:   [affected component]
+
+─── DETAILS ─────────────────────────
+[Key]:       [Value]
+[Key]:       [Value]
+...
+
+📖 [runbook link]
+📊 [dashboard link]
+```
+
+**Severity levels:**
+- 🔴 **CRITICAL** — Immediate action required
+- 🟡 **WARNING** — Investigate soon
+
+**Alert types and their WHERE/DETAILS fields:**
+
+| Type | WHERE | DETAILS |
+|------|-------|---------|
+| Health Check | Tenant, Cluster, Product | Endpoint, Status, Response Time, Down Since |
+| Kubernetes | Tenant, Cluster, Namespace, Pod/Node | Varies by alert (restarts, replicas, conditions) |
+| Cloud (AWS) | Tenant, Cluster, Resource, Region | Metric, Current, Threshold, Duration |
+| Cloud (Azure) | Tenant, Cluster, Resource, Location | Metric, Current, Threshold, Duration |
+
 ### Application Alerts
 
 | Alert | Threshold | Duration | Description |

From 8d8b8a0bfe8b0cca1000c4f48cc6e8707093795f Mon Sep 17 00:00:00 2001
From: Anna Williamson <annamdove@gmail.com>
Date: Wed, 25 Feb 2026 15:10:30 -0800
Subject: [PATCH 02/19] adding new field for friendly tenant name

---
 python-pulumi/src/ptd/__init__.py                       | 1 +
 python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/python-pulumi/src/ptd/__init__.py b/python-pulumi/src/ptd/__init__.py
index 2ab3dda..325ebaf 100644
--- a/python-pulumi/src/ptd/__init__.py
+++ b/python-pulumi/src/ptd/__init__.py
@@ -315,6 +315,7 @@ class WorkloadConfig:
     network_trust: NetworkTrust
     sites: typing.Mapping[str, SiteConfig]
     true_name: str
+    tenant_name: str | None = dataclasses.field(default=None, kw_only=True)  # Human-readable name for the tenant
 
     @property
     def domain(self) -> str:
diff --git a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py
index 4a75bec..1cb3ec5 100644
--- a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py
+++ b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py
@@ -220,6 +220,9 @@ def _define_config_map(
             account_id = self.workload.cfg.account_id
             cluster_name = self.workload.eks_cluster_name(self.release)
 
+        # Use tenant_name when set and non-empty; otherwise fall back to compound_name
+        tenant_name = self.workload.cfg.tenant_name or self.workload.compound_name
+
         # Generate CloudWatch exporter configuration for AWS
         cloudwatch_config = ""
         if self.cloud_provider == "aws":
@@ -635,6 +638,7 @@ def _define_config_map(
 
                   external_labels = {{
                     data = "true",
+                    tenant_name = "{tenant_name}",
                   }}
                 }}
             """

From 4f166f61d21702d2c436dd36790e12b9d8285be7 Mon Sep 17 00:00:00 2001
From: Anna Williamson <annamdove@gmail.com>
Date: Wed, 25 Feb 2026 15:11:19 -0800
Subject: [PATCH 03/19] updating format for existing k8s alerts

---
 .../src/ptd/grafana_alerts/nodes.yaml         |  51 ++++++++-
 .../src/ptd/grafana_alerts/pods.yaml          | 103 ++++++++++++++++--
 2 files changed, 138 insertions(+), 16 deletions(-)

diff --git a/python-pulumi/src/ptd/grafana_alerts/nodes.yaml b/python-pulumi/src/ptd/grafana_alerts/nodes.yaml
index dde33dc..697bdf8 100644
--- a/python-pulumi/src/ptd/grafana_alerts/nodes.yaml
+++ b/python-pulumi/src/ptd/grafana_alerts/nodes.yaml
@@ -55,8 +55,21 @@ groups:
           execErrState: Error
           for: 15m
           annotations:
-            summary: Node not in ready state (instance {{ $labels.instance }})
-            description: "Node {{ $labels.node }} in cluster {{ $labels.cluster }} has been unready for 15 minutes\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+            summary: "🔴 CRITICAL: Node Not Ready"
+            description: |
+              Kubernetes node is not accepting workloads
+
+              ─── WHERE ───────────────────────────
+              Tenant:      {{ $labels.tenant_name }}
+              Cluster:     {{ $labels.cluster }}
+              Node:        {{ $labels.node }}
+
+              ─── DETAILS ─────────────────────────
+              Condition:   NotReady
+              Duration:    15 minutes
+
+              📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/node-not-ready.md
+              📊 https://grafana.example.com/d/nodes?var-cluster={{ $labels.cluster }}
           labels:
             opsgenie: "1"
           isPaused: false
@@ -110,8 +123,21 @@ groups:
           execErrState: Error
           for: 15m
           annotations:
-            summary: Kubernetes Node memory pressure (instance {{ $labels.instance }})
-            description: "Node {{ $labels.node }} in cluster {{ $labels.cluster }} has MemoryPressure condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+            summary: "🟡 WARNING: Node Memory Pressure"
+            description: |
+              Node is running low on available memory
+
+              ─── WHERE ───────────────────────────
+              Tenant:      {{ $labels.tenant_name }}
+              Cluster:     {{ $labels.cluster }}
+              Node:        {{ $labels.node }}
+
+              ─── DETAILS ─────────────────────────
+              Condition:   MemoryPressure
+              Duration:    15 minutes
+
+              📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/node-memory-pressure.md
+              📊 https://grafana.example.com/d/nodes?var-cluster={{ $labels.cluster }}
           labels:
             opsgenie: "1"
           isPaused: false
@@ -165,8 +191,21 @@ groups:
           execErrState: Error
           for: 15m
           annotations:
-            summary: Kubernetes Node disk pressure (instance {{ $labels.instance }})
-            description: "Node {{ $labels.node }} in cluster {{ $labels.cluster }} has DiskPressure condition\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+            summary: "🟡 WARNING: Node Disk Pressure"
+            description: |
+              Node is running low on available disk space
+
+              ─── WHERE ───────────────────────────
+              Tenant:      {{ $labels.tenant_name }}
+              Cluster:     {{ $labels.cluster }}
+              Node:        {{ $labels.node }}
+
+              ─── DETAILS ─────────────────────────
+              Condition:   DiskPressure
+              Duration:    15 minutes
+
+              📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/node-disk-pressure.md
+              📊 https://grafana.example.com/d/nodes?var-cluster={{ $labels.cluster }}
           labels:
             opsgenie: "1"
           isPaused: false
diff --git a/python-pulumi/src/ptd/grafana_alerts/pods.yaml b/python-pulumi/src/ptd/grafana_alerts/pods.yaml
index 1422ab7..2efff1c 100644
--- a/python-pulumi/src/ptd/grafana_alerts/pods.yaml
+++ b/python-pulumi/src/ptd/grafana_alerts/pods.yaml
@@ -96,7 +96,21 @@ groups:
           execErrState: Error
           for: 5m
           annotations:
-            summary: Container {{ $labels.container }} in {{ $labels.cluster }} is in CrashLoopBackoff state. \n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}
+            summary: "🔴 CRITICAL: Container Crash-Looping"
+            description: |
+              Container keeps crashing and restarting
+
+              ─── WHERE ───────────────────────────
+              Tenant:      {{ $labels.tenant_name }}
+              Cluster:     {{ $labels.cluster }}
+              Container:   {{ $labels.container }}
+
+              ─── DETAILS ─────────────────────────
+              Status:      CrashLoopBackOff
+              Duration:    5 minutes
+
+              📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/crash-loop-backoff.md
+              📊 https://grafana.example.com/d/pods?var-cluster={{ $labels.cluster }}
           labels:
             opsgenie: "1"
           isPaused: false
@@ -187,7 +201,21 @@ groups:
           execErrState: Error
           for: 5m
           annotations:
-            summary: Pod {{ $labels.pod }} in cluster {{ $labels.cluster }} is in a {{ $labels.reason }} state.
+            summary: "🟡 WARNING: Pod Error"
+            description: |
+              Pod container terminated with an error
+
+              ─── WHERE ───────────────────────────
+              Tenant:      {{ $labels.tenant_name }}
+              Cluster:     {{ $labels.cluster }}
+              Pod:         {{ $labels.pod }}
+
+              ─── DETAILS ─────────────────────────
+              Reason:      {{ $labels.reason }}
+              Duration:    5 minutes
+
+              📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/pod-error.md
+              📊 https://grafana.example.com/d/pods?var-cluster={{ $labels.cluster }}
           labels:
             opsgenie: "1"
           isPaused: false
@@ -274,8 +302,21 @@ groups:
           execErrState: Error
           for: 15m
           annotations:
-            summary: Workload Cluster Pod not healthy (pod {{ $labels.pod }})
-            description: "Cluster: {{ $labels.cluster }}, Pod: {{ $labels.pod }} has been in a non-running state for longer than 15 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+            summary: "🟡 WARNING: Pod Not Healthy"
+            description: |
+              Pod has been in a non-running state
+
+              ─── WHERE ───────────────────────────
+              Tenant:      {{ $labels.tenant_name }}
+              Cluster:     {{ $labels.cluster }}
+              Pod:         {{ $labels.pod }}
+
+              ─── DETAILS ─────────────────────────
+              Phase:       {{ $labels.phase }}
+              Duration:    15 minutes
+
+              📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/pod-not-healthy.md
+              📊 https://grafana.example.com/d/pods?var-cluster={{ $labels.cluster }}
           labels:
             opsgenie: "1"
           isPaused: false
@@ -366,8 +407,22 @@ groups:
           execErrState: Error
           for: 15m
           annotations:
-            summary: Workload Cluster Pod {{ $labels.pod }} has more than 5 restarts in 15 minutes.
-            description: "Cluster: {{ $labels.cluster }}, Pod: {{ $labels.pod }} has restarted {{$value}} times in 15 minutes. \n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+            summary: "🟡 WARNING: Pod Restarts"
+            description: |
+              Pod has restarted excessively
+
+              ─── WHERE ───────────────────────────
+              Tenant:      {{ $labels.tenant_name }}
+              Cluster:     {{ $labels.cluster }}
+              Namespace:   {{ $labels.namespace }}
+              Pod:         {{ $labels.pod }}
+
+              ─── DETAILS ─────────────────────────
+              Restarts:    {{ $value }} in 15 minutes
+              Threshold:   > 5 restarts
+
+              📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/pod-restarts.md
+              📊 https://grafana.example.com/d/pods?var-cluster={{ $labels.cluster }}
           labels:
             opsgenie: "1"
           isPaused: false
@@ -451,8 +506,22 @@ groups:
           execErrState: Error
           for: 15m
           annotations:
-            summary: Deployment replicas mismatch (instance {{ $labels.instance }})
-            description: "Cluster: {{ $labels.cluster }}, Deployment: {{ $labels.deployment }} has mismatched desired and available replicas.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+            summary: "🟡 WARNING: Deployment Unhealthy"
+            description: |
+              Deployment has fewer replicas than expected
+
+              ─── WHERE ───────────────────────────
+              Tenant:      {{ $labels.tenant_name }}
+              Cluster:     {{ $labels.cluster }}
+              Namespace:   {{ $labels.namespace }}
+              Deployment:  {{ $labels.deployment }}
+
+              ─── DETAILS ─────────────────────────
+              Status:      Replicas mismatch
+              Duration:    15 minutes
+
+              📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/deployment-replica-mismatch.md
+              📊 https://grafana.example.com/d/deployments?var-cluster={{ $labels.cluster }}
           labels:
             opsgenie: "1"
           isPaused: false
@@ -536,8 +605,22 @@ groups:
           execErrState: Error
           for: 15m
           annotations:
-            summary: Stateful Set replicas mismatch (instance {{ $labels.instance }})
-            description: "Cluster: {{ $labels.cluster }}, Stateful Set: {{ $labels.statefulset }} does not match the desired number of replicas.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
+            summary: "🟡 WARNING: StatefulSet Unhealthy"
+            description: |
+              StatefulSet has fewer replicas than expected
+
+              ─── WHERE ───────────────────────────
+              Tenant:      {{ $labels.tenant_name }}
+              Cluster:     {{ $labels.cluster }}
+              Namespace:   {{ $labels.namespace }}
+              StatefulSet: {{ $labels.statefulset }}
+
+              ─── DETAILS ─────────────────────────
+              Status:      Replicas mismatch
+              Duration:    15 minutes
+
+              📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/statefulset-replica-mismatch.md
+              📊 https://grafana.example.com/d/statefulsets?var-cluster={{ $labels.cluster }}
           labels:
             opsgenie: "1"
           isPaused: false

From 935888012bbc516605b11ad8036fc3d6c951db1c Mon Sep 17 00:00:00 2001
From: Anna Williamson <annamdove@gmail.com>
Date: Wed, 25 Feb 2026 15:11:30 -0800
Subject: [PATCH 04/19] updating format for existing healthcheck alerts

---
 .../src/ptd/grafana_alerts/healthchecks.yaml  | 20 +++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/python-pulumi/src/ptd/grafana_alerts/healthchecks.yaml b/python-pulumi/src/ptd/grafana_alerts/healthchecks.yaml
index 3924d39..9609afc 100644
--- a/python-pulumi/src/ptd/grafana_alerts/healthchecks.yaml
+++ b/python-pulumi/src/ptd/grafana_alerts/healthchecks.yaml
@@ -68,8 +68,24 @@ groups:
           execErrState: Error
           for: 5m
           annotations:
-            description: Calls the health check for each component of each PTD site and errors on non-200 response.
-            summary: "Health check failed! \nCluster: {{ $labels.cluster }} \nSite: {{ $labels.ptd_site }}\nComponent: {{ $labels.ptd_component }}\nCheck Type: {{ $labels.check_type }}\nURL: {{ $labels.health_check_url }}"
+            summary: "🔴 CRITICAL: Health Check Failed"
+            description: |
+              Product health check returning non-200 response
+
+              ─── WHERE ───────────────────────────
+              Tenant:      {{ $labels.tenant_name }}
+              Cluster:     {{ $labels.cluster }}
+              Site:        {{ $labels.ptd_site }}
+              Product:     {{ $labels.ptd_component }}
+
+              ─── DETAILS ─────────────────────────
+              Check Type:  {{ $labels.check_type }}
+              Endpoint:    {{ $labels.health_check_url }}
+              Status:      Non-200 response
+              Duration:    5 minutes
+
+              📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/health-check-failed.md
+              📊 https://grafana.example.com/d/healthchecks?var-cluster={{ $labels.cluster }}
           labels:
             opsgenie: "1"
           isPaused: false

From f41c5273906da67a58d862f6826497c57d8fa30d Mon Sep 17 00:00:00 2001
From: Anna Williamson <annamdove@gmail.com>
Date: Wed, 25 Feb 2026 15:22:54 -0800
Subject: [PATCH 05/19] updating format for cloud and system alerts

---
 .../src/ptd/grafana_alerts/applications.yaml  | 19 +++++-
 .../src/ptd/grafana_alerts/cloudwatch.yaml    | 60 +++++++++++++++++--
 .../src/ptd/grafana_alerts/mimir.yaml         | 17 +++++-
 3 files changed, 86 insertions(+), 10 deletions(-)

diff --git a/python-pulumi/src/ptd/grafana_alerts/applications.yaml b/python-pulumi/src/ptd/grafana_alerts/applications.yaml
index 2e089d6..0e3c089 100644
--- a/python-pulumi/src/ptd/grafana_alerts/applications.yaml
+++ b/python-pulumi/src/ptd/grafana_alerts/applications.yaml
@@ -63,8 +63,23 @@ groups:
           execErrState: Error
           for: 5m
           annotations:
-            description: Loki ingester has experienced WAL disk full failures. This indicates storage issues with the Loki WAL directory in cluster {{ $labels.cluster }}.
-            summary: Loki WAL disk full failures detected in {{ $labels.cluster }}
+            summary: "🔴 CRITICAL: Loki WAL Disk Full"
+            description: |
+              Loki ingester experiencing WAL disk full failures
+
+              ─── WHERE ───────────────────────────
+              Tenant:      {{ $labels.tenant_name }}
+              Cluster:     {{ $labels.cluster }}
+              Namespace:   {{ $labels.namespace }}
+              Pod:         {{ $labels.pod }}
+
+              ─── DETAILS ─────────────────────────
+              Component:   Loki Ingester
+              Issue:       WAL disk full
+              Duration:    5 minutes
+
+              📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/loki-wal-disk-full.md
+              📊 https://grafana.example.com/d/loki?var-cluster={{ $labels.cluster }}
           labels:
             opsgenie: "1"
           isPaused: false
diff --git a/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml b/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml
index d28856f..bae42e0 100644
--- a/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml
+++ b/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml
@@ -63,8 +63,24 @@ groups:
           execErrState: Error
           for: 5m
           annotations:
-            description: Evaluates FSx instance usage to determine if it is greater than 80% allocated.
-            summary: FSx instance {{$labels.dimension_FileSystemId}} has less than 20% of capacity remaining.
+            summary: "🟡 WARNING: FSx Storage Capacity Low"
+            description: |
+              FSx file system is running low on storage capacity
+
+              ─── WHERE ───────────────────────────
+              Tenant:      {{ $labels.tenant_name }}
+              Cluster:     {{ $labels.cluster }}
+              Resource:    {{ $labels.dimension_FileSystemId }}
+              Region:      {{ $labels.region }}
+
+              ─── DETAILS ─────────────────────────
+              Metric:      Storage Capacity
+              Current:     > 80% used
+              Threshold:   80%
+              Duration:    5 minutes
+
+              📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/fsx-capacity-low.md
+              📊 https://grafana.example.com/d/cloudwatch?var-cluster={{ $labels.cluster }}
           labels:
             opsgenie: "1"
           isPaused: false
@@ -124,8 +140,24 @@ groups:
           execErrState: Error
           for: 5m
           annotations:
-            description: EC2 instance has high network outbound traffic (over 300 MiB/s) for more than 5 minutes.
-            summary: High network outbound traffic on EC2 instance {{$labels.dimension_InstanceId}} in cluster {{$labels.cluster}}
+            summary: "🟡 WARNING: EC2 Network Out High"
+            description: |
+              EC2 instance has unusually high outbound network traffic
+
+              ─── WHERE ───────────────────────────
+              Tenant:      {{ $labels.tenant_name }}
+              Cluster:     {{ $labels.cluster }}
+              Resource:    {{ $labels.dimension_InstanceId }}
+              Region:      {{ $labels.region }}
+
+              ─── DETAILS ─────────────────────────
+              Metric:      Network Out
+              Current:     > 300 MiB/s
+              Threshold:   300 MiB/s
+              Duration:    5 minutes
+
+              📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/ec2-network-high.md
+              📊 https://grafana.example.com/d/cloudwatch?var-cluster={{ $labels.cluster }}
           labels:
             opsgenie: "1"
           isPaused: false
@@ -181,8 +213,24 @@ groups:
           execErrState: Error
           for: 5m
           annotations:
-            description: EC2 instance has an unusually high packet transmission rate (over 400,000 packets/s) for more than 5 minutes, which could indicate abnormal network activity.
-            summary: High network packet rate on EC2 instance {{$labels.dimension_InstanceId}} in cluster {{$labels.cluster}}
+            summary: "🟡 WARNING: EC2 Network Packets Out High"
+            description: |
+              EC2 instance has unusually high packet transmission rate
+
+              ─── WHERE ───────────────────────────
+              Tenant:      {{ $labels.tenant_name }}
+              Cluster:     {{ $labels.cluster }}
+              Resource:    {{ $labels.dimension_InstanceId }}
+              Region:      {{ $labels.region }}
+
+              ─── DETAILS ─────────────────────────
+              Metric:      Network Packets Out
+              Current:     > 400,000 packets/s
+              Threshold:   400,000 packets/s
+              Duration:    5 minutes
+
+              📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/ec2-network-high.md
+              📊 https://grafana.example.com/d/cloudwatch?var-cluster={{ $labels.cluster }}
           labels:
             opsgenie: "1"
           isPaused: false
diff --git a/python-pulumi/src/ptd/grafana_alerts/mimir.yaml b/python-pulumi/src/ptd/grafana_alerts/mimir.yaml
index 71da183..1e966a2 100644
--- a/python-pulumi/src/ptd/grafana_alerts/mimir.yaml
+++ b/python-pulumi/src/ptd/grafana_alerts/mimir.yaml
@@ -91,8 +91,21 @@ groups:
           execErrState: Error
           for: 10m
           annotations:
-            description: No metrics have been received from workload cluster {{ $labels.cluster }} for 10 minutes. This could indicate Alloy is not running, network issues between the workload and control room, or the workload cluster is down.
-            summary: "Workload metrics silent!\nCluster: {{ $labels.cluster }}"
+            summary: "🔴 CRITICAL: Workload Metrics Silent"
+            description: |
+              No metrics received from workload cluster
+
+              ─── WHERE ───────────────────────────
+              Tenant:      {{ $labels.tenant_name }}
+              Cluster:     {{ $labels.cluster }}
+
+              ─── DETAILS ─────────────────────────
+              Component:   Metrics Pipeline
+              Issue:       No metrics received
+              Duration:    10 minutes
+
+              📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/workload-metrics-silent.md
+              📊 https://grafana.example.com/d/mimir?var-cluster={{ $labels.cluster }}
           labels:
             opsgenie: "1"
           isPaused: false

From 245ffa68398a7cd20303abf47d5a93437774fc00 Mon Sep 17 00:00:00 2001
From: Anna Williamson <annamdove@gmail.com>
Date: Thu, 26 Feb 2026 08:49:35 -0800
Subject: [PATCH 06/19] add tenant name field to blackbox healthcheck alerts

---
 python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py
index 12a8a09..fd585ae 100644
--- a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py
+++ b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py
@@ -127,6 +127,7 @@ def _is_fqdn_health_check_enabled(self, site_dict: dict[str, typing.Any] | None)
 
     def _define_blackbox_targets(self) -> str:
         output = ""
+        tenant_name = self.workload.cfg.tenant_name or self.workload.compound_name
 
         for site_name, site_config in self.workload.cfg.sites.items():
             # Parse site YAML once for this site
@@ -150,6 +151,7 @@ def _define_blackbox_targets(self) -> str:
                   address = {internal_address}
                   module = "{component.module_name}"
                   labels = {{
+                    "tenant_name" = "{tenant_name}",
                     "ptd_site" = "{site_name}",
                     "ptd_component" = "{lower_name}",
                     "check_type" = "internal",
@@ -168,6 +170,7 @@ def _define_blackbox_targets(self) -> str:
                       address = {fqdn_address}
                       module = "{component.module_name}"
                       labels = {{
+                        "tenant_name" = "{tenant_name}",
                         "ptd_site" = "{site_name}",
                         "ptd_component" = "{lower_name}",
                         "check_type" = "fqdn",

From 523b491e5521ffe55ffd6da6e498f213a0cdc78b Mon Sep 17 00:00:00 2001
From: Anna Williamson <annamdove@gmail.com>
Date: Thu, 26 Feb 2026 08:54:44 -0800
Subject: [PATCH 07/19] remove labels output and send tags only

---
 python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py
index a7023af..c5baf67 100644
--- a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py
+++ b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py
@@ -1982,6 +1982,7 @@ def with_grafana(
                                             "settings": {
                                                 "apiKey": '${{ "{" }}POSIT_OPSGENIE_KEY{{ "}" }}',  # ${POSIT_OPSGENIE_KEY} in the resulting configMap,
                                                 "apiUrl": "https://api.opsgenie.com/v2/alerts",
+                                                "sendTagsAs": "tags",
                                             },
                                         }
                                     ],

From d331bf0ebce3b4d06c49664bd6c312f002b612f8 Mon Sep 17 00:00:00 2001
From: Anna Williamson <annamdove@gmail.com>
Date: Thu, 26 Feb 2026 11:35:25 -0800
Subject: [PATCH 08/19] fix tenant_name label on metrics

---
 python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py
index fd585ae..ead1744 100644
--- a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py
+++ b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py
@@ -574,6 +574,12 @@ def _define_config_map(
                         target_label = "cluster"
                         replacement  = "{cluster_name}"
                     }}
+
+                    rule {{
+                        action       = "replace"
+                        target_label = "tenant_name"
+                        replacement  = "{tenant_name}"
+                    }}
                 }}
 
                 prometheus.remote_write "control_room" {{

From 27fb7526370c961012ab85d80bd69355ac3aac9e Mon Sep 17 00:00:00 2001
From: Anna Williamson <annamdove@gmail.com>
Date: Thu, 26 Feb 2026 14:14:34 -0800
Subject: [PATCH 09/19] update alert template fields

---
 python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py
index c5baf67..ee2fa86 100644
--- a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py
+++ b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py
@@ -1983,6 +1983,8 @@ def with_grafana(
                                                 "apiKey": '${{ "{" }}POSIT_OPSGENIE_KEY{{ "}" }}',  # ${POSIT_OPSGENIE_KEY} in the resulting configMap,
                                                 "apiUrl": "https://api.opsgenie.com/v2/alerts",
                                                 "sendTagsAs": "tags",
+                                                "message": "{{ .CommonAnnotations.summary }}",
+                                                "description": "{{ .CommonAnnotations.description }}",
                                             },
                                         }
                                     ],

From dac790c1d132950ded95010a0cd17ca104f4983f Mon Sep 17 00:00:00 2001
From: Anna Williamson <annamdove@gmail.com>
Date: Thu, 26 Feb 2026 14:17:58 -0800
Subject: [PATCH 10/19] fix field format, escape characters

---
 python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py
index ee2fa86..320570d 100644
--- a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py
+++ b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py
@@ -1983,8 +1983,8 @@ def with_grafana(
                                                 "apiKey": '${{ "{" }}POSIT_OPSGENIE_KEY{{ "}" }}',  # ${POSIT_OPSGENIE_KEY} in the resulting configMap,
                                                 "apiUrl": "https://api.opsgenie.com/v2/alerts",
                                                 "sendTagsAs": "tags",
-                                                "message": "{{ .CommonAnnotations.summary }}",
-                                                "description": "{{ .CommonAnnotations.description }}",
+                                                "message": '{{ "{{" }} .CommonAnnotations.summary {{ "}}" }}',
+                                                "description": '{{ "{{" }} .CommonAnnotations.description {{ "}}" }}',
                                             },
                                         }
                                     ],

From 0637b306bb51968ebcbacc940ccee41209df6f3b Mon Sep 17 00:00:00 2001
From: Anna Williamson <annamdove@gmail.com>
Date: Thu, 26 Feb 2026 15:03:42 -0800
Subject: [PATCH 11/19] add namespace and pod, and fix formatting issues

---
 .../src/ptd/grafana_alerts/applications.yaml  |  2 --
 .../src/ptd/grafana_alerts/cloudwatch.yaml    |  6 -----
 .../src/ptd/grafana_alerts/healthchecks.yaml  |  2 --
 .../src/ptd/grafana_alerts/mimir.yaml         |  2 --
 .../src/ptd/grafana_alerts/nodes.yaml         |  6 -----
 .../src/ptd/grafana_alerts/pods.yaml          | 25 ++++++-------------
 6 files changed, 8 insertions(+), 35 deletions(-)

diff --git a/python-pulumi/src/ptd/grafana_alerts/applications.yaml b/python-pulumi/src/ptd/grafana_alerts/applications.yaml
index 0e3c089..7f36bb7 100644
--- a/python-pulumi/src/ptd/grafana_alerts/applications.yaml
+++ b/python-pulumi/src/ptd/grafana_alerts/applications.yaml
@@ -78,8 +78,6 @@ groups:
               Issue:       WAL disk full
               Duration:    5 minutes
 
-              📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/loki-wal-disk-full.md
-              📊 https://grafana.example.com/d/loki?var-cluster={{ $labels.cluster }}
           labels:
             opsgenie: "1"
           isPaused: false
diff --git a/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml b/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml
index a2af562..076c603 100644
--- a/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml
+++ b/python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml
@@ -79,8 +79,6 @@ groups:
               Threshold:   80%
               Duration:    5 minutes
 
-              📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/fsx-capacity-low.md
-              📊 https://grafana.example.com/d/cloudwatch?var-cluster={{ $labels.cluster }}
           labels:
             opsgenie: "1"
           isPaused: false
@@ -156,8 +154,6 @@ groups:
               Threshold:   300 MiB/s
               Duration:    5 minutes
 
-              📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/ec2-network-high.md
-              📊 https://grafana.example.com/d/cloudwatch?var-cluster={{ $labels.cluster }}
           labels:
             opsgenie: "1"
           isPaused: false
@@ -229,8 +225,6 @@ groups:
               Threshold:   400,000 packets/s
               Duration:    5 minutes
 
-              📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/ec2-network-high.md
-              📊 https://grafana.example.com/d/cloudwatch?var-cluster={{ $labels.cluster }}
           labels:
             opsgenie: "1"
           isPaused: false
diff --git a/python-pulumi/src/ptd/grafana_alerts/healthchecks.yaml b/python-pulumi/src/ptd/grafana_alerts/healthchecks.yaml
index 9609afc..0922c8b 100644
--- a/python-pulumi/src/ptd/grafana_alerts/healthchecks.yaml
+++ b/python-pulumi/src/ptd/grafana_alerts/healthchecks.yaml
@@ -84,8 +84,6 @@ groups:
               Status:      Non-200 response
               Duration:    5 minutes
 
-              📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/health-check-failed.md
-              📊 https://grafana.example.com/d/healthchecks?var-cluster={{ $labels.cluster }}
           labels:
             opsgenie: "1"
           isPaused: false
diff --git a/python-pulumi/src/ptd/grafana_alerts/mimir.yaml b/python-pulumi/src/ptd/grafana_alerts/mimir.yaml
index 1e966a2..b5f6779 100644
--- a/python-pulumi/src/ptd/grafana_alerts/mimir.yaml
+++ b/python-pulumi/src/ptd/grafana_alerts/mimir.yaml
@@ -104,8 +104,6 @@ groups:
               Issue:       No metrics received
               Duration:    10 minutes
 
-              📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/workload-metrics-silent.md
-              📊 https://grafana.example.com/d/mimir?var-cluster={{ $labels.cluster }}
           labels:
             opsgenie: "1"
           isPaused: false
diff --git a/python-pulumi/src/ptd/grafana_alerts/nodes.yaml b/python-pulumi/src/ptd/grafana_alerts/nodes.yaml
index 697bdf8..bb6f91d 100644
--- a/python-pulumi/src/ptd/grafana_alerts/nodes.yaml
+++ b/python-pulumi/src/ptd/grafana_alerts/nodes.yaml
@@ -68,8 +68,6 @@ groups:
               Condition:   NotReady
               Duration:    15 minutes
 
-              📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/node-not-ready.md
-              📊 https://grafana.example.com/d/nodes?var-cluster={{ $labels.cluster }}
           labels:
             opsgenie: "1"
           isPaused: false
@@ -136,8 +134,6 @@ groups:
               Condition:   MemoryPressure
               Duration:    15 minutes
 
-              📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/node-memory-pressure.md
-              📊 https://grafana.example.com/d/nodes?var-cluster={{ $labels.cluster }}
           labels:
             opsgenie: "1"
           isPaused: false
@@ -204,8 +200,6 @@ groups:
               Condition:   DiskPressure
               Duration:    15 minutes
 
-              📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/node-disk-pressure.md
-              📊 https://grafana.example.com/d/nodes?var-cluster={{ $labels.cluster }}
           labels:
             opsgenie: "1"
           isPaused: false
diff --git a/python-pulumi/src/ptd/grafana_alerts/pods.yaml b/python-pulumi/src/ptd/grafana_alerts/pods.yaml
index 2efff1c..5019c50 100644
--- a/python-pulumi/src/ptd/grafana_alerts/pods.yaml
+++ b/python-pulumi/src/ptd/grafana_alerts/pods.yaml
@@ -29,7 +29,7 @@ groups:
                     type: prometheus
                     uid: mimir
                 editorMode: code
-                expr: count by (cluster, container)(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"})
+                expr: count by (cluster, namespace, pod, container)(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"})
                 instant: true
                 intervalMs: 1000
                 legendFormat: __auto
@@ -103,14 +103,14 @@ groups:
               ─── WHERE ───────────────────────────
               Tenant:      {{ $labels.tenant_name }}
               Cluster:     {{ $labels.cluster }}
+              Namespace:   {{ $labels.namespace }}
+              Pod:         {{ $labels.pod }}
               Container:   {{ $labels.container }}
 
               ─── DETAILS ─────────────────────────
               Status:      CrashLoopBackOff
               Duration:    5 minutes
 
-              📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/crash-loop-backoff.md
-              📊 https://grafana.example.com/d/pods?var-cluster={{ $labels.cluster }}
           labels:
             opsgenie: "1"
           isPaused: false
@@ -129,7 +129,7 @@ groups:
                     uid: mimir
                 disableTextWrap: false
                 editorMode: code
-                expr: count by(cluster, pod, reason) (kube_pod_container_status_terminated_reason{reason!="Completed"} * on(cluster,pod) group_left(label_launcher_instance_id) kube_pod_labels{label_launcher_instance_id=""})
+                expr: count by(cluster, namespace, pod, reason) (kube_pod_container_status_terminated_reason{reason!="Completed"} * on(cluster,pod) group_left(label_launcher_instance_id) kube_pod_labels{label_launcher_instance_id=""})
                 fullMetaSearch: false
                 includeNullMetadata: true
                 instant: true
@@ -208,14 +208,13 @@ groups:
               ─── WHERE ───────────────────────────
               Tenant:      {{ $labels.tenant_name }}
               Cluster:     {{ $labels.cluster }}
+              Namespace:   {{ $labels.namespace }}
               Pod:         {{ $labels.pod }}
 
               ─── DETAILS ─────────────────────────
               Reason:      {{ $labels.reason }}
               Duration:    5 minutes
 
-              📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/pod-error.md
-              📊 https://grafana.example.com/d/pods?var-cluster={{ $labels.cluster }}
           labels:
             opsgenie: "1"
           isPaused: false
@@ -234,7 +233,7 @@ groups:
                     uid: mimir
                 disableTextWrap: false
                 editorMode: code
-                expr: sum by (cluster, pod, phase) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"} * on(cluster,pod) group_left(label_launcher_instance_id) kube_pod_labels{label_launcher_instance_id=""}) > 0
+                expr: sum by (cluster, namespace, pod, phase) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"} * on(cluster,pod) group_left(label_launcher_instance_id) kube_pod_labels{label_launcher_instance_id=""}) > 0
                 fullMetaSearch: false
                 includeNullMetadata: true
                 instant: true
@@ -309,14 +308,13 @@ groups:
               ─── WHERE ───────────────────────────
               Tenant:      {{ $labels.tenant_name }}
               Cluster:     {{ $labels.cluster }}
+              Namespace:   {{ $labels.namespace }}
               Pod:         {{ $labels.pod }}
 
               ─── DETAILS ─────────────────────────
               Phase:       {{ $labels.phase }}
               Duration:    15 minutes
 
-              📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/pod-not-healthy.md
-              📊 https://grafana.example.com/d/pods?var-cluster={{ $labels.cluster }}
           labels:
             opsgenie: "1"
           isPaused: false
@@ -418,11 +416,8 @@ groups:
               Pod:         {{ $labels.pod }}
 
               ─── DETAILS ─────────────────────────
-              Restarts:    {{ $value }} in 15 minutes
-              Threshold:   > 5 restarts
+              Issue:       > 5 restarts in 15 minutes
 
-              📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/pod-restarts.md
-              📊 https://grafana.example.com/d/pods?var-cluster={{ $labels.cluster }}
           labels:
             opsgenie: "1"
           isPaused: false
@@ -520,8 +515,6 @@ groups:
               Status:      Replicas mismatch
               Duration:    15 minutes
 
-              📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/deployment-replica-mismatch.md
-              📊 https://grafana.example.com/d/deployments?var-cluster={{ $labels.cluster }}
           labels:
             opsgenie: "1"
           isPaused: false
@@ -619,8 +612,6 @@ groups:
               Status:      Replicas mismatch
               Duration:    15 minutes
 
-              📖 https://github.com/posit-dev/ptd/blob/main/docs/runbooks/statefulset-replica-mismatch.md
-              📊 https://grafana.example.com/d/statefulsets?var-cluster={{ $labels.cluster }}
           labels:
             opsgenie: "1"
           isPaused: false

From a6de747baad5b6f1cf674971b60eb1d52223febb Mon Sep 17 00:00:00 2001
From: Anna Williamson <annamdove@gmail.com>
Date: Fri, 27 Feb 2026 14:08:09 -0800
Subject: [PATCH 12/19] fix comment characters in alloy config

---
 .../src/ptd/pulumi_resources/grafana_alloy.py | 28 +++++++++----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py
index ead1744..07fd538 100644
--- a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py
+++ b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py
@@ -337,8 +337,8 @@ def _define_cloudwatch_config(self) -> str:
                     type    = "AWS/NATGateway"
                     regions = ["{self.region}"]
 
-                    # NAT Gateways inherit VPC tags including posit.team/true-name
-                    # (see python-pulumi/src/ptd/pulumi_resources/aws_vpc.py:607-616)
+                    // NAT Gateways inherit VPC tags including posit.team/true-name
+                    // (see python-pulumi/src/ptd/pulumi_resources/aws_vpc.py:607-616)
                     search_tags = {{
                         "posit.team/true-name" = "{self.workload.cfg.true_name}",
                     }}
@@ -360,12 +360,12 @@ def _define_cloudwatch_config(self) -> str:
                     type    = "AWS/ApplicationELB"
                     regions = ["{self.region}"]
 
-                    # ALBs are tagged at creation time via aws_workload_helm.py.
-                    # LBs provisioned before this tag was added won't be discovered
-                    # until the cluster is redeployed.
-                    # FIXME: To tag existing ALBs without redeploying, use the AWS CLI:
-                    #   aws elbv2 add-tags --resource-arns <ALB_ARN> \
-                    #     --tags Key=posit.team/true-name,Value=<true_name>
+                    // ALBs are tagged at creation time via aws_workload_helm.py.
+                    // LBs provisioned before this tag was added won't be discovered
+                    // until the cluster is redeployed.
+                    // FIXME: To tag existing ALBs without redeploying, use the AWS CLI:
+                    //   aws elbv2 add-tags --resource-arns <ALB_ARN>
+                    //     --tags Key=posit.team/true-name,Value=<true_name>
                     search_tags = {{
                         "posit.team/true-name" = "{self.workload.cfg.true_name}",
                     }}
@@ -393,12 +393,12 @@ def _define_cloudwatch_config(self) -> str:
                     type    = "AWS/NetworkELB"
                     regions = ["{self.region}"]
 
-                    # NLBs are tagged at creation time via traefik.py.
-                    # LBs provisioned before this tag was added won't be discovered
-                    # until the cluster is redeployed.
-                    # FIXME: To tag existing NLBs without redeploying, use the AWS CLI:
-                    #   aws elbv2 add-tags --resource-arns <NLB_ARN> \
-                    #     --tags Key=posit.team/true-name,Value=<true_name>
+                    // NLBs are tagged at creation time via traefik.py.
+                    // LBs provisioned before this tag was added won't be discovered
+                    // until the cluster is redeployed.
+                    // FIXME: To tag existing NLBs without redeploying, use the AWS CLI:
+                    //   aws elbv2 add-tags --resource-arns <NLB_ARN>
+                    //     --tags Key=posit.team/true-name,Value=<true_name>
                     search_tags = {{
                         "posit.team/true-name" = "{self.workload.cfg.true_name}",
                     }}

From 90cc64063dc8ea3b80c9e04ffc70b35ea49286b2 Mon Sep 17 00:00:00 2001
From: Anna Williamson <annamdove@gmail.com>
Date: Fri, 27 Feb 2026 15:41:20 -0800
Subject: [PATCH 13/19] fix pound sign breaking alloy

---
 .../src/ptd/pulumi_resources/grafana_alloy.py    | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py
index 07fd538..912ca92 100644
--- a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py
+++ b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py
@@ -267,12 +267,12 @@ def _define_cloudwatch_config(self) -> str:
                         period     = "5m"
                     }}
 
-                    # TODO: Remove ["Sum"] from statistics once all Grafana dashboards have
-                    # been updated to query aws_rds_database_connections_average.
-                    # Collecting both Sum and Average during migration. Average is the
-                    # target metric (aws_rds_database_connections_average); Sum
-                    # (aws_rds_database_connections_sum) is kept temporarily for existing
-                    # dashboards. NOTE: Keeping Sum doubles the CloudWatch API cost for this metric.
+                    // TODO: Remove ["Sum"] from statistics once all Grafana dashboards have
+                    // been updated to query aws_rds_database_connections_average.
+                    // Collecting both Sum and Average during migration. Average is the
+                    // target metric (aws_rds_database_connections_average). Sum
+                    // (aws_rds_database_connections_sum) is kept temporarily for existing
+                    // dashboards. NOTE: Keeping Sum doubles the CloudWatch API cost for this metric.
                     metric {{
                         name       = "DatabaseConnections"
                         statistics = ["Average", "Sum"]
@@ -297,14 +297,14 @@ def _define_cloudwatch_config(self) -> str:
                         period     = "5m"
                     }}
 
-                    # Collected for dashboard visibility; no alert rules defined
+                    // Collected for dashboard visibility, no alert rules defined
                     metric {{
                         name       = "WriteLatency"
                         statistics = ["Average"]
                         period     = "5m"
                     }}
 
-                    # Collected for dashboard visibility; no alert rules defined
+                    // Collected for dashboard visibility, no alert rules defined
                     metric {{
                         name       = "Deadlocks"
                         statistics = ["Sum"]

From bbde877ef2b952087fd24ead93319f6613a7c713 Mon Sep 17 00:00:00 2001
From: Anna Williamson <annamdove@gmail.com>
Date: Tue, 3 Mar 2026 14:06:01 -0800
Subject: [PATCH 14/19] update example config yaml

---
 examples/workload/ptd.yaml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/examples/workload/ptd.yaml b/examples/workload/ptd.yaml
index c04a275..4a867c0 100644
--- a/examples/workload/ptd.yaml
+++ b/examples/workload/ptd.yaml
@@ -11,6 +11,9 @@ spec:
   # AWS account ID where the workload will be deployed
   account_id: "123456789012"
 
+  # Human-readable tenant name for alerts (defaults to compound_name if not set)
+  tenant_name: "Example Analytics Team"
+
   # Control room that manages this workload
   control_room_account_id: "123456789012"
   control_room_cluster_name: control-room-production

From 2c4a0e9ff2db84d46bbec8b7a74e828a318d4d0f Mon Sep 17 00:00:00 2001
From: Anna Williamson <annamdove@gmail.com>
Date: Tue, 3 Mar 2026 15:16:38 -0800
Subject: [PATCH 15/19] update grouping to not combine healthchecks

---
 python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py
index 9f260fc..1ddf635 100644
--- a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py
+++ b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py
@@ -1991,7 +1991,7 @@ def with_grafana(
                                 {
                                     "orgId": 1,
                                     "receiver": "posit-opsgenie",
-                                    "group_by": ["alertname", "cluster"],
+                                    "group_by": ["alertname", "cluster", "ptd_component"],
                                     "matchers": ["opsgenie = 1"],
                                     "group_wait": "30s",
                                     "group_interval": "5m",

From 844ae6ec80b6a8251f06f4cd045c53341b9d2a40 Mon Sep 17 00:00:00 2001
From: Anna Williamson <annamdove@gmail.com>
Date: Wed, 4 Mar 2026 15:45:04 -0800
Subject: [PATCH 16/19] healthcheck format and dedupe by type

---
 .../ptd/pulumi_resources/aws_eks_cluster.py   | 23 +++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py
index 1ddf635..b44f690 100644
--- a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py
+++ b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py
@@ -1963,6 +1963,21 @@ def with_grafana(
                 ),
                 values={
                     "alerting": {
+                        # Custom notification templates for clean alert formatting.
+                        # These templates output ONLY our formatted content without
+                        # Grafana's default prefix (Firing, Value, Labels, Annotations).
+                        "templates.yaml": {
+                            "apiVersion": 1,
+                            "templates": [
+                                {
+                                    "orgId": 1,
+                                    "name": "ptd_templates",
+                                    # Template outputs just the description annotation for each alert.
+                                    # This avoids Grafana's default verbose format.
+                                    "template": '{{ "{{" }} define "ptd.description" {{ "}}" }}{{ "{{" }} range .Alerts {{ "}}" }}{{ "{{" }} .Annotations.description {{ "}}" }}{{ "{{" }} end {{ "}}" }}{{ "{{" }} end {{ "}}" }}',
+                                }
+                            ],
+                        },
                         "contactpoints.yaml": {
                             "apiVersion": 1,
                             "contactPoints": [
@@ -1978,7 +1993,8 @@ def with_grafana(
                                                 "apiUrl": "https://api.opsgenie.com/v2/alerts",
                                                 "sendTagsAs": "tags",
                                                 "message": '{{ "{{" }} .CommonAnnotations.summary {{ "}}" }}',
-                                                "description": '{{ "{{" }} .CommonAnnotations.description {{ "}}" }}',
+                                                # Use custom template for clean description without label dumps
+                                                "description": '{{ "{{" }} template "ptd.description" . {{ "}}" }}',
                                             },
                                         }
                                     ],
@@ -1991,7 +2007,10 @@ def with_grafana(
                                 {
                                     "orgId": 1,
                                     "receiver": "posit-opsgenie",
-                                    "group_by": ["alertname", "cluster", "ptd_component"],
+                                    # health_check_url ensures each health check endpoint gets its own alert
+                                    # (internal vs fqdn checks are separate). This label is empty for
+                                    # non-healthcheck alerts so it won't affect their grouping.
+                                    "group_by": ["alertname", "cluster", "ptd_component", "health_check_url"],
                                     "matchers": ["opsgenie = 1"],
                                     "group_wait": "30s",
                                     "group_interval": "5m",

From 76cd3d8f001fa637b739d1e9158d3d184190e1cd Mon Sep 17 00:00:00 2001
From: Anna Williamson <annamdove@gmail.com>
Date: Wed, 4 Mar 2026 15:48:01 -0800
Subject: [PATCH 17/19] add back source and silence links

---
 python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py
index b44f690..7d97f0c 100644
--- a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py
+++ b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py
@@ -1972,9 +1972,9 @@ def with_grafana(
                                 {
                                     "orgId": 1,
                                     "name": "ptd_templates",
-                                    # Template outputs just the description annotation for each alert.
-                                    # This avoids Grafana's default verbose format.
-                                    "template": '{{ "{{" }} define "ptd.description" {{ "}}" }}{{ "{{" }} range .Alerts {{ "}}" }}{{ "{{" }} .Annotations.description {{ "}}" }}{{ "{{" }} end {{ "}}" }}{{ "{{" }} end {{ "}}" }}',
+                                    # Template outputs description annotation + Source/Silence links for each alert.
+                                    # This avoids Grafana's default verbose format while keeping useful links.
+                                    "template": '{{ "{{" }} define "ptd.description" {{ "}}" }}{{ "{{" }} range .Alerts {{ "}}" }}{{ "{{" }} .Annotations.description {{ "}}" }}\n\nSource: {{ "{{" }} .GeneratorURL {{ "}}" }}\nSilence: {{ "{{" }} .SilenceURL {{ "}}" }}{{ "{{" }} end {{ "}}" }}{{ "{{" }} end {{ "}}" }}',
                                 }
                             ],
                         },

From 758147e9e91d40e9693efd4af091da22e85c0653 Mon Sep 17 00:00:00 2001
From: Anna Williamson <annamdove@gmail.com>
Date: Thu, 5 Mar 2026 11:16:32 -0800
Subject: [PATCH 18/19] missed a query update to include container in
 appropriate alerts

---
 python-pulumi/src/ptd/grafana_alerts/pods.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python-pulumi/src/ptd/grafana_alerts/pods.yaml b/python-pulumi/src/ptd/grafana_alerts/pods.yaml
index 70fe8c7..5ffda98 100644
--- a/python-pulumi/src/ptd/grafana_alerts/pods.yaml
+++ b/python-pulumi/src/ptd/grafana_alerts/pods.yaml
@@ -49,7 +49,7 @@ groups:
                     type: prometheus
                     uid: mimir
                 editorMode: code
-                expr: count by (cluster, namespace, pod)(kube_pod_container_status_waiting_reason{namespace=~"posit-team|posit-team-system|alloy|mimir|loki|grafana",reason="CrashLoopBackOff"})
+                expr: count by (cluster, namespace, pod, container)(kube_pod_container_status_waiting_reason{namespace=~"posit-team|posit-team-system|alloy|mimir|loki|grafana",reason="CrashLoopBackOff"})
                 instant: true
                 intervalMs: 1000
                 legendFormat: __auto

From 175c995e37a477c9380ffcc3b029bebc215d6f98 Mon Sep 17 00:00:00 2001
From: Anna Williamson <annamdove@gmail.com>
Date: Thu, 5 Mar 2026 11:21:21 -0800
Subject: [PATCH 19/19] set tenant_name as an external label for consistency

---
 .../src/ptd/pulumi_resources/grafana_alloy.py        | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py
index 866b728..fa9235c 100644
--- a/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py
+++ b/python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py
@@ -589,15 +589,12 @@ def _define_config_map(
                         target_label = "cluster"
                         replacement  = "{cluster_name}"
                     }}
-
-                    rule {{
-                        action       = "replace"
-                        target_label = "tenant_name"
-                        replacement  = "{tenant_name}"
-                    }}
                 }}
 
                 prometheus.remote_write "control_room" {{
+                    external_labels = {{
+                        tenant_name = "{tenant_name}",
+                    }}
                     endpoint {{
                         url = "{control_room_url}"
                         basic_auth {{
@@ -611,6 +608,9 @@ def _define_config_map(
                 }}
 
                 prometheus.remote_write "workload" {{
+                    external_labels = {{
+                        tenant_name = "{tenant_name}",
+                    }}
                     endpoint {{
                         url = "{workload_url}"
                     }}