Merged
Commits
25 commits
0aaea2f
docs: update monitoring doc with alert format
amdove Feb 25, 2026
8d8b8a0
adding new field for friendly tenant name
amdove Feb 25, 2026
4f166f6
updating format for existing k8s alerts
amdove Feb 25, 2026
9358880
updating format for existing healthcheck alerts
amdove Feb 25, 2026
f93910d
Merge branch 'main' into alert-reformat
amdove Feb 25, 2026
f41c527
updating format for cloud and system alerts
amdove Feb 25, 2026
278d506
Merge branch 'main' into alert-reformat
amdove Feb 26, 2026
245ffa6
add tenant name field to blackbox healthcheck alerts
amdove Feb 26, 2026
523b491
remove labels output and send tags only
amdove Feb 26, 2026
d331bf0
fix tenant_label on metrics
amdove Feb 26, 2026
27fb752
update alert template fields
amdove Feb 26, 2026
dac790c
fix field format, escape characters
amdove Feb 26, 2026
0637b30
add namespace and pod, and fix formatting issues
amdove Feb 26, 2026
94d92a3
Merge branch 'main' into alert-reformat
amdove Feb 27, 2026
a6de747
fix comment characters in alloy config
amdove Feb 27, 2026
90cc640
fix pound sign breaking alloy
amdove Feb 27, 2026
1096cf8
Merge branch 'main' into alert-reformat
amdove Mar 3, 2026
a301227
Merge branch 'main' into alert-reformat
amdove Mar 3, 2026
bbde877
update example config yaml
amdove Mar 3, 2026
2c4a0e9
update grouping to not combine healthchecks
amdove Mar 3, 2026
8d31abc
Merge branch 'main' into alert-reformat
amdove Mar 4, 2026
844ae6e
healthcheck format and dedupe by type
amdove Mar 4, 2026
76cd3d8
add back source and silence links
amdove Mar 4, 2026
758147e
missed a query update to include container in appropriate alerts
amdove Mar 5, 2026
175c995
this should have been an external label for consistency
amdove Mar 5, 2026
36 changes: 36 additions & 0 deletions docs/guides/monitoring.md
@@ -532,6 +532,42 @@ PTD deploys a set of Grafana alerts to the control room for centralized monitori

All alerts are configured to send notifications to OpsGenie when triggered.

### Alert Format

Alerts use a standardized format for consistency across all alert types:

```
[🔴 CRITICAL | 🟡 WARNING]: [Title]

[Description]

─── WHERE ───────────────────────────
Tenant: [tenant name] (the organization or group the workload cluster is provisioned for)
Cluster: [cluster name]
Component: [affected component]

─── DETAILS ─────────────────────────
[Key]: [Value]
[Key]: [Value]
...

📖 [runbook link]
📊 [dashboard link]
```

**Severity levels:**
- 🔴 **CRITICAL** — Immediate action required
- 🟡 **WARNING** — Investigate soon

**Alert types and their WHERE/DETAILS fields:**

| Type | WHERE | DETAILS |
|------|-------|---------|
| Health Check | Tenant, Cluster, Product | Endpoint, Status, Response Time, Down Since |
| Kubernetes | Tenant, Cluster, Namespace, Pod/Node | Varies by alert (restarts, replicas, conditions) |
| Cloud (AWS) | Tenant, Cluster, Resource, Region | Metric, Current, Threshold, Duration |
| Cloud (Azure) | Tenant, Cluster, Resource, Location | Metric, Current, Threshold, Duration |
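As an illustration, a health-check alert in this format might render as follows (all values here are hypothetical, chosen only to show the documented fields):

```
🔴 CRITICAL: Health Check Failed

Product health check returning non-200 response

─── WHERE ───────────────────────────
Tenant: Example Analytics Team
Cluster: workload-prod-1
Product: api

─── DETAILS ─────────────────────────
Endpoint: https://example.com/healthz
Status: 503
Response Time: timeout
Down Since: 2026-03-05 14:02 UTC

📖 [runbook link]
📊 [dashboard link]
```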

### Application Alerts

| Alert | Threshold | Duration | Description |
3 changes: 3 additions & 0 deletions examples/workload/ptd.yaml
@@ -11,6 +11,9 @@ spec:
# AWS account ID where the workload will be deployed
account_id: "123456789012"

# Human-readable tenant name for alerts (defaults to compound_name if not set)
tenant_name: "Example Analytics Team"

Comment on lines +14 to +16
Contributor: Nice!

# Control room that manages this workload
control_room_account_id: "123456789012"
control_room_cluster_name: control-room-production
1 change: 1 addition & 0 deletions python-pulumi/src/ptd/__init__.py
@@ -316,6 +316,7 @@ class WorkloadConfig:
network_trust: NetworkTrust
sites: typing.Mapping[str, SiteConfig]
true_name: str
tenant_name: str | None = dataclasses.field(default=None, kw_only=True) # Human-readable name for the tenant

@property
def domain(self) -> str:
17 changes: 15 additions & 2 deletions python-pulumi/src/ptd/grafana_alerts/applications.yaml
@@ -63,8 +63,21 @@ groups:
execErrState: Error
for: 5m
annotations:
description: Loki ingester has experienced WAL disk full failures. This indicates storage issues with the Loki WAL directory in cluster {{ $labels.cluster }}.
summary: Loki WAL disk full failures detected in {{ $labels.cluster }}
summary: "🔴 CRITICAL: Loki WAL Disk Full"
description: |
Loki ingester experiencing WAL disk full failures

─── WHERE ───────────────────────────
Tenant: {{ $labels.tenant_name }}
Cluster: {{ $labels.cluster }}
Namespace: {{ $labels.namespace }}
Pod: {{ $labels.pod }}

─── DETAILS ─────────────────────────
Component: Loki Ingester
Issue: WAL disk full
Duration: 5 minutes

labels:
opsgenie: "1"
isPaused: false
54 changes: 48 additions & 6 deletions python-pulumi/src/ptd/grafana_alerts/cloudwatch.yaml
@@ -63,8 +63,22 @@ groups:
execErrState: Error
for: 5m
annotations:
description: Evaluates FSx instance usage to determine if it is greater than 80% allocated.
summary: FSx instance {{$labels.dimension_FileSystemId}} has less than 20% of capacity remaining.
summary: "🟡 WARNING: FSx Storage Capacity Low"
description: |
FSx file system is running low on storage capacity

─── WHERE ───────────────────────────
Tenant: {{ $labels.tenant_name }}
Cluster: {{ $labels.cluster }}
Resource: {{ $labels.dimension_FileSystemId }}
Region: {{ $labels.region }}

─── DETAILS ─────────────────────────
Metric: Storage Capacity
Current: > 80% used
Threshold: 80%
Duration: 5 minutes

labels:
opsgenie: "1"
isPaused: false
@@ -124,8 +138,22 @@ groups:
execErrState: Error
for: 5m
annotations:
description: EC2 instance has high network outbound traffic (over 300 MiB/s) for more than 5 minutes.
summary: High network outbound traffic on EC2 instance {{$labels.dimension_InstanceId}} in cluster {{$labels.cluster}}
summary: "🟡 WARNING: EC2 Network Out High"
description: |
EC2 instance has unusually high outbound network traffic

─── WHERE ───────────────────────────
Tenant: {{ $labels.tenant_name }}
Cluster: {{ $labels.cluster }}
Resource: {{ $labels.dimension_InstanceId }}
Region: {{ $labels.region }}

─── DETAILS ─────────────────────────
Metric: Network Out
Current: > 300 MiB/s
Threshold: 300 MiB/s
Duration: 5 minutes

labels:
opsgenie: "1"
isPaused: false
@@ -181,8 +209,22 @@ groups:
execErrState: Error
for: 5m
annotations:
description: EC2 instance has an unusually high packet transmission rate (over 400,000 packets/s) for more than 5 minutes, which could indicate abnormal network activity.
summary: High network packet rate on EC2 instance {{$labels.dimension_InstanceId}} in cluster {{$labels.cluster}}
summary: "🟡 WARNING: EC2 Network Packets Out High"
description: |
EC2 instance has unusually high packet transmission rate

─── WHERE ───────────────────────────
Tenant: {{ $labels.tenant_name }}
Cluster: {{ $labels.cluster }}
Resource: {{ $labels.dimension_InstanceId }}
Region: {{ $labels.region }}

─── DETAILS ─────────────────────────
Metric: Network Packets Out
Current: > 400,000 packets/s
Threshold: 400,000 packets/s
Duration: 5 minutes

labels:
opsgenie: "1"
isPaused: false
18 changes: 16 additions & 2 deletions python-pulumi/src/ptd/grafana_alerts/healthchecks.yaml
@@ -68,8 +68,22 @@ groups:
execErrState: Error
for: 5m
annotations:
description: Calls the health check for each component of each PTD site and errors on non-200 response.
summary: "Health check failed! \nCluster: {{ $labels.cluster }} \nSite: {{ $labels.ptd_site }}\nComponent: {{ $labels.ptd_component }}\nCheck Type: {{ $labels.check_type }}\nURL: {{ $labels.health_check_url }}"
summary: "🔴 CRITICAL: Health Check Failed"
description: |
Product health check returning non-200 response

─── WHERE ───────────────────────────
Tenant: {{ $labels.tenant_name }}
Cluster: {{ $labels.cluster }}
Site: {{ $labels.ptd_site }}
Product: {{ $labels.ptd_component }}

─── DETAILS ─────────────────────────
Check Type: {{ $labels.check_type }}
Endpoint: {{ $labels.health_check_url }}
Status: Non-200 response
Duration: 5 minutes

labels:
opsgenie: "1"
isPaused: false
15 changes: 13 additions & 2 deletions python-pulumi/src/ptd/grafana_alerts/mimir.yaml
@@ -91,8 +91,19 @@ groups:
execErrState: Error
for: 10m
annotations:
description: No metrics have been received from workload cluster {{ $labels.cluster }} for 10 minutes. This could indicate Alloy is not running, network issues between the workload and control room, or the workload cluster is down.
summary: "Workload metrics silent!\nCluster: {{ $labels.cluster }}"
summary: "🔴 CRITICAL: Workload Metrics Silent"
description: |
No metrics received from workload cluster

─── WHERE ───────────────────────────
Tenant: {{ $labels.tenant_name }}
Cluster: {{ $labels.cluster }}

─── DETAILS ─────────────────────────
Component: Metrics Pipeline
Issue: No metrics received
Duration: 10 minutes

labels:
opsgenie: "1"
isPaused: false
45 changes: 39 additions & 6 deletions python-pulumi/src/ptd/grafana_alerts/nodes.yaml
@@ -55,8 +55,19 @@ groups:
execErrState: Error
for: 15m
annotations:
summary: Node not in ready state (instance {{ $labels.instance }})
description: "Node {{ $labels.node }} in cluster {{ $labels.cluster }} has been unready for 15 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: "🔴 CRITICAL: Node Not Ready"
description: |
Kubernetes node is not accepting workloads

─── WHERE ───────────────────────────
Tenant: {{ $labels.tenant_name }}
Cluster: {{ $labels.cluster }}
Node: {{ $labels.node }}

─── DETAILS ─────────────────────────
Condition: NotReady
Duration: 15 minutes

labels:
opsgenie: "1"
isPaused: false
@@ -110,8 +121,19 @@ groups:
execErrState: Error
for: 15m
annotations:
summary: Kubernetes Node memory pressure (instance {{ $labels.instance }})
description: "Node {{ $labels.node }} in cluster {{ $labels.cluster }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: "🟡 WARNING: Node Memory Pressure"
description: |
Node is running low on available memory

─── WHERE ───────────────────────────
Tenant: {{ $labels.tenant_name }}
Cluster: {{ $labels.cluster }}
Node: {{ $labels.node }}

─── DETAILS ─────────────────────────
Condition: MemoryPressure
Duration: 15 minutes

labels:
opsgenie: "1"
isPaused: false
@@ -165,8 +187,19 @@ groups:
execErrState: Error
for: 15m
annotations:
summary: Kubernetes Node disk pressure (instance {{ $labels.instance }})
description: "Node {{ $labels.node }} in cluster {{ $labels.cluster }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: "🟡 WARNING: Node Disk Pressure"
description: |
Node is running low on available disk space

─── WHERE ───────────────────────────
Tenant: {{ $labels.tenant_name }}
Cluster: {{ $labels.cluster }}
Node: {{ $labels.node }}

─── DETAILS ─────────────────────────
Condition: DiskPressure
Duration: 15 minutes

labels:
opsgenie: "1"
isPaused: false