From f204317499405b5931d1dba58642583a01f984ef Mon Sep 17 00:00:00 2001 From: Tim Margheim Date: Mon, 2 Mar 2026 15:09:56 -0700 Subject: [PATCH] Introduce namespace restriction on alerts --- CLAUDE.md | 18 +++++++++++ docs/guides/monitoring.md | 22 +++++++++++++ .../src/ptd/grafana_alerts/pods.yaml | 32 +++++++++++++++---- 3 files changed, 66 insertions(+), 6 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 2bb0e4c..9ae2ef7 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -143,6 +143,24 @@ git worktree remove ../.worktrees/ptd- - **ALWAYS** rebuild the binary after creating a worktree (`just build-cmd`) - Branch names: kebab-case, no slashes, no usernames (slashes break worktree directory paths) +## Monitoring and Alerts + +### Alert Namespace Scope + +Pod alerts (PodError, CrashLoopBackoff, DeploymentReplicaMismatch, etc.) are scoped to a minimal namespace allowlist to prevent false alerts from customer-deployed workloads: + +**Monitored Namespaces**: +- **Application**: `posit-team`, `posit-team-system` (direct customer impact) +- **Observability**: `alloy`, `mimir`, `loki`, `grafana` (failures cause monitoring blindness) + +**PromQL Filter**: `{namespace=~"posit-team|posit-team-system|alloy|mimir|loki|grafana"}` + +**Why Infrastructure Namespaces Are Excluded**: Infrastructure namespaces (Calico, Traefik, kube-system) are excluded because their failures manifest as application failures, avoiding redundant alerts. For example: +- CNI failure → Network breaks → Application pods fail → Alert fires for application namespace +- Ingress failure → HTTP checks fail → `Healthchecks` alert fires + +**Alert Configuration**: Alert definitions are in `python-pulumi/src/ptd/grafana_alerts/*.yaml`. All pod-related alerts in `pods.yaml` include the namespace filter in their PromQL queries. 
+ ## Contributing When contributing to the project: diff --git a/docs/guides/monitoring.md b/docs/guides/monitoring.md index a320a53..2e72098 100644 --- a/docs/guides/monitoring.md +++ b/docs/guides/monitoring.md @@ -577,6 +577,28 @@ All alerts are configured to send notifications to OpsGenie when triggered. | **Deployment Replicas Mismatch** | Desired != Available | 15m | Deployment does not have the expected number of available replicas | | **StatefulSet Replicas Mismatch** | Ready != Desired | 15m | StatefulSet does not have the expected number of ready replicas | +Pod-related alerts are filtered to a minimal allowlist of PTD-managed namespaces (not every PTD-managed namespace is monitored) to prevent false alerts for customer-deployed workloads. + +**Monitored Namespaces** (minimal allowlist): +- **Application namespaces**: `posit-team`, `posit-team-system` - Direct customer-facing applications where failures immediately impact users +- **Observability stack**: `alloy`, `mimir`, `loki`, `grafana` - Monitoring infrastructure failures cause blindness to other failures + +**Excluded Namespaces**: +- **Infrastructure namespaces** (`calico-system`, `traefik`, `kube-system`, `tigera-operator`, etc.) - Failures manifest as application failures, which trigger alerts naturally +- **Customer namespaces** (`default`, custom namespaces) - Outside PTD responsibility + +**Rationale**: The monitoring strategy follows a "monitor symptoms, not all infrastructure layers" approach. Infrastructure failures (CNI, ingress, storage) cascade to application failures, which trigger alerts. This prevents redundant alerts while ensuring PTD is notified of actual customer impact. The observability stack must be monitored directly since its failures would prevent other alerts from firing. 
+ +**PromQL Filter Pattern**: All pod alerts use the namespace filter: +```promql +{namespace=~"posit-team|posit-team-system|alloy|mimir|loki|grafana"} +``` + +**Example Failure Cascade**: +- Calico CNI pod crashes → Network connectivity breaks for application pods → Application pods become unhealthy → `PodNotHealthy` alert fires in `posit-team` namespace +- Traefik ingress pod crashes → Ingress routing breaks → HTTP health checks fail → `Healthchecks` alert fires +- Alloy pod crashes → Metrics/logs stop flowing → No alerts fire (blind) → **Must alert on Alloy pod failures directly** + ### Adding or Modifying Alerts To add or modify alerts, edit the YAML files in `python-pulumi/src/ptd/grafana_alerts/`. Each file contains alerts grouped by category: diff --git a/python-pulumi/src/ptd/grafana_alerts/pods.yaml b/python-pulumi/src/ptd/grafana_alerts/pods.yaml index 1422ab7..753fce6 100644 --- a/python-pulumi/src/ptd/grafana_alerts/pods.yaml +++ b/python-pulumi/src/ptd/grafana_alerts/pods.yaml @@ -8,6 +8,26 @@ # uid: pod_error # # See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/ +# +# ============================================================================= +# NAMESPACE FILTERING +# ============================================================================= +# All pod alerts in this file are filtered to the following PTD-managed namespaces: +# +# namespace=~"posit-team|posit-team-system|alloy|mimir|loki|grafana" +# +# This prevents false alerts from customer-deployed workloads outside PTD's control. +# +# Monitored namespaces: +# - posit-team, posit-team-system: Application namespaces (direct customer impact) +# - alloy, mimir, loki, grafana: Observability stack (failures cause blindness) +# +# Excluded namespaces (failures cascade to application alerts): +# - calico-system, tigera-operator, traefik, kube-system, etc. 
+# - default and customer-created namespaces +# +# To update the namespace filter, use find/replace on the regex pattern above. +# ============================================================================= apiVersion: 1 groups: - orgId: 1 @@ -29,7 +49,7 @@ groups: type: prometheus uid: mimir editorMode: code - expr: count by (cluster, container)(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"}) + expr: count by (cluster, container)(kube_pod_container_status_waiting_reason{namespace=~"posit-team|posit-team-system|alloy|mimir|loki|grafana",reason="CrashLoopBackOff"}) instant: true intervalMs: 1000 legendFormat: __auto @@ -115,7 +135,7 @@ groups: uid: mimir disableTextWrap: false editorMode: code - expr: count by(cluster, pod, reason) (kube_pod_container_status_terminated_reason{reason!="Completed"} * on(cluster,pod) group_left(label_launcher_instance_id) kube_pod_labels{label_launcher_instance_id=""}) + expr: count by(cluster, pod, reason) (kube_pod_container_status_terminated_reason{namespace=~"posit-team|posit-team-system|alloy|mimir|loki|grafana",reason!="Completed"} * on(cluster,pod) group_left(label_launcher_instance_id) kube_pod_labels{label_launcher_instance_id=""}) fullMetaSearch: false includeNullMetadata: true instant: true @@ -206,7 +226,7 @@ groups: uid: mimir disableTextWrap: false editorMode: code - expr: sum by (cluster, pod, phase) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"} * on(cluster,pod) group_left(label_launcher_instance_id) kube_pod_labels{label_launcher_instance_id=""}) > 0 + expr: sum by (cluster, pod, phase) (kube_pod_status_phase{namespace=~"posit-team|posit-team-system|alloy|mimir|loki|grafana",phase=~"Pending|Unknown|Failed"} * on(cluster,pod) group_left(label_launcher_instance_id) kube_pod_labels{label_launcher_instance_id=""}) > 0 fullMetaSearch: false includeNullMetadata: true instant: true @@ -294,7 +314,7 @@ groups: uid: mimir disableTextWrap: false editorMode: code - expr: 
avg_over_time(increase(kube_pod_container_status_restarts_total[15m])[15m:1m]) > 5 + expr: avg_over_time(increase(kube_pod_container_status_restarts_total{namespace=~"posit-team|posit-team-system|alloy|mimir|loki|grafana"}[15m])[15m:1m]) > 5 fullMetaSearch: false includeNullMetadata: true instant: false @@ -386,7 +406,7 @@ groups: uid: mimir disableTextWrap: false editorMode: code - expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available + expr: kube_deployment_spec_replicas{namespace=~"posit-team|posit-team-system|alloy|mimir|loki|grafana"} != kube_deployment_status_replicas_available{namespace=~"posit-team|posit-team-system|alloy|mimir|loki|grafana"} fullMetaSearch: false includeNullMetadata: true instant: true @@ -471,7 +491,7 @@ groups: uid: mimir disableTextWrap: false editorMode: code - expr: kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas + expr: kube_statefulset_status_replicas_ready{namespace=~"posit-team|posit-team-system|alloy|mimir|loki|grafana"} != kube_statefulset_status_replicas{namespace=~"posit-team|posit-team-system|alloy|mimir|loki|grafana"} fullMetaSearch: false includeNullMetadata: true instant: true