From 99e4cd7aa301722e0cbf5408e36e031e87eec70e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Th=C3=A9a=20Wang?=
Date: Thu, 5 Feb 2026 15:15:57 +0100
Subject: [PATCH] feat(alerts): add kubernetes-v2 alerts for modern Kubernetes
 clusters

- New alert set for Kubernetes 1.34+ clusters
- NoPVMetrics uses kube_persistentvolume_capacity_bytes (kube-state-metrics)
- Removed HighPersistentVolumeUsage and LowInodes (kubelet_volume_stats_*
  deprecated in k8s 1.34)
- Keep PVError (uses kube_persistentvolume_status_phase)
- Legacy 'kubernetes' alerts remain for older clusters
---
 .../kubernetes-v2/kubernetes.yaml             | 234 ++++++++++++++++++
 .../kubernetes-v2/kustomization.yaml          |   3 +
 .../kubernetes-v2/persistentvolume.yaml       |  30 +++
 3 files changed, 267 insertions(+)
 create mode 100644 grafana/grafana-alerts/kubernetes-v2/kubernetes.yaml
 create mode 100644 grafana/grafana-alerts/kubernetes-v2/kustomization.yaml
 create mode 100644 grafana/grafana-alerts/kubernetes-v2/persistentvolume.yaml

diff --git a/grafana/grafana-alerts/kubernetes-v2/kubernetes.yaml b/grafana/grafana-alerts/kubernetes-v2/kubernetes.yaml
new file mode 100644
index 0000000..415839f
--- /dev/null
+++ b/grafana/grafana-alerts/kubernetes-v2/kubernetes.yaml
@@ -0,0 +1,234 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: kubernetes
+spec:
+  groups:
+    - name: kubernetes
+      rules:
+        - alert: KubeVersionMismatch
+          expr: |
+            count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]+.[0-9]+).[0-9]+"))) > 1
+          for: 4h
+          labels:
+            alertowner: skyloud
+            severity: medium
+          annotations:
+            tenant_id: "{{ $labels.tenant_id }}"
+            env: "{{ $labels.env }}"
+            summary: Different semantic versions of Kubernetes components running
+    # Requires kube-state-metrics
+    - name: kubernetes-nodes
+      rules:
+        - alert: KubernetesNodeNotReady
+          expr: kube_node_status_condition{condition="Ready",status="true"} == 0
+          for: 15m
+          labels:
+            alertowner: skyloud
+            severity: critical
+          annotations:
+            tenant_id: "{{ $labels.tenant_id }}"
+            env: "{{ $labels.env }}"
+            summary: Node {{ $labels.instance }} is not ready
+
+        - alert: KubernetesNodeDiskPressure
+          expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
+          for: 1m
+          labels:
+            alertowner: skyloud
+            severity: critical
+          annotations:
+            tenant_id: "{{ $labels.tenant_id }}"
+            env: "{{ $labels.env }}"
+            summary: Node {{ $labels.instance }} is DiskPressure
+
+        - alert: KubernetesNodeMemoryPressure
+          expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
+          for: 1m
+          labels:
+            alertowner: skyloud
+            severity: critical
+          annotations:
+            tenant_id: "{{ $labels.tenant_id }}"
+            env: "{{ $labels.env }}"
+            summary: Node {{ $labels.instance }} is MemoryPressure
+
+        - alert: HostHighCPUUsage
+          expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle",job="node-exporter"}[5m])) * 100) > 80
+          for: 15m
+          labels:
+            alertowner: skyloud
+            severity: high
+          annotations:
+            tenant_id: "{{ $labels.tenant_id }}"
+            env: "{{ $labels.env }}"
+            summary: "CPU usage is {{ $value }}% on {{ $labels.instance }}"
+
+        - alert: HostHighMemoryUsage
+          expr: |
+            100*(1-node_memory_MemAvailable_bytes{job="node-exporter"}/node_memory_MemTotal_bytes{job="node-exporter"}) > 80
+          for: 15m
+          labels:
+            alertowner: skyloud
+            severity: critical
+          annotations:
+            tenant_id: "{{ $labels.tenant_id }}"
+            env: "{{ $labels.env }}"
+            summary: "RAM usage is {{ $value }}% on {{ $labels.instance }}"
+
+        # Normally there is no swap on kubernetes nodes
+        - alert: NodeHighSwapUsage
+          expr: |
+            sum(100 * ((node_memory_SwapTotal_bytes{job="node-exporter"} - node_memory_SwapFree_bytes{job="node-exporter"}) / (node_memory_SwapTotal_bytes{job="node-exporter"}))) by (instance) > 80
+          for: 15m
+          labels:
+            alertowner: skyloud
+            severity: critical
+          annotations:
+            tenant_id: "{{ $labels.tenant_id }}"
+            env: "{{ $labels.env }}"
+            summary: "SWAP usage is {{ $value }}% on {{ $labels.instance }}"
+
+        - alert: HostHighDiskUsage
+          expr: |
+            (100 - 100 * node_filesystem_avail_bytes{mountpoint="/etc/hostname",job="node-exporter"} / node_filesystem_size_bytes{mountpoint="/etc/hostname",job="node-exporter"}) > 80 and ON (instance, device) node_filesystem_readonly{mountpoint="/etc/hostname",job="node-exporter"} == 0
+          for: 15m
+          labels:
+            alertowner: skyloud
+            severity: critical
+          annotations:
+            tenant_id: "{{ $labels.tenant_id }}"
+            env: "{{ $labels.env }}"
+            summary: "Disk usage is {{ $value }}% on {{ $labels.instance }}:{{ $labels.mountpoint }}"
+
+        # Check all mountpoints except rootfs
+        - alert: HostHighDiskUsageExternal
+          expr: |
+            (100 - 100 * node_filesystem_avail_bytes{device!="/dev/root",job="node-exporter"} / node_filesystem_size_bytes{device!="/dev/root",job="node-exporter"}) > 80 and ON (instance, mountpoint) node_filesystem_readonly{device!="/dev/root",job="node-exporter"} == 0
+          for: 15m
+          labels:
+            alertowner: skyloud
+            severity: critical
+          annotations:
+            tenant_id: "{{ $labels.tenant_id }}"
+            env: "{{ $labels.env }}"
+            summary: "Disk usage is {{ $value }}% on {{ $labels.instance }}:{{ $labels.mountpoint }}"
+
+        - alert: HostOOMKill
+          expr: increase(node_vmstat_oom_kill{job="node-exporter"}[15m]) > 0
+          labels:
+            alertowner: skyloud
+            severity: critical
+          annotations:
+            tenant_id: "{{ $labels.tenant_id }}"
+            env: "{{ $labels.env }}"
+            summary: "{{ $value }} OOM killed processes on {{ $labels.instance }}"
+
+        - alert: NodeClockSkewDetected
+          expr: |
+            (
+              node_timex_offset_seconds{job="node-exporter"} > 0.05
+            and
+              deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
+            )
+            or
+            (
+              node_timex_offset_seconds{job="node-exporter"} < -0.05
+            and
+              deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
+            )
+          for: 10m
+          labels:
+            alertowner: skyloud
+            severity: medium
+          annotations:
+            tenant_id: "{{ $labels.tenant_id }}"
+            env: "{{ $labels.env }}"
+            summary: "Clock skew detected on {{ $labels.instance }}"
+
+    - name: kubernetes-pods
+      rules:
+        - alert: NoPodMetrics
+          expr: absent(kube_pod_container_status_restarts_total) > 0
+          for: 5m
+          labels:
+            alertowner: skyloud
+            severity: high
+          annotations:
+            tenant_id: "anonymous"
+            env: "unknown"
+            summary: No pod metrics in the cluster. You probably don't have kube-state-metrics deployed.
+
+        - alert: PodRestartedAtLeastOneTimeOver10m
+          expr: |
+            increase(kube_pod_container_status_restarts_total[10m]) > 0
+          for: 1m
+          labels:
+            alertowner: skyloud
+            severity: critical
+          annotations:
+            tenant_id: "{{ $labels.tenant_id }}"
+            env: "{{ $labels.env }}"
+            summary: Pod {{ $labels.namespace }}/{{ $labels.pod }} restarted at least one time in 10 mins
+
+        - alert: PodNotReady15m
+          expr: |
+            count(kube_pod_status_phase{phase="Running"} == 1 and on(pod, namespace) kube_pod_status_ready{condition="false"} == 1) by (namespace, pod)
+          for: 15m
+          labels:
+            alertowner: skyloud
+            severity: critical
+          annotations:
+            tenant_id: "{{ $labels.tenant_id }}"
+            env: "{{ $labels.env }}"
+            summary: Pod {{ $labels.namespace }}/{{ $labels.pod }} not ready for 15 mins
+
+        - alert: PodBadPhase15m
+          expr: |
+            kube_pod_status_phase{phase!~"(Running|Succeeded)"} > 0
+          for: 15m
+          labels:
+            alertowner: skyloud
+            severity: critical
+          annotations:
+            tenant_id: "{{ $labels.tenant_id }}"
+            env: "{{ $labels.env }}"
+            summary: Pod {{ $labels.namespace }}/{{ $labels.pod }} is phase={{ $labels.phase }} for 15 mins
+
+        - alert: PodWaiting15m
+          expr: |
+            kube_pod_container_status_waiting_reason > 0
+          for: 15m
+          labels:
+            alertowner: skyloud
+            severity: high
+          annotations:
+            tenant_id: "{{ $labels.tenant_id }}"
+            env: "{{ $labels.env }}"
+            summary: Pod {{ $labels.namespace }}/{{ $labels.pod }} waiting for 15 mins on {{ $labels.reason }}
+
+        - alert: PodOOMkilledOneTime10m
+          expr: |
+            kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} == 1
+            and on(container)
+            increase(kube_pod_container_status_restarts_total[10m]) > 0
+          labels:
+            alertowner: skyloud
+            severity: critical
+          annotations:
+            tenant_id: "{{ $labels.tenant_id }}"
+            env: "{{ $labels.env }}"
+            summary: Pod {{ $labels.namespace }}/{{ $labels.pod }} exited with '{{ $labels.reason }}'
+
+        - alert: PodEvicted
+          expr: |
+            kube_pod_container_status_last_terminated_reason{reason="Evicted"} == 1
+            and on(container) increase(kube_pod_container_status_restarts_total[10m]) > 0
+            and on(pod, namespace) kube_pod_info{priority_class!="overprovisioning"}
+          labels:
+            alertowner: skyloud
+            severity: high
+          annotations:
+            tenant_id: "{{ $labels.tenant_id }}"
+            env: "{{ $labels.env }}"
+            summary: Pod {{ $labels.namespace }}/{{ $labels.pod }} was evicted
diff --git a/grafana/grafana-alerts/kubernetes-v2/kustomization.yaml b/grafana/grafana-alerts/kubernetes-v2/kustomization.yaml
new file mode 100644
index 0000000..3116a82
--- /dev/null
+++ b/grafana/grafana-alerts/kubernetes-v2/kustomization.yaml
@@ -0,0 +1,3 @@
+resources:
+  - kubernetes.yaml
+  - persistentvolume.yaml
diff --git a/grafana/grafana-alerts/kubernetes-v2/persistentvolume.yaml b/grafana/grafana-alerts/kubernetes-v2/persistentvolume.yaml
new file mode 100644
index 0000000..285bb33
--- /dev/null
+++ b/grafana/grafana-alerts/kubernetes-v2/persistentvolume.yaml
@@ -0,0 +1,30 @@
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: persistentvolume
+spec:
+  groups:
+    - name: persistentvolume
+      rules:
+        - alert: NoPVMetrics
+          expr: absent(kube_persistentvolume_capacity_bytes) > 0
+          for: 5m
+          labels:
+            alertowner: skyloud
+            severity: high
+          annotations:
+            tenant_id: "anonymous"
+            env: "unknown"
+            summary: No PV metrics from kube-state-metrics
+
+        - alert: PVError
+          expr: |
+            kube_persistentvolume_status_phase{phase=~"Failed|Pending"} > 0
+          for: 5m
+          labels:
+            alertowner: skyloud
+            severity: high
+          annotations:
+            tenant_id: "{{ $labels.tenant_id }}"
+            env: "{{ $labels.env }}"
+            summary: PV {{ $labels.persistentvolume }} is {{ $labels.phase }}