From 19e747d07b79ea7a0a924fa97b89dca93f3b3cdf Mon Sep 17 00:00:00 2001 From: Mike Ditton Date: Thu, 12 Feb 2026 14:02:16 +0100 Subject: [PATCH 1/2] Fix container name and regex capture group for user alerting This fixes the wrong container name being used for the user alerting of mariadb. This also fixes an issue with improper parentheses in the regex capture groups. --- .../functions/common/nonsla/alerting.go | 6 +++--- .../functions/common/nonsla/alerting_test.go | 12 ++++++------ pkg/comp-functions/functions/vshnmariadb/register.go | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/pkg/comp-functions/functions/common/nonsla/alerting.go b/pkg/comp-functions/functions/common/nonsla/alerting.go index 6b9d8c337a..cea849421a 100644 --- a/pkg/comp-functions/functions/common/nonsla/alerting.go +++ b/pkg/comp-functions/functions/common/nonsla/alerting.go @@ -44,7 +44,7 @@ var ( }, Expr: intstr.IntOrString{ Type: intstr.String, - StrVal: "label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job=\"kubelet\", metrics_path=\"/metrics\"} / kubelet_volume_stats_capacity_bytes{job=\"kubelet\",metrics_path=\"/metrics\"}) < 0.03 and kubelet_volume_stats_used_bytes{job=\"kubelet\",metrics_path=\"/metrics\"} > 0 unless on(namespace,persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{access_mode=\"ReadOnlyMany\"} == 1 unless on(namespace,persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts=\"true\"}== 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace)kube_namespace_labels, \"name\", \"$1\", \"namespace\",\"" + namespace + "\")", + StrVal: "label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job=\"kubelet\", metrics_path=\"/metrics\"} / kubelet_volume_stats_capacity_bytes{job=\"kubelet\",metrics_path=\"/metrics\"}) < 0.03 and kubelet_volume_stats_used_bytes{job=\"kubelet\",metrics_path=\"/metrics\"} > 0 unless on(namespace,persistentvolumeclaim) 
kube_persistentvolumeclaim_access_mode{access_mode=\"ReadOnlyMany\"} == 1 unless on(namespace,persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts=\"true\"}== 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace)kube_namespace_labels, \"name\", \"$1\", \"namespace\",\"(" + namespace + ")\")", }, For: MinuteInterval, Labels: map[string]string{ @@ -64,7 +64,7 @@ var ( }, Expr: intstr.IntOrString{ Type: intstr.String, - StrVal: "label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job=\"kubelet\",metrics_path=\"/metrics\"} / kubelet_volume_stats_capacity_bytes{job=\"kubelet\",metrics_path=\"/metrics\"}) < 0.15 and kubelet_volume_stats_used_bytes{job=\"kubelet\",metrics_path=\"/metrics\"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job=\"kubelet\",metrics_path=\"/metrics\"}[6h], 4 * 24 * 3600) < 0 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{access_mode=\"ReadOnlyMany\"} == 1 unless on(namespace,persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts=\"true\"}== 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace)kube_namespace_labels, \"name\", \"$1\", \"namespace\",\"" + namespace + "\")", + StrVal: "label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job=\"kubelet\",metrics_path=\"/metrics\"} / kubelet_volume_stats_capacity_bytes{job=\"kubelet\",metrics_path=\"/metrics\"}) < 0.15 and kubelet_volume_stats_used_bytes{job=\"kubelet\",metrics_path=\"/metrics\"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job=\"kubelet\",metrics_path=\"/metrics\"}[6h], 4 * 24 * 3600) < 0 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{access_mode=\"ReadOnlyMany\"} == 1 unless on(namespace,persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts=\"true\"}== 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace)kube_namespace_labels, 
\"name\", \"$1\", \"namespace\",\"(" + namespace + ")\")", }, For: HourInterval, Labels: map[string]string{ @@ -84,7 +84,7 @@ var ( }, Expr: intstr.IntOrString{ Type: intstr.String, - StrVal: "label_replace( topk(1, (max(container_memory_working_set_bytes{container=\"" + name + "\"})without (name, id) / on(container,pod,namespace) kube_pod_container_resource_limits{resource=\"memory\"}* 100) > 85) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace)kube_namespace_labels, \"name\", \"$1\", \"namespace\",\"" + namespace + "\")", + StrVal: "label_replace( topk(1, (max(container_memory_working_set_bytes{container=\"" + name + "\"})without (name, id) / on(container,pod,namespace) kube_pod_container_resource_limits{resource=\"memory\"}* 100) > 85) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace)kube_namespace_labels, \"name\", \"$1\", \"namespace\",\"(" + namespace + ")\")", }, For: TwoHourInterval, Labels: map[string]string{ diff --git a/pkg/comp-functions/functions/common/nonsla/alerting_test.go b/pkg/comp-functions/functions/common/nonsla/alerting_test.go index 1c18062c82..3fa80410f7 100644 --- a/pkg/comp-functions/functions/common/nonsla/alerting_test.go +++ b/pkg/comp-functions/functions/common/nonsla/alerting_test.go @@ -9,15 +9,15 @@ import ( var ( // PostgreSQL alerts - most specific as currently those are the only ones where we have different container name and namespace - patroniPersistentVolumeExpectedToFillUp = `label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet",metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet",metrics_path="/metrics"}) < 0.15 and kubelet_volume_stats_used_bytes{job="kubelet",metrics_path="/metrics"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet",metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{access_mode="ReadOnlyMany"} == 1 unless 
on(namespace,persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"}== 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace)kube_namespace_labels, "name", "$1", "namespace","vshn-postgresql-test")` + patroniPersistentVolumeExpectedToFillUp = `label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet",metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet",metrics_path="/metrics"}) < 0.15 and kubelet_volume_stats_used_bytes{job="kubelet",metrics_path="/metrics"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet",metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 unless on(namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{access_mode="ReadOnlyMany"} == 1 unless on(namespace,persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"}== 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace)kube_namespace_labels, "name", "$1", "namespace","(vshn-postgresql-test)")` - patroniMemoryCritical = `label_replace( topk(1, (max(container_memory_working_set_bytes{container="patroni"})without (name, id) / on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"}* 100) > 85) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace)kube_namespace_labels, "name", "$1", "namespace","vshn-postgresql-test")` + patroniMemoryCritical = `label_replace( topk(1, (max(container_memory_working_set_bytes{container="patroni"})without (name, id) / on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"}* 100) > 85) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace)kube_namespace_labels, "name", "$1", "namespace","(vshn-postgresql-test)")` - patroniPersistentVolumeFillingUp = `label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"} / 
kubelet_volume_stats_capacity_bytes{job="kubelet",metrics_path="/metrics"}) < 0.03 and kubelet_volume_stats_used_bytes{job="kubelet",metrics_path="/metrics"} > 0 unless on(namespace,persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{access_mode="ReadOnlyMany"} == 1 unless on(namespace,persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"}== 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace)kube_namespace_labels, "name", "$1", "namespace","vshn-postgresql-test")` + patroniPersistentVolumeFillingUp = `label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet",metrics_path="/metrics"}) < 0.03 and kubelet_volume_stats_used_bytes{job="kubelet",metrics_path="/metrics"} > 0 unless on(namespace,persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{access_mode="ReadOnlyMany"} == 1 unless on(namespace,persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"}== 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace)kube_namespace_labels, "name", "$1", "namespace","(vshn-postgresql-test)")` - mariadbPersistentVolumeFillingUp = `label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet",metrics_path="/metrics"}) < 0.03 and kubelet_volume_stats_used_bytes{job="kubelet",metrics_path="/metrics"} > 0 unless on(namespace,persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{access_mode="ReadOnlyMany"} == 1 unless on(namespace,persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"}== 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace)kube_namespace_labels, "name", "$1", "namespace","vshn-mariadb-myinstance")` + mariadbPersistentVolumeFillingUp = `label_replace( bottomk(1, 
(kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"} / kubelet_volume_stats_capacity_bytes{job="kubelet",metrics_path="/metrics"}) < 0.03 and kubelet_volume_stats_used_bytes{job="kubelet",metrics_path="/metrics"} > 0 unless on(namespace,persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{access_mode="ReadOnlyMany"} == 1 unless on(namespace,persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"}== 1) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace)kube_namespace_labels, "name", "$1", "namespace","(vshn-mariadb-myinstance)")` - keycloakMemoryCritical = `label_replace( topk(1, (max(container_memory_working_set_bytes{container="keycloak"})without (name, id) / on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"}* 100) > 85) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace)kube_namespace_labels, "name", "$1", "namespace","vshn-keycloak-myinstance")` + keycloakMemoryCritical = `label_replace( topk(1, (max(container_memory_working_set_bytes{container="keycloak"})without (name, id) / on(container,pod,namespace) kube_pod_container_resource_limits{resource="memory"}* 100) > 85) * on(namespace) group_left(label_appcat_vshn_io_claim_namespace)kube_namespace_labels, "name", "$1", "namespace","(vshn-keycloak-myinstance)")` ) func TestNewAlertSetBuilder(t *testing.T) { @@ -65,7 +65,7 @@ func TestNewAlertSetBuilder(t *testing.T) { assert.Equal(t, 3, checkCount) // test for mariadb - containerName = "mariadb" + containerName = "mariadb-galera" namespace = "vshn-mariadb-myinstance" builder = NewAlertSetBuilder(containerName) builder.AddDiskFillingUp() diff --git a/pkg/comp-functions/functions/vshnmariadb/register.go b/pkg/comp-functions/functions/vshnmariadb/register.go index 9603b8a74a..9a1d728324 100644 --- a/pkg/comp-functions/functions/vshnmariadb/register.go +++ b/pkg/comp-functions/functions/vshnmariadb/register.go @@ -33,7 +33,7 @@ func init() { }, 
{ Name: "non-sla-prometheus-rules", - Execute: nonsla.GenerateNonSLAPromRules[*vshnv1.VSHNMariaDB](nonsla.NewAlertSetBuilder("mariadb").AddAll().GetAlerts()), + Execute: nonsla.GenerateNonSLAPromRules[*vshnv1.VSHNMariaDB](nonsla.NewAlertSetBuilder("mariadb-galera").AddAll().GetAlerts()), }, { Name: "billing", From 9faa0dd086c853d3b856018460076fd5e8d204bb Mon Sep 17 00:00:00 2001 From: Mike Ditton Date: Thu, 12 Feb 2026 15:10:14 +0100 Subject: [PATCH 2/2] Increase interval for pvFillUp alert This increases the interval of the pvFillUp alert from 1 minute to 15 minutes to prevent repeated alerts due to flapping. --- pkg/comp-functions/functions/common/nonsla/alerting.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/comp-functions/functions/common/nonsla/alerting.go b/pkg/comp-functions/functions/common/nonsla/alerting.go index cea849421a..17c5deb994 100644 --- a/pkg/comp-functions/functions/common/nonsla/alerting.go +++ b/pkg/comp-functions/functions/common/nonsla/alerting.go @@ -25,9 +25,9 @@ type Alerts struct { } const ( - SynTeam string = "schedar" - SeverityCritical string = "critical" - MinuteInterval, HourInterval, TwoHourInterval promV1.Duration = "1m", "1h", "2h" + SynTeam string = "schedar" + SeverityCritical string = "critical" + MinuteInterval, FifteenMinuteInterval, HourInterval, TwoHourInterval promV1.Duration = "1m", "15m", "1h", "2h" ) var ( @@ -46,7 +46,7 @@ var ( Type: intstr.String, StrVal: "label_replace( bottomk(1, (kubelet_volume_stats_available_bytes{job=\"kubelet\", metrics_path=\"/metrics\"} / kubelet_volume_stats_capacity_bytes{job=\"kubelet\",metrics_path=\"/metrics\"}) < 0.03 and kubelet_volume_stats_used_bytes{job=\"kubelet\",metrics_path=\"/metrics\"} > 0 unless on(namespace,persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{access_mode=\"ReadOnlyMany\"} == 1 unless on(namespace,persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts=\"true\"}== 1) * on(namespace) 
group_left(label_appcat_vshn_io_claim_namespace)kube_namespace_labels, \"name\", \"$1\", \"namespace\",\"(" + namespace + ")\")", }, - For: MinuteInterval, + For: FifteenMinuteInterval, Labels: map[string]string{ "severity": SeverityCritical, "syn_team": SynTeam,