Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
560 changes: 115 additions & 445 deletions docs/guides/monitoring.md

Large diffs are not rendered by default.

138 changes: 138 additions & 0 deletions python-pulumi/src/ptd/grafana_alerts/mimir.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
# To delete these alerts, replace file contents with:
#   apiVersion: 1
#   deleteRules:
#     - orgId: 1
#       uid: mimir_ingester_pods_not_ready
#     - orgId: 1
#       uid: mimir_remote_write_failures
#
# See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/
#
# These alerts monitor workload Mimir health from the control room.
# They use metrics that Alloy dual-writes to the control room Mimir.
apiVersion: 1
groups:
  - orgId: 1
    name: Mimir
    folder: Posit Alerts
    # Evaluation cadence for every rule in this group.
    interval: 1m
    rules:
      # Fires when any mimir-ingester StatefulSet pod has been not-Ready
      # for at least `for: 2m`.
      - uid: mimir_ingester_pods_not_ready
        title: Mimir Ingester Pods Not Ready
        condition: B
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: mimir
            model:
              editorMode: code
              # Number of NOT-ready ingester pods, written as a subtraction.
              # The comparison form (`ready < desired`) is a PromQL filter
              # whose sample value is the READY count, so a total outage
              # (ready == 0) would evaluate to 0 and never trip the `> 0`
              # threshold in refId B. `desired - ready` is > 0 in every
              # degraded state, including full outage.
              expr: |
                kube_statefulset_status_replicas{namespace="mimir",statefulset="mimir-ingester"}
                -
                kube_statefulset_status_replicas_ready{namespace="mimir",statefulset="mimir-ingester"}
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: A
          - refId: B
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 0
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - B
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: B
              type: threshold
        # A always returns a value while kube-state-metrics reports the
        # StatefulSet, so NoData here means the metric itself vanished —
        # surface that rather than masking it as OK.
        noDataState: NoData
        execErrState: Error
        for: 2m
        annotations:
          description: >-
            Mimir ingester pods not ready in cluster {{ $labels.cluster }}.
            Check: kubectl get pods -n mimir -l app.kubernetes.io/component=ingester
          summary: Mimir ingester pods not ready in {{ $labels.cluster }}
        labels:
          opsgenie: "1"
        isPaused: false

      # Fires when remote-write to Mimir has been dropping samples for
      # at least `for: 5m`.
      - uid: mimir_remote_write_failures
        title: Mimir Remote Write Failures
        condition: B
        data:
          - refId: A
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: mimir
            model:
              editorMode: code
              # Filter query: only returns series while samples are actively
              # failing; a healthy cluster returns no data at all (see
              # noDataState below).
              expr: |
                rate(prometheus_remote_storage_samples_failed_total{url=~".*mimir.*"}[5m]) > 0
              instant: true
              intervalMs: 1000
              legendFormat: __auto
              maxDataPoints: 43200
              range: false
              refId: A
          - refId: B
            relativeTimeRange:
              from: 600
              to: 0
            datasourceUid: __expr__
            model:
              conditions:
                - evaluator:
                    params:
                      - 0
                    type: gt
                  operator:
                    type: and
                  query:
                    params:
                      - B
                  reducer:
                    params: []
                    type: last
                  type: query
              datasource:
                type: __expr__
                uid: __expr__
              expression: A
              intervalMs: 1000
              maxDataPoints: 43200
              refId: B
              type: threshold
        # The expr above is a PromQL filter (`> 0`): when nothing is failing
        # it returns NO series. With noDataState: NoData, Grafana would raise
        # a DatasourceNoData alert (carrying the opsgenie label) precisely
        # when the system is healthy, so the empty result must map to OK.
        noDataState: OK
        execErrState: Error
        for: 5m
        annotations:
          description: >-
            Alloy failing to write metrics to Mimir in cluster {{ $labels.cluster }}.
            Check Alloy: kubectl logs -n alloy -l app.kubernetes.io/name=alloy --tail=100 | grep -i error
          summary: Metrics remote write to Mimir failing in {{ $labels.cluster }}
        labels:
          opsgenie: "1"
        isPaused: false
43 changes: 43 additions & 0 deletions python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -1188,6 +1188,17 @@ def with_ebs_csi_driver(
tags=self.eks.tags,
configuration_values=json.dumps(
{
"controller": {
"resources": {
"requests": {
"cpu": "10m",
"memory": "40Mi",
},
"limits": {
"memory": "40Mi",
},
},
},
"defaultStorageClass": {
"enabled": True,
},
Expand Down Expand Up @@ -1236,6 +1247,21 @@ def with_efs_csi_driver(
cluster_name=self.name,
service_account_role_arn=sa_role.arn,
tags=self.eks.tags,
configuration_values=json.dumps(
{
"controller": {
"resources": {
"requests": {
"cpu": "10m",
"memory": "40Mi",
},
"limits": {
"memory": "40Mi",
},
},
},
}
),
),
opts=pulumi.ResourceOptions(parent=self.eks),
)
Expand Down Expand Up @@ -1888,6 +1914,7 @@ def with_grafana(
self._create_alert_configmap("healthchecks", grafana_ns)
self._create_alert_configmap("nodes", grafana_ns)
self._create_alert_configmap("applications", grafana_ns)
self._create_alert_configmap("mimir", grafana_ns)

# TODO: auth.proxy should be configurable, prod grafana auth will need tighter controls than letting anyone in as an Editor
k8s.helm.v3.Release(
Expand Down Expand Up @@ -2176,6 +2203,22 @@ def with_mimir(
"max_global_series_per_user": 800000,
"max_label_names_per_series": 45,
},
# Ring health configuration to auto-forget unhealthy members
# and prevent stale entries from blocking queries
"ingester": {
"ring": {
"heartbeat_timeout": "1m",
"auto_forget_unhealthy": True,
"auto_forget_unhealthy_timeout": "10m",
},
},
"store_gateway": {
"sharding_ring": {
"heartbeat_timeout": "1m",
"auto_forget_unhealthy": True,
"auto_forget_unhealthy_timeout": "10m",
},
},
}
},
"alertmanager": {"enabled": False},
Expand Down
Loading
Loading