From 2824b55ba67d4fe7e8a9a3cb91ff934ad63df59c Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Mon, 26 Jan 2026 13:27:50 -0800 Subject: [PATCH 1/8] Adding resource requests and limits for component helm charts --- .../ptd/pulumi_resources/aws_eks_cluster.py | 26 +++++++ .../ptd/pulumi_resources/aws_workload_helm.py | 72 ++++++++++++++++++- .../src/ptd/pulumi_resources/external_dns.py | 9 +++ .../ptd/pulumi_resources/tigera_operator.py | 9 +++ 4 files changed, 113 insertions(+), 3 deletions(-) diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py index b172238..33cbedf 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py @@ -1185,6 +1185,17 @@ def with_ebs_csi_driver( tags=self.eks.tags, configuration_values=json.dumps( { + "controller": { + "resources": { + "requests": { + "cpu": "10m", + "memory": "40Mi", + }, + "limits": { + "memory": "40Mi", + }, + }, + }, "defaultStorageClass": { "enabled": True, }, @@ -1233,6 +1244,21 @@ def with_efs_csi_driver( cluster_name=self.name, service_account_role_arn=sa_role.arn, tags=self.eks.tags, + configuration_values=json.dumps( + { + "controller": { + "resources": { + "requests": { + "cpu": "10m", + "memory": "40Mi", + }, + "limits": { + "memory": "40Mi", + }, + }, + }, + } + ), ), opts=pulumi.ResourceOptions(parent=self.eks), ) diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py index 9153d35..f9f8422 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py @@ -91,6 +91,15 @@ def _define_aws_fsx_openzfs_csi(self, release: str, version: str): "valuesContent": yaml.dump( { "controller": { + "resources": { + "requests": { + "cpu": "10m", + "memory": "40Mi", + }, + "limits": { + "memory": "40Mi", + }, + }, 
"serviceAccount": { "create": True, "name": f"controller.{ptd.Roles.AWS_FSX_OPENZFS_CSI_DRIVER}", @@ -167,6 +176,15 @@ def _define_secret_store_csi(self, release: str, version: str): "version": version, "valuesContent": yaml.dump( { + "resources": { + "requests": { + "cpu": "30m", + "memory": "128Mi", + }, + "limits": { + "memory": "128Mi", + }, + }, "rotationPollInterval": "15s", "enableSecretRotation": True, "syncSecret": { @@ -195,6 +213,15 @@ def _define_secret_store_csi_aws(self, release: str, version: str): "version": version, "valuesContent": yaml.dump( { + "resources": { + "requests": { + "cpu": "10m", + "memory": "50Mi", + }, + "limits": { + "memory": "50Mi", + }, + }, "tolerations": [ { "key": "workload-type", @@ -202,7 +229,7 @@ def _define_secret_store_csi_aws(self, release: str, version: str): "value": "session", "effect": "NoSchedule", }, - ] + ], } ), }, @@ -227,6 +254,15 @@ def _define_aws_lbc(self, release: str, version: str): "version": version, "valuesContent": yaml.dump( { + "resources": { + "requests": { + "cpu": "100m", + "memory": "256Mi", + }, + "limits": { + "memory": "256Mi", + }, + }, "clusterName": cluster_name, "serviceAccount": { "create": True, @@ -262,7 +298,19 @@ def _define_metrics_server(self, release: str, version: str): "chart": "metrics-server", "targetNamespace": ptd.KUBE_SYSTEM_NAMESPACE, "version": version, - "valuesContent": yaml.dump({}), + "valuesContent": yaml.dump( + { + "resources": { + "requests": { + "cpu": "100m", + "memory": "200Mi", + }, + "limits": { + "memory": "200Mi", + }, + }, + } + ), }, opts=pulumi.ResourceOptions(provider=self.kube_providers[release]), ) @@ -619,9 +667,18 @@ def _define_kube_state_metrics(self, release: str, version: str): "version": version, "valuesContent": yaml.dump( { + "resources": { + "requests": { + "cpu": "10m", + "memory": "64Mi", + }, + "limits": { + "memory": "64Mi", + }, + }, "metricLabelsAllowlist": [ "pods=[launcher-instance-id]", - ] + ], } ), }, @@ -654,6 +711,15 @@ def 
_define_traefik(self, release: str, version: str, weight: str, cert_arns_out "version": version, "valuesContent": yaml.dump( { + "resources": { + "requests": { + "cpu": "100m", + "memory": "128Mi", + }, + "limits": { + "memory": "128Mi", + }, + }, "image": { "registry": "ghcr.io/traefik", }, diff --git a/python-pulumi/src/ptd/pulumi_resources/external_dns.py b/python-pulumi/src/ptd/pulumi_resources/external_dns.py index 2546924..540fdae 100644 --- a/python-pulumi/src/ptd/pulumi_resources/external_dns.py +++ b/python-pulumi/src/ptd/pulumi_resources/external_dns.py @@ -56,6 +56,15 @@ def _define_helm_release(self) -> None: ), atomic=True, values={ + "resources": { + "requests": { + "cpu": "50m", + "memory": "64Mi", + }, + "limits": { + "memory": "64Mi", + }, + }, "provider": "aws", "serviceAccount": { "create": True, diff --git a/python-pulumi/src/ptd/pulumi_resources/tigera_operator.py b/python-pulumi/src/ptd/pulumi_resources/tigera_operator.py index 8ebea19..fc2b27e 100644 --- a/python-pulumi/src/ptd/pulumi_resources/tigera_operator.py +++ b/python-pulumi/src/ptd/pulumi_resources/tigera_operator.py @@ -54,6 +54,15 @@ def _define_helm_release(self): ), atomic=False, values={ + "resources": { + "requests": { + "cpu": "100m", + "memory": "128Mi", + }, + "limits": { + "memory": "128Mi", + }, + }, "installation": { "enabled": True, "registry": "quay.io", From 01c63e1cac83c6d123f6d26acb6e05339eb9cfcc Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Mon, 26 Jan 2026 15:34:40 -0800 Subject: [PATCH 2/8] add ephemeral storage limit to prevent run away pods --- python-pulumi/src/ptd/pulumi_resources/tigera_operator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python-pulumi/src/ptd/pulumi_resources/tigera_operator.py b/python-pulumi/src/ptd/pulumi_resources/tigera_operator.py index fc2b27e..1c09dd8 100644 --- a/python-pulumi/src/ptd/pulumi_resources/tigera_operator.py +++ b/python-pulumi/src/ptd/pulumi_resources/tigera_operator.py @@ -58,9 +58,11 @@ def 
_define_helm_release(self): "requests": { "cpu": "100m", "memory": "128Mi", + "ephemeral-storage": "1Gi", }, "limits": { "memory": "128Mi", + "ephemeral-storage": "2Gi", }, }, "installation": { From 9e1471c2d0cdf67b4ea57bb3b690d986578a0257 Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Mon, 26 Jan 2026 15:41:01 -0800 Subject: [PATCH 3/8] observability stack resources --- .../src/ptd/pulumi_resources/azure_traefik.py | 9 +++ .../pulumi_resources/azure_workload_helm.py | 56 ++++++++++++++++++- .../ptd/pulumi_resources/tigera_operator.py | 2 +- 3 files changed, 65 insertions(+), 2 deletions(-) diff --git a/python-pulumi/src/ptd/pulumi_resources/azure_traefik.py b/python-pulumi/src/ptd/pulumi_resources/azure_traefik.py index 15ed71b..9dea558 100644 --- a/python-pulumi/src/ptd/pulumi_resources/azure_traefik.py +++ b/python-pulumi/src/ptd/pulumi_resources/azure_traefik.py @@ -60,6 +60,15 @@ def _define_helm_release(self): ), atomic=True, values={ + "resources": { + "requests": { + "cpu": "100m", + "memory": "128Mi", + }, + "limits": { + "memory": "128Mi", + }, + }, "logs": { "general": { "level": "DEBUG", diff --git a/python-pulumi/src/ptd/pulumi_resources/azure_workload_helm.py b/python-pulumi/src/ptd/pulumi_resources/azure_workload_helm.py index df6fab8..9912fa7 100644 --- a/python-pulumi/src/ptd/pulumi_resources/azure_workload_helm.py +++ b/python-pulumi/src/ptd/pulumi_resources/azure_workload_helm.py @@ -104,6 +104,15 @@ def _define_loki(self, release: str, version: str): "valuesContent": loki_identity.client_id.apply( lambda client_id: yaml.dump( { + "resources": { + "requests": { + "cpu": "100m", + "memory": "512Mi", + }, + "limits": { + "memory": "512Mi", + }, + }, "gateway": { "image": { "registry": "quay.io", @@ -217,6 +226,15 @@ def _define_mimir(self, release: str, version: str): "valuesContent": mimir_identity.client_id.apply( lambda client_id: yaml.dump( { + "resources": { + "requests": { + "cpu": "100m", + "memory": "512Mi", + }, + "limits": { + 
"memory": "512Mi", + }, + }, "serviceAccount": { "create": True, "name": str(ptd.Roles.MIMIR), @@ -327,6 +345,15 @@ def _define_alloy(self, release: str, version: str): "version": version, "valuesContent": yaml.dump( { + "resources": { + "requests": { + "cpu": "50m", + "memory": "128Mi", + }, + "limits": { + "memory": "128Mi", + }, + }, "serviceAccount": { "create": True, "name": str(ptd.Roles.ALLOY), @@ -502,6 +529,15 @@ def _define_external_dns(self, release: str, version: str): "valuesContent": identity.client_id.apply( lambda client_id: yaml.dump( { + "resources": { + "requests": { + "cpu": "50m", + "memory": "64Mi", + }, + "limits": { + "memory": "64Mi", + }, + }, "provider": "azure", "domainFilters": [*sorted([site.domain for site in self.workload.cfg.sites.values()])], "extraArgs": { @@ -554,6 +590,15 @@ def _define_grafana(self, release: str, version: str): "version": version, "valuesContent": yaml.dump( { + "resources": { + "requests": { + "cpu": "100m", + "memory": "256Mi", + }, + "limits": { + "memory": "256Mi", + }, + }, "envFromSecret": "grafana-db-url", "grafana.ini": { "server": { @@ -665,9 +710,18 @@ def _define_kube_state_metrics(self, release: str, version: str): "version": version, "valuesContent": yaml.dump( { + "resources": { + "requests": { + "cpu": "10m", + "memory": "64Mi", + }, + "limits": { + "memory": "64Mi", + }, + }, "metricLabelsAllowlist": [ "pods=[launcher-instance-id]", - ] + ], } ), }, diff --git a/python-pulumi/src/ptd/pulumi_resources/tigera_operator.py b/python-pulumi/src/ptd/pulumi_resources/tigera_operator.py index 1c09dd8..26a2d10 100644 --- a/python-pulumi/src/ptd/pulumi_resources/tigera_operator.py +++ b/python-pulumi/src/ptd/pulumi_resources/tigera_operator.py @@ -89,7 +89,7 @@ def _define_helm_release(self): "type": "Calico", }, "nonPrivileged": "Enabled", - } + }, }, opts=pulumi.ResourceOptions(parent=self, depends_on=self.namespace), ) From 52f9f6192f3a0ac91bdb7c911425538fceca6b6c Mon Sep 17 00:00:00 2001 From: Anna 
Williamson Date: Mon, 26 Jan 2026 15:49:05 -0800 Subject: [PATCH 4/8] observability stack resources aws --- .../ptd/pulumi_resources/aws_workload_helm.py | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py index f9f8422..25dd985 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py @@ -340,6 +340,15 @@ def _define_loki(self, release: str, version: str, components): "version": version, "valuesContent": yaml.dump( { + "resources": { + "requests": { + "cpu": "100m", + "memory": "512Mi", + }, + "limits": { + "memory": "512Mi", + }, + }, "gateway": { "image": { "registry": "quay.io", @@ -457,6 +466,15 @@ def _define_grafana(self, release: str, version: str): "version": version, "valuesContent": yaml.dump( { + "resources": { + "requests": { + "cpu": "100m", + "memory": "256Mi", + }, + "limits": { + "memory": "256Mi", + }, + }, "envFromSecret": "grafana-db-url", "grafana.ini": { "server": { @@ -543,6 +561,15 @@ def _define_mimir(self, release: str, version: str, components): "version": version, "valuesContent": yaml.dump( { + "resources": { + "requests": { + "cpu": "100m", + "memory": "512Mi", + }, + "limits": { + "memory": "512Mi", + }, + }, "serviceAccount": { "create": True, "name": str(ptd.Roles.MIMIR), @@ -1252,6 +1279,15 @@ def _define_alloy(self, release: str, version: str): "version": version, "valuesContent": yaml.dump( { + "resources": { + "requests": { + "cpu": "50m", + "memory": "128Mi", + }, + "limits": { + "memory": "128Mi", + }, + }, "serviceAccount": { "create": True, "name": str(ptd.Roles.ALLOY), From 1782a1e8e90c09f3d6c0a3f0d4e5abdbd9b9a8e1 Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Thu, 29 Jan 2026 16:07:01 -0800 Subject: [PATCH 5/8] Add tolerations for prepull daemonset --- .../ptd/pulumi_resources/aws_workload_helm.py | 
7 +--- .../src/ptd/pulumi_resources/team_site.py | 37 ++++++++++++------- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py index 25dd985..8e131e1 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py @@ -1344,13 +1344,10 @@ def _define_alloy(self, release: str, version: str): "faroPort": 12347, "hosts": [f"faro.{self.workload.cfg.domain}"], }, - # Alloy is a DaemonSet, needs to run on all nodes including Karpenter session nodes + # Alloy is a DaemonSet, needs to run on all nodes regardless of taints "tolerations": [ { - "key": "workload-type", - "operator": "Equal", - "value": "session", - "effect": "NoSchedule", + "operator": "Exists", }, ], } diff --git a/python-pulumi/src/ptd/pulumi_resources/team_site.py b/python-pulumi/src/ptd/pulumi_resources/team_site.py index 19b88f2..b6ae067 100644 --- a/python-pulumi/src/ptd/pulumi_resources/team_site.py +++ b/python-pulumi/src/ptd/pulumi_resources/team_site.py @@ -131,8 +131,9 @@ def inject_cluster_tolerations(obj: dict[str, typing.Any], _: pulumi.ResourceOpt if obj["kind"] != "Site": return - # Compute session tolerations based on Karpenter node pools with session_taints=true + # Compute session tolerations and prepull node pools based on Karpenter node pools with session_taints=true session_tolerations = [] + session_node_pools = [] if self.cluster_config and hasattr(self.cluster_config, "karpenter_config"): karpenter_config = self.cluster_config.karpenter_config if karpenter_config and karpenter_config.node_pools: @@ -147,20 +148,30 @@ def inject_cluster_tolerations(obj: dict[str, typing.Any], _: pulumi.ResourceOpt if toleration not in session_tolerations: session_tolerations.append(toleration) - if not session_tolerations: - return + # Track node pool names for prepull targeting + if node_pool.name 
not in session_node_pools: + session_node_pools.append(node_pool.name) # Merge session tolerations into workbench spec - deepmerge.always_merger.merge(obj, {"spec": {"workbench": {"sessionTolerations": session_tolerations}}}) - - # Deduplicate tolerations (deepmerge concatenates lists) - tolerations = obj["spec"]["workbench"]["sessionTolerations"] - seen = {} - for t in tolerations: - key = (t.get("key"), t.get("operator"), t.get("value"), t.get("effect")) - if key not in seen: - seen[key] = t - obj["spec"]["workbench"]["sessionTolerations"] = list(seen.values()) + if session_tolerations: + deepmerge.always_merger.merge(obj, {"spec": {"workbench": {"sessionTolerations": session_tolerations}}}) + + # Deduplicate tolerations (deepmerge concatenates lists) + tolerations = obj["spec"]["workbench"]["sessionTolerations"] + seen = {} + for t in tolerations: + key = (t.get("key"), t.get("operator"), t.get("value"), t.get("effect")) + if key not in seen: + seen[key] = t + obj["spec"]["workbench"]["sessionTolerations"] = list(seen.values()) + + # Inject prepull node pool targeting if session-tainted pools exist AND prepull is not disabled + # Check if disablePrePullImages is set to true in the Site spec + disable_prepull = obj.get("spec", {}).get("disablePrePullImages", False) + if session_node_pools and not disable_prepull: + deepmerge.always_merger.merge( + obj, {"spec": {"prepullNodePools": session_node_pools}} + ) api_version_path = self._config_overrides.get("apiVersion", "").split("/")[-1] From cd4b273d533985ce9bbd400c7a8695a2d8c21cbf Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Fri, 30 Jan 2026 11:18:45 -0800 Subject: [PATCH 6/8] Updated monitoring for mimir and improved resource usage --- docs/guides/monitoring.md | 560 ++++-------------- .../src/ptd/grafana_alerts/mimir.yaml | 138 +++++ .../ptd/pulumi_resources/aws_eks_cluster.py | 17 + .../ptd/pulumi_resources/aws_workload_helm.py | 98 ++- .../pulumi_resources/azure_workload_helm.py | 100 +++- 
.../src/ptd/pulumi_resources/team_site.py | 4 +- 6 files changed, 418 insertions(+), 499 deletions(-) create mode 100644 python-pulumi/src/ptd/grafana_alerts/mimir.yaml diff --git a/docs/guides/monitoring.md b/docs/guides/monitoring.md index 8c36de9..4807859 100644 --- a/docs/guides/monitoring.md +++ b/docs/guides/monitoring.md @@ -1,534 +1,204 @@ # Monitoring Stack -This guide describes the Grafana-based monitoring stack deployed by the PTD CLI for workload observability. +PTD deploys a Grafana-based observability stack to each workload cluster: -## Overview - -PTD deploys a complete observability stack to each workload cluster consisting of: - -- **Grafana Alloy**: Metrics and log collection agent (deployed as a DaemonSet) -- **Mimir**: Prometheus-compatible metrics storage and querying -- **Loki**: Log aggregation and querying -- **Grafana**: Visualization and dashboard UI +- **Grafana Alloy**: Metrics and log collection (DaemonSet on every node) +- **Mimir**: Prometheus-compatible metrics storage +- **Loki**: Log aggregation +- **Grafana**: Visualization UI at `https://grafana.` ## Architecture -### Data Flow - ``` ┌─────────────────────────────────────────────────────────────┐ -│ Workload Cluster │ -│ │ -│ ┌──────────────┐ │ -│ │ Grafana │ │ -│ │ Alloy │ (DaemonSet - runs on every node) │ -│ │ │ │ -│ └──────┬───────┘ │ -│ │ │ -│ ├─── Metrics ───┬─────────────────────────────┐ │ -│ │ │ │ │ -│ │ ▼ │ │ -│ │ ┌─────────────────┐ │ │ -│ │ │ Local Mimir │ │ │ -│ │ │ (workload-only) │ │ │ -│ │ └────────┬────────┘ │ │ -│ │ │ │ │ -│ │ ▼ │ │ -│ │ ┌─────────────────┐ │ │ -│ │ │ Grafana UI │ │ │ -│ │ │ │ │ │ -│ │ └─────────────────┘ │ │ -│ │ │ │ -│ └─── Logs ──────────────────────┐ │ │ -│ │ │ │ -│ ▼ │ │ -│ ┌─────────────────┐ │ │ -│ │ Local Loki │ │ │ -│ │ (workload-only) │ │ │ -│ └────────┬────────┘ │ │ -│ │ │ │ -│ ▼ │ │ -│ ┌─────────────────┐ │ │ -│ │ Grafana UI │ │ │ -│ │ │ │ │ -│ └─────────────────┘ │ │ -│ │ │ 
-└────────────────────────────────────────────────────────┼───┘ - │ - Metrics Only (for alerting)│ - │ - ▼ - ┌──────────────────┐ - │ Control Room │ - │ Mimir │ - │ │ - └──────────────────┘ -``` - -### Key Design Principles - -**Metrics**: Dual-write pattern -- Sent to **local Mimir** for workload-specific dashboards and queries -- Sent to **control room Mimir** for centralized alerting and cross-workload monitoring - -**Logs**: Workload boundary isolation -- Sent **only to local Loki** within the workload -- Logs never leave the workload boundary -- Each workload has complete control over its own log data +│ Workload Cluster │ +│ │ +│ Grafana Alloy (DaemonSet) │ +│ │ │ +│ ├─── Metrics ──→ Local Mimir ──→ Grafana UI │ +│ │ │ │ +│ │ └──────────→ Control Room Mimir │ +│ │ (for alerting) │ +│ │ │ +│ └─── Logs ─────→ Local Loki ───→ Grafana UI │ +│ (stays in workload) │ +└─────────────────────────────────────────────────────────────┘ +``` + +**Key Design:** +- **Metrics**: Dual-write to local Mimir (dashboards) and control room Mimir (alerting) +- **Logs**: Stay within workload boundary only ## Components ### Grafana Alloy -Grafana Alloy is the telemetry collection agent that runs on every node in the cluster. 
- -**Deployment**: DaemonSet in the `alloy` namespace - -**Configuration** (see `python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py`): -- Scrapes metrics from: - - Kubernetes pods in `posit-team`, `posit-team-system`, and `loki` namespaces - - Node exporters (CPU, memory, disk, network) - - kube-state-metrics for cluster state - - **kubelet cAdvisor** for container-level resource usage metrics - - Blackbox exporter for health checks - - Cloud provider metrics for managed storage and database services -- Collects logs from: - - Kubernetes pods in `posit-team` and `posit-team-system` namespaces - - Optionally system logs via journald (controlled by `grafana_scrape_system_logs`) -- Runs with clustering enabled for high availability - -**Container Metrics (via cAdvisor)**: The following container-level metrics are collected for debugging resource issues: - -#### Memory Metrics -- `container_memory_working_set_bytes` - Active memory usage (what the OOM killer evaluates against limits) -- `container_memory_usage_bytes` - Total memory usage including cache -- `container_memory_rss` - Resident Set Size (anonymous memory: heap, stack) -- `container_memory_cache` - Page cache memory (can be reclaimed) -- `container_memory_swap` - Swap space usage -- `container_memory_failcnt` - Number of times memory allocation failed (OOM events) -- `container_spec_memory_limit_bytes` - Configured memory limit -- `container_spec_memory_reservation_limit_bytes` - Configured memory request - -#### CPU Metrics -- `container_cpu_usage_seconds_total` - Cumulative CPU time consumed -- `container_cpu_cfs_throttled_seconds_total` - Total time container was throttled due to CPU limits -- `container_cpu_cfs_throttled_periods_total` - Number of throttled periods -- `container_cpu_cfs_periods_total` - Total number of CPU CFS scheduler periods -- `container_spec_cpu_quota` - CPU limit in microseconds per 100ms period (-1 if unlimited) -- `container_spec_cpu_shares` - CPU request weight (relative to 
other containers) - -#### Network Metrics -- `container_network_receive_bytes_total` - Bytes received -- `container_network_transmit_bytes_total` - Bytes transmitted -- `container_network_receive_packets_total` - Packets received -- `container_network_transmit_packets_total` - Packets transmitted -- `container_network_receive_errors_total` - Errors receiving packets -- `container_network_transmit_errors_total` - Errors transmitting packets -- `container_network_receive_packets_dropped_total` - Inbound packets dropped -- `container_network_transmit_packets_dropped_total` - Outbound packets dropped - -#### Filesystem Metrics -- `container_fs_usage_bytes` - Current filesystem usage -- `container_fs_limit_bytes` - Filesystem capacity -- `container_fs_reads_bytes_total` - Bytes read from filesystem -- `container_fs_writes_bytes_total` - Bytes written to filesystem -- `container_fs_reads_total` - Number of read operations -- `container_fs_writes_total` - Number of write operations - -#### Container Lifecycle Metrics -- `container_start_time_seconds` - Unix timestamp when container started -- `kube_pod_container_status_restarts_total` - Number of container restarts (from kube-state-metrics) -- `kube_pod_container_status_last_terminated_reason` - Reason for last termination (from kube-state-metrics) - -**Helm Chart**: `grafana/alloy` - -**Key Configuration** (from `aws_workload_helm.py:1127-1258`): -```yaml -alloy: - clustering: - enabled: true - mounts: - extra: - - name: mimir-auth - mountPath: /etc/mimir/ - readOnly: true - varlog: true # If grafana_scrape_system_logs enabled - securityContext: - privileged: true # If grafana_scrape_system_logs enabled -tolerations: - - key: workload-type - operator: Equal - value: session - effect: NoSchedule -``` - -**Authentication**: Alloy uses basic authentication when writing metrics to the control room Mimir. Credentials are stored in a Kubernetes Secret (`mimir-auth`) and mounted into the Alloy pods. 
- -### Mimir +Configuration: `python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py` -Mimir is a horizontally scalable, long-term storage for Prometheus metrics. +**Scrapes metrics from:** +- Kubernetes pods in `posit-team`, `posit-team-system`, and `loki` namespaces +- Node exporters, kube-state-metrics, kubelet cAdvisor +- Cloud provider metrics for managed services -**Deployment**: Distributed deployment in the `mimir` namespace +**Collects logs from:** +- Kubernetes pods in `posit-team` and `posit-team-system` namespaces +- Optionally system logs via journald (`grafana_scrape_system_logs` setting) -**Storage Backend**: Object storage (S3 or Azure Blob Storage, configured per workload) - -**Helm Chart**: `grafana/mimir-distributed` - -**Key Configuration** (from `aws_workload_helm.py:473-604`): -```yaml -mimir: - structuredConfig: - blocks_storage: - backend: - storage_prefix: blocks - limits: - max_global_series_per_user: 800000 - max_label_names_per_series: 45 - -ingester: - replicas: - persistentVolume: - size: 20Gi - -compactor: - replicas: - persistentVolume: - size: 20Gi +### Mimir -store_gateway: - replicas: - persistentVolume: - size: 20Gi -``` +Distributed deployment in `mimir` namespace. Uses object storage (S3/Azure Blob) backend. -**Endpoints**: +**Endpoints:** - Gateway: `http://mimir-gateway.mimir.svc.cluster.local/prometheus` - Push API: `http://mimir-gateway.mimir.svc.cluster.local/api/v1/push` -### Loki - -Loki is a log aggregation system designed to store and query logs efficiently. 
- -**Deployment**: Distributed deployment in the `loki` namespace - -**Storage Backend**: Object storage (S3 or Azure Blob Storage, configured per workload) - -**Helm Chart**: `grafana/loki` - -**Key Configuration** (from `aws_workload_helm.py:270-393`): -```yaml -loki: - auth_enabled: false - storage: - type: - bucketNames: - chunks: - - ruler: - - admin: - - limits_config: - max_cache_freshness_per_query: 10m - query_timeout: 300s - reject_old_samples: true - reject_old_samples_max_age: 168h # 7 days - split_queries_by_interval: 15m - volume_enabled: true - storage_config: - hedging: - at: 250ms - max_per_second: 20 - up_to: 3 - -backend: - replicas: -read: - replicas: -write: - replicas: +**Architecture:** ``` - -**Endpoints**: -- Gateway: `http://loki-gateway.loki.svc.cluster.local` -- Push API: `http://loki-gateway.loki.svc.cluster.local/loki/api/v1/push` - -### Grafana - -Grafana provides the visualization layer for metrics and logs. - -**Deployment**: Single deployment in the `grafana` namespace - -**Helm Chart**: `grafana/grafana` - -**Data Sources** (from `aws_workload_helm.py:444-466`): -```yaml -datasources: - - name: Loki - type: loki - access: proxy - url: http://loki-gateway.loki.svc.cluster.local - isDefault: true - - name: Mimir - type: prometheus - access: proxy - url: http://mimir-gateway.mimir.svc.cluster.local/prometheus - isDefault: false +Write: Alloy → Gateway → Distributor → Ingesters (ring) → S3 +Read: Grafana → Gateway → Query Frontend → Querier → Ingesters/Store Gateway ``` -**Authentication**: Configured with proxy authentication via Traefik forward auth. Users are automatically signed up with Editor role. +**Ring Health:** Mimir uses a hash ring to distribute data. If ingesters are marked UNHEALTHY but remain in the ring, queries fail. Auto-forget is configured to clean up stale members after 10 minutes. 
-**Access**: Available at `https://grafana.` +**Troubleshooting Ring Issues:** +```bash +# View ring status +kubectl port-forward -n mimir svc/mimir-querier 8080:8080 +# Visit http://localhost:8080/ingester/ring +# Check pod status +kubectl get pods -n mimir -l app.kubernetes.io/component=ingester +``` -## Accessing Monitoring Data +### Loki -### Grafana UI +Distributed deployment in `loki` namespace. Uses object storage backend. -Access Grafana at `https://grafana.` for metrics visualization and log exploration. +**Endpoint:** `http://loki-gateway.loki.svc.cluster.local` -## Container Troubleshooting with Metrics +### Grafana -This section provides practical Grafana queries for diagnosing common container issues. +Single deployment in `grafana` namespace with Mimir and Loki as data sources. -### Memory Issues and OOMKilled Pods +**Access:** `https://grafana.` (authenticated via Traefik forward auth) -When pods are terminated due to OOM (Out of Memory), use these queries to investigate: +## Container Troubleshooting Queries -#### Identify OOMKilled Pods -```promql -# See which containers were OOMKilled -kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} +### Memory (OOMKilled Investigation) -# Count OOM events by pod over time -sum by (pod, namespace) (container_memory_failcnt{namespace="posit-team"}) -``` +| Metric | Purpose | +|--------|---------| +| `container_memory_working_set_bytes` | Active memory (OOM killer evaluates this) | +| `container_spec_memory_limit_bytes` | Configured limit | +| `container_memory_failcnt` | OOM event counter | -#### Memory Usage Analysis ```promql -# Working set memory (what OOM killer evaluates) by container -container_memory_working_set_bytes{namespace="posit-team"} - -# Memory usage as percentage of limit +# Memory usage as % of limit (container_memory_working_set_bytes{namespace="posit-team"} / container_spec_memory_limit_bytes{namespace="posit-team"}) * 100 -# Memory breakdown: RSS vs cache 
-container_memory_rss{namespace="posit-team"} -container_memory_cache{namespace="posit-team"} - -# Containers approaching memory limit (>90%) -(container_memory_working_set_bytes{namespace="posit-team"} - / container_spec_memory_limit_bytes{namespace="posit-team"}) > 0.9 -``` - -#### Historical Memory Trends -```promql -# Memory usage over time for a specific pod -container_memory_working_set_bytes{pod="", namespace="posit-team"} - -# Memory growth rate (bytes per second) -rate(container_memory_working_set_bytes{namespace="posit-team"}[5m]) +# Containers approaching limit (>90%) +(container_memory_working_set_bytes / container_spec_memory_limit_bytes) > 0.9 -# Peak memory usage in last hour -max_over_time(container_memory_working_set_bytes{namespace="posit-team"}[1h]) +# OOMKilled containers +kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} ``` -**Key Investigation Points:** -- `container_memory_working_set_bytes` exceeding `container_spec_memory_limit_bytes` triggers OOM -- High `container_memory_rss` indicates application memory pressure (heap, stack) -- High `container_memory_cache` can usually be reclaimed and is not the root cause -- Check if `container_memory_failcnt` is incrementing (indicates memory allocation failures) - -### CPU Throttling and Performance +### CPU Throttling -CPU throttling occurs when containers hit their CPU limits, causing performance degradation. 
+| Metric | Purpose | +|--------|---------| +| `container_cpu_usage_seconds_total` | Cumulative CPU time | +| `container_cpu_cfs_throttled_seconds_total` | Time spent throttled | +| `container_spec_cpu_quota` | CPU limit (microseconds per 100ms) | -#### Detect CPU Throttling ```promql -# Percentage of time container was throttled +# Throttle percentage rate(container_cpu_cfs_throttled_seconds_total{namespace="posit-team"}[5m]) / rate(container_cpu_cfs_periods_total{namespace="posit-team"}[5m]) * 100 -# Containers being throttled more than 10% of the time -(rate(container_cpu_cfs_throttled_periods_total{namespace="posit-team"}[5m]) - / rate(container_cpu_cfs_periods_total{namespace="posit-team"}[5m])) > 0.1 -``` - -#### CPU Usage Analysis -```promql -# CPU usage rate (cores) per container -rate(container_cpu_usage_seconds_total{namespace="posit-team"}[5m]) - -# CPU usage as percentage of limit (quota/100000 = cores) +# CPU usage (cores) rate(container_cpu_usage_seconds_total{namespace="posit-team"}[5m]) - / (container_spec_cpu_quota{namespace="posit-team"} / 100000) * 100 - -# Total throttled time per container -rate(container_cpu_cfs_throttled_seconds_total{namespace="posit-team"}[5m]) ``` -#### CPU Requests vs Usage -```promql -# CPU shares (requests) vs actual usage -container_spec_cpu_shares{namespace="posit-team"} -rate(container_cpu_usage_seconds_total{namespace="posit-team"}[5m]) -``` - -**Key Investigation Points:** -- Throttling >25% indicates containers need higher CPU limits -- CPU usage consistently at limit suggests CPU-bound workload -- Compare throttling patterns across similar pods to identify outliers -- Check if `container_spec_cpu_quota` is set too low for the workload - -### Network Issues +> **Tip:** Throttling >25% indicates containers need higher CPU limits. -Diagnose network connectivity, throughput, and error issues. 
+### Network -#### Network Throughput ```promql -# Receive throughput (bytes/second) +# Throughput rate(container_network_receive_bytes_total{namespace="posit-team"}[5m]) - -# Transmit throughput (bytes/second) rate(container_network_transmit_bytes_total{namespace="posit-team"}[5m]) -# Total network throughput per pod -sum by (pod) ( - rate(container_network_receive_bytes_total{namespace="posit-team"}[5m]) + - rate(container_network_transmit_bytes_total{namespace="posit-team"}[5m]) -) -``` - -#### Network Errors and Drops -```promql -# Packet errors +# Errors (non-zero indicates issues) rate(container_network_receive_errors_total{namespace="posit-team"}[5m]) -rate(container_network_transmit_errors_total{namespace="posit-team"}[5m]) - -# Dropped packets (indicates network congestion or buffer overflow) -rate(container_network_receive_packets_dropped_total{namespace="posit-team"}[5m]) rate(container_network_transmit_packets_dropped_total{namespace="posit-team"}[5m]) - -# Containers with any packet drops -(rate(container_network_receive_packets_dropped_total{namespace="posit-team"}[5m]) + - rate(container_network_transmit_packets_dropped_total{namespace="posit-team"}[5m])) > 0 -``` - -#### Network Packet Rate -```promql -# Packets per second -rate(container_network_receive_packets_total{namespace="posit-team"}[5m]) -rate(container_network_transmit_packets_total{namespace="posit-team"}[5m]) ``` -**Key Investigation Points:** -- Non-zero error rates indicate network interface or driver issues -- Dropped packets suggest network congestion or insufficient buffer space -- Compare throughput against expected workload to identify bottlenecks -- Sudden changes in packet rates may indicate connectivity problems - -### Disk I/O Issues +### Disk I/O -Diagnose filesystem usage and I/O performance problems. 
- -#### Filesystem Usage ```promql -# Filesystem usage by container -container_fs_usage_bytes{namespace="posit-team"} - -# Filesystem usage as percentage of capacity -(container_fs_usage_bytes{namespace="posit-team"} - / container_fs_limit_bytes{namespace="posit-team"}) * 100 +# Filesystem usage % +(container_fs_usage_bytes / container_fs_limit_bytes) * 100 -# Containers with >80% disk usage -(container_fs_usage_bytes{namespace="posit-team"} - / container_fs_limit_bytes{namespace="posit-team"}) > 0.8 -``` - -#### Disk I/O Throughput -```promql -# Read throughput (bytes/second) +# I/O throughput rate(container_fs_reads_bytes_total{namespace="posit-team"}[5m]) - -# Write throughput (bytes/second) rate(container_fs_writes_bytes_total{namespace="posit-team"}[5m]) - -# Total I/O throughput -sum by (pod) ( - rate(container_fs_reads_bytes_total{namespace="posit-team"}[5m]) + - rate(container_fs_writes_bytes_total{namespace="posit-team"}[5m]) -) ``` -#### Disk I/O Operations +> **Tip:** Filesystem usage >90% can cause pod evictions. 
+ +### Container Restarts + ```promql -# Read IOPS (operations per second) -rate(container_fs_reads_total{namespace="posit-team"}[5m]) +# Containers with restarts +kube_pod_container_status_restarts_total{namespace="posit-team"} > 0 -# Write IOPS -rate(container_fs_writes_total{namespace="posit-team"}[5m]) +# Termination reasons +kube_pod_container_status_last_terminated_reason{namespace="posit-team"} -# Top containers by IOPS -topk(10, - rate(container_fs_reads_total{namespace="posit-team"}[5m]) + - rate(container_fs_writes_total{namespace="posit-team"}[5m]) -) +# Recently restarted (< 1 hour uptime) +(time() - container_start_time_seconds{namespace="posit-team"}) < 3600 ``` -**Key Investigation Points:** -- Filesystem usage >90% can cause application errors and pod evictions -- High IOPS with low throughput suggests small file operations -- Sudden spikes in write operations may indicate logging or caching issues -- Compare I/O patterns against storage backend limits (EBS, Azure Disk) +## Mimir Self-Monitoring -### Container Restart and Lifecycle Issues +### The Chicken-and-Egg Problem -Track container restarts, crashes, and lifecycle problems. +If a workload's Mimir breaks, alerts running on that workload can't query it. PTD solves this by running Mimir alerts on the **control room**, which queries its own Mimir instance that receives metrics via dual-write from all workloads. 
-#### Container Restart Patterns -```promql -# Containers with recent restarts -kube_pod_container_status_restarts_total{namespace="posit-team"} > 0 +### Alerts -# Restart rate (restarts per minute) -rate(kube_pod_container_status_restarts_total{namespace="posit-team"}[5m]) * 60 +Alerts are defined in `python-pulumi/src/ptd/grafana_alerts/mimir.yaml` (deployed to control room Grafana): -# Top restarting containers -topk(10, kube_pod_container_status_restarts_total{namespace="posit-team"}) -``` - -#### Termination Reasons -```promql -# See why containers terminated -kube_pod_container_status_last_terminated_reason{namespace="posit-team"} +| Alert | Catches | +|-------|---------| +| `mimir_ingester_pods_not_ready` | Pod crashes/restarts (earliest warning) | +| `mimir_remote_write_failures` | Alloy can't push metrics to Mimir | -# Count terminations by reason -count by (reason) (kube_pod_container_status_last_terminated_reason{namespace="posit-team"}) +Ring health issues are handled by auto-forget configuration (stale members removed after 10 minutes). 
-# OOMKilled containers specifically -kube_pod_container_status_last_terminated_reason{reason="OOMKilled", namespace="posit-team"} -``` +### Mimir Diagnostic Queries -#### Container Age and Uptime ```promql -# Container uptime (seconds) -time() - container_start_time_seconds{namespace="posit-team"} +# Ring health +cortex_ring_members{ring="ingester"} +cortex_ring_members{state="Unhealthy",ring="ingester"} -# Containers younger than 1 hour (recently restarted) -(time() - container_start_time_seconds{namespace="posit-team"}) < 3600 +# Ingestion rate +sum(rate(cortex_distributor_received_samples_total[5m])) -# Average container age by pod -avg by (pod) (time() - container_start_time_seconds{namespace="posit-team"}) -``` +# Query latency (p99) +histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket{route=~".*query.*"}[5m])) by (le)) -**Key Investigation Points:** -- Restart rate >0 indicates instability (crashes, OOM, failed health checks) -- Check `kube_pod_container_status_last_terminated_reason` to understand why -- Frequent restarts with "Error" reason suggest application bugs -- OOMKilled restarts indicate insufficient memory limits -- Short uptime combined with high restart count suggests crash loops +# Query error rate +sum(rate(cortex_request_duration_seconds_count{status_code=~"5.."}[5m])) + / sum(rate(cortex_request_duration_seconds_count[5m])) +``` ## Related Documentation -- [Grafana Alloy Documentation](https://grafana.com/docs/alloy/latest/) -- [Mimir Documentation](https://grafana.com/docs/mimir/latest/) -- [Loki Documentation](https://grafana.com/docs/loki/latest/) -- [Grafana Documentation](https://grafana.com/docs/grafana/latest/) +- [Grafana Alloy](https://grafana.com/docs/alloy/latest/) +- [Mimir](https://grafana.com/docs/mimir/latest/) +- [Loki](https://grafana.com/docs/loki/latest/) +- [Grafana](https://grafana.com/docs/grafana/latest/) diff --git a/python-pulumi/src/ptd/grafana_alerts/mimir.yaml 
b/python-pulumi/src/ptd/grafana_alerts/mimir.yaml new file mode 100644 index 0000000..d8200c7 --- /dev/null +++ b/python-pulumi/src/ptd/grafana_alerts/mimir.yaml @@ -0,0 +1,138 @@ +# To delete these alerts, replace file contents with: +# apiVersion: 1 +# deleteRules: +# - orgId: 1 +# uid: mimir_ingester_pods_not_ready +# - orgId: 1 +# uid: mimir_remote_write_failures +# +# See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/ +# +# These alerts monitor workload Mimir health from the control room. +# They use metrics that Alloy dual-writes to the control room Mimir. +apiVersion: 1 +groups: + - orgId: 1 + name: Mimir + folder: Posit Alerts + interval: 1m + rules: + - uid: mimir_ingester_pods_not_ready + title: Mimir Ingester Pods Not Ready + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: | + kube_statefulset_status_replicas{namespace="mimir",statefulset="mimir-ingester"} + - + kube_statefulset_status_replicas_ready{namespace="mimir",statefulset="mimir-ingester"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + type: gt + operator: + type: and + query: + params: + - B + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData + execErrState: Error + for: 2m + annotations: + description: >- + Mimir ingester pods not ready in cluster {{ $labels.cluster }}. 
+ Check: kubectl get pods -n mimir -l app.kubernetes.io/component=ingester + summary: Mimir ingester pods not ready in {{ $labels.cluster }} + labels: + opsgenie: "1" + isPaused: false + + - uid: mimir_remote_write_failures + title: Mimir Remote Write Failures + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: | + rate(prometheus_remote_storage_samples_failed_total{url=~".*mimir.*"}[5m]) > 0 + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + type: gt + operator: + type: and + query: + params: + - B + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData + execErrState: Error + for: 5m + annotations: + description: >- + Alloy failing to write metrics to Mimir in cluster {{ $labels.cluster }}. 
+ Check Alloy: kubectl logs -n alloy -l app.kubernetes.io/name=alloy --tail=100 | grep -i error + summary: Metrics remote write to Mimir failing in {{ $labels.cluster }} + labels: + opsgenie: "1" + isPaused: false diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py index 82f6a54..bdb358e 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py @@ -1914,6 +1914,7 @@ def with_grafana( self._create_alert_configmap("healthchecks", grafana_ns) self._create_alert_configmap("nodes", grafana_ns) self._create_alert_configmap("applications", grafana_ns) + self._create_alert_configmap("mimir", grafana_ns) # TODO: auth.proxy should be configurable, prod grafana auth will need tighter controls than letting anyone in as an Editor k8s.helm.v3.Release( @@ -2202,6 +2203,22 @@ def with_mimir( "max_global_series_per_user": 800000, "max_label_names_per_series": 45, }, + # Ring health configuration to auto-forget unhealthy members + # and prevent stale entries from blocking queries + "ingester": { + "ring": { + "heartbeat_timeout": "1m", + "auto_forget_unhealthy": True, + "auto_forget_unhealthy_timeout": "10m", + }, + }, + "store_gateway": { + "sharding_ring": { + "heartbeat_timeout": "1m", + "auto_forget_unhealthy": True, + "auto_forget_unhealthy_timeout": "10m", + }, + }, } }, "alertmanager": {"enabled": False}, diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py index 8e131e1..b401004 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py @@ -340,20 +340,15 @@ def _define_loki(self, release: str, version: str, components): "version": version, "valuesContent": yaml.dump( { - "resources": { - "requests": { - "cpu": "100m", - "memory": "512Mi", - }, - "limits": { - 
"memory": "512Mi", - }, - }, "gateway": { "image": { "registry": "quay.io", "repository": "nginx/nginx-unprivileged", - } + }, + "resources": { + "requests": {"cpu": "10m", "memory": "32Mi"}, + "limits": {"memory": "32Mi"}, + }, }, "loki": { "auth_enabled": False, @@ -430,18 +425,30 @@ def _define_loki(self, release: str, version: str, components): "persistence": { "enableStatefulSetAutoDeletePVC": True, }, + "resources": { + "requests": {"cpu": "50m", "memory": "128Mi"}, + "limits": {"memory": "128Mi"}, + }, }, "read": { "replicas": components.loki_replicas, "persistence": { "enableStatefulSetAutoDeletePVC": True, }, + "resources": { + "requests": {"cpu": "50m", "memory": "128Mi"}, + "limits": {"memory": "128Mi"}, + }, }, "write": { "replicas": components.loki_replicas, "persistence": { "enableStatefulSetAutoDeletePVC": True, }, + "resources": { + "requests": {"cpu": "50m", "memory": "128Mi"}, + "limits": {"memory": "128Mi"}, + }, }, }, ), @@ -468,11 +475,11 @@ def _define_grafana(self, release: str, version: str): { "resources": { "requests": { - "cpu": "100m", - "memory": "256Mi", + "cpu": "50m", + "memory": "128Mi", }, "limits": { - "memory": "256Mi", + "memory": "128Mi", }, }, "envFromSecret": "grafana-db-url", @@ -561,15 +568,6 @@ def _define_mimir(self, release: str, version: str, components): "version": version, "valuesContent": yaml.dump( { - "resources": { - "requests": { - "cpu": "100m", - "memory": "512Mi", - }, - "limits": { - "memory": "512Mi", - }, - }, "serviceAccount": { "create": True, "name": str(ptd.Roles.MIMIR), @@ -596,6 +594,22 @@ def _define_mimir(self, release: str, version: str, components): "max_global_series_per_user": 800000, "max_label_names_per_series": 45, }, + # Ring health configuration to auto-forget unhealthy members + # and prevent stale entries from blocking queries + "ingester": { + "ring": { + "heartbeat_timeout": "1m", + "auto_forget_unhealthy": True, + "auto_forget_unhealthy_timeout": "10m", + }, + }, + "store_gateway": { + 
"sharding_ring": { + "heartbeat_timeout": "1m", + "auto_forget_unhealthy": True, + "auto_forget_unhealthy_timeout": "10m", + }, + }, } }, "alertmanager": {"enabled": False}, @@ -604,6 +618,10 @@ def _define_mimir(self, release: str, version: str, components): "persistentVolume": {"size": "20Gi"}, "replicas": components.mimir_replicas, "zoneAwareReplication": {"enabled": False}, + "resources": { + "requests": {"cpu": "50m", "memory": "256Mi"}, + "limits": {"memory": "256Mi"}, + }, "affinity": { "nodeAffinity": { "requiredDuringSchedulingIgnoredDuringExecution": { @@ -621,9 +639,31 @@ def _define_mimir(self, release: str, version: str, components): } }, }, + "distributor": { + "resources": { + "requests": {"cpu": "50m", "memory": "128Mi"}, + "limits": {"memory": "128Mi"}, + }, + }, + "querier": { + "resources": { + "requests": {"cpu": "50m", "memory": "128Mi"}, + "limits": {"memory": "128Mi"}, + }, + }, + "query_frontend": { + "resources": { + "requests": {"cpu": "50m", "memory": "128Mi"}, + "limits": {"memory": "128Mi"}, + }, + }, "compactor": { "persistentVolume": {"size": "20Gi"}, "replicas": components.mimir_replicas, + "resources": { + "requests": {"cpu": "50m", "memory": "128Mi"}, + "limits": {"memory": "128Mi"}, + }, "affinity": { "nodeAffinity": { "requiredDuringSchedulingIgnoredDuringExecution": { @@ -645,6 +685,10 @@ def _define_mimir(self, release: str, version: str, components): "persistentVolume": {"size": "20Gi"}, "replicas": components.mimir_replicas, "zoneAwareReplication": {"enabled": False}, + "resources": { + "requests": {"cpu": "50m", "memory": "128Mi"}, + "limits": {"memory": "128Mi"}, + }, "affinity": { "nodeAffinity": { "requiredDuringSchedulingIgnoredDuringExecution": { @@ -664,6 +708,10 @@ def _define_mimir(self, release: str, version: str, components): }, "gateway": { "enabledNonEnterprise": True, + "resources": { + "requests": {"cpu": "10m", "memory": "32Mi"}, + "limits": {"memory": "32Mi"}, + }, "nginx": { "image": { "registry": "quay.io", 
@@ -1281,11 +1329,11 @@ def _define_alloy(self, release: str, version: str): { "resources": { "requests": { - "cpu": "50m", - "memory": "128Mi", + "cpu": "25m", + "memory": "64Mi", }, "limits": { - "memory": "128Mi", + "memory": "64Mi", }, }, "serviceAccount": { diff --git a/python-pulumi/src/ptd/pulumi_resources/azure_workload_helm.py b/python-pulumi/src/ptd/pulumi_resources/azure_workload_helm.py index 9912fa7..c98851e 100644 --- a/python-pulumi/src/ptd/pulumi_resources/azure_workload_helm.py +++ b/python-pulumi/src/ptd/pulumi_resources/azure_workload_helm.py @@ -104,20 +104,21 @@ def _define_loki(self, release: str, version: str): "valuesContent": loki_identity.client_id.apply( lambda client_id: yaml.dump( { - "resources": { - "requests": { - "cpu": "100m", - "memory": "512Mi", - }, - "limits": { - "memory": "512Mi", + "singleBinary": { + "resources": { + "requests": {"cpu": "50m", "memory": "256Mi"}, + "limits": {"memory": "256Mi"}, }, }, "gateway": { "image": { "registry": "quay.io", "repository": "nginx/nginx-unprivileged", - } + }, + "resources": { + "requests": {"cpu": "10m", "memory": "32Mi"}, + "limits": {"memory": "32Mi"}, + }, }, "loki": { "auth_enabled": False, @@ -226,15 +227,6 @@ def _define_mimir(self, release: str, version: str): "valuesContent": mimir_identity.client_id.apply( lambda client_id: yaml.dump( { - "resources": { - "requests": { - "cpu": "100m", - "memory": "512Mi", - }, - "limits": { - "memory": "512Mi", - }, - }, "serviceAccount": { "create": True, "name": str(ptd.Roles.MIMIR), @@ -270,6 +262,22 @@ def _define_mimir(self, release: str, version: str): "max_global_series_per_user": 800000, "max_label_names_per_series": 45, }, + # Ring health configuration to auto-forget unhealthy members + # and prevent stale entries from blocking queries + "ingester": { + "ring": { + "heartbeat_timeout": "1m", + "auto_forget_unhealthy": True, + "auto_forget_unhealthy_timeout": "10m", + }, + }, + "store_gateway": { + "sharding_ring": { + 
"heartbeat_timeout": "1m", + "auto_forget_unhealthy": True, + "auto_forget_unhealthy_timeout": "10m", + }, + }, }, }, "minio": { @@ -277,11 +285,51 @@ def _define_mimir(self, release: str, version: str): }, "alertmanager": {"enabled": False}, "ruler": {"enabled": False}, - "ingester": {"persistentVolume": {"size": "20Gi"}}, - "compactor": {"persistentVolume": {"size": "20Gi"}}, - "store_gateway": {"persistentVolume": {"size": "20Gi"}}, + "ingester": { + "persistentVolume": {"size": "20Gi"}, + "resources": { + "requests": {"cpu": "50m", "memory": "256Mi"}, + "limits": {"memory": "256Mi"}, + }, + }, + "distributor": { + "resources": { + "requests": {"cpu": "50m", "memory": "128Mi"}, + "limits": {"memory": "128Mi"}, + }, + }, + "querier": { + "resources": { + "requests": {"cpu": "50m", "memory": "128Mi"}, + "limits": {"memory": "128Mi"}, + }, + }, + "query_frontend": { + "resources": { + "requests": {"cpu": "50m", "memory": "128Mi"}, + "limits": {"memory": "128Mi"}, + }, + }, + "compactor": { + "persistentVolume": {"size": "20Gi"}, + "resources": { + "requests": {"cpu": "50m", "memory": "128Mi"}, + "limits": {"memory": "128Mi"}, + }, + }, + "store_gateway": { + "persistentVolume": {"size": "20Gi"}, + "resources": { + "requests": {"cpu": "50m", "memory": "128Mi"}, + "limits": {"memory": "128Mi"}, + }, + }, "gateway": { "enabledNonEnterprise": True, + "resources": { + "requests": {"cpu": "10m", "memory": "32Mi"}, + "limits": {"memory": "32Mi"}, + }, "nginx": { "image": { "registry": "quay.io", @@ -347,11 +395,11 @@ def _define_alloy(self, release: str, version: str): { "resources": { "requests": { - "cpu": "50m", - "memory": "128Mi", + "cpu": "25m", + "memory": "64Mi", }, "limits": { - "memory": "128Mi", + "memory": "64Mi", }, }, "serviceAccount": { @@ -592,11 +640,11 @@ def _define_grafana(self, release: str, version: str): { "resources": { "requests": { - "cpu": "100m", - "memory": "256Mi", + "cpu": "50m", + "memory": "128Mi", }, "limits": { - "memory": "256Mi", + 
"memory": "128Mi", }, }, "envFromSecret": "grafana-db-url", diff --git a/python-pulumi/src/ptd/pulumi_resources/team_site.py b/python-pulumi/src/ptd/pulumi_resources/team_site.py index b6ae067..c784f78 100644 --- a/python-pulumi/src/ptd/pulumi_resources/team_site.py +++ b/python-pulumi/src/ptd/pulumi_resources/team_site.py @@ -169,9 +169,7 @@ def inject_cluster_tolerations(obj: dict[str, typing.Any], _: pulumi.ResourceOpt # Check if disablePrePullImages is set to true in the Site spec disable_prepull = obj.get("spec", {}).get("disablePrePullImages", False) if session_node_pools and not disable_prepull: - deepmerge.always_merger.merge( - obj, {"spec": {"prepullNodePools": session_node_pools}} - ) + deepmerge.always_merger.merge(obj, {"spec": {"prepullNodePools": session_node_pools}}) api_version_path = self._config_overrides.get("apiVersion", "").split("/")[-1] From 04e37b14c08444f732b5a155147fb59142781414 Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Fri, 30 Jan 2026 14:24:26 -0800 Subject: [PATCH 7/8] resource updates --- .../ptd/pulumi_resources/aws_workload_helm.py | 117 ++++++++++++------ .../ptd/pulumi_resources/tigera_operator.py | 6 +- 2 files changed, 81 insertions(+), 42 deletions(-) diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py index b401004..09408eb 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py @@ -300,11 +300,9 @@ def _define_metrics_server(self, release: str, version: str): "version": version, "valuesContent": yaml.dump( { + # Chart defaults: 100m CPU, 200Mi memory requests + # Only adding memory limit to prevent OOM "resources": { - "requests": { - "cpu": "100m", - "memory": "200Mi", - }, "limits": { "memory": "200Mi", }, @@ -346,8 +344,8 @@ def _define_loki(self, release: str, version: str, components): "repository": "nginx/nginx-unprivileged", }, "resources": { - 
"requests": {"cpu": "10m", "memory": "32Mi"}, - "limits": {"memory": "32Mi"}, + "requests": {"cpu": "10m", "memory": "100Mi"}, + "limits": {"memory": "100Mi"}, }, }, "loki": { @@ -406,6 +404,12 @@ def _define_loki(self, release: str, version: str, components): "image": { "repository": "quay.io/kiwigrid/k8s-sidecar", }, + "rules": { + "resources": { + "requests": {"cpu": "10m", "memory": "100Mi"}, + "limits": {"memory": "100Mi"}, + }, + }, }, "monitoring": { "dashboards": {"enabled": False}, @@ -426,8 +430,8 @@ def _define_loki(self, release: str, version: str, components): "enableStatefulSetAutoDeletePVC": True, }, "resources": { - "requests": {"cpu": "50m", "memory": "128Mi"}, - "limits": {"memory": "128Mi"}, + "requests": {"cpu": "12m", "memory": "111Mi"}, + "limits": {"memory": "111Mi"}, }, }, "read": { @@ -436,8 +440,8 @@ def _define_loki(self, release: str, version: str, components): "enableStatefulSetAutoDeletePVC": True, }, "resources": { - "requests": {"cpu": "50m", "memory": "128Mi"}, - "limits": {"memory": "128Mi"}, + "requests": {"cpu": "22m", "memory": "186Mi"}, + "limits": {"memory": "186Mi"}, }, }, "write": { @@ -446,8 +450,14 @@ def _define_loki(self, release: str, version: str, components): "enableStatefulSetAutoDeletePVC": True, }, "resources": { - "requests": {"cpu": "50m", "memory": "128Mi"}, - "limits": {"memory": "128Mi"}, + "requests": {"cpu": "12m", "memory": "261Mi"}, + "limits": {"memory": "261Mi"}, + }, + }, + "lokiCanary": { + "resources": { + "requests": {"cpu": "10m", "memory": "100Mi"}, + "limits": {"memory": "100Mi"}, }, }, }, @@ -475,11 +485,11 @@ def _define_grafana(self, release: str, version: str): { "resources": { "requests": { - "cpu": "50m", - "memory": "128Mi", + "cpu": "10m", + "memory": "100Mi", }, "limits": { - "memory": "128Mi", + "memory": "100Mi", }, }, "envFromSecret": "grafana-db-url", @@ -619,8 +629,8 @@ def _define_mimir(self, release: str, version: str, components): "replicas": components.mimir_replicas, 
"zoneAwareReplication": {"enabled": False}, "resources": { - "requests": {"cpu": "50m", "memory": "256Mi"}, - "limits": {"memory": "256Mi"}, + "requests": {"cpu": "17m", "memory": "279Mi"}, + "limits": {"memory": "279Mi"}, }, "affinity": { "nodeAffinity": { @@ -641,28 +651,46 @@ def _define_mimir(self, release: str, version: str, components): }, "distributor": { "resources": { - "requests": {"cpu": "50m", "memory": "128Mi"}, - "limits": {"memory": "128Mi"}, + "requests": {"cpu": "16m", "memory": "119Mi"}, + "limits": {"memory": "119Mi"}, }, }, "querier": { "resources": { - "requests": {"cpu": "50m", "memory": "128Mi"}, - "limits": {"memory": "128Mi"}, + "requests": {"cpu": "10m", "memory": "100Mi"}, + "limits": {"memory": "100Mi"}, }, }, "query_frontend": { "resources": { - "requests": {"cpu": "50m", "memory": "128Mi"}, - "limits": {"memory": "128Mi"}, + "requests": {"cpu": "10m", "memory": "100Mi"}, + "limits": {"memory": "100Mi"}, + }, + }, + "query_scheduler": { + "resources": { + "requests": {"cpu": "10m", "memory": "100Mi"}, + "limits": {"memory": "100Mi"}, + }, + }, + "overrides_exporter": { + "resources": { + "requests": {"cpu": "10m", "memory": "100Mi"}, + "limits": {"memory": "100Mi"}, + }, + }, + "rollout_operator": { + "resources": { + "requests": {"cpu": "10m", "memory": "100Mi"}, + "limits": {"memory": "100Mi"}, }, }, "compactor": { "persistentVolume": {"size": "20Gi"}, "replicas": components.mimir_replicas, "resources": { - "requests": {"cpu": "50m", "memory": "128Mi"}, - "limits": {"memory": "128Mi"}, + "requests": {"cpu": "10m", "memory": "117Mi"}, + "limits": {"memory": "117Mi"}, }, "affinity": { "nodeAffinity": { @@ -686,8 +714,8 @@ def _define_mimir(self, release: str, version: str, components): "replicas": components.mimir_replicas, "zoneAwareReplication": {"enabled": False}, "resources": { - "requests": {"cpu": "50m", "memory": "128Mi"}, - "limits": {"memory": "128Mi"}, + "requests": {"cpu": "10m", "memory": "100Mi"}, + "limits": {"memory": 
"100Mi"}, }, "affinity": { "nodeAffinity": { @@ -709,8 +737,8 @@ def _define_mimir(self, release: str, version: str, components): "gateway": { "enabledNonEnterprise": True, "resources": { - "requests": {"cpu": "10m", "memory": "32Mi"}, - "limits": {"memory": "32Mi"}, + "requests": {"cpu": "10m", "memory": "100Mi"}, + "limits": {"memory": "100Mi"}, }, "nginx": { "image": { @@ -788,11 +816,11 @@ def _define_traefik(self, release: str, version: str, weight: str, cert_arns_out { "resources": { "requests": { - "cpu": "100m", - "memory": "128Mi", + "cpu": "10m", + "memory": "100Mi", }, "limits": { - "memory": "128Mi", + "memory": "100Mi", }, }, "image": { @@ -1327,13 +1355,15 @@ def _define_alloy(self, release: str, version: str): "version": version, "valuesContent": yaml.dump( { - "resources": { - "requests": { - "cpu": "25m", - "memory": "64Mi", - }, - "limits": { - "memory": "64Mi", + "configReloader": { + "resources": { + "requests": { + "cpu": "10m", + "memory": "100Mi", + }, + "limits": { + "memory": "100Mi", + }, }, }, "serviceAccount": { @@ -1362,6 +1392,15 @@ def _define_alloy(self, release: str, version: str): } }, "alloy": { + "resources": { + "requests": { + "cpu": "27m", + "memory": "896Mi", + }, + "limits": { + "memory": "896Mi", + }, + }, "clustering": {"enabled": True}, "extraPorts": [ { diff --git a/python-pulumi/src/ptd/pulumi_resources/tigera_operator.py b/python-pulumi/src/ptd/pulumi_resources/tigera_operator.py index 26a2d10..4913d25 100644 --- a/python-pulumi/src/ptd/pulumi_resources/tigera_operator.py +++ b/python-pulumi/src/ptd/pulumi_resources/tigera_operator.py @@ -56,12 +56,12 @@ def _define_helm_release(self): values={ "resources": { "requests": { - "cpu": "100m", - "memory": "128Mi", + "cpu": "10m", + "memory": "100Mi", "ephemeral-storage": "1Gi", }, "limits": { - "memory": "128Mi", + "memory": "100Mi", "ephemeral-storage": "2Gi", }, }, From 52813a0ee6843083a1d94148a25192506b0ce033 Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Fri, 30 
Jan 2026 14:28:17 -0800 Subject: [PATCH 8/8] revert prepull changes, moving to another pr --- .../src/ptd/pulumi_resources/team_site.py | 35 +++++++------------ 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/python-pulumi/src/ptd/pulumi_resources/team_site.py b/python-pulumi/src/ptd/pulumi_resources/team_site.py index c784f78..19b88f2 100644 --- a/python-pulumi/src/ptd/pulumi_resources/team_site.py +++ b/python-pulumi/src/ptd/pulumi_resources/team_site.py @@ -131,9 +131,8 @@ def inject_cluster_tolerations(obj: dict[str, typing.Any], _: pulumi.ResourceOpt if obj["kind"] != "Site": return - # Compute session tolerations and prepull node pools based on Karpenter node pools with session_taints=true + # Compute session tolerations based on Karpenter node pools with session_taints=true session_tolerations = [] - session_node_pools = [] if self.cluster_config and hasattr(self.cluster_config, "karpenter_config"): karpenter_config = self.cluster_config.karpenter_config if karpenter_config and karpenter_config.node_pools: @@ -148,28 +147,20 @@ def inject_cluster_tolerations(obj: dict[str, typing.Any], _: pulumi.ResourceOpt if toleration not in session_tolerations: session_tolerations.append(toleration) - # Track node pool names for prepull targeting - if node_pool.name not in session_node_pools: - session_node_pools.append(node_pool.name) + if not session_tolerations: + return # Merge session tolerations into workbench spec - if session_tolerations: - deepmerge.always_merger.merge(obj, {"spec": {"workbench": {"sessionTolerations": session_tolerations}}}) - - # Deduplicate tolerations (deepmerge concatenates lists) - tolerations = obj["spec"]["workbench"]["sessionTolerations"] - seen = {} - for t in tolerations: - key = (t.get("key"), t.get("operator"), t.get("value"), t.get("effect")) - if key not in seen: - seen[key] = t - obj["spec"]["workbench"]["sessionTolerations"] = list(seen.values()) - - # Inject prepull node pool targeting if session-tainted pools 
exist AND prepull is not disabled - # Check if disablePrePullImages is set to true in the Site spec - disable_prepull = obj.get("spec", {}).get("disablePrePullImages", False) - if session_node_pools and not disable_prepull: - deepmerge.always_merger.merge(obj, {"spec": {"prepullNodePools": session_node_pools}}) + deepmerge.always_merger.merge(obj, {"spec": {"workbench": {"sessionTolerations": session_tolerations}}}) + + # Deduplicate tolerations (deepmerge concatenates lists) + tolerations = obj["spec"]["workbench"]["sessionTolerations"] + seen = {} + for t in tolerations: + key = (t.get("key"), t.get("operator"), t.get("value"), t.get("effect")) + if key not in seen: + seen[key] = t + obj["spec"]["workbench"]["sessionTolerations"] = list(seen.values()) api_version_path = self._config_overrides.get("apiVersion", "").split("/")[-1]