From 2824b55ba67d4fe7e8a9a3cb91ff934ad63df59c Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Mon, 26 Jan 2026 13:27:50 -0800 Subject: [PATCH 1/8] Adding resource requests and limits for component helm charts --- .../ptd/pulumi_resources/aws_eks_cluster.py | 26 +++++++ .../ptd/pulumi_resources/aws_workload_helm.py | 72 ++++++++++++++++++- .../src/ptd/pulumi_resources/external_dns.py | 9 +++ .../ptd/pulumi_resources/tigera_operator.py | 9 +++ 4 files changed, 113 insertions(+), 3 deletions(-) diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py index b172238..33cbedf 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py @@ -1185,6 +1185,17 @@ def with_ebs_csi_driver( tags=self.eks.tags, configuration_values=json.dumps( { + "controller": { + "resources": { + "requests": { + "cpu": "10m", + "memory": "40Mi", + }, + "limits": { + "memory": "40Mi", + }, + }, + }, "defaultStorageClass": { "enabled": True, }, @@ -1233,6 +1244,21 @@ def with_efs_csi_driver( cluster_name=self.name, service_account_role_arn=sa_role.arn, tags=self.eks.tags, + configuration_values=json.dumps( + { + "controller": { + "resources": { + "requests": { + "cpu": "10m", + "memory": "40Mi", + }, + "limits": { + "memory": "40Mi", + }, + }, + }, + } + ), ), opts=pulumi.ResourceOptions(parent=self.eks), ) diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py index 9153d35..f9f8422 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py @@ -91,6 +91,15 @@ def _define_aws_fsx_openzfs_csi(self, release: str, version: str): "valuesContent": yaml.dump( { "controller": { + "resources": { + "requests": { + "cpu": "10m", + "memory": "40Mi", + }, + "limits": { + "memory": "40Mi", + }, + }, 
"serviceAccount": { "create": True, "name": f"controller.{ptd.Roles.AWS_FSX_OPENZFS_CSI_DRIVER}", @@ -167,6 +176,15 @@ def _define_secret_store_csi(self, release: str, version: str): "version": version, "valuesContent": yaml.dump( { + "resources": { + "requests": { + "cpu": "30m", + "memory": "128Mi", + }, + "limits": { + "memory": "128Mi", + }, + }, "rotationPollInterval": "15s", "enableSecretRotation": True, "syncSecret": { @@ -195,6 +213,15 @@ def _define_secret_store_csi_aws(self, release: str, version: str): "version": version, "valuesContent": yaml.dump( { + "resources": { + "requests": { + "cpu": "10m", + "memory": "50Mi", + }, + "limits": { + "memory": "50Mi", + }, + }, "tolerations": [ { "key": "workload-type", @@ -202,7 +229,7 @@ def _define_secret_store_csi_aws(self, release: str, version: str): "value": "session", "effect": "NoSchedule", }, - ] + ], } ), }, @@ -227,6 +254,15 @@ def _define_aws_lbc(self, release: str, version: str): "version": version, "valuesContent": yaml.dump( { + "resources": { + "requests": { + "cpu": "100m", + "memory": "256Mi", + }, + "limits": { + "memory": "256Mi", + }, + }, "clusterName": cluster_name, "serviceAccount": { "create": True, @@ -262,7 +298,19 @@ def _define_metrics_server(self, release: str, version: str): "chart": "metrics-server", "targetNamespace": ptd.KUBE_SYSTEM_NAMESPACE, "version": version, - "valuesContent": yaml.dump({}), + "valuesContent": yaml.dump( + { + "resources": { + "requests": { + "cpu": "100m", + "memory": "200Mi", + }, + "limits": { + "memory": "200Mi", + }, + }, + } + ), }, opts=pulumi.ResourceOptions(provider=self.kube_providers[release]), ) @@ -619,9 +667,18 @@ def _define_kube_state_metrics(self, release: str, version: str): "version": version, "valuesContent": yaml.dump( { + "resources": { + "requests": { + "cpu": "10m", + "memory": "64Mi", + }, + "limits": { + "memory": "64Mi", + }, + }, "metricLabelsAllowlist": [ "pods=[launcher-instance-id]", - ] + ], } ), }, @@ -654,6 +711,15 @@ def 
_define_traefik(self, release: str, version: str, weight: str, cert_arns_out "version": version, "valuesContent": yaml.dump( { + "resources": { + "requests": { + "cpu": "100m", + "memory": "128Mi", + }, + "limits": { + "memory": "128Mi", + }, + }, "image": { "registry": "ghcr.io/traefik", }, diff --git a/python-pulumi/src/ptd/pulumi_resources/external_dns.py b/python-pulumi/src/ptd/pulumi_resources/external_dns.py index 2546924..540fdae 100644 --- a/python-pulumi/src/ptd/pulumi_resources/external_dns.py +++ b/python-pulumi/src/ptd/pulumi_resources/external_dns.py @@ -56,6 +56,15 @@ def _define_helm_release(self) -> None: ), atomic=True, values={ + "resources": { + "requests": { + "cpu": "50m", + "memory": "64Mi", + }, + "limits": { + "memory": "64Mi", + }, + }, "provider": "aws", "serviceAccount": { "create": True, diff --git a/python-pulumi/src/ptd/pulumi_resources/tigera_operator.py b/python-pulumi/src/ptd/pulumi_resources/tigera_operator.py index 8ebea19..fc2b27e 100644 --- a/python-pulumi/src/ptd/pulumi_resources/tigera_operator.py +++ b/python-pulumi/src/ptd/pulumi_resources/tigera_operator.py @@ -54,6 +54,15 @@ def _define_helm_release(self): ), atomic=False, values={ + "resources": { + "requests": { + "cpu": "100m", + "memory": "128Mi", + }, + "limits": { + "memory": "128Mi", + }, + }, "installation": { "enabled": True, "registry": "quay.io", From 01c63e1cac83c6d123f6d26acb6e05339eb9cfcc Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Mon, 26 Jan 2026 15:34:40 -0800 Subject: [PATCH 2/8] add ephemeral storage limit to prevent run away pods --- python-pulumi/src/ptd/pulumi_resources/tigera_operator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python-pulumi/src/ptd/pulumi_resources/tigera_operator.py b/python-pulumi/src/ptd/pulumi_resources/tigera_operator.py index fc2b27e..1c09dd8 100644 --- a/python-pulumi/src/ptd/pulumi_resources/tigera_operator.py +++ b/python-pulumi/src/ptd/pulumi_resources/tigera_operator.py @@ -58,9 +58,11 @@ def 
_define_helm_release(self): "requests": { "cpu": "100m", "memory": "128Mi", + "ephemeral-storage": "1Gi", }, "limits": { "memory": "128Mi", + "ephemeral-storage": "2Gi", }, }, "installation": { From 9e1471c2d0cdf67b4ea57bb3b690d986578a0257 Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Mon, 26 Jan 2026 15:41:01 -0800 Subject: [PATCH 3/8] observability stack resources --- .../src/ptd/pulumi_resources/azure_traefik.py | 9 +++ .../pulumi_resources/azure_workload_helm.py | 56 ++++++++++++++++++- .../ptd/pulumi_resources/tigera_operator.py | 2 +- 3 files changed, 65 insertions(+), 2 deletions(-) diff --git a/python-pulumi/src/ptd/pulumi_resources/azure_traefik.py b/python-pulumi/src/ptd/pulumi_resources/azure_traefik.py index 15ed71b..9dea558 100644 --- a/python-pulumi/src/ptd/pulumi_resources/azure_traefik.py +++ b/python-pulumi/src/ptd/pulumi_resources/azure_traefik.py @@ -60,6 +60,15 @@ def _define_helm_release(self): ), atomic=True, values={ + "resources": { + "requests": { + "cpu": "100m", + "memory": "128Mi", + }, + "limits": { + "memory": "128Mi", + }, + }, "logs": { "general": { "level": "DEBUG", diff --git a/python-pulumi/src/ptd/pulumi_resources/azure_workload_helm.py b/python-pulumi/src/ptd/pulumi_resources/azure_workload_helm.py index df6fab8..9912fa7 100644 --- a/python-pulumi/src/ptd/pulumi_resources/azure_workload_helm.py +++ b/python-pulumi/src/ptd/pulumi_resources/azure_workload_helm.py @@ -104,6 +104,15 @@ def _define_loki(self, release: str, version: str): "valuesContent": loki_identity.client_id.apply( lambda client_id: yaml.dump( { + "resources": { + "requests": { + "cpu": "100m", + "memory": "512Mi", + }, + "limits": { + "memory": "512Mi", + }, + }, "gateway": { "image": { "registry": "quay.io", @@ -217,6 +226,15 @@ def _define_mimir(self, release: str, version: str): "valuesContent": mimir_identity.client_id.apply( lambda client_id: yaml.dump( { + "resources": { + "requests": { + "cpu": "100m", + "memory": "512Mi", + }, + "limits": { + 
"memory": "512Mi", + }, + }, "serviceAccount": { "create": True, "name": str(ptd.Roles.MIMIR), @@ -327,6 +345,15 @@ def _define_alloy(self, release: str, version: str): "version": version, "valuesContent": yaml.dump( { + "resources": { + "requests": { + "cpu": "50m", + "memory": "128Mi", + }, + "limits": { + "memory": "128Mi", + }, + }, "serviceAccount": { "create": True, "name": str(ptd.Roles.ALLOY), @@ -502,6 +529,15 @@ def _define_external_dns(self, release: str, version: str): "valuesContent": identity.client_id.apply( lambda client_id: yaml.dump( { + "resources": { + "requests": { + "cpu": "50m", + "memory": "64Mi", + }, + "limits": { + "memory": "64Mi", + }, + }, "provider": "azure", "domainFilters": [*sorted([site.domain for site in self.workload.cfg.sites.values()])], "extraArgs": { @@ -554,6 +590,15 @@ def _define_grafana(self, release: str, version: str): "version": version, "valuesContent": yaml.dump( { + "resources": { + "requests": { + "cpu": "100m", + "memory": "256Mi", + }, + "limits": { + "memory": "256Mi", + }, + }, "envFromSecret": "grafana-db-url", "grafana.ini": { "server": { @@ -665,9 +710,18 @@ def _define_kube_state_metrics(self, release: str, version: str): "version": version, "valuesContent": yaml.dump( { + "resources": { + "requests": { + "cpu": "10m", + "memory": "64Mi", + }, + "limits": { + "memory": "64Mi", + }, + }, "metricLabelsAllowlist": [ "pods=[launcher-instance-id]", - ] + ], } ), }, diff --git a/python-pulumi/src/ptd/pulumi_resources/tigera_operator.py b/python-pulumi/src/ptd/pulumi_resources/tigera_operator.py index 1c09dd8..26a2d10 100644 --- a/python-pulumi/src/ptd/pulumi_resources/tigera_operator.py +++ b/python-pulumi/src/ptd/pulumi_resources/tigera_operator.py @@ -89,7 +89,7 @@ def _define_helm_release(self): "type": "Calico", }, "nonPrivileged": "Enabled", - } + }, }, opts=pulumi.ResourceOptions(parent=self, depends_on=self.namespace), ) From 52f9f6192f3a0ac91bdb7c911425538fceca6b6c Mon Sep 17 00:00:00 2001 From: Anna 
Williamson Date: Mon, 26 Jan 2026 15:49:05 -0800 Subject: [PATCH 4/8] observability stack resources aws --- .../ptd/pulumi_resources/aws_workload_helm.py | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py index f9f8422..25dd985 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py @@ -340,6 +340,15 @@ def _define_loki(self, release: str, version: str, components): "version": version, "valuesContent": yaml.dump( { + "resources": { + "requests": { + "cpu": "100m", + "memory": "512Mi", + }, + "limits": { + "memory": "512Mi", + }, + }, "gateway": { "image": { "registry": "quay.io", @@ -457,6 +466,15 @@ def _define_grafana(self, release: str, version: str): "version": version, "valuesContent": yaml.dump( { + "resources": { + "requests": { + "cpu": "100m", + "memory": "256Mi", + }, + "limits": { + "memory": "256Mi", + }, + }, "envFromSecret": "grafana-db-url", "grafana.ini": { "server": { @@ -543,6 +561,15 @@ def _define_mimir(self, release: str, version: str, components): "version": version, "valuesContent": yaml.dump( { + "resources": { + "requests": { + "cpu": "100m", + "memory": "512Mi", + }, + "limits": { + "memory": "512Mi", + }, + }, "serviceAccount": { "create": True, "name": str(ptd.Roles.MIMIR), @@ -1252,6 +1279,15 @@ def _define_alloy(self, release: str, version: str): "version": version, "valuesContent": yaml.dump( { + "resources": { + "requests": { + "cpu": "50m", + "memory": "128Mi", + }, + "limits": { + "memory": "128Mi", + }, + }, "serviceAccount": { "create": True, "name": str(ptd.Roles.ALLOY), From 1782a1e8e90c09f3d6c0a3f0d4e5abdbd9b9a8e1 Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Thu, 29 Jan 2026 16:07:01 -0800 Subject: [PATCH 5/8] Add tolerations for prepull daemonset --- .../ptd/pulumi_resources/aws_workload_helm.py | 
7 +--- .../src/ptd/pulumi_resources/team_site.py | 37 ++++++++++++------- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py index 25dd985..8e131e1 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py @@ -1344,13 +1344,10 @@ def _define_alloy(self, release: str, version: str): "faroPort": 12347, "hosts": [f"faro.{self.workload.cfg.domain}"], }, - # Alloy is a DaemonSet, needs to run on all nodes including Karpenter session nodes + # Alloy is a DaemonSet, needs to run on all nodes regardless of taints "tolerations": [ { - "key": "workload-type", - "operator": "Equal", - "value": "session", - "effect": "NoSchedule", + "operator": "Exists", }, ], } diff --git a/python-pulumi/src/ptd/pulumi_resources/team_site.py b/python-pulumi/src/ptd/pulumi_resources/team_site.py index 19b88f2..b6ae067 100644 --- a/python-pulumi/src/ptd/pulumi_resources/team_site.py +++ b/python-pulumi/src/ptd/pulumi_resources/team_site.py @@ -131,8 +131,9 @@ def inject_cluster_tolerations(obj: dict[str, typing.Any], _: pulumi.ResourceOpt if obj["kind"] != "Site": return - # Compute session tolerations based on Karpenter node pools with session_taints=true + # Compute session tolerations and prepull node pools based on Karpenter node pools with session_taints=true session_tolerations = [] + session_node_pools = [] if self.cluster_config and hasattr(self.cluster_config, "karpenter_config"): karpenter_config = self.cluster_config.karpenter_config if karpenter_config and karpenter_config.node_pools: @@ -147,20 +148,30 @@ def inject_cluster_tolerations(obj: dict[str, typing.Any], _: pulumi.ResourceOpt if toleration not in session_tolerations: session_tolerations.append(toleration) - if not session_tolerations: - return + # Track node pool names for prepull targeting + if node_pool.name 
not in session_node_pools: + session_node_pools.append(node_pool.name) # Merge session tolerations into workbench spec - deepmerge.always_merger.merge(obj, {"spec": {"workbench": {"sessionTolerations": session_tolerations}}}) - - # Deduplicate tolerations (deepmerge concatenates lists) - tolerations = obj["spec"]["workbench"]["sessionTolerations"] - seen = {} - for t in tolerations: - key = (t.get("key"), t.get("operator"), t.get("value"), t.get("effect")) - if key not in seen: - seen[key] = t - obj["spec"]["workbench"]["sessionTolerations"] = list(seen.values()) + if session_tolerations: + deepmerge.always_merger.merge(obj, {"spec": {"workbench": {"sessionTolerations": session_tolerations}}}) + + # Deduplicate tolerations (deepmerge concatenates lists) + tolerations = obj["spec"]["workbench"]["sessionTolerations"] + seen = {} + for t in tolerations: + key = (t.get("key"), t.get("operator"), t.get("value"), t.get("effect")) + if key not in seen: + seen[key] = t + obj["spec"]["workbench"]["sessionTolerations"] = list(seen.values()) + + # Inject prepull node pool targeting if session-tainted pools exist AND prepull is not disabled + # Check if disablePrePullImages is set to true in the Site spec + disable_prepull = obj.get("spec", {}).get("disablePrePullImages", False) + if session_node_pools and not disable_prepull: + deepmerge.always_merger.merge( + obj, {"spec": {"prepullNodePools": session_node_pools}} + ) api_version_path = self._config_overrides.get("apiVersion", "").split("/")[-1] From cd4b273d533985ce9bbd400c7a8695a2d8c21cbf Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Fri, 30 Jan 2026 11:18:45 -0800 Subject: [PATCH 6/8] Updated monitoring for mimir and improved resource usage --- docs/guides/monitoring.md | 560 ++++-------------- .../src/ptd/grafana_alerts/mimir.yaml | 138 +++++ .../ptd/pulumi_resources/aws_eks_cluster.py | 17 + .../ptd/pulumi_resources/aws_workload_helm.py | 98 ++- .../pulumi_resources/azure_workload_helm.py | 100 +++- 
.../src/ptd/pulumi_resources/team_site.py | 4 +- 6 files changed, 418 insertions(+), 499 deletions(-) create mode 100644 python-pulumi/src/ptd/grafana_alerts/mimir.yaml diff --git a/docs/guides/monitoring.md b/docs/guides/monitoring.md index 8c36de9..4807859 100644 --- a/docs/guides/monitoring.md +++ b/docs/guides/monitoring.md @@ -1,534 +1,204 @@ # Monitoring Stack -This guide describes the Grafana-based monitoring stack deployed by the PTD CLI for workload observability. +PTD deploys a Grafana-based observability stack to each workload cluster: -## Overview - -PTD deploys a complete observability stack to each workload cluster consisting of: - -- **Grafana Alloy**: Metrics and log collection agent (deployed as a DaemonSet) -- **Mimir**: Prometheus-compatible metrics storage and querying -- **Loki**: Log aggregation and querying -- **Grafana**: Visualization and dashboard UI +- **Grafana Alloy**: Metrics and log collection (DaemonSet on every node) +- **Mimir**: Prometheus-compatible metrics storage +- **Loki**: Log aggregation +- **Grafana**: Visualization UI at `https://grafana.` ## Architecture -### Data Flow - ``` ┌─────────────────────────────────────────────────────────────┐ -│ Workload Cluster │ -│ │ -│ ┌──────────────┐ │ -│ │ Grafana │ │ -│ │ Alloy │ (DaemonSet - runs on every node) │ -│ │ │ │ -│ └──────┬───────┘ │ -│ │ │ -│ ├─── Metrics ───┬─────────────────────────────┐ │ -│ │ │ │ │ -│ │ ▼ │ │ -│ │ ┌─────────────────┐ │ │ -│ │ │ Local Mimir │ │ │ -│ │ │ (workload-only) │ │ │ -│ │ └────────┬────────┘ │ │ -│ │ │ │ │ -│ │ ▼ │ │ -│ │ ┌─────────────────┐ │ │ -│ │ │ Grafana UI │ │ │ -│ │ │ │ │ │ -│ │ └─────────────────┘ │ │ -│ │ │ │ -│ └─── Logs ──────────────────────┐ │ │ -│ │ │ │ -│ ▼ │ │ -│ ┌─────────────────┐ │ │ -│ │ Local Loki │ │ │ -│ │ (workload-only) │ │ │ -│ └────────┬────────┘ │ │ -│ │ │ │ -│ ▼ │ │ -│ ┌─────────────────┐ │ │ -│ │ Grafana UI │ │ │ -│ │ │ │ │ -│ └─────────────────┘ │ │ -│ │ │ 
-└────────────────────────────────────────────────────────┼───┘ - │ - Metrics Only (for alerting)│ - │ - ▼ - ┌──────────────────┐ - │ Control Room │ - │ Mimir │ - │ │ - └──────────────────┘ -``` - -### Key Design Principles - -**Metrics**: Dual-write pattern -- Sent to **local Mimir** for workload-specific dashboards and queries -- Sent to **control room Mimir** for centralized alerting and cross-workload monitoring - -**Logs**: Workload boundary isolation -- Sent **only to local Loki** within the workload -- Logs never leave the workload boundary -- Each workload has complete control over its own log data +│ Workload Cluster │ +│ │ +│ Grafana Alloy (DaemonSet) │ +│ │ │ +│ ├─── Metrics ──→ Local Mimir ──→ Grafana UI │ +│ │ │ │ +│ │ └──────────→ Control Room Mimir │ +│ │ (for alerting) │ +│ │ │ +│ └─── Logs ─────→ Local Loki ───→ Grafana UI │ +│ (stays in workload) │ +└─────────────────────────────────────────────────────────────┘ +``` + +**Key Design:** +- **Metrics**: Dual-write to local Mimir (dashboards) and control room Mimir (alerting) +- **Logs**: Stay within workload boundary only ## Components ### Grafana Alloy -Grafana Alloy is the telemetry collection agent that runs on every node in the cluster. 
- -**Deployment**: DaemonSet in the `alloy` namespace - -**Configuration** (see `python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py`): -- Scrapes metrics from: - - Kubernetes pods in `posit-team`, `posit-team-system`, and `loki` namespaces - - Node exporters (CPU, memory, disk, network) - - kube-state-metrics for cluster state - - **kubelet cAdvisor** for container-level resource usage metrics - - Blackbox exporter for health checks - - Cloud provider metrics for managed storage and database services -- Collects logs from: - - Kubernetes pods in `posit-team` and `posit-team-system` namespaces - - Optionally system logs via journald (controlled by `grafana_scrape_system_logs`) -- Runs with clustering enabled for high availability - -**Container Metrics (via cAdvisor)**: The following container-level metrics are collected for debugging resource issues: - -#### Memory Metrics -- `container_memory_working_set_bytes` - Active memory usage (what the OOM killer evaluates against limits) -- `container_memory_usage_bytes` - Total memory usage including cache -- `container_memory_rss` - Resident Set Size (anonymous memory: heap, stack) -- `container_memory_cache` - Page cache memory (can be reclaimed) -- `container_memory_swap` - Swap space usage -- `container_memory_failcnt` - Number of times memory allocation failed (OOM events) -- `container_spec_memory_limit_bytes` - Configured memory limit -- `container_spec_memory_reservation_limit_bytes` - Configured memory request - -#### CPU Metrics -- `container_cpu_usage_seconds_total` - Cumulative CPU time consumed -- `container_cpu_cfs_throttled_seconds_total` - Total time container was throttled due to CPU limits -- `container_cpu_cfs_throttled_periods_total` - Number of throttled periods -- `container_cpu_cfs_periods_total` - Total number of CPU CFS scheduler periods -- `container_spec_cpu_quota` - CPU limit in microseconds per 100ms period (-1 if unlimited) -- `container_spec_cpu_shares` - CPU request weight (relative to 
other containers) - -#### Network Metrics -- `container_network_receive_bytes_total` - Bytes received -- `container_network_transmit_bytes_total` - Bytes transmitted -- `container_network_receive_packets_total` - Packets received -- `container_network_transmit_packets_total` - Packets transmitted -- `container_network_receive_errors_total` - Errors receiving packets -- `container_network_transmit_errors_total` - Errors transmitting packets -- `container_network_receive_packets_dropped_total` - Inbound packets dropped -- `container_network_transmit_packets_dropped_total` - Outbound packets dropped - -#### Filesystem Metrics -- `container_fs_usage_bytes` - Current filesystem usage -- `container_fs_limit_bytes` - Filesystem capacity -- `container_fs_reads_bytes_total` - Bytes read from filesystem -- `container_fs_writes_bytes_total` - Bytes written to filesystem -- `container_fs_reads_total` - Number of read operations -- `container_fs_writes_total` - Number of write operations - -#### Container Lifecycle Metrics -- `container_start_time_seconds` - Unix timestamp when container started -- `kube_pod_container_status_restarts_total` - Number of container restarts (from kube-state-metrics) -- `kube_pod_container_status_last_terminated_reason` - Reason for last termination (from kube-state-metrics) - -**Helm Chart**: `grafana/alloy` - -**Key Configuration** (from `aws_workload_helm.py:1127-1258`): -```yaml -alloy: - clustering: - enabled: true - mounts: - extra: - - name: mimir-auth - mountPath: /etc/mimir/ - readOnly: true - varlog: true # If grafana_scrape_system_logs enabled - securityContext: - privileged: true # If grafana_scrape_system_logs enabled -tolerations: - - key: workload-type - operator: Equal - value: session - effect: NoSchedule -``` - -**Authentication**: Alloy uses basic authentication when writing metrics to the control room Mimir. Credentials are stored in a Kubernetes Secret (`mimir-auth`) and mounted into the Alloy pods. 
- -### Mimir +Configuration: `python-pulumi/src/ptd/pulumi_resources/grafana_alloy.py` -Mimir is a horizontally scalable, long-term storage for Prometheus metrics. +**Scrapes metrics from:** +- Kubernetes pods in `posit-team`, `posit-team-system`, and `loki` namespaces +- Node exporters, kube-state-metrics, kubelet cAdvisor +- Cloud provider metrics for managed services -**Deployment**: Distributed deployment in the `mimir` namespace +**Collects logs from:** +- Kubernetes pods in `posit-team` and `posit-team-system` namespaces +- Optionally system logs via journald (`grafana_scrape_system_logs` setting) -**Storage Backend**: Object storage (S3 or Azure Blob Storage, configured per workload) - -**Helm Chart**: `grafana/mimir-distributed` - -**Key Configuration** (from `aws_workload_helm.py:473-604`): -```yaml -mimir: - structuredConfig: - blocks_storage: - backend: - storage_prefix: blocks - limits: - max_global_series_per_user: 800000 - max_label_names_per_series: 45 - -ingester: - replicas: - persistentVolume: - size: 20Gi - -compactor: - replicas: - persistentVolume: - size: 20Gi +### Mimir -store_gateway: - replicas: - persistentVolume: - size: 20Gi -``` +Distributed deployment in `mimir` namespace. Uses object storage (S3/Azure Blob) backend. -**Endpoints**: +**Endpoints:** - Gateway: `http://mimir-gateway.mimir.svc.cluster.local/prometheus` - Push API: `http://mimir-gateway.mimir.svc.cluster.local/api/v1/push` -### Loki - -Loki is a log aggregation system designed to store and query logs efficiently. 
- -**Deployment**: Distributed deployment in the `loki` namespace - -**Storage Backend**: Object storage (S3 or Azure Blob Storage, configured per workload) - -**Helm Chart**: `grafana/loki` - -**Key Configuration** (from `aws_workload_helm.py:270-393`): -```yaml -loki: - auth_enabled: false - storage: - type: - bucketNames: - chunks: - - ruler: - - admin: - - limits_config: - max_cache_freshness_per_query: 10m - query_timeout: 300s - reject_old_samples: true - reject_old_samples_max_age: 168h # 7 days - split_queries_by_interval: 15m - volume_enabled: true - storage_config: - hedging: - at: 250ms - max_per_second: 20 - up_to: 3 - -backend: - replicas: -read: - replicas: -write: - replicas: +**Architecture:** ``` - -**Endpoints**: -- Gateway: `http://loki-gateway.loki.svc.cluster.local` -- Push API: `http://loki-gateway.loki.svc.cluster.local/loki/api/v1/push` - -### Grafana - -Grafana provides the visualization layer for metrics and logs. - -**Deployment**: Single deployment in the `grafana` namespace - -**Helm Chart**: `grafana/grafana` - -**Data Sources** (from `aws_workload_helm.py:444-466`): -```yaml -datasources: - - name: Loki - type: loki - access: proxy - url: http://loki-gateway.loki.svc.cluster.local - isDefault: true - - name: Mimir - type: prometheus - access: proxy - url: http://mimir-gateway.mimir.svc.cluster.local/prometheus - isDefault: false +Write: Alloy → Gateway → Distributor → Ingesters (ring) → S3 +Read: Grafana → Gateway → Query Frontend → Querier → Ingesters/Store Gateway ``` -**Authentication**: Configured with proxy authentication via Traefik forward auth. Users are automatically signed up with Editor role. +**Ring Health:** Mimir uses a hash ring to distribute data. If ingesters are marked UNHEALTHY but remain in the ring, queries fail. Auto-forget is configured to clean up stale members after 10 minutes. 
-**Access**: Available at `https://grafana.` +**Troubleshooting Ring Issues:** +```bash +# View ring status +kubectl port-forward -n mimir svc/mimir-querier 8080:8080 +# Visit http://localhost:8080/ingester/ring +# Check pod status +kubectl get pods -n mimir -l app.kubernetes.io/component=ingester +``` -## Accessing Monitoring Data +### Loki -### Grafana UI +Distributed deployment in `loki` namespace. Uses object storage backend. -Access Grafana at `https://grafana.` for metrics visualization and log exploration. +**Endpoint:** `http://loki-gateway.loki.svc.cluster.local` -## Container Troubleshooting with Metrics +### Grafana -This section provides practical Grafana queries for diagnosing common container issues. +Single deployment in `grafana` namespace with Mimir and Loki as data sources. -### Memory Issues and OOMKilled Pods +**Access:** `https://grafana.` (authenticated via Traefik forward auth) -When pods are terminated due to OOM (Out of Memory), use these queries to investigate: +## Container Troubleshooting Queries -#### Identify OOMKilled Pods -```promql -# See which containers were OOMKilled -kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} +### Memory (OOMKilled Investigation) -# Count OOM events by pod over time -sum by (pod, namespace) (container_memory_failcnt{namespace="posit-team"}) -``` +| Metric | Purpose | +|--------|---------| +| `container_memory_working_set_bytes` | Active memory (OOM killer evaluates this) | +| `container_spec_memory_limit_bytes` | Configured limit | +| `container_memory_failcnt` | OOM event counter | -#### Memory Usage Analysis ```promql -# Working set memory (what OOM killer evaluates) by container -container_memory_working_set_bytes{namespace="posit-team"} - -# Memory usage as percentage of limit +# Memory usage as % of limit (container_memory_working_set_bytes{namespace="posit-team"} / container_spec_memory_limit_bytes{namespace="posit-team"}) * 100 -# Memory breakdown: RSS vs cache 
-container_memory_rss{namespace="posit-team"} -container_memory_cache{namespace="posit-team"} - -# Containers approaching memory limit (>90%) -(container_memory_working_set_bytes{namespace="posit-team"} - / container_spec_memory_limit_bytes{namespace="posit-team"}) > 0.9 -``` - -#### Historical Memory Trends -```promql -# Memory usage over time for a specific pod -container_memory_working_set_bytes{pod="", namespace="posit-team"} - -# Memory growth rate (bytes per second) -rate(container_memory_working_set_bytes{namespace="posit-team"}[5m]) +# Containers approaching limit (>90%) +(container_memory_working_set_bytes / container_spec_memory_limit_bytes) > 0.9 -# Peak memory usage in last hour -max_over_time(container_memory_working_set_bytes{namespace="posit-team"}[1h]) +# OOMKilled containers +kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} ``` -**Key Investigation Points:** -- `container_memory_working_set_bytes` exceeding `container_spec_memory_limit_bytes` triggers OOM -- High `container_memory_rss` indicates application memory pressure (heap, stack) -- High `container_memory_cache` can usually be reclaimed and is not the root cause -- Check if `container_memory_failcnt` is incrementing (indicates memory allocation failures) - -### CPU Throttling and Performance +### CPU Throttling -CPU throttling occurs when containers hit their CPU limits, causing performance degradation. 
+| Metric | Purpose | +|--------|---------| +| `container_cpu_usage_seconds_total` | Cumulative CPU time | +| `container_cpu_cfs_throttled_seconds_total` | Time spent throttled | +| `container_spec_cpu_quota` | CPU limit (microseconds per 100ms) | -#### Detect CPU Throttling ```promql -# Percentage of time container was throttled +# Throttle percentage rate(container_cpu_cfs_throttled_seconds_total{namespace="posit-team"}[5m]) / rate(container_cpu_cfs_periods_total{namespace="posit-team"}[5m]) * 100 -# Containers being throttled more than 10% of the time -(rate(container_cpu_cfs_throttled_periods_total{namespace="posit-team"}[5m]) - / rate(container_cpu_cfs_periods_total{namespace="posit-team"}[5m])) > 0.1 -``` - -#### CPU Usage Analysis -```promql -# CPU usage rate (cores) per container -rate(container_cpu_usage_seconds_total{namespace="posit-team"}[5m]) - -# CPU usage as percentage of limit (quota/100000 = cores) +# CPU usage (cores) rate(container_cpu_usage_seconds_total{namespace="posit-team"}[5m]) - / (container_spec_cpu_quota{namespace="posit-team"} / 100000) * 100 - -# Total throttled time per container -rate(container_cpu_cfs_throttled_seconds_total{namespace="posit-team"}[5m]) ``` -#### CPU Requests vs Usage -```promql -# CPU shares (requests) vs actual usage -container_spec_cpu_shares{namespace="posit-team"} -rate(container_cpu_usage_seconds_total{namespace="posit-team"}[5m]) -``` - -**Key Investigation Points:** -- Throttling >25% indicates containers need higher CPU limits -- CPU usage consistently at limit suggests CPU-bound workload -- Compare throttling patterns across similar pods to identify outliers -- Check if `container_spec_cpu_quota` is set too low for the workload - -### Network Issues +> **Tip:** Throttling >25% indicates containers need higher CPU limits. -Diagnose network connectivity, throughput, and error issues. 
+### Network -#### Network Throughput ```promql -# Receive throughput (bytes/second) +# Throughput rate(container_network_receive_bytes_total{namespace="posit-team"}[5m]) - -# Transmit throughput (bytes/second) rate(container_network_transmit_bytes_total{namespace="posit-team"}[5m]) -# Total network throughput per pod -sum by (pod) ( - rate(container_network_receive_bytes_total{namespace="posit-team"}[5m]) + - rate(container_network_transmit_bytes_total{namespace="posit-team"}[5m]) -) -``` - -#### Network Errors and Drops -```promql -# Packet errors +# Errors (non-zero indicates issues) rate(container_network_receive_errors_total{namespace="posit-team"}[5m]) -rate(container_network_transmit_errors_total{namespace="posit-team"}[5m]) - -# Dropped packets (indicates network congestion or buffer overflow) -rate(container_network_receive_packets_dropped_total{namespace="posit-team"}[5m]) rate(container_network_transmit_packets_dropped_total{namespace="posit-team"}[5m]) - -# Containers with any packet drops -(rate(container_network_receive_packets_dropped_total{namespace="posit-team"}[5m]) + - rate(container_network_transmit_packets_dropped_total{namespace="posit-team"}[5m])) > 0 -``` - -#### Network Packet Rate -```promql -# Packets per second -rate(container_network_receive_packets_total{namespace="posit-team"}[5m]) -rate(container_network_transmit_packets_total{namespace="posit-team"}[5m]) ``` -**Key Investigation Points:** -- Non-zero error rates indicate network interface or driver issues -- Dropped packets suggest network congestion or insufficient buffer space -- Compare throughput against expected workload to identify bottlenecks -- Sudden changes in packet rates may indicate connectivity problems - -### Disk I/O Issues +### Disk I/O -Diagnose filesystem usage and I/O performance problems. 
- -#### Filesystem Usage ```promql -# Filesystem usage by container -container_fs_usage_bytes{namespace="posit-team"} - -# Filesystem usage as percentage of capacity -(container_fs_usage_bytes{namespace="posit-team"} - / container_fs_limit_bytes{namespace="posit-team"}) * 100 +# Filesystem usage % +(container_fs_usage_bytes / container_fs_limit_bytes) * 100 -# Containers with >80% disk usage -(container_fs_usage_bytes{namespace="posit-team"} - / container_fs_limit_bytes{namespace="posit-team"}) > 0.8 -``` - -#### Disk I/O Throughput -```promql -# Read throughput (bytes/second) +# I/O throughput rate(container_fs_reads_bytes_total{namespace="posit-team"}[5m]) - -# Write throughput (bytes/second) rate(container_fs_writes_bytes_total{namespace="posit-team"}[5m]) - -# Total I/O throughput -sum by (pod) ( - rate(container_fs_reads_bytes_total{namespace="posit-team"}[5m]) + - rate(container_fs_writes_bytes_total{namespace="posit-team"}[5m]) -) ``` -#### Disk I/O Operations +> **Tip:** Filesystem usage >90% can cause pod evictions. 
+ +### Container Restarts + ```promql -# Read IOPS (operations per second) -rate(container_fs_reads_total{namespace="posit-team"}[5m]) +# Containers with restarts +kube_pod_container_status_restarts_total{namespace="posit-team"} > 0 -# Write IOPS -rate(container_fs_writes_total{namespace="posit-team"}[5m]) +# Termination reasons +kube_pod_container_status_last_terminated_reason{namespace="posit-team"} -# Top containers by IOPS -topk(10, - rate(container_fs_reads_total{namespace="posit-team"}[5m]) + - rate(container_fs_writes_total{namespace="posit-team"}[5m]) -) +# Recently restarted (< 1 hour uptime) +(time() - container_start_time_seconds{namespace="posit-team"}) < 3600 ``` -**Key Investigation Points:** -- Filesystem usage >90% can cause application errors and pod evictions -- High IOPS with low throughput suggests small file operations -- Sudden spikes in write operations may indicate logging or caching issues -- Compare I/O patterns against storage backend limits (EBS, Azure Disk) +## Mimir Self-Monitoring -### Container Restart and Lifecycle Issues +### The Chicken-and-Egg Problem -Track container restarts, crashes, and lifecycle problems. +If a workload's Mimir breaks, alerts running on that workload can't query it. PTD solves this by running Mimir alerts on the **control room**, which queries its own Mimir instance that receives metrics via dual-write from all workloads. 
-#### Container Restart Patterns -```promql -# Containers with recent restarts -kube_pod_container_status_restarts_total{namespace="posit-team"} > 0 +### Alerts -# Restart rate (restarts per minute) -rate(kube_pod_container_status_restarts_total{namespace="posit-team"}[5m]) * 60 +Alerts are defined in `python-pulumi/src/ptd/grafana_alerts/mimir.yaml` (deployed to control room Grafana): -# Top restarting containers -topk(10, kube_pod_container_status_restarts_total{namespace="posit-team"}) -``` - -#### Termination Reasons -```promql -# See why containers terminated -kube_pod_container_status_last_terminated_reason{namespace="posit-team"} +| Alert | Catches | +|-------|---------| +| `mimir_ingester_pods_not_ready` | Pod crashes/restarts (earliest warning) | +| `mimir_remote_write_failures` | Alloy can't push metrics to Mimir | -# Count terminations by reason -count by (reason) (kube_pod_container_status_last_terminated_reason{namespace="posit-team"}) +Ring health issues are handled by auto-forget configuration (stale members removed after 10 minutes). 
-# OOMKilled containers specifically -kube_pod_container_status_last_terminated_reason{reason="OOMKilled", namespace="posit-team"} -``` +### Mimir Diagnostic Queries -#### Container Age and Uptime ```promql -# Container uptime (seconds) -time() - container_start_time_seconds{namespace="posit-team"} +# Ring health +cortex_ring_members{ring="ingester"} +cortex_ring_members{state="Unhealthy",ring="ingester"} -# Containers younger than 1 hour (recently restarted) -(time() - container_start_time_seconds{namespace="posit-team"}) < 3600 +# Ingestion rate +sum(rate(cortex_distributor_received_samples_total[5m])) -# Average container age by pod -avg by (pod) (time() - container_start_time_seconds{namespace="posit-team"}) -``` +# Query latency (p99) +histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket{route=~".*query.*"}[5m])) by (le)) -**Key Investigation Points:** -- Restart rate >0 indicates instability (crashes, OOM, failed health checks) -- Check `kube_pod_container_status_last_terminated_reason` to understand why -- Frequent restarts with "Error" reason suggest application bugs -- OOMKilled restarts indicate insufficient memory limits -- Short uptime combined with high restart count suggests crash loops +# Query error rate +sum(rate(cortex_request_duration_seconds_count{status_code=~"5.."}[5m])) + / sum(rate(cortex_request_duration_seconds_count[5m])) +``` ## Related Documentation -- [Grafana Alloy Documentation](https://grafana.com/docs/alloy/latest/) -- [Mimir Documentation](https://grafana.com/docs/mimir/latest/) -- [Loki Documentation](https://grafana.com/docs/loki/latest/) -- [Grafana Documentation](https://grafana.com/docs/grafana/latest/) +- [Grafana Alloy](https://grafana.com/docs/alloy/latest/) +- [Mimir](https://grafana.com/docs/mimir/latest/) +- [Loki](https://grafana.com/docs/loki/latest/) +- [Grafana](https://grafana.com/docs/grafana/latest/) diff --git a/python-pulumi/src/ptd/grafana_alerts/mimir.yaml 
b/python-pulumi/src/ptd/grafana_alerts/mimir.yaml new file mode 100644 index 0000000..d8200c7 --- /dev/null +++ b/python-pulumi/src/ptd/grafana_alerts/mimir.yaml @@ -0,0 +1,138 @@ +# To delete these alerts, replace file contents with: +# apiVersion: 1 +# deleteRules: +# - orgId: 1 +# uid: mimir_ingester_pods_not_ready +# - orgId: 1 +# uid: mimir_remote_write_failures +# +# See https://grafana.com/docs/grafana/latest/alerting/set-up/provision-alerting-resources/file-provisioning/ +# +# These alerts monitor workload Mimir health from the control room. +# They use metrics that Alloy dual-writes to the control room Mimir. +apiVersion: 1 +groups: + - orgId: 1 + name: Mimir + folder: Posit Alerts + interval: 1m + rules: + - uid: mimir_ingester_pods_not_ready + title: Mimir Ingester Pods Not Ready + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: | + kube_statefulset_status_replicas{namespace="mimir",statefulset="mimir-ingester"} + - + kube_statefulset_status_replicas_ready{namespace="mimir",statefulset="mimir-ingester"} + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + type: gt + operator: + type: and + query: + params: + - B + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData + execErrState: Error + for: 2m + annotations: + description: >- + Mimir ingester pods not ready in cluster {{ $labels.cluster }}. 
+ Check: kubectl get pods -n mimir -l app.kubernetes.io/component=ingester + summary: Mimir ingester pods not ready in {{ $labels.cluster }} + labels: + opsgenie: "1" + isPaused: false + + - uid: mimir_remote_write_failures + title: Mimir Remote Write Failures + condition: B + data: + - refId: A + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: mimir + model: + editorMode: code + expr: | + rate(prometheus_remote_storage_samples_failed_total{url=~".*mimir.*"}[5m]) > 0 + instant: true + intervalMs: 1000 + legendFormat: __auto + maxDataPoints: 43200 + range: false + refId: A + - refId: B + relativeTimeRange: + from: 600 + to: 0 + datasourceUid: __expr__ + model: + conditions: + - evaluator: + params: + - 0 + type: gt + operator: + type: and + query: + params: + - B + reducer: + params: [] + type: last + type: query + datasource: + type: __expr__ + uid: __expr__ + expression: A + intervalMs: 1000 + maxDataPoints: 43200 + refId: B + type: threshold + noDataState: NoData + execErrState: Error + for: 5m + annotations: + description: >- + Alloy failing to write metrics to Mimir in cluster {{ $labels.cluster }}. 
+ Check Alloy: kubectl logs -n alloy -l app.kubernetes.io/name=alloy --tail=100 | grep -i error + summary: Metrics remote write to Mimir failing in {{ $labels.cluster }} + labels: + opsgenie: "1" + isPaused: false diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py index 82f6a54..bdb358e 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_eks_cluster.py @@ -1914,6 +1914,7 @@ def with_grafana( self._create_alert_configmap("healthchecks", grafana_ns) self._create_alert_configmap("nodes", grafana_ns) self._create_alert_configmap("applications", grafana_ns) + self._create_alert_configmap("mimir", grafana_ns) # TODO: auth.proxy should be configurable, prod grafana auth will need tighter controls than letting anyone in as an Editor k8s.helm.v3.Release( @@ -2202,6 +2203,22 @@ def with_mimir( "max_global_series_per_user": 800000, "max_label_names_per_series": 45, }, + # Ring health configuration to auto-forget unhealthy members + # and prevent stale entries from blocking queries + "ingester": { + "ring": { + "heartbeat_timeout": "1m", + "auto_forget_unhealthy": True, + "auto_forget_unhealthy_timeout": "10m", + }, + }, + "store_gateway": { + "sharding_ring": { + "heartbeat_timeout": "1m", + "auto_forget_unhealthy": True, + "auto_forget_unhealthy_timeout": "10m", + }, + }, } }, "alertmanager": {"enabled": False}, diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py index 8e131e1..b401004 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py @@ -340,20 +340,15 @@ def _define_loki(self, release: str, version: str, components): "version": version, "valuesContent": yaml.dump( { - "resources": { - "requests": { - "cpu": "100m", - "memory": "512Mi", - }, - "limits": { - 
"memory": "512Mi", - }, - }, "gateway": { "image": { "registry": "quay.io", "repository": "nginx/nginx-unprivileged", - } + }, + "resources": { + "requests": {"cpu": "10m", "memory": "32Mi"}, + "limits": {"memory": "32Mi"}, + }, }, "loki": { "auth_enabled": False, @@ -430,18 +425,30 @@ def _define_loki(self, release: str, version: str, components): "persistence": { "enableStatefulSetAutoDeletePVC": True, }, + "resources": { + "requests": {"cpu": "50m", "memory": "128Mi"}, + "limits": {"memory": "128Mi"}, + }, }, "read": { "replicas": components.loki_replicas, "persistence": { "enableStatefulSetAutoDeletePVC": True, }, + "resources": { + "requests": {"cpu": "50m", "memory": "128Mi"}, + "limits": {"memory": "128Mi"}, + }, }, "write": { "replicas": components.loki_replicas, "persistence": { "enableStatefulSetAutoDeletePVC": True, }, + "resources": { + "requests": {"cpu": "50m", "memory": "128Mi"}, + "limits": {"memory": "128Mi"}, + }, }, }, ), @@ -468,11 +475,11 @@ def _define_grafana(self, release: str, version: str): { "resources": { "requests": { - "cpu": "100m", - "memory": "256Mi", + "cpu": "50m", + "memory": "128Mi", }, "limits": { - "memory": "256Mi", + "memory": "128Mi", }, }, "envFromSecret": "grafana-db-url", @@ -561,15 +568,6 @@ def _define_mimir(self, release: str, version: str, components): "version": version, "valuesContent": yaml.dump( { - "resources": { - "requests": { - "cpu": "100m", - "memory": "512Mi", - }, - "limits": { - "memory": "512Mi", - }, - }, "serviceAccount": { "create": True, "name": str(ptd.Roles.MIMIR), @@ -596,6 +594,22 @@ def _define_mimir(self, release: str, version: str, components): "max_global_series_per_user": 800000, "max_label_names_per_series": 45, }, + # Ring health configuration to auto-forget unhealthy members + # and prevent stale entries from blocking queries + "ingester": { + "ring": { + "heartbeat_timeout": "1m", + "auto_forget_unhealthy": True, + "auto_forget_unhealthy_timeout": "10m", + }, + }, + "store_gateway": { + 
"sharding_ring": { + "heartbeat_timeout": "1m", + "auto_forget_unhealthy": True, + "auto_forget_unhealthy_timeout": "10m", + }, + }, } }, "alertmanager": {"enabled": False}, @@ -604,6 +618,10 @@ def _define_mimir(self, release: str, version: str, components): "persistentVolume": {"size": "20Gi"}, "replicas": components.mimir_replicas, "zoneAwareReplication": {"enabled": False}, + "resources": { + "requests": {"cpu": "50m", "memory": "256Mi"}, + "limits": {"memory": "256Mi"}, + }, "affinity": { "nodeAffinity": { "requiredDuringSchedulingIgnoredDuringExecution": { @@ -621,9 +639,31 @@ def _define_mimir(self, release: str, version: str, components): } }, }, + "distributor": { + "resources": { + "requests": {"cpu": "50m", "memory": "128Mi"}, + "limits": {"memory": "128Mi"}, + }, + }, + "querier": { + "resources": { + "requests": {"cpu": "50m", "memory": "128Mi"}, + "limits": {"memory": "128Mi"}, + }, + }, + "query_frontend": { + "resources": { + "requests": {"cpu": "50m", "memory": "128Mi"}, + "limits": {"memory": "128Mi"}, + }, + }, "compactor": { "persistentVolume": {"size": "20Gi"}, "replicas": components.mimir_replicas, + "resources": { + "requests": {"cpu": "50m", "memory": "128Mi"}, + "limits": {"memory": "128Mi"}, + }, "affinity": { "nodeAffinity": { "requiredDuringSchedulingIgnoredDuringExecution": { @@ -645,6 +685,10 @@ def _define_mimir(self, release: str, version: str, components): "persistentVolume": {"size": "20Gi"}, "replicas": components.mimir_replicas, "zoneAwareReplication": {"enabled": False}, + "resources": { + "requests": {"cpu": "50m", "memory": "128Mi"}, + "limits": {"memory": "128Mi"}, + }, "affinity": { "nodeAffinity": { "requiredDuringSchedulingIgnoredDuringExecution": { @@ -664,6 +708,10 @@ def _define_mimir(self, release: str, version: str, components): }, "gateway": { "enabledNonEnterprise": True, + "resources": { + "requests": {"cpu": "10m", "memory": "32Mi"}, + "limits": {"memory": "32Mi"}, + }, "nginx": { "image": { "registry": "quay.io", 
@@ -1281,11 +1329,11 @@ def _define_alloy(self, release: str, version: str): { "resources": { "requests": { - "cpu": "50m", - "memory": "128Mi", + "cpu": "25m", + "memory": "64Mi", }, "limits": { - "memory": "128Mi", + "memory": "64Mi", }, }, "serviceAccount": { diff --git a/python-pulumi/src/ptd/pulumi_resources/azure_workload_helm.py b/python-pulumi/src/ptd/pulumi_resources/azure_workload_helm.py index 9912fa7..c98851e 100644 --- a/python-pulumi/src/ptd/pulumi_resources/azure_workload_helm.py +++ b/python-pulumi/src/ptd/pulumi_resources/azure_workload_helm.py @@ -104,20 +104,21 @@ def _define_loki(self, release: str, version: str): "valuesContent": loki_identity.client_id.apply( lambda client_id: yaml.dump( { - "resources": { - "requests": { - "cpu": "100m", - "memory": "512Mi", - }, - "limits": { - "memory": "512Mi", + "singleBinary": { + "resources": { + "requests": {"cpu": "50m", "memory": "256Mi"}, + "limits": {"memory": "256Mi"}, }, }, "gateway": { "image": { "registry": "quay.io", "repository": "nginx/nginx-unprivileged", - } + }, + "resources": { + "requests": {"cpu": "10m", "memory": "32Mi"}, + "limits": {"memory": "32Mi"}, + }, }, "loki": { "auth_enabled": False, @@ -226,15 +227,6 @@ def _define_mimir(self, release: str, version: str): "valuesContent": mimir_identity.client_id.apply( lambda client_id: yaml.dump( { - "resources": { - "requests": { - "cpu": "100m", - "memory": "512Mi", - }, - "limits": { - "memory": "512Mi", - }, - }, "serviceAccount": { "create": True, "name": str(ptd.Roles.MIMIR), @@ -270,6 +262,22 @@ def _define_mimir(self, release: str, version: str): "max_global_series_per_user": 800000, "max_label_names_per_series": 45, }, + # Ring health configuration to auto-forget unhealthy members + # and prevent stale entries from blocking queries + "ingester": { + "ring": { + "heartbeat_timeout": "1m", + "auto_forget_unhealthy": True, + "auto_forget_unhealthy_timeout": "10m", + }, + }, + "store_gateway": { + "sharding_ring": { + 
"heartbeat_timeout": "1m", + "auto_forget_unhealthy": True, + "auto_forget_unhealthy_timeout": "10m", + }, + }, }, }, "minio": { @@ -277,11 +285,51 @@ def _define_mimir(self, release: str, version: str): }, "alertmanager": {"enabled": False}, "ruler": {"enabled": False}, - "ingester": {"persistentVolume": {"size": "20Gi"}}, - "compactor": {"persistentVolume": {"size": "20Gi"}}, - "store_gateway": {"persistentVolume": {"size": "20Gi"}}, + "ingester": { + "persistentVolume": {"size": "20Gi"}, + "resources": { + "requests": {"cpu": "50m", "memory": "256Mi"}, + "limits": {"memory": "256Mi"}, + }, + }, + "distributor": { + "resources": { + "requests": {"cpu": "50m", "memory": "128Mi"}, + "limits": {"memory": "128Mi"}, + }, + }, + "querier": { + "resources": { + "requests": {"cpu": "50m", "memory": "128Mi"}, + "limits": {"memory": "128Mi"}, + }, + }, + "query_frontend": { + "resources": { + "requests": {"cpu": "50m", "memory": "128Mi"}, + "limits": {"memory": "128Mi"}, + }, + }, + "compactor": { + "persistentVolume": {"size": "20Gi"}, + "resources": { + "requests": {"cpu": "50m", "memory": "128Mi"}, + "limits": {"memory": "128Mi"}, + }, + }, + "store_gateway": { + "persistentVolume": {"size": "20Gi"}, + "resources": { + "requests": {"cpu": "50m", "memory": "128Mi"}, + "limits": {"memory": "128Mi"}, + }, + }, "gateway": { "enabledNonEnterprise": True, + "resources": { + "requests": {"cpu": "10m", "memory": "32Mi"}, + "limits": {"memory": "32Mi"}, + }, "nginx": { "image": { "registry": "quay.io", @@ -347,11 +395,11 @@ def _define_alloy(self, release: str, version: str): { "resources": { "requests": { - "cpu": "50m", - "memory": "128Mi", + "cpu": "25m", + "memory": "64Mi", }, "limits": { - "memory": "128Mi", + "memory": "64Mi", }, }, "serviceAccount": { @@ -592,11 +640,11 @@ def _define_grafana(self, release: str, version: str): { "resources": { "requests": { - "cpu": "100m", - "memory": "256Mi", + "cpu": "50m", + "memory": "128Mi", }, "limits": { - "memory": "256Mi", + 
"memory": "128Mi", }, }, "envFromSecret": "grafana-db-url", diff --git a/python-pulumi/src/ptd/pulumi_resources/team_site.py b/python-pulumi/src/ptd/pulumi_resources/team_site.py index b6ae067..c784f78 100644 --- a/python-pulumi/src/ptd/pulumi_resources/team_site.py +++ b/python-pulumi/src/ptd/pulumi_resources/team_site.py @@ -169,9 +169,7 @@ def inject_cluster_tolerations(obj: dict[str, typing.Any], _: pulumi.ResourceOpt # Check if disablePrePullImages is set to true in the Site spec disable_prepull = obj.get("spec", {}).get("disablePrePullImages", False) if session_node_pools and not disable_prepull: - deepmerge.always_merger.merge( - obj, {"spec": {"prepullNodePools": session_node_pools}} - ) + deepmerge.always_merger.merge(obj, {"spec": {"prepullNodePools": session_node_pools}}) api_version_path = self._config_overrides.get("apiVersion", "").split("/")[-1] From 04e37b14c08444f732b5a155147fb59142781414 Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Fri, 30 Jan 2026 14:24:26 -0800 Subject: [PATCH 7/8] resource updates --- .../ptd/pulumi_resources/aws_workload_helm.py | 117 ++++++++++++------ .../ptd/pulumi_resources/tigera_operator.py | 6 +- 2 files changed, 81 insertions(+), 42 deletions(-) diff --git a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py index b401004..09408eb 100644 --- a/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py +++ b/python-pulumi/src/ptd/pulumi_resources/aws_workload_helm.py @@ -300,11 +300,9 @@ def _define_metrics_server(self, release: str, version: str): "version": version, "valuesContent": yaml.dump( { + # Chart defaults: 100m CPU, 200Mi memory requests + # Only adding memory limit to prevent OOM "resources": { - "requests": { - "cpu": "100m", - "memory": "200Mi", - }, "limits": { "memory": "200Mi", }, @@ -346,8 +344,8 @@ def _define_loki(self, release: str, version: str, components): "repository": "nginx/nginx-unprivileged", }, "resources": { - 
"requests": {"cpu": "10m", "memory": "32Mi"}, - "limits": {"memory": "32Mi"}, + "requests": {"cpu": "10m", "memory": "100Mi"}, + "limits": {"memory": "100Mi"}, }, }, "loki": { @@ -406,6 +404,12 @@ def _define_loki(self, release: str, version: str, components): "image": { "repository": "quay.io/kiwigrid/k8s-sidecar", }, + "rules": { + "resources": { + "requests": {"cpu": "10m", "memory": "100Mi"}, + "limits": {"memory": "100Mi"}, + }, + }, }, "monitoring": { "dashboards": {"enabled": False}, @@ -426,8 +430,8 @@ def _define_loki(self, release: str, version: str, components): "enableStatefulSetAutoDeletePVC": True, }, "resources": { - "requests": {"cpu": "50m", "memory": "128Mi"}, - "limits": {"memory": "128Mi"}, + "requests": {"cpu": "12m", "memory": "111Mi"}, + "limits": {"memory": "111Mi"}, }, }, "read": { @@ -436,8 +440,8 @@ def _define_loki(self, release: str, version: str, components): "enableStatefulSetAutoDeletePVC": True, }, "resources": { - "requests": {"cpu": "50m", "memory": "128Mi"}, - "limits": {"memory": "128Mi"}, + "requests": {"cpu": "22m", "memory": "186Mi"}, + "limits": {"memory": "186Mi"}, }, }, "write": { @@ -446,8 +450,14 @@ def _define_loki(self, release: str, version: str, components): "enableStatefulSetAutoDeletePVC": True, }, "resources": { - "requests": {"cpu": "50m", "memory": "128Mi"}, - "limits": {"memory": "128Mi"}, + "requests": {"cpu": "12m", "memory": "261Mi"}, + "limits": {"memory": "261Mi"}, + }, + }, + "lokiCanary": { + "resources": { + "requests": {"cpu": "10m", "memory": "100Mi"}, + "limits": {"memory": "100Mi"}, }, }, }, @@ -475,11 +485,11 @@ def _define_grafana(self, release: str, version: str): { "resources": { "requests": { - "cpu": "50m", - "memory": "128Mi", + "cpu": "10m", + "memory": "100Mi", }, "limits": { - "memory": "128Mi", + "memory": "100Mi", }, }, "envFromSecret": "grafana-db-url", @@ -619,8 +629,8 @@ def _define_mimir(self, release: str, version: str, components): "replicas": components.mimir_replicas, 
"zoneAwareReplication": {"enabled": False}, "resources": { - "requests": {"cpu": "50m", "memory": "256Mi"}, - "limits": {"memory": "256Mi"}, + "requests": {"cpu": "17m", "memory": "279Mi"}, + "limits": {"memory": "279Mi"}, }, "affinity": { "nodeAffinity": { @@ -641,28 +651,46 @@ def _define_mimir(self, release: str, version: str, components): }, "distributor": { "resources": { - "requests": {"cpu": "50m", "memory": "128Mi"}, - "limits": {"memory": "128Mi"}, + "requests": {"cpu": "16m", "memory": "119Mi"}, + "limits": {"memory": "119Mi"}, }, }, "querier": { "resources": { - "requests": {"cpu": "50m", "memory": "128Mi"}, - "limits": {"memory": "128Mi"}, + "requests": {"cpu": "10m", "memory": "100Mi"}, + "limits": {"memory": "100Mi"}, }, }, "query_frontend": { "resources": { - "requests": {"cpu": "50m", "memory": "128Mi"}, - "limits": {"memory": "128Mi"}, + "requests": {"cpu": "10m", "memory": "100Mi"}, + "limits": {"memory": "100Mi"}, + }, + }, + "query_scheduler": { + "resources": { + "requests": {"cpu": "10m", "memory": "100Mi"}, + "limits": {"memory": "100Mi"}, + }, + }, + "overrides_exporter": { + "resources": { + "requests": {"cpu": "10m", "memory": "100Mi"}, + "limits": {"memory": "100Mi"}, + }, + }, + "rollout_operator": { + "resources": { + "requests": {"cpu": "10m", "memory": "100Mi"}, + "limits": {"memory": "100Mi"}, }, }, "compactor": { "persistentVolume": {"size": "20Gi"}, "replicas": components.mimir_replicas, "resources": { - "requests": {"cpu": "50m", "memory": "128Mi"}, - "limits": {"memory": "128Mi"}, + "requests": {"cpu": "10m", "memory": "117Mi"}, + "limits": {"memory": "117Mi"}, }, "affinity": { "nodeAffinity": { @@ -686,8 +714,8 @@ def _define_mimir(self, release: str, version: str, components): "replicas": components.mimir_replicas, "zoneAwareReplication": {"enabled": False}, "resources": { - "requests": {"cpu": "50m", "memory": "128Mi"}, - "limits": {"memory": "128Mi"}, + "requests": {"cpu": "10m", "memory": "100Mi"}, + "limits": {"memory": 
"100Mi"}, }, "affinity": { "nodeAffinity": { @@ -709,8 +737,8 @@ def _define_mimir(self, release: str, version: str, components): "gateway": { "enabledNonEnterprise": True, "resources": { - "requests": {"cpu": "10m", "memory": "32Mi"}, - "limits": {"memory": "32Mi"}, + "requests": {"cpu": "10m", "memory": "100Mi"}, + "limits": {"memory": "100Mi"}, }, "nginx": { "image": { @@ -788,11 +816,11 @@ def _define_traefik(self, release: str, version: str, weight: str, cert_arns_out { "resources": { "requests": { - "cpu": "100m", - "memory": "128Mi", + "cpu": "10m", + "memory": "100Mi", }, "limits": { - "memory": "128Mi", + "memory": "100Mi", }, }, "image": { @@ -1327,13 +1355,15 @@ def _define_alloy(self, release: str, version: str): "version": version, "valuesContent": yaml.dump( { - "resources": { - "requests": { - "cpu": "25m", - "memory": "64Mi", - }, - "limits": { - "memory": "64Mi", + "configReloader": { + "resources": { + "requests": { + "cpu": "10m", + "memory": "100Mi", + }, + "limits": { + "memory": "100Mi", + }, }, }, "serviceAccount": { @@ -1362,6 +1392,15 @@ def _define_alloy(self, release: str, version: str): } }, "alloy": { + "resources": { + "requests": { + "cpu": "27m", + "memory": "896Mi", + }, + "limits": { + "memory": "896Mi", + }, + }, "clustering": {"enabled": True}, "extraPorts": [ { diff --git a/python-pulumi/src/ptd/pulumi_resources/tigera_operator.py b/python-pulumi/src/ptd/pulumi_resources/tigera_operator.py index 26a2d10..4913d25 100644 --- a/python-pulumi/src/ptd/pulumi_resources/tigera_operator.py +++ b/python-pulumi/src/ptd/pulumi_resources/tigera_operator.py @@ -56,12 +56,12 @@ def _define_helm_release(self): values={ "resources": { "requests": { - "cpu": "100m", - "memory": "128Mi", + "cpu": "10m", + "memory": "100Mi", "ephemeral-storage": "1Gi", }, "limits": { - "memory": "128Mi", + "memory": "100Mi", "ephemeral-storage": "2Gi", }, }, From 52813a0ee6843083a1d94148a25192506b0ce033 Mon Sep 17 00:00:00 2001 From: Anna Williamson Date: Fri, 30 
Jan 2026 14:28:17 -0800 Subject: [PATCH 8/8] revert prepull changes, moving to another pr --- .../src/ptd/pulumi_resources/team_site.py | 35 +++++++------------ 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/python-pulumi/src/ptd/pulumi_resources/team_site.py b/python-pulumi/src/ptd/pulumi_resources/team_site.py index c784f78..19b88f2 100644 --- a/python-pulumi/src/ptd/pulumi_resources/team_site.py +++ b/python-pulumi/src/ptd/pulumi_resources/team_site.py @@ -131,9 +131,8 @@ def inject_cluster_tolerations(obj: dict[str, typing.Any], _: pulumi.ResourceOpt if obj["kind"] != "Site": return - # Compute session tolerations and prepull node pools based on Karpenter node pools with session_taints=true + # Compute session tolerations based on Karpenter node pools with session_taints=true session_tolerations = [] - session_node_pools = [] if self.cluster_config and hasattr(self.cluster_config, "karpenter_config"): karpenter_config = self.cluster_config.karpenter_config if karpenter_config and karpenter_config.node_pools: @@ -148,28 +147,20 @@ def inject_cluster_tolerations(obj: dict[str, typing.Any], _: pulumi.ResourceOpt if toleration not in session_tolerations: session_tolerations.append(toleration) - # Track node pool names for prepull targeting - if node_pool.name not in session_node_pools: - session_node_pools.append(node_pool.name) + if not session_tolerations: + return # Merge session tolerations into workbench spec - if session_tolerations: - deepmerge.always_merger.merge(obj, {"spec": {"workbench": {"sessionTolerations": session_tolerations}}}) - - # Deduplicate tolerations (deepmerge concatenates lists) - tolerations = obj["spec"]["workbench"]["sessionTolerations"] - seen = {} - for t in tolerations: - key = (t.get("key"), t.get("operator"), t.get("value"), t.get("effect")) - if key not in seen: - seen[key] = t - obj["spec"]["workbench"]["sessionTolerations"] = list(seen.values()) - - # Inject prepull node pool targeting if session-tainted pools 
exist AND prepull is not disabled - # Check if disablePrePullImages is set to true in the Site spec - disable_prepull = obj.get("spec", {}).get("disablePrePullImages", False) - if session_node_pools and not disable_prepull: - deepmerge.always_merger.merge(obj, {"spec": {"prepullNodePools": session_node_pools}}) + deepmerge.always_merger.merge(obj, {"spec": {"workbench": {"sessionTolerations": session_tolerations}}}) + + # Deduplicate tolerations (deepmerge concatenates lists) + tolerations = obj["spec"]["workbench"]["sessionTolerations"] + seen = {} + for t in tolerations: + key = (t.get("key"), t.get("operator"), t.get("value"), t.get("effect")) + if key not in seen: + seen[key] = t + obj["spec"]["workbench"]["sessionTolerations"] = list(seen.values()) api_version_path = self._config_overrides.get("apiVersion", "").split("/")[-1]