From e69c41cf5b7ecb4c28e04d2883b9671c64498b08 Mon Sep 17 00:00:00 2001 From: ian-flores Date: Tue, 20 Jan 2026 13:38:44 -0800 Subject: [PATCH 1/3] fix: remove duplicate metrics service and add tolerations docs - Remove auth_proxy_service.yaml which duplicated metrics-service.yaml - Add troubleshooting docs for pod scheduling failures due to taints --- .../templates/rbac/auth_proxy_service.yaml | 17 ---- docs/guides/troubleshooting.md | 95 +++++++++++++++++++ 2 files changed, 95 insertions(+), 17 deletions(-) delete mode 100755 dist/chart/templates/rbac/auth_proxy_service.yaml diff --git a/dist/chart/templates/rbac/auth_proxy_service.yaml b/dist/chart/templates/rbac/auth_proxy_service.yaml deleted file mode 100755 index b665f0dc..00000000 --- a/dist/chart/templates/rbac/auth_proxy_service.yaml +++ /dev/null @@ -1,17 +0,0 @@ -{{- if .Values.rbac.enable }} -apiVersion: v1 -kind: Service -metadata: - labels: - {{- include "chart.labels" . | nindent 4 }} - name: {{ .Values.controllerManager.serviceAccountName }}-metrics-service - namespace: {{ .Release.Namespace }} -spec: - ports: - - name: https - port: 8443 - protocol: TCP - targetPort: https - selector: - control-plane: controller-manager -{{- end -}} diff --git a/docs/guides/troubleshooting.md b/docs/guides/troubleshooting.md index d8c3bc77..90d79e47 100644 --- a/docs/guides/troubleshooting.md +++ b/docs/guides/troubleshooting.md @@ -6,6 +6,7 @@ This comprehensive guide covers common issues and their solutions when running P 1. [General Debugging](#general-debugging) 2. [Operator Issues](#operator-issues) + - [Operator Pod Stuck in Pending (Scheduling Failures)](#operator-pod-stuck-in-pending-scheduling-failures) 3. [Site Reconciliation Issues](#site-reconciliation-issues) 4. [Database Issues](#database-issues) 5. 
[Product-Specific Issues](#product-specific-issues) @@ -203,6 +204,100 @@ kubectl describe crd sites.core.posit.team helm upgrade --install team-operator ./dist/chart --set installCRDs=true ``` +### Operator Pod Stuck in Pending (Scheduling Failures) + +**Symptoms:** +- Operator pod stays in `Pending` state indefinitely +- `kubectl describe pod` shows taint-related scheduling errors +- Events contain messages like `node(s) had taints that the pod didn't tolerate` + +**Diagnosis:** +```bash +# Check operator pod status +kubectl get pods -n posit-team-system + +# Describe the pod to see scheduling failures +kubectl describe pod -n posit-team-system -l control-plane=controller-manager + +# List node taints to understand what tolerations are needed +kubectl get nodes -o custom-columns=NAME:.metadata.name,TAINTS:.spec.taints +``` + +**Cause:** + +Kubernetes nodes can have [taints](https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/) that prevent pods from scheduling unless the pod has a matching toleration. Common scenarios include: + +- Dedicated node pools for specific workloads (e.g., GPU nodes, session nodes) +- Nodes reserved for critical system components +- Cloud provider managed node pools with default taints + +If the operator pod doesn't have tolerations matching the node taints, it will remain in `Pending` state. 
+ +**Solution:** + +Configure tolerations in your Helm values to match your cluster's node taints: + +```yaml +# values.yaml +controllerManager: + tolerations: + # Example: Tolerate nodes tainted for session workloads + - key: "workload-type" + operator: "Equal" + value: "session" + effect: "NoSchedule" + + # Example: Tolerate nodes with GPU taints + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + + # Example: Tolerate all taints (use with caution) + # - operator: "Exists" +``` + +Apply the configuration: + +```bash +helm upgrade team-operator ./dist/chart \ + --namespace posit-team-system \ + -f values.yaml +``` + +**Common Toleration Patterns:** + +| Scenario | Toleration Configuration | +|----------|-------------------------| +| Session-dedicated nodes | `key: "workload-type", value: "session", effect: "NoSchedule"` | +| GPU nodes | `key: "nvidia.com/gpu", operator: "Exists", effect: "NoSchedule"` | +| Cloud provider taints (EKS) | `key: "eks.amazonaws.com/compute-type", operator: "Exists"` | +| Cloud provider taints (GKE) | `key: "cloud.google.com/gke-nodepool", operator: "Exists"` | +| Control plane nodes | `key: "node-role.kubernetes.io/control-plane", operator: "Exists"` | + +**Using nodeSelector as an alternative:** + +If you want the operator to run on specific nodes instead of tolerating taints, use `nodeSelector`: + +```yaml +controllerManager: + nodeSelector: + kubernetes.io/os: linux + node-type: management +``` + +**Verification:** + +After applying tolerations, verify the pod schedules successfully: + +```bash +# Check pod is running +kubectl get pods -n posit-team-system + +# Verify tolerations were applied +kubectl get deployment team-operator-controller-manager -n posit-team-system \ + -o jsonpath='{.spec.template.spec.tolerations}' | jq +``` + --- ## Site Reconciliation Issues From dceea83d53029fb40d9f9b0a70e6d7826e7eb3d3 Mon Sep 17 00:00:00 2001 From: ian-flores Date: Tue, 20 Jan 2026 13:55:19 -0800 Subject: [PATCH 2/3] 
fix: use templated name for metrics service

Replace hardcoded service name with .Values.controllerManager.serviceAccountName
---
 dist/chart/templates/metrics/metrics-service.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dist/chart/templates/metrics/metrics-service.yaml b/dist/chart/templates/metrics/metrics-service.yaml
index 88b9d0bc..acf12180 100644
--- a/dist/chart/templates/metrics/metrics-service.yaml
+++ b/dist/chart/templates/metrics/metrics-service.yaml
@@ -2,7 +2,7 @@
 apiVersion: v1
 kind: Service
 metadata:
-  name: team-operator-controller-manager-metrics-service
+  name: {{ .Values.controllerManager.serviceAccountName }}-metrics-service
   namespace: {{ .Release.Namespace }}
   labels:
     {{- include "chart.labels" . | nindent 4 }}

From d45d8800613ac3942085777d6302e53653e0e713 Mon Sep 17 00:00:00 2001
From: ian-flores
Date: Tue, 20 Jan 2026 14:57:07 -0800
Subject: [PATCH 3/3] fix: correct metrics service DNS name in certificate
 template

The certificate referenced 'team-operator-metrics-service', which does not
match the metrics Service. Since the Service name is now templated as
'{{ .Values.controllerManager.serviceAccountName }}-metrics-service' (see
metrics-service.yaml), use the same template expression here so the
certificate's DNS name stays in sync with the Service for any
serviceAccountName value. A mismatch would cause TLS validation failures
for metrics scraping.
---
 dist/chart/templates/certmanager/certificate.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dist/chart/templates/certmanager/certificate.yaml b/dist/chart/templates/certmanager/certificate.yaml
index 33d2f249..ebe84709 100644
--- a/dist/chart/templates/certmanager/certificate.yaml
+++ b/dist/chart/templates/certmanager/certificate.yaml
@@ -51,7 +51,7 @@ spec:
   dnsNames:
     - team-operator.{{ .Release.Namespace }}.svc
     - team-operator.{{ .Release.Namespace }}.svc.cluster.local
-    - team-operator-metrics-service.{{ .Release.Namespace }}.svc
+    - {{ .Values.controllerManager.serviceAccountName }}-metrics-service.{{ .Release.Namespace }}.svc
   issuerRef:
     kind: Issuer
     name: selfsigned-issuer