From e69c41cf5b7ecb4c28e04d2883b9671c64498b08 Mon Sep 17 00:00:00 2001 From: ian-flores Date: Tue, 20 Jan 2026 13:38:44 -0800 Subject: [PATCH 1/3] fix: remove duplicate metrics service and add tolerations docs - Remove auth_proxy_service.yaml which duplicated metrics-service.yaml - Add troubleshooting docs for pod scheduling failures due to taints --- .../templates/rbac/auth_proxy_service.yaml | 17 ---- docs/guides/troubleshooting.md | 95 +++++++++++++++++++ 2 files changed, 95 insertions(+), 17 deletions(-) delete mode 100755 dist/chart/templates/rbac/auth_proxy_service.yaml diff --git a/dist/chart/templates/rbac/auth_proxy_service.yaml b/dist/chart/templates/rbac/auth_proxy_service.yaml deleted file mode 100755 index b665f0dc..00000000 --- a/dist/chart/templates/rbac/auth_proxy_service.yaml +++ /dev/null @@ -1,17 +0,0 @@ -{{- if .Values.rbac.enable }} -apiVersion: v1 -kind: Service -metadata: - labels: - {{- include "chart.labels" . | nindent 4 }} - name: {{ .Values.controllerManager.serviceAccountName }}-metrics-service - namespace: {{ .Release.Namespace }} -spec: - ports: - - name: https - port: 8443 - protocol: TCP - targetPort: https - selector: - control-plane: controller-manager -{{- end -}} diff --git a/docs/guides/troubleshooting.md b/docs/guides/troubleshooting.md index d8c3bc77..90d79e47 100644 --- a/docs/guides/troubleshooting.md +++ b/docs/guides/troubleshooting.md @@ -6,6 +6,7 @@ This comprehensive guide covers common issues and their solutions when running P 1. [General Debugging](#general-debugging) 2. [Operator Issues](#operator-issues) + - [Operator Pod Stuck in Pending (Scheduling Failures)](#operator-pod-stuck-in-pending-scheduling-failures) 3. [Site Reconciliation Issues](#site-reconciliation-issues) 4. [Database Issues](#database-issues) 5. 
[Product-Specific Issues](#product-specific-issues) @@ -203,6 +204,100 @@ kubectl describe crd sites.core.posit.team helm upgrade --install team-operator ./dist/chart --set installCRDs=true ``` +### Operator Pod Stuck in Pending (Scheduling Failures) + +**Symptoms:** +- Operator pod stays in `Pending` state indefinitely +- `kubectl describe pod` shows taint-related scheduling errors +- Events contain messages like `node(s) had taints that the pod didn't tolerate` + +**Diagnosis:** +```bash +# Check operator pod status +kubectl get pods -n posit-team-system + +# Describe the pod to see scheduling failures +kubectl describe pod -n posit-team-system -l control-plane=controller-manager + +# List node taints to understand what tolerations are needed +kubectl get nodes -o custom-columns=NAME:.metadata.name,TAINTS:.spec.taints +``` + +**Cause:** + +Kubernetes nodes can have [taints](https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/) that prevent pods from scheduling unless the pod has a matching toleration. Common scenarios include: + +- Dedicated node pools for specific workloads (e.g., GPU nodes, session nodes) +- Nodes reserved for critical system components +- Cloud provider managed node pools with default taints + +If the operator pod doesn't have tolerations matching the node taints, it will remain in `Pending` state. 
+ +**Solution:** + +Configure tolerations in your Helm values to match your cluster's node taints: + +```yaml +# values.yaml +controllerManager: + tolerations: + # Example: Tolerate nodes tainted for session workloads + - key: "workload-type" + operator: "Equal" + value: "session" + effect: "NoSchedule" + + # Example: Tolerate nodes with GPU taints + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + + # Example: Tolerate all taints (use with caution) + # - operator: "Exists" +``` + +Apply the configuration: + +```bash +helm upgrade team-operator ./dist/chart \ + --namespace posit-team-system \ + -f values.yaml +``` + +**Common Toleration Patterns:** + +| Scenario | Toleration Configuration | +|----------|-------------------------| +| Session-dedicated nodes | `key: "workload-type", value: "session", effect: "NoSchedule"` | +| GPU nodes | `key: "nvidia.com/gpu", operator: "Exists", effect: "NoSchedule"` | +| Cloud provider taints (EKS) | `key: "eks.amazonaws.com/compute-type", operator: "Exists"` | +| Cloud provider taints (GKE) | `key: "cloud.google.com/gke-nodepool", operator: "Exists"` | +| Control plane nodes | `key: "node-role.kubernetes.io/control-plane", operator: "Exists"` | + +**Using nodeSelector as an alternative:** + +If you want the operator to run on specific nodes instead of tolerating taints, use `nodeSelector`: + +```yaml +controllerManager: + nodeSelector: + kubernetes.io/os: linux + node-type: management +``` + +**Verification:** + +After applying tolerations, verify the pod schedules successfully: + +```bash +# Check pod is running +kubectl get pods -n posit-team-system + +# Verify tolerations were applied +kubectl get deployment team-operator-controller-manager -n posit-team-system \ + -o jsonpath='{.spec.template.spec.tolerations}' | jq +``` + --- ## Site Reconciliation Issues From dceea83d53029fb40d9f9b0a70e6d7826e7eb3d3 Mon Sep 17 00:00:00 2001 From: ian-flores Date: Tue, 20 Jan 2026 13:55:19 -0800 Subject: [PATCH 2/3] 
fix: use templated name for metrics service

Replace hardcoded service name with .Values.controllerManager.serviceAccountName
---
 dist/chart/templates/metrics/metrics-service.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dist/chart/templates/metrics/metrics-service.yaml b/dist/chart/templates/metrics/metrics-service.yaml
index 88b9d0bc..acf12180 100644
--- a/dist/chart/templates/metrics/metrics-service.yaml
+++ b/dist/chart/templates/metrics/metrics-service.yaml
@@ -2,7 +2,7 @@
 apiVersion: v1
 kind: Service
 metadata:
-  name: team-operator-controller-manager-metrics-service
+  name: {{ .Values.controllerManager.serviceAccountName }}-metrics-service
   namespace: {{ .Release.Namespace }}
   labels:
     {{- include "chart.labels" . | nindent 4 }}

From d45d8800613ac3942085777d6302e53653e0e713 Mon Sep 17 00:00:00 2001
From: ian-flores
Date: Tue, 20 Jan 2026 14:57:07 -0800
Subject: [PATCH 3/3] fix: correct metrics service DNS name in certificate
 template

The certificate referenced 'team-operator-metrics-service', which does not
match the metrics Service. Since the Service name is now templated as
'{{ .Values.controllerManager.serviceAccountName }}-metrics-service' (see
metrics-service.yaml), use the same template expression here so the
certificate's DNS name stays in sync with the Service for any
serviceAccountName value. A mismatch would cause TLS validation failures
for metrics scraping.
---
 dist/chart/templates/certmanager/certificate.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dist/chart/templates/certmanager/certificate.yaml b/dist/chart/templates/certmanager/certificate.yaml
index 33d2f249..ebe84709 100644
--- a/dist/chart/templates/certmanager/certificate.yaml
+++ b/dist/chart/templates/certmanager/certificate.yaml
@@ -51,7 +51,7 @@ spec:
   dnsNames:
     - team-operator.{{ .Release.Namespace }}.svc
     - team-operator.{{ .Release.Namespace }}.svc.cluster.local
-    - team-operator-metrics-service.{{ .Release.Namespace }}.svc
+    - {{ .Values.controllerManager.serviceAccountName }}-metrics-service.{{ .Release.Namespace }}.svc
   issuerRef:
     kind: Issuer
     name: selfsigned-issuer