From 464031887222ed144b16e977ebe4c7274b5b5fb1 Mon Sep 17 00:00:00 2001 From: Marcos Amorim Date: Fri, 2 Jan 2026 18:08:40 -0500 Subject: [PATCH 1/7] feat: Add Celery-based distributed task processing Introduce Celery workers for asynchronous processing of resource management operations, enabling horizontal scalability and improved performance for Poolboy. Key changes: - Add Redis as message broker and shared cache backend - Implement Celery workers for ResourcePool, ResourceHandle, and ResourceClaim processing - Add partitioned queues with consistent hashing for event ordering - Implement distributed locking to prevent concurrent resource access - Create unified cache system shared between operator and workers - Add Celery Beat scheduler for periodic reconciliation tasks - Support three operation modes: daemon, scheduler, or both - Add HPA configuration for worker auto-scaling - Maintain backward compatibility with synchronous fallback The implementation uses feature flags to enable workers per resource type, allowing gradual migration and easy rollback. All existing business logic is preserved - workers execute the same class methods that previously ran synchronously in the operator. --- build-template.yaml | 2 + helm/helm-vars-dev.yaml | 80 ++ helm/templates/deployment.yaml | 22 +- helm/templates/metrics-credentials.yaml | 26 +- helm/templates/service-monitor.yaml | 1 + helm/templates/service.yaml | 2 + helm/templates/worker/flower-credentials.yaml | 20 + helm/templates/worker/flower-deployment.yaml | 101 ++ helm/templates/worker/flower-pvc.yaml | 19 + helm/templates/worker/flower-route.yaml | 21 + helm/templates/worker/flower-service.yaml | 21 + helm/templates/worker/redis-deployment.yaml | 57 ++ helm/templates/worker/redis-pvc.yaml | 20 + helm/templates/worker/redis-service.yaml | 19 + helm/templates/worker/scheduler-cm.yaml | 16 + .../worker/scheduler-deployment.yaml | 89 ++ helm/templates/worker/useworkers-cm.yaml | 53 ++ helm/templates/worker/worker-cm.yaml | 37 + helm/templates/worker/worker-deployment.yaml | 91 ++ helm/templates/worker/worker-hpa.yaml | 51 ++ .../worker/worker-service-monitor.yaml | 31 + helm/templates/worker/worker-service.yaml | 22 + helm/values.yaml | 227 ++++- operator/cache.py | 265 ++++++ operator/distributed_lock.py | 267 ++++++ operator/kopfobject.py | 41 +- operator/metrics/__init__.py | 2 + operator/metrics/app_metrics.py | 18 +- operator/metrics/metrics_service.py | 80 +- operator/metrics/timer_decorator.py | 53 +- operator/operator.py | 863 +++++++++--------- operator/poolboy.py | 215 +---- operator/poolboy_worker.py | 36 + operator/processor/__init__.py | 11 + operator/processor/app.py | 431 +++++++++ operator/processor/config.py | 98 ++ operator/resourceclaim.py | 84 +- operator/resourcehandle.py | 219 ++--- operator/resourcepool.py | 89 +- operator/resourceprovider.py | 49 +- operator/resourcewatch.py | 165 +--- operator/scheduler/__init__.py | 3 + operator/scheduler/config_loader.py | 50 + operator/scheduler/registry.py | 103 +++ operator/scheduler/scheduler.py | 91 ++ operator/tasks/__init__.py | 35 + operator/tasks/resourceclaim.py | 207 +++++ operator/tasks/resourcehandle.py | 213 +++++ operator/tasks/resourcepool.py | 136 +++ requirements.txt | 6 +- 50 files changed, 3795 insertions(+), 1063 deletions(-) create mode 100644 helm/helm-vars-dev.yaml create mode 100644 helm/templates/worker/flower-credentials.yaml create mode 100644 helm/templates/worker/flower-deployment.yaml create mode 100644 helm/templates/worker/flower-pvc.yaml create mode 
100644 helm/templates/worker/flower-route.yaml create mode 100644 helm/templates/worker/flower-service.yaml create mode 100644 helm/templates/worker/redis-deployment.yaml create mode 100644 helm/templates/worker/redis-pvc.yaml create mode 100644 helm/templates/worker/redis-service.yaml create mode 100644 helm/templates/worker/scheduler-cm.yaml create mode 100644 helm/templates/worker/scheduler-deployment.yaml create mode 100644 helm/templates/worker/useworkers-cm.yaml create mode 100644 helm/templates/worker/worker-cm.yaml create mode 100644 helm/templates/worker/worker-deployment.yaml create mode 100644 helm/templates/worker/worker-hpa.yaml create mode 100644 helm/templates/worker/worker-service-monitor.yaml create mode 100644 helm/templates/worker/worker-service.yaml create mode 100644 operator/cache.py create mode 100644 operator/distributed_lock.py create mode 100644 operator/poolboy_worker.py create mode 100644 operator/processor/__init__.py create mode 100644 operator/processor/app.py create mode 100644 operator/processor/config.py create mode 100644 operator/scheduler/__init__.py create mode 100644 operator/scheduler/config_loader.py create mode 100644 operator/scheduler/registry.py create mode 100644 operator/scheduler/scheduler.py create mode 100644 operator/tasks/__init__.py create mode 100644 operator/tasks/resourceclaim.py create mode 100644 operator/tasks/resourcehandle.py create mode 100644 operator/tasks/resourcepool.py diff --git a/build-template.yaml b/build-template.yaml index 3f0c168..8c193eb 100644 --- a/build-template.yaml +++ b/build-template.yaml @@ -28,6 +28,8 @@ objects: metadata: name: ${NAME} spec: + successfulBuildsHistoryLimit: 2 + failedBuildsHistoryLimit: 1 output: to: kind: ImageStreamTag diff --git a/helm/helm-vars-dev.yaml b/helm/helm-vars-dev.yaml new file mode 100644 index 0000000..f70e65b --- /dev/null +++ b/helm/helm-vars-dev.yaml @@ -0,0 +1,80 @@ +# Development environment values for Poolboy +# Usage: helm template poolboy-dev helm -f helm/helm-vars-dev.yaml | oc apply -f - + +clusterDomain: apps-crc.testing + +# Use distributed mode with Celery workers for development +operatorMode: distributed + +namespace: + name: poolboy-dev + create: false + +image: + repository: image-registry.openshift-image-registry.svc:5000/poolboy-dev/poolboy + tagOverride: latest + pullPolicy: Always + +# =========================================== +# Redis Configuration +# =========================================== +redis: + enabled: true + +# =========================================== +# Worker Configuration +# =========================================== +worker: + enabled: true + replicas: 1 + hpa: + enabled: false + +# =========================================== +# Scheduler Configuration +# =========================================== +scheduler: + enabled: true + +# =========================================== +# Celery Flower - Task Monitoring UI +# =========================================== +flower: + enabled: true + route: + enabled: true + +# =========================================== +# Feature Flags - Enable Celery Workers +# =========================================== +useWorkers: + resourcePool: + enabled: true + daemonMode: "scheduler" + partitions: 2 + resourceHandle: + enabled: true + daemonMode: "scheduler" + partitions: 4 + resourceClaim: + enabled: true + daemonMode: "scheduler" + partitions: 4 + +# =========================================== +# Scheduled Tasks +# =========================================== +schedules: + maintain-all-pools: + enabled: 
true + schedule: + seconds: 30 + maintain-all-handles: + enabled: true + schedule: + seconds: 60 + maintain-all-claims: + enabled: true + schedule: + seconds: 60 + diff --git a/helm/templates/deployment.yaml b/helm/templates/deployment.yaml index cab8c4c..daf4807 100644 --- a/helm/templates/deployment.yaml +++ b/helm/templates/deployment.yaml @@ -6,17 +6,20 @@ metadata: namespace: {{ include "poolboy.namespaceName" . }} labels: {{- include "poolboy.labels" . | nindent 4 }} + app.kubernetes.io/component: operator spec: replicas: {{ .Values.replicaCount }} selector: matchLabels: {{- include "poolboy.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: operator strategy: type: Recreate template: metadata: labels: {{- include "poolboy.selectorLabels" . | nindent 8 }} + app.kubernetes.io/component: operator spec: containers: - name: manager @@ -33,24 +36,17 @@ spec: value: {{ include "poolboy.operatorDomain" . }} - name: OPERATOR_MODE value: "{{ .Values.operatorMode }}" - {{- if eq .Values.operatorMode "manager" }} - - name: RESOURCE_HANDLER_COUNT - value: "{{ .Values.resourceHandlerCount }}" - {{- if .Values.resourceHandlerResources }} - - name: RESOURCE_HANDLER_RESOURCES - value: {{ .Values.resourceHandlerResources | toJson | quote }} - {{- end }} - {{- if .Values.resourceWatchResources }} - - name: RESOURCE_WATCH_RESOURCES - value: {{ .Values.resourceWatchResources | toJson | quote }} - {{- end }} - {{- end }} {{- if .Values.enablePrometheusMetrics}} - name: METRICS_ENABLED value: "true" {{- end }} - name: RESOURCE_REFRESH_INTERVAL value: "{{ .Values.resourceRefreshInterval }}" + {{- if .Values.worker.enabled }} + envFrom: + - configMapRef: + name: {{ include "poolboy.name" . }}-useworkers-cm + {{- end }} image: "{{ include "poolboy.image" . }}" imagePullPolicy: {{ .Values.image.pullPolicy }} resources: @@ -64,7 +60,7 @@ spec: - name: kopf containerPort: 8080 - name: metrics - containerPort: 9091 + containerPort: 9090 {{- with .Values.imagePullSecrets }} imagePullSecrets: {{- toYaml . | nindent 8 }} diff --git a/helm/templates/metrics-credentials.yaml b/helm/templates/metrics-credentials.yaml index 9172b6d..4f7f9c8 100644 --- a/helm/templates/metrics-credentials.yaml +++ b/helm/templates/metrics-credentials.yaml @@ -1,19 +1,19 @@ {{- if .Values.enablePrometheusMetrics -}} -apiVersion: secretgenerator.mittwald.de/v1alpha1 -kind: StringSecret +{{- $secretName := printf "%s-metrics-credentials" (include "poolboy.name" .) }} +{{- $existingSecret := lookup "v1" "Secret" (include "poolboy.namespaceName" .) $secretName }} +apiVersion: v1 +kind: Secret metadata: - name: {{ include "poolboy.name" . }}-metrics-credentials + name: {{ $secretName }} namespace: {{ include "poolboy.namespaceName" . }} labels: {{- include "poolboy.labels" . 
| nindent 4 }} - annotations: - secret-generator.v1.mittwald.de/type: basic-auth -spec: - forceRegenerate: false - data: - metrics_username: {{ .Values.metrics.username }} - fields: - - fieldName: metrics_password - encoding: "hex" - length: "32" +type: Opaque +data: + metrics_username: {{ .Values.metrics.username | b64enc }} + {{- if and $existingSecret $existingSecret.data.metrics_password }} + metrics_password: {{ $existingSecret.data.metrics_password }} + {{- else }} + metrics_password: {{ randAlphaNum 32 | b64enc }} + {{- end }} {{- end }} diff --git a/helm/templates/service-monitor.yaml b/helm/templates/service-monitor.yaml index 6273228..25cb9ca 100644 --- a/helm/templates/service-monitor.yaml +++ b/helm/templates/service-monitor.yaml @@ -10,6 +10,7 @@ spec: selector: matchLabels: {{- include "poolboy.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: operator namespaceSelector: matchNames: - {{ include "poolboy.namespaceName" . }} diff --git a/helm/templates/service.yaml b/helm/templates/service.yaml index 1424f83..4b08454 100644 --- a/helm/templates/service.yaml +++ b/helm/templates/service.yaml @@ -6,6 +6,7 @@ metadata: namespace: {{ include "poolboy.namespaceName" . }} labels: {{- include "poolboy.labels" . | nindent 4 }} + app.kubernetes.io/component: operator spec: type: {{ .Values.service.type }} {{- with .Values.service.ports }} @@ -14,5 +15,6 @@ spec: {{- end }} selector: {{- include "poolboy.selectorLabels" . | nindent 4 }} + app.kubernetes.io/component: operator sessionAffinity: None {{- end -}} diff --git a/helm/templates/worker/flower-credentials.yaml b/helm/templates/worker/flower-credentials.yaml new file mode 100644 index 0000000..8357a59 --- /dev/null +++ b/helm/templates/worker/flower-credentials.yaml @@ -0,0 +1,20 @@ +{{- if and .Values.flower.enabled .Values.flower.auth.enabled }} +{{- $secretName := printf "%s-flower-credentials" (include "poolboy.name" .) }} +{{- $existingSecret := lookup "v1" "Secret" (include "poolboy.namespaceName" .) $secretName }} +apiVersion: v1 +kind: Secret +metadata: + name: {{ $secretName }} + namespace: {{ include "poolboy.namespaceName" . }} + labels: + {{- include "poolboy.labels" . | nindent 4 }} + app.kubernetes.io/component: flower +type: Opaque +data: + username: {{ .Values.flower.auth.username | default "admin" | b64enc }} + {{- if and $existingSecret $existingSecret.data.password }} + password: {{ $existingSecret.data.password }} + {{- else }} + password: {{ randAlphaNum 32 | b64enc }} + {{- end }} +{{- end }} diff --git a/helm/templates/worker/flower-deployment.yaml b/helm/templates/worker/flower-deployment.yaml new file mode 100644 index 0000000..2729b1e --- /dev/null +++ b/helm/templates/worker/flower-deployment.yaml @@ -0,0 +1,101 @@ +{{- if .Values.flower.enabled }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "poolboy.name" . }}-flower + namespace: {{ include "poolboy.namespaceName" . }} + labels: + {{- include "poolboy.labels" . | nindent 4 }} + app.kubernetes.io/component: flower + annotations: + reloader.stakater.com/auto: "true" +spec: + replicas: {{ .Values.flower.replicas | default 1 }} + selector: + matchLabels: + {{- include "poolboy.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: flower + template: + metadata: + labels: + {{- include "poolboy.selectorLabels" . 
| nindent 8 }} + app.kubernetes.io/component: flower + spec: + containers: + - name: flower + image: "{{ .Values.flower.image.repository }}:{{ .Values.flower.image.tag }}" + imagePullPolicy: {{ .Values.flower.image.pullPolicy | default "IfNotPresent" }} + command: ["celery"] + args: + - "--broker={{ printf "redis://%s-redis:6379/0" (include "poolboy.name" .) }}" + - "--result-backend={{ printf "redis://%s-redis:6379/1" (include "poolboy.name" .) }}" + - "flower" + - "--port={{ .Values.flower.port | default 5555 }}" + {{- if .Values.flower.auth.enabled }} + - "--basic-auth=$(FLOWER_USERNAME):$(FLOWER_PASSWORD)" + {{- end }} + {{- if .Values.flower.config.maxTasks }} + - "--max-tasks={{ .Values.flower.config.maxTasks }}" + {{- end }} + {{- if .Values.flower.config.purgeOfflineWorkers }} + - "--purge_offline_workers={{ .Values.flower.config.purgeOfflineWorkers }}" + {{- end }} + {{- if .Values.flower.persistence.enabled }} + - "--persistent=true" + - "--db=/data/flower.db" + {{- end }} + ports: + - name: flower + containerPort: {{ .Values.flower.port | default 5555 }} + envFrom: + {{- with .Values.flower.extraEnvFrom }} + {{- toYaml . | nindent 12 }} + {{- end }} + env: + {{- range $key, $value := .Values.flower.extraEnvVars }} + - name: {{ $key }} + value: {{ $value | quote }} + {{- end }} + {{- if .Values.flower.auth.enabled }} + - name: FLOWER_USERNAME + valueFrom: + secretKeyRef: + name: {{ include "poolboy.name" . }}-flower-credentials + key: username + - name: FLOWER_PASSWORD + valueFrom: + secretKeyRef: + name: {{ include "poolboy.name" . }}-flower-credentials + key: password + {{- end }} + livenessProbe: + httpGet: + path: /healthcheck + port: flower + initialDelaySeconds: 10 + periodSeconds: 30 + readinessProbe: + httpGet: + path: /healthcheck + port: flower + initialDelaySeconds: 5 + periodSeconds: 10 + resources: + {{- toYaml .Values.flower.resources | nindent 12 }} + volumeMounts: + - name: flower-data + mountPath: /data + volumes: + - name: flower-data + {{- if .Values.flower.persistence.enabled }} + persistentVolumeClaim: + claimName: {{ include "poolboy.name" . }}-flower-pvc + {{- else }} + emptyDir: {} + {{- end }} + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "poolboy.serviceAccountName" . }} +{{- end }} diff --git a/helm/templates/worker/flower-pvc.yaml b/helm/templates/worker/flower-pvc.yaml new file mode 100644 index 0000000..4351cd4 --- /dev/null +++ b/helm/templates/worker/flower-pvc.yaml @@ -0,0 +1,19 @@ +{{- if and .Values.flower.enabled .Values.flower.persistence.enabled }} +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ include "poolboy.name" . }}-flower-pvc + namespace: {{ include "poolboy.namespaceName" . }} + labels: + {{- include "poolboy.labels" . | nindent 4 }} + app.kubernetes.io/component: flower +spec: + accessModes: + - ReadWriteOnce + {{- if .Values.flower.persistence.storageClass }} + storageClassName: {{ .Values.flower.persistence.storageClass }} + {{- end }} + resources: + requests: + storage: {{ .Values.flower.persistence.size }} +{{- end }} diff --git a/helm/templates/worker/flower-route.yaml b/helm/templates/worker/flower-route.yaml new file mode 100644 index 0000000..85ee023 --- /dev/null +++ b/helm/templates/worker/flower-route.yaml @@ -0,0 +1,21 @@ +{{- if and .Values.flower.enabled .Values.flower.route.enabled }} +apiVersion: route.openshift.io/v1 +kind: Route +metadata: + name: {{ include "poolboy.name" . 
}}-flower + namespace: {{ include "poolboy.namespaceName" . }} + labels: + {{- include "poolboy.labels" . | nindent 4 }} + app.kubernetes.io/component: flower +spec: + to: + kind: Service + name: {{ include "poolboy.name" . }}-flower + weight: 100 + port: + targetPort: flower + tls: + termination: edge + insecureEdgeTerminationPolicy: Redirect + wildcardPolicy: None +{{- end }} diff --git a/helm/templates/worker/flower-service.yaml b/helm/templates/worker/flower-service.yaml new file mode 100644 index 0000000..eaead65 --- /dev/null +++ b/helm/templates/worker/flower-service.yaml @@ -0,0 +1,21 @@ +{{- if .Values.flower.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "poolboy.name" . }}-flower + namespace: {{ include "poolboy.namespaceName" . }} + labels: + {{- include "poolboy.labels" . | nindent 4 }} + app.kubernetes.io/component: flower +spec: + type: ClusterIP + ports: + - name: flower + port: {{ .Values.flower.port | default 5555 }} + protocol: TCP + targetPort: flower + selector: + {{- include "poolboy.selectorLabels" . | nindent 4 }} + app.kubernetes.io/component: flower +{{- end }} + diff --git a/helm/templates/worker/redis-deployment.yaml b/helm/templates/worker/redis-deployment.yaml new file mode 100644 index 0000000..6795f4b --- /dev/null +++ b/helm/templates/worker/redis-deployment.yaml @@ -0,0 +1,57 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "poolboy.name" . }}-redis + namespace: {{ include "poolboy.namespaceName" . }} + labels: + {{- include "poolboy.labels" . | nindent 4 }} + app.kubernetes.io/component: redis +spec: + replicas: 1 + selector: + matchLabels: + {{- include "poolboy.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: redis + strategy: + type: Recreate + template: + metadata: + labels: + {{- include "poolboy.selectorLabels" . | nindent 8 }} + app.kubernetes.io/component: redis + spec: + containers: + - name: redis + image: "{{ .Values.redis.image.repository }}:{{ .Values.redis.image.tag }}" + imagePullPolicy: {{ .Values.redis.image.pullPolicy }} + ports: + - containerPort: 6379 + name: redis + command: + - redis-server + - --appendonly + - "yes" + resources: + {{- toYaml .Values.redis.resources | nindent 12 }} + {{- if .Values.redis.persistence.enabled }} + volumeMounts: + - name: redis-data + mountPath: /data + {{- end }} + livenessProbe: + tcpSocket: + port: 6379 + initialDelaySeconds: 5 + periodSeconds: 10 + readinessProbe: + tcpSocket: + port: 6379 + initialDelaySeconds: 5 + periodSeconds: 5 + {{- if .Values.redis.persistence.enabled }} + volumes: + - name: redis-data + persistentVolumeClaim: + claimName: {{ include "poolboy.name" . }}-redis-pvc + {{- end }} + diff --git a/helm/templates/worker/redis-pvc.yaml b/helm/templates/worker/redis-pvc.yaml new file mode 100644 index 0000000..53ca96f --- /dev/null +++ b/helm/templates/worker/redis-pvc.yaml @@ -0,0 +1,20 @@ +{{- if .Values.redis.persistence.enabled }} +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ include "poolboy.name" . }}-redis-pvc + namespace: {{ include "poolboy.namespaceName" . }} + labels: + {{- include "poolboy.labels" . 
| nindent 4 }} + app.kubernetes.io/component: redis +spec: + accessModes: + - ReadWriteOnce + {{- if .Values.redis.persistence.storageClass }} + storageClassName: {{ .Values.redis.persistence.storageClass }} + {{- end }} + resources: + requests: + storage: {{ .Values.redis.persistence.size }} +{{- end }} + diff --git a/helm/templates/worker/redis-service.yaml b/helm/templates/worker/redis-service.yaml new file mode 100644 index 0000000..dfb1bbc --- /dev/null +++ b/helm/templates/worker/redis-service.yaml @@ -0,0 +1,19 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ include "poolboy.name" . }}-redis + namespace: {{ include "poolboy.namespaceName" . }} + labels: + {{- include "poolboy.labels" . | nindent 4 }} + app.kubernetes.io/component: redis +spec: + type: ClusterIP + ports: + - port: 6379 + targetPort: 6379 + protocol: TCP + name: redis + selector: + {{- include "poolboy.selectorLabels" . | nindent 4 }} + app.kubernetes.io/component: redis + diff --git a/helm/templates/worker/scheduler-cm.yaml b/helm/templates/worker/scheduler-cm.yaml new file mode 100644 index 0000000..39b6d10 --- /dev/null +++ b/helm/templates/worker/scheduler-cm.yaml @@ -0,0 +1,16 @@ +{{- /* Scheduler ConfigMap - auto-enabled when operatorMode is 'distributed' */ -}} +{{- if or .Values.scheduler.enabled (eq .Values.operatorMode "distributed") }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "poolboy.name" . }}-scheduler-cm + namespace: {{ include "poolboy.namespaceName" . }} + labels: + {{- include "poolboy.labels" . | nindent 4 }} + app.kubernetes.io/component: scheduler-cm +data: + schedule_config.yaml: | + schedules: + {{- toYaml .Values.schedules | nindent 6 }} +{{- end }} + diff --git a/helm/templates/worker/scheduler-deployment.yaml b/helm/templates/worker/scheduler-deployment.yaml new file mode 100644 index 0000000..043d561 --- /dev/null +++ b/helm/templates/worker/scheduler-deployment.yaml @@ -0,0 +1,89 @@ +{{- /* Scheduler deployment - auto-enabled when operatorMode is 'distributed' */ -}} +{{- if or .Values.scheduler.enabled (eq .Values.operatorMode "distributed") }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "poolboy.name" . }}-scheduler + namespace: {{ include "poolboy.namespaceName" . }} + labels: + {{- include "poolboy.labels" . | nindent 4 }} + app.kubernetes.io/component: scheduler + annotations: + reloader.stakater.com/auto: "true" +spec: + replicas: 1 + selector: + matchLabels: + {{- include "poolboy.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: scheduler + strategy: + type: Recreate + template: + metadata: + labels: + {{- include "poolboy.selectorLabels" . | nindent 8 }} + app.kubernetes.io/component: scheduler + spec: + containers: + - name: scheduler + image: "{{ include "poolboy.image" . }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + command: ["python3", "/opt/app-root/operator/poolboy_worker.py"] + args: + - "-A" + - "processor.app" + - "beat" + - "--loglevel={{ .Values.scheduler.config.logging.level | lower }}" + - "--schedule=/tmp/celerybeat-schedule" + envFrom: + - configMapRef: + name: {{ include "poolboy.name" . }}-worker-cm + {{- with .Values.scheduler.extraEnvFrom }} + {{- toYaml . 
| nindent 12 }} + {{- end }} + env: + {{- range $key, $value := .Values.scheduler.extraEnvVars }} + - name: {{ $key }} + value: {{ $value | quote }} + {{- end }} + - name: CELERY_SCHEDULER_ENABLED + value: "true" + - name: CELERY_SCHEDULE_CONFIG + value: "/etc/poolboy/schedule_config.yaml" + - name: PROMETHEUS_MULTIPROC_DIR + value: "" + - name: WORKER_METRICS_ENABLED + value: "false" + resources: + {{- toYaml .Values.scheduler.resources | nindent 12 }} + volumeMounts: + - name: schedule-data + mountPath: /tmp + - name: scheduler-cm + mountPath: /etc/poolboy + readOnly: true + volumes: + - name: schedule-data + emptyDir: {} + - name: scheduler-cm + configMap: + name: {{ include "poolboy.name" . }}-scheduler-cm + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "poolboy.serviceAccountName" . }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} +{{- end }} + diff --git a/helm/templates/worker/useworkers-cm.yaml b/helm/templates/worker/useworkers-cm.yaml new file mode 100644 index 0000000..ee5c7ba --- /dev/null +++ b/helm/templates/worker/useworkers-cm.yaml @@ -0,0 +1,53 @@ +{{- /* UseWorkers ConfigMap - auto-enabled when operatorMode is 'distributed' */ -}} +{{- if or .Values.worker.enabled (eq .Values.operatorMode "distributed") }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "poolboy.name" . }}-useworkers-cm + namespace: {{ include "poolboy.namespaceName" . }} + labels: + {{- include "poolboy.labels" . | nindent 4 }} + app.kubernetes.io/component: useworkers-cm +data: + # Redis URL (base URL, components append DB number: /0, /1, /2, /3) + REDIS_URL: {{ printf "redis://%s-redis:6379" (include "poolboy.name" .) | quote }} + # Celery config for sending tasks from operator + CELERY_BROKER_URL: {{ printf "redis://%s-redis:6379/0" (include "poolboy.name" .) | quote }} + CELERY_RESULT_BACKEND: {{ printf "redis://%s-redis:6379/1" (include "poolboy.name" .) 
| quote }} + # Lock retry delay (seconds) + WORKERS_LOCK_RETRY_COUNTDOWN: {{ .Values.useWorkers.lockRetryCountdown | default 3 | quote }} + # Error retry delay (seconds) + WORKERS_ERROR_RETRY_COUNTDOWN: {{ .Values.useWorkers.errorRetryCountdown | default 30 | quote }} + # ResourcePool configuration + {{- if .Values.useWorkers.resourcePool.enabled }} + WORKERS_RESOURCE_POOL: "true" + WORKERS_RESOURCE_POOL_DAEMON_MODE: {{ .Values.useWorkers.resourcePool.daemonMode | default "scheduler" | quote }} + {{- if .Values.useWorkers.resourcePool.partitions }} + PARTITION_RESOURCE_POOL: {{ .Values.useWorkers.resourcePool.partitions | quote }} + {{- end }} + {{- end }} + # ResourceHandle configuration + {{- if .Values.useWorkers.resourceHandle.enabled }} + WORKERS_RESOURCE_HANDLE: "true" + WORKERS_RESOURCE_HANDLE_DAEMON_MODE: {{ .Values.useWorkers.resourceHandle.daemonMode | default "scheduler" | quote }} + {{- if .Values.useWorkers.resourceHandle.partitions }} + PARTITION_RESOURCE_HANDLE: {{ .Values.useWorkers.resourceHandle.partitions | quote }} + {{- end }} + {{- end }} + # ResourceClaim configuration + {{- if .Values.useWorkers.resourceClaim.enabled }} + WORKERS_RESOURCE_CLAIM: "true" + WORKERS_RESOURCE_CLAIM_DAEMON_MODE: {{ .Values.useWorkers.resourceClaim.daemonMode | default "scheduler" | quote }} + {{- if .Values.useWorkers.resourceClaim.partitions }} + PARTITION_RESOURCE_CLAIM: {{ .Values.useWorkers.resourceClaim.partitions | quote }} + {{- end }} + {{- end }} + # Other resource types + {{- range $name, $config := .Values.useWorkers }} + {{- if and (kindIs "map" $config) (ne $name "resourcePool") (ne $name "resourceHandle") (ne $name "resourceClaim") (ne $name "lockRetryCountdown") (ne $name "errorRetryCountdown") }} + {{- if $config.enabled }} + WORKERS_{{ $name | snakecase | upper }}: "true" + {{- end }} + {{- end }} + {{- end }} +{{- end }} diff --git a/helm/templates/worker/worker-cm.yaml b/helm/templates/worker/worker-cm.yaml new file mode 100644 index 0000000..2d70452 --- /dev/null +++ b/helm/templates/worker/worker-cm.yaml @@ -0,0 +1,37 @@ +{{- /* Worker ConfigMap - auto-enabled when operatorMode is 'distributed' */ -}} +{{- if or .Values.worker.enabled (eq .Values.operatorMode "distributed") }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "poolboy.name" . }}-worker-cm + namespace: {{ include "poolboy.namespaceName" . }} + labels: + {{- include "poolboy.labels" . | nindent 4 }} + app.kubernetes.io/component: worker-cm +data: + # Redis URL (base URL, components append DB number: /0, /1, /2, /3) + REDIS_URL: {{ printf "redis://%s-redis:6379" (include "poolboy.name" .) | quote }} + CELERY_BROKER_URL: {{ printf "redis://%s-redis:6379/0" (include "poolboy.name" .) | quote }} + CELERY_RESULT_BACKEND: {{ printf "redis://%s-redis:6379/1" (include "poolboy.name" .) | quote }} + # Celery configuration (from worker.config) + {{- range $key, $value := .Values.worker.config }} + CELERY_{{ $key | upper }}: {{ $value | quote }} + {{- end }} + # Partition configuration (from useWorkers) + {{- range $name, $config := .Values.useWorkers }} + {{- if and (kindIs "map" $config) $config.enabled $config.partitions }} + PARTITION_{{ $name | snakecase | upper }}: {{ $config.partitions | quote }} + {{- end }} + {{- end }} + # Operator configuration + WORKER: "true" + CLUSTER_DOMAIN: {{ .Values.clusterDomain | quote }} + OPERATOR_DOMAIN: {{ include "poolboy.operatorDomain" . 
| quote }} + RESOURCE_REFRESH_INTERVAL: {{ .Values.resourceRefreshInterval | quote }} + # Metrics configuration + {{- if .Values.worker.metrics.enabled }} + PROMETHEUS_MULTIPROC_DIR: "/tmp/prometheus_metrics" + WORKER_METRICS_ENABLED: "true" + WORKER_METRICS_PORT: {{ .Values.worker.metrics.port | default 9090 | quote }} + {{- end }} +{{- end }} diff --git a/helm/templates/worker/worker-deployment.yaml b/helm/templates/worker/worker-deployment.yaml new file mode 100644 index 0000000..79e0b87 --- /dev/null +++ b/helm/templates/worker/worker-deployment.yaml @@ -0,0 +1,91 @@ +{{- /* Worker deployment - auto-enabled when operatorMode is 'distributed' */ -}} +{{- if or .Values.worker.enabled (eq .Values.operatorMode "distributed") }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "poolboy.name" . }}-worker + namespace: {{ include "poolboy.namespaceName" . }} + labels: + {{- include "poolboy.labels" . | nindent 4 }} + app.kubernetes.io/component: worker + annotations: + reloader.stakater.com/auto: "true" +spec: + {{- if not .Values.worker.hpa.enabled }} + replicas: {{ .Values.worker.replicas }} + {{- end }} + selector: + matchLabels: + {{- include "poolboy.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: worker + template: + metadata: + labels: + {{- include "poolboy.selectorLabels" . | nindent 8 }} + app.kubernetes.io/component: worker + spec: + containers: + - name: worker + image: "{{ include "poolboy.image" . }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + command: ["python3", "/opt/app-root/operator/poolboy_worker.py"] + args: + - "-A" + - "processor.app" + - "worker" + {{- with .Values.worker.args }} + - "--loglevel={{ .loglevel | default "info" }}" + - "--concurrency={{ .concurrency | default 4 }}" + - "--pool={{ .pool | default "prefork" }}" + - "--max-tasks-per-child={{ .maxTasksPerChild | default 100 }}" + {{- end }} + {{- if .Values.worker.metrics.enabled }} + ports: + - name: worker-metrics + containerPort: {{ .Values.worker.metrics.port | default 9090 }} + {{- end }} + envFrom: + - configMapRef: + name: {{ include "poolboy.name" . }}-worker-cm + {{- with .Values.worker.extraEnvFrom }} + {{- toYaml . | nindent 12 }} + {{- end }} + env: + {{- range $key, $value := .Values.worker.extraEnvVars }} + - name: {{ $key }} + value: {{ $value | quote }} + {{- end }} + - name: OPERATOR_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + {{- if .Values.worker.metrics.enabled }} + volumeMounts: + - name: prometheus-metrics + mountPath: /tmp/prometheus_metrics + {{- end }} + resources: + {{- toYaml .Values.worker.resources | nindent 12 }} + {{- if .Values.worker.metrics.enabled }} + volumes: + - name: prometheus-metrics + emptyDir: {} + {{- end }} + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "poolboy.serviceAccountName" . }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . 
| nindent 8 }} + {{- end }} +{{- end }} diff --git a/helm/templates/worker/worker-hpa.yaml b/helm/templates/worker/worker-hpa.yaml new file mode 100644 index 0000000..056de98 --- /dev/null +++ b/helm/templates/worker/worker-hpa.yaml @@ -0,0 +1,51 @@ +{{- /* Worker HPA - auto-enabled when operatorMode is 'distributed' */ -}} +{{- if and (or .Values.worker.enabled (eq .Values.operatorMode "distributed")) .Values.worker.hpa.enabled }} +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "poolboy.name" . }}-worker + namespace: {{ include "poolboy.namespaceName" . }} + labels: + {{- include "poolboy.labels" . | nindent 4 }} + app.kubernetes.io/component: worker +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "poolboy.name" . }}-worker + minReplicas: {{ .Values.worker.hpa.minReplicas }} + maxReplicas: {{ .Values.worker.hpa.maxReplicas }} + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: {{ .Values.worker.hpa.targetCPUUtilizationPercentage }} + {{- if .Values.worker.hpa.targetMemoryUtilizationPercentage }} + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: {{ .Values.worker.hpa.targetMemoryUtilizationPercentage }} + {{- end }} + behavior: + scaleDown: + stabilizationWindowSeconds: {{ .Values.worker.hpa.scaleDownStabilizationWindowSeconds | default 300 }} + policies: + - type: Pods + value: 1 + periodSeconds: 120 + scaleUp: + stabilizationWindowSeconds: 0 + policies: + - type: Pods + value: 2 + periodSeconds: 60 + - type: Percent + value: 100 + periodSeconds: 60 + selectPolicy: Max +{{- end }} + diff --git a/helm/templates/worker/worker-service-monitor.yaml b/helm/templates/worker/worker-service-monitor.yaml new file mode 100644 index 0000000..a9ec30f --- /dev/null +++ b/helm/templates/worker/worker-service-monitor.yaml @@ -0,0 +1,31 @@ +{{- /* Worker ServiceMonitor - auto-enabled when operatorMode is 'distributed' */ -}} +{{- if and (or .Values.worker.enabled (eq .Values.operatorMode "distributed")) .Values.worker.metrics.enabled .Values.enablePrometheusMetrics }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "poolboy.name" . }}-worker-metrics + namespace: {{ include "poolboy.namespaceName" . }} + labels: + {{- include "poolboy.labels" . | nindent 4 }} + app.kubernetes.io/component: worker +spec: + selector: + matchLabels: + {{- include "poolboy.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: worker + namespaceSelector: + matchNames: + - {{ include "poolboy.namespaceName" . }} + endpoints: + - port: worker-metrics + interval: "30s" + path: {{ .Values.worker.metrics.path | default "/metrics" }} + basicAuth: + username: + name: {{ include "poolboy.name" . }}-metrics-credentials + key: metrics_username + password: + name: {{ include "poolboy.name" . }}-metrics-credentials + key: metrics_password +{{- end }} + diff --git a/helm/templates/worker/worker-service.yaml b/helm/templates/worker/worker-service.yaml new file mode 100644 index 0000000..be2f409 --- /dev/null +++ b/helm/templates/worker/worker-service.yaml @@ -0,0 +1,22 @@ +{{- /* Worker Service - auto-enabled when operatorMode is 'distributed' */ -}} +{{- if and (or .Values.worker.enabled (eq .Values.operatorMode "distributed")) .Values.worker.metrics.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "poolboy.name" . }}-worker + namespace: {{ include "poolboy.namespaceName" . 
}} + labels: + {{- include "poolboy.labels" . | nindent 4 }} + app.kubernetes.io/component: worker +spec: + type: ClusterIP + ports: + - name: worker-metrics + port: {{ .Values.worker.metrics.port | default 9090 }} + protocol: TCP + targetPort: worker-metrics + selector: + {{- include "poolboy.selectorLabels" . | nindent 4 }} + app.kubernetes.io/component: worker +{{- end }} + diff --git a/helm/values.yaml b/helm/values.yaml index af477de..bb4f086 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -20,27 +20,14 @@ operatorDomain: # If not set and create is true, a name is generated using the operatorDomain template name: -operatorMode: manager +# Operator mode: 'standalone' (single operator) or 'distributed' (with Celery workers) +# Backward compatibility: 'all-in-one' maps to 'standalone', 'manager' maps to 'distributed' +operatorMode: distributed manageClaimsInterval: 60 manageHandlesInterval: 60 managePoolsInterval: 10 resourceRefreshInterval: 600 -resourceHandlerCount: 1 -resourceHandlerResources: - limits: - cpu: 1000m - memory: 256Mi - requests: - cpu: 100m - memory: 128Mi -resourceWatchResources: - limits: - cpu: 1000m - memory: 256Mi - requests: - cpu: 100m - memory: 128Mi anarchy: # Control whether anarchy integration should be created @@ -74,9 +61,9 @@ service: type: ClusterIP ports: - name: metrics - port: 9091 + port: 9090 protocol: TCP - targetPort: 9091 + targetPort: 9090 resources: {} # We usually recommend not to specify default resources and to leave this as a conscious @@ -104,3 +91,207 @@ enablePrometheusMetrics: true # Metrics Credentials metrics: username: metrics + +# =========================================== +# Worker Feature Flags +# Enable/disable async processing per resource type +# =========================================== +useWorkers: + # Retry delay (seconds) when a task cannot acquire the distributed lock. + # Lower values = faster retry but more Redis load. + # Higher values = slower retry but less Redis load. + lockRetryCountdown: 3 + + # Retry delay (seconds) when a task fails due to a real error (not lock contention). + # Tasks will retry up to 5 times with this delay between attempts. + # After 5 retries, the task fails permanently. 
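+  # Example: at the default of 30 seconds, a persistently failing task is
+  # retried roughly every 30 seconds and gives up after about 2.5 minutes.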
+ errorRetryCountdown: 30 + + resourcePool: + enabled: false + # Operation mode for periodic tasks: daemon, scheduler, or both + daemonMode: "scheduler" + partitions: 2 + resourceHandle: + enabled: false + # Operation mode for periodic tasks: daemon, scheduler, or both + daemonMode: "scheduler" + partitions: 4 + resourceClaim: + enabled: false + # Operation mode for periodic tasks: daemon, scheduler, or both + daemonMode: "scheduler" + partitions: 4 + resourceProvider: + enabled: false + partitions: 2 + resourceWatch: + enabled: false + partitions: 2 + cleanup: + enabled: false + partitions: 4 + +# =========================================== +# Redis Configuration +# =========================================== +redis: + enabled: true + image: + repository: redis + tag: 7-alpine + pullPolicy: IfNotPresent + persistence: + enabled: true + size: 4Gi + storageClass: "" + resources: + requests: + memory: "256Mi" + cpu: "100m" + limits: + memory: "512Mi" + cpu: "400m" + +# =========================================== +# Worker Configuration +# =========================================== +worker: + enabled: false + replicas: 2 + + # Prometheus metrics endpoint + metrics: + enabled: true + port: 9090 + path: /metrics + + resources: + requests: + memory: "256Mi" + cpu: "100m" + limits: + memory: "512Mi" + cpu: "500m" + + hpa: + enabled: true + minReplicas: 2 + maxReplicas: 10 + targetCPUUtilizationPercentage: 70 + targetMemoryUtilizationPercentage: 80 + scaleDownStabilizationWindowSeconds: 300 + + extraEnvVars: {} + extraEnvFrom: [] + + # Command-line args for celery worker + args: + loglevel: info + concurrency: 4 + pool: prefork + maxTasksPerChild: 100 + + # Celery configuration (CELERY_ prefix added automatically) + # Keys are snake_case matching Celery config names + config: + result_expires: 3600 + result_extended: true + task_ack_late: true + task_default_retry_delay: 60 + task_default_retry_delay_max: 600 + task_reject_on_worker_lost: true + task_soft_time_limit: 1740 + task_time_limit: 1800 + worker_prefetch_multiplier: 1 + worker_send_task_events: true + task_send_sent_event: true + +# =========================================== +# Scheduler Configuration (Celery Beat) +# =========================================== +scheduler: + enabled: false + resources: + requests: + memory: "64Mi" + cpu: "25m" + limits: + memory: "128Mi" + cpu: "100m" + extraEnvVars: {} + extraEnvFrom: [] + config: + logging: + level: INFO + +# =========================================== +# Celery Flower Configuration +# =========================================== +flower: + enabled: false + replicas: 1 + + image: + repository: mher/flower + tag: "2.0" + pullPolicy: IfNotPresent + + # Web UI port + port: 5555 + + # Basic authentication + auth: + enabled: true + username: admin + + resources: + requests: + memory: "64Mi" + cpu: "25m" + limits: + memory: "256Mi" + cpu: "200m" + + # Flower behavior settings + config: + # Maximum number of tasks to keep in memory + maxTasks: 10000 + # Remove offline workers after N seconds (0 = keep forever) + purgeOfflineWorkers: 0 + + # Persistent storage for task history + persistence: + enabled: false + size: 1Gi + storageClass: "" + + # Additional environment variables + extraEnvVars: {} + extraEnvFrom: [] + + # OpenShift Route for external access + route: + enabled: false + +# =========================================== +# Task Schedules +# Supports: seconds (interval) or cron (expression) +# =========================================== +schedules: + maintain-all-pools: + enabled: 
false + schedule: + seconds: 30 + maintain-all-handles: + enabled: false + schedule: + seconds: 60 + maintain-all-claims: + enabled: false + schedule: + seconds: 60 + cleanup-stale-handles: + enabled: false + schedule: + cron: "0 * * * *" diff --git a/operator/cache.py b/operator/cache.py new file mode 100644 index 0000000..ef6e696 --- /dev/null +++ b/operator/cache.py @@ -0,0 +1,265 @@ +""" +Unified cache system for Poolboy. + +Provides a single interface for caching with automatic backend selection: +- MemoryBackend: Used in standalone mode (single process) +- RedisBackend: Used in distributed mode (shared across workers) + +Usage: + from cache import Cache, CacheTag + + # Cache an object + instance.cache_set(CacheTag.HANDLE, name, ttl=300) + + # Retrieve from cache + instance = cls.cache_get(CacheTag.HANDLE, name) + + # Delete from cache + cls.cache_delete(CacheTag.HANDLE, name) +""" + +import fnmatch +import json +import logging +import time +from enum import Enum +from typing import Any, Optional, Protocol + +import redis +from poolboy import Poolboy + +logger = logging.getLogger(__name__) + + +class CacheTag(Enum): + """Tags for cache key namespacing.""" + CLAIM = "claim" + HANDLE = "handle" + HANDLE_BOUND = "handle_bound" + HANDLE_UNBOUND = "handle_unbound" + POOL = "pool" + PROVIDER = "provider" + WATCH = "watch" + WATCH_RESOURCE = "watch_resource" + + +class CacheBackend(Protocol): + """Protocol defining cache backend interface.""" + + def delete(self, key: str) -> None: ... + def delete_pattern(self, pattern: str) -> int: ... + def exists(self, key: str) -> bool: ... + def get(self, key: str) -> Optional[Any]: ... + def keys(self, pattern: str) -> list[str]: ... + def set(self, key: str, value: Any, ttl: int) -> None: ... + + +class MemoryBackend: + """In-memory cache backend for standalone mode.""" + + def __init__(self): + self._cache: dict[str, tuple[Any, float]] = {} + + def _cleanup_expired(self) -> None: + """Remove expired entries.""" + now = time.time() + expired = [k for k, (_, exp) in self._cache.items() if exp <= now] + for k in expired: + del self._cache[k] + + def delete(self, key: str) -> None: + """Delete a key from the cache.""" + self._cache.pop(key, None) + + def delete_pattern(self, pattern: str) -> int: + """Delete all keys matching pattern. Returns count of deleted keys.""" + keys_to_delete = [k for k in self._cache.keys() if fnmatch.fnmatch(k, pattern)] + for k in keys_to_delete: + del self._cache[k] + return len(keys_to_delete) + + def exists(self, key: str) -> bool: + """Check if key exists and is not expired.""" + if key not in self._cache: + return False + _, expires_at = self._cache[key] + if expires_at <= time.time(): + del self._cache[key] + return False + return True + + def get(self, key: str) -> Optional[Any]: + """Get value. Returns Python object directly.""" + if not self.exists(key): + return None + value, _ = self._cache[key] + return value + + def keys(self, pattern: str) -> list[str]: + """Get all keys matching pattern.""" + self._cleanup_expired() + return [k for k in self._cache.keys() if fnmatch.fnmatch(k, pattern)] + + def set(self, key: str, value: Any, ttl: int) -> None: + """Set value with TTL in seconds. 
Stores Python object directly.""" + self._cleanup_expired() + expires_at = time.time() + ttl + self._cache[key] = (value, expires_at) + + +class RedisBackend: + """Redis cache backend for distributed mode.""" + + def __init__(self, url: str): + self._client = redis.from_url(url, decode_responses=True) + + def delete(self, key: str) -> None: + """Delete a key from Redis.""" + try: + self._client.delete(key) + except Exception as e: + logger.warning(f"Redis delete failed for {key}: {e}") + + def delete_pattern(self, pattern: str) -> int: + """Delete all keys matching pattern. Returns count of deleted keys.""" + try: + keys = self._client.keys(pattern) + if keys: + return self._client.delete(*keys) + return 0 + except Exception as e: + logger.warning(f"Redis delete_pattern failed for {pattern}: {e}") + return 0 + + def exists(self, key: str) -> bool: + """Check if key exists in Redis.""" + try: + return bool(self._client.exists(key)) + except Exception as e: + logger.warning(f"Redis exists check failed for {key}: {e}") + return False + + def get(self, key: str) -> Optional[Any]: + """Get value. Returns deserialized dict.""" + try: + data = self._client.get(key) + if data: + return json.loads(data) + return None + except Exception as e: + logger.warning(f"Redis get failed for {key}: {e}") + return None + + def keys(self, pattern: str) -> list[str]: + """Get all keys matching pattern.""" + try: + return self._client.keys(pattern) + except Exception as e: + logger.warning(f"Redis keys failed for {pattern}: {e}") + return [] + + def set(self, key: str, value: Any, ttl: int) -> None: + """Set value with TTL in seconds. Serializes using 'definition' property if available.""" + try: + if hasattr(value, 'definition'): + data = json.dumps(value.definition) + else: + data = json.dumps(value) + self._client.setex(key, ttl, data) + except Exception as e: + logger.warning(f"Redis set failed for {key}: {e}") + + +class CacheManager: + """Unified cache interface with automatic backend selection.""" + + _backend: Optional[CacheBackend] = None + _initialized: bool = False + + @classmethod + def _ensure_initialized(cls) -> None: + """Lazy initialization of backend.""" + if cls._initialized: + return + cls.initialize() + + @classmethod + def _make_key(cls, tag: CacheTag, identifier: str) -> str: + """Build cache key from tag and identifier.""" + return f"poolboy:{tag.value}:{identifier}" + + @classmethod + def delete(cls, tag: CacheTag, identifier: str) -> None: + """Delete a cached value.""" + cls._ensure_initialized() + key = cls._make_key(tag, identifier) + cls._backend.delete(key) + + @classmethod + def delete_by_tag(cls, tag: CacheTag) -> int: + """Delete all cached values for a tag. 
Returns count of deleted keys.""" + cls._ensure_initialized() + pattern = f"poolboy:{tag.value}:*" + return cls._backend.delete_pattern(pattern) + + @classmethod + def exists(cls, tag: CacheTag, identifier: str) -> bool: + """Check if a value exists in the cache.""" + cls._ensure_initialized() + key = cls._make_key(tag, identifier) + return cls._backend.exists(key) + + @classmethod + def get(cls, tag: CacheTag, identifier: str) -> Optional[Any]: + """Get a value from the cache.""" + cls._ensure_initialized() + key = cls._make_key(tag, identifier) + return cls._backend.get(key) + + @classmethod + def get_keys_by_tag(cls, tag: CacheTag) -> list[str]: + """Get all identifiers for a given tag.""" + cls._ensure_initialized() + pattern = f"poolboy:{tag.value}:*" + prefix = f"poolboy:{tag.value}:" + keys = cls._backend.keys(pattern) + return [k[len(prefix):] for k in keys] + + @classmethod + def initialize(cls, standalone: Optional[bool] = None) -> None: + """ + Initialize the cache backend. + + Args: + standalone: Force standalone mode. If None, uses Poolboy.operator_mode_standalone. + """ + if cls._initialized: + return + + if standalone is None: + standalone = Poolboy.operator_mode_standalone + + if standalone: + logger.info("Cache: Using MemoryBackend (standalone mode)") + cls._backend = MemoryBackend() + else: + redis_url = f"{Poolboy.redis_url}/3" + logger.info(f"Cache: Using RedisBackend ({redis_url})") + try: + cls._backend = RedisBackend(redis_url) + except Exception as e: + logger.warning(f"Redis connection failed, falling back to MemoryBackend: {e}") + cls._backend = MemoryBackend() + + cls._initialized = True + + @classmethod + def set(cls, tag: CacheTag, identifier: str, value: Any, ttl: int = 60) -> None: + """Set a value in the cache with TTL in seconds.""" + cls._ensure_initialized() + key = cls._make_key(tag, identifier) + cls._backend.set(key, value, ttl) + + +# Module-level singleton +Cache = CacheManager diff --git a/operator/distributed_lock.py b/operator/distributed_lock.py new file mode 100644 index 0000000..22f4dc0 --- /dev/null +++ b/operator/distributed_lock.py @@ -0,0 +1,267 @@ +""" +Distributed locking for Poolboy using Redis. + +Provides process-safe locking for Celery tasks and other components. +Uses token-based locking to prevent accidental unlock by other processes. +""" + +import logging +import time +import uuid +from contextlib import contextmanager +from typing import Optional + +import redis +from metrics import TimerDecoratorMeta +from poolboy import Poolboy + +logger = logging.getLogger(__name__) + + +class DistributedLockError(Exception): + """Exception raised when distributed lock operations fail.""" + pass + + +class DistributedLock(metaclass=TimerDecoratorMeta): + """ + A distributed lock implementation using Redis. 
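+    Locks are stored in Redis (database 2 of Poolboy.redis_url) under keys
+    prefixed with "poolboy:lock:" and expire after `timeout` seconds, so a
+    crashed worker cannot hold a lock indefinitely.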
+ + Features: + - Token-based ownership (prevents accidental unlock) + - Automatic expiration to prevent deadlocks + - Configurable timeout and retry behavior + - Context manager support + - Lock extension support + + Example: + lock = DistributedLock("resource_pool:default:my-pool") + with lock: + # Critical section + pass + + # Or with acquire check: + lock = DistributedLock("resource_pool:default:my-pool", blocking=False) + if lock.acquire(): + try: + # Critical section + pass + finally: + lock.release() + """ + + _client: Optional[redis.Redis] = None + + def __init__( + self, + key: str, + timeout: int = 300, + blocking: bool = True, + blocking_timeout: float = 10.0, + retry_interval: float = 0.1, + ): + """ + Initialize the distributed lock. + + Args: + key: Unique identifier (prefixed with "poolboy:lock:") + timeout: Lock expiration time (seconds) + blocking: If True, wait for lock acquisition + blocking_timeout: Max time to wait if blocking (seconds) + retry_interval: Time between acquisition attempts (seconds) + """ + # Lazy init if on_startup() wasn't called (e.g., outside worker context) + if self._client is None: + self.on_startup() + self.client = self._client + self.key = f"poolboy:lock:{key}" + self.timeout = timeout + self.blocking = blocking + self.blocking_timeout = blocking_timeout + self.retry_interval = retry_interval + self.token = str(uuid.uuid4()) + self._acquired = False + + def __enter__(self): + """Context manager entry.""" + if not self.acquire(): + raise DistributedLockError( + f"Could not acquire lock '{self.key}' within {self.blocking_timeout}s" + ) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Context manager exit.""" + self.release() + + def _try_acquire(self) -> bool: + """Single attempt to acquire the lock.""" + try: + result = self.client.set( + self.key, + self.token, + nx=True, + ex=self.timeout, + ) + if result: + self._acquired = True + logger.debug(f"Acquired lock: {self.key}") + return True + return False + except redis.RedisError as e: + logger.error(f"Error acquiring lock {self.key}: {e}") + raise DistributedLockError(f"Failed to acquire lock: {e}") + + def acquire(self, blocking: Optional[bool] = None, timeout: Optional[float] = None) -> bool: + """ + Acquire the distributed lock. + + Args: + blocking: Override instance blocking setting + timeout: Override instance blocking_timeout setting + + Returns: + True if lock was acquired, False otherwise + """ + should_block = blocking if blocking is not None else self.blocking + wait_timeout = timeout if timeout is not None else self.blocking_timeout + + if not should_block: + return self._try_acquire() + + start_time = time.time() + while (time.time() - start_time) < wait_timeout: + if self._try_acquire(): + return True + time.sleep(self.retry_interval) + + logger.warning(f"Failed to acquire lock {self.key} within {wait_timeout}s") + return False + + def extend(self, additional_time: Optional[int] = None) -> bool: + """ + Extend the lock expiration time. + + Args: + additional_time: Additional seconds. Defaults to original timeout. 
+ + Returns: + True if extended, False otherwise + """ + if not self._acquired: + return False + + extension = additional_time or self.timeout + + try: + current_token = self.client.get(self.key) + if current_token == self.token: + if self.client.expire(self.key, extension): + logger.debug(f"Extended lock {self.key} by {extension}s") + return True + logger.warning(f"Cannot extend lock {self.key} - not owned or expired") + self._acquired = False + return False + except redis.RedisError as e: + logger.error(f"Error extending lock {self.key}: {e}") + raise DistributedLockError(f"Failed to extend lock: {e}") + + def is_locked(self) -> bool: + """Check if the lock is currently held (by any process).""" + try: + return bool(self.client.exists(self.key)) + except redis.RedisError as e: + raise DistributedLockError(f"Failed to check lock: {e}") + + @classmethod + def on_cleanup(cls) -> None: + """Close Redis client. Called from worker_process_shutdown signal.""" + if cls._client is not None: + try: + cls._client.close() + logger.info("DistributedLock Redis client closed") + except redis.RedisError as e: + logger.warning(f"Error closing Redis client: {e}") + finally: + cls._client = None + + @classmethod + def on_startup(cls) -> None: + """Initialize Redis client. Called from worker_process_init signal.""" + if cls._client is None: + redis_url = f"{Poolboy.redis_url}/2" + cls._client = redis.from_url(redis_url, decode_responses=True) + + @property + def owned(self) -> bool: + """Check if this instance owns the lock.""" + return self._acquired + + def release(self) -> bool: + """ + Release the distributed lock. + + Returns: + True if released, False if not owned by this instance + """ + if not self._acquired: + return False + + try: + current_token = self.client.get(self.key) + if current_token == self.token: + self.client.delete(self.key) + self._acquired = False + logger.debug(f"Released lock: {self.key}") + return True + else: + logger.warning(f"Cannot release lock {self.key} - token mismatch") + self._acquired = False + return False + except redis.RedisError as e: + logger.error(f"Error releasing lock {self.key}: {e}") + raise DistributedLockError(f"Failed to release lock: {e}") + + +@contextmanager +def distributed_lock( + key: str, + timeout: int = 300, + blocking: bool = False, + blocking_timeout: float = 10.0, +): + """ + Resilient context manager for distributed locking. + + Unlike DistributedLock class, this wrapper: + - Never raises DistributedLockError on acquire failure + - Yields acquired: bool + - Always handles cleanup properly + + Example: + with distributed_lock("resource_pool:ns:name") as acquired: + if not acquired: + raise self.retry(countdown=5) + # Critical section + + Note: For advanced usage (e.g., lock.extend()), use DistributedLock class directly. 
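+    By default the lock is acquired non-blocking (blocking=False), so callers
+    should check the yielded flag and retry the task rather than block.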
+ """ + lock = DistributedLock( + key=key, + timeout=timeout, + blocking=blocking, + blocking_timeout=blocking_timeout, + ) + + acquired = False + try: + acquired = lock.acquire() + yield acquired + except DistributedLockError: + yield False + finally: + if acquired: + try: + lock.release() + except DistributedLockError: + pass diff --git a/operator/kopfobject.py b/operator/kopfobject.py index 1e042b6..079a277 100644 --- a/operator/kopfobject.py +++ b/operator/kopfobject.py @@ -1,9 +1,10 @@ import asyncio from datetime import datetime -from typing import List, Mapping +from typing import List, Mapping, Optional, Self import kopf import kubernetes_asyncio +from cache import Cache, CacheTag from metrics.timer_decorator import TimerDecoratorMeta from poolboy import Poolboy @@ -49,6 +50,30 @@ def __str__(self) -> str: def api_group_version(self): return f"{self.api_group}/{self.api_version}" + @classmethod + def cache_delete(cls, tag: CacheTag, key: str) -> None: + """Delete object from cache.""" + Cache.delete(tag, key) + + @classmethod + def cache_get(cls, tag: CacheTag, key: str) -> Optional[Self]: + """ + Get object from cache, reconstructing if needed. + + MemoryBackend returns the Python object directly. + RedisBackend returns a dict that needs reconstruction via from_definition(). + """ + cached = Cache.get(tag, key) + if cached is None: + return None + if isinstance(cached, cls): + return cached + return cls.from_definition(cached) + + def cache_set(self, tag: CacheTag, key: str, ttl: int = 300) -> None: + """Store object in cache with TTL in seconds.""" + Cache.set(tag, key, self, ttl) + @property def creation_datetime(self): return datetime.strptime(self.creation_timestamp, "%Y-%m-%dT%H:%H:%S%z") @@ -57,6 +82,16 @@ def creation_datetime(self): def creation_timestamp(self) -> str: return self.meta['creationTimestamp'] + @property + def definition(self) -> Mapping: + return { + 'apiVersion': self.api_group_version, + 'kind': self.kind, + 'metadata': dict(self.meta), + 'spec': dict(self.spec) if self.spec else {}, + 'status': dict(self.status) if self.status else {}, + } + @property def deletion_timestamp(self) -> str|None: return self.meta.get('deletionTimestamp') @@ -74,6 +109,10 @@ def reference(self) -> Mapping: "namespace": self.namespace, } + @property + def resource_version(self) -> str: + return self.meta.get('resourceVersion', '') + def refresh(self, annotations: kopf.Annotations, labels: kopf.Labels, diff --git a/operator/metrics/__init__.py b/operator/metrics/__init__.py index 53571c5..ab5dbcc 100644 --- a/operator/metrics/__init__.py +++ b/operator/metrics/__init__.py @@ -1,3 +1,5 @@ +"""Prometheus metrics module for Poolboy operator and workers.""" + from .app_metrics import AppMetrics from .metrics_service import MetricsService from .timer_decorator import TimerDecoratorMeta, async_timer, sync_timer diff --git a/operator/metrics/app_metrics.py b/operator/metrics/app_metrics.py index fb1e30b..c0ee755 100644 --- a/operator/metrics/app_metrics.py +++ b/operator/metrics/app_metrics.py @@ -1,25 +1,25 @@ +"""Prometheus metrics definitions for Poolboy.""" + from __future__ import annotations -from aioprometheus import REGISTRY, Counter, Histogram +from prometheus_client import REGISTRY, Counter, Histogram class AppMetrics: + """Central registry for all application metrics.""" + registry = REGISTRY process_time = Histogram( "poolboy_process_time_seconds", "Execution time of processes in the app", - { - "method": "The method name", - "status": "The status of the request", - "app": 
"The application name", - "cluster_domain": "The cluster name", - }, - registry=registry, + ["method", "status", "app", "cluster_domain"], + registry=REGISTRY, ) invalid_resource_counter = Counter( "poolboy_invalid_resource_count", "Counts the number of resources in invalid states", - registry=registry, + ["resource_type", "cluster_domain"], + registry=REGISTRY, ) diff --git a/operator/metrics/metrics_service.py b/operator/metrics/metrics_service.py index 2f2ba46..ea0279d 100644 --- a/operator/metrics/metrics_service.py +++ b/operator/metrics/metrics_service.py @@ -1,26 +1,82 @@ +"""Prometheus metrics HTTP server with multiprocess support.""" + from __future__ import annotations import logging +import os +from pathlib import Path +from threading import Thread +from wsgiref.simple_server import WSGIRequestHandler, make_server -from aioprometheus.service import Service - -from .app_metrics import AppMetrics +from prometheus_client import REGISTRY, CollectorRegistry, multiprocess +from prometheus_client.exposition import ThreadingWSGIServer, make_wsgi_app +from prometheus_client.multiprocess import MultiProcessCollector logger = logging.getLogger(__name__) class MetricsService: - service = Service(registry=AppMetrics.registry) + """HTTP server that exposes Prometheus metrics on /metrics endpoint.""" + + _server = None + _thread = None + _multiproc_dir: Path | None = None @classmethod - async def start(cls, addr="0.0.0.0", port=8000) -> None: - # Reduce logging level for aiohttp to avoid spamming the logs - logging.getLogger("aiohttp").setLevel(logging.ERROR) + def _get_registry(cls) -> CollectorRegistry: + """Return the appropriate collector registry based on environment.""" + multiproc_dir = os.environ.get("PROMETHEUS_MULTIPROC_DIR") + + if multiproc_dir: + cls._multiproc_dir = Path(multiproc_dir) + cls._multiproc_dir.mkdir(parents=True, exist_ok=True) - await cls.service.start(addr=addr, port=port, metrics_url="/metrics") - logger.info(f"Serving metrics on: {cls.service.metrics_url}") + registry = CollectorRegistry() + MultiProcessCollector(registry) + logger.info(f"Using multiprocess registry: {multiproc_dir}") + return registry + + logger.info("Using single-process registry") + return REGISTRY @classmethod - async def stop(cls) -> None: - logger.info("Stopping metrics service") - await cls.service.stop() + def start(cls, port: int = 9090, addr: str = "0.0.0.0") -> None: + """Start the metrics server in a background daemon thread.""" + registry = cls._get_registry() + + app = make_wsgi_app(registry) + cls._server = make_server( + addr, port, app, ThreadingWSGIServer, + handler_class=_SilentHandler + ) + + cls._thread = Thread(target=cls._server.serve_forever, daemon=True) + cls._thread.start() + logger.info(f"Metrics server started on {addr}:{port}") + + @classmethod + def stop(cls) -> None: + """Stop the metrics server and cleanup resources.""" + if cls._server: + cls._server.shutdown() + logger.info("Metrics server stopped") + + cls._cleanup_multiproc() + + @classmethod + def _cleanup_multiproc(cls) -> None: + """Mark current process as dead for multiprocess cleanup.""" + if cls._multiproc_dir and cls._multiproc_dir.exists(): + try: + pid = os.getpid() + multiprocess.mark_process_dead(pid) + logger.debug(f"Marked process {pid} as dead") + except Exception as e: + logger.warning(f"Error cleaning up multiproc: {e}") + + +class _SilentHandler(WSGIRequestHandler): + """WSGI request handler that suppresses access logs.""" + + def log_message(self, format, *args): + pass diff --git 
a/operator/metrics/timer_decorator.py b/operator/metrics/timer_decorator.py index 1e6b486..be2c50e 100644 --- a/operator/metrics/timer_decorator.py +++ b/operator/metrics/timer_decorator.py @@ -1,3 +1,5 @@ +"""Timer decorators for automatic method execution timing.""" + import inspect import os import time @@ -5,9 +7,11 @@ from .app_metrics import AppMetrics -cluster_domain = os.environ.get('CLUSTER_DOMAIN') +CLUSTER_DOMAIN = os.environ.get('CLUSTER_DOMAIN', 'unknown') + def async_timer(app: str): + """Decorator that records execution time of async functions.""" def decorator(func): @wraps(func) async def wrapper(*args, **kwargs): @@ -15,55 +19,60 @@ async def wrapper(*args, **kwargs): status = 'success' try: result = await func(*args, **kwargs) - status = 'success' + return result except Exception as e: status = 'error' raise e finally: duration = time.time() - start_time - method_name = func.__name__ - labels = {'method': method_name, - 'status': status, - 'app': app, - 'cluster_domain': cluster_domain - } - AppMetrics.process_time.observe(labels, duration) - - return result + AppMetrics.process_time.labels( + method=func.__name__, + status=status, + app=app, + cluster_domain=CLUSTER_DOMAIN, + ).observe(duration) return wrapper return decorator def sync_timer(app: str): + """Decorator that records execution time of sync functions.""" def decorator(func): @wraps(func) def wrapper(*args, **kwargs): start_time = time.time() + status = 'success' try: result = func(*args, **kwargs) - status = 'success' + return result except Exception as e: status = 'error' raise e finally: duration = time.time() - start_time - method_name = func.__name__ - labels = {'method': method_name, - 'status': status, - 'app': app, - 'cluster_domain': cluster_domain - } - AppMetrics.process_time.observe(labels, duration) - - return result + AppMetrics.process_time.labels( + method=func.__name__, + status=status, + app=app, + cluster_domain=CLUSTER_DOMAIN, + ).observe(duration) return wrapper return decorator class TimerDecoratorMeta(type): + """Metaclass that applies timer decorators to all public methods.""" + def __new__(cls, name, bases, dct): for attr_name, attr_value in dct.items(): - if isinstance(attr_value, classmethod): + if isinstance(attr_value, staticmethod): + original_method = attr_value.__func__ + if inspect.iscoroutinefunction(original_method): + decorated_method = async_timer(name)(original_method) + else: + decorated_method = sync_timer(name)(original_method) + dct[attr_name] = staticmethod(decorated_method) + elif isinstance(attr_value, classmethod): original_method = attr_value.__func__ if inspect.iscoroutinefunction(original_method): decorated_method = async_timer(name)(original_method) diff --git a/operator/operator.py b/operator/operator.py index 6cf0104..8fdc53e 100755 --- a/operator/operator.py +++ b/operator/operator.py @@ -5,6 +5,7 @@ from typing import Mapping import kopf +from cache import Cache from configure_kopf_logging import configure_kopf_logging from infinite_relative_backoff import InfiniteRelativeBackoff from metrics import MetricsService @@ -26,16 +27,13 @@ async def startup(logger: kopf.ObjectLogger, settings: kopf.OperatorSettings, ** # Never give up from network errors settings.networking.error_backoffs = InfiniteRelativeBackoff() - # Set finalizer based on operator mode - settings.persistence.finalizer = ( - f"{Poolboy.operator_domain}/handler" if Poolboy.operator_mode_resource_handler else - f"{Poolboy.operator_domain}/watch-{Poolboy.resource_watch_name}" if 
Poolboy.operator_mode_resource_watch else - Poolboy.operator_domain - ) + # Simplified finalizer - always use base domain + settings.persistence.finalizer = Poolboy.operator_domain - # Support deprecated resource handler finalizer - if Poolboy.operator_mode_resource_handler: - settings.persistence.deprecated_finalizer = re.compile(re.escape(Poolboy.operator_domain) + '/handler-\d+$') + # Support deprecated finalizers for migration (covers /handler and /handler-N patterns) + settings.persistence.deprecated_finalizer = re.compile( + re.escape(Poolboy.operator_domain) + '/handler(-\\d+)?$' + ) # Store progress in status. settings.persistence.progress_storage = kopf.StatusProgressStorage(field='status.kopf.progress') @@ -48,29 +46,27 @@ async def startup(logger: kopf.ObjectLogger, settings: kopf.OperatorSettings, ** # Configure logging configure_kopf_logging() + # Initialize cache before any preload operations + Cache.initialize(standalone=Poolboy.operator_mode_standalone) await Poolboy.on_startup(logger=logger) if Poolboy.metrics_enabled: - # Start metrics service - await MetricsService.start(port=Poolboy.metrics_port) + # Start metrics service (sync but non-blocking - runs in daemon thread) + MetricsService.start(port=Poolboy.metrics_port) # Preload configuration from ResourceProviders await ResourceProvider.preload(logger=logger) - # Preload for matching ResourceClaim templates - if Poolboy.operator_mode_all_in_one or Poolboy.operator_mode_resource_handler: + # Preload ResourceHandles in standalone mode (distributed mode uses workers) + if Poolboy.operator_mode_standalone: await ResourceHandle.preload(logger=logger) - if Poolboy.operator_mode_resource_handler: - ResourceHandle.start_watch_other() @kopf.on.cleanup() async def cleanup(logger: kopf.ObjectLogger, **_): - if Poolboy.operator_mode_resource_handler: - ResourceHandle.stop_watch_other() await ResourceWatch.stop_all() await Poolboy.on_cleanup() - await MetricsService.stop() + MetricsService.stop() @kopf.on.event(Poolboy.operator_domain, Poolboy.operator_version, 'resourceproviders') async def resource_provider_event(event: Mapping, logger: kopf.ObjectLogger, **_) -> None: @@ -80,435 +76,468 @@ async def resource_provider_event(event: Mapping, logger: kopf.ObjectLogger, **_ else: await ResourceProvider.register(definition=definition, logger=logger) +# Simplified label selector - just ignore resources with ignore label label_selector = f"!{Poolboy.ignore_label}" -if Poolboy.operator_mode_resource_handler: - label_selector += f",{Poolboy.resource_handler_idx_label}={Poolboy.resource_handler_idx}" - -if Poolboy.operator_mode_manager: - # In manager mode just label ResourceClaims, ResourceHandles, and ResourcePools - # to assign the correct handler. 
- @kopf.on.event( - ResourceClaim.api_group, ResourceClaim.api_version, ResourceClaim.plural, - label_selector=label_selector, - ) - async def label_resource_claim( - event: Mapping, - logger: kopf.ObjectLogger, - **_ - ) -> None: - definition = event['object'] - if event['type'] == 'DELETED': - return - resource_claim = ResourceClaim.from_definition(definition) - await resource_claim.assign_resource_handler() - - @kopf.on.event( - ResourceHandle.api_group, ResourceHandle.api_version, ResourceHandle.plural, - label_selector=label_selector, - ) - async def label_resource_handle( - event: Mapping, - logger: kopf.ObjectLogger, - **_ - ) -> None: - definition = event['object'] - if event['type'] == 'DELETED': - return - resource_handle = ResourceHandle.from_definition(definition) - await resource_handle.assign_resource_handler() - - @kopf.on.event( - ResourcePool.api_group, ResourcePool.api_version, ResourcePool.plural, - label_selector=label_selector, - ) - async def label_resource_pool( - event: Mapping, - logger: kopf.ObjectLogger, - **_ - ) -> None: - definition = event['object'] - if event['type'] == 'DELETED': - return - resource_pool = ResourcePool.from_definition(definition) - await resource_pool.assign_resource_handler() - -if( - Poolboy.operator_mode_all_in_one or - Poolboy.operator_mode_resource_handler -): - # Resources are handled in either all-in-one or resource-handler mode. - # The difference is only if labels are used to select which resources to handle. - @kopf.on.create( - ResourceClaim.api_group, ResourceClaim.api_version, ResourceClaim.plural, - label_selector=label_selector, - id='resource_claim_create', - ) - @kopf.on.resume( - ResourceClaim.api_group, ResourceClaim.api_version, ResourceClaim.plural, - label_selector=label_selector, - id='resource_claim_resume', - ) - @kopf.on.update( - ResourceClaim.api_group, ResourceClaim.api_version, ResourceClaim.plural, - label_selector=label_selector, - id='resource_claim_update', +# Resource event handlers - always registered in both standalone and distributed modes +# In distributed mode, handlers dispatch to Celery workers + +@kopf.on.create( + ResourceClaim.api_group, ResourceClaim.api_version, ResourceClaim.plural, + label_selector=label_selector, + id='resource_claim_create', +) +@kopf.on.resume( + ResourceClaim.api_group, ResourceClaim.api_version, ResourceClaim.plural, + label_selector=label_selector, + id='resource_claim_resume', +) +@kopf.on.update( + ResourceClaim.api_group, ResourceClaim.api_version, ResourceClaim.plural, + label_selector=label_selector, + id='resource_claim_update', +) +async def resource_claim_event( + annotations: kopf.Annotations, + labels: kopf.Labels, + logger: kopf.ObjectLogger, + meta: kopf.Meta, + name: str, + namespace: str, + spec: kopf.Spec, + status: kopf.Status, + uid: str, + **_ +): + resource_claim = await ResourceClaim.register( + annotations = annotations, + labels = labels, + meta = meta, + name = name, + namespace = namespace, + spec = spec, + status = status, + uid = uid, ) - async def resource_claim_event( - annotations: kopf.Annotations, - labels: kopf.Labels, - logger: kopf.ObjectLogger, - meta: kopf.Meta, - name: str, - namespace: str, - spec: kopf.Spec, - status: kopf.Status, - uid: str, - **_ - ): - resource_claim = await ResourceClaim.register( - annotations = annotations, - labels = labels, - meta = meta, - name = name, - namespace = namespace, - spec = spec, - status = status, - uid = uid, + + # IMPORTANT: Only dispatch to worker if claim already has a handle. 
+ # Initial binding requires in-memory cache which workers don't have. + # This ensures pool handles are correctly reused. + if Poolboy.workers_resource_claim and resource_claim.has_resource_handle: + from tasks.resourceclaim import dispatch_manage_claim + dispatch_manage_claim( + definition=resource_claim.definition, + name=resource_claim.name, + namespace=resource_claim.namespace, ) + else: await resource_claim.manage(logger=logger) - @kopf.on.delete( - ResourceClaim.api_group, ResourceClaim.api_version, ResourceClaim.plural, - label_selector=label_selector, +@kopf.on.delete( + ResourceClaim.api_group, ResourceClaim.api_version, ResourceClaim.plural, + label_selector=label_selector, +) +async def resource_claim_delete( + annotations: kopf.Annotations, + labels: kopf.Labels, + logger: kopf.ObjectLogger, + meta: kopf.Meta, + name: str, + namespace: str, + spec: kopf.Spec, + status: kopf.Status, + uid: str, + **_ +): + resource_claim = ResourceClaim( + annotations = annotations, + labels = labels, + meta = meta, + name = name, + namespace = namespace, + spec = spec, + status = status, + uid = uid, ) - async def resource_claim_delete( - annotations: kopf.Annotations, - labels: kopf.Labels, - logger: kopf.ObjectLogger, - meta: kopf.Meta, - name: str, - namespace: str, - spec: kopf.Spec, - status: kopf.Status, - uid: str, - **_ - ): - resource_claim = ResourceClaim( - annotations = annotations, - labels = labels, - meta = meta, - name = name, - namespace = namespace, - spec = spec, - status = status, - uid = uid, + + # Delegate to worker if enabled + if Poolboy.workers_resource_claim: + from tasks.resourceclaim import dispatch_delete_claim + dispatch_delete_claim( + definition=resource_claim.definition, + name=resource_claim.name, + namespace=resource_claim.namespace, ) + else: await resource_claim.handle_delete(logger=logger) - await ResourceClaim.unregister(name=name, namespace=namespace) - @kopf.daemon( - ResourceClaim.api_group, ResourceClaim.api_version, ResourceClaim.plural, - cancellation_timeout = 1, - initial_delay = Poolboy.manage_handles_interval, - label_selector=label_selector, + await ResourceClaim.unregister(name=name, namespace=namespace) + +@kopf.daemon( + ResourceClaim.api_group, ResourceClaim.api_version, ResourceClaim.plural, + cancellation_timeout = 1, + initial_delay = Poolboy.manage_handles_interval, + label_selector=label_selector, +) +async def resource_claim_daemon( + annotations: kopf.Annotations, + labels: kopf.Labels, + logger: kopf.ObjectLogger, + meta: kopf.Meta, + name: str, + namespace: str, + spec: kopf.Spec, + status: kopf.Status, + stopped: kopf.DaemonStopped, + uid: str, + **_ +): + resource_claim = await ResourceClaim.register( + annotations = annotations, + labels = labels, + meta = meta, + name = name, + namespace = namespace, + spec = spec, + status = status, + uid = uid, ) - async def resource_claim_daemon( - annotations: kopf.Annotations, - labels: kopf.Labels, - logger: kopf.ObjectLogger, - meta: kopf.Meta, - name: str, - namespace: str, - spec: kopf.Spec, - status: kopf.Status, - stopped: kopf.DaemonStopped, - uid: str, - **_ - ): - resource_claim = await ResourceClaim.register( - annotations = annotations, - labels = labels, - meta = meta, - name = name, - namespace = namespace, - spec = spec, - status = status, - uid = uid, - ) - try: - while not stopped: - description = str(resource_claim) - resource_claim = await resource_claim.refetch() - if not resource_claim: - logger.info(f"{description} found deleted in daemon") - return - if not 
resource_claim.ignore: + try: + while not stopped: + description = str(resource_claim) + resource_claim = await resource_claim.refetch() + if not resource_claim: + logger.info(f"{description} found deleted in daemon") + return + if not resource_claim.ignore: + # Delegate to worker if enabled, daemon mode active, AND claim has handle + # Claims without handle need operator for binding (cache-dependent) + if ( + Poolboy.workers_resource_claim and + resource_claim.has_resource_handle and + Poolboy.workers_resource_claim_daemon_mode in ('daemon', 'both') + ): + from tasks.resourceclaim import dispatch_manage_claim + dispatch_manage_claim( + definition=resource_claim.definition, + name=resource_claim.name, + namespace=resource_claim.namespace, + ) + else: await resource_claim.manage(logger=logger) - await asyncio.sleep(Poolboy.manage_claims_interval) - except asyncio.CancelledError: - pass - - @kopf.on.create( - ResourceHandle.api_group, ResourceHandle.api_version, ResourceHandle.plural, - id='resource_handle_create', - label_selector=label_selector, - ) - @kopf.on.resume( - ResourceHandle.api_group, ResourceHandle.api_version, ResourceHandle.plural, - id='resource_handle_resume', - label_selector=label_selector, - ) - @kopf.on.update( - ResourceHandle.api_group, ResourceHandle.api_version, ResourceHandle.plural, - id='resource_handle_update', - label_selector=label_selector, + await asyncio.sleep(Poolboy.manage_claims_interval) + except asyncio.CancelledError: + pass + +@kopf.on.create( + ResourceHandle.api_group, ResourceHandle.api_version, ResourceHandle.plural, + id='resource_handle_create', + label_selector=label_selector, +) +@kopf.on.resume( + ResourceHandle.api_group, ResourceHandle.api_version, ResourceHandle.plural, + id='resource_handle_resume', + label_selector=label_selector, +) +@kopf.on.update( + ResourceHandle.api_group, ResourceHandle.api_version, ResourceHandle.plural, + id='resource_handle_update', + label_selector=label_selector, +) +async def resource_handle_event( + annotations: kopf.Annotations, + labels: kopf.Labels, + logger: kopf.ObjectLogger, + meta: kopf.Meta, + name: str, + namespace: str, + spec: kopf.Spec, + status: kopf.Status, + uid: str, + **_ +): + resource_handle = await ResourceHandle.register( + annotations = annotations, + labels = labels, + meta = meta, + name = name, + namespace = namespace, + spec = spec, + status = status, + uid = uid, ) - async def resource_handle_event( - annotations: kopf.Annotations, - labels: kopf.Labels, - logger: kopf.ObjectLogger, - meta: kopf.Meta, - name: str, - namespace: str, - spec: kopf.Spec, - status: kopf.Status, - uid: str, - **_ - ): - resource_handle = await ResourceHandle.register( - annotations = annotations, - labels = labels, - meta = meta, - name = name, - namespace = namespace, - spec = spec, - status = status, - uid = uid, + if resource_handle.ignore: + return + if Poolboy.workers_resource_handle: + from tasks.resourcehandle import dispatch_manage_handle + dispatch_manage_handle( + definition=resource_handle.definition, + name=resource_handle.name, + namespace=resource_handle.namespace, ) - if resource_handle.ignore: - return + else: await resource_handle.manage(logger=logger) - @kopf.on.delete( - ResourceHandle.api_group, ResourceHandle.api_version, ResourceHandle.plural, - label_selector=label_selector, +@kopf.on.delete( + ResourceHandle.api_group, ResourceHandle.api_version, ResourceHandle.plural, + label_selector=label_selector, +) +async def resource_handle_delete( + annotations: kopf.Annotations, + 
labels: kopf.Labels, + logger: kopf.ObjectLogger, + meta: kopf.Meta, + name: str, + namespace: str, + spec: kopf.Spec, + status: kopf.Status, + uid: str, + **_ +): + await ResourceHandle.unregister(name) + resource_handle = ResourceHandle( + annotations = annotations, + labels = labels, + meta = meta, + name = name, + namespace = namespace, + spec = spec, + status = status, + uid = uid, ) - async def resource_handle_delete( - annotations: kopf.Annotations, - labels: kopf.Labels, - logger: kopf.ObjectLogger, - meta: kopf.Meta, - name: str, - namespace: str, - spec: kopf.Spec, - status: kopf.Status, - uid: str, - **_ - ): - await ResourceHandle.unregister(name) - resource_handle = ResourceHandle( - annotations = annotations, - labels = labels, - meta = meta, - name = name, - namespace = namespace, - spec = spec, - status = status, - uid = uid, + if resource_handle.ignore: + return + if Poolboy.workers_resource_handle: + from tasks.resourcehandle import dispatch_delete_handle + dispatch_delete_handle( + definition=resource_handle.definition, + name=resource_handle.name, + namespace=resource_handle.namespace, ) - if resource_handle.ignore: - return + else: await resource_handle.handle_delete(logger=logger) - @kopf.daemon( - ResourceHandle.api_group, ResourceHandle.api_version, ResourceHandle.plural, - cancellation_timeout = 1, - initial_delay = Poolboy.manage_handles_interval, - label_selector=label_selector, +@kopf.daemon( + ResourceHandle.api_group, ResourceHandle.api_version, ResourceHandle.plural, + cancellation_timeout = 1, + initial_delay = Poolboy.manage_handles_interval, + label_selector=label_selector, +) +async def resource_handle_daemon( + annotations: kopf.Annotations, + labels: kopf.Labels, + logger: kopf.ObjectLogger, + meta: kopf.Meta, + name: str, + namespace: str, + spec: kopf.Spec, + status: kopf.Status, + stopped: kopf.DaemonStopped, + uid: str, + **_ +): + resource_handle = await ResourceHandle.register( + annotations = annotations, + labels = labels, + meta = meta, + name = name, + namespace = namespace, + spec = spec, + status = status, + uid = uid, ) - async def resource_handle_daemon( - annotations: kopf.Annotations, - labels: kopf.Labels, - logger: kopf.ObjectLogger, - meta: kopf.Meta, - name: str, - namespace: str, - spec: kopf.Spec, - status: kopf.Status, - stopped: kopf.DaemonStopped, - uid: str, - **_ - ): - resource_handle = await ResourceHandle.register( - annotations = annotations, - labels = labels, - meta = meta, - name = name, - namespace = namespace, - spec = spec, - status = status, - uid = uid, - ) - try: - while not stopped: - description = str(resource_handle) - resource_handle = await resource_handle.refetch() - if not resource_handle: - logger.info(f"{description} found deleted in daemon") - return - if not resource_handle.ignore: + try: + while not stopped: + description = str(resource_handle) + resource_handle = await resource_handle.refetch() + if not resource_handle: + logger.info(f"{description} found deleted in daemon") + return + if not resource_handle.ignore: + if Poolboy.workers_resource_handle: + if Poolboy.workers_resource_handle_daemon_mode in ('daemon', 'both'): + from tasks.resourcehandle import dispatch_manage_handle + dispatch_manage_handle( + definition=resource_handle.definition, + name=resource_handle.name, + namespace=resource_handle.namespace, + ) + else: await resource_handle.manage(logger=logger) - await asyncio.sleep(Poolboy.manage_handles_interval) - except asyncio.CancelledError: - pass - - @kopf.on.create( - 
ResourcePool.api_group, ResourcePool.api_version, ResourcePool.plural, - id='resource_pool_create', - label_selector=label_selector, - ) - @kopf.on.resume( - ResourcePool.api_group, ResourcePool.api_version, ResourcePool.plural, - id='resource_pool_resume', - label_selector=label_selector, - ) - @kopf.on.update( - ResourcePool.api_group, ResourcePool.api_version, ResourcePool.plural, - id='resource_pool_update', - label_selector=label_selector, + await asyncio.sleep(Poolboy.manage_handles_interval) + except asyncio.CancelledError: + pass + +@kopf.on.create( + ResourcePool.api_group, ResourcePool.api_version, ResourcePool.plural, + id='resource_pool_create', + label_selector=label_selector, +) +@kopf.on.resume( + ResourcePool.api_group, ResourcePool.api_version, ResourcePool.plural, + id='resource_pool_resume', + label_selector=label_selector, +) +@kopf.on.update( + ResourcePool.api_group, ResourcePool.api_version, ResourcePool.plural, + id='resource_pool_update', + label_selector=label_selector, +) +async def resource_pool_event( + annotations: kopf.Annotations, + labels: kopf.Labels, + logger: kopf.ObjectLogger, + meta: kopf.Meta, + name: str, + namespace: str, + spec: kopf.Spec, + status: kopf.Status, + uid: str, + **_ +): + resource_pool = await ResourcePool.register( + annotations = annotations, + labels = labels, + meta = meta, + name = name, + namespace = namespace, + spec = spec, + status = status, + uid = uid, ) - async def resource_pool_event( - annotations: kopf.Annotations, - labels: kopf.Labels, - logger: kopf.ObjectLogger, - meta: kopf.Meta, - name: str, - namespace: str, - spec: kopf.Spec, - status: kopf.Status, - uid: str, - **_ - ): - resource_pool = await ResourcePool.register( - annotations = annotations, - labels = labels, - meta = meta, - name = name, - namespace = namespace, - spec = spec, - status = status, - uid = uid, + if Poolboy.workers_resource_pool: + from tasks.resourcepool import dispatch_manage_pool + dispatch_manage_pool( + definition=resource_pool.definition, + name=resource_pool.name, + namespace=resource_pool.namespace, ) + else: await resource_pool.manage(logger=logger) - @kopf.on.delete( - Poolboy.operator_domain, Poolboy.operator_version, 'resourcepools', - label_selector=label_selector, +@kopf.on.delete( + Poolboy.operator_domain, Poolboy.operator_version, 'resourcepools', + label_selector=label_selector, +) +async def resource_pool_delete( + annotations: kopf.Annotations, + labels: kopf.Labels, + logger: kopf.ObjectLogger, + meta: kopf.Meta, + name: str, + namespace: str, + spec: kopf.Spec, + status: kopf.Status, + uid: str, + **_ +): + await ResourcePool.unregister(name) + resource_pool = ResourcePool( + annotations = annotations, + labels = labels, + meta = meta, + name = name, + namespace = namespace, + spec = spec, + status = status, + uid = uid, ) - async def resource_pool_delete( - annotations: kopf.Annotations, - labels: kopf.Labels, - logger: kopf.ObjectLogger, - meta: kopf.Meta, - name: str, - namespace: str, - spec: kopf.Spec, - status: kopf.Status, - uid: str, - **_ - ): - await ResourcePool.unregister(name) - resource_pool = ResourcePool( - annotations = annotations, - labels = labels, - meta = meta, - name = name, - namespace = namespace, - spec = spec, - status = status, - uid = uid, + if Poolboy.workers_resource_pool: + from tasks.resourcepool import dispatch_delete_pool_handles + dispatch_delete_pool_handles( + definition=resource_pool.definition, + name=resource_pool.name, + namespace=resource_pool.namespace, ) + else: await 
resource_pool.handle_delete(logger=logger) - @kopf.daemon(Poolboy.operator_domain, Poolboy.operator_version, 'resourcepools', - cancellation_timeout = 1, - initial_delay = Poolboy.manage_pools_interval, - label_selector=label_selector, - ) - async def resource_pool_daemon( - annotations: kopf.Annotations, - labels: kopf.Labels, - logger: kopf.ObjectLogger, - meta: kopf.Meta, - name: str, - namespace: str, - spec: kopf.Spec, - status: kopf.Status, - stopped: kopf.DaemonStopped, - uid: str, - **_ - ): - resource_pool = await ResourcePool.register( - annotations = annotations, - labels = labels, - meta = meta, - name = name, - namespace = namespace, - spec = spec, - status = status, - uid = uid, - ) - if resource_pool.ignore: - return - try: - while not stopped: - await resource_pool.manage(logger=logger) - await asyncio.sleep(Poolboy.manage_pools_interval) - except asyncio.CancelledError: - pass - -if ( - Poolboy.operator_mode_all_in_one or - Poolboy.operator_mode_resource_watch or - Poolboy.operator_mode_manager +@kopf.daemon(Poolboy.operator_domain, Poolboy.operator_version, 'resourcepools', + cancellation_timeout = 1, + initial_delay = Poolboy.manage_pools_interval, + label_selector=label_selector, +) +async def resource_pool_daemon( + annotations: kopf.Annotations, + labels: kopf.Labels, + logger: kopf.ObjectLogger, + meta: kopf.Meta, + name: str, + namespace: str, + spec: kopf.Spec, + status: kopf.Status, + stopped: kopf.DaemonStopped, + uid: str, + **_ ): - @kopf.on.create( - Poolboy.operator_domain, Poolboy.operator_version, 'resourcewatches', - id='resource_watch_create', - label_selector=label_selector, + resource_pool = await ResourcePool.register( + annotations = annotations, + labels = labels, + meta = meta, + name = name, + namespace = namespace, + spec = spec, + status = status, + uid = uid, ) - @kopf.on.resume( - Poolboy.operator_domain, Poolboy.operator_version, 'resourcewatches', - id='resource_watch_resume', - label_selector=label_selector, + if resource_pool.ignore: + return + try: + while not stopped: + description = str(resource_pool) + resource_pool = await resource_pool.refetch() + if not resource_pool: + logger.info(f"{description} found deleted in daemon") + return + + if not resource_pool.ignore: + if Poolboy.workers_resource_pool: + if Poolboy.workers_resource_pool_daemon_mode in ('daemon', 'both'): + from tasks.resourcepool import dispatch_manage_pool + dispatch_manage_pool( + definition=resource_pool.definition, + name=resource_pool.name, + namespace=resource_pool.namespace, + ) + else: + await resource_pool.manage(logger=logger) + + await asyncio.sleep(Poolboy.manage_pools_interval) + except asyncio.CancelledError: + pass + +# ResourceWatch handlers - always start watch directly (no more create_pod) +@kopf.on.create( + Poolboy.operator_domain, Poolboy.operator_version, 'resourcewatches', + id='resource_watch_create', + label_selector=label_selector, +) +@kopf.on.resume( + Poolboy.operator_domain, Poolboy.operator_version, 'resourcewatches', + id='resource_watch_resume', + label_selector=label_selector, +) +async def resource_watch_create_or_resume( + annotations: kopf.Annotations, + labels: kopf.Labels, + logger: kopf.ObjectLogger, + meta: kopf.Meta, + name: str, + namespace: str, + spec: kopf.Spec, + status: kopf.Status, + uid: str, + **_ +): + resource_watch = await ResourceWatch.register( + annotations = annotations, + labels = labels, + meta = meta, + name = name, + namespace = namespace, + spec = spec, + status = status, + uid = uid, ) - async def 
resource_watch_create_or_resume( - annotations: kopf.Annotations, - labels: kopf.Labels, - logger: kopf.ObjectLogger, - meta: kopf.Meta, - name: str, - namespace: str, - spec: kopf.Spec, - status: kopf.Status, - uid: str, - **_ - ): - if (not Poolboy.operator_mode_resource_watch or - Poolboy.resource_watch_name == name - ): - resource_watch = await ResourceWatch.register( - annotations = annotations, - labels = labels, - meta = meta, - name = name, - namespace = namespace, - spec = spec, - status = status, - uid = uid, - ) - if Poolboy.operator_mode_manager: - await resource_watch.create_pod(logger=logger) - else: - await resource_watch.start(logger=logger) + # Always start watch directly (no more create_pod for manager mode) + await resource_watch.start(logger=logger) diff --git a/operator/poolboy.py b/operator/poolboy.py index b1fddb3..a6b7b5c 100644 --- a/operator/poolboy.py +++ b/operator/poolboy.py @@ -1,29 +1,32 @@ import os -from copy import deepcopy -from uuid import UUID import kopf import kubernetes_asyncio -import yaml class Poolboy(): metrics_enabled = os.environ.get('METRICS_ENABLED', 'true').lower() == 'true' - metrics_port = int(os.environ.get('METRICS_PORT', 9091)) + metrics_port = int(os.environ.get('METRICS_PORT', 9090)) manage_claims_interval = int(os.environ.get('MANAGE_CLAIMS_INTERVAL', 60)) manage_handles_interval = int(os.environ.get('MANAGE_HANDLES_INTERVAL', 60)) manage_pools_interval = int(os.environ.get('MANAGE_POOLS_INTERVAL', 10)) - operator_mode = os.environ.get('OPERATOR_MODE', 'all-in-one') - operator_mode_all_in_one = operator_mode == 'all-in-one' - operator_mode_manager = operator_mode == 'manager' - operator_mode_resource_handler = operator_mode == 'resource-handler' - operator_mode_resource_watch = operator_mode == 'resource-watch' + + # Operator mode: 'standalone' or 'distributed' + # Backward compatibility: + # - 'all-in-one' maps to 'standalone' + # - 'manager', 'resource-handler', 'resource-watch' map to 'distributed' + _operator_mode_raw = os.environ.get('OPERATOR_MODE', 'distributed') + operator_mode = ( + 'standalone' if _operator_mode_raw == 'all-in-one' + else 'distributed' if _operator_mode_raw in ('manager', 'resource-handler', 'resource-watch') + else _operator_mode_raw + ) + operator_mode_distributed = operator_mode == 'distributed' + operator_mode_standalone = operator_mode == 'standalone' + operator_domain = os.environ.get('OPERATOR_DOMAIN', 'poolboy.gpte.redhat.com') operator_version = os.environ.get('OPERATOR_VERSION', 'v1') operator_api_version = f"{operator_domain}/{operator_version}" - resource_watch_name = os.environ.get('WATCH_NAME') - resource_handler_count = int(os.environ.get('RESOURCE_HANDLER_COUNT', 1)) - resource_handler_idx = int(os.environ.get('RESOURCE_HANDLER_IDX', 0)) resource_refresh_interval = int(os.environ.get('RESOURCE_REFRESH_INTERVAL', 600)) resource_handle_deleted_annotation = f"{operator_domain}/resource-handle-deleted" resource_claim_name_annotation = f"{operator_domain}/resource-claim-name" @@ -45,9 +48,28 @@ class Poolboy(): resource_requester_user_annotation = f"{operator_domain}/resource-requester-user" resource_requester_preferred_username_annotation = f"{operator_domain}/resource-requester-preferred-username" ignore_label = f"{operator_domain}/ignore" + is_worker = os.environ.get('WORKER', 'false').lower() == 'true' + + # TODO: Remove after all production clusters migrated (used for cleanup only) resource_handler_idx_label = f"{operator_domain}/resource-handler-idx" - resource_handler_resources = 
yaml.safe_load(os.environ['RESOURCE_HANDLER_RESOURCES']) if 'RESOURCE_HANDLER_RESOURCES' in os.environ else None - resource_watch_resources = yaml.safe_load(os.environ['RESOURCE_WATCH_RESOURCES']) if 'RESOURCE_WATCH_RESOURCES' in os.environ else None + + # Worker feature flags (loaded from environment) + # When True, delegate processing to Celery workers + # When False, process synchronously in the main operator (current behavior) + workers_error_retry_countdown = int(os.environ.get('WORKERS_ERROR_RETRY_COUNTDOWN', '30')) + workers_lock_retry_countdown = int(os.environ.get('WORKERS_LOCK_RETRY_COUNTDOWN', '3')) + workers_resource_pool = os.environ.get('WORKERS_RESOURCE_POOL', 'false').lower() == 'true' + workers_resource_pool_daemon_mode = os.environ.get('WORKERS_RESOURCE_POOL_DAEMON_MODE', 'scheduler') + workers_resource_handle = os.environ.get('WORKERS_RESOURCE_HANDLE', 'false').lower() == 'true' + workers_resource_handle_daemon_mode = os.environ.get('WORKERS_RESOURCE_HANDLE_DAEMON_MODE', 'scheduler') + workers_resource_claim = os.environ.get('WORKERS_RESOURCE_CLAIM', 'false').lower() == 'true' + workers_resource_claim_daemon_mode = os.environ.get('WORKERS_RESOURCE_CLAIM_DAEMON_MODE', 'scheduler') + workers_resource_provider = os.environ.get('WORKERS_RESOURCE_PROVIDER', 'false').lower() == 'true' + workers_resource_watch = os.environ.get('WORKERS_RESOURCE_WATCH', 'false').lower() == 'true' + workers_cleanup = os.environ.get('WORKERS_CLEANUP', 'false').lower() == 'true' + + # Redis URL for distributed locking (used by main operator to send tasks) + redis_url = os.environ.get('REDIS_URL') @classmethod async def on_cleanup(cls): @@ -55,6 +77,11 @@ async def on_cleanup(cls): @classmethod async def on_startup(cls, logger: kopf.ObjectLogger): + # Log operator mode on startup + logger.info(f"Poolboy starting in {cls.operator_mode} mode") + if cls.operator_mode_distributed: + logger.info("Distributed mode: delegating to Celery workers") + if os.path.exists('/run/secrets/kubernetes.io/serviceaccount'): kubernetes_asyncio.config.load_incluster_config() with open('/run/secrets/kubernetes.io/serviceaccount/namespace', encoding='utf-8') as f: @@ -74,83 +101,15 @@ async def on_startup(cls, logger: kopf.ObjectLogger): cls.core_v1_api = kubernetes_asyncio.client.CoreV1Api(cls.api_client) cls.custom_objects_api = kubernetes_asyncio.client.CustomObjectsApi(cls.api_client) - if cls.operator_mode == 'manager': - await cls.assign_resource_handlers(logger=logger) - await cls.start_resource_handlers(logger=logger) - elif cls.operator_mode == 'all-in-one': - await cls.clear_resource_handler_assignments(logger=logger) - - @classmethod - async def assign_resource_handlers(cls, logger: kopf.ObjectLogger): - """Label ResourceHandles and ResourcePools to match to appropriate handlers. 
- Clear any extraneous finalizers.""" - for plural in ('resourcehandles', 'resourcepools'): - _continue = None - while True: - obj_list = await Poolboy.custom_objects_api.list_namespaced_custom_object( - group=Poolboy.operator_domain, - namespace=Poolboy.namespace, - plural=plural, - version=Poolboy.operator_version, - _continue = _continue, - limit = 50, - ) - for item in obj_list.get('items', []): - kind = item['kind'] - name = item['metadata']['name'] - patch = [] - resource_handler_idx = int(UUID(item['metadata']['uid'])) % cls.resource_handler_count - if resource_handler_idx != int(item['metadata'].get('labels', {}).get(cls.resource_handler_idx_label, '-1')): - if 'labels' in item['metadata']: - patch.append({ - "op": "add", - "path": "/metadata/labels", - "value": { - cls.resource_handler_idx_label: str(resource_handler_idx) - } - }) - else: - patch.append({ - "op": "add", - "path": f"/metadata/labels/{cls.resource_handler_idx_label.replace('/', '~1')}", - "value": str(resource_handler_idx), - }) - if 'finalizers' in item['metadata']: - clean_finalizers = [ - entry for entry in item['metadata']['finalizers'] - if entry == f"{Poolboy.operator_domain}/resource-handler-{resource_handler_idx}" - or not entry.startswith(f"{Poolboy.operator_domain}/resource-handler-") - ] - if clean_finalizers != item['metadata']['finalizers']: - patch.append({ - "op": "replace", - "path": "/metadata/finalizers", - "value": clean_finalizers, - }) - if patch: - logger.info( - f"Patching {kind} {name} to assign resource handler" - ) - try: - await Poolboy.custom_objects_api.patch_namespaced_custom_object( - group=Poolboy.operator_domain, - name=item['metadata']['name'], - namespace=Poolboy.namespace, - plural=plural, - version=Poolboy.operator_version, - body=patch, - _content_type = 'application/json-patch+json', - ) - except: - logger.exception("Patch failed.") - - _continue = obj_list['metadata'].get('continue') - if not _continue: - break + # Always run migration cleanup on startup + # TODO: Remove after all production clusters migrated + await cls.clear_resource_handler_assignments(logger=logger) + # TODO: Remove after all production clusters migrated @classmethod async def clear_resource_handler_assignments(cls, logger: kopf.ObjectLogger): - """Remove labels and finalizers applied to run in manager mode.""" + """Remove labels and finalizers from legacy manager mode. 
Keep for migration.""" + handler_finalizer = f"{cls.operator_domain}/handler" for plural in ('resourcehandles', 'resourcepools'): _continue = None while True: @@ -172,9 +131,11 @@ async def clear_resource_handler_assignments(cls, logger: kopf.ObjectLogger): "path": f"/metadata/labels/{cls.resource_handler_idx_label.replace('/', '~1')}", }) if 'finalizers' in item['metadata']: + # Clean both /resource-handler-* AND /handler patterns clean_finalizers = [ entry for entry in item['metadata']['finalizers'] if not entry.startswith(f"{Poolboy.operator_domain}/resource-handler-") + and not entry.startswith(handler_finalizer) # covers /handler and /handler-N ] if clean_finalizers != item['metadata']['finalizers']: patch.append({ @@ -202,81 +163,3 @@ async def clear_resource_handler_assignments(cls, logger: kopf.ObjectLogger): _continue = obj_list['metadata'].get('continue') if not _continue: break - - @classmethod - async def start_resource_handlers(cls, logger: kopf.ObjectLogger): - cls.manager_pod = await cls.core_v1_api.read_namespaced_pod( - name=os.environ['HOSTNAME'], - namespace=cls.namespace, - ) - logger.info(f"Manager running in pod {cls.manager_pod.metadata.name}") - for idx in range(Poolboy.resource_handler_count): - replicaset = kubernetes_asyncio.client.V1ReplicaSet( - api_version="apps/v1", - kind="ReplicaSet", - metadata=kubernetes_asyncio.client.V1ObjectMeta( - name=f"{cls.manager_pod.metadata.name}-handler-{idx}", - namespace=cls.namespace, - owner_references=[ - kubernetes_asyncio.client.V1OwnerReference( - api_version=cls.manager_pod.api_version, - controller=True, - kind=cls.manager_pod.kind, - name=cls.manager_pod.metadata.name, - uid=cls.manager_pod.metadata.uid, - ) - ] - ), - ) - replicaset.spec = kubernetes_asyncio.client.V1ReplicaSetSpec( - replicas=1, - selector=kubernetes_asyncio.client.V1LabelSelector( - match_labels={ - "app.kubernetes.io/name": cls.manager_pod.metadata.name, - "app.kubernetes.io/instance": f"handler-{idx}", - }, - ), - template=kubernetes_asyncio.client.V1PodTemplateSpec( - metadata=kubernetes_asyncio.client.V1ObjectMeta( - labels={ - "app.kubernetes.io/name": cls.manager_pod.metadata.name, - "app.kubernetes.io/instance": f"handler-{idx}", - }, - ), - spec=deepcopy(cls.manager_pod.spec), - ), - ) - - replicaset.spec.template.spec.containers[0].env = [ - env_var - for env_var in cls.manager_pod.spec.containers[0].env - if env_var.name not in { - 'OPERATOR_MODE', - 'RESOURCE_HANDLER_RESOURCES', - 'RESOURCE_WATCH_RESOURCES', - } - ] - replicaset.spec.template.spec.containers[0].env.append( - kubernetes_asyncio.client.V1EnvVar( - name='OPERATOR_MODE', - value='resource-handler', - ) - ) - replicaset.spec.template.spec.containers[0].env.append( - kubernetes_asyncio.client.V1EnvVar( - name='RESOURCE_HANDLER_IDX', - value=str(idx), - ) - ) - replicaset.spec.template.spec.node_name = None - if cls.resource_handler_resources: - replicaset.spec.template.spec.containers[0].resources = kubernetes_asyncio.client.V1ResourceRequirements( - limits=cls.resource_handler_resources.get('limits'), - requests=cls.resource_handler_resources.get('requests'), - ) - - replicaset = await cls.apps_v1_api.create_namespaced_replica_set( - namespace=cls.namespace, - body=replicaset, - ) - logger.info(f"Created ReplicaSet {replicaset.metadata.name}") diff --git a/operator/poolboy_worker.py b/operator/poolboy_worker.py new file mode 100644 index 0000000..efd28e2 --- /dev/null +++ b/operator/poolboy_worker.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 +""" +Worker entry point for 
Poolboy. + +This script resolves the naming conflict between the 'operator' directory +and Python's stdlib 'operator' module by: +1. Starting from a neutral directory (not /opt/app-root/operator) +2. Importing Celery FIRST (before our code is in sys.path) +3. Adding our operator directory to sys.path +4. Then loading our Celery app + +Usage: + python poolboy_worker.py worker --loglevel=info + python poolboy_worker.py beat --loglevel=info +""" +import os +import sys + +# Ensure we're not importing from operator directory initially +# This allows Celery and its dependencies to load without conflict +operator_path = '/opt/app-root/operator' +if operator_path in sys.path: + sys.path.remove(operator_path) + +# Now import Celery (and all stdlib dependencies like 'operator' module) +from celery.__main__ import main as celery_main + +# Add our operator directory to path for our app imports +sys.path.insert(0, operator_path) + +# Change to operator directory for relative imports in our code +os.chdir(operator_path) + +if __name__ == '__main__': + celery_main() + diff --git a/operator/processor/__init__.py b/operator/processor/__init__.py new file mode 100644 index 0000000..4e75720 --- /dev/null +++ b/operator/processor/__init__.py @@ -0,0 +1,11 @@ +""" +Celery processor module for Poolboy. + +Imports should be done directly from submodules to avoid circular imports: + from processor.app import app, WorkerState, is_worker_enabled + +This __init__.py intentionally does NOT import from .app to prevent +circular import issues when tasks import processor components. +""" + +__all__ = ['app', 'config'] diff --git a/operator/processor/app.py b/operator/processor/app.py new file mode 100644 index 0000000..eda7f69 --- /dev/null +++ b/operator/processor/app.py @@ -0,0 +1,431 @@ +""" +Worker application for Poolboy. + +Single class that manages: +- Celery app creation and configuration +- Signal handlers (worker lifecycle, task context) +- Task routing to partitioned queues +- Async bridge for running async code in sync tasks + +Note: Celery is an implementation detail, not exposed in public API. +""" + +import asyncio +import os +from contextvars import ContextVar +from functools import lru_cache +from typing import TypeVar + +import aiohttp +from celery import Celery, signals +from celery.utils.log import get_task_logger +from kombu import Queue +from metrics import TimerDecoratorMeta + +from .config import WorkerConfig + +logger = get_task_logger(__name__) +T = TypeVar('T') + + +# ============================================================================= +# TaskRouter - Convention-based routing +# ============================================================================= + +class TaskRouter: + """ + Route tasks to queues based on module naming convention. 
+ + Convention: + - Task module: tasks.{module}.{task_name} + - Resource type: derived from module (resourcepool -> resource_pool) + - Entity name: module without 'resource' prefix (resourcepool -> pool) + - Kwargs: {entity}_name, {entity}_namespace + + Examples: + tasks.resourcepool.create_handles -> queue: resource_pool_0 + (uses pool_name, pool_namespace from kwargs) + tasks.resourceclaim.bind -> queue: resource_claim_2 + (uses claim_name, claim_namespace from kwargs) + tasks.cleanup.delete_old -> queue: cleanup + (no partitioning if PARTITION_CLEANUP not set) + + Configuration: + Partitioning is controlled via environment variables: + - PARTITION_RESOURCE_POOL=4 -> 4 partitions for resource_pool + - PARTITION_RESOURCE_CLAIM=8 -> 8 partitions for resource_claim + - (not set) -> no partitioning, uses simple queue name + """ + + def __call__(self, name: str, args: tuple, kwargs: dict, options: dict, + task=None, **kw) -> dict | None: + """Make router callable for Celery's task_routes.""" + return self.route(name, kwargs) + + def get_entity_from_module(self, module: str) -> str: + """ + Extract entity name from module name. + + Examples: + resourcepool -> pool + resourceclaim -> claim + cleanup -> cleanup + """ + if module.startswith('resource') and len(module) > 8: + return module[8:] # resourcepool -> pool + return module + + def get_partitions(self, resource_type: str) -> int: + """Get number of partitions for a resource type.""" + env_key = f"PARTITION_{resource_type.upper()}" + value = os.environ.get(env_key) + return int(value) if value else 0 + + def get_queue_name(self, resource_type: str, resource_name: str, + namespace: str, partitions: int) -> str: + """Calculate partitioned queue name using consistent hashing.""" + import hashlib + resource_key = f"{namespace}/{resource_name}" + hash_value = int(hashlib.md5(resource_key.encode()).hexdigest(), 16) + partition_index = hash_value % partitions + return f"{resource_type}_{partition_index}" + + def get_resource_type(self, module: str) -> str: + """ + Convert module name to resource type. 
+ + Examples: + resourcepool -> resource_pool + resourceclaim -> resource_claim + cleanup -> cleanup + """ + if module.startswith('resource') and len(module) > 8: + return f"resource_{module[8:]}" + return module + + def parse_task_name(self, name: str) -> tuple[str, str] | None: + """Parse task name to extract module.""" + parts = name.split('.') + if len(parts) >= 3 and parts[0] == 'tasks': + return parts[1], parts[2] + return None + + def route(self, name: str, kwargs: dict) -> dict | None: + """Route a task to appropriate queue based on convention.""" + parsed = self.parse_task_name(name) + if not parsed: + return None + + module, _ = parsed + resource_type = self.get_resource_type(module) + partitions = self.get_partitions(resource_type) + + # No partitioning configured - use default queue + if not partitions: + return {'queue': 'default'} + + # Get resource identifier from kwargs using convention + # Fallback to generic 'name' and 'namespace' if entity-specific not found + entity = self.get_entity_from_module(module) + resource_name = kwargs.get(f'{entity}_name') or kwargs.get('name') + namespace = kwargs.get(f'{entity}_namespace') or kwargs.get('namespace', 'default') + + if resource_name: + queue = self.get_queue_name( + resource_type, resource_name, namespace, partitions + ) + return {'queue': queue} + + # No resource identifier - use default queue + return {'queue': 'default'} + + +# ============================================================================= +# WorkerState - Process-level state management +# ============================================================================= + +# Task context for distributed tracing +task_context: ContextVar[str | None] = ContextVar("task_context", default=None) + + +class WorkerState: + """ + Manages worker process state. + + Uses class-level attributes (like Poolboy) instead of module globals. 
+ Provides clear initialization and cleanup lifecycle with resilience: + - Lazy initialization as fallback + - Max connection age to prevent stale connections + - Automatic reconnect on error + """ + + loop: asyncio.AbstractEventLoop | None = None + k8s_initialized: bool = False + initialized_at: float = 0 + MAX_CONNECTION_AGE: int = 300 # 5 minutes + + @classmethod + def cleanup(cls, log): + """Cleanup resources when worker process shuts down.""" + # Mark as not initialized first to prevent new tasks from starting + cls.k8s_initialized = False + + # Cleanup distributed lock Redis client + from distributed_lock import DistributedLock + DistributedLock.on_cleanup() + + if cls.loop and not cls.loop.is_closed(): + from poolboy import Poolboy + cls.loop.run_until_complete(Poolboy.on_cleanup()) + cls.loop.close() + log.info("Worker state cleaned up") + + cls.loop = None + cls.initialized_at = 0 + + @classmethod + def initialize(cls, log): + """Initialize event loop and K8s client for this worker process.""" + import time + + # Initialize distributed lock Redis client + from distributed_lock import DistributedLock + DistributedLock.on_startup() + + cls.loop = asyncio.new_event_loop() + asyncio.set_event_loop(cls.loop) + + from poolboy import Poolboy + cls.loop.run_until_complete(Poolboy.on_startup(logger=log)) + cls.k8s_initialized = True + cls.initialized_at = time.time() + + @classmethod + def _is_connection_stale(cls) -> bool: + """Check if connection has exceeded max age.""" + import time + if cls.initialized_at == 0: + return True + elapsed = time.time() - cls.initialized_at + return elapsed > cls.MAX_CONNECTION_AGE + + @classmethod + def _ensure_initialized(cls): + """Ensure connection is initialized and fresh (lazy init + max age).""" + not_ready = ( + not cls.k8s_initialized or + cls.loop is None or + cls.loop.is_closed() + ) + if not_ready: + logger.warning("WorkerState not initialized, lazy init...") + cls.initialize(logger) + elif cls._is_connection_stale(): + logger.info("K8s connection stale, refreshing...") + cls.cleanup(logger) + cls.initialize(logger) + + @classmethod + def run_async(cls, coro): + """ + Execute async code in the worker's event loop. + + Features: + - Lazy initialization if not ready + - Automatic refresh if connection is stale + - Automatic reconnect on error + """ + cls._ensure_initialized() + + try: + return cls.loop.run_until_complete(coro) + except aiohttp.ClientError as e: + # Connection error - cleanup stale connection, let Celery retry + logger.warning(f"K8s connection error, cleaning up: {e}") + cls.cleanup(logger) + raise # Celery will retry with fresh connection + # Note: K8sApiException (404, 409, etc.) are API errors, not connection + # errors - they propagate normally for task logic to handle + + +# ============================================================================= +# Helper functions +# ============================================================================= + +def is_worker_enabled(resource_type: str) -> bool: + """ + Check if workers are enabled for a specific resource type. + + Used by the operator to decide whether to dispatch tasks to workers. + + Args: + resource_type: Type of resource (e.g., 'resource_pool') + + Returns: + True if workers are enabled for this resource type. 
+ """ + env_key = f"WORKERS_{resource_type.upper()}" + return os.environ.get(env_key, 'false').lower() == 'true' + + +# ============================================================================= +# WorkerApp +# ============================================================================= + +class WorkerApp(metaclass=TimerDecoratorMeta): + """ + Worker application factory for Poolboy. + + Responsibilities: + - Create and configure worker app from WorkerConfig dataclass + - Setup task queues from environment variables + - Configure task routing via TaskRouter (convention-based) + - Connect signal handlers for worker lifecycle + """ + + def __init__(self, config: WorkerConfig | None = None): + """ + Initialize worker application. + + Args: + config: WorkerConfig instance. If None, creates from env vars. + """ + self.config = config or WorkerConfig() + self.router = TaskRouter() + self.app = Celery('poolboy') + + self._configure_app() + self._configure_queues() + self._connect_signals() + self._setup_autodiscover() + + def _configure_app(self): + """Apply configuration from dataclass.""" + self.app.config_from_object(self.config.to_celery_config()) + + def _configure_queues(self): + """Configure task queues and routing.""" + queue_names = self._get_all_queues() + self.app.conf.task_queues = [Queue(q) for q in queue_names] + self.app.conf.task_default_queue = 'default' + self.app.conf.task_routes = (self.router,) + + def _get_all_queues(self) -> list[str]: + """Generate queue names (default + partitioned).""" + queues = ['default'] + + # Partitioned queues (e.g., 'resource_pool_0', 'resource_pool_1') + config = self._get_partition_config() + for resource_type, partition_count in config.items(): + for i in range(partition_count): + queues.append(f'{resource_type}_{i}') + + return queues + + @staticmethod + @lru_cache(maxsize=1) + def _get_partition_config() -> dict[str, int]: + """Get partition configuration from environment variables.""" + resource_types = [ + 'cleanup', 'resource_claim', 'resource_handle', + 'resource_pool', 'resource_provider', 'resource_watch', + ] + config = {} + for resource_type in resource_types: + env_key = f"PARTITION_{resource_type.upper()}" + value = os.environ.get(env_key) + if value: + config[resource_type] = int(value) + return config + + def _connect_signals(self): + """Connect all signal handlers to the Celery app.""" + signals.worker_init.connect(self._on_worker_init) + signals.worker_shutdown.connect(self._on_worker_shutdown) + signals.worker_process_init.connect(self._on_worker_process_init) + shutdown_signal = signals.worker_process_shutdown + shutdown_signal.connect(self._on_worker_process_shutdown) + signals.task_prerun.connect(self._on_task_prerun) + signals.task_postrun.connect(self._on_task_postrun) + + @staticmethod + def _on_worker_init(**kwargs): + """Initialize metrics server when main worker process starts.""" + if os.environ.get('WORKER_METRICS_ENABLED', 'true').lower() != 'true': + return + + from metrics import MetricsService + + port = int(os.environ.get('WORKER_METRICS_PORT', '9090')) + MetricsService.start(port=port) + logger.info(f"Worker metrics server started on port {port}") + + @staticmethod + def _on_worker_shutdown(**kwargs): + """Stop metrics server and cleanup when worker shuts down.""" + from metrics import MetricsService + + if MetricsService._server is not None: + MetricsService.stop() + logger.info("Worker metrics server stopped") + + @staticmethod + def _on_worker_process_init(**kwargs): + """Initialize event loop and K8s 
client when worker process starts.""" + from cache import Cache + Cache.initialize(standalone=False) + WorkerState.initialize(logger) + + @staticmethod + def _on_worker_process_shutdown(**kwargs): + """Cleanup when worker process shuts down.""" + WorkerState.cleanup(logger) + + @staticmethod + def _on_task_prerun(task_id=None, **kwargs): + """Set task context before execution.""" + if task_id: + task_context.set(task_id) + + @staticmethod + def _on_task_postrun(task_id=None, **kwargs): + """Clear task context after execution.""" + if task_id: + task_context.set(None) + + def _setup_autodiscover(self): + """Configure task autodiscovery.""" + self.app.autodiscover_tasks(['tasks']) + + +# ============================================================================= +# Module-level exports +# ============================================================================= + +# Create singleton and export app +worker_app = WorkerApp() +app = worker_app.app + + +# ============================================================================= +# Beat Schedule Setup (after all tasks are discovered) +# ============================================================================= + +@app.on_after_finalize.connect +def setup_periodic_tasks(sender, **kwargs): + """ + Configure Celery Beat schedule after app is fully initialized. + + This runs after all tasks have been discovered and registered, + avoiding circular import issues. + """ + enabled = os.environ.get('CELERY_SCHEDULER_ENABLED', 'false') + if enabled.lower() != 'true': + return + + # Import tasks to trigger @register_schedule decorators + import tasks # noqa: F401 + from scheduler.scheduler import setup_beat_schedule + + sender.conf.beat_schedule = setup_beat_schedule() + logger.info("Beat schedule configured") diff --git a/operator/processor/config.py b/operator/processor/config.py new file mode 100644 index 0000000..5b269e0 --- /dev/null +++ b/operator/processor/config.py @@ -0,0 +1,98 @@ +""" +Worker configuration dataclass for Poolboy. + +Reads all configuration from environment variables with sensible defaults. +This allows Helm to configure workers via ConfigMaps. +""" + +import os +from dataclasses import dataclass, field, fields + + +def _env_bool(key: str, default: bool = False) -> bool: + """Read boolean from environment variable.""" + return os.environ.get(key, str(default)).lower() == 'true' + + +def _env_int(key: str, default: int) -> int: + """Read integer from environment variable.""" + return int(os.environ.get(key, default)) + + +def _env_str(key: str, default: str) -> str: + """Read string from environment variable.""" + return os.environ.get(key, default) + + +@dataclass +class WorkerConfig: + """ + Worker configuration loaded from environment variables. + + Known fields are defined with explicit types. Additional CELERY_* env vars + are loaded dynamically into _extras and included in to_celery_config(). 
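+
+    Illustrative example (hypothetical value; worker_concurrency is not a
+    declared field, so it is collected dynamically as an extra setting):
+
+        >>> os.environ['CELERY_WORKER_CONCURRENCY'] = '4'
+        >>> WorkerConfig().to_celery_config()['worker_concurrency']
+        4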
+ """ + + # Hardcoded (never change) + accept_content: list[str] = field(default_factory=lambda: ['json']) + broker_connection_retry_on_startup: bool = True + result_serializer: str = 'json' + task_serializer: str = 'json' + # Configurable via env vars + broker_url: str = _env_str( + 'CELERY_BROKER_URL', 'redis://localhost:6379/0') + result_backend: str = _env_str( + 'CELERY_RESULT_BACKEND', 'redis://localhost:6379/1') + result_expires: int = _env_int('CELERY_RESULT_EXPIRES', 3600) + result_extended: bool = _env_bool('CELERY_RESULT_EXTENDED', True) + task_ack_late: bool = _env_bool('CELERY_TASK_ACK_LATE', True) + task_default_retry_delay: int = _env_int( + 'CELERY_TASK_DEFAULT_RETRY_DELAY', 60) + task_default_retry_delay_max: int = _env_int( + 'CELERY_TASK_DEFAULT_RETRY_DELAY_MAX', 600) + task_reject_on_worker_lost: bool = _env_bool( + 'CELERY_TASK_REJECT_ON_WORKER_LOST', True) + task_soft_time_limit: int = _env_int('CELERY_TASK_SOFT_TIME_LIMIT', 1740) + task_time_limit: int = _env_int('CELERY_TASK_TIME_LIMIT', 1800) + worker_prefetch_multiplier: int = _env_int( + 'CELERY_WORKER_PREFETCH_MULTIPLIER', 1) + worker_send_task_events: bool = _env_bool( + 'CELERY_WORKER_SEND_TASK_EVENTS', True) + task_send_sent_event: bool = _env_bool( + 'CELERY_TASK_SEND_SENT_EVENT', True) + # Dynamic extras (populated in __post_init__) + _extras: dict = field(default_factory=dict, init=False, repr=False) + + def __post_init__(self): + """Load additional CELERY_* env vars not defined as fields.""" + known = {f.name for f in fields(self) if not f.name.startswith('_')} + for key, value in os.environ.items(): + if key.startswith('CELERY_'): + field_name = key[7:].lower() # Remove CELERY_ prefix + if field_name not in known: + self._extras[field_name] = self._parse_value(value) + + @staticmethod + def _parse_value(value: str): + """Parse string value to appropriate type using heuristics.""" + if value.lower() in ('true', 'false'): + return value.lower() == 'true' + try: + return int(value) + except ValueError: + pass + try: + return float(value) + except ValueError: + pass + return value + + def to_celery_config(self) -> dict: + """Convert to Celery configuration dict including extras.""" + config = { + f.name: getattr(self, f.name) + for f in fields(self) + if not f.name.startswith('_') + } + config.update(self._extras) + return config diff --git a/operator/resourceclaim.py b/operator/resourceclaim.py index e92d6d7..d1330b5 100644 --- a/operator/resourceclaim.py +++ b/operator/resourceclaim.py @@ -1,21 +1,18 @@ import asyncio - from copy import deepcopy from datetime import datetime, timezone from typing import List, Mapping, TypeVar -from uuid import UUID import kopf import kubernetes_asyncio - +import resourcehandle +import resourceprovider +from cache import Cache, CacheTag from deep_merge import deep_merge from kopfobject import KopfObject from poolboy import Poolboy from poolboy_templating import recursive_process_template_strings -import resourcehandle -import resourceprovider - ResourceClaimT = TypeVar('ResourceClaimT', bound='ResourceClaim') ResourceHandleT = TypeVar('ResourceHandleT', bound='ResourceHandle') ResourceProviderT = TypeVar('ResourceProviderT', bound='ResourceProvider') @@ -43,7 +40,7 @@ def prune_k8s_resource(resource: Mapping) -> Mapping: ret["status"] = { key: value for key, value in resource['status'].items() - if not key in {'diffBase'} + if key not in {'diffBase'} } return ret @@ -53,14 +50,14 @@ class ResourceClaim(KopfObject): kind = "ResourceClaim" plural = "resourceclaims" - 
instances = {} class_lock = asyncio.Lock() @classmethod def __register_definition(cls, definition: Mapping) -> ResourceClaimT: name = definition['metadata']['name'] namespace = definition['metadata']['namespace'] - resource_claim = cls.instances.get((namespace, name)) + cache_key = f"{namespace}/{name}" + resource_claim = cls.cache_get(CacheTag.CLAIM, cache_key) if resource_claim: resource_claim.refresh_from_definition(definition=definition) else: @@ -74,14 +71,17 @@ def __register_definition(cls, definition: Mapping) -> ResourceClaimT: status = definition.get('status', {}), uid = definition['metadata']['uid'], ) - cls.instances[(namespace, name)] = resource_claim + resource_claim.cache_set(CacheTag.CLAIM, cache_key, ttl=300) return resource_claim @classmethod async def get(cls, name: str, namespace: str, use_cache: bool=True) -> ResourceClaimT: async with cls.class_lock: - if use_cache and (namespace, name) in cls.instances: - return cls.instances[(namespace, name)] + cache_key = f"{namespace}/{name}" + if use_cache: + cached = cls.cache_get(CacheTag.CLAIM, cache_key) + if cached: + return cached definition = await Poolboy.custom_objects_api.get_namespaced_custom_object( group=cls.api_group, name=name, @@ -106,7 +106,8 @@ async def register( uid: str, ) -> ResourceClaimT: async with cls.class_lock: - resource_claim = cls.instances.get((namespace, name)) + cache_key = f"{namespace}/{name}" + resource_claim = cls.cache_get(CacheTag.CLAIM, cache_key) if resource_claim: resource_claim.refresh( annotations = annotations, @@ -127,7 +128,7 @@ async def register( status = status, uid = uid, ) - cls.instances[(namespace, name)] = resource_claim + resource_claim.cache_set(CacheTag.CLAIM, cache_key, ttl=300) return resource_claim @classmethod @@ -141,7 +142,11 @@ async def register_definition( @classmethod async def unregister(cls, name: str, namespace: str) -> ResourceClaimT|None: async with cls.class_lock: - return cls.instances.pop((namespace, name), None) + cache_key = f"{namespace}/{name}" + resource_claim = cls.cache_get(CacheTag.CLAIM, cache_key) + if resource_claim: + Cache.delete(CacheTag.CLAIM, cache_key) + return resource_claim @property def approval_state(self) -> str|None: @@ -181,11 +186,11 @@ def has_spec_resources(self) -> bool: def have_resource_providers(self) -> bool: """Return whether this ResourceClaim has ResourceProviders assigned for all resources.""" if not self.status \ - or not 'resources' in self.status \ + or 'resources' not in self.status \ or len(self.spec.get('resources', [])) > len(self.status.get('resources', [])): return False for resource in self.status.get('resources', []): - if not 'provider' in resource: + if 'provider' not in resource: return False return True @@ -299,11 +304,6 @@ def resource_handle_namespace(self): return None return self.status.get('resourceHandle', {}).get('namespace') - @property - def resource_handler_idx(self) -> int: - """Label value used to select which resource handler pod should manage this ResourceClaim.""" - return int(UUID(self.uid)) % Poolboy.resource_handler_count - @property def resource_pool_name(self): if not self.annotations: @@ -346,34 +346,6 @@ def validation_failed(self) -> bool: return True return False - async def assign_resource_handler(self): - """Apply label to indicate resource handler should manage this ResourceClaim. 
- Do not change label on items which are deleting.""" - if ( - self.deletion_timestamp is None and - self.labels.get(Poolboy.resource_handler_idx_label) != str(self.resource_handler_idx) - ): - try: - patch = [{ - "op": "test", - "path": "/metadata/deletionTimestamp", - "value": None, - }] - patch.append({ - "op": "add", - "path": f"/metadata/labels/{Poolboy.resource_handler_idx_label.replace('/', '~1')}", - "value": str(self.resource_handler_idx), - } if self.labels else { - "op": "add", - "path": f"/metadata/labels", - "value": { - Poolboy.resource_handler_idx_label: str(self.resource_handler_idx), - } - }) - await self.json_patch(patch) - except kubernetes_asyncio.client.exceptions.ApiException as exception: - pass - async def bind_resource_handle(self, logger: kopf.ObjectLogger, resource_claim_resources: List[Mapping], @@ -473,7 +445,7 @@ def check_auto_detach(self, logger, resource_handle, resource_provider): def get_resource_state_from_status(self, resource_index): if not self.status \ - or not 'resources' in self.status \ + or 'resources' not in self.status \ or resource_index >= len(self.status['resources']): return None return self.status['resources'][resource_index].get('state') @@ -505,7 +477,7 @@ async def update_status_from_handle(self, # Adjust requested end if unchanged from default if not self.requested_lifespan_end_datetime \ or lifespan_default_timedelta == self.requested_lifespan_end_datetime - self.lifespan_start_datetime: - logger.info(f"Resetting default lifespan end on first ready") + logger.info("Resetting default lifespan end on first ready") await self.set_requested_lifespan_end( datetime.now(timezone.utc) + lifespan_default_timedelta ) @@ -671,7 +643,7 @@ async def assign_resource_providers(self, logger) -> None: elif 'template' in resource: provider = resourceprovider.ResourceProvider.find_provider_by_template_match(resource['template']) else: - raise kopf.TemporaryError(f"ResourceClaim spec.resources require either an explicit provider or a resource template to match.", delay=600) + raise kopf.TemporaryError("ResourceClaim spec.resources require either an explicit provider or a resource template to match.", delay=600) providers.append(provider) await self.merge_patch_status({ @@ -780,7 +752,7 @@ async def manage(self, logger) -> None: f"{self} has both spec.provider and spec.resources!", delay = 600 ) - if not 'provider' in self.status: + if 'provider' not in self.status: await self.merge_patch_status({ "provider": { "name": self.resource_provider_name_from_spec @@ -818,7 +790,7 @@ async def manage(self, logger) -> None: }) if resource_provider.approval_required: - if not 'approval' in self.status: + if 'approval' not in self.status: await self.merge_patch_status({ "approval": { "message": resource_provider.approval_pending_message, @@ -959,7 +931,7 @@ async def __manage_resource_handle(self, set_lifespan_end_timestamp = set_lifespan_end.strftime('%FT%TZ') - if not 'lifespan' in resource_handle.spec: + if 'lifespan' not in resource_handle.spec: logger.info(f"Setting lifespan end for {resource_handle} to {set_lifespan_end_timestamp}") patch.append({ "op": "add", diff --git a/operator/resourcehandle.py b/operator/resourcehandle.py index e0e5974..fae445e 100644 --- a/operator/resourcehandle.py +++ b/operator/resourcehandle.py @@ -1,23 +1,19 @@ import asyncio -import logging - from copy import deepcopy from datetime import datetime, timedelta, timezone from typing import Any, List, Mapping, TypeVar -from uuid import UUID import jinja2 import jsonpointer import kopf 
import kubernetes_asyncio -import pytimeparse - import poolboy_k8s +import pytimeparse import resourceclaim import resourcepool import resourceprovider import resourcewatch - +from cache import Cache, CacheTag from kopfobject import KopfObject from poolboy import Poolboy from poolboy_templating import recursive_process_template_strings, timedelta_to_str @@ -66,7 +62,7 @@ def __lt__(self, cmp): # Prefer unknown readiness state to known unready state if self.resource_handle.is_ready is None and cmp.resource_handle.is_ready is False: return True - if not self.resource_handle.is_ready is False and cmp.resource_handle.is_ready is None: + if self.resource_handle.is_ready is not False and cmp.resource_handle.is_ready is None: return False # Prefer older matches @@ -78,16 +74,12 @@ class ResourceHandle(KopfObject): kind = "ResourceHandle" plural = "resourcehandles" - all_instances = {} - bound_instances = {} - unbound_instances = {} class_lock = asyncio.Lock() - watch_other_task = None @classmethod def __register_definition(cls, definition: Mapping) -> ResourceHandleT: name = definition['metadata']['name'] - resource_handle = cls.all_instances.get(name) + resource_handle = cls.cache_get(CacheTag.HANDLE, name) if resource_handle: resource_handle.refresh_from_definition(definition=definition) else: @@ -113,7 +105,8 @@ async def bind_handle_to_claim( ) -> ResourceHandleT|None: async with cls.class_lock: # Check if there is already an assigned claim - resource_handle = cls.bound_instances.get((resource_claim.namespace, resource_claim.name)) + bound_key = f"{resource_claim.namespace}/{resource_claim.name}" + resource_handle = cls.cache_get(CacheTag.HANDLE_BOUND, bound_key) if resource_handle: if await resource_handle.refetch(): logger.warning(f"Rebinding {resource_handle} to {resource_claim}") @@ -124,7 +117,10 @@ async def bind_handle_to_claim( # Loop through unbound instances to find best match matches = [] - for resource_handle in cls.unbound_instances.values(): + for name in Cache.get_keys_by_tag(CacheTag.HANDLE_UNBOUND): + resource_handle = cls.cache_get(CacheTag.HANDLE_UNBOUND, name) + if not resource_handle: + continue # Skip unhealthy if resource_handle.is_healthy is False: continue @@ -391,12 +387,8 @@ async def create_for_claim(cls, version = Poolboy.operator_version, ) resource_handle = cls.from_definition(definition) - if ( - Poolboy.operator_mode_all_in_one or ( - Poolboy.operator_mode_resource_handler and - Poolboy.resource_handler_idx == resource_handle.resource_handler_idx - ) - ): + # Register in standalone mode (no handler partitioning) + if Poolboy.operator_mode_standalone: resource_handle.__register() logger.info( f"Created ResourceHandle {resource_handle.name} for " @@ -465,12 +457,8 @@ async def create_for_pool( version = Poolboy.operator_version, ) resource_handle = cls.from_definition(definition) - if ( - Poolboy.operator_mode_all_in_one or ( - Poolboy.operator_mode_resource_handler and - Poolboy.resource_handler_idx == resource_handle.resource_handler_idx - ) - ): + # Register in standalone mode (no handler partitioning) + if Poolboy.operator_mode_standalone: resource_handle.__register() logger.info(f"Created ResourceHandle {resource_handle.name} for ResourcePool {resource_pool.name}") return resource_handle @@ -481,7 +469,9 @@ async def delete_unbound_handles_for_pool( logger: kopf.ObjectLogger, resource_pool: ResourcePoolT, ) -> List[ResourceHandleT]: - if Poolboy.operator_mode_all_in_one: + # Workers always fetch from API (no shared memory cache) + use_cache = 
Poolboy.operator_mode_standalone + if use_cache: async with cls.class_lock: resource_handles = [] for resource_handle in list(cls.unbound_instances.values()): @@ -510,8 +500,10 @@ async def delete_unbound_handles_for_pool( @classmethod async def get(cls, name: str, ignore_deleting=True, use_cache=True) -> ResourceHandleT|None: async with cls.class_lock: - if use_cache and name in cls.all_instances: - return cls.all_instances[name] + if use_cache: + cached = cls.cache_get(CacheTag.HANDLE, name) + if cached: + return cached definition = await Poolboy.custom_objects_api.get_namespaced_custom_object( group=Poolboy.operator_domain, @@ -528,12 +520,7 @@ async def get(cls, name: str, ignore_deleting=True, use_cache=True) -> ResourceH @classmethod def get_from_cache(cls, name: str) -> ResourceHandleT|None: - return cls.all_instances.get(name) - - @classmethod - def start_watch_other(cls) -> None: - logger = logging.getLogger('watch_other_handles') - cls.watch_other_task = asyncio.create_task(cls.watch_other(logger)) + return cls.cache_get(CacheTag.HANDLE, name) @classmethod async def get_unbound_handles_for_pool( @@ -541,15 +528,23 @@ async def get_unbound_handles_for_pool( resource_pool: ResourcePoolT, logger: kopf.ObjectLogger, ) -> List[ResourceHandleT]: + """Get unbound handles for a pool.""" resource_handles = [] - if Poolboy.operator_mode_all_in_one: + + # In standalone mode, use cache (Memory or Redis) + # In distributed mode, fetch from K8s API to ensure completeness + # (cache may not be fully populated if operator just started) + if Poolboy.operator_mode_standalone: async with cls.class_lock: - for resource_handle in ResourceHandle.unbound_instances.values(): - if resource_handle.resource_pool_name == resource_pool.name \ + for name in Cache.get_keys_by_tag(CacheTag.HANDLE_UNBOUND): + resource_handle = cls.cache_get(CacheTag.HANDLE_UNBOUND, name) + if resource_handle \ + and resource_handle.resource_pool_name == resource_pool.name \ and resource_handle.resource_pool_namespace == resource_pool.namespace: resource_handles.append(resource_handle) - return resource_handles + return resource_handles + # Distributed mode: fetch from K8s API and cache for other workers _continue = None while True: resource_handle_list = await Poolboy.custom_objects_api.list_namespaced_custom_object( @@ -558,12 +553,15 @@ async def get_unbound_handles_for_pool( namespace=Poolboy.namespace, plural='resourcehandles', version=Poolboy.operator_version, - _continue = _continue, - limit = 50, + _continue=_continue, + limit=50, ) for definition in resource_handle_list['items']: resource_handle = cls.from_definition(definition) if not resource_handle.is_bound: + # Cache for other workers + resource_handle.cache_set(CacheTag.HANDLE, resource_handle.name, ttl=300) + resource_handle.cache_set(CacheTag.HANDLE_UNBOUND, resource_handle.name, ttl=300) resource_handles.append(resource_handle) _continue = resource_handle_list['metadata'].get('continue') if not _continue: @@ -602,7 +600,7 @@ async def register( uid: str, ) -> ResourceHandleT: async with cls.class_lock: - resource_handle = cls.all_instances.get(name) + resource_handle = cls.cache_get(CacheTag.HANDLE, name) if resource_handle: resource_handle.refresh( annotations = annotations, @@ -631,83 +629,41 @@ async def register_definition(cls, definition: Mapping) -> ResourceHandleT: async with cls.class_lock: return cls.__register_definition(definition) - @classmethod - async def stop_watch_other(cls) -> None: - if cls.watch_other_task is None: - return - 
cls.watch_other_task.cancel() - await cls.watch_other_task - @classmethod async def unregister(cls, name: str) -> ResourceHandleT|None: async with cls.class_lock: - resource_handle = cls.all_instances.pop(name, None) + resource_handle = cls.cache_get(CacheTag.HANDLE, name) if resource_handle: resource_handle.__unregister() return resource_handle - - @classmethod - async def watch_other(cls, logger) -> None: - while True: - try: - # FIXME - clear stale cache entries - await cls.__watch_other(logger) - except kubernetes_asyncio.client.exceptions.ApiException as exception: - if exception.status != 410: - logger.exception("Error watching other resourcehandles") - await asyncio.sleep(10) - except: - logger.exception("Error watching other resourcehandles") - await asyncio.sleep(10) - - @classmethod - async def __watch_other(cls, logger) -> None: - watch = kubernetes_asyncio.watch.Watch() - async for event in watch.stream( - Poolboy.custom_objects_api.list_namespaced_custom_object, - group=cls.api_group, - label_selector=f"!{Poolboy.ignore_label},{Poolboy.resource_handler_idx_label}!={Poolboy.resource_handler_idx}", - namespace=Poolboy.namespace, - plural=cls.plural, - version=cls.api_version, - ): - event_obj = event['object'] - event_type = event['type'] - if event_type == 'DELETED': - await cls.unregister(event_obj['metadata']['name']) - else: - await cls.register_definition(event_obj) + return None def __str__(self) -> str: return f"ResourceHandle {self.name}" def __register(self) -> None: """ - Add ResourceHandle to register of bound or unbound instances. + Add ResourceHandle to cache of bound or unbound instances. This method must be called with the ResourceHandle.lock held. """ # Ensure deleting resource handles are not cached if self.is_deleting: self.__unregister() return - self.all_instances[self.name] = self + self.cache_set(CacheTag.HANDLE, self.name, ttl=300) if self.is_bound: - self.bound_instances[( - self.resource_claim_namespace, - self.resource_claim_name - )] = self - self.unbound_instances.pop(self.name, None) + bound_key = f"{self.resource_claim_namespace}/{self.resource_claim_name}" + Cache.set(CacheTag.HANDLE_BOUND, bound_key, self, ttl=300) + Cache.delete(CacheTag.HANDLE_UNBOUND, self.name) else: - self.unbound_instances[self.name] = self + self.cache_set(CacheTag.HANDLE_UNBOUND, self.name, ttl=300) def __unregister(self) -> None: - self.all_instances.pop(self.name, None) - self.unbound_instances.pop(self.name, None) + Cache.delete(CacheTag.HANDLE, self.name) + Cache.delete(CacheTag.HANDLE_UNBOUND, self.name) if self.is_bound: - self.bound_instances.pop( - (self.resource_claim_namespace, self.resource_claim_name), - None, - ) + bound_key = f"{self.resource_claim_namespace}/{self.resource_claim_name}" + Cache.delete(CacheTag.HANDLE_BOUND, bound_key) @property def guid(self) -> str: @@ -820,11 +776,6 @@ def resource_claim_name(self) -> str|None: def resource_claim_namespace(self) -> str|None: return self.spec.get('resourceClaim', {}).get('namespace') - @property - def resource_handler_idx(self) -> int: - """Label value used to select which resource handler pod should manage this ResourceHandle.""" - return int(UUID(self.uid)) % Poolboy.resource_handler_count - @property def resource_pool_name(self) -> str|None: if 'resourcePool' in self.spec: @@ -939,9 +890,9 @@ async def __manage_init_status_resources(self, return await self.json_patch_status(patch) return - except kubernetes_asyncio.client.exceptions.ApiException as exception: + except 
kubernetes_asyncio.client.exceptions.ApiException as e: if attempt > 2: - logger.exception(f"{self} failed status patch: {patch}") + logger.warning(f"{self} status patch failed ({e.status}): {patch}") raise attempt += 1 @@ -976,7 +927,7 @@ async def __manage_update_spec_resources(self, resource_handle = self, ) - if not 'resources' in self.spec: + if 'resources' not in self.spec: await self.json_patch([{ "op": "add", "path": "/spec/resources", @@ -1072,34 +1023,6 @@ def set_resource_state(self, resource_index: int, value: Mapping|None) -> None: else: self.status['resources'][resource_index]['state'] = value - async def assign_resource_handler(self): - """Apply label to indicate resource handler should manage this ResourceHandle. - Do not change label on items which are deleting.""" - if ( - self.deletion_timestamp is None and - self.labels.get(Poolboy.resource_handler_idx_label) != str(self.resource_handler_idx) - ): - try: - patch = [{ - "op": "test", - "path": "/metadata/deletionTimestamp", - "value": None, - }] - patch.append({ - "op": "add", - "path": f"/metadata/labels/{Poolboy.resource_handler_idx_label.replace('/', '~1')}", - "value": str(self.resource_handler_idx), - } if self.labels else { - "op": "add", - "path": f"/metadata/labels", - "value": { - Poolboy.resource_handler_idx_label: str(self.resource_handler_idx), - } - }) - await self.json_patch(patch) - except kubernetes_asyncio.client.exceptions.ApiException as exception: - pass - async def get_resource_claim(self, not_found_okay: bool) -> ResourceClaimT|None: if not self.is_bound: return None @@ -1107,7 +1030,7 @@ async def get_resource_claim(self, not_found_okay: bool) -> ResourceClaimT|None: return await resourceclaim.ResourceClaim.get( name = self.resource_claim_name, namespace = self.resource_claim_namespace, - use_cache = Poolboy.operator_mode_all_in_one, + use_cache = Poolboy.operator_mode_standalone, ) except kubernetes_asyncio.client.exceptions.ApiException as e: if e.status == 404 and not_found_okay: @@ -1150,7 +1073,7 @@ async def get_resource_states(self) -> List[Mapping]: name=reference['name'], namespace=reference.get('namespace'), not_found_okay=True, - use_cache=Poolboy.operator_mode_all_in_one, + use_cache=Poolboy.operator_mode_standalone, ) ) return resource_states @@ -1405,7 +1328,21 @@ async def manage(self, logger: kopf.ObjectLogger) -> None: if exception.status != 409: raise - if resource_claim: + # Update handle status with resource states immediately after changes. + # This is only needed in worker context where ResourceWatch timing + # may be unreliable due to running in separate processes. 
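+        # (Poolboy.is_worker is assumed to be set by the Celery worker
+        # entrypoint; in standalone mode this branch is skipped and status
+        # updates continue to flow through ResourceWatch.)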
+ if Poolboy.is_worker and (resources_to_create or patch): + # Refetch to sync in-memory object with API after patches were applied + await self.refetch() + # Re-fetch claim to ensure we have the latest version + if self.is_bound: + resource_claim = await self.get_resource_claim(not_found_okay=True) + await self.update_status( + logger=logger, + resource_states=resource_states, + resource_claim=resource_claim, + ) + elif resource_claim: await resource_claim.update_status_from_handle( logger=logger, resource_handle=self, @@ -1559,21 +1496,23 @@ async def update_status(self, "path": "/status/summary", "value": status_summary, }) - except kubernetes_asyncio.client.exceptions.ApiException: - logger.exception( - f"Failed to get ResourceProvider {self.resource_provider_name} for {self}" + except kubernetes_asyncio.client.exceptions.ApiException as e: + logger.warning( + f"Failed to get ResourceProvider {self.resource_provider_name} " + f"for {self} ({e.status})" ) except Exception: logger.exception(f"Failed to generate status summary for {self}") if patch: + patch_attempt = 0 while True: try: await self.json_patch_status(patch) break - except kubernetes_asyncio.client.exceptions.ApiException: + except kubernetes_asyncio.client.exceptions.ApiException as e: patch_attempt += 1 if patch_attempt > 5: - logger.exception(f"Failed to patch status on {self}") + logger.warning(f"Failed to patch status on {self} ({e.status})") return await asyncio.sleep(0.2) diff --git a/operator/resourcepool.py b/operator/resourcepool.py index 2ef7b47..c816663 100644 --- a/operator/resourcepool.py +++ b/operator/resourcepool.py @@ -1,15 +1,13 @@ import asyncio - from datetime import timedelta from typing import List, Mapping, TypeVar -from uuid import UUID import kopf +import kubernetes_asyncio import pytimeparse - import resourcehandle import resourceprovider - +from cache import Cache, CacheTag from kopfobject import KopfObject from poolboy import Poolboy @@ -23,13 +21,12 @@ class ResourcePool(KopfObject): kind = "ResourcePool" plural = "resourcepools" - instances = {} class_lock = asyncio.Lock() @classmethod async def get(cls, name: str) -> ResourcePoolT: async with cls.class_lock: - return cls.instances.get(name) + return cls.cache_get(CacheTag.POOL, name) @classmethod async def register( @@ -44,7 +41,7 @@ async def register( uid: str, ) -> ResourcePoolT: async with cls.class_lock: - resource_pool = cls.instances.get(name) + resource_pool = cls.cache_get(CacheTag.POOL, name) if resource_pool: resource_pool.refresh( annotations = annotations, @@ -65,13 +62,16 @@ async def register( status = status, uid = uid, ) - resource_pool.__register() + resource_pool.cache_set(CacheTag.POOL, name, ttl=300) return resource_pool @classmethod async def unregister(cls, name: str) -> ResourcePoolT|None: async with cls.class_lock: - return cls.instances.pop(name, None) + resource_pool = cls.cache_get(CacheTag.POOL, name) + if resource_pool: + Cache.delete(CacheTag.POOL, name) + return resource_pool @property def delete_unhealthy_resource_handles(self) -> bool: @@ -127,11 +127,6 @@ def max_unready(self) -> int|None: def min_available(self) -> int: return self.spec.get('minAvailable', 0) - @property - def resource_handler_idx(self) -> int: - """Label value used to select which resource handler pod should manage this ResourcePool.""" - return int(UUID(self.uid)) % Poolboy.resource_handler_count - @property def resource_provider_name(self) -> str|None: return self.spec.get('provider', {}).get('name') @@ -145,38 +140,10 @@ def vars(self) -> 
Mapping: return self.spec.get('vars', {}) def __register(self) -> None: - self.instances[self.name] = self + self.cache_set(CacheTag.POOL, self.name, ttl=300) def __unregister(self) -> None: - self.instances.pop(self.name, None) - - async def assign_resource_handler(self): - """Apply label to indicate resource handler should manage this ResourcePool. - Do not change label on items which are deleting.""" - if ( - self.deletion_timestamp is None and - self.labels.get(Poolboy.resource_handler_idx_label) != str(self.resource_handler_idx) - ): - try: - patch = [{ - "op": "test", - "path": "/metadata/deletionTimestamp", - "value": None, - }] - patch.append({ - "op": "add", - "path": f"/metadata/labels/{Poolboy.resource_handler_idx_label.replace('/', '~1')}", - "value": str(self.resource_handler_idx), - } if self.labels else { - "op": "add", - "path": f"/metadata/labels", - "value": { - Poolboy.resource_handler_idx_label: str(self.resource_handler_idx), - } - }) - await self.json_patch(patch) - except kubernetes_asyncio.client.exceptions.ApiException as exception: - pass + Cache.delete(CacheTag.POOL, self.name) async def get_resource_provider(self) -> ResourceProviderT: """Return ResourceProvider configured to manage ResourceHandle.""" @@ -214,14 +181,14 @@ async def manage(self, logger: kopf.ObjectLogger): resource_handle_deficit = self.max_unready - unready_count if resource_handle_deficit > 0: - for i in range(resource_handle_deficit): - resource_handle = await resourcehandle.ResourceHandle.create_for_pool( - logger=logger, - resource_pool=self - ) - resource_handles_for_status.append({ - "name": resource_handle.name, - }) + for i in range(resource_handle_deficit): + resource_handle = await resourcehandle.ResourceHandle.create_for_pool( + logger=logger, + resource_pool=self + ) + resource_handles_for_status.append({ + "name": resource_handle.name, + }) patch = [] if not self.status: @@ -251,3 +218,21 @@ async def manage(self, logger: kopf.ObjectLogger): if patch: await self.json_patch_status(patch) + + async def refetch(self) -> ResourcePoolT | None: + """Fetch updated object from K8s API.""" + try: + definition = await Poolboy.custom_objects_api.get_namespaced_custom_object( + Poolboy.operator_domain, + Poolboy.operator_version, + Poolboy.namespace, + 'resourcepools', + self.name, + ) + self.refresh_from_definition(definition) + return self + except kubernetes_asyncio.client.exceptions.ApiException as e: + if e.status == 404: + await self.unregister(name=self.name) + return None + raise diff --git a/operator/resourceprovider.py b/operator/resourceprovider.py index e8cdd83..d4e4bd3 100644 --- a/operator/resourceprovider.py +++ b/operator/resourceprovider.py @@ -2,13 +2,14 @@ import re from copy import deepcopy from datetime import timedelta -from typing import List, Mapping, TypeVar +from typing import List, Mapping, Optional, TypeVar import jinja2 import jsonpointer import kopf import poolboy_k8s import pytimeparse +from cache import Cache, CacheTag from deep_merge import deep_merge from jsonpatch_from_diff import jsonpatch_from_diff from metrics.timer_decorator import TimerDecoratorMeta @@ -144,26 +145,41 @@ class _ValidationException(Exception): class ResourceProvider(metaclass=TimerDecoratorMeta): - instances = {} lock = asyncio.Lock() + @classmethod + def __cache_get(cls, name: str) -> Optional[ResourceProviderT]: + """Get ResourceProvider from cache.""" + cached = Cache.get(CacheTag.PROVIDER, name) + if cached is None: + return None + if isinstance(cached, cls): + return cached + # 
RedisBackend returns dict, reconstruct + return cls(definition=cached) + + def __cache_set(self, ttl: int = 300) -> None: + """Store ResourceProvider in cache.""" + # Store the definition dict for Redis compatibility + Cache.set(CacheTag.PROVIDER, self.name, self.definition, ttl) + @classmethod def __register_definition(cls, definition: Mapping) -> ResourceProviderT: name = definition['metadata']['name'] - resource_provider = cls.instances.get(name) + resource_provider = cls.__cache_get(name) if resource_provider: - resource_provider.definition = definition - self.__init_resource_template_validator() + resource_provider.__init__(definition=definition) else: resource_provider = cls(definition=definition) - cls.instances[name] = resource_provider + resource_provider.__cache_set(ttl=300) return resource_provider @classmethod def find_provider_by_template_match(cls, template: Mapping) -> ResourceProviderT: provider_matches = [] - for provider in cls.instances.values(): - if provider.is_match_for_template(template): + for name in Cache.get_keys_by_tag(CacheTag.PROVIDER): + provider = cls.__cache_get(name) + if provider and provider.is_match_for_template(template): provider_matches.append(provider) if len(provider_matches) == 0: raise kopf.TemporaryError("Unable to match template to ResourceProvider", delay=60) @@ -175,7 +191,7 @@ def find_provider_by_template_match(cls, template: Mapping) -> ResourceProviderT @classmethod async def get(cls, name: str) -> ResourceProviderT: async with cls.lock: - resource_provider = cls.instances.get(name) + resource_provider = cls.__cache_get(name) if resource_provider: return resource_provider definition = await Poolboy.custom_objects_api.get_namespaced_custom_object( @@ -210,9 +226,10 @@ async def preload(cls, logger: kopf.ObjectLogger) -> None: async def register(cls, definition: Mapping, logger: kopf.ObjectLogger) -> ResourceProviderT: async with cls.lock: name = definition['metadata']['name'] - resource_provider = cls.instances.get(name) + resource_provider = cls.__cache_get(name) if resource_provider: resource_provider.__init__(definition=definition) + resource_provider.__cache_set(ttl=300) logger.debug(f"Refreshed definition of ResourceProvider {name}") else: resource_provider = cls.__register_definition(definition=definition) @@ -222,11 +239,14 @@ async def register(cls, definition: Mapping, logger: kopf.ObjectLogger) -> Resou @classmethod async def unregister(cls, name: str, logger: kopf.ObjectLogger) -> ResourceProviderT|None: async with cls.lock: - if name in cls.instances: + resource_provider = cls.__cache_get(name) + if resource_provider: + Cache.delete(CacheTag.PROVIDER, name) logger.debug(f"Unregistered ResourceProvider {name}") - return cls.instances.pop(name) + return resource_provider def __init__(self, definition: Mapping) -> None: + self._definition = definition self.meta = definition['metadata'] self.spec = definition['spec'] self.__init_resource_template_validator() @@ -241,6 +261,11 @@ def __init_resource_template_validator(self) -> None: def __str__(self) -> str: return f"ResourceProvider {self.name}" + @property + def definition(self) -> Mapping: + """Return the full resource definition for cache serialization.""" + return self._definition + @property def approval_pending_message(self) -> bool: return self.spec.get('approval', {}).get('pendingMessage', 'Approval pending.') diff --git a/operator/resourcewatch.py b/operator/resourcewatch.py index 13be779..edbbf0b 100644 --- a/operator/resourcewatch.py +++ b/operator/resourcewatch.py @@ 
-1,22 +1,18 @@ import asyncio -import inflection -import kopf -import kubernetes_asyncio import logging - -from copy import deepcopy -from datetime import datetime, timezone -from typing import Mapping, TypeVar - from base64 import urlsafe_b64encode +from datetime import datetime, timezone from hashlib import sha256 +from typing import Mapping, TypeVar +import inflection +import kopf +import kubernetes_asyncio import poolboy_k8s - +import resourcehandle +from cache import Cache, CacheTag from kopfobject import KopfObject from poolboy import Poolboy -import resourcehandle -import resourceprovider logger = logging.getLogger('resource_watch') @@ -35,18 +31,8 @@ class ResourceWatch(KopfObject): kind = "ResourceWatch" plural = "resourcewatches" - instances = {} class_lock = asyncio.Lock() - class CacheEntry: - def __init__(self, resource: Mapping): - self.resource = resource - self.cache_datetime = datetime.now(timezone.utc) - - @property - def is_expired(self): - return (datetime.now(timezone.utc) - self.cache_datetime).total_seconds() > Poolboy.resource_refresh_interval - @classmethod def __instance_key(cls, api_version: str, kind: str, namespace: str|None) -> str: """Return cache key used to identify ResourceWatch in instances dict""" @@ -71,14 +57,13 @@ def __get_instance(cls, kind: str, namespace: str|None, ): - """Return ResourceWatch from instances dict.""" - return cls.instances.get( - cls.__instance_key( - api_version=api_version, - kind=kind, - namespace=namespace - ) + """Return ResourceWatch from cache.""" + instance_key = cls.__instance_key( + api_version=api_version, + kind=kind, + namespace=namespace ) + return cls.cache_get(CacheTag.WATCH, instance_key) @classmethod def __register_definition(cls, definition: Mapping) -> ResourceWatchT: @@ -284,8 +269,9 @@ async def stop_all(cls) -> None: """Stop all ResourceWatch tasks""" async with cls.class_lock: tasks = [] - for resource_watch in cls.instances.values(): - if resource_watch.task is not None: + for instance_key in Cache.get_keys_by_tag(CacheTag.WATCH): + resource_watch = cls.cache_get(CacheTag.WATCH, instance_key) + if resource_watch and resource_watch.task is not None: resource_watch.task.cancel() tasks.append(resource_watch.task) if tasks: @@ -311,16 +297,14 @@ def __init__(self, status=status, uid=uid, ) - # Cache to store fetched resources - self.cache = {} # Task for when watch is running self.task = None def __register(self) -> None: """ - Add ResourceWatch to register of instances. + Add ResourceWatch to cache. 
""" - self.instances[self.__self_instance_key] = self + self.cache_set(CacheTag.WATCH, self.__self_instance_key(), ttl=300) def __str__(self) -> str: return ( @@ -331,9 +315,9 @@ def __str__(self) -> str: def __self_instance_key(self) -> str: return self.__instance_key( - api_version=self.api_version, - kind=self.kind, - namespace=self.namespace, + api_version=self.watch_api_version, + kind=self.watch_kind, + namespace=self.watch_namespace, ) @property @@ -352,99 +336,20 @@ def watch_kind(self) -> str: def watch_namespace(self) -> str|None: return self.spec.get('namespace') - def cache_clean(self): - self.cache = { - name: cache_entry - for name, cache_entry in self.cache.items() - if not cache_entry.is_expired - } - - async def create_pod(self, - logger: kopf.ObjectLogger, - ) -> None: - replicaset = kubernetes_asyncio.client.V1ReplicaSet( - api_version="apps/v1", - kind="ReplicaSet", - metadata=kubernetes_asyncio.client.V1ObjectMeta( - name=f"{Poolboy.manager_pod.metadata.name}-watch-{self.name_hash}", - namespace=Poolboy.namespace, - owner_references=[ - kubernetes_asyncio.client.V1OwnerReference( - api_version=Poolboy.manager_pod.api_version, - controller=True, - kind=Poolboy.manager_pod.kind, - name=Poolboy.manager_pod.metadata.name, - uid=Poolboy.manager_pod.metadata.uid, - ) - ] - ), - ) - replicaset.spec = kubernetes_asyncio.client.V1ReplicaSetSpec( - replicas=1, - selector=kubernetes_asyncio.client.V1LabelSelector( - match_labels={ - "app.kubernetes.io/name": Poolboy.manager_pod.metadata.name, - "app.kubernetes.io/instance": f"watch-{self.name_hash}", - }, - ), - template=kubernetes_asyncio.client.V1PodTemplateSpec( - metadata=kubernetes_asyncio.client.V1ObjectMeta( - labels={ - "app.kubernetes.io/name": Poolboy.manager_pod.metadata.name, - "app.kubernetes.io/instance": f"watch-{self.name_hash}", - }, - ), - spec=deepcopy(Poolboy.manager_pod.spec), - ), - ) - - replicaset.spec.template.spec.containers[0].env = [ - env_var - for env_var in Poolboy.manager_pod.spec.containers[0].env - if env_var.name not in { - 'OPERATOR_MODE', - 'RESOURCE_HANDLER_COUNT', - 'RESOURCE_HANDLER_RESOURCES', - 'RESOURCE_WATCH_RESOURCES', - } - ] - replicaset.spec.template.spec.containers[0].env.append( - kubernetes_asyncio.client.V1EnvVar( - name='OPERATOR_MODE', - value='resource-watch', - ) - ) - replicaset.spec.template.spec.containers[0].env.append( - kubernetes_asyncio.client.V1EnvVar( - name='WATCH_NAME', - value=self.name, - ) - ) - replicaset.spec.template.spec.node_name = None - if Poolboy.resource_watch_resources: - replicaset.spec.template.spec.containers[0].resources = kubernetes_asyncio.client.V1ResourceRequirements( - limits=Poolboy.resource_watch_resources.get('limits'), - requests=Poolboy.resource_watch_resources.get('requests'), - ) - - replicaset = await Poolboy.apps_v1_api.create_namespaced_replica_set( - namespace=Poolboy.namespace, - body=replicaset, - ) - logger.info(f"Created ReplicaSet {replicaset.metadata.name} for {self}") + def __resource_cache_key(self, name: str) -> str: + """Build unique cache key for a watched resource.""" + return f"{self.name}:{name}" async def get_resource(self, name: str, not_found_okay: bool=False, use_cache: bool=True, ) -> Mapping|None: + resource_cache_key = self.__resource_cache_key(name) if use_cache: - cache_entry = self.cache.get(name) - if cache_entry: - if cache_entry.is_expired: - self.cache.pop(name, None) - else: - return cache_entry.resource + cached = Cache.get(CacheTag.WATCH_RESOURCE, resource_cache_key) + if cached: + return cached try: 
resource = await poolboy_k8s.get_object( api_version=self.watch_api_version, @@ -458,7 +363,7 @@ async def get_resource(self, else: raise if use_cache and resource: - self.cache[name] = ResourceWatch.CacheEntry(resource) + Cache.set(CacheTag.WATCH_RESOURCE, resource_cache_key, resource, ttl=Poolboy.resource_refresh_interval) return resource async def start(self, logger) -> None: @@ -504,7 +409,7 @@ async def watch(self): watch_duration = (datetime.now(timezone.utc) - watch_start_dt).total_seconds() if watch_duration < 60: await asyncio.sleep(60 - watch_duration) - except Exception as e: + except Exception: logger.exception(f"{self} exception") watch_duration = (datetime.now(timezone.utc) - watch_start_dt).total_seconds() if watch_duration < 60: @@ -516,7 +421,6 @@ async def watch(self): async def __watch(self, method, **kwargs): watch = None - self.cache_clean() try: watch = kubernetes_asyncio.watch.Watch() async for event in watch.stream(method, **kwargs): @@ -568,15 +472,16 @@ async def __watch_event(self, event_type, event_obj): if not resource_handle_name: return + resource_cache_key = self.__resource_cache_key(resource_name) if event_type == 'DELETED': - self.cache.pop(resource_name, None) + Cache.delete(CacheTag.WATCH_RESOURCE, resource_cache_key) else: - self.cache[resource_name] = ResourceWatch.CacheEntry(event_obj) + Cache.set(CacheTag.WATCH_RESOURCE, resource_cache_key, event_obj, ttl=Poolboy.resource_refresh_interval) try: resource_handle = await resourcehandle.ResourceHandle.get( name=resource_handle_name, - use_cache=Poolboy.operator_mode_all_in_one, + use_cache=Poolboy.operator_mode_standalone, ) except kubernetes_asyncio.client.exceptions.ApiException as exception: if exception.status == 404: @@ -636,7 +541,7 @@ async def __watch_event(self, event_type, event_obj): resource_claim=resource_claim, resource_states=resource_states, ) - except kubernetes_asyncio.client.exceptions.ApiException as exception: + except kubernetes_asyncio.client.exceptions.ApiException: logger.exception( f"Failed updating status on {resource_handle} from event on {resource_description}" ) diff --git a/operator/scheduler/__init__.py b/operator/scheduler/__init__.py new file mode 100644 index 0000000..2dbf4c2 --- /dev/null +++ b/operator/scheduler/__init__.py @@ -0,0 +1,3 @@ +""" +Scheduler module for Celery Beat configuration. +""" diff --git a/operator/scheduler/config_loader.py b/operator/scheduler/config_loader.py new file mode 100644 index 0000000..546fe94 --- /dev/null +++ b/operator/scheduler/config_loader.py @@ -0,0 +1,50 @@ +""" +Configuration loader for schedule overrides from ConfigMap. +""" + +import os +from typing import Any, Dict + +import yaml + + +def load_schedule_config() -> Dict[str, Any]: + """ + Load schedule configuration from YAML file. + + Returns: + Dictionary with schedule configuration. + """ + config_path = os.environ.get( + 'CELERY_SCHEDULE_CONFIG', + '/etc/poolboy/schedule_config.yaml' + ) + if os.path.exists(config_path): + try: + with open(config_path, 'r') as f: + return yaml.safe_load(f) or {} + except yaml.YAMLError as e: + print(f"Error parsing schedule config YAML: {e}") + return {} + return {} + + +def deep_merge(source: Dict, destination: Dict) -> Dict: + """ + Recursively merge two dictionaries. 
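+    Values from source take precedence on conflicts. Illustrative example:
+
+        >>> deep_merge({'a': {'b': 2}}, {'a': {'b': 1, 'c': 3}})
+        {'a': {'b': 2, 'c': 3}}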
+ + Args: + source: Source dictionary to merge from + destination: Destination dictionary to merge into + + Returns: + Merged dictionary + """ + result = destination.copy() + for key, value in source.items(): + if isinstance(value, dict) and key in result and isinstance(result[key], dict): + result[key] = deep_merge(value, result[key]) + else: + result[key] = value + return result + diff --git a/operator/scheduler/registry.py b/operator/scheduler/registry.py new file mode 100644 index 0000000..31db03f --- /dev/null +++ b/operator/scheduler/registry.py @@ -0,0 +1,103 @@ +""" +Beat Registry for declarative periodic task definition. +""" + +import logging +from collections.abc import Callable +from dataclasses import dataclass, field +from typing import ClassVar, Optional + +from metrics import TimerDecoratorMeta + +logger = logging.getLogger(__name__) + +CRON_FIELDS_COUNT = 5 + + +@dataclass +class ScheduledTask: + """Represents a scheduled periodic task.""" + task_name: str + task_func: Callable + description: str + owner: str + cron: Optional[str] = None + seconds: Optional[int] = None + tags: list[str] = field(default_factory=list) + enabled: bool = False + + +class BeatRegistry(metaclass=TimerDecoratorMeta): + """Registry for periodic tasks.""" + _tasks: ClassVar[dict[str, ScheduledTask]] = {} + + @classmethod + def register( + cls, + task_name: str, + description: str, + owner: str, + cron: Optional[str] = None, + seconds: Optional[int] = None, + tags: Optional[list[str]] = None, + enabled: bool = False, + ): + """Decorator to register a periodic task.""" + def decorator(func: Callable) -> Callable: + scheduled_task = ScheduledTask( + task_name=task_name, + task_func=func, + cron=cron, + seconds=seconds, + description=description, + owner=owner, + tags=tags or [], + enabled=enabled, + ) + cls._tasks[task_name] = scheduled_task + logger.info(f"Registered periodic task: {task_name}") + return func + return decorator + + @classmethod + def get_task(cls, task_name: str) -> Optional[ScheduledTask]: + """Get a registered task by name.""" + return cls._tasks.get(task_name) + + @classmethod + def list_all(cls) -> dict[str, ScheduledTask]: + """List all registered tasks.""" + return cls._tasks.copy() + + @classmethod + def validate_registry(cls): + """Validate all registered tasks.""" + errors = [] + for name, task in cls._tasks.items(): + has_cron = task.cron is not None + has_seconds = task.seconds is not None + if not has_cron and not has_seconds: + errors.append(f"{name}: must have cron or seconds") + if has_cron and has_seconds: + errors.append(f"{name}: cannot have both cron and seconds") + if has_cron and not cls._is_valid_cron(task.cron): + errors.append(f"{name}: invalid cron '{task.cron}'") + if has_seconds and task.seconds <= 0: + errors.append(f"{name}: seconds must be positive") + if not task.description: + errors.append(f"{name}: missing description") + if not task.owner: + errors.append(f"{name}: missing owner") + if errors: + msg = f"Registry validation failed: {'; '.join(errors)}" + raise ValueError(msg) + logger.info(f"Registry validated: {len(cls._tasks)} tasks registered") + + @staticmethod + def _is_valid_cron(cron_expr: str) -> bool: + """Check if cron expression has valid number of fields.""" + parts = cron_expr.strip().split() + return len(parts) == CRON_FIELDS_COUNT + + +register_schedule = BeatRegistry.register diff --git a/operator/scheduler/scheduler.py b/operator/scheduler/scheduler.py new file mode 100644 index 0000000..ab1c961 --- /dev/null +++ 
b/operator/scheduler/scheduler.py @@ -0,0 +1,91 @@ +""" +Beat scheduler configuration builder. +""" + +from datetime import timedelta +from typing import Any + +from celery.schedules import crontab +from metrics import TimerDecoratorMeta + +from .config_loader import deep_merge, load_schedule_config +from .registry import CRON_FIELDS_COUNT, BeatRegistry + + +class BeatScheduler(metaclass=TimerDecoratorMeta): + """Manages beat schedule configuration from ConfigMap.""" + + def __init__(self): + raw_config = load_schedule_config() + self.schedules = raw_config.get('schedules', {}) + + def _build_task_config(self, task_name: str, registry_task) -> dict[str, Any]: + """Merge registry defaults with ConfigMap overrides.""" + if registry_task.seconds is not None: + default_schedule = {"seconds": registry_task.seconds} + else: + default_schedule = {"cron": registry_task.cron} + + config = { + "enabled": registry_task.enabled, + "schedule": default_schedule, + "options": {}, + } + + if task_name in self.schedules: + config = deep_merge(self.schedules[task_name], config) + + return config + + def _parse_cron(self, cron_str: str) -> crontab: + """Parse cron string to celery crontab object.""" + parts = cron_str.strip().split() + if len(parts) != CRON_FIELDS_COUNT: + raise ValueError(f"Invalid cron expression: {cron_str}") + + return crontab( + minute=parts[0], + hour=parts[1], + day_of_month=parts[2], + month_of_year=parts[3], + day_of_week=parts[4], + ) + + def _parse_schedule(self, schedule_config: dict) -> crontab | timedelta: + """Parse schedule config to celery schedule object.""" + if "seconds" in schedule_config: + return timedelta(seconds=schedule_config["seconds"]) + elif "cron" in schedule_config: + return self._parse_cron(schedule_config["cron"]) + else: + raise ValueError(f"Invalid schedule config: {schedule_config}") + + def build_schedule(self) -> dict[str, dict]: + """Build Celery beat_schedule from registry and ConfigMap.""" + BeatRegistry.validate_registry() + + beat_schedule = {} + + for task_name, registry_task in BeatRegistry.list_all().items(): + config = self._build_task_config(task_name, registry_task) + + if not config.get("enabled", False): + continue + + schedule_entry = { + "task": registry_task.task_func.name, + "schedule": self._parse_schedule(config["schedule"]), + } + + if "options" in config: + schedule_entry["options"] = config["options"] + + beat_schedule[task_name] = schedule_entry + + return beat_schedule + + +def setup_beat_schedule() -> dict[str, dict]: + """Setup function called from processor/app.py when scheduler is enabled.""" + scheduler = BeatScheduler() + return scheduler.build_schedule() diff --git a/operator/tasks/__init__.py b/operator/tasks/__init__.py new file mode 100644 index 0000000..f2d66d2 --- /dev/null +++ b/operator/tasks/__init__.py @@ -0,0 +1,35 @@ +""" +Celery tasks for Poolboy resource management. + +Task modules follow the same naming convention as the main Poolboy modules: +- resourcepool.py +- resourceclaim.py +- resourcehandle.py +- resourceprovider.py +- resourcewatch.py +- cleanup.py +""" + +from . import resourcepool + +# Placeholder imports for other task types (to be implemented in future phases) +try: + from . import resourceclaim +except ImportError: + pass +try: + from . import resourcehandle +except ImportError: + pass +try: + from . import resourceprovider +except ImportError: + pass +try: + from . import resourcewatch +except ImportError: + pass +try: + from . 
import cleanup
+except ImportError:
+    pass
diff --git a/operator/tasks/resourceclaim.py b/operator/tasks/resourceclaim.py
new file mode 100644
index 0000000..297e93e
--- /dev/null
+++ b/operator/tasks/resourceclaim.py
@@ -0,0 +1,207 @@
+"""Celery tasks for ResourceClaim management."""
+
+from celery.utils.log import get_task_logger
+from distributed_lock import distributed_lock
+from poolboy import Poolboy
+from processor.app import WorkerState, app
+from scheduler.registry import register_schedule
+
+logger = get_task_logger(__name__)
+
+BATCH_SIZE = 20  # claims per batch - distributes across workers
+
+
+def _is_transient_exception(exc: Exception) -> bool:
+    """Check if exception is transient (expected retry scenario)."""
+    import kubernetes_asyncio
+
+    if isinstance(exc, kubernetes_asyncio.client.exceptions.ApiException):
+        return True
+
+    exc_class_name = type(exc).__name__
+    exc_module = type(exc).__module__
+    if exc_class_name == 'TemporaryError' and 'kopf' in exc_module:
+        return True
+
+    return False
+
+
+def _log_and_retry(task, name: str, namespace: str, exc: Exception, action: str):
+    """Log exception appropriately and retry the task."""
+    countdown = Poolboy.workers_error_retry_countdown
+
+    if _is_transient_exception(exc):
+        logger.warning(f"Claim {namespace}/{name} {action} error: {exc}")
+    else:
+        logger.error(f"Claim {namespace}/{name} {action} error: {exc}", exc_info=True)
+
+    raise task.retry(exc=exc, countdown=countdown, max_retries=5)
+
+
+async def _collect_claims_to_process() -> list:
+    """Collect all ResourceClaims to reconcile, skipping any marked with the ignore label."""
+    claims_to_process = []
+    _continue = None
+
+    while True:
+        # Note: Using cluster-wide listing since claims exist in user namespaces
+        claim_list = await Poolboy.custom_objects_api.list_cluster_custom_object(
+            group=Poolboy.operator_domain,
+            plural='resourceclaims',
+            version=Poolboy.operator_version,
+            _continue=_continue,
+            limit=50,
+        )
+
+        for item in claim_list.get('items', []):
+            # Skip ignored claims
+            if Poolboy.ignore_label in item['metadata'].get('labels', {}):
+                continue
+
+            claims_to_process.append(item)
+
+        _continue = claim_list['metadata'].get('continue')
+        if not _continue:
+            break
+
+    return claims_to_process
+
+
+async def _delete_claim(definition: dict) -> dict:
+    """Async wrapper for ResourceClaim.handle_delete().
+
+    Note: We do NOT refetch for delete operations. The claim may already
+    be deleted from K8s, but we still need to propagate the delete to
+    the ResourceHandle using the original definition.
+ """ + import resourceclaim + claim = resourceclaim.ResourceClaim.from_definition(definition) + await claim.handle_delete(logger=logger) + return {"status": "completed", "claim": claim.name, "namespace": claim.namespace} + + +def _dispatch_batch(claims: list) -> int: + """Dispatch a batch of claims as individual tasks.""" + dispatched = 0 + for item in claims: + uid = item['metadata']['uid'] + rv = item['metadata']['resourceVersion'] + kwargs = { + 'definition': item, + 'name': item['metadata']['name'], + 'namespace': item['metadata']['namespace'], + } + manage_claim.apply_async(kwargs=kwargs, task_id=f"claim-{uid}-{rv}") + dispatched += 1 + return dispatched + + +async def _manage_claim(definition: dict) -> dict: + """Async wrapper for ResourceClaim.manage().""" + import resourceclaim + claim = resourceclaim.ResourceClaim.from_definition(definition) + # Refetch to get current state from K8s API (avoid stale data) + claim = await claim.refetch() + if not claim: + # Claim was deleted between dispatch and execution + return {"status": "skipped", "reason": "not_found", "claim": definition['metadata']['name']} + await claim.manage(logger=logger) + return {"status": "completed", "claim": claim.name, "namespace": claim.namespace} + + +@app.task(bind=True, acks_late=True) +def delete_claim(self, definition: dict, name: str, namespace: str): + """Execute ResourceClaim.handle_delete() in a worker.""" + uid = definition['metadata']['uid'] + lock_key = f"resource_claim:{uid}" + + with distributed_lock(lock_key, timeout=60) as acquired: + if not acquired: + logger.debug(f"Claim {namespace}/{name} locked, retrying") + countdown = Poolboy.workers_lock_retry_countdown + raise self.retry(countdown=countdown, max_retries=None) + + try: + return WorkerState.run_async(_delete_claim(definition)) + except Exception as e: + _log_and_retry(self, name, namespace, e, "delete") + + +def dispatch_delete_claim(definition: dict, name: str, namespace: str): + """Dispatch delete_claim task with unique task_id.""" + uid = definition['metadata']['uid'] + rv = definition['metadata']['resourceVersion'] + kwargs = {'definition': definition, 'name': name, 'namespace': namespace} + delete_claim.apply_async( + kwargs=kwargs, + task_id=f"claim-delete-{uid}-{rv}", + ) + + +def dispatch_manage_claim(definition: dict, name: str, namespace: str): + """Dispatch manage_claim task. 
Always dispatches for operator events.""" + uid = definition['metadata']['uid'] + rv = definition['metadata']['resourceVersion'] + kwargs = {'definition': definition, 'name': name, 'namespace': namespace} + manage_claim.apply_async( + kwargs=kwargs, + task_id=f"claim-{uid}-{rv}", + ) + + +@app.task(bind=True, acks_late=True) +def manage_claim(self, definition: dict, name: str, namespace: str): + """Execute ResourceClaim.manage() in a worker.""" + uid = definition['metadata']['uid'] + lock_key = f"resource_claim:{uid}" + + with distributed_lock(lock_key, timeout=60) as acquired: + if not acquired: + logger.debug(f"Claim {namespace}/{name} locked, retrying") + countdown = Poolboy.workers_lock_retry_countdown + raise self.retry(countdown=countdown, max_retries=None) + + try: + return WorkerState.run_async(_manage_claim(definition)) + except Exception as e: + _log_and_retry(self, name, namespace, e, "manage") + + +@register_schedule( + task_name="maintain-all-claims", + seconds=60, + description="Periodic task to reconcile all ResourceClaims", + owner="poolboy", +) +@app.task(name="tasks.resourceclaim.maintain_all_claims") +def maintain_all_claims(): + """Periodic task for Celery Beat - reconcile all claims using group for distribution.""" + from celery import group + + lock_key = "maintain_all_claims:global" + + with distributed_lock(lock_key, timeout=300) as acquired: + if not acquired: + return {"status": "skipped", "reason": "already_running"} + + # Collect all claims that need processing + claims = WorkerState.run_async(_collect_claims_to_process()) + + if not claims: + return {"status": "completed", "total": 0, "batches": 0} + + # Split into batches and dispatch using group (distributes across workers) + batches = [claims[i:i + BATCH_SIZE] for i in range(0, len(claims), BATCH_SIZE)] + + # Create group of batch tasks - Celery will distribute across available workers + batch_group = group(process_claim_batch.s(batch) for batch in batches) + batch_group.apply_async() + + logger.info(f"Claim maintenance: {len(claims)} claims in {len(batches)} batches") + return {"status": "dispatched", "total": len(claims), "batches": len(batches)} + + +@app.task(bind=True) +def process_claim_batch(self, claims: list): + """Process a batch of claims. Each batch runs on a different worker.""" + return _dispatch_batch(claims) diff --git a/operator/tasks/resourcehandle.py b/operator/tasks/resourcehandle.py new file mode 100644 index 0000000..d448f4c --- /dev/null +++ b/operator/tasks/resourcehandle.py @@ -0,0 +1,213 @@ +"""Celery tasks for ResourceHandle management.""" + +from celery.utils.log import get_task_logger +from distributed_lock import distributed_lock +from poolboy import Poolboy +from processor.app import WorkerState, app +from scheduler.registry import register_schedule + +logger = get_task_logger(__name__) + +BATCH_SIZE = 20 # handles per batch - distributes across workers + + +def _is_transient_exception(exc: Exception) -> bool: + """Check if exception is transient (expected retry scenario). 
+ """ + import kubernetes_asyncio + + # Check ApiException first (already imported) + if isinstance(exc, kubernetes_asyncio.client.exceptions.ApiException): + return True + + # Check kopf.TemporaryError by class name to avoid importing kopf + # This works because resourcehandle.py raises kopf.TemporaryError + exc_class_name = type(exc).__name__ + exc_module = type(exc).__module__ + if exc_class_name == 'TemporaryError' and 'kopf' in exc_module: + return True + + return False + + +def _log_and_retry(task, name: str, exc: Exception, action: str): + """Log exception appropriately and retry the task.""" + countdown = Poolboy.workers_error_retry_countdown + + if _is_transient_exception(exc): + # Expected transient errors - warning only, no traceback + logger.warning(f"Handle {name} {action} error: {exc}") + else: + # Unexpected error - log with traceback for debugging + logger.error(f"Handle {name} {action} error: {exc}", exc_info=True) + + raise task.retry(exc=exc, countdown=countdown, max_retries=5) + + +async def _collect_handles_to_process() -> list: + """Collect all handles that need processing (not recently processed).""" + handles_to_process = [] + _continue = None + + while True: + handle_list = await Poolboy.custom_objects_api.list_namespaced_custom_object( + group=Poolboy.operator_domain, + namespace=Poolboy.namespace, + plural='resourcehandles', + version=Poolboy.operator_version, + _continue=_continue, + limit=50, + ) + + for item in handle_list.get('items', []): + # Skip ignored handles + if Poolboy.ignore_label in item['metadata'].get('labels', {}): + continue + + handles_to_process.append(item) + + _continue = handle_list['metadata'].get('continue') + if not _continue: + break + + return handles_to_process + + +async def _delete_handle(definition: dict) -> dict: + """Async wrapper for ResourceHandle.handle_delete(). + + Note: We do NOT refetch for delete operations. The handle may already + be deleted from K8s, but we still need to propagate the delete to + child resources (ResourceClaimTest, etc.) using the original definition. 
+ """ + import resourcehandle + handle = resourcehandle.ResourceHandle.from_definition(definition) + await handle.handle_delete(logger=logger) + return {"status": "completed", "handle": handle.name} + + +def _dispatch_batch(handles: list) -> int: + """Dispatch a batch of handles as individual tasks.""" + dispatched = 0 + for item in handles: + uid = item['metadata']['uid'] + rv = item['metadata']['resourceVersion'] + kwargs = { + 'definition': item, + 'name': item['metadata']['name'], + 'namespace': item['metadata']['namespace'], + } + manage_handle.apply_async(kwargs=kwargs, task_id=f"handle-{uid}-{rv}") + dispatched += 1 + return dispatched + + +async def _manage_handle(definition: dict) -> dict: + """Async wrapper for ResourceHandle.manage().""" + import resourcehandle + handle = resourcehandle.ResourceHandle.from_definition(definition) + # Refetch to get current state from K8s API (avoid stale data) + handle = await handle.refetch() + if not handle: + # Handle was deleted between dispatch and execution + return {"status": "skipped", "reason": "not_found", "handle": definition['metadata']['name']} + await handle.manage(logger=logger) + return {"status": "completed", "handle": handle.name} + + +@app.task(bind=True, acks_late=True) +def delete_handle(self, definition: dict, name: str, namespace: str): + """Execute ResourceHandle.handle_delete() in a worker.""" + uid = definition['metadata']['uid'] + lock_key = f"resource_handle:{uid}" + + with distributed_lock(lock_key, timeout=60) as acquired: + if not acquired: + logger.debug(f"Handle {name} locked, retrying") + countdown = Poolboy.workers_lock_retry_countdown + raise self.retry(countdown=countdown, max_retries=None) + + try: + return WorkerState.run_async(_delete_handle(definition)) + except Exception as e: + _log_and_retry(self, name, e, "delete") + + +def dispatch_delete_handle(definition: dict, name: str, namespace: str): + """Dispatch delete_handle task with unique task_id.""" + uid = definition['metadata']['uid'] + rv = definition['metadata']['resourceVersion'] + kwargs = {'definition': definition, 'name': name, 'namespace': namespace} + delete_handle.apply_async( + kwargs=kwargs, + task_id=f"handle-delete-{uid}-{rv}", + ) + + +def dispatch_manage_handle(definition: dict, name: str, namespace: str): + """Dispatch manage_handle task. 
Always dispatches for operator events.""" + uid = definition['metadata']['uid'] + rv = definition['metadata']['resourceVersion'] + kwargs = {'definition': definition, 'name': name, 'namespace': namespace} + manage_handle.apply_async( + kwargs=kwargs, + task_id=f"handle-{uid}-{rv}", + ) + + +@app.task(bind=True, acks_late=True) +def manage_handle(self, definition: dict, name: str, namespace: str): + """Execute ResourceHandle.manage() in a worker.""" + uid = definition['metadata']['uid'] + lock_key = f"resource_handle:{uid}" + + with distributed_lock(lock_key, timeout=60) as acquired: + if not acquired: + logger.debug(f"Handle {name} locked, retrying") + countdown = Poolboy.workers_lock_retry_countdown + raise self.retry(countdown=countdown, max_retries=None) + + try: + return WorkerState.run_async(_manage_handle(definition)) + except Exception as e: + _log_and_retry(self, name, e, "manage") + + +@register_schedule( + task_name="maintain-all-handles", + seconds=60, + description="Periodic task to reconcile all ResourceHandles", + owner="poolboy", +) +@app.task(name="tasks.resourcehandle.maintain_all_handles") +def maintain_all_handles(): + """Periodic task for Celery Beat - reconcile all handles using group for distribution.""" + from celery import group + + lock_key = "maintain_all_handles:global" + + with distributed_lock(lock_key, timeout=300) as acquired: + if not acquired: + return {"status": "skipped", "reason": "already_running"} + + # Collect all handles that need processing + handles = WorkerState.run_async(_collect_handles_to_process()) + + if not handles: + return {"status": "completed", "total": 0, "batches": 0} + + # Split into batches and dispatch using group (distributes across workers) + batches = [handles[i:i + BATCH_SIZE] for i in range(0, len(handles), BATCH_SIZE)] + + # Create group of batch tasks - Celery will distribute across available workers + batch_group = group(process_handle_batch.s(batch) for batch in batches) + batch_group.apply_async() + + logger.info(f"Handle maintenance: {len(handles)} handles in {len(batches)} batches") + return {"status": "dispatched", "total": len(handles), "batches": len(batches)} + + +@app.task(bind=True) +def process_handle_batch(self, handles: list): + """Process a batch of handles. 
Each batch runs on a different worker.""" + return _dispatch_batch(handles) diff --git a/operator/tasks/resourcepool.py b/operator/tasks/resourcepool.py new file mode 100644 index 0000000..9027150 --- /dev/null +++ b/operator/tasks/resourcepool.py @@ -0,0 +1,136 @@ +"""Celery tasks for ResourcePool management.""" + +from celery.utils.log import get_task_logger +from distributed_lock import distributed_lock +from processor.app import WorkerState, app +from scheduler.registry import register_schedule + +logger = get_task_logger(__name__) + + +async def _delete_pool_handles(definition: dict) -> dict: + """Async wrapper for ResourcePool.handle_delete().""" + import resourcepool + pool = resourcepool.ResourcePool.from_definition(definition) + await pool.handle_delete(logger=logger) + return {"status": "completed", "pool": pool.name} + + +async def _maintain_all_pools() -> dict: + """List all pools and dispatch manage_pool for each unprocessed.""" + from poolboy import Poolboy + + pool_list = await Poolboy.custom_objects_api.list_namespaced_custom_object( + group=Poolboy.operator_domain, + namespace=Poolboy.namespace, + plural='resourcepools', + version=Poolboy.operator_version, + ) + + dispatched = 0 + for item in pool_list.get('items', []): + uid = item['metadata']['uid'] + rv = item['metadata']['resourceVersion'] + kwargs = { + 'definition': item, + 'name': item['metadata']['name'], + 'namespace': item['metadata']['namespace'], + } + manage_pool.apply_async(kwargs=kwargs, task_id=f"pool-{uid}-{rv}") + dispatched += 1 + + return {"dispatched": dispatched} + + +async def _manage_pool(definition: dict) -> dict: + """Async wrapper for ResourcePool.manage().""" + import resourcepool + pool = resourcepool.ResourcePool.from_definition(definition) + await pool.manage(logger=logger) + return {"status": "completed", "pool": pool.name} + + +@app.task(bind=True, acks_late=True) +def delete_pool_handles(self, definition: dict, name: str, namespace: str): + """Execute ResourcePool.handle_delete() in a worker.""" + from poolboy import Poolboy + + uid = definition['metadata']['uid'] + lock_key = f"resource_pool:{uid}" + + with distributed_lock(lock_key, timeout=60) as acquired: + if not acquired: + logger.debug(f"Pool {namespace}/{name} locked, retrying") + countdown = Poolboy.workers_lock_retry_countdown + raise self.retry(countdown=countdown, max_retries=None) + + try: + return WorkerState.run_async(_delete_pool_handles(definition)) + except Exception as e: + logger.error(f"Pool {namespace}/{name} delete error: {e}") + raise self.retry(exc=e, countdown=Poolboy.workers_error_retry_countdown, max_retries=5) + + +def dispatch_delete_pool_handles(definition: dict, name: str, namespace: str): + """Dispatch delete_pool_handles task with unique task_id.""" + uid = definition['metadata']['uid'] + rv = definition['metadata']['resourceVersion'] + kwargs = {'definition': definition, 'name': name, 'namespace': namespace} + delete_pool_handles.apply_async( + kwargs=kwargs, + task_id=f"pool-delete-{uid}-{rv}", + ) + + +def dispatch_manage_pool(definition: dict, name: str, namespace: str): + """Dispatch manage_pool task. 
Always dispatches for operator events.""" + uid = definition['metadata']['uid'] + rv = definition['metadata']['resourceVersion'] + kwargs = {'definition': definition, 'name': name, 'namespace': namespace} + manage_pool.apply_async( + kwargs=kwargs, + task_id=f"pool-{uid}-{rv}", + ) + + +@register_schedule( + task_name="maintain-all-pools", + seconds=30, + description="Periodic task to reconcile all ResourcePools", + owner="poolboy", +) +@app.task(name="tasks.resourcepool.maintain_all_pools") +def maintain_all_pools(): + """Periodic task for Celery Beat - reconcile all pools.""" + lock_key = "maintain_all_pools:global" + + with distributed_lock(lock_key, timeout=300) as acquired: + if not acquired: + return {"status": "skipped", "reason": "already_running"} + + result = WorkerState.run_async(_maintain_all_pools()) + if result.get("dispatched", 0) > 0: + logger.info(f"Maintenance dispatched: {result['dispatched']}") + return result + + +@app.task(bind=True, acks_late=True) +def manage_pool(self, definition: dict, name: str, namespace: str): + """Execute ResourcePool.manage() in a worker.""" + from poolboy import Poolboy + + uid = definition['metadata']['uid'] + lock_key = f"resource_pool:{uid}" + + with distributed_lock(lock_key, timeout=60) as acquired: + if not acquired: + logger.debug(f"Pool {namespace}/{name} locked, retrying") + countdown = Poolboy.workers_lock_retry_countdown + raise self.retry(countdown=countdown, max_retries=None) + + try: + result = WorkerState.run_async(_manage_pool(definition)) + return result + except Exception as e: + logger.error(f"Pool {namespace}/{name} error: {e}") + raise self.retry(exc=e, countdown=Poolboy.workers_error_retry_countdown, max_retries=5) diff --git a/requirements.txt b/requirements.txt index 6b992ea..9cb36c4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,19 +1,21 @@ -aioprometheus==23.12.0 +celery>=5.3.0 inflection==0.5.1 Jinja2==3.1.5 jmespath-community==1.1.2 jsonpointer==2.2 jsonschema==3.2.0 +kombu>=5.3.0 openapi-schema-validator==0.1.5 prometheus-client==0.11.0 pyasn1==0.4.8 pyasn1-modules==0.2.8 pydantic==1.10.13 pyOpenSSL==20.0.1 -python-dateutil==2.8.1 +python-dateutil>=2.8.2 python-string-utils==1.0.0 pytimeparse==1.1.8 PyYAML==6.0.1 +redis>=4.5.0 requests==2.32.0 str2bool==1.1 StringGenerator==0.4.4 From 727d846c9ee3ce4636f979e39b3114fb00ba09dc Mon Sep 17 00:00:00 2001 From: Marcos Amorim Date: Fri, 2 Jan 2026 20:22:14 -0500 Subject: [PATCH 2/7] refactor: rename operator.py to main.py to fix stdlib conflict The 'operator' directory conflicts with Python's stdlib 'operator' module when Celery imports it. Renaming the entry point to main.py and using the KOPF_OPERATORS env var eliminates the need for the poolboy_worker.py workaround. 
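For context, a minimal sketch of the failure mode under the old layout (paths taken
from the deployment templates; which Celery import trips first may vary):

    # With the old operator/operator.py still present and the source directory
    # on sys.path (PYTHONPATH=/opt/app-root/operator), the local file shadows
    # the standard library's 'operator' module.
    import sys
    sys.path.insert(0, "/opt/app-root/operator")

    import operator                   # resolves to the local operator.py
    from operator import attrgetter   # fails: the shadowed module has no attrgetter

    # Celery pulls in the stdlib module during startup (as noted above), so the
    # worker crashes before the app loads. Renaming the entry point to main.py
    # removes the shadowing file, so no sys.path juggling is needed.
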
Changes: - Rename operator/operator.py to operator/main.py - Add KOPF_OPERATORS=main.py to operator deployment - Simplify worker/scheduler to use direct celery command - Remove poolboy_worker.py workaround --- helm/templates/deployment.yaml | 2 ++ .../worker/scheduler-deployment.yaml | 2 +- helm/templates/worker/worker-deployment.yaml | 2 +- operator/{operator.py => main.py} | 0 operator/poolboy_worker.py | 36 ------------------- 5 files changed, 4 insertions(+), 38 deletions(-) rename operator/{operator.py => main.py} (100%) delete mode 100644 operator/poolboy_worker.py diff --git a/helm/templates/deployment.yaml b/helm/templates/deployment.yaml index daf4807..50dc67d 100644 --- a/helm/templates/deployment.yaml +++ b/helm/templates/deployment.yaml @@ -24,6 +24,8 @@ spec: containers: - name: manager env: + - name: KOPF_OPERATORS + value: main.py - name: CLUSTER_DOMAIN value: "{{ .Values.clusterDomain }}" - name: MANAGE_CLAIMS_INTERVAL diff --git a/helm/templates/worker/scheduler-deployment.yaml b/helm/templates/worker/scheduler-deployment.yaml index 043d561..6631614 100644 --- a/helm/templates/worker/scheduler-deployment.yaml +++ b/helm/templates/worker/scheduler-deployment.yaml @@ -28,7 +28,7 @@ spec: - name: scheduler image: "{{ include "poolboy.image" . }}" imagePullPolicy: {{ .Values.image.pullPolicy }} - command: ["python3", "/opt/app-root/operator/poolboy_worker.py"] + command: ["celery"] args: - "-A" - "processor.app" diff --git a/helm/templates/worker/worker-deployment.yaml b/helm/templates/worker/worker-deployment.yaml index 79e0b87..3600168 100644 --- a/helm/templates/worker/worker-deployment.yaml +++ b/helm/templates/worker/worker-deployment.yaml @@ -28,7 +28,7 @@ spec: - name: worker image: "{{ include "poolboy.image" . }}" imagePullPolicy: {{ .Values.image.pullPolicy }} - command: ["python3", "/opt/app-root/operator/poolboy_worker.py"] + command: ["celery"] args: - "-A" - "processor.app" diff --git a/operator/operator.py b/operator/main.py similarity index 100% rename from operator/operator.py rename to operator/main.py diff --git a/operator/poolboy_worker.py b/operator/poolboy_worker.py deleted file mode 100644 index efd28e2..0000000 --- a/operator/poolboy_worker.py +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env python3 -""" -Worker entry point for Poolboy. - -This script resolves the naming conflict between the 'operator' directory -and Python's stdlib 'operator' module by: -1. Starting from a neutral directory (not /opt/app-root/operator) -2. Importing Celery FIRST (before our code is in sys.path) -3. Adding our operator directory to sys.path -4. 
Then loading our Celery app - -Usage: - python poolboy_worker.py worker --loglevel=info - python poolboy_worker.py beat --loglevel=info -""" -import os -import sys - -# Ensure we're not importing from operator directory initially -# This allows Celery and its dependencies to load without conflict -operator_path = '/opt/app-root/operator' -if operator_path in sys.path: - sys.path.remove(operator_path) - -# Now import Celery (and all stdlib dependencies like 'operator' module) -from celery.__main__ import main as celery_main - -# Add our operator directory to path for our app imports -sys.path.insert(0, operator_path) - -# Change to operator directory for relative imports in our code -os.chdir(operator_path) - -if __name__ == '__main__': - celery_main() - From 61e6b244545dd6961f865828f6906d14f1550b58 Mon Sep 17 00:00:00 2001 From: Marcos Amorim Date: Fri, 2 Jan 2026 20:38:55 -0500 Subject: [PATCH 3/7] fix: add workingDir and PYTHONPATH for celery workers The celery command requires the operator directory in PYTHONPATH to find the 'tasks' module during autodiscovery. Changes: - Add workingDir: /opt/app-root/operator to worker and scheduler - Add PYTHONPATH=/opt/app-root/operator environment variable - Use direct celery command instead of shell wrapper --- helm/templates/worker/scheduler-deployment.yaml | 3 +++ helm/templates/worker/worker-deployment.yaml | 3 +++ 2 files changed, 6 insertions(+) diff --git a/helm/templates/worker/scheduler-deployment.yaml b/helm/templates/worker/scheduler-deployment.yaml index 6631614..11aa836 100644 --- a/helm/templates/worker/scheduler-deployment.yaml +++ b/helm/templates/worker/scheduler-deployment.yaml @@ -28,6 +28,7 @@ spec: - name: scheduler image: "{{ include "poolboy.image" . }}" imagePullPolicy: {{ .Values.image.pullPolicy }} + workingDir: /opt/app-root/operator command: ["celery"] args: - "-A" @@ -42,6 +43,8 @@ spec: {{- toYaml . | nindent 12 }} {{- end }} env: + - name: PYTHONPATH + value: /opt/app-root/operator {{- range $key, $value := .Values.scheduler.extraEnvVars }} - name: {{ $key }} value: {{ $value | quote }} diff --git a/helm/templates/worker/worker-deployment.yaml b/helm/templates/worker/worker-deployment.yaml index 3600168..7fd0b4b 100644 --- a/helm/templates/worker/worker-deployment.yaml +++ b/helm/templates/worker/worker-deployment.yaml @@ -28,6 +28,7 @@ spec: - name: worker image: "{{ include "poolboy.image" . }}" imagePullPolicy: {{ .Values.image.pullPolicy }} + workingDir: /opt/app-root/operator command: ["celery"] args: - "-A" @@ -51,6 +52,8 @@ spec: {{- toYaml . | nindent 12 }} {{- end }} env: + - name: PYTHONPATH + value: /opt/app-root/operator {{- range $key, $value := .Values.worker.extraEnvVars }} - name: {{ $key }} value: {{ $value | quote }} From 249760455c847d76a9b6cb84708b1308fc5b25b3 Mon Sep 17 00:00:00 2001 From: Marcos Amorim Date: Mon, 5 Jan 2026 17:02:35 -0500 Subject: [PATCH 4/7] fix(resourcehandle): Fix pool handle selection order for distributed mode ResourceHandleMatch comparison was preferring handles with known health/ready status over unknown status, causing newer handles to be selected before older ones when the older handles hadn't been processed yet. Changed comparison to only prefer healthy=True over healthy=False (not over None), ensuring creation_timestamp remains the final tiebreaker for FIFO ordering. 
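A simplified sketch of the resulting ordering, using stand-in objects and omitting
the readiness and template-difference checks of the real ResourceHandleMatch:

    from dataclasses import dataclass
    from datetime import datetime, timezone

    @dataclass
    class Match:
        is_healthy: object          # True, False, or None (health not yet reported)
        creation_timestamp: datetime

        def __lt__(self, cmp):
            # Only known-unhealthy loses to known-healthy; None no longer matters.
            if self.is_healthy and cmp.is_healthy is False:
                return True
            if self.is_healthy is False and cmp.is_healthy:
                return False
            # Otherwise the older handle sorts first (FIFO).
            return self.creation_timestamp < cmp.creation_timestamp

    older_unprocessed = Match(None, datetime(2026, 1, 1, tzinfo=timezone.utc))
    newer_healthy = Match(True, datetime(2026, 1, 5, tzinfo=timezone.utc))

    # Previously the newer handle won because its health was known;
    # with this change the older handle is selected.
    assert min(newer_healthy, older_unprocessed) is older_unprocessed
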
--- operator/resourcehandle.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/operator/resourcehandle.py b/operator/resourcehandle.py index fae445e..1bb4569 100644 --- a/operator/resourcehandle.py +++ b/operator/resourcehandle.py @@ -47,25 +47,16 @@ def __lt__(self, cmp): if self.template_difference_count > cmp.template_difference_count: return False - # Prefer healthy resources to unknown health state - if self.resource_handle.is_healthy and cmp.resource_handle.is_healthy is None: + if self.resource_handle.is_healthy and cmp.resource_handle.is_healthy is False: return True - if self.resource_handle.is_healthy is None and cmp.resource_handle.is_healthy: + if self.resource_handle.is_healthy is False and cmp.resource_handle.is_healthy: return False - # Prefer ready resources to unready or unknown readiness state if self.resource_handle.is_ready and not cmp.resource_handle.is_ready: return True if not self.resource_handle.is_ready and cmp.resource_handle.is_ready: return False - # Prefer unknown readiness state to known unready state - if self.resource_handle.is_ready is None and cmp.resource_handle.is_ready is False: - return True - if self.resource_handle.is_ready is not False and cmp.resource_handle.is_ready is None: - return False - - # Prefer older matches return self.resource_handle.creation_timestamp < cmp.resource_handle.creation_timestamp class ResourceHandle(KopfObject): From c6438e36e8f0c4372c251b226a243db2d50fa8f3 Mon Sep 17 00:00:00 2001 From: Marcos Amorim Date: Mon, 5 Jan 2026 17:42:41 -0500 Subject: [PATCH 5/7] test: increase timeouts to accommodate daemon polling interval Status changes (approval, lifespan, auto-delete/detach conditions) do not trigger Kopf events - they are only detected when the daemon runs its next cycle (every 60 seconds). 
Adjustments: - Increase delay from 1s to 2s and retries from 10-20 to 30-45 - This allows up to 90 seconds for daemon-dependent operations - Update finalizer expectation to match simplified finalizer format --- .../poolboy_test_simple/tasks/test-02.yaml | 4 +- .../tasks/test-approval-01.yaml | 4 +- .../tasks/test-auto-delete-01.yaml | 4 +- .../tasks/test-auto-detach-01.yaml | 4 +- .../tasks/test-finalizers-01.yaml | 6 +- .../tasks/test-ignore-01.yaml | 12 ++-- .../tasks/test-lifespan-start-01.yaml | 4 +- .../tasks/test-linked-01.yaml | 8 +-- .../tasks/test-linked-03.yaml | 60 +++++++++---------- .../tasks/test-pool-04.yaml | 4 +- .../roles/poolboy_test_simple/tasks/test.yaml | 16 ++--- 11 files changed, 63 insertions(+), 63 deletions(-) diff --git a/test/roles/poolboy_test_simple/tasks/test-02.yaml b/test/roles/poolboy_test_simple/tasks/test-02.yaml index 8677914..c2a8560 100644 --- a/test/roles/poolboy_test_simple/tasks/test-02.yaml +++ b/test/roles/poolboy_test_simple/tasks/test-02.yaml @@ -191,8 +191,8 @@ __state.status.lifespan.maximum != '2d' or __state.status.lifespan.relativeMaximum != '1d' until: r_get_resource_claim is success - delay: 1 - retries: 15 + delay: 2 + retries: 30 - name: Selt lifespan test-02 to end now vars: diff --git a/test/roles/poolboy_test_simple/tasks/test-approval-01.yaml b/test/roles/poolboy_test_simple/tasks/test-approval-01.yaml index 9a4cccc..9fd2896 100644 --- a/test/roles/poolboy_test_simple/tasks/test-approval-01.yaml +++ b/test/roles/poolboy_test_simple/tasks/test-approval-01.yaml @@ -93,8 +93,8 @@ r_get_resource_claim.resources[0].status.resourceHandle is undefined or r_get_resource_claim.resources[0].status.resources[0].state is undefined until: r_get_resource_claim is success - delay: 1 - retries: 10 + delay: 2 + retries: 30 - name: Delete ResourceClaim test-approval-01 kubernetes.core.k8s: diff --git a/test/roles/poolboy_test_simple/tasks/test-auto-delete-01.yaml b/test/roles/poolboy_test_simple/tasks/test-auto-delete-01.yaml index 316bc0f..ddae839 100644 --- a/test/roles/poolboy_test_simple/tasks/test-auto-delete-01.yaml +++ b/test/roles/poolboy_test_simple/tasks/test-auto-delete-01.yaml @@ -122,8 +122,8 @@ register: r_get_resource_claim failed_when: r_get_resource_claim.resources | length != 0 until: r_get_resource_claim is success - retries: 5 - delay: 1 + retries: 45 + delay: 2 - name: Verify delete of ResourceHandle for test-auto-delete-01-a kubernetes.core.k8s_info: diff --git a/test/roles/poolboy_test_simple/tasks/test-auto-detach-01.yaml b/test/roles/poolboy_test_simple/tasks/test-auto-detach-01.yaml index d07935f..6287c33 100644 --- a/test/roles/poolboy_test_simple/tasks/test-auto-detach-01.yaml +++ b/test/roles/poolboy_test_simple/tasks/test-auto-detach-01.yaml @@ -124,8 +124,8 @@ r_get_resource_claim.resources | length != 1 or not r_get_resource_claim.resources[0].status.resourceHandle.detached | default(False) | bool until: r_get_resource_claim is success - retries: 5 - delay: 1 + retries: 45 + delay: 2 - name: Verify delete of ResourceHandle for test-auto-detach-01-a kubernetes.core.k8s_info: diff --git a/test/roles/poolboy_test_simple/tasks/test-finalizers-01.yaml b/test/roles/poolboy_test_simple/tasks/test-finalizers-01.yaml index fe4d2e8..78f6c4d 100644 --- a/test/roles/poolboy_test_simple/tasks/test-finalizers-01.yaml +++ b/test/roles/poolboy_test_simple/tasks/test-finalizers-01.yaml @@ -68,8 +68,8 @@ failed_when: >- r_get_resource_claim.resources[0].status.resources[0].state is undefined until: r_get_resource_claim is success - 
delay: 1 - retries: 10 + delay: 2 + retries: 30 - name: Save facts from for ResourceClaim test-finalizers-01-a vars: @@ -109,7 +109,7 @@ failed_when: >- r_get_resource_handle.resources | length != 1 or r_get_resource_handle.resources[0].metadata.finalizers is undefined or - r_get_resource_handle.resources[0].metadata.finalizers != [poolboy_domain ~ '/handler'] + r_get_resource_handle.resources[0].metadata.finalizers != [poolboy_domain] - name: Set deprecated finalizer on ResourceHandle for test-finalizers-01-a kubernetes.core.k8s: diff --git a/test/roles/poolboy_test_simple/tasks/test-ignore-01.yaml b/test/roles/poolboy_test_simple/tasks/test-ignore-01.yaml index a00444f..88d347c 100644 --- a/test/roles/poolboy_test_simple/tasks/test-ignore-01.yaml +++ b/test/roles/poolboy_test_simple/tasks/test-ignore-01.yaml @@ -168,10 +168,10 @@ register: r_get_resource_claim_test failed_when: >- r_get_resource_claim_test.resources | length != 1 or - (now(true) - r_get_resource_claim_test.resources[0].spec.ts | to_datetime('%Y-%m-%dT%H:%M:%SZ')).total_seconds() > 10 + (now(true) - r_get_resource_claim_test.resources[0].spec.ts | to_datetime('%Y-%m-%dT%H:%M:%SZ')).total_seconds() > 15 until: r_get_resource_claim_test is success - delay: 1 - retries: 20 + delay: 2 + retries: 45 - name: Delete ResourceClaim test-ignore-01-a kubernetes.core.k8s: @@ -438,10 +438,10 @@ register: r_get_resource_claim_test failed_when: >- r_get_resource_claim_test.resources | length != 1 or - (now(true) - r_get_resource_claim_test.resources[0].spec.ts | to_datetime('%Y-%m-%dT%H:%M:%SZ')).total_seconds() > 10 + (now(true) - r_get_resource_claim_test.resources[0].spec.ts | to_datetime('%Y-%m-%dT%H:%M:%SZ')).total_seconds() > 15 until: r_get_resource_claim_test is success - delay: 1 - retries: 20 + delay: 2 + retries: 45 - name: Delete ResourceClaim test-ignore-01-b kubernetes.core.k8s: diff --git a/test/roles/poolboy_test_simple/tasks/test-lifespan-start-01.yaml b/test/roles/poolboy_test_simple/tasks/test-lifespan-start-01.yaml index 87eb9d1..4dbfa89 100644 --- a/test/roles/poolboy_test_simple/tasks/test-lifespan-start-01.yaml +++ b/test/roles/poolboy_test_simple/tasks/test-lifespan-start-01.yaml @@ -85,8 +85,8 @@ r_get_resource_claim.resources[0].status.lifespan.start is undefined or r_get_resource_claim.resources[0].status.resourceHandle.name is undefined or r_get_resource_claim.resources[0].status.resources[0].reference is undefined - delay: 1 - retries: 10 + delay: 2 + retries: 45 until: r_get_resource_claim is successful - name: Delete ResourceClaim test-lifespan-start-01 diff --git a/test/roles/poolboy_test_simple/tasks/test-linked-01.yaml b/test/roles/poolboy_test_simple/tasks/test-linked-01.yaml index 7b40dfb..926531e 100644 --- a/test/roles/poolboy_test_simple/tasks/test-linked-01.yaml +++ b/test/roles/poolboy_test_simple/tasks/test-linked-01.yaml @@ -147,8 +147,8 @@ __resource_claim.status.resources[1].provider.name != 'test-linked-01-binder' or __resource_claim.status.resources[1].waitingFor != 'Linked ResourceProvider' until: r_get_resource_claim is success - delay: 1 - retries: 10 + delay: 2 + retries: 45 - name: Save facts from for ResourceClaim test-linked-01-a vars: @@ -226,8 +226,8 @@ __resource_claim.status.resources[1].state is undefined or __resource_claim.status.resources[1].waitingFor is defined until: r_get_resource_claim is success - delay: 1 - retries: 10 + delay: 2 + retries: 45 - name: Verify state of ResourceClaim test-linked-01-a binder vars: diff --git 
a/test/roles/poolboy_test_simple/tasks/test-linked-03.yaml b/test/roles/poolboy_test_simple/tasks/test-linked-03.yaml index 2a4ccb2..0d55bfb 100644 --- a/test/roles/poolboy_test_simple/tasks/test-linked-03.yaml +++ b/test/roles/poolboy_test_simple/tasks/test-linked-03.yaml @@ -208,8 +208,8 @@ __resource_claim.status.resources[0].state.spec.numbervalue != 0 or __resource_claim.status.resources[0].state.spec.stringvalue != 'NO BASE' until: r_get_resource_claim is success - delay: 1 - retries: 10 + delay: 2 + retries: 30 - name: Create ResourceClaim test-linked-03-b kubernetes.core.k8s: @@ -252,8 +252,8 @@ __resource_claim.status.resources[1].state.spec.numbervalue != 10 or __resource_claim.status.resources[1].state.spec.stringvalue != 'ONE-A' until: r_get_resource_claim is success - delay: 1 - retries: 10 + delay: 2 + retries: 30 - name: Save facts from for ResourceClaim test-linked-03-b vars: @@ -276,8 +276,8 @@ register: r_get_resource_claim_test failed_when: r_get_resource_claim_test.resources | length != 1 until: r_get_resource_claim_test is success - delay: 1 - retries: 10 + delay: 2 + retries: 30 - name: Verify creation of binder ResourceClaimTest for test-linked-03-b kubernetes.core.k8s_info: @@ -288,8 +288,8 @@ register: r_get_resource_claim_test failed_when: r_get_resource_claim_test.resources | length != 1 until: r_get_resource_claim_test is success - delay: 1 - retries: 10 + delay: 2 + retries: 30 - name: Delete ResourceClaim test-linked-03-b kubernetes.core.k8s: @@ -308,8 +308,8 @@ register: r_get_resource_claim failed_when: r_get_resource_claim.resources | length != 0 until: r_get_resource_claim is success - retries: 5 - delay: 1 + retries: 10 + delay: 2 - name: Verify delete of ResourceHandle for test-linked-03-b kubernetes.core.k8s_info: @@ -320,8 +320,8 @@ register: r_get_resource_handle failed_when: r_get_resource_handle.resources | length != 0 until: r_get_resource_handle is success - retries: 5 - delay: 1 + retries: 10 + delay: 2 - name: Verify delete of base ResourceClaimTest test-linked-03-b kubernetes.core.k8s_info: @@ -332,8 +332,8 @@ register: r_get_resource_claim_test failed_when: r_get_resource_claim_test.resources | length != 0 until: r_get_resource_claim_test is success - delay: 1 - retries: 10 + delay: 2 + retries: 30 - name: Verify delete of binder ResourceClaimTest test-linked-03-b kubernetes.core.k8s_info: @@ -344,8 +344,8 @@ register: r_get_resource_claim_test failed_when: r_get_resource_claim_test.resources | length != 0 until: r_get_resource_claim_test is success - delay: 1 - retries: 10 + delay: 2 + retries: 30 - name: Create ResourceClaim test-linked-03-c kubernetes.core.k8s: @@ -388,8 +388,8 @@ __resource_claim.status.resources[1].state.spec.numbervalue != 200 or __resource_claim.status.resources[1].state.spec.stringvalue != 'TWO-B' until: r_get_resource_claim is success - delay: 1 - retries: 10 + delay: 2 + retries: 30 - name: Save facts from for ResourceClaim test-linked-03-c vars: @@ -412,8 +412,8 @@ register: r_get_resource_claim_test failed_when: r_get_resource_claim_test.resources | length != 1 until: r_get_resource_claim_test is success - delay: 1 - retries: 10 + delay: 2 + retries: 30 - name: Verify creation of binder ResourceClaimTest for test-linked-03-c kubernetes.core.k8s_info: @@ -424,8 +424,8 @@ register: r_get_resource_claim_test failed_when: r_get_resource_claim_test.resources | length != 1 until: r_get_resource_claim_test is success - delay: 1 - retries: 10 + delay: 2 + retries: 30 - name: Delete ResourceClaim test-linked-03-c 
kubernetes.core.k8s: @@ -444,8 +444,8 @@ register: r_get_resource_claim failed_when: r_get_resource_claim.resources | length != 0 until: r_get_resource_claim is success - retries: 5 - delay: 1 + retries: 10 + delay: 2 - name: Verify delete of ResourceHandle for test-linked-03-c kubernetes.core.k8s_info: @@ -456,8 +456,8 @@ register: r_get_resource_handle failed_when: r_get_resource_handle.resources | length != 0 until: r_get_resource_handle is success - retries: 5 - delay: 1 + retries: 10 + delay: 2 - name: Verify delete of base ResourceClaimTest test-linked-03-c kubernetes.core.k8s_info: @@ -468,8 +468,8 @@ register: r_get_resource_claim_test failed_when: r_get_resource_claim_test.resources | length != 0 until: r_get_resource_claim_test is success - delay: 1 - retries: 10 + delay: 2 + retries: 30 - name: Verify delete of binder ResourceClaimTest test-linked-03-c kubernetes.core.k8s_info: @@ -480,6 +480,6 @@ register: r_get_resource_claim_test failed_when: r_get_resource_claim_test.resources | length != 0 until: r_get_resource_claim_test is success - delay: 1 - retries: 10 + delay: 2 + retries: 30 ... diff --git a/test/roles/poolboy_test_simple/tasks/test-pool-04.yaml b/test/roles/poolboy_test_simple/tasks/test-pool-04.yaml index 2b5a02d..ed24b75 100644 --- a/test/roles/poolboy_test_simple/tasks/test-pool-04.yaml +++ b/test/roles/poolboy_test_simple/tasks/test-pool-04.yaml @@ -120,8 +120,8 @@ r_get_resource_claim.resources[0].status.resourceHandle is undefined or r_get_resource_claim.resources[0].status.resourceHandle.name == failed_resource_handle_name until: r_get_resource_claim is success - delay: 1 - retries: 10 + delay: 2 + retries: 30 - name: Update ResourcePool test-pool-04 to create healthy but unready ResourceHandles kubernetes.core.k8s: diff --git a/test/roles/poolboy_test_simple/tasks/test.yaml b/test/roles/poolboy_test_simple/tasks/test.yaml index c07367f..eaf494e 100644 --- a/test/roles/poolboy_test_simple/tasks/test.yaml +++ b/test/roles/poolboy_test_simple/tasks/test.yaml @@ -586,8 +586,8 @@ claim.status.resourceHandle.name != first_pool_resource_handle_name or claim.status.resources[0].state.spec.vars.desired_state != 'started' until: r_get_test_pool_match_claim is success - retries: 10 - delay: 1 + retries: 30 + delay: 3 - name: Create ResourceClaim test-pool-explicit kubernetes.core.k8s: @@ -658,8 +658,8 @@ claim.status.resources[0].state.spec.vars.name != 'test-pool' or claim.status.resources[0].state.spec.vars.number != 23 until: r_get_test_pool_explicit_claim is success - retries: 10 - delay: 1 + retries: 30 + delay: 2 - name: Delete test resource pool kubernetes.core.k8s: @@ -716,7 +716,7 @@ failed_when: r_verify_test_pool_handle_deletion.resources | length != 0 until: r_verify_test_pool_handle_deletion is success delay: 2 - retries: 10 + retries: 30 - name: Create test-templated ResourceProvider kubernetes.core.k8s: @@ -840,7 +840,7 @@ r_get_test_lifespan_1.resources | length > 0 until: r_get_test_lifespan_1 is success delay: 5 - retries: 10 + retries: 20 - name: Create ResourceClaim for test-lifespan-2 kubernetes.core.k8s: @@ -924,7 +924,7 @@ r_get_test_lifespan_2.resources | length > 0 until: r_get_test_lifespan_2 is success delay: 5 - retries: 10 + retries: 30 - name: Create test-lifespan resource pool kubernetes.core.k8s: @@ -1160,7 +1160,7 @@ not claim.status.resources[1].state is defined or not claim.status.resources[1].state.spec.vars.test_value == 'foo' until: r_test_linked_1_claim is success - retries: 5 + retries: 30 delay: 2 - name: Create ResourceHandle 
for test-linked-2 From 914ddb1b2c19fef43f722a0f374c01adf5ba199f Mon Sep 17 00:00:00 2001 From: Marcos Amorim Date: Tue, 6 Jan 2026 04:52:50 -0500 Subject: [PATCH 6/7] test(linked-02): increase delay/retries for status change detection --- .../tasks/test-linked-02.yaml | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/test/roles/poolboy_test_simple/tasks/test-linked-02.yaml b/test/roles/poolboy_test_simple/tasks/test-linked-02.yaml index 8fbedf8..8798707 100644 --- a/test/roles/poolboy_test_simple/tasks/test-linked-02.yaml +++ b/test/roles/poolboy_test_simple/tasks/test-linked-02.yaml @@ -122,8 +122,8 @@ __resource_claim.status.resources[0].provider.name != 'test-linked-02-base' or __resource_claim.status.resources[0].state is undefined until: r_get_resource_claim is success - delay: 1 - retries: 10 + delay: 2 + retries: 30 - name: Save facts from for ResourceClaim test-linked-02-a vars: @@ -149,8 +149,8 @@ register: r_get_resource_claim_test failed_when: r_get_resource_claim_test.resources | length != 1 until: r_get_resource_claim_test is success - delay: 1 - retries: 10 + delay: 2 + retries: 30 - name: Verify state of ResourceClaimTest for test-linked-02-a-base vars: @@ -185,8 +185,8 @@ r_get_resource_claim_test.resources[0].spec.stringvalue != 'TWO' or r_get_resource_claim_test.resources[0].spec.numbervalue != 20 until: r_get_resource_claim_test is success - delay: 1 - retries: 10 + delay: 2 + retries: 30 - name: Delete ResourceClaim test-linked-02-a kubernetes.core.k8s: @@ -205,8 +205,8 @@ register: r_get_resource_claim failed_when: r_get_resource_claim.resources | length != 0 until: r_get_resource_claim is success - retries: 5 - delay: 1 + retries: 10 + delay: 2 - name: Verify delete of ResourceHandle for test-linked-02-a kubernetes.core.k8s_info: @@ -217,8 +217,8 @@ register: r_get_resource_handle failed_when: r_get_resource_handle.resources | length != 0 until: r_get_resource_handle is success - retries: 5 - delay: 1 + retries: 10 + delay: 2 - name: Verify delete of ResourceClaimTest test-linked-02-a-base kubernetes.core.k8s_info: @@ -229,6 +229,6 @@ register: r_get_resource_claim_test failed_when: r_get_resource_claim_test.resources | length != 0 until: r_get_resource_claim_test is success - delay: 1 + delay: 2 retries: 10 ... From f08f4c5c1fbea1bc0f06f87a934298bc47cd12ba Mon Sep 17 00:00:00 2001 From: Marcos Amorim Date: Tue, 27 Jan 2026 07:27:26 -0500 Subject: [PATCH 7/7] Simplify operator mode: use IS_STANDALONE from Helm - Remove 9 workers_resource_* variables from poolboy.py - Remove workers_*_daemon_mode variables (3) - Remove operator_mode_distributed variable - Simplify is_standalone to read directly from IS_STANDALONE env var - Replace all workers_resource_* checks with "not is_standalone" in main.py - Daemons remain active for periodic processing This reduces complexity by moving mode logic to Helm templates. The operator now only needs to check is_standalone boolean. 
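Roughly, the resulting check collapses to a single flag (a sketch only; the exact
parsing in poolboy.py may differ):

    import os
    from str2bool import str2bool   # already pinned in requirements.txt

    # Helm renders IS_STANDALONE as "true" or "false" via poolboy.isStandalone.
    is_standalone = str2bool(os.environ.get("IS_STANDALONE", "false"))

    # Handlers branch on one boolean instead of per-resource feature flags:
    if is_standalone:
        ...   # process in-process, as before
    else:
        ...   # dispatch the event to a Celery worker
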
--- helm/helm-vars-dev-standalone.yaml | 32 + helm/helm-vars-dev.yaml | 33 +- helm/templates/_helpers.tpl | 18 + helm/templates/deployment.yaml | 10 +- helm/templates/worker/redis-deployment.yaml | 3 +- helm/templates/worker/redis-pvc.yaml | 2 +- helm/templates/worker/redis-service.yaml | 3 +- helm/templates/worker/useworkers-cm.yaml | 44 +- helm/templates/worker/worker-cm.yaml | 12 +- helm/values.yaml | 40 +- operator/cache.py | 13 +- operator/claimwatch.py | 287 ++++ operator/handlewatch.py | 270 ++++ operator/main.py | 60 +- operator/poolboy.py | 33 +- operator/processor/__init__.py | 4 +- operator/processor/app.py | 94 +- operator/resourcehandle.py | 1226 ++++++++++------- operator/resourcewatch.py | 305 ++-- operator/tasks/resourceclaim.py | 72 +- operator/tasks/resourcehandle.py | 77 +- operator/tasks/resourcepool.py | 52 +- test/ansible.cfg | 2 + test/dev-local.yaml | 19 + .../tasks/test-ignore-01.yaml | 34 +- .../tasks/test-lifespan-start-01.yaml | 6 +- .../tasks/test-pool-03.yaml | 2 +- .../tasks/test-ready-01.yaml | 30 +- .../tasks/test-requester-01.yaml | 2 +- .../tasks/test-vars-03.yaml | 4 +- .../roles/poolboy_test_simple/tasks/test.yaml | 29 +- 31 files changed, 1870 insertions(+), 948 deletions(-) create mode 100644 helm/helm-vars-dev-standalone.yaml create mode 100644 operator/claimwatch.py create mode 100644 operator/handlewatch.py create mode 100644 test/dev-local.yaml diff --git a/helm/helm-vars-dev-standalone.yaml b/helm/helm-vars-dev-standalone.yaml new file mode 100644 index 0000000..78214f8 --- /dev/null +++ b/helm/helm-vars-dev-standalone.yaml @@ -0,0 +1,32 @@ +# Standalone mode for development +# Usage: helm template poolboy-dev helm -f helm/helm-vars-dev-standalone.yaml | oc apply -f - + +clusterDomain: apps-crc.testing + +# Standalone mode - single operator pod handles everything +operatorMode: standalone + +namespace: + name: poolboy-dev + create: false + +image: + repository: image-registry.openshift-image-registry.svc:5000/poolboy-dev/poolboy + tagOverride: latest + pullPolicy: Always + +# =========================================== +# Disable distributed components +# =========================================== +redis: + enabled: false + +worker: + enabled: false + +scheduler: + enabled: false + +flower: + enabled: false + diff --git a/helm/helm-vars-dev.yaml b/helm/helm-vars-dev.yaml index f70e65b..cfa9651 100644 --- a/helm/helm-vars-dev.yaml +++ b/helm/helm-vars-dev.yaml @@ -3,8 +3,8 @@ clusterDomain: apps-crc.testing -# Use distributed mode with Celery workers for development -operatorMode: distributed +# Use standalone mode for testing watches without Celery overhead +operatorMode: standalone namespace: name: poolboy-dev @@ -45,36 +45,29 @@ flower: enabled: true # =========================================== -# Feature Flags - Enable Celery Workers +# Worker Settings (simplified) # =========================================== useWorkers: - resourcePool: - enabled: true - daemonMode: "scheduler" - partitions: 2 - resourceHandle: - enabled: true - daemonMode: "scheduler" - partitions: 4 - resourceClaim: - enabled: true - daemonMode: "scheduler" - partitions: 4 + lockRetryCountdown: 3 + errorRetryCountdown: 30 + partitions: + resourcePool: 2 + resourceHandle: 4 + resourceClaim: 4 # =========================================== -# Scheduled Tasks +# Scheduled Tasks (safety net - watches handle real-time events) # =========================================== schedules: maintain-all-pools: enabled: true schedule: - seconds: 30 + seconds: 30 # No PoolWatch yet, keep 
frequent maintain-all-handles: enabled: true schedule: - seconds: 60 + seconds: 60 # Needs polling for lifespan.end triggers maintain-all-claims: enabled: true schedule: - seconds: 60 - + seconds: 60 # Same as old daemon interval diff --git a/helm/templates/_helpers.tpl b/helm/templates/_helpers.tpl index 78fe298..93a50ed 100644 --- a/helm/templates/_helpers.tpl +++ b/helm/templates/_helpers.tpl @@ -106,3 +106,21 @@ Define the image to deploy {{- printf "%s:v%s" .Values.image.repository .Chart.AppVersion -}} {{- end -}} {{- end -}} + +{{/* +Determine if operator is running in standalone mode. +Backward compatibility mapping: + - 'all-in-one' -> standalone (true) + - 'standalone' -> standalone (true) + - 'manager', 'resource-handler', 'resource-watch' -> distributed (false) + - 'distributed' -> distributed (false) + - any other value -> distributed (false) +*/}} +{{- define "poolboy.isStandalone" -}} +{{- $mode := .Values.operatorMode | default "distributed" -}} +{{- if or (eq $mode "standalone") (eq $mode "all-in-one") -}} +true +{{- else -}} +false +{{- end -}} +{{- end -}} diff --git a/helm/templates/deployment.yaml b/helm/templates/deployment.yaml index 50dc67d..ff4ace2 100644 --- a/helm/templates/deployment.yaml +++ b/helm/templates/deployment.yaml @@ -28,16 +28,10 @@ spec: value: main.py - name: CLUSTER_DOMAIN value: "{{ .Values.clusterDomain }}" - - name: MANAGE_CLAIMS_INTERVAL - value: "{{ .Values.manageClaimsInterval }}" - - name: MANAGE_HANDLES_INTERVAL - value: "{{ .Values.manageHandlesInterval }}" - - name: MANAGE_POOLS_INTERVAL - value: "{{ .Values.managePoolsInterval }}" + - name: IS_STANDALONE + value: {{ include "poolboy.isStandalone" . | quote }} - name: OPERATOR_DOMAIN value: {{ include "poolboy.operatorDomain" . }} - - name: OPERATOR_MODE - value: "{{ .Values.operatorMode }}" {{- if .Values.enablePrometheusMetrics}} - name: METRICS_ENABLED value: "true" diff --git a/helm/templates/worker/redis-deployment.yaml b/helm/templates/worker/redis-deployment.yaml index 6795f4b..aed3309 100644 --- a/helm/templates/worker/redis-deployment.yaml +++ b/helm/templates/worker/redis-deployment.yaml @@ -1,3 +1,4 @@ +{{- if or .Values.redis.enabled (eq .Values.operatorMode "distributed") }} apiVersion: apps/v1 kind: Deployment metadata: @@ -54,4 +55,4 @@ spec: persistentVolumeClaim: claimName: {{ include "poolboy.name" . }}-redis-pvc {{- end }} - +{{- end }} diff --git a/helm/templates/worker/redis-pvc.yaml b/helm/templates/worker/redis-pvc.yaml index 53ca96f..2973ba1 100644 --- a/helm/templates/worker/redis-pvc.yaml +++ b/helm/templates/worker/redis-pvc.yaml @@ -1,4 +1,4 @@ -{{- if .Values.redis.persistence.enabled }} +{{- if and (or .Values.redis.enabled (eq .Values.operatorMode "distributed")) .Values.redis.persistence.enabled }} apiVersion: v1 kind: PersistentVolumeClaim metadata: diff --git a/helm/templates/worker/redis-service.yaml b/helm/templates/worker/redis-service.yaml index dfb1bbc..ceda039 100644 --- a/helm/templates/worker/redis-service.yaml +++ b/helm/templates/worker/redis-service.yaml @@ -1,3 +1,4 @@ +{{- if or .Values.redis.enabled (eq .Values.operatorMode "distributed") }} apiVersion: v1 kind: Service metadata: @@ -16,4 +17,4 @@ spec: selector: {{- include "poolboy.selectorLabels" . 
| nindent 4 }} app.kubernetes.io/component: redis - +{{- end }} diff --git a/helm/templates/worker/useworkers-cm.yaml b/helm/templates/worker/useworkers-cm.yaml index ee5c7ba..5911eda 100644 --- a/helm/templates/worker/useworkers-cm.yaml +++ b/helm/templates/worker/useworkers-cm.yaml @@ -1,5 +1,5 @@ -{{- /* UseWorkers ConfigMap - auto-enabled when operatorMode is 'distributed' */ -}} -{{- if or .Values.worker.enabled (eq .Values.operatorMode "distributed") }} +{{- /* UseWorkers ConfigMap - auto-enabled when not standalone */ -}} +{{- if or .Values.worker.enabled (ne (include "poolboy.isStandalone" .) "true") }} apiVersion: v1 kind: ConfigMap metadata: @@ -14,40 +14,20 @@ data: # Celery config for sending tasks from operator CELERY_BROKER_URL: {{ printf "redis://%s-redis:6379/0" (include "poolboy.name" .) | quote }} CELERY_RESULT_BACKEND: {{ printf "redis://%s-redis:6379/1" (include "poolboy.name" .) | quote }} - # Lock retry delay (seconds) + # Lock retry delay (seconds) - used when resource is locked by another task WORKERS_LOCK_RETRY_COUNTDOWN: {{ .Values.useWorkers.lockRetryCountdown | default 3 | quote }} - # Error retry delay (seconds) + # Error retry delay (seconds) - used when task fails with an error WORKERS_ERROR_RETRY_COUNTDOWN: {{ .Values.useWorkers.errorRetryCountdown | default 30 | quote }} - # ResourcePool configuration - {{- if .Values.useWorkers.resourcePool.enabled }} - WORKERS_RESOURCE_POOL: "true" - WORKERS_RESOURCE_POOL_DAEMON_MODE: {{ .Values.useWorkers.resourcePool.daemonMode | default "scheduler" | quote }} - {{- if .Values.useWorkers.resourcePool.partitions }} - PARTITION_RESOURCE_POOL: {{ .Values.useWorkers.resourcePool.partitions | quote }} + # Partitions for Celery queue routing (load distribution) + {{- with .Values.useWorkers.partitions }} + {{- if .resourcePool }} + PARTITION_RESOURCE_POOL: {{ .resourcePool | quote }} {{- end }} + {{- if .resourceHandle }} + PARTITION_RESOURCE_HANDLE: {{ .resourceHandle | quote }} {{- end }} - # ResourceHandle configuration - {{- if .Values.useWorkers.resourceHandle.enabled }} - WORKERS_RESOURCE_HANDLE: "true" - WORKERS_RESOURCE_HANDLE_DAEMON_MODE: {{ .Values.useWorkers.resourceHandle.daemonMode | default "scheduler" | quote }} - {{- if .Values.useWorkers.resourceHandle.partitions }} - PARTITION_RESOURCE_HANDLE: {{ .Values.useWorkers.resourceHandle.partitions | quote }} - {{- end }} - {{- end }} - # ResourceClaim configuration - {{- if .Values.useWorkers.resourceClaim.enabled }} - WORKERS_RESOURCE_CLAIM: "true" - WORKERS_RESOURCE_CLAIM_DAEMON_MODE: {{ .Values.useWorkers.resourceClaim.daemonMode | default "scheduler" | quote }} - {{- if .Values.useWorkers.resourceClaim.partitions }} - PARTITION_RESOURCE_CLAIM: {{ .Values.useWorkers.resourceClaim.partitions | quote }} - {{- end }} - {{- end }} - # Other resource types - {{- range $name, $config := .Values.useWorkers }} - {{- if and (kindIs "map" $config) (ne $name "resourcePool") (ne $name "resourceHandle") (ne $name "resourceClaim") (ne $name "lockRetryCountdown") (ne $name "errorRetryCountdown") }} - {{- if $config.enabled }} - WORKERS_{{ $name | snakecase | upper }}: "true" - {{- end }} + {{- if .resourceClaim }} + PARTITION_RESOURCE_CLAIM: {{ .resourceClaim | quote }} {{- end }} {{- end }} {{- end }} diff --git a/helm/templates/worker/worker-cm.yaml b/helm/templates/worker/worker-cm.yaml index 2d70452..b192274 100644 --- a/helm/templates/worker/worker-cm.yaml +++ b/helm/templates/worker/worker-cm.yaml @@ -18,9 +18,15 @@ data: CELERY_{{ $key | upper }}: {{ $value | quote }} 
{{- end }} # Partition configuration (from useWorkers) - {{- range $name, $config := .Values.useWorkers }} - {{- if and (kindIs "map" $config) $config.enabled $config.partitions }} - PARTITION_{{ $name | snakecase | upper }}: {{ $config.partitions | quote }} + {{- with .Values.useWorkers.partitions }} + {{- if .resourcePool }} + PARTITION_RESOURCE_POOL: {{ .resourcePool | quote }} + {{- end }} + {{- if .resourceHandle }} + PARTITION_RESOURCE_HANDLE: {{ .resourceHandle | quote }} + {{- end }} + {{- if .resourceClaim }} + PARTITION_RESOURCE_CLAIM: {{ .resourceClaim | quote }} {{- end }} {{- end }} # Operator configuration diff --git a/helm/values.yaml b/helm/values.yaml index bb4f086..653d789 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -24,9 +24,6 @@ operatorDomain: # Backward compatibility: 'all-in-one' maps to 'standalone', 'manager' maps to 'distributed' operatorMode: distributed -manageClaimsInterval: 60 -manageHandlesInterval: 60 -managePoolsInterval: 10 resourceRefreshInterval: 600 anarchy: @@ -107,30 +104,12 @@ useWorkers: # After 5 retries, the task fails permanently. errorRetryCountdown: 30 - resourcePool: - enabled: false - # Operation mode for periodic tasks: daemon, scheduler, or both - daemonMode: "scheduler" - partitions: 2 - resourceHandle: - enabled: false - # Operation mode for periodic tasks: daemon, scheduler, or both - daemonMode: "scheduler" - partitions: 4 - resourceClaim: - enabled: false - # Operation mode for periodic tasks: daemon, scheduler, or both - daemonMode: "scheduler" - partitions: 4 - resourceProvider: - enabled: false - partitions: 2 - resourceWatch: - enabled: false - partitions: 2 - cleanup: - enabled: false - partitions: 4 + # Partitions for Celery queue routing (load distribution across workers) + # Higher values = more parallelism but more queues to manage + partitions: + resourcePool: 2 + resourceHandle: 4 + resourceClaim: 4 # =========================================== # Redis Configuration @@ -277,20 +256,21 @@ flower: # =========================================== # Task Schedules # Supports: seconds (interval) or cron (expression) +# With ClaimWatch/HandleWatch, these are now safety nets (not primary) # =========================================== schedules: maintain-all-pools: enabled: false schedule: - seconds: 30 + seconds: 30 # No PoolWatch yet, keep frequent maintain-all-handles: enabled: false schedule: - seconds: 60 + seconds: 60 # Needs polling for lifespan.end triggers maintain-all-claims: enabled: false schedule: - seconds: 60 + seconds: 60 # Same as old daemon interval cleanup-stale-handles: enabled: false schedule: diff --git a/operator/cache.py b/operator/cache.py index ef6e696..9ac732b 100644 --- a/operator/cache.py +++ b/operator/cache.py @@ -33,6 +33,7 @@ class CacheTag(Enum): """Tags for cache key namespacing.""" + CLAIM = "claim" HANDLE = "handle" HANDLE_BOUND = "handle_bound" @@ -161,7 +162,7 @@ def keys(self, pattern: str) -> list[str]: def set(self, key: str, value: Any, ttl: int) -> None: """Set value with TTL in seconds. 
Serializes using 'definition' property if available.""" try: - if hasattr(value, 'definition'): + if hasattr(value, "definition"): data = json.dumps(value.definition) else: data = json.dumps(value) @@ -223,7 +224,7 @@ def get_keys_by_tag(cls, tag: CacheTag) -> list[str]: pattern = f"poolboy:{tag.value}:*" prefix = f"poolboy:{tag.value}:" keys = cls._backend.keys(pattern) - return [k[len(prefix):] for k in keys] + return [k[len(prefix) :] for k in keys] @classmethod def initialize(cls, standalone: Optional[bool] = None) -> None: @@ -231,13 +232,13 @@ def initialize(cls, standalone: Optional[bool] = None) -> None: Initialize the cache backend. Args: - standalone: Force standalone mode. If None, uses Poolboy.operator_mode_standalone. + standalone: Force standalone mode. If None, uses Poolboy.is_standalone. """ if cls._initialized: return if standalone is None: - standalone = Poolboy.operator_mode_standalone + standalone = Poolboy.is_standalone if standalone: logger.info("Cache: Using MemoryBackend (standalone mode)") @@ -248,7 +249,9 @@ def initialize(cls, standalone: Optional[bool] = None) -> None: try: cls._backend = RedisBackend(redis_url) except Exception as e: - logger.warning(f"Redis connection failed, falling back to MemoryBackend: {e}") + logger.warning( + f"Redis connection failed, falling back to MemoryBackend: {e}" + ) cls._backend = MemoryBackend() cls._initialized = True diff --git a/operator/claimwatch.py b/operator/claimwatch.py new file mode 100644 index 0000000..0593e05 --- /dev/null +++ b/operator/claimwatch.py @@ -0,0 +1,287 @@ +""" +ClaimWatch - Event-driven watch for ResourceClaims. + +Follows the proven pattern from ResourceWatch: +- Uses kubernetes_asyncio.watch.Watch() for event stream +- Handles 410 Expired and connection errors with automatic restart +- Works in both standalone and distributed modes +- Replaces the per-resource daemon with a single efficient watch + +Key difference from daemons: +- Daemons: Loop every 60s per resource (N coroutines for N resources) +- ClaimWatch: Single watch, event-driven processing (~instant latency) +""" + +import asyncio +import logging +from datetime import datetime, timezone +from typing import Mapping + +import kubernetes_asyncio +from poolboy import Poolboy +from resourceclaim import ResourceClaim + +logger = logging.getLogger("claim_watch") + + +class ClaimWatchRestartError(Exception): + """Raised when watch needs to restart (e.g., 410 Expired).""" + + pass + + +class ClaimWatchFailedError(Exception): + """Raised when watch encounters an unrecoverable error.""" + + pass + + +class ClaimWatch: + """Watch ResourceClaims for changes that require processing. + + This replaces the per-resource daemon with a single event-driven watch. 
+ When a claim changes, we check if it needs processing and either: + - Process directly (standalone mode) + - Dispatch to Celery workers (distributed mode) + """ + + # Singleton instance + _instance = None + _lock = asyncio.Lock() + + @classmethod + async def start(cls): + """Start the singleton ClaimWatch instance.""" + async with cls._lock: + if cls._instance is None: + cls._instance = cls() + await cls._instance.initialize() + return cls._instance + + @classmethod + async def stop_all(cls): + """Stop the singleton ClaimWatch instance.""" + async with cls._lock: + if cls._instance is not None: + await cls._instance.shutdown() + cls._instance = None + + def __init__(self): + self.task = None + # Cache last seen resourceVersion per claim for change detection + self._rv_cache: dict[str, str] = {} + + async def initialize(self): + """Start the watch loop as a background task.""" + logger.info("Starting ClaimWatch") + self.task = asyncio.create_task(self._watch_loop()) + + async def shutdown(self): + """Stop the watch loop.""" + if self.task: + self.task.cancel() + try: + await self.task + except asyncio.CancelledError: + pass + self.task = None + logger.info("ClaimWatch stopped") + + async def _watch_loop(self): + """Main watch loop with automatic restart on errors.""" + while True: + watch_start = datetime.now(timezone.utc) + try: + await self._watch() + except asyncio.CancelledError: + logger.debug("ClaimWatch cancelled") + return + except ClaimWatchRestartError as e: + logger.debug(f"ClaimWatch restart: {e}") + # Avoid tight restart loops + duration = (datetime.now(timezone.utc) - watch_start).total_seconds() + if duration < 10: + await asyncio.sleep(10 - duration) + except ClaimWatchFailedError as e: + logger.warning(f"ClaimWatch failed: {e}") + duration = (datetime.now(timezone.utc) - watch_start).total_seconds() + if duration < 60: + await asyncio.sleep(60 - duration) + except Exception: + logger.exception("ClaimWatch exception") + duration = (datetime.now(timezone.utc) - watch_start).total_seconds() + if duration < 60: + await asyncio.sleep(60 - duration) + logger.debug("Restarting ClaimWatch") + + async def _watch(self): + """Stream events from Kubernetes API.""" + watch = None + try: + watch = kubernetes_asyncio.watch.Watch() + # Watch all ResourceClaims cluster-wide + method = Poolboy.custom_objects_api.list_cluster_custom_object + kwargs = { + "group": ResourceClaim.api_group, + "version": ResourceClaim.api_version, + "plural": ResourceClaim.plural, + } + + async for event in watch.stream(method, **kwargs): + if not isinstance(event, Mapping): + raise ClaimWatchFailedError(f"Unknown event: {event}") + + event_type = event["type"] + event_obj = event["object"] + + if not isinstance(event_obj, Mapping): + event_obj = Poolboy.api_client.sanitize_for_serialization(event_obj) + + if event_type == "ERROR": + if event_obj.get("kind") == "Status": + reason = event_obj.get("reason", "") + if reason in ("Expired", "Gone"): + raise ClaimWatchRestartError(reason.lower()) + raise ClaimWatchFailedError( + f"{reason} {event_obj.get('message', '')}" + ) + raise ClaimWatchFailedError(f"Unknown error: {event}") + + try: + await self._handle_event(event_type, event_obj) + except Exception: + name = event_obj.get("metadata", {}).get("name", "unknown") + ns = event_obj.get("metadata", {}).get("namespace", "unknown") + logger.exception(f"Error handling event for {ns}/{name}") + + except kubernetes_asyncio.client.exceptions.ApiException as e: + if e.status == 410: + raise ClaimWatchRestartError("410 
Expired") + raise + finally: + if watch: + await watch.close() + + async def _handle_event(self, event_type: str, claim: Mapping) -> None: + """Handle a single claim event.""" + metadata = claim.get("metadata", {}) + name = metadata.get("name") + namespace = metadata.get("namespace") + uid = metadata.get("uid") + rv = metadata.get("resourceVersion") + labels = metadata.get("labels", {}) + + if not name or not namespace: + return + + cache_key = f"{namespace}/{name}" + + # Handle deletion + if event_type == "DELETED": + self._rv_cache.pop(cache_key, None) + return + + # Check if should be ignored + if Poolboy.ignore_label in labels: + return + + # Check if we've already processed this version + if self._rv_cache.get(cache_key) == rv: + return + + # Check if claim needs processing + if not self._needs_processing(claim): + # Update cache even if we don't process (to avoid recheck) + self._rv_cache[cache_key] = rv + return + + # Process the claim + await self._process_claim(claim) + self._rv_cache[cache_key] = rv + + def _needs_processing(self, claim: Mapping) -> bool: + """Check if claim needs processing based on its state. + + Returns True if: + - Claim has a handle and might need reconciliation + - Lifespan start time has been reached + - Status indicates processing needed + """ + status = claim.get("status", {}) + spec = claim.get("spec", {}) + + # If claim has a handle, it might need processing + if "resourceHandle" in status: + return True + + # Check if lifespan start is in the future + lifespan_start = spec.get("lifespan", {}).get("start") + if lifespan_start: + try: + start_dt = datetime.strptime(lifespan_start, "%Y-%m-%dT%H:%M:%S%z") + if start_dt > datetime.now(timezone.utc): + # Future start - don't process yet + return False + except (ValueError, TypeError): + pass + + # If detached, check lifespan end + if status.get("resourceHandle", {}).get("detached", False): + lifespan_end = status.get("lifespan", {}).get("end") + if lifespan_end: + try: + end_dt = datetime.strptime(lifespan_end, "%Y-%m-%dT%H:%M:%S%z") + if end_dt < datetime.now(timezone.utc): + return True # Past lifespan end, needs delete + except (ValueError, TypeError): + pass + return False # Detached, no processing needed + + # Default: process it + return True + + async def _process_claim(self, claim: Mapping) -> None: + """Process a claim - works in both standalone and distributed modes. + + IMPORTANT: ClaimWatch only processes claims that ALREADY have a handle. + Initial binding (no handle) is done by Kopf on.create handler to avoid + race conditions where both would try to create a handle simultaneously. 
+ + Like ResourceWatch, this method works in both modes: + - Standalone: calls resource_claim.manage() directly + - Distributed: dispatches to Celery workers + """ + metadata = claim.get("metadata", {}) + name = metadata.get("name") + namespace = metadata.get("namespace") + status = claim.get("status", {}) + has_handle = "resourceHandle" in status + + # Only process claims that already have a handle + # Initial binding is done by Kopf on.create handler + if not has_handle: + logger.debug( + f"ClaimWatch skipping {namespace}/{name} - no handle yet" + ) + return + + # In distributed mode, dispatch to Celery + if not Poolboy.is_standalone: + from tasks.resourceclaim import dispatch_manage_claim + + dispatch_manage_claim( + definition=claim, + name=name, + namespace=namespace, + ) + logger.debug(f"ClaimWatch dispatched {namespace}/{name} to worker") + else: + # In standalone mode, Kopf handlers also process create/update/delete. + # ClaimWatch provides backup processing for time-based operations. + # + # IMPORTANT: Use register_definition which updates from the event data. + # This ensures we have the latest data from Kubernetes. + resource_claim = await ResourceClaim.register_definition(claim) + if not resource_claim.ignore: + await resource_claim.manage(logger=logger) + logger.debug(f"ClaimWatch processed {namespace}/{name} directly") diff --git a/operator/handlewatch.py b/operator/handlewatch.py new file mode 100644 index 0000000..a7c55a3 --- /dev/null +++ b/operator/handlewatch.py @@ -0,0 +1,270 @@ +""" +HandleWatch - Event-driven watch for ResourceHandles. + +Follows the proven pattern from ResourceWatch: +- Uses kubernetes_asyncio.watch.Watch() for event stream +- Handles 410 Expired and connection errors with automatic restart +- Works in both standalone and distributed modes +- Replaces the per-resource daemon with a single efficient watch + +Key difference from daemons: +- Daemons: Loop every 60s per resource (N coroutines for N resources) +- HandleWatch: Single watch, event-driven processing (~instant latency) +""" + +import asyncio +import logging +from datetime import datetime, timezone +from typing import Mapping + +import kubernetes_asyncio +from poolboy import Poolboy +from resourcehandle import ResourceHandle + +logger = logging.getLogger("handle_watch") + + +class HandleWatchRestartError(Exception): + """Raised when watch needs to restart (e.g., 410 Expired).""" + + pass + + +class HandleWatchFailedError(Exception): + """Raised when watch encounters an unrecoverable error.""" + + pass + + +class HandleWatch: + """Watch ResourceHandles for changes that require processing. + + This replaces the per-resource daemon with a single event-driven watch. 
+ When a handle changes, we check if it needs processing and either: + - Process directly (standalone mode) + - Dispatch to Celery workers (distributed mode) + """ + + # Singleton instance + _instance = None + _lock = asyncio.Lock() + + @classmethod + async def start(cls): + """Start the singleton HandleWatch instance.""" + async with cls._lock: + if cls._instance is None: + cls._instance = cls() + await cls._instance.initialize() + return cls._instance + + @classmethod + async def stop_all(cls): + """Stop the singleton HandleWatch instance.""" + async with cls._lock: + if cls._instance is not None: + await cls._instance.shutdown() + cls._instance = None + + def __init__(self): + self.task = None + # Cache last seen resourceVersion per handle for change detection + self._rv_cache: dict[str, str] = {} + + async def initialize(self): + """Start the watch loop as a background task.""" + logger.info("Starting HandleWatch") + self.task = asyncio.create_task(self._watch_loop()) + + async def shutdown(self): + """Stop the watch loop.""" + if self.task: + self.task.cancel() + try: + await self.task + except asyncio.CancelledError: + pass + self.task = None + logger.info("HandleWatch stopped") + + async def _watch_loop(self): + """Main watch loop with automatic restart on errors.""" + while True: + watch_start = datetime.now(timezone.utc) + try: + await self._watch() + except asyncio.CancelledError: + logger.debug("HandleWatch cancelled") + return + except HandleWatchRestartError as e: + logger.debug(f"HandleWatch restart: {e}") + # Avoid tight restart loops + duration = (datetime.now(timezone.utc) - watch_start).total_seconds() + if duration < 10: + await asyncio.sleep(10 - duration) + except HandleWatchFailedError as e: + logger.warning(f"HandleWatch failed: {e}") + duration = (datetime.now(timezone.utc) - watch_start).total_seconds() + if duration < 60: + await asyncio.sleep(60 - duration) + except Exception: + logger.exception("HandleWatch exception") + duration = (datetime.now(timezone.utc) - watch_start).total_seconds() + if duration < 60: + await asyncio.sleep(60 - duration) + logger.debug("Restarting HandleWatch") + + async def _watch(self): + """Stream events from Kubernetes API.""" + watch = None + try: + watch = kubernetes_asyncio.watch.Watch() + # Watch ResourceHandles in operator namespace + method = Poolboy.custom_objects_api.list_namespaced_custom_object + kwargs = { + "group": ResourceHandle.api_group, + "version": ResourceHandle.api_version, + "plural": ResourceHandle.plural, + "namespace": Poolboy.namespace, + } + + async for event in watch.stream(method, **kwargs): + if not isinstance(event, Mapping): + raise HandleWatchFailedError(f"Unknown event: {event}") + + event_type = event["type"] + event_obj = event["object"] + + if not isinstance(event_obj, Mapping): + event_obj = Poolboy.api_client.sanitize_for_serialization(event_obj) + + if event_type == "ERROR": + if event_obj.get("kind") == "Status": + reason = event_obj.get("reason", "") + if reason in ("Expired", "Gone"): + raise HandleWatchRestartError(reason.lower()) + raise HandleWatchFailedError( + f"{reason} {event_obj.get('message', '')}" + ) + raise HandleWatchFailedError(f"Unknown error: {event}") + + try: + await self._handle_event(event_type, event_obj) + except Exception: + name = event_obj.get("metadata", {}).get("name", "unknown") + logger.exception(f"Error handling event for {name}") + + except kubernetes_asyncio.client.exceptions.ApiException as e: + if e.status == 410: + raise HandleWatchRestartError("410 
Expired") + raise + finally: + if watch: + await watch.close() + + async def _handle_event(self, event_type: str, handle: Mapping) -> None: + """Handle a single handle event.""" + metadata = handle.get("metadata", {}) + name = metadata.get("name") + namespace = metadata.get("namespace") + uid = metadata.get("uid") + rv = metadata.get("resourceVersion") + labels = metadata.get("labels", {}) + + if not name: + return + + cache_key = name + + # Handle deletion + if event_type == "DELETED": + self._rv_cache.pop(cache_key, None) + return + + # Check if should be ignored + if Poolboy.ignore_label in labels: + return + + # Check if we've already processed this version + if self._rv_cache.get(cache_key) == rv: + return + + # Check if handle needs processing + if not self._needs_processing(handle): + # Update cache even if we don't process (to avoid recheck) + self._rv_cache[cache_key] = rv + return + + # Process the handle + await self._process_handle(handle) + self._rv_cache[cache_key] = rv + + def _needs_processing(self, handle: Mapping) -> bool: + """Check if handle needs processing based on its state. + + Returns True if handle might need reconciliation. + """ + spec = handle.get("spec", {}) + status = handle.get("status", {}) + + # Check if past lifespan end + lifespan_end = spec.get("lifespan", {}).get("end") + if lifespan_end: + try: + end_dt = datetime.strptime(lifespan_end, "%Y-%m-%dT%H:%M:%S%z") + if end_dt < datetime.now(timezone.utc): + return True # Past lifespan end, needs delete + except (ValueError, TypeError): + pass + + # Check if bound to claim that might not exist + if "resourceClaim" in spec: + return True + + # Check if has resources that might need management + if spec.get("resources"): + return True + + return True # Default: process it + + async def _process_handle(self, handle: Mapping) -> None: + """Process a handle - works in both standalone and distributed modes. + + IMPORTANT: HandleWatch only processes handles that have been initialized. + Initial setup (no status.resources) is done by Kopf on.create handler + to avoid race conditions where both would try to create resources. + + Like ResourceWatch, this method works in both modes: + - Standalone: calls resource_handle.manage() directly + - Distributed: dispatches to Celery workers + """ + metadata = handle.get("metadata", {}) + name = metadata.get("name") + namespace = metadata.get("namespace") + status = handle.get("status", {}) + + # Only process handles that have been initialized (have status.resources) + # Initial setup is done by Kopf on.create handler + if "resources" not in status: + logger.debug( + f"HandleWatch skipping {name} - not initialized yet" + ) + return + + # In distributed mode, dispatch to Celery + if not Poolboy.is_standalone: + from tasks.resourcehandle import dispatch_manage_handle + + dispatch_manage_handle( + definition=handle, + name=name, + namespace=namespace, + ) + logger.debug(f"HandleWatch dispatched {name} to worker") + else: + # In standalone mode, Kopf handlers also process create/update/delete. + # HandleWatch provides backup processing for time-based operations. 
+ resource_handle = await ResourceHandle.register_definition(handle) + if not resource_handle.ignore: + await resource_handle.manage(logger=logger) + logger.debug(f"HandleWatch processed {name} directly") diff --git a/operator/main.py b/operator/main.py index 8fdc53e..eb90653 100755 --- a/operator/main.py +++ b/operator/main.py @@ -47,7 +47,7 @@ async def startup(logger: kopf.ObjectLogger, settings: kopf.OperatorSettings, ** # Configure logging configure_kopf_logging() # Initialize cache before any preload operations - Cache.initialize(standalone=Poolboy.operator_mode_standalone) + Cache.initialize(standalone=Poolboy.is_standalone) await Poolboy.on_startup(logger=logger) @@ -59,7 +59,7 @@ async def startup(logger: kopf.ObjectLogger, settings: kopf.OperatorSettings, ** await ResourceProvider.preload(logger=logger) # Preload ResourceHandles in standalone mode (distributed mode uses workers) - if Poolboy.operator_mode_standalone: + if Poolboy.is_standalone: await ResourceHandle.preload(logger=logger) @kopf.on.cleanup() @@ -123,7 +123,7 @@ async def resource_claim_event( # IMPORTANT: Only dispatch to worker if claim already has a handle. # Initial binding requires in-memory cache which workers don't have. # This ensures pool handles are correctly reused. - if Poolboy.workers_resource_claim and resource_claim.has_resource_handle: + if not Poolboy.is_standalone and resource_claim.has_resource_handle: from tasks.resourceclaim import dispatch_manage_claim dispatch_manage_claim( definition=resource_claim.definition, @@ -160,18 +160,18 @@ async def resource_claim_delete( uid = uid, ) - # Delegate to worker if enabled - if Poolboy.workers_resource_claim: + # Delegate to worker if not standalone + if not Poolboy.is_standalone: from tasks.resourceclaim import dispatch_delete_claim dispatch_delete_claim( definition=resource_claim.definition, name=resource_claim.name, namespace=resource_claim.namespace, ) + logger.info(f"Dispatched delete_claim for {name} in {namespace}") else: await resource_claim.handle_delete(logger=logger) - - await ResourceClaim.unregister(name=name, namespace=namespace) + await ResourceClaim.unregister(name=name, namespace=namespace) @kopf.daemon( ResourceClaim.api_group, ResourceClaim.api_version, ResourceClaim.plural, @@ -210,13 +210,9 @@ async def resource_claim_daemon( logger.info(f"{description} found deleted in daemon") return if not resource_claim.ignore: - # Delegate to worker if enabled, daemon mode active, AND claim has handle + # In distributed mode, dispatch to worker (if claim has handle) # Claims without handle need operator for binding (cache-dependent) - if ( - Poolboy.workers_resource_claim and - resource_claim.has_resource_handle and - Poolboy.workers_resource_claim_daemon_mode in ('daemon', 'both') - ): + if not Poolboy.is_standalone and resource_claim.has_resource_handle: from tasks.resourceclaim import dispatch_manage_claim dispatch_manage_claim( definition=resource_claim.definition, @@ -268,7 +264,7 @@ async def resource_handle_event( ) if resource_handle.ignore: return - if Poolboy.workers_resource_handle: + if not Poolboy.is_standalone: from tasks.resourcehandle import dispatch_manage_handle dispatch_manage_handle( definition=resource_handle.definition, @@ -307,7 +303,7 @@ async def resource_handle_delete( ) if resource_handle.ignore: return - if Poolboy.workers_resource_handle: + if not Poolboy.is_standalone: from tasks.resourcehandle import dispatch_delete_handle dispatch_delete_handle( definition=resource_handle.definition, @@ -354,14 +350,13 @@ async def 
resource_handle_daemon( logger.info(f"{description} found deleted in daemon") return if not resource_handle.ignore: - if Poolboy.workers_resource_handle: - if Poolboy.workers_resource_handle_daemon_mode in ('daemon', 'both'): - from tasks.resourcehandle import dispatch_manage_handle - dispatch_manage_handle( - definition=resource_handle.definition, - name=resource_handle.name, - namespace=resource_handle.namespace, - ) + if not Poolboy.is_standalone: + from tasks.resourcehandle import dispatch_manage_handle + dispatch_manage_handle( + definition=resource_handle.definition, + name=resource_handle.name, + namespace=resource_handle.namespace, + ) else: await resource_handle.manage(logger=logger) await asyncio.sleep(Poolboy.manage_handles_interval) @@ -405,7 +400,7 @@ async def resource_pool_event( status = status, uid = uid, ) - if Poolboy.workers_resource_pool: + if not Poolboy.is_standalone: from tasks.resourcepool import dispatch_manage_pool dispatch_manage_pool( definition=resource_pool.definition, @@ -442,7 +437,7 @@ async def resource_pool_delete( status = status, uid = uid, ) - if Poolboy.workers_resource_pool: + if not Poolboy.is_standalone: from tasks.resourcepool import dispatch_delete_pool_handles dispatch_delete_pool_handles( definition=resource_pool.definition, @@ -491,14 +486,13 @@ async def resource_pool_daemon( return if not resource_pool.ignore: - if Poolboy.workers_resource_pool: - if Poolboy.workers_resource_pool_daemon_mode in ('daemon', 'both'): - from tasks.resourcepool import dispatch_manage_pool - dispatch_manage_pool( - definition=resource_pool.definition, - name=resource_pool.name, - namespace=resource_pool.namespace, - ) + if not Poolboy.is_standalone: + from tasks.resourcepool import dispatch_manage_pool + dispatch_manage_pool( + definition=resource_pool.definition, + name=resource_pool.name, + namespace=resource_pool.namespace, + ) else: await resource_pool.manage(logger=logger) diff --git a/operator/poolboy.py b/operator/poolboy.py index a6b7b5c..c2f5bfe 100644 --- a/operator/poolboy.py +++ b/operator/poolboy.py @@ -11,18 +11,9 @@ class Poolboy(): manage_handles_interval = int(os.environ.get('MANAGE_HANDLES_INTERVAL', 60)) manage_pools_interval = int(os.environ.get('MANAGE_POOLS_INTERVAL', 10)) - # Operator mode: 'standalone' or 'distributed' - # Backward compatibility: - # - 'all-in-one' maps to 'standalone' - # - 'manager', 'resource-handler', 'resource-watch' map to 'distributed' - _operator_mode_raw = os.environ.get('OPERATOR_MODE', 'distributed') - operator_mode = ( - 'standalone' if _operator_mode_raw == 'all-in-one' - else 'distributed' if _operator_mode_raw in ('manager', 'resource-handler', 'resource-watch') - else _operator_mode_raw - ) - operator_mode_distributed = operator_mode == 'distributed' - operator_mode_standalone = operator_mode == 'standalone' + # Operator mode: standalone (local) or distributed (Celery workers) + # IS_STANDALONE is set by Helm based on operatorMode value + is_standalone = os.environ.get('IS_STANDALONE', 'false').lower() == 'true' operator_domain = os.environ.get('OPERATOR_DOMAIN', 'poolboy.gpte.redhat.com') operator_version = os.environ.get('OPERATOR_VERSION', 'v1') @@ -53,20 +44,9 @@ class Poolboy(): # TODO: Remove after all production clusters migrated (used for cleanup only) resource_handler_idx_label = f"{operator_domain}/resource-handler-idx" - # Worker feature flags (loaded from environment) - # When True, delegate processing to Celery workers - # When False, process synchronously in the main operator (current 
behavior) + # Worker retry config (used by Celery tasks) workers_error_retry_countdown = int(os.environ.get('WORKERS_ERROR_RETRY_COUNTDOWN', '30')) workers_lock_retry_countdown = int(os.environ.get('WORKERS_LOCK_RETRY_COUNTDOWN', '3')) - workers_resource_pool = os.environ.get('WORKERS_RESOURCE_POOL', 'false').lower() == 'true' - workers_resource_pool_daemon_mode = os.environ.get('WORKERS_RESOURCE_POOL_DAEMON_MODE', 'scheduler') - workers_resource_handle = os.environ.get('WORKERS_RESOURCE_HANDLE', 'false').lower() == 'true' - workers_resource_handle_daemon_mode = os.environ.get('WORKERS_RESOURCE_HANDLE_DAEMON_MODE', 'scheduler') - workers_resource_claim = os.environ.get('WORKERS_RESOURCE_CLAIM', 'false').lower() == 'true' - workers_resource_claim_daemon_mode = os.environ.get('WORKERS_RESOURCE_CLAIM_DAEMON_MODE', 'scheduler') - workers_resource_provider = os.environ.get('WORKERS_RESOURCE_PROVIDER', 'false').lower() == 'true' - workers_resource_watch = os.environ.get('WORKERS_RESOURCE_WATCH', 'false').lower() == 'true' - workers_cleanup = os.environ.get('WORKERS_CLEANUP', 'false').lower() == 'true' # Redis URL for distributed locking (used by main operator to send tasks) redis_url = os.environ.get('REDIS_URL') @@ -78,8 +58,9 @@ async def on_cleanup(cls): @classmethod async def on_startup(cls, logger: kopf.ObjectLogger): # Log operator mode on startup - logger.info(f"Poolboy starting in {cls.operator_mode} mode") - if cls.operator_mode_distributed: + mode = "standalone" if cls.is_standalone else "distributed" + logger.info(f"Poolboy starting in {mode} mode") + if not cls.is_standalone: logger.info("Distributed mode: delegating to Celery workers") if os.path.exists('/run/secrets/kubernetes.io/serviceaccount'): diff --git a/operator/processor/__init__.py b/operator/processor/__init__.py index 4e75720..7bcbafa 100644 --- a/operator/processor/__init__.py +++ b/operator/processor/__init__.py @@ -2,10 +2,10 @@ Celery processor module for Poolboy. Imports should be done directly from submodules to avoid circular imports: - from processor.app import app, WorkerState, is_worker_enabled + from processor.app import app, WorkerState This __init__.py intentionally does NOT import from .app to prevent circular import issues when tasks import processor components. """ -__all__ = ['app', 'config'] +__all__ = ["app", "config"] diff --git a/operator/processor/app.py b/operator/processor/app.py index eda7f69..7448168 100644 --- a/operator/processor/app.py +++ b/operator/processor/app.py @@ -25,13 +25,14 @@ from .config import WorkerConfig logger = get_task_logger(__name__) -T = TypeVar('T') +T = TypeVar("T") # ============================================================================= # TaskRouter - Convention-based routing # ============================================================================= + class TaskRouter: """ Route tasks to queues based on module naming convention. 
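
As an illustration of the routing convention described above, the sketch below shows how consistent hashing maps a resource to a stable partitioned queue so that all events for the same resource land on the same queue and are consumed in order by a single worker. The helper name and example values are illustrative; the md5-modulo scheme and the "<resource_type>_<partition>" queue naming mirror TaskRouter.get_queue_name shown further down.

    # Illustrative only: mirrors TaskRouter.get_queue_name's consistent-hash scheme.
    import hashlib

    def partition_queue(resource_type: str, name: str, namespace: str, partitions: int) -> str:
        """Return a stable queue name such as 'resource_claim_2'."""
        key = f"{namespace}/{name}"
        index = int(hashlib.md5(key.encode()).hexdigest(), 16) % partitions
        return f"{resource_type}_{index}"

    # The same claim always routes to the same queue, preserving event ordering:
    q1 = partition_queue("resource_claim", "demo-claim", "user-alice", 4)
    q2 = partition_queue("resource_claim", "demo-claim", "user-alice", 4)
    assert q1 == q2
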
@@ -57,8 +58,9 @@ class TaskRouter: - (not set) -> no partitioning, uses simple queue name """ - def __call__(self, name: str, args: tuple, kwargs: dict, options: dict, - task=None, **kw) -> dict | None: + def __call__( + self, name: str, args: tuple, kwargs: dict, options: dict, task=None, **kw + ) -> dict | None: """Make router callable for Celery's task_routes.""" return self.route(name, kwargs) @@ -71,7 +73,7 @@ def get_entity_from_module(self, module: str) -> str: resourceclaim -> claim cleanup -> cleanup """ - if module.startswith('resource') and len(module) > 8: + if module.startswith("resource") and len(module) > 8: return module[8:] # resourcepool -> pool return module @@ -81,10 +83,12 @@ def get_partitions(self, resource_type: str) -> int: value = os.environ.get(env_key) return int(value) if value else 0 - def get_queue_name(self, resource_type: str, resource_name: str, - namespace: str, partitions: int) -> str: + def get_queue_name( + self, resource_type: str, resource_name: str, namespace: str, partitions: int + ) -> str: """Calculate partitioned queue name using consistent hashing.""" import hashlib + resource_key = f"{namespace}/{resource_name}" hash_value = int(hashlib.md5(resource_key.encode()).hexdigest(), 16) partition_index = hash_value % partitions @@ -99,14 +103,14 @@ def get_resource_type(self, module: str) -> str: resourceclaim -> resource_claim cleanup -> cleanup """ - if module.startswith('resource') and len(module) > 8: + if module.startswith("resource") and len(module) > 8: return f"resource_{module[8:]}" return module def parse_task_name(self, name: str) -> tuple[str, str] | None: """Parse task name to extract module.""" - parts = name.split('.') - if len(parts) >= 3 and parts[0] == 'tasks': + parts = name.split(".") + if len(parts) >= 3 and parts[0] == "tasks": return parts[1], parts[2] return None @@ -122,22 +126,24 @@ def route(self, name: str, kwargs: dict) -> dict | None: # No partitioning configured - use default queue if not partitions: - return {'queue': 'default'} + return {"queue": "default"} # Get resource identifier from kwargs using convention # Fallback to generic 'name' and 'namespace' if entity-specific not found entity = self.get_entity_from_module(module) - resource_name = kwargs.get(f'{entity}_name') or kwargs.get('name') - namespace = kwargs.get(f'{entity}_namespace') or kwargs.get('namespace', 'default') + resource_name = kwargs.get(f"{entity}_name") or kwargs.get("name") + namespace = kwargs.get(f"{entity}_namespace") or kwargs.get( + "namespace", "default" + ) if resource_name: queue = self.get_queue_name( resource_type, resource_name, namespace, partitions ) - return {'queue': queue} + return {"queue": queue} # No resource identifier - use default queue - return {'queue': 'default'} + return {"queue": "default"} # ============================================================================= @@ -172,10 +178,12 @@ def cleanup(cls, log): # Cleanup distributed lock Redis client from distributed_lock import DistributedLock + DistributedLock.on_cleanup() if cls.loop and not cls.loop.is_closed(): from poolboy import Poolboy + cls.loop.run_until_complete(Poolboy.on_cleanup()) cls.loop.close() log.info("Worker state cleaned up") @@ -190,12 +198,14 @@ def initialize(cls, log): # Initialize distributed lock Redis client from distributed_lock import DistributedLock + DistributedLock.on_startup() cls.loop = asyncio.new_event_loop() asyncio.set_event_loop(cls.loop) from poolboy import Poolboy + 
cls.loop.run_until_complete(Poolboy.on_startup(logger=log)) cls.k8s_initialized = True cls.initialized_at = time.time() @@ -204,6 +214,7 @@ def initialize(cls, log): def _is_connection_stale(cls) -> bool: """Check if connection has exceeded max age.""" import time + if cls.initialized_at == 0: return True elapsed = time.time() - cls.initialized_at @@ -212,11 +223,7 @@ def _is_connection_stale(cls) -> bool: @classmethod def _ensure_initialized(cls): """Ensure connection is initialized and fresh (lazy init + max age).""" - not_ready = ( - not cls.k8s_initialized or - cls.loop is None or - cls.loop.is_closed() - ) + not_ready = not cls.k8s_initialized or cls.loop is None or cls.loop.is_closed() if not_ready: logger.warning("WorkerState not initialized, lazy init...") cls.initialize(logger) @@ -248,30 +255,11 @@ def run_async(cls, coro): # errors - they propagate normally for task logic to handle -# ============================================================================= -# Helper functions -# ============================================================================= - -def is_worker_enabled(resource_type: str) -> bool: - """ - Check if workers are enabled for a specific resource type. - - Used by the operator to decide whether to dispatch tasks to workers. - - Args: - resource_type: Type of resource (e.g., 'resource_pool') - - Returns: - True if workers are enabled for this resource type. - """ - env_key = f"WORKERS_{resource_type.upper()}" - return os.environ.get(env_key, 'false').lower() == 'true' - - # ============================================================================= # WorkerApp # ============================================================================= + class WorkerApp(metaclass=TimerDecoratorMeta): """ Worker application factory for Poolboy. 
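
For context, the WorkerState pattern above reduces to the following sketch: each worker process lazily creates one long-lived asyncio event loop and runs the operator's async coroutines to completion from synchronous Celery task bodies, re-creating the loop if it was ever closed. The class and method names below are illustrative stand-ins, not the patch's API.

    # Illustrative only: the per-process event-loop pattern used by WorkerState.
    import asyncio

    class LoopHolder:
        _loop: asyncio.AbstractEventLoop | None = None

        @classmethod
        def _ensure_loop(cls) -> asyncio.AbstractEventLoop:
            # Lazily (re)create a single long-lived loop per worker process.
            if cls._loop is None or cls._loop.is_closed():
                cls._loop = asyncio.new_event_loop()
                asyncio.set_event_loop(cls._loop)
            return cls._loop

        @classmethod
        def run_async(cls, coro):
            # Celery task bodies are synchronous; block until the coroutine finishes.
            return cls._ensure_loop().run_until_complete(coro)

    # e.g. from inside a Celery task body (hypothetical usage):
    #     LoopHolder.run_async(resource_handle.manage(logger=logger))
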
@@ -292,7 +280,7 @@ def __init__(self, config: WorkerConfig | None = None): """ self.config = config or WorkerConfig() self.router = TaskRouter() - self.app = Celery('poolboy') + self.app = Celery("poolboy") self._configure_app() self._configure_queues() @@ -307,18 +295,18 @@ def _configure_queues(self): """Configure task queues and routing.""" queue_names = self._get_all_queues() self.app.conf.task_queues = [Queue(q) for q in queue_names] - self.app.conf.task_default_queue = 'default' + self.app.conf.task_default_queue = "default" self.app.conf.task_routes = (self.router,) def _get_all_queues(self) -> list[str]: """Generate queue names (default + partitioned).""" - queues = ['default'] + queues = ["default"] # Partitioned queues (e.g., 'resource_pool_0', 'resource_pool_1') config = self._get_partition_config() for resource_type, partition_count in config.items(): for i in range(partition_count): - queues.append(f'{resource_type}_{i}') + queues.append(f"{resource_type}_{i}") return queues @@ -327,8 +315,12 @@ def _get_all_queues(self) -> list[str]: def _get_partition_config() -> dict[str, int]: """Get partition configuration from environment variables.""" resource_types = [ - 'cleanup', 'resource_claim', 'resource_handle', - 'resource_pool', 'resource_provider', 'resource_watch', + "cleanup", + "resource_claim", + "resource_handle", + "resource_pool", + "resource_provider", + "resource_watch", ] config = {} for resource_type in resource_types: @@ -351,12 +343,12 @@ def _connect_signals(self): @staticmethod def _on_worker_init(**kwargs): """Initialize metrics server when main worker process starts.""" - if os.environ.get('WORKER_METRICS_ENABLED', 'true').lower() != 'true': + if os.environ.get("WORKER_METRICS_ENABLED", "true").lower() != "true": return from metrics import MetricsService - port = int(os.environ.get('WORKER_METRICS_PORT', '9090')) + port = int(os.environ.get("WORKER_METRICS_PORT", "9090")) MetricsService.start(port=port) logger.info(f"Worker metrics server started on port {port}") @@ -373,6 +365,7 @@ def _on_worker_shutdown(**kwargs): def _on_worker_process_init(**kwargs): """Initialize event loop and K8s client when worker process starts.""" from cache import Cache + Cache.initialize(standalone=False) WorkerState.initialize(logger) @@ -395,7 +388,7 @@ def _on_task_postrun(task_id=None, **kwargs): def _setup_autodiscover(self): """Configure task autodiscovery.""" - self.app.autodiscover_tasks(['tasks']) + self.app.autodiscover_tasks(["tasks"]) # ============================================================================= @@ -411,6 +404,7 @@ def _setup_autodiscover(self): # Beat Schedule Setup (after all tasks are discovered) # ============================================================================= + @app.on_after_finalize.connect def setup_periodic_tasks(sender, **kwargs): """ @@ -419,8 +413,8 @@ def setup_periodic_tasks(sender, **kwargs): This runs after all tasks have been discovered and registered, avoiding circular import issues. 
""" - enabled = os.environ.get('CELERY_SCHEDULER_ENABLED', 'false') - if enabled.lower() != 'true': + enabled = os.environ.get("CELERY_SCHEDULER_ENABLED", "false") + if enabled.lower() != "true": return # Import tasks to trigger @register_schedule decorators diff --git a/operator/resourcehandle.py b/operator/resourcehandle.py index 1bb4569..2e75e44 100644 --- a/operator/resourcehandle.py +++ b/operator/resourcehandle.py @@ -18,10 +18,11 @@ from poolboy import Poolboy from poolboy_templating import recursive_process_template_strings, timedelta_to_str -ResourceClaimT = TypeVar('ResourceClaimT', bound='ResourceClaim') -ResourceHandleT = TypeVar('ResourceHandleT', bound='ResourceHandle') -ResourcePoolT = TypeVar('ResourcePoolT', bound='ResourcePool') -ResourceProviderT = TypeVar('ResourceProviderT', bound='ResourceProvider') +ResourceClaimT = TypeVar("ResourceClaimT", bound="ResourceClaim") +ResourceHandleT = TypeVar("ResourceHandleT", bound="ResourceHandle") +ResourcePoolT = TypeVar("ResourcePoolT", bound="ResourcePool") +ResourceProviderT = TypeVar("ResourceProviderT", bound="ResourceProvider") + class ResourceHandleMatch: def __init__(self, resource_handle): @@ -31,7 +32,7 @@ def __init__(self, resource_handle): self.template_difference_count = 0 def __lt__(self, cmp): - '''Compare matches by preference''' + """Compare matches by preference""" if self.resource_count_difference < cmp.resource_count_difference: return True if self.resource_count_difference > cmp.resource_count_difference: @@ -52,12 +53,16 @@ def __lt__(self, cmp): if self.resource_handle.is_healthy is False and cmp.resource_handle.is_healthy: return False - if self.resource_handle.is_ready and not cmp.resource_handle.is_ready: + if self.resource_handle.is_ready and cmp.resource_handle.is_ready is False: return True - if not self.resource_handle.is_ready and cmp.resource_handle.is_ready: + if self.resource_handle.is_ready is False and cmp.resource_handle.is_ready: return False - return self.resource_handle.creation_timestamp < cmp.resource_handle.creation_timestamp + return ( + self.resource_handle.creation_timestamp + < cmp.resource_handle.creation_timestamp + ) + class ResourceHandle(KopfObject): api_group = Poolboy.operator_domain @@ -69,20 +74,20 @@ class ResourceHandle(KopfObject): @classmethod def __register_definition(cls, definition: Mapping) -> ResourceHandleT: - name = definition['metadata']['name'] + name = definition["metadata"]["name"] resource_handle = cls.cache_get(CacheTag.HANDLE, name) if resource_handle: resource_handle.refresh_from_definition(definition=definition) else: resource_handle = cls( - annotations = definition['metadata'].get('annotations', {}), - labels = definition['metadata'].get('labels', {}), - meta = definition['metadata'], - name = name, - namespace = Poolboy.namespace, - spec = definition['spec'], - status = definition.get('status', {}), - uid = definition['metadata']['uid'], + annotations=definition["metadata"].get("annotations", {}), + labels=definition["metadata"].get("labels", {}), + meta=definition["metadata"], + name=name, + namespace=Poolboy.namespace, + spec=definition["spec"], + status=definition.get("status", {}), + uid=definition["metadata"]["uid"], ) resource_handle.__register() return resource_handle @@ -93,7 +98,7 @@ async def bind_handle_to_claim( logger: kopf.ObjectLogger, resource_claim: ResourceClaimT, resource_claim_resources: List[Mapping], - ) -> ResourceHandleT|None: + ) -> ResourceHandleT | None: async with cls.class_lock: # Check if there is already an assigned claim 
bound_key = f"{resource_claim.namespace}/{resource_claim.name}" @@ -117,13 +122,18 @@ async def bind_handle_to_claim( continue # Honor explicit pool requests - if resource_claim.resource_pool_name \ - and resource_claim.resource_pool_name != resource_handle.resource_pool_name: + if ( + resource_claim.resource_pool_name + and resource_claim.resource_pool_name + != resource_handle.resource_pool_name + ): continue # Do not bind to handles that are near end of lifespan - if resource_handle.has_lifespan_end \ - and resource_handle.timedelta_to_lifespan_end.total_seconds() < 120: + if ( + resource_handle.has_lifespan_end + and resource_handle.timedelta_to_lifespan_end.total_seconds() < 120 + ): continue handle_resources = resource_handle.resources @@ -133,28 +143,32 @@ async def bind_handle_to_claim( continue match = ResourceHandleMatch(resource_handle) - match.resource_count_difference = len(resource_claim_resources) - len(handle_resources) + match.resource_count_difference = len(resource_claim_resources) - len( + handle_resources + ) for i, handle_resource in enumerate(handle_resources): claim_resource = resource_claim_resources[i] # ResourceProvider must match - provider_name = claim_status_resources[i]['provider']['name'] - if provider_name != handle_resource['provider']['name']: + provider_name = claim_status_resources[i]["provider"]["name"] + if provider_name != handle_resource["provider"]["name"]: match = None break # Check resource name match - claim_resource_name = claim_resource.get('name') - handle_resource_name = handle_resource.get('name') + claim_resource_name = claim_resource.get("name") + handle_resource_name = handle_resource.get("name") if claim_resource_name != handle_resource_name: match.resource_name_difference_count += 1 # Use provider to check if templates match and get list of allowed differences - provider = await resourceprovider.ResourceProvider.get(provider_name) + provider = await resourceprovider.ResourceProvider.get( + provider_name + ) diff_patch = provider.check_template_match( - handle_resource_template = handle_resource.get('template', {}), - claim_resource_template = claim_resource.get('template', {}), + handle_resource_template=handle_resource.get("template", {}), + claim_resource_template=claim_resource.get("template", {}), ) if diff_patch is None: match = None @@ -179,58 +193,79 @@ async def bind_handle_to_claim( "kind": "ResourceClaim", "name": resource_claim.name, "namespace": resource_claim.namespace, - } + }, } ] # Update ResourceProvider to match ResourceClaim if resource_claim.has_resource_provider: - patch.append({ - "op": "add", - "path": "/spec/provider", - "value": resource_claim.spec['provider'], - }) + patch.append( + { + "op": "add", + "path": "/spec/provider", + "value": resource_claim.spec["provider"], + } + ) # Set resource names and add any additional resources to handle - for resource_index, claim_resource in enumerate(resource_claim_resources): - resource_name = resource_claim_resources[resource_index].get('name') + for resource_index, claim_resource in enumerate( + resource_claim_resources + ): + resource_name = resource_claim_resources[resource_index].get("name") if resource_index < len(matched_resource_handle.resources): - handle_resource = matched_resource_handle.resources[resource_index] - if resource_name != handle_resource.get('name'): - patch.append({ - "op": "add", - "path": f"/spec/resources/{resource_index}/name", - "value": resource_name, - }) + handle_resource = matched_resource_handle.resources[ + resource_index + ] + if 
resource_name != handle_resource.get("name"): + patch.append( + { + "op": "add", + "path": f"/spec/resources/{resource_index}/name", + "value": resource_name, + } + ) else: patch_value = { - "provider": resource_claim_resources[resource_index]['provider'], + "provider": resource_claim_resources[resource_index][ + "provider" + ], } if resource_name: - patch_value['name'] = resource_name - patch.append({ - "op": "add", - "path": f"/spec/resources/{resource_index}", - "value": patch_value, - }) + patch_value["name"] = resource_name + patch.append( + { + "op": "add", + "path": f"/spec/resources/{resource_index}", + "value": patch_value, + } + ) # Set lifespan end from default on claim bind - lifespan_default = matched_resource_handle.get_lifespan_default(resource_claim) + lifespan_default = matched_resource_handle.get_lifespan_default( + resource_claim + ) if lifespan_default: - patch.append({ - "op": "add", - "path": "/spec/lifespan/end", - "value": ( - datetime.now(timezone.utc) + matched_resource_handle.get_lifespan_default_timedelta(resource_claim) - ).strftime('%FT%TZ'), - }) + patch.append( + { + "op": "add", + "path": "/spec/lifespan/end", + "value": ( + datetime.now(timezone.utc) + + matched_resource_handle.get_lifespan_default_timedelta( + resource_claim + ) + ).strftime("%FT%TZ"), + } + ) try: await matched_resource_handle.json_patch(patch) matched_resource_handle.__register() except kubernetes_asyncio.client.exceptions.ApiException as exception: if exception.status == 404: - logger.warning(f"Attempt to bind deleted {matched_resource_handle} to {resource_claim}") + logger.warning( + f"Attempt to bind deleted {matched_resource_handle} to {resource_claim}" + ) matched_resource_handle.__unregister() matched_resource_handle = None else: @@ -243,7 +278,9 @@ async def bind_handle_to_claim( return None if matched_resource_handle.is_from_resource_pool: - resource_pool = await resourcepool.ResourcePool.get(matched_resource_handle.resource_pool_name) + resource_pool = await resourcepool.ResourcePool.get( + matched_resource_handle.resource_pool_name + ) if resource_pool: await resource_pool.manage(logger=logger) else: @@ -254,30 +291,31 @@ async def bind_handle_to_claim( return matched_resource_handle @classmethod - async def create_for_claim(cls, + async def create_for_claim( + cls, logger: kopf.ObjectLogger, resource_claim: ResourceClaimT, resource_claim_resources: List[Mapping], ): definition = { - 'apiVersion': Poolboy.operator_api_version, - 'kind': 'ResourceHandle', - 'metadata': { - 'finalizers': [ Poolboy.operator_domain ], - 'generateName': 'guid-', - 'labels': { + "apiVersion": Poolboy.operator_api_version, + "kind": "ResourceHandle", + "metadata": { + "finalizers": [Poolboy.operator_domain], + "generateName": "guid-", + "labels": { Poolboy.resource_claim_name_label: resource_claim.name, Poolboy.resource_claim_namespace_label: resource_claim.namespace, - } + }, }, - 'spec': { - 'resourceClaim': { - 'apiVersion': Poolboy.operator_api_version, - 'kind': 'ResourceClaim', - 'name': resource_claim.name, - 'namespace': resource_claim.namespace + "spec": { + "resourceClaim": { + "apiVersion": Poolboy.operator_api_version, + "kind": "ResourceClaim", + "name": resource_claim.name, + "namespace": resource_claim.namespace, }, - } + }, } resources = [] @@ -288,45 +326,70 @@ async def create_for_claim(cls, lifespan_relative_maximum_timedelta = None if resource_claim.has_resource_provider: resource_provider = await resource_claim.get_resource_provider() - definition['spec']['resources'] = 
resource_claim_resources - definition['spec']['provider'] = resource_claim.spec['provider'] - lifespan_default_timedelta = resource_provider.get_lifespan_default_timedelta(resource_claim) + definition["spec"]["resources"] = resource_claim_resources + definition["spec"]["provider"] = resource_claim.spec["provider"] + lifespan_default_timedelta = ( + resource_provider.get_lifespan_default_timedelta(resource_claim) + ) lifespan_maximum = resource_provider.lifespan_maximum - lifespan_maximum_timedelta = resource_provider.get_lifespan_maximum_timedelta(resource_claim) + lifespan_maximum_timedelta = ( + resource_provider.get_lifespan_maximum_timedelta(resource_claim) + ) lifespan_relative_maximum = resource_provider.lifespan_relative_maximum - lifespan_relative_maximum_timedelta = resource_provider.get_lifespan_maximum_timedelta(resource_claim) + lifespan_relative_maximum_timedelta = ( + resource_provider.get_lifespan_maximum_timedelta(resource_claim) + ) else: - resource_providers = await resource_claim.get_resource_providers(resource_claim_resources) + resource_providers = await resource_claim.get_resource_providers( + resource_claim_resources + ) for i, claim_resource in enumerate(resource_claim_resources): provider = resource_providers[i] - provider_lifespan_default_timedelta = provider.get_lifespan_default_timedelta(resource_claim) + provider_lifespan_default_timedelta = ( + provider.get_lifespan_default_timedelta(resource_claim) + ) if provider_lifespan_default_timedelta: - if not lifespan_default_timedelta \ - or provider_lifespan_default_timedelta < lifespan_default_timedelta: + if ( + not lifespan_default_timedelta + or provider_lifespan_default_timedelta + < lifespan_default_timedelta + ): lifespan_default_timedelta = provider_lifespan_default_timedelta - provider_lifespan_maximum_timedelta = provider.get_lifespan_maximum_timedelta(resource_claim) + provider_lifespan_maximum_timedelta = ( + provider.get_lifespan_maximum_timedelta(resource_claim) + ) if provider_lifespan_maximum_timedelta: - if not lifespan_maximum_timedelta \ - or provider_lifespan_maximum_timedelta < lifespan_maximum_timedelta: + if ( + not lifespan_maximum_timedelta + or provider_lifespan_maximum_timedelta + < lifespan_maximum_timedelta + ): lifespan_maximum = provider.lifespan_maximum lifespan_maximum_timedelta = provider_lifespan_maximum_timedelta - provider_lifespan_relative_maximum_timedelta = provider.get_lifespan_relative_maximum_timedelta(resource_claim) + provider_lifespan_relative_maximum_timedelta = ( + provider.get_lifespan_relative_maximum_timedelta(resource_claim) + ) if provider_lifespan_relative_maximum_timedelta: - if not lifespan_relative_maximum_timedelta \ - or provider_lifespan_relative_maximum_timedelta < lifespan_relative_maximum_timedelta: + if ( + not lifespan_relative_maximum_timedelta + or provider_lifespan_relative_maximum_timedelta + < lifespan_relative_maximum_timedelta + ): lifespan_relative_maximum = provider.lifespan_relative_maximum - lifespan_relative_maximum_timedelta = provider_lifespan_relative_maximum_timedelta + lifespan_relative_maximum_timedelta = ( + provider_lifespan_relative_maximum_timedelta + ) resources_item = {"provider": provider.as_reference()} - if 'name' in claim_resource: - resources_item['name'] = claim_resource['name'] - if 'template' in claim_resource: - resources_item['template'] = claim_resource['template'] + if "name" in claim_resource: + resources_item["name"] = claim_resource["name"] + if "template" in claim_resource: + resources_item["template"] = 
claim_resource["template"] resources.append(resources_item) - definition['spec']['resources'] = resources + definition["spec"]["resources"] = resources lifespan_end_datetime = None lifespan_start_datetime = datetime.now(timezone.utc) @@ -336,21 +399,31 @@ async def create_for_claim(cls, elif lifespan_default_timedelta: lifespan_end_datetime = lifespan_start_datetime + lifespan_default_timedelta elif lifespan_relative_maximum_timedelta: - lifespan_end_datetime = lifespan_start_datetime + lifespan_relative_maximum_timedelta + lifespan_end_datetime = ( + lifespan_start_datetime + lifespan_relative_maximum_timedelta + ) elif lifespan_maximum_timedelta: lifespan_end_datetime = lifespan_start_datetime + lifespan_maximum_timedelta if lifespan_end_datetime: - if lifespan_relative_maximum_timedelta \ - and lifespan_end_datetime > lifespan_start_datetime + lifespan_relative_maximum_timedelta: + if ( + lifespan_relative_maximum_timedelta + and lifespan_end_datetime + > lifespan_start_datetime + lifespan_relative_maximum_timedelta + ): logger.warning( f"Requested lifespan end {resource_claim.requested_lifespan_end_timestamp} " f"for ResourceClaim {resource_claim.name} in {resource_claim.namespace} " f"exceeds relativeMaximum for ResourceProviders" ) - lifespan_end = lifespan_start_datetime + lifespan_relative_maximum_timedelta - if lifespan_maximum_timedelta \ - and lifespan_end_datetime > lifespan_start_datetime + lifespan_maximum_timedelta: + lifespan_end = ( + lifespan_start_datetime + lifespan_relative_maximum_timedelta + ) + if ( + lifespan_maximum_timedelta + and lifespan_end_datetime + > lifespan_start_datetime + lifespan_maximum_timedelta + ): logger.warning( f"Requested lifespan end {resource_claim.requested_lifespan_end_timestamp} " f"for ResourceClaim {resource_claim.name} in {resource_claim.namespace} " @@ -359,27 +432,33 @@ async def create_for_claim(cls, lifespan_end = lifespan_start_datetime + lifespan_maximum_timedelta if lifespan_default_timedelta: - definition['spec'].setdefault('lifespan', {})['default'] = timedelta_to_str(lifespan_default_timedelta) + definition["spec"].setdefault("lifespan", {})["default"] = timedelta_to_str( + lifespan_default_timedelta + ) if lifespan_end_datetime: - definition['spec'].setdefault('lifespan', {})['end'] = lifespan_end_datetime.strftime('%FT%TZ') + definition["spec"].setdefault("lifespan", {})["end"] = ( + lifespan_end_datetime.strftime("%FT%TZ") + ) if lifespan_maximum: - definition['spec'].setdefault('lifespan', {})['maximum'] = lifespan_maximum + definition["spec"].setdefault("lifespan", {})["maximum"] = lifespan_maximum if lifespan_relative_maximum: - definition['spec'].setdefault('lifespan', {})['relativeMaximum'] = lifespan_relative_maximum + definition["spec"].setdefault("lifespan", {})["relativeMaximum"] = ( + lifespan_relative_maximum + ) definition = await Poolboy.custom_objects_api.create_namespaced_custom_object( - body = definition, - group = Poolboy.operator_domain, - namespace = Poolboy.namespace, - plural = 'resourcehandles', - version = Poolboy.operator_version, + body=definition, + group=Poolboy.operator_domain, + namespace=Poolboy.namespace, + plural="resourcehandles", + version=Poolboy.operator_version, ) resource_handle = cls.from_definition(definition) # Register in standalone mode (no handler partitioning) - if Poolboy.operator_mode_standalone: + if Poolboy.is_standalone: resource_handle.__register() logger.info( f"Created ResourceHandle {resource_handle.name} for " @@ -406,52 +485,68 @@ async def create_for_pool( "spec": { 
"resourcePool": resource_pool.reference, "vars": resource_pool.vars, - } + }, } if resource_pool.has_resource_provider: - definition['spec']['provider'] = resource_pool.spec['provider'] + definition["spec"]["provider"] = resource_pool.spec["provider"] resource_provider = await resource_pool.get_resource_provider() if resource_provider.has_lifespan: - definition['spec']['lifespan'] = {} + definition["spec"]["lifespan"] = {} if resource_provider.lifespan_default: - definition['spec']['lifespan']['default'] = resource_provider.lifespan_default + definition["spec"]["lifespan"]["default"] = ( + resource_provider.lifespan_default + ) if resource_provider.lifespan_maximum: - definition['spec']['lifespan']['maximum'] = resource_provider.lifespan_maximum + definition["spec"]["lifespan"]["maximum"] = ( + resource_provider.lifespan_maximum + ) if resource_provider.lifespan_relative_maximum: - definition['spec']['lifespan']['maximum'] = resource_provider.lifespan_relative_maximum + definition["spec"]["lifespan"]["maximum"] = ( + resource_provider.lifespan_relative_maximum + ) if resource_provider.lifespan_unclaimed: - definition['spec']['lifespan']['end'] = ( - datetime.now(timezone.utc) + resource_provider.lifespan_unclaimed_timedelta + definition["spec"]["lifespan"]["end"] = ( + datetime.now(timezone.utc) + + resource_provider.lifespan_unclaimed_timedelta ).strftime("%FT%TZ") else: - definition['spec']['resources'] = resource_pool.resources + definition["spec"]["resources"] = resource_pool.resources if resource_pool.has_lifespan: - definition['spec']['lifespan'] = {} + definition["spec"]["lifespan"] = {} if resource_pool.lifespan_default: - definition['spec']['lifespan']['default'] = resource_pool.lifespan_default + definition["spec"]["lifespan"]["default"] = ( + resource_pool.lifespan_default + ) if resource_pool.lifespan_maximum: - definition['spec']['lifespan']['maximum'] = resource_pool.lifespan_maximum + definition["spec"]["lifespan"]["maximum"] = ( + resource_pool.lifespan_maximum + ) if resource_pool.lifespan_relative_maximum: - definition['spec']['lifespan']['relativeMaximum'] = resource_pool.lifespan_relative_maximum + definition["spec"]["lifespan"]["relativeMaximum"] = ( + resource_pool.lifespan_relative_maximum + ) if resource_pool.lifespan_unclaimed: - definition['spec']['lifespan']['end'] = ( - datetime.now(timezone.utc) + resource_pool.lifespan_unclaimed_timedelta + definition["spec"]["lifespan"]["end"] = ( + datetime.now(timezone.utc) + + resource_pool.lifespan_unclaimed_timedelta ).strftime("%FT%TZ") definition = await Poolboy.custom_objects_api.create_namespaced_custom_object( - body = definition, - group = Poolboy.operator_domain, - namespace = Poolboy.namespace, - plural = "resourcehandles", - version = Poolboy.operator_version, + body=definition, + group=Poolboy.operator_domain, + namespace=Poolboy.namespace, + plural="resourcehandles", + version=Poolboy.operator_version, ) resource_handle = cls.from_definition(definition) # Register in standalone mode (no handler partitioning) - if Poolboy.operator_mode_standalone: + if Poolboy.is_standalone: resource_handle.__register() - logger.info(f"Created ResourceHandle {resource_handle.name} for ResourcePool {resource_pool.name}") + logger.info( + f"Created ResourceHandle {resource_handle.name} for ResourcePool {resource_pool.name}" + ) return resource_handle @classmethod @@ -460,22 +555,7 @@ async def delete_unbound_handles_for_pool( logger: kopf.ObjectLogger, resource_pool: ResourcePoolT, ) -> List[ResourceHandleT]: - # Workers always 
fetch from API (no shared memory cache) - use_cache = Poolboy.operator_mode_standalone - if use_cache: - async with cls.class_lock: - resource_handles = [] - for resource_handle in list(cls.unbound_instances.values()): - if resource_handle.resource_pool_name == resource_pool.name \ - and resource_handle.resource_pool_namespace == resource_pool.namespace: - logger.info( - f"Deleting unbound ResourceHandle {resource_handle.name} " - f"for ResourcePool {resource_pool.name}" - ) - resource_handle.__unregister() - await resource_handle.delete() - return resource_handles - + """Delete all unbound handles for a pool.""" resource_handles = await cls.get_unbound_handles_for_pool( resource_pool=resource_pool, logger=logger, @@ -485,11 +565,14 @@ async def delete_unbound_handles_for_pool( f"Deleting unbound ResourceHandle {resource_handle.name} " f"for ResourcePool {resource_pool.name}" ) + resource_handle.__unregister() await resource_handle.delete() return resource_handles @classmethod - async def get(cls, name: str, ignore_deleting=True, use_cache=True) -> ResourceHandleT|None: + async def get( + cls, name: str, ignore_deleting=True, use_cache=True + ) -> ResourceHandleT | None: async with cls.class_lock: if use_cache: cached = cls.cache_get(CacheTag.HANDLE, name) @@ -500,17 +583,17 @@ async def get(cls, name: str, ignore_deleting=True, use_cache=True) -> ResourceH group=Poolboy.operator_domain, name=name, namespace=Poolboy.namespace, - plural='resourcehandles', + plural="resourcehandles", version=Poolboy.operator_version, ) - if ignore_deleting and 'deletionTimestamp' in definition['metadata']: + if ignore_deleting and "deletionTimestamp" in definition["metadata"]: return None if use_cache: return cls.__register_definition(definition) return cls.from_definition(definition) @classmethod - def get_from_cache(cls, name: str) -> ResourceHandleT|None: + def get_from_cache(cls, name: str) -> ResourceHandleT | None: return cls.cache_get(CacheTag.HANDLE, name) @classmethod @@ -525,13 +608,16 @@ async def get_unbound_handles_for_pool( # In standalone mode, use cache (Memory or Redis) # In distributed mode, fetch from K8s API to ensure completeness # (cache may not be fully populated if operator just started) - if Poolboy.operator_mode_standalone: + if Poolboy.is_standalone: async with cls.class_lock: for name in Cache.get_keys_by_tag(CacheTag.HANDLE_UNBOUND): resource_handle = cls.cache_get(CacheTag.HANDLE_UNBOUND, name) - if resource_handle \ - and resource_handle.resource_pool_name == resource_pool.name \ - and resource_handle.resource_pool_namespace == resource_pool.namespace: + if ( + resource_handle + and resource_handle.resource_pool_name == resource_pool.name + and resource_handle.resource_pool_namespace + == resource_pool.namespace + ): resource_handles.append(resource_handle) return resource_handles @@ -542,19 +628,23 @@ async def get_unbound_handles_for_pool( group=Poolboy.operator_domain, label_selector=f"{Poolboy.resource_pool_name_label}={resource_pool.name},!{Poolboy.resource_claim_name_label}", namespace=Poolboy.namespace, - plural='resourcehandles', + plural="resourcehandles", version=Poolboy.operator_version, _continue=_continue, limit=50, ) - for definition in resource_handle_list['items']: + for definition in resource_handle_list["items"]: resource_handle = cls.from_definition(definition) if not resource_handle.is_bound: # Cache for other workers - resource_handle.cache_set(CacheTag.HANDLE, resource_handle.name, ttl=300) - resource_handle.cache_set(CacheTag.HANDLE_UNBOUND, 
resource_handle.name, ttl=300) + resource_handle.cache_set( + CacheTag.HANDLE, resource_handle.name, ttl=300 + ) + resource_handle.cache_set( + CacheTag.HANDLE_UNBOUND, resource_handle.name, ttl=300 + ) resource_handles.append(resource_handle) - _continue = resource_handle_list['metadata'].get('continue') + _continue = resource_handle_list["metadata"].get("continue") if not _continue: break return resource_handles @@ -564,17 +654,19 @@ async def preload(cls, logger: kopf.ObjectLogger) -> None: async with cls.class_lock: _continue = None while True: - resource_handle_list = await Poolboy.custom_objects_api.list_namespaced_custom_object( - group=Poolboy.operator_domain, - namespace=Poolboy.namespace, - plural='resourcehandles', - version=Poolboy.operator_version, - _continue = _continue, - limit = 50, + resource_handle_list = ( + await Poolboy.custom_objects_api.list_namespaced_custom_object( + group=Poolboy.operator_domain, + namespace=Poolboy.namespace, + plural="resourcehandles", + version=Poolboy.operator_version, + _continue=_continue, + limit=50, + ) ) - for definition in resource_handle_list['items']: + for definition in resource_handle_list["items"]: cls.__register_definition(definition=definition) - _continue = resource_handle_list['metadata'].get('continue') + _continue = resource_handle_list["metadata"].get("continue") if not _continue: break @@ -594,23 +686,23 @@ async def register( resource_handle = cls.cache_get(CacheTag.HANDLE, name) if resource_handle: resource_handle.refresh( - annotations = annotations, - labels = labels, - meta = meta, - spec = spec, - status = status, - uid = uid, + annotations=annotations, + labels=labels, + meta=meta, + spec=spec, + status=status, + uid=uid, ) else: resource_handle = cls( - annotations = annotations, - labels = labels, - meta = meta, - name = name, - namespace = namespace, - spec = spec, - status = status, - uid = uid, + annotations=annotations, + labels=labels, + meta=meta, + name=name, + namespace=namespace, + spec=spec, + status=status, + uid=uid, ) resource_handle.__register() return resource_handle @@ -621,7 +713,7 @@ async def register_definition(cls, definition: Mapping) -> ResourceHandleT: return cls.__register_definition(definition) @classmethod - async def unregister(cls, name: str) -> ResourceHandleT|None: + async def unregister(cls, name: str) -> ResourceHandleT | None: async with cls.class_lock: resource_handle = cls.cache_get(CacheTag.HANDLE, name) if resource_handle: @@ -659,21 +751,21 @@ def __unregister(self) -> None: @property def guid(self) -> str: name = self.name - generate_name = self.meta.get('generateName') + generate_name = self.meta.get("generateName") if generate_name and name.startswith(generate_name): - return name[len(generate_name):] - elif name.startswith('guid-'): + return name[len(generate_name) :] + elif name.startswith("guid-"): return name[5:] return name[-5:] @property def has_lifespan_end(self) -> bool: - 'end' in self.spec.get('lifespan', {}) + return "end" in self.spec.get("lifespan", {}) @property def has_resource_provider(self) -> bool: """Return whether this ResourceHandle is managed by a ResourceProvider.""" - return 'provider' in self.spec + return "provider" in self.spec @property def ignore(self) -> bool: @@ -682,7 +774,7 @@ def ignore(self) -> bool: @property def is_bound(self) -> bool: - return 'resourceClaim' in self.spec + return "resourceClaim" in self.spec @property def is_deleting(self) -> bool: @@ -690,39 +782,36 @@ def is_deleting(self) -> bool: @property def 
is_from_resource_pool(self) -> bool:
-        return 'resourcePool' in self.spec
+        return "resourcePool" in self.spec
 
     @property
-    def is_healthy(self) -> bool|None:
+    def is_healthy(self) -> bool | None:
         """Return overall health of resources.
         - False if any resource has healthy False.
         - None if any non-waiting resource lacks a value for healthy.
         - True if all non-waiting resources are healthy."""
         ret = True
         for resource in self.status_resources:
-            if resource.get('healthy') is False:
+            if resource.get("healthy") is False:
                 return False
-            if(
-                resource.get('waitingFor') is not None and
-                resource.get('healthy') is None
+            if (
+                resource.get("waitingFor") is not None
+                and resource.get("healthy") is None
             ):
                 ret = None
         return ret
 
     @property
-    def is_ready(self) -> bool|None:
+    def is_ready(self) -> bool | None:
         """Return overall readiness of resources.
         - False if any resource has ready False.
         - None if any non-waiting resource lacks a value for ready.
         - True if all non-waiting resources are ready."""
         ret = True
         for resource in self.status_resources:
-            if resource.get('ready') is False:
+            if resource.get("ready") is False:
                 return False
-            if(
-                resource.get('waitingFor') is not None and
-                resource.get('ready') is None
-            ):
+            if resource.get("waitingFor") is not None and resource.get("ready") is None:
                 ret = None
         return ret
 
@@ -734,66 +823,66 @@ def is_past_lifespan_end(self) -> bool:
         return dt < datetime.now(timezone.utc)
 
     @property
-    def is_ready(self) -> bool|None:
-        return self.status.get('ready')
+    def is_ready(self) -> bool | None:
+        return self.status.get("ready")
 
     @property
     def lifespan_end_datetime(self) -> Any:
         timestamp = self.lifespan_end_timestamp
         if timestamp:
-            return datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S%z')
+            return datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S%z")
 
     @property
-    def lifespan_end_timestamp(self) -> str|None:
-        lifespan = self.spec.get('lifespan')
+    def lifespan_end_timestamp(self) -> str | None:
+        lifespan = self.spec.get("lifespan")
         if lifespan:
-            return lifespan.get('end')
+            return lifespan.get("end")
 
     @property
     def parameter_values(self) -> Mapping:
-        return self.spec.get('provider', {}).get('parameterValues', {})
+        return self.spec.get("provider", {}).get("parameterValues", {})
 
     @property
-    def resource_claim_description(self) -> str|None:
-        if not 'resourceClaim' not in self.spec:
+    def resource_claim_description(self) -> str | None:
+        if "resourceClaim" not in self.spec:
             return None
         return f"ResourceClaim {self.resource_claim_name} in {self.resource_claim_namespace}"
 
     @property
-    def resource_claim_name(self) -> str|None:
-        return self.spec.get('resourceClaim', {}).get('name')
+    def resource_claim_name(self) -> str | None:
+        return self.spec.get("resourceClaim", {}).get("name")
 
     @property
-    def resource_claim_namespace(self) -> str|None:
-        return self.spec.get('resourceClaim', {}).get('namespace')
+    def resource_claim_namespace(self) -> str | None:
+        return self.spec.get("resourceClaim", {}).get("namespace")
 
     @property
-    def resource_pool_name(self) -> str|None:
-        if 'resourcePool' in self.spec:
-            return self.spec['resourcePool']['name']
+    def resource_pool_name(self) -> str | None:
+        if "resourcePool" in self.spec:
+            return self.spec["resourcePool"]["name"]
 
     @property
-    def resource_pool_namespace(self) -> str|None:
-        if 'resourcePool' in self.spec:
-            return self.spec['resourcePool'].get('namespace', Poolboy.namespace)
+    def resource_pool_namespace(self) -> str | None:
+        if "resourcePool" in self.spec:
+            return self.spec["resourcePool"].get("namespace", 
Poolboy.namespace) @property - def resource_provider_name(self) -> str|None: - return self.spec.get('provider', {}).get('name') + def resource_provider_name(self) -> str | None: + return self.spec.get("provider", {}).get("name") @property def resources(self) -> List[Mapping]: """Resources as listed in spec.""" - return self.spec.get('resources', []) + return self.spec.get("resources", []) @property def status_resources(self) -> List[Mapping]: """Resources as listed in status.""" - return self.status.get('resources', []) + return self.status.get("resources", []) @property def vars(self) -> Mapping: - return self.spec.get('vars', {}) + return self.spec.get("vars", {}) @property def timedelta_to_lifespan_end(self) -> Any: @@ -802,34 +891,38 @@ def timedelta_to_lifespan_end(self) -> Any: return dt - datetime.now(timezone.utc) def __lifespan_value(self, name, resource_claim): - value = self.spec.get('lifespan', {}).get(name) + value = self.spec.get("lifespan", {}).get(name) if not value: return value = recursive_process_template_strings( - template = value, - variables = { + template=value, + variables={ "resource_claim": resource_claim, "resource_handle": self, }, - template_variables = self.vars, + template_variables=self.vars, ) return value - def __lifespan_value_as_timedelta(self, + def __lifespan_value_as_timedelta( + self, name: str, resource_claim: ResourceClaimT, - ) -> timedelta|None: + ) -> timedelta | None: value = self.__lifespan_value(name, resource_claim) if not value: return None seconds = pytimeparse.parse(value) if seconds is None: - raise kopf.TemporaryError(f"Failed to parse {name} time interval: {value}", delay=60) + raise kopf.TemporaryError( + f"Failed to parse {name} time interval: {value}", delay=60 + ) return timedelta(seconds=seconds) - async def __manage_init_status_resources(self, + async def __manage_init_status_resources( + self, logger: kopf.ObjectLogger, ) -> None: """Initialize resources in status from spec.""" @@ -842,41 +935,56 @@ async def __manage_init_status_resources(self, entry = deepcopy(self.status_resources[idx]) else: entry = {} - if 'name' in resource and resource['name'] != entry.get('name'): - entry['name'] = resource['name'] + if "name" in resource and resource["name"] != entry.get("name"): + entry["name"] = resource["name"] set_resources.append(entry) patch = [] if not self.status: - patch.extend(({ - "op": "test", - "path": "/status", - "value": None, - }, { - "op": "add", - "path": "/status", - "value": {}, - })) - if 'resources' not in self.status: - patch.extend(({ - "op": "test", - "path": "/status/resources", - "value": None, - }, { - "op": "add", - "path": "/status/resources", - "value": set_resources, - })) + patch.extend( + ( + { + "op": "test", + "path": "/status", + "value": None, + }, + { + "op": "add", + "path": "/status", + "value": {}, + }, + ) + ) + if "resources" not in self.status: + patch.extend( + ( + { + "op": "test", + "path": "/status/resources", + "value": None, + }, + { + "op": "add", + "path": "/status/resources", + "value": set_resources, + }, + ) + ) else: - patch.extend(({ - "op": "test", - "path": "/status/resources", - "value": self.status_resources, - }, { - "op": "replace", - "path": "/status/resources", - "value": set_resources, - })) + patch.extend( + ( + { + "op": "test", + "path": "/status/resources", + "value": self.status_resources, + }, + { + "op": "replace", + "path": "/status/resources", + "value": set_resources, + }, + ) + ) if 0 == len(patch): return await self.json_patch_status(patch) @@ -887,216 
+995,255 @@ async def __manage_init_status_resources(self, raise attempt += 1 - async def __manage_check_delete(self, - logger: kopf.ObjectLogger, - resource_claim: ResourceClaimT + async def __manage_check_delete( + self, logger: kopf.ObjectLogger, resource_claim: ResourceClaimT ) -> bool: """Delete this ResourceHandle if it meets conditions which trigger delete. - Is past lifespan end. - Is bound to resource claim that has been deleted. """ if self.is_past_lifespan_end: - logger.info(f"Deleting {self} at end of lifespan ({self.lifespan_end_timestamp})") + logger.info( + f"Deleting {self} at end of lifespan ({self.lifespan_end_timestamp})" + ) await self.delete() return True if self.is_bound and not resource_claim: - logger.info(f"Propagating deletion of {self.resource_claim_description} to {self}") + logger.info( + f"Propagating deletion of {self.resource_claim_description} to {self}" + ) await self.delete() return True - async def __manage_update_spec_resources(self, + async def __manage_update_spec_resources( + self, logger: kopf.ObjectLogger, - resource_claim: ResourceClaimT|None, - resource_provider: ResourceProviderT|None, + resource_claim: ResourceClaimT | None, + resource_provider: ResourceProviderT | None, ): """Update this ResourecHandle's spec.resources by applying parameter values from ResourceProvider.""" if not resource_provider: return resources = await resource_provider.get_resources( - resource_claim = resource_claim, - resource_handle = self, + resource_claim=resource_claim, + resource_handle=self, ) - if 'resources' not in self.spec: - await self.json_patch([{ - "op": "add", - "path": "/spec/resources", - "value": resources, - }]) + if "resources" not in self.spec: + await self.json_patch( + [ + { + "op": "add", + "path": "/spec/resources", + "value": resources, + } + ] + ) return patch = [] for idx, resource in enumerate(resources): - if idx < len(self.spec['resources']): - current_provider = self.spec['resources'][idx]['provider']['name'] - updated_provider = resource['provider']['name'] + if idx < len(self.spec["resources"]): + current_provider = self.spec["resources"][idx]["provider"]["name"] + updated_provider = resource["provider"]["name"] if current_provider != updated_provider: logger.warning( f"Refusing update resources in {self} as it would change " f"ResourceProvider from {current_provider} to {updated_provider}" ) - current_template = self.spec['resources'][idx].get('template') - updated_template = resource.get('template') + current_template = self.spec["resources"][idx].get("template") + updated_template = resource.get("template") if current_template != updated_template: - patch.append({ - "op": "add", - "path": f"/spec/resources/{idx}/template", - "value": updated_template, - }) + patch.append( + { + "op": "add", + "path": f"/spec/resources/{idx}/template", + "value": updated_template, + } + ) else: - patch.append({ - "op": "add", - "path": f"/spec/resources/{idx}", - "value": resource - }) + patch.append( + {"op": "add", "path": f"/spec/resources/{idx}", "value": resource} + ) if patch: await self.json_patch(patch) logger.info(f"Updated resources for {self} from {resource_provider}") def get_lifespan_default(self, resource_claim=None): - return self.__lifespan_value('default', resource_claim=resource_claim) + return self.__lifespan_value("default", resource_claim=resource_claim) def get_lifespan_default_timedelta(self, resource_claim=None): - return self.__lifespan_value_as_timedelta('default', resource_claim=resource_claim) + return 
self.__lifespan_value_as_timedelta( + "default", resource_claim=resource_claim + ) def get_lifespan_maximum(self, resource_claim=None): - return self.__lifespan_value('maximum', resource_claim=resource_claim) + return self.__lifespan_value("maximum", resource_claim=resource_claim) def get_lifespan_maximum_timedelta(self, resource_claim=None): - return self.__lifespan_value_as_timedelta('maximum', resource_claim=resource_claim) + return self.__lifespan_value_as_timedelta( + "maximum", resource_claim=resource_claim + ) def get_lifespan_relative_maximum(self, resource_claim=None): - return self.__lifespan_value('relativeMaximum', resource_claim=resource_claim) + return self.__lifespan_value("relativeMaximum", resource_claim=resource_claim) def get_lifespan_relative_maximum_timedelta(self, resource_claim=None): - return self.__lifespan_value_as_timedelta('relativeMaximum', resource_claim=resource_claim) + return self.__lifespan_value_as_timedelta( + "relativeMaximum", resource_claim=resource_claim + ) def get_lifespan_end_maximum_datetime(self, resource_claim=None): - lifespan_start_datetime = resource_claim.lifespan_start_datetime if resource_claim else self.creation_datetime + lifespan_start_datetime = ( + resource_claim.lifespan_start_datetime + if resource_claim + else self.creation_datetime + ) - maximum_timedelta = self.get_lifespan_maximum_timedelta(resource_claim=resource_claim) + maximum_timedelta = self.get_lifespan_maximum_timedelta( + resource_claim=resource_claim + ) if maximum_timedelta: if resource_claim.lifespan_first_ready_timestamp: - maximum_end = resource_claim.lifespan_first_ready_datetime + maximum_timedelta + maximum_end = ( + resource_claim.lifespan_first_ready_datetime + maximum_timedelta + ) else: maximum_end = lifespan_start_datetime + maximum_timedelta else: maximum_end = None - relative_maximum_timedelta = self.get_lifespan_relative_maximum_timedelta(resource_claim=resource_claim) + relative_maximum_timedelta = self.get_lifespan_relative_maximum_timedelta( + resource_claim=resource_claim + ) if relative_maximum_timedelta: - relative_maximum_end = datetime.now(timezone.utc) + relative_maximum_timedelta + relative_maximum_end = ( + datetime.now(timezone.utc) + relative_maximum_timedelta + ) else: relative_maximum_end = None - if relative_maximum_end \ - and (not maximum_end or relative_maximum_end < maximum_end): + if relative_maximum_end and ( + not maximum_end or relative_maximum_end < maximum_end + ): return relative_maximum_end return maximum_end - def set_resource_healthy(self, resource_index: int, value: bool|None) -> None: + def set_resource_healthy(self, resource_index: int, value: bool | None) -> None: if value is None: - self.status['resources'][resource_index].pop('healthy', None) + self.status["resources"][resource_index].pop("healthy", None) else: - self.status['resources'][resource_index]['healthy'] = value + self.status["resources"][resource_index]["healthy"] = value - def set_resource_ready(self, resource_index: int, value: bool|None) -> None: + def set_resource_ready(self, resource_index: int, value: bool | None) -> None: if value is None: - self.status['resources'][resource_index].pop('ready', None) + self.status["resources"][resource_index].pop("ready", None) else: - self.status['resources'][resource_index]['ready'] = value + self.status["resources"][resource_index]["ready"] = value - def set_resource_state(self, resource_index: int, value: Mapping|None) -> None: + def set_resource_state(self, resource_index: int, value: Mapping | None) -> None: if 
value is None: - self.status['resources'][resource_index].pop('state', None) + self.status["resources"][resource_index].pop("state", None) else: - self.status['resources'][resource_index]['state'] = value + self.status["resources"][resource_index]["state"] = value - async def get_resource_claim(self, not_found_okay: bool) -> ResourceClaimT|None: + async def get_resource_claim(self, not_found_okay: bool) -> ResourceClaimT | None: if not self.is_bound: return None try: return await resourceclaim.ResourceClaim.get( - name = self.resource_claim_name, - namespace = self.resource_claim_namespace, - use_cache = Poolboy.operator_mode_standalone, + name=self.resource_claim_name, + namespace=self.resource_claim_namespace, + use_cache=Poolboy.is_standalone, ) except kubernetes_asyncio.client.exceptions.ApiException as e: if e.status == 404 and not_found_okay: return None raise - async def get_resource_pool(self) -> ResourcePoolT|None: + async def get_resource_pool(self) -> ResourcePoolT | None: if not self.is_from_resource_pool: return None return await resourcepool.ResourcePool.get(self.resource_pool_name) - async def get_resource_provider(self) -> ResourceProviderT|None: + async def get_resource_provider(self) -> ResourceProviderT | None: """Return ResourceProvider configured to manage ResourceHandle.""" if self.resource_provider_name: - return await resourceprovider.ResourceProvider.get(self.resource_provider_name) + return await resourceprovider.ResourceProvider.get( + self.resource_provider_name + ) async def get_resource_providers(self) -> List[ResourceProviderT]: """Return list of ResourceProviders for all managed resources.""" resource_providers = [] - for resource in self.spec.get('resources', []): + for resource in self.spec.get("resources", []): resource_providers.append( - await resourceprovider.ResourceProvider.get(resource['provider']['name']) + await resourceprovider.ResourceProvider.get( + resource["provider"]["name"] + ) ) return resource_providers async def get_resource_states(self) -> List[Mapping]: """Return list of states fom resources referenced by ResourceHandle.""" resource_states = [] - for idx in range(len(self.spec['resources'])): + for idx in range(len(self.spec["resources"])): reference = None - if idx < len(self.status.get('resources', [])): - reference = self.status['resources'][idx].get('reference') + if idx < len(self.status.get("resources", [])): + reference = self.status["resources"][idx].get("reference") if not reference: resource_states.append(None) continue resource_states.append( await resourcewatch.ResourceWatch.get_resource_from_any( - api_version=reference['apiVersion'], - kind=reference['kind'], - name=reference['name'], - namespace=reference.get('namespace'), + api_version=reference["apiVersion"], + kind=reference["kind"], + name=reference["name"], + namespace=reference.get("namespace"), not_found_okay=True, - use_cache=Poolboy.operator_mode_standalone, + use_cache=Poolboy.is_standalone, ) ) return resource_states async def handle_delete(self, logger: kopf.ObjectLogger) -> None: - for resource in self.status.get('resources', []): - reference = resource.get('reference') + for resource in self.status.get("resources", []): + reference = resource.get("reference") if reference: try: - resource_description = f"{reference['apiVersion']} {reference['kind']} " + ( - f"{reference['name']} in {reference['namespace']}" - if 'namespace' in reference else reference['name'] + resource_description = ( + f"{reference['apiVersion']} {reference['kind']} " + + ( + 
f"{reference['name']} in {reference['namespace']}" + if "namespace" in reference + else reference["name"] + ) + ) + logger.info( + f"Propagating delete of {self} to {resource_description}" ) - logger.info(f"Propagating delete of {self} to {resource_description}") # Annotate managed resource to indicate resource handle deletion. await poolboy_k8s.patch_object( - api_version = reference['apiVersion'], - kind = reference['kind'], - name = reference['name'], - namespace = reference.get('namespace'), - patch = [{ - "op": "add", - "path": f"/metadata/annotations/{Poolboy.resource_handle_deleted_annotation.replace('/', '~1')}", - "value": datetime.now(timezone.utc).strftime('%FT%TZ'), - }], + api_version=reference["apiVersion"], + kind=reference["kind"], + name=reference["name"], + namespace=reference.get("namespace"), + patch=[ + { + "op": "add", + "path": f"/metadata/annotations/{Poolboy.resource_handle_deleted_annotation.replace('/', '~1')}", + "value": datetime.now(timezone.utc).strftime("%FT%TZ"), + } + ], ) # Delete managed resource await poolboy_k8s.delete_object( - api_version = reference['apiVersion'], - kind = reference['kind'], - name = reference['name'], - namespace = reference.get('namespace'), + api_version=reference["apiVersion"], + kind=reference["kind"], + name=reference["name"], + namespace=reference.get("namespace"), ) except kubernetes_asyncio.client.exceptions.ApiException as e: if e.status != 404: @@ -1105,7 +1252,9 @@ async def handle_delete(self, logger: kopf.ObjectLogger) -> None: resource_claim = await self.get_resource_claim(not_found_okay=True) if resource_claim and not resource_claim.is_detached: await resource_claim.delete() - logger.info(f"Propagated delete of {self} to ResourceClaim {resource_claim}") + logger.info( + f"Propagated delete of {self} to ResourceClaim {resource_claim}" + ) if self.is_from_resource_pool: resource_pool = await resourcepool.ResourcePool.get(self.resource_pool_name) @@ -1122,7 +1271,9 @@ async def manage(self, logger: kopf.ObjectLogger) -> None: # Get ResourceClaim bound to this ResourceHandle if there is one. resource_claim = await self.get_resource_claim(not_found_okay=True) # Delete this ResourceHandle if it meets delete trigger conditions. 
- if await self.__manage_check_delete(logger=logger, resource_claim=resource_claim): + if await self.__manage_check_delete( + logger=logger, resource_claim=resource_claim + ): return # Get top-level ResourceProvider managing this ResourceHandle @@ -1140,6 +1291,7 @@ async def manage(self, logger: kopf.ObjectLogger) -> None: resource_providers = await self.get_resource_providers() resource_states = await self.get_resource_states() resources_to_create = [] + resources_updated = False patch = [] # Loop through management for each managed resource @@ -1149,12 +1301,14 @@ async def manage(self, logger: kopf.ObjectLogger) -> None: resource_provider = resource_providers[resource_index] if resource_provider.resource_requires_claim and not resource_claim: - if 'ResourceClaim' != status_resource.get('waitingFor'): - patch.append({ - "op": "add", - "path": f"/status/resources/{resource_index}/waitingFor", - "value": "ResourceClaim", - }) + if "ResourceClaim" != status_resource.get("waitingFor"): + patch.append( + { + "op": "add", + "path": f"/status/resources/{resource_index}/waitingFor", + "value": "ResourceClaim", + } + ) continue vars_ = deepcopy(self.vars) @@ -1164,8 +1318,9 @@ async def manage(self, logger: kopf.ObjectLogger) -> None: linked_resource_state = None for pn, pv in enumerate(resource_providers): if ( - pv.name == linked_provider.name and - self.resources[pn].get('name', pv.name) == linked_provider.resource_name + pv.name == linked_provider.name + and self.resources[pn].get("name", pv.name) + == linked_provider.resource_name ): linked_resource_provider = pv linked_resource_state = resource_states[pn] @@ -1179,12 +1334,12 @@ async def manage(self, logger: kopf.ObjectLogger) -> None: continue if not linked_provider.check_wait_for( - linked_resource_provider = linked_resource_provider, - linked_resource_state = linked_resource_state, - resource_claim = resource_claim, - resource_handle = self, - resource_provider = resource_provider, - resource_state = resource_state, + linked_resource_provider=linked_resource_provider, + linked_resource_state=linked_resource_state, + resource_claim=resource_claim, + resource_handle=self, + resource_provider=resource_provider, + resource_state=resource_state, ): wait_for_linked_provider = True break @@ -1192,105 +1347,127 @@ async def manage(self, logger: kopf.ObjectLogger) -> None: if linked_resource_state: for template_var in linked_provider.template_vars: vars_[template_var.name] = jsonpointer.resolve_pointer( - linked_resource_state, template_var.value_from, - default = jinja2.ChainableUndefined() + linked_resource_state, + template_var.value_from, + default=jinja2.ChainableUndefined(), ) if wait_for_linked_provider: - if 'Linked ResourceProvider' != status_resource.get('waitingFor'): - patch.append({ - "op": "add", - "path": f"/status/resources/{resource_index}/waitingFor", - "value": "Linked ResourceProvider", - }) + if "Linked ResourceProvider" != status_resource.get("waitingFor"): + patch.append( + { + "op": "add", + "path": f"/status/resources/{resource_index}/waitingFor", + "value": "Linked ResourceProvider", + } + ) continue - resource_definition = await resource_provider.resource_definition_from_template( - logger = logger, - resource_claim = resource_claim, - resource_handle = self, - resource_index = resource_index, - resource_states = resource_states, - vars_ = vars_, + resource_definition = ( + await resource_provider.resource_definition_from_template( + logger=logger, + resource_claim=resource_claim, + resource_handle=self, + 
resource_index=resource_index, + resource_states=resource_states, + vars_=vars_, + ) ) if not resource_definition: - if 'Resource Definition' != status_resource.get('waitingFor'): - patch.append({ - "op": "add", - "path": f"/status/resources/{resource_index}/waitingFor", - "value": "Resource Definition", - }) + if "Resource Definition" != status_resource.get("waitingFor"): + patch.append( + { + "op": "add", + "path": f"/status/resources/{resource_index}/waitingFor", + "value": "Resource Definition", + } + ) continue - resource_api_version = resource_definition['apiVersion'] - resource_kind = resource_definition['kind'] - resource_name = resource_definition['metadata']['name'] - resource_namespace = resource_definition['metadata'].get('namespace', None) + resource_api_version = resource_definition["apiVersion"] + resource_kind = resource_definition["kind"] + resource_name = resource_definition["metadata"]["name"] + resource_namespace = resource_definition["metadata"].get( + "namespace", None + ) reference = { - 'apiVersion': resource_api_version, - 'kind': resource_kind, - 'name': resource_name + "apiVersion": resource_api_version, + "kind": resource_kind, + "name": resource_name, } if resource_namespace: - reference['namespace'] = resource_namespace + reference["namespace"] = resource_namespace - if 'reference' not in status_resource: + if "reference" not in status_resource: # Add reference to status resources - status_resource['reference'] = reference - patch.append({ - "op": "add", - "path": f"/status/resources/{resource_index}/reference", - "value": reference, - }) + status_resource["reference"] = reference + patch.append( + { + "op": "add", + "path": f"/status/resources/{resource_index}/reference", + "value": reference, + } + ) # Remove waitingFor from status if present as we are preceeding to resource creation - if 'waitingFor' in status_resource: - patch.append({ - "op": "remove", - "path": f"/status/resources/{resource_index}/waitingFor", - }) - elif resource_api_version != status_resource['reference']['apiVersion']: + if "waitingFor" in status_resource: + patch.append( + { + "op": "remove", + "path": f"/status/resources/{resource_index}/waitingFor", + } + ) + elif resource_api_version != status_resource["reference"]["apiVersion"]: raise kopf.TemporaryError( f"ResourceHandle {self.name} would change from apiVersion " f"{status_resource['reference']['apiVersion']} to {resource_api_version}!", - delay=600 + delay=600, ) - elif resource_kind != status_resource['reference']['kind']: + elif resource_kind != status_resource["reference"]["kind"]: raise kopf.TemporaryError( f"ResourceHandle {self.name} would change from kind " f"{status_resource['reference']['kind']} to {resource_kind}!", - delay=600 + delay=600, ) else: # Maintain name and namespace - if resource_name != status_resource['reference']['name']: - resource_name = status_resource['reference']['name'] - resource_definition['metadata']['name'] = resource_name - if resource_namespace != status_resource['reference'].get('namespace'): - resource_namespace = status_resource['reference']['namespace'] - resource_definition['metadata']['namespace'] = resource_namespace - - resource_description = f"{resource_api_version} {resource_kind} {resource_name}" + if resource_name != status_resource["reference"]["name"]: + resource_name = status_resource["reference"]["name"] + resource_definition["metadata"]["name"] = resource_name + if resource_namespace != status_resource["reference"].get( + "namespace" + ): + resource_namespace = 
status_resource["reference"]["namespace"] + resource_definition["metadata"]["namespace"] = ( + resource_namespace + ) + + resource_description = ( + f"{resource_api_version} {resource_kind} {resource_name}" + ) if resource_namespace: resource_description += f" in {resource_namespace}" # Ensure there is a ResourceWatch for this resource. await resourcewatch.ResourceWatch.create_as_needed( - api_version = resource_api_version, - kind = resource_kind, - namespace = resource_namespace, + api_version=resource_api_version, + kind=resource_kind, + namespace=resource_namespace, ) if resource_state: updated_state = await resource_provider.update_resource( - logger = logger, - resource_definition = resource_definition, - resource_handle = self, - resource_state = resource_state, + logger=logger, + resource_definition=resource_definition, + resource_handle=self, + resource_state=resource_state, ) if updated_state: resource_states[resource_index] = updated_state - logger.info(f"Updated {resource_description} for ResourceHandle {self.name}") + resources_updated = True + logger.info( + f"Updated {resource_description} for ResourceHandle {self.name}" + ) else: resources_to_create.append((resource_index, resource_definition)) @@ -1303,15 +1480,21 @@ async def manage(self, logger: kopf.ObjectLogger) -> None: raise for resource_index, resource_definition in resources_to_create: - resource_api_version = resource_definition['apiVersion'] - resource_kind = resource_definition['kind'] - resource_name = resource_definition['metadata']['name'] - resource_namespace = resource_definition['metadata'].get('namespace', None) - resource_description = f"{resource_api_version} {resource_kind} {resource_name}" + resource_api_version = resource_definition["apiVersion"] + resource_kind = resource_definition["kind"] + resource_name = resource_definition["metadata"]["name"] + resource_namespace = resource_definition["metadata"].get( + "namespace", None + ) + resource_description = ( + f"{resource_api_version} {resource_kind} {resource_name}" + ) if resource_namespace: resource_description += f" in {resource_namespace}" try: - created_resource = await poolboy_k8s.create_object(resource_definition) + created_resource = await poolboy_k8s.create_object( + resource_definition + ) if created_resource: resource_states[resource_index] = created_resource logger.info(f"Created {resource_description} for {self}") @@ -1320,9 +1503,11 @@ async def manage(self, logger: kopf.ObjectLogger) -> None: raise # Update handle status with resource states immediately after changes. - # This is only needed in worker context where ResourceWatch timing - # may be unreliable due to running in separate processes. - if Poolboy.is_worker and (resources_to_create or patch): + # This is needed in worker context where ResourceWatch runs in the + # operator process and may not immediately sync with worker changes. 
+ if Poolboy.is_worker and ( + resources_to_create or resources_updated or patch + ): # Refetch to sync in-memory object with API after patches were applied await self.refetch() # Re-fetch claim to ensure we have the latest version @@ -1340,10 +1525,14 @@ async def manage(self, logger: kopf.ObjectLogger) -> None: resource_states=resource_states, ) - async def refetch(self) -> ResourceHandleT|None: + async def refetch(self) -> ResourceHandleT | None: try: definition = await Poolboy.custom_objects_api.get_namespaced_custom_object( - Poolboy.operator_domain, Poolboy.operator_version, Poolboy.namespace, 'resourcehandles', self.name + Poolboy.operator_domain, + Poolboy.operator_version, + Poolboy.namespace, + "resourcehandles", + self.name, ) self.refresh_from_definition(definition) return self @@ -1353,26 +1542,31 @@ async def refetch(self) -> ResourceHandleT|None: return None raise - async def update_status(self, + async def update_status( + self, logger: kopf.ObjectLogger, - resource_states: List[Mapping|None], - resource_claim: ResourceClaimT|None=None, + resource_states: List[Mapping | None], + resource_claim: ResourceClaimT | None = None, ) -> None: """Update status from resources state.""" status = self.status while len(self.resources) < len(resource_states): - logger.warning(f"{self} update status with resource states longer that list of resources, attempting refetch: {len(self.resources)} < {len(resource_states)}") + logger.warning( + f"{self} update status with resource states longer that list of resources, attempting refetch: {len(self.resources)} < {len(resource_states)}" + ) await asyncio.sleep(0.2) await self.refetch() if len(self.resources) < len(resource_states): - logger.error(f"{self} update status with resource states longer that list of resources after refetch: {len(self.resources)} < {len(resource_states)}") + logger.error( + f"{self} update status with resource states longer that list of resources after refetch: {len(self.resources)} < {len(resource_states)}" + ) return # Create consolidated information about resources resources = deepcopy(self.resources) for idx, state in enumerate(resource_states): - resources[idx]['state'] = state + resources[idx]["state"] = state patch = [] have_healthy_resource = False @@ -1380,15 +1574,19 @@ async def update_status(self, have_ready_resource = False all_resources_ready = True - status_resources = status.get('resources', []) + status_resources = status.get("resources", []) for idx, resource in enumerate(resources): - status_resource = status_resources[idx] if idx < len(status_resources) else {} + status_resource = ( + status_resources[idx] if idx < len(status_resources) else {} + ) resource_healthy = None resource_ready = False - state = resource.get('state') + state = resource.get("state") if state: - resource_provider = await resourceprovider.ResourceProvider.get(resource['provider']['name']) + resource_provider = await resourceprovider.ResourceProvider.get( + resource["provider"]["name"] + ) resource_healthy = resource_provider.check_health( logger=logger, resource_handle=self, @@ -1417,29 +1615,37 @@ async def update_status(self, elif all_resources_ready is True: all_resources_ready = None - if resource_healthy is None and 'healthy' in status_resource: - patch.append({ - "op": "remove", - "path": f"/status/resources/{idx}/healthy", - }) - elif resource_healthy != status_resource.get('healthy'): - patch.append({ - "op": "add", - "path": f"/status/resources/{idx}/healthy", - "value": resource_healthy, - }) - - if resource_ready is 
None and 'ready' in status_resource: - patch.append({ - "op": "remove", - "path": f"/status/resources/{idx}/ready", - }) - elif resource_ready != status_resource.get('ready'): - patch.append({ - "op": "add", - "path": f"/status/resources/{idx}/ready", - "value": resource_ready, - }) + if resource_healthy is None and "healthy" in status_resource: + patch.append( + { + "op": "remove", + "path": f"/status/resources/{idx}/healthy", + } + ) + elif resource_healthy != status_resource.get("healthy"): + patch.append( + { + "op": "add", + "path": f"/status/resources/{idx}/healthy", + "value": resource_healthy, + } + ) + + if resource_ready is None and "ready" in status_resource: + patch.append( + { + "op": "remove", + "path": f"/status/resources/{idx}/ready", + } + ) + elif resource_ready != status_resource.get("ready"): + patch.append( + { + "op": "add", + "path": f"/status/resources/{idx}/ready", + "value": resource_ready, + } + ) if all_resources_healthy and not have_healthy_resource: all_resources_healthy = None @@ -1447,30 +1653,38 @@ async def update_status(self, all_resources_ready = None if all_resources_healthy is None: - if 'healthy' in status: - patch.append({ - "op": "remove", + if "healthy" in status: + patch.append( + { + "op": "remove", + "path": "/status/healthy", + } + ) + elif all_resources_healthy != status.get("healthy"): + patch.append( + { + "op": "add", "path": "/status/healthy", - }) - elif all_resources_healthy != status.get('healthy'): - patch.append({ - "op": "add", - "path": "/status/healthy", - "value": all_resources_healthy, - }) + "value": all_resources_healthy, + } + ) if all_resources_ready is None: - if 'ready' in status: - patch.append({ - "op": "remove", + if "ready" in status: + patch.append( + { + "op": "remove", + "path": "/status/ready", + } + ) + elif all_resources_ready != status.get("ready"): + patch.append( + { + "op": "add", "path": "/status/ready", - }) - elif all_resources_ready != status.get('ready'): - patch.append({ - "op": "add", - "path": "/status/ready", - "value": all_resources_ready, - }) + "value": all_resources_ready, + } + ) if self.has_resource_provider: resource_provider = None @@ -1481,12 +1695,14 @@ async def update_status(self, resource_handle=self, resources=resources, ) - if status_summary != status.get('summary'): - patch.append({ - "op": "add", - "path": "/status/summary", - "value": status_summary, - }) + if status_summary != status.get("summary"): + patch.append( + { + "op": "add", + "path": "/status/summary", + "value": status_summary, + } + ) except kubernetes_asyncio.client.exceptions.ApiException as e: logger.warning( f"Failed to get ResourceProvider {self.resource_provider_name} " diff --git a/operator/resourcewatch.py b/operator/resourcewatch.py index edbbf0b..bdb8ec9 100644 --- a/operator/resourcewatch.py +++ b/operator/resourcewatch.py @@ -14,16 +14,20 @@ from kopfobject import KopfObject from poolboy import Poolboy -logger = logging.getLogger('resource_watch') +logger = logging.getLogger("resource_watch") + class ResourceWatchFailedError(Exception): pass + class ResourceWatchRestartError(Exception): pass -ResourceHandleT = TypeVar('ResourceHandleT', bound='ResourceHandle') -ResourceWatchT = TypeVar('ResourceWatchT', bound='ResourceWatch') + +ResourceHandleT = TypeVar("ResourceHandleT", bound="ResourceHandle") +ResourceWatchT = TypeVar("ResourceWatchT", bound="ResourceWatch") + class ResourceWatch(KopfObject): api_group = Poolboy.operator_domain @@ -34,66 +38,77 @@ class ResourceWatch(KopfObject): class_lock = 
asyncio.Lock() @classmethod - def __instance_key(cls, api_version: str, kind: str, namespace: str|None) -> str: + def __instance_key(cls, api_version: str, kind: str, namespace: str | None) -> str: """Return cache key used to identify ResourceWatch in instances dict""" - return "|".join((api_version, kind, namespace or '*')) + return "|".join((api_version, kind, namespace or "*")) @classmethod - def __make_name(cls, + def __make_name( + cls, api_version: str, kind: str, - namespace: str|None, + namespace: str | None, ): """Return unique name for ResourceWatch determined by watch target. This hash prevents race conditions when otherwise multiple watches might be created.""" - return (namespace or 'cluster') + '-' + urlsafe_b64encode( - sha256(':'.join((api_version,kind,namespace or '')).encode('utf-8')) - .digest() - ).decode('utf-8').replace('=', '').replace('-', '').replace('_', '').lower()[:12] + return ( + (namespace or "cluster") + + "-" + + urlsafe_b64encode( + sha256( + ":".join((api_version, kind, namespace or "")).encode("utf-8") + ).digest() + ) + .decode("utf-8") + .replace("=", "") + .replace("-", "") + .replace("_", "") + .lower()[:12] + ) @classmethod - def __get_instance(cls, + def __get_instance( + cls, api_version: str, kind: str, - namespace: str|None, + namespace: str | None, ): """Return ResourceWatch from cache.""" instance_key = cls.__instance_key( - api_version=api_version, - kind=kind, - namespace=namespace + api_version=api_version, kind=kind, namespace=namespace ) return cls.cache_get(CacheTag.WATCH, instance_key) @classmethod def __register_definition(cls, definition: Mapping) -> ResourceWatchT: resource_watch = cls.__get_instance( - api_version=definition['spec']['apiVersion'], - kind=definition['spec']['kind'], - namespace=definition['spec'].get('namespace'), + api_version=definition["spec"]["apiVersion"], + kind=definition["spec"]["kind"], + namespace=definition["spec"].get("namespace"), ) if resource_watch: resource_watch.refresh_from_definition(definition=definition) else: resource_watch = cls( - annotations=definition['metadata'].get('annotations', {}), - labels=definition['metadata'].get('labels', {}), - meta=definition['metadata'], - name=definition['metadata']['name'], + annotations=definition["metadata"].get("annotations", {}), + labels=definition["metadata"].get("labels", {}), + meta=definition["metadata"], + name=definition["metadata"]["name"], namespace=Poolboy.namespace, - spec=definition['spec'], - status=definition.get('status', {}), - uid=definition['metadata']['uid'], + spec=definition["spec"], + status=definition.get("status", {}), + uid=definition["metadata"]["uid"], ) resource_watch.__register() return resource_watch @classmethod - async def create_as_needed(cls, + async def create_as_needed( + cls, api_version: str, kind: str, - namespace: str|None, - ) -> ResourceWatchT|None: + namespace: str | None, + ) -> ResourceWatchT | None: async with cls.class_lock: resource_watch = await cls.__get( api_version=api_version, @@ -110,7 +125,7 @@ async def create_as_needed(cls, ) definition = { - "apiVersion": '/'.join((cls.api_group, cls.api_version)), + "apiVersion": "/".join((cls.api_group, cls.api_version)), "kind": cls.kind, "metadata": { "name": name, @@ -118,18 +133,20 @@ async def create_as_needed(cls, "spec": { "apiVersion": api_version, "kind": kind, - } + }, } if namespace: - definition['spec']['namespace'] = namespace + definition["spec"]["namespace"] = namespace try: - definition = await 
Poolboy.custom_objects_api.create_namespaced_custom_object( - group = cls.api_group, - namespace = Poolboy.namespace, - plural = cls.plural, - version = cls.api_version, - body = definition, + definition = ( + await Poolboy.custom_objects_api.create_namespaced_custom_object( + group=cls.api_group, + namespace=Poolboy.namespace, + plural=cls.plural, + version=cls.api_version, + body=definition, + ) ) resource_watch = cls.from_definition(definition) logger.info(f"Created {resource_watch}") @@ -142,10 +159,11 @@ async def create_as_needed(cls, raise @classmethod - async def get(cls, + async def get( + cls, api_version: str, kind: str, - namespace: str|None, + namespace: str | None, ) -> ResourceWatchT: """Get ResourceWatch by watched resources""" async with cls.class_lock: @@ -156,10 +174,11 @@ async def get(cls, ) @classmethod - async def __get(cls, + async def __get( + cls, api_version: str, kind: str, - namespace: str|None, + namespace: str | None, ) -> ResourceWatchT: resource_watch = cls.__get_instance( api_version=api_version, @@ -177,11 +196,11 @@ async def __get(cls, try: list_object = await Poolboy.custom_objects_api.get_namespaced_custom_object( - group = cls.api_group, - name = name, - namespace = Poolboy.namespace, - plural = cls.plural, - version = cls.api_version, + group=cls.api_group, + name=name, + namespace=Poolboy.namespace, + plural=cls.plural, + version=cls.api_version, ) except kubernetes_asyncio.client.exceptions.ApiException as exception: if exception.status == 404: @@ -190,14 +209,15 @@ async def __get(cls, raise @classmethod - async def get_resource_from_any(cls, + async def get_resource_from_any( + cls, api_version: str, kind: str, name: str, - namespace: str|None, - not_found_okay: bool=False, - use_cache: bool=True, - ) -> Mapping|None: + namespace: str | None, + not_found_okay: bool = False, + use_cache: bool = True, + ) -> Mapping | None: # Try to get from other watch object watch = cls.__get_instance( api_version=api_version, @@ -237,29 +257,29 @@ async def register( ) -> ResourceWatchT: async with cls.class_lock: resource_watch = cls.__get_instance( - api_version=spec['apiVersion'], - kind=spec['kind'], - namespace=spec.get('namespace') + api_version=spec["apiVersion"], + kind=spec["kind"], + namespace=spec.get("namespace"), ) if resource_watch: resource_watch.refresh( - annotations = annotations, - labels = labels, - meta = meta, - spec = spec, - status = status, - uid = uid, + annotations=annotations, + labels=labels, + meta=meta, + spec=spec, + status=status, + uid=uid, ) else: resource_watch = cls( - annotations = annotations, - labels = labels, - meta = meta, - name = name, - namespace = namespace, - spec = spec, - status = status, - uid = uid, + annotations=annotations, + labels=labels, + meta=meta, + name=name, + namespace=namespace, + spec=spec, + status=status, + uid=uid, ) resource_watch.__register() return resource_watch @@ -277,14 +297,15 @@ async def stop_all(cls) -> None: if tasks: await asyncio.gather(*tasks) - def __init__(self, - annotations: kopf.Annotations|Mapping, - labels: kopf.Labels|Mapping, - meta: kopf.Meta|Mapping, + def __init__( + self, + annotations: kopf.Annotations | Mapping, + labels: kopf.Labels | Mapping, + meta: kopf.Meta | Mapping, name: str, namespace: str, - spec: kopf.Spec|Mapping, - status: kopf.Status|Mapping, + spec: kopf.Spec | Mapping, + status: kopf.Status | Mapping, uid: str, ): super().__init__( @@ -309,8 +330,8 @@ def __register(self) -> None: def __str__(self) -> str: return ( f"{self.kind} {self.name} 
({self.watch_api_version} {self.watch_kind} in {self.watch_namespace})" - if self.watch_namespace else - f"{self.kind} {self.name} ({self.watch_api_version} {self.watch_kind})" + if self.watch_namespace + else f"{self.kind} {self.name} ({self.watch_api_version} {self.watch_kind})" ) def __self_instance_key(self) -> str: @@ -322,29 +343,30 @@ def __self_instance_key(self) -> str: @property def name_hash(self) -> str: - return self.name.rsplit('-', 1)[1] + return self.name.rsplit("-", 1)[1] @property def watch_api_version(self) -> str: - return self.spec['apiVersion'] + return self.spec["apiVersion"] @property def watch_kind(self) -> str: - return self.spec['kind'] + return self.spec["kind"] @property - def watch_namespace(self) -> str|None: - return self.spec.get('namespace') + def watch_namespace(self) -> str | None: + return self.spec.get("namespace") def __resource_cache_key(self, name: str) -> str: """Build unique cache key for a watched resource.""" return f"{self.name}:{name}" - async def get_resource(self, + async def get_resource( + self, name: str, - not_found_okay: bool=False, - use_cache: bool=True, - ) -> Mapping|None: + not_found_okay: bool = False, + use_cache: bool = True, + ) -> Mapping | None: resource_cache_key = self.__resource_cache_key(name) if use_cache: cached = Cache.get(CacheTag.WATCH_RESOURCE, resource_cache_key) @@ -363,7 +385,12 @@ async def get_resource(self, else: raise if use_cache and resource: - Cache.set(CacheTag.WATCH_RESOURCE, resource_cache_key, resource, ttl=Poolboy.resource_refresh_interval) + Cache.set( + CacheTag.WATCH_RESOURCE, + resource_cache_key, + resource, + ttl=Poolboy.resource_refresh_interval, + ) return resource async def start(self, logger) -> None: @@ -372,23 +399,27 @@ async def start(self, logger) -> None: async def watch(self): try: - if '/' in self.watch_api_version: - group, version = self.watch_api_version.split('/') - plural = await poolboy_k8s.kind_to_plural(group=group, version=version, kind=self.watch_kind) + if "/" in self.watch_api_version: + group, version = self.watch_api_version.split("/") + plural = await poolboy_k8s.kind_to_plural( + group=group, version=version, kind=self.watch_kind + ) kwargs = {"group": group, "plural": plural, "version": version} if self.watch_namespace: method = Poolboy.custom_objects_api.list_namespaced_custom_object - kwargs['namespace'] = self.watch_namespace + kwargs["namespace"] = self.watch_namespace else: method = Poolboy.custom_objects_api.list_cluster_custom_object elif self.watch_namespace: method = getattr( - Poolboy.core_v1_api, "list_namespaced_" + inflection.underscore(self.watch_kind) + Poolboy.core_v1_api, + "list_namespaced_" + inflection.underscore(self.watch_kind), ) kwargs = {"namespace": self.watch_namespace} else: method = getattr( - Poolboy.core_v1_api, "list_" + inflection.underscore(self.watch_kind) + Poolboy.core_v1_api, + "list_" + inflection.underscore(self.watch_kind), ) kwargs = {} @@ -401,17 +432,23 @@ async def watch(self): return except ResourceWatchRestartError as e: logger.debug(f"{self} restart: {e}") - watch_duration = (datetime.now(timezone.utc) - watch_start_dt).total_seconds() + watch_duration = ( + datetime.now(timezone.utc) - watch_start_dt + ).total_seconds() if watch_duration < 10: await asyncio.sleep(10 - watch_duration) except ResourceWatchFailedError as e: logger.warning(f"{self} failed: {e}") - watch_duration = (datetime.now(timezone.utc) - watch_start_dt).total_seconds() + watch_duration = ( + datetime.now(timezone.utc) - watch_start_dt + 
).total_seconds() if watch_duration < 60: await asyncio.sleep(60 - watch_duration) except Exception: logger.exception(f"{self} exception") - watch_duration = (datetime.now(timezone.utc) - watch_start_dt).total_seconds() + watch_duration = ( + datetime.now(timezone.utc) - watch_start_dt + ).total_seconds() if watch_duration < 60: await asyncio.sleep(60 - watch_duration) logger.debug(f"Restarting {self}") @@ -427,16 +464,18 @@ async def __watch(self, method, **kwargs): if not isinstance(event, Mapping): raise ResourceWatchFailedError(f"UNKNOWN EVENT: {event}") - event_obj = event['object'] - event_type = event['type'] + event_obj = event["object"] + event_type = event["type"] if not isinstance(event_obj, Mapping): event_obj = Poolboy.api_client.sanitize_for_serialization(event_obj) - if event_type == 'ERROR': - if event_obj['kind'] == 'Status': - if event_obj['reason'] in ('Expired', 'Gone'): - raise ResourceWatchRestartError(event_obj['reason'].lower()) + if event_type == "ERROR": + if event_obj["kind"] == "Status": + if event_obj["reason"] in ("Expired", "Gone"): + raise ResourceWatchRestartError(event_obj["reason"].lower()) else: - raise ResourceWatchFailedError(f"{event_obj['reason']} {event_obj['message']}") + raise ResourceWatchFailedError( + f"{event_obj['reason']} {event_obj['message']}" + ) else: raise ResourceWatchFailedError(f"UNKNOWN EVENT: {event}") try: @@ -453,39 +492,53 @@ async def __watch(self, method, **kwargs): await watch.close() async def __watch_event(self, event_type, event_obj): - event_obj_annotations = event_obj['metadata'].get('annotations') + event_obj_annotations = event_obj["metadata"].get("annotations") if not event_obj_annotations: return - if event_obj_annotations.get(Poolboy.resource_handle_deleted_annotation) is not None: + if ( + event_obj_annotations.get(Poolboy.resource_handle_deleted_annotation) + is not None + ): return - resource_handle_name = event_obj_annotations.get(Poolboy.resource_handle_name_annotation) - resource_index = int(event_obj_annotations.get(Poolboy.resource_index_annotation, 0)) - resource_name = event_obj['metadata']['name'] - resource_namespace = event_obj['metadata'].get('namespace') + resource_handle_name = event_obj_annotations.get( + Poolboy.resource_handle_name_annotation + ) + resource_index = int( + event_obj_annotations.get(Poolboy.resource_index_annotation, 0) + ) + resource_name = event_obj["metadata"]["name"] + resource_namespace = event_obj["metadata"].get("namespace") resource_description = ( f"{event_obj['apiVersion']} {event_obj['kind']} {resource_name} in {resource_namespace}" - if resource_namespace else - f"{event_obj['apiVersion']} {event_obj['kind']} {resource_name}" + if resource_namespace + else f"{event_obj['apiVersion']} {event_obj['kind']} {resource_name}" ) if not resource_handle_name: return resource_cache_key = self.__resource_cache_key(resource_name) - if event_type == 'DELETED': + if event_type == "DELETED": Cache.delete(CacheTag.WATCH_RESOURCE, resource_cache_key) else: - Cache.set(CacheTag.WATCH_RESOURCE, resource_cache_key, event_obj, ttl=Poolboy.resource_refresh_interval) + Cache.set( + CacheTag.WATCH_RESOURCE, + resource_cache_key, + event_obj, + ttl=Poolboy.resource_refresh_interval, + ) try: resource_handle = await resourcehandle.ResourceHandle.get( name=resource_handle_name, - use_cache=Poolboy.operator_mode_standalone, + use_cache=Poolboy.is_standalone, ) except kubernetes_asyncio.client.exceptions.ApiException as exception: if exception.status == 404: - logger.warning(f"ResourceHandle 
{resource_handle_name} not found for event on {resource_description}") + logger.warning( + f"ResourceHandle {resource_handle_name} not found for event on {resource_description}" + ) else: logger.exception( f"Failed to get ResourceHandle {resource_handle_name} for event on {resource_description}" @@ -505,30 +558,30 @@ async def __watch_event(self, event_type, event_obj): # Get full list of resources to update ResourceHandle status resource_states = [] - for (idx, resource) in enumerate(resource_handle.status_resources): + for idx, resource in enumerate(resource_handle.status_resources): if idx == resource_index: resource_states.append(event_obj) continue - reference = resource.get('reference') + reference = resource.get("reference") if reference: - if( - reference['apiVersion'] == self.watch_api_version and - reference['kind'] == self.watch_kind and - reference.get('namespace') == self.watch_namespace + if ( + reference["apiVersion"] == self.watch_api_version + and reference["kind"] == self.watch_kind + and reference.get("namespace") == self.watch_namespace ): resource_states.append( await self.get_resource( - name=reference['name'], + name=reference["name"], not_found_okay=True, ) ) else: resource_states.append( await self.get_resource_from_any( - api_version=reference['apiVersion'], - kind=reference['kind'], - name=reference['name'], - namespace=reference.get('namespace'), + api_version=reference["apiVersion"], + kind=reference["kind"], + name=reference["name"], + namespace=reference.get("namespace"), not_found_okay=True, ) ) diff --git a/operator/tasks/resourceclaim.py b/operator/tasks/resourceclaim.py index 297e93e..209567e 100644 --- a/operator/tasks/resourceclaim.py +++ b/operator/tasks/resourceclaim.py @@ -20,7 +20,7 @@ def _is_transient_exception(exc: Exception) -> bool: exc_class_name = type(exc).__name__ exc_module = type(exc).__module__ - if exc_class_name == 'TemporaryError' and 'kopf' in exc_module: + if exc_class_name == "TemporaryError" and "kopf" in exc_module: return True return False @@ -47,20 +47,20 @@ async def _collect_claims_to_process() -> list: # Note: Using cluster-wide listing since claims exist in user namespaces claim_list = await Poolboy.custom_objects_api.list_cluster_custom_object( group=Poolboy.operator_domain, - plural='resourceclaims', + plural="resourceclaims", version=Poolboy.operator_version, _continue=_continue, limit=50, ) - for item in claim_list.get('items', []): + for item in claim_list.get("items", []): # Skip ignored claims - if Poolboy.ignore_label in item['metadata'].get('labels', {}): + if Poolboy.ignore_label in item["metadata"].get("labels", {}): continue claims_to_process.append(item) - _continue = claim_list['metadata'].get('continue') + _continue = claim_list["metadata"].get("continue") if not _continue: break @@ -75,23 +75,36 @@ async def _delete_claim(definition: dict) -> dict: the ResourceHandle using the original definition. """ import resourceclaim + claim = resourceclaim.ResourceClaim.from_definition(definition) await claim.handle_delete(logger=logger) + await claim.unregister(name=claim.name, namespace=claim.namespace) return {"status": "completed", "claim": claim.name, "namespace": claim.namespace} def _dispatch_batch(claims: list) -> int: - """Dispatch a batch of claims as individual tasks.""" + """Dispatch a batch of claims as individual tasks. + + Note: Uses timestamp (truncated to minute) instead of resourceVersion for task_id. 
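# [Editorial sketch, not part of the patch] The two task-id schemes used in this module,
# shown side by side. Event-driven dispatch keys on resourceVersion (at most one task per
# observed change), while the scheduled sweep keys on a per-minute time bucket so an
# unchanged object can still be re-enqueued for time-based triggers. Whether duplicate
# ids are actually collapsed depends on broker/task configuration outside this hunk.
import time

def event_task_id(uid: str, resource_version: str) -> str:
    return f"claim-{uid}-{resource_version}"

def scheduled_task_id(uid: str, now: float | None = None) -> str:
    ts_minute = int((time.time() if now is None else now) // 60)
    return f"claim-sched-{uid}-{ts_minute}"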
+ This allows periodic reprocessing even when resourceVersion hasn't changed, + which is necessary for time-based triggers like lifespan.start. + """ + import time + + ts_minute = int(time.time() // 60) # One dispatch allowed per minute per claim + dispatched = 0 for item in claims: - uid = item['metadata']['uid'] - rv = item['metadata']['resourceVersion'] + uid = item["metadata"]["uid"] kwargs = { - 'definition': item, - 'name': item['metadata']['name'], - 'namespace': item['metadata']['namespace'], + "definition": item, + "name": item["metadata"]["name"], + "namespace": item["metadata"]["namespace"], } - manage_claim.apply_async(kwargs=kwargs, task_id=f"claim-{uid}-{rv}") + # Use timestamp instead of resourceVersion to allow periodic reprocessing + manage_claim.apply_async( + kwargs=kwargs, task_id=f"claim-sched-{uid}-{ts_minute}" + ) dispatched += 1 return dispatched @@ -99,12 +112,21 @@ def _dispatch_batch(claims: list) -> int: async def _manage_claim(definition: dict) -> dict: """Async wrapper for ResourceClaim.manage().""" import resourceclaim + claim = resourceclaim.ResourceClaim.from_definition(definition) # Refetch to get current state from K8s API (avoid stale data) claim = await claim.refetch() if not claim: # Claim was deleted between dispatch and execution - return {"status": "skipped", "reason": "not_found", "claim": definition['metadata']['name']} + return { + "status": "skipped", + "reason": "not_found", + "claim": definition["metadata"]["name"], + } + + # Register claim in cache to keep it fresh + await claim.register_definition(claim.definition) + await claim.manage(logger=logger) return {"status": "completed", "claim": claim.name, "namespace": claim.namespace} @@ -112,7 +134,7 @@ async def _manage_claim(definition: dict) -> dict: @app.task(bind=True, acks_late=True) def delete_claim(self, definition: dict, name: str, namespace: str): """Execute ResourceClaim.handle_delete() in a worker.""" - uid = definition['metadata']['uid'] + uid = definition["metadata"]["uid"] lock_key = f"resource_claim:{uid}" with distributed_lock(lock_key, timeout=60) as acquired: @@ -129,9 +151,9 @@ def delete_claim(self, definition: dict, name: str, namespace: str): def dispatch_delete_claim(definition: dict, name: str, namespace: str): """Dispatch delete_claim task with unique task_id.""" - uid = definition['metadata']['uid'] - rv = definition['metadata']['resourceVersion'] - kwargs = {'definition': definition, 'name': name, 'namespace': namespace} + uid = definition["metadata"]["uid"] + rv = definition["metadata"]["resourceVersion"] + kwargs = {"definition": definition, "name": name, "namespace": namespace} delete_claim.apply_async( kwargs=kwargs, task_id=f"claim-delete-{uid}-{rv}", @@ -140,9 +162,9 @@ def dispatch_delete_claim(definition: dict, name: str, namespace: str): def dispatch_manage_claim(definition: dict, name: str, namespace: str): """Dispatch manage_claim task. 
Always dispatches for operator events.""" - uid = definition['metadata']['uid'] - rv = definition['metadata']['resourceVersion'] - kwargs = {'definition': definition, 'name': name, 'namespace': namespace} + uid = definition["metadata"]["uid"] + rv = definition["metadata"]["resourceVersion"] + kwargs = {"definition": definition, "name": name, "namespace": namespace} manage_claim.apply_async( kwargs=kwargs, task_id=f"claim-{uid}-{rv}", @@ -152,7 +174,7 @@ def dispatch_manage_claim(definition: dict, name: str, namespace: str): @app.task(bind=True, acks_late=True) def manage_claim(self, definition: dict, name: str, namespace: str): """Execute ResourceClaim.manage() in a worker.""" - uid = definition['metadata']['uid'] + uid = definition["metadata"]["uid"] lock_key = f"resource_claim:{uid}" with distributed_lock(lock_key, timeout=60) as acquired: @@ -191,13 +213,17 @@ def maintain_all_claims(): return {"status": "completed", "total": 0, "batches": 0} # Split into batches and dispatch using group (distributes across workers) - batches = [claims[i:i + BATCH_SIZE] for i in range(0, len(claims), BATCH_SIZE)] + batches = [ + claims[i : i + BATCH_SIZE] for i in range(0, len(claims), BATCH_SIZE) + ] # Create group of batch tasks - Celery will distribute across available workers batch_group = group(process_claim_batch.s(batch) for batch in batches) batch_group.apply_async() - logger.info(f"Claim maintenance: {len(claims)} claims in {len(batches)} batches") + logger.info( + f"Claim maintenance: {len(claims)} claims in {len(batches)} batches" + ) return {"status": "dispatched", "total": len(claims), "batches": len(batches)} diff --git a/operator/tasks/resourcehandle.py b/operator/tasks/resourcehandle.py index d448f4c..9f2585c 100644 --- a/operator/tasks/resourcehandle.py +++ b/operator/tasks/resourcehandle.py @@ -12,8 +12,7 @@ def _is_transient_exception(exc: Exception) -> bool: - """Check if exception is transient (expected retry scenario). - """ + """Check if exception is transient (expected retry scenario).""" import kubernetes_asyncio # Check ApiException first (already imported) @@ -24,7 +23,7 @@ def _is_transient_exception(exc: Exception) -> bool: # This works because resourcehandle.py raises kopf.TemporaryError exc_class_name = type(exc).__name__ exc_module = type(exc).__module__ - if exc_class_name == 'TemporaryError' and 'kopf' in exc_module: + if exc_class_name == "TemporaryError" and "kopf" in exc_module: return True return False @@ -53,20 +52,20 @@ async def _collect_handles_to_process() -> list: handle_list = await Poolboy.custom_objects_api.list_namespaced_custom_object( group=Poolboy.operator_domain, namespace=Poolboy.namespace, - plural='resourcehandles', + plural="resourcehandles", version=Poolboy.operator_version, _continue=_continue, limit=50, ) - for item in handle_list.get('items', []): + for item in handle_list.get("items", []): # Skip ignored handles - if Poolboy.ignore_label in item['metadata'].get('labels', {}): + if Poolboy.ignore_label in item["metadata"].get("labels", {}): continue handles_to_process.append(item) - _continue = handle_list['metadata'].get('continue') + _continue = handle_list["metadata"].get("continue") if not _continue: break @@ -75,29 +74,41 @@ async def _collect_handles_to_process() -> list: async def _delete_handle(definition: dict) -> dict: """Async wrapper for ResourceHandle.handle_delete(). - + Note: We do NOT refetch for delete operations. 
The handle may already be deleted from K8s, but we still need to propagate the delete to child resources (ResourceClaimTest, etc.) using the original definition. """ import resourcehandle + handle = resourcehandle.ResourceHandle.from_definition(definition) await handle.handle_delete(logger=logger) return {"status": "completed", "handle": handle.name} def _dispatch_batch(handles: list) -> int: - """Dispatch a batch of handles as individual tasks.""" + """Dispatch a batch of handles as individual tasks. + + Note: Uses timestamp (truncated to minute) instead of resourceVersion for task_id. + This allows periodic reprocessing even when resourceVersion hasn't changed, + which is necessary for time-based triggers like lifespan.end. + """ + import time + + ts_minute = int(time.time() // 60) # One dispatch allowed per minute per handle + dispatched = 0 for item in handles: - uid = item['metadata']['uid'] - rv = item['metadata']['resourceVersion'] + uid = item["metadata"]["uid"] kwargs = { - 'definition': item, - 'name': item['metadata']['name'], - 'namespace': item['metadata']['namespace'], + "definition": item, + "name": item["metadata"]["name"], + "namespace": item["metadata"]["namespace"], } - manage_handle.apply_async(kwargs=kwargs, task_id=f"handle-{uid}-{rv}") + # Use timestamp instead of resourceVersion to allow periodic reprocessing + manage_handle.apply_async( + kwargs=kwargs, task_id=f"handle-sched-{uid}-{ts_minute}" + ) dispatched += 1 return dispatched @@ -105,12 +116,22 @@ def _dispatch_batch(handles: list) -> int: async def _manage_handle(definition: dict) -> dict: """Async wrapper for ResourceHandle.manage().""" import resourcehandle + handle = resourcehandle.ResourceHandle.from_definition(definition) # Refetch to get current state from K8s API (avoid stale data) handle = await handle.refetch() if not handle: # Handle was deleted between dispatch and execution - return {"status": "skipped", "reason": "not_found", "handle": definition['metadata']['name']} + return { + "status": "skipped", + "reason": "not_found", + "handle": definition["metadata"]["name"], + } + + # Register handle in cache for binding operations + # This ensures unbound handles are available for claim binding + await handle.register_definition(handle.definition) + await handle.manage(logger=logger) return {"status": "completed", "handle": handle.name} @@ -118,7 +139,7 @@ async def _manage_handle(definition: dict) -> dict: @app.task(bind=True, acks_late=True) def delete_handle(self, definition: dict, name: str, namespace: str): """Execute ResourceHandle.handle_delete() in a worker.""" - uid = definition['metadata']['uid'] + uid = definition["metadata"]["uid"] lock_key = f"resource_handle:{uid}" with distributed_lock(lock_key, timeout=60) as acquired: @@ -135,9 +156,9 @@ def delete_handle(self, definition: dict, name: str, namespace: str): def dispatch_delete_handle(definition: dict, name: str, namespace: str): """Dispatch delete_handle task with unique task_id.""" - uid = definition['metadata']['uid'] - rv = definition['metadata']['resourceVersion'] - kwargs = {'definition': definition, 'name': name, 'namespace': namespace} + uid = definition["metadata"]["uid"] + rv = definition["metadata"]["resourceVersion"] + kwargs = {"definition": definition, "name": name, "namespace": namespace} delete_handle.apply_async( kwargs=kwargs, task_id=f"handle-delete-{uid}-{rv}", @@ -146,9 +167,9 @@ def dispatch_delete_handle(definition: dict, name: str, namespace: str): def dispatch_manage_handle(definition: dict, name: str, 
namespace: str): """Dispatch manage_handle task. Always dispatches for operator events.""" - uid = definition['metadata']['uid'] - rv = definition['metadata']['resourceVersion'] - kwargs = {'definition': definition, 'name': name, 'namespace': namespace} + uid = definition["metadata"]["uid"] + rv = definition["metadata"]["resourceVersion"] + kwargs = {"definition": definition, "name": name, "namespace": namespace} manage_handle.apply_async( kwargs=kwargs, task_id=f"handle-{uid}-{rv}", @@ -158,7 +179,7 @@ def dispatch_manage_handle(definition: dict, name: str, namespace: str): @app.task(bind=True, acks_late=True) def manage_handle(self, definition: dict, name: str, namespace: str): """Execute ResourceHandle.manage() in a worker.""" - uid = definition['metadata']['uid'] + uid = definition["metadata"]["uid"] lock_key = f"resource_handle:{uid}" with distributed_lock(lock_key, timeout=60) as acquired: @@ -197,13 +218,17 @@ def maintain_all_handles(): return {"status": "completed", "total": 0, "batches": 0} # Split into batches and dispatch using group (distributes across workers) - batches = [handles[i:i + BATCH_SIZE] for i in range(0, len(handles), BATCH_SIZE)] + batches = [ + handles[i : i + BATCH_SIZE] for i in range(0, len(handles), BATCH_SIZE) + ] # Create group of batch tasks - Celery will distribute across available workers batch_group = group(process_handle_batch.s(batch) for batch in batches) batch_group.apply_async() - logger.info(f"Handle maintenance: {len(handles)} handles in {len(batches)} batches") + logger.info( + f"Handle maintenance: {len(handles)} handles in {len(batches)} batches" + ) return {"status": "dispatched", "total": len(handles), "batches": len(batches)} diff --git a/operator/tasks/resourcepool.py b/operator/tasks/resourcepool.py index 9027150..5fe424c 100644 --- a/operator/tasks/resourcepool.py +++ b/operator/tasks/resourcepool.py @@ -11,32 +11,41 @@ async def _delete_pool_handles(definition: dict) -> dict: """Async wrapper for ResourcePool.handle_delete().""" import resourcepool + pool = resourcepool.ResourcePool.from_definition(definition) await pool.handle_delete(logger=logger) return {"status": "completed", "pool": pool.name} async def _maintain_all_pools() -> dict: - """List all pools and dispatch manage_pool for each unprocessed.""" + """List all pools and dispatch manage_pool for each. + + Note: Uses timestamp (truncated to minute) instead of resourceVersion for task_id. + This allows periodic reprocessing even when resourceVersion hasn't changed. 
+ """ + import time + from poolboy import Poolboy + ts_minute = int(time.time() // 60) # One dispatch allowed per minute per pool + pool_list = await Poolboy.custom_objects_api.list_namespaced_custom_object( group=Poolboy.operator_domain, namespace=Poolboy.namespace, - plural='resourcepools', + plural="resourcepools", version=Poolboy.operator_version, ) dispatched = 0 - for item in pool_list.get('items', []): - uid = item['metadata']['uid'] - rv = item['metadata']['resourceVersion'] + for item in pool_list.get("items", []): + uid = item["metadata"]["uid"] kwargs = { - 'definition': item, - 'name': item['metadata']['name'], - 'namespace': item['metadata']['namespace'], + "definition": item, + "name": item["metadata"]["name"], + "namespace": item["metadata"]["namespace"], } - manage_pool.apply_async(kwargs=kwargs, task_id=f"pool-{uid}-{rv}") + # Use timestamp instead of resourceVersion to allow periodic reprocessing + manage_pool.apply_async(kwargs=kwargs, task_id=f"pool-sched-{uid}-{ts_minute}") dispatched += 1 return {"dispatched": dispatched} @@ -45,6 +54,7 @@ async def _maintain_all_pools() -> dict: async def _manage_pool(definition: dict) -> dict: """Async wrapper for ResourcePool.manage().""" import resourcepool + pool = resourcepool.ResourcePool.from_definition(definition) await pool.manage(logger=logger) return {"status": "completed", "pool": pool.name} @@ -55,7 +65,7 @@ def delete_pool_handles(self, definition: dict, name: str, namespace: str): """Execute ResourcePool.handle_delete() in a worker.""" from poolboy import Poolboy - uid = definition['metadata']['uid'] + uid = definition["metadata"]["uid"] lock_key = f"resource_pool:{uid}" with distributed_lock(lock_key, timeout=60) as acquired: @@ -68,14 +78,16 @@ def delete_pool_handles(self, definition: dict, name: str, namespace: str): return WorkerState.run_async(_delete_pool_handles(definition)) except Exception as e: logger.error(f"Pool {namespace}/{name} delete error: {e}") - raise self.retry(exc=e, countdown=Poolboy.workers_error_retry_countdown, max_retries=5) + raise self.retry( + exc=e, countdown=Poolboy.workers_error_retry_countdown, max_retries=5 + ) def dispatch_delete_pool_handles(definition: dict, name: str, namespace: str): """Dispatch delete_pool_handles task with unique task_id.""" - uid = definition['metadata']['uid'] - rv = definition['metadata']['resourceVersion'] - kwargs = {'definition': definition, 'name': name, 'namespace': namespace} + uid = definition["metadata"]["uid"] + rv = definition["metadata"]["resourceVersion"] + kwargs = {"definition": definition, "name": name, "namespace": namespace} delete_pool_handles.apply_async( kwargs=kwargs, task_id=f"pool-delete-{uid}-{rv}", @@ -84,9 +96,9 @@ def dispatch_delete_pool_handles(definition: dict, name: str, namespace: str): def dispatch_manage_pool(definition: dict, name: str, namespace: str): """Dispatch manage_pool task. 
Always dispatches for operator events.""" - uid = definition['metadata']['uid'] - rv = definition['metadata']['resourceVersion'] - kwargs = {'definition': definition, 'name': name, 'namespace': namespace} + uid = definition["metadata"]["uid"] + rv = definition["metadata"]["resourceVersion"] + kwargs = {"definition": definition, "name": name, "namespace": namespace} manage_pool.apply_async( kwargs=kwargs, task_id=f"pool-{uid}-{rv}", @@ -119,7 +131,7 @@ def manage_pool(self, definition: dict, name: str, namespace: str): """Execute ResourcePool.manage() in a worker.""" from poolboy import Poolboy - uid = definition['metadata']['uid'] + uid = definition["metadata"]["uid"] lock_key = f"resource_pool:{uid}" with distributed_lock(lock_key, timeout=60) as acquired: @@ -133,4 +145,6 @@ def manage_pool(self, definition: dict, name: str, namespace: str): return result except Exception as e: logger.error(f"Pool {namespace}/{name} error: {e}") - raise self.retry(exc=e, countdown=Poolboy.workers_error_retry_countdown, max_retries=5) + raise self.retry( + exc=e, countdown=Poolboy.workers_error_retry_countdown, max_retries=5 + ) diff --git a/test/ansible.cfg b/test/ansible.cfg index ed865bf..1943b71 100644 --- a/test/ansible.cfg +++ b/test/ansible.cfg @@ -1,2 +1,4 @@ [defaults] inventory = hosts +# Show task execution time +callbacks_enabled = profile_tasks diff --git a/test/dev-local.yaml b/test/dev-local.yaml new file mode 100644 index 0000000..2703305 --- /dev/null +++ b/test/dev-local.yaml @@ -0,0 +1,19 @@ +# Development environment variables for tests +# Usage: ansible-playbook playbook.yaml -e @dev-local.yaml +ansible_python_interpreter: "{{ ansible_playbook_python }}" + +# Operator namespace (where Poolboy is deployed) +poolboy_namespace: poolboy-dev + +# Test namespace (where test resources are created) +poolboy_test_namespace: poolboy-dev-test + +# Operator domain (CRD API group) +poolboy_domain: poolboy.gpte.redhat.com + +# Service account name +poolboy_service_account: poolboy + +# Cleanup test resources after tests +poolboy_test_cleanup: true + diff --git a/test/roles/poolboy_test_simple/tasks/test-ignore-01.yaml b/test/roles/poolboy_test_simple/tasks/test-ignore-01.yaml index 88d347c..9beea73 100644 --- a/test/roles/poolboy_test_simple/tasks/test-ignore-01.yaml +++ b/test/roles/poolboy_test_simple/tasks/test-ignore-01.yaml @@ -223,8 +223,10 @@ apiVersion: "{{ poolboy_domain }}/v1" kind: ResourceClaim metadata: - annotations: - poolboy.dev.local/resource-claim-init-timestamp: "1970-01-01T00:00:00Z" + annotations: >- + {{ { + poolboy_domain ~ "/resource-claim-init-timestamp": "1970-01-01T00:00:00Z" + } }} finalizers: - "{{ poolboy_domain }}" labels: >- @@ -266,7 +268,7 @@ "{{ poolboy_domain }}/test": "simple" }, "annotations": { - "poolboy.dev.local/resource-claim-init-timestamp": "1970-01-01T00:00:00Z" + "{{ poolboy_domain }}/resource-claim-init-timestamp": "1970-01-01T00:00:00Z" } }, "spec": { @@ -285,14 +287,14 @@ parameterValues: stringvar: one resourceHandle: - apiVersion: poolboy.dev.local/v1 + apiVersion: "{{ poolboy_domain }}/v1" kind: ResourceHandle name: guid-abcde namespace: poolboy-dev resources: - name: test-ignore-01 provider: - apiVersion: poolboy.dev.local/v1 + apiVersion: "{{ poolboy_domain }}/v1" kind: ResourceProvider name: test-ignore-01 namespace: poolboy-dev @@ -389,18 +391,20 @@ - name: Create ResourceHandle guid-abcde kubernetes.core.k8s: definition: - apiVersion: poolboy.dev.local/v1 + apiVersion: "{{ poolboy_domain }}/v1" kind: ResourceClaimTest metadata: - 
annotations: - poolboy.dev.local/resource-claim-name: test-ignore-01-b - poolboy.dev.local/resource-claim-namespace: "{{ poolboy_test_namespace }}" - poolboy.dev.local/resource-handle-name: guid-abcde - poolboy.dev.local/resource-handle-namespace: "{{ poolboy_namespace }}" - poolboy.dev.local/resource-handle-uid: 00000000-0000-0000-0000-000000000000 - poolboy.dev.local/resource-index: "0" - poolboy.dev.local/resource-provider-name: test-ignore-01 - poolboy.dev.local/resource-provider-namespace: "{{ poolboy_namespace }}" + annotations: >- + {{ { + poolboy_domain ~ "/resource-claim-name": "test-ignore-01-b", + poolboy_domain ~ "/resource-claim-namespace": poolboy_test_namespace, + poolboy_domain ~ "/resource-handle-name": "guid-abcde", + poolboy_domain ~ "/resource-handle-namespace": poolboy_namespace, + poolboy_domain ~ "/resource-handle-uid": "00000000-0000-0000-0000-000000000000", + poolboy_domain ~ "/resource-index": "0", + poolboy_domain ~ "/resource-provider-name": "test-ignore-01", + poolboy_domain ~ "/resource-provider-namespace": poolboy_namespace, + } }} name: test-ignore-01-abcde namespace: "{{ poolboy_test_namespace }}" spec: diff --git a/test/roles/poolboy_test_simple/tasks/test-lifespan-start-01.yaml b/test/roles/poolboy_test_simple/tasks/test-lifespan-start-01.yaml index 4dbfa89..c45bd4d 100644 --- a/test/roles/poolboy_test_simple/tasks/test-lifespan-start-01.yaml +++ b/test/roles/poolboy_test_simple/tasks/test-lifespan-start-01.yaml @@ -86,7 +86,7 @@ r_get_resource_claim.resources[0].status.resourceHandle.name is undefined or r_get_resource_claim.resources[0].status.resources[0].reference is undefined delay: 2 - retries: 45 + retries: 60 until: r_get_resource_claim is successful - name: Delete ResourceClaim test-lifespan-start-01 @@ -130,6 +130,6 @@ register: r_get_resource_claim_test failed_when: r_get_resource_claim_test.resources | length != 0 until: r_get_resource_claim_test is success - delay: 1 - retries: 10 + delay: 2 + retries: 30 ... 
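Note on the dispatch pattern introduced above: tasks/resourceclaim.py, tasks/resourcehandle.py, and tasks/resourcepool.py all build the Celery task_id for scheduled dispatch from the object UID plus a minute-truncated timestamp (claim-sched-{uid}-{ts_minute}, handle-sched-..., pool-sched-...), while event-driven dispatch keeps the {uid}-{resourceVersion} form. The following is a minimal sketch of that convention, not code from this patch; build_scheduled_task_id and build_event_task_id are illustrative names, and the sketch assumes identical task IDs are collapsed by whatever id-based deduplication the processor layer configures.

import time
from typing import Optional


def build_scheduled_task_id(prefix: str, definition: dict, now: Optional[float] = None) -> str:
    """Hypothetical helper: task_id like 'claim-sched-<uid>-<minute>'.

    Truncating the timestamp to the minute means repeated scheduler passes
    within the same minute produce the same id, so at most one dispatch per
    object per minute survives id-based deduplication.
    """
    uid = definition["metadata"]["uid"]
    ts_minute = int((time.time() if now is None else now) // 60)
    return f"{prefix}-sched-{uid}-{ts_minute}"


def build_event_task_id(prefix: str, definition: dict) -> str:
    """Hypothetical helper: task_id like 'claim-<uid>-<resourceVersion>' for operator events."""
    meta = definition["metadata"]
    return f"{prefix}-{meta['uid']}-{meta['resourceVersion']}"


if __name__ == "__main__":
    claim = {"metadata": {"uid": "1234", "resourceVersion": "42"}}
    # Two scheduler passes inside the same minute collapse to the same id;
    # a changed resourceVersion always yields a fresh event id.
    print(build_scheduled_task_id("claim", claim, now=120.0))  # claim-sched-1234-2
    print(build_scheduled_task_id("claim", claim, now=150.0))  # claim-sched-1234-2
    print(build_event_task_id("claim", claim))                 # claim-1234-42

This is what gives the "one dispatch allowed per minute per object" behaviour noted in the docstrings, without depending on resourceVersion changing between scheduler runs.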
diff --git a/test/roles/poolboy_test_simple/tasks/test-pool-03.yaml b/test/roles/poolboy_test_simple/tasks/test-pool-03.yaml index 3f45180..1efbf25 100644 --- a/test/roles/poolboy_test_simple/tasks/test-pool-03.yaml +++ b/test/roles/poolboy_test_simple/tasks/test-pool-03.yaml @@ -191,7 +191,7 @@ resources: - name: test-pool-03 provider: - apiVersion: poolboy.dev.local/v1 + apiVersion: "{{ poolboy_domain }}/v1" kind: ResourceProvider name: test-pool-03 namespace: poolboy-dev diff --git a/test/roles/poolboy_test_simple/tasks/test-ready-01.yaml b/test/roles/poolboy_test_simple/tasks/test-ready-01.yaml index ee50851..5e597ae 100644 --- a/test/roles/poolboy_test_simple/tasks/test-ready-01.yaml +++ b/test/roles/poolboy_test_simple/tasks/test-ready-01.yaml @@ -69,13 +69,18 @@ name: test-ready-01 namespace: "{{ poolboy_test_namespace }}" register: r_get_resource_claim + vars: + _lifespan_seconds: >- + {{ + ( + r_get_resource_claim.resources[0].status.lifespan.end | to_datetime("%Y-%m-%dT%H:%M:%SZ") - + r_get_resource_claim.resources[0].status.lifespan.start | to_datetime("%Y-%m-%dT%H:%M:%SZ") + ).total_seconds() | int + }} failed_when: >- r_get_resource_claim.resources[0].status.ready != false or r_get_resource_claim.resources[0].status.lifespan.firstReady is defined or - ( - r_get_resource_claim.resources[0].status.lifespan.end | to_datetime("%Y-%m-%dT%H:%M:%SZ") - - r_get_resource_claim.resources[0].status.lifespan.start | to_datetime("%Y-%m-%dT%H:%M:%SZ") - ).total_seconds() != 24 * 60 * 60 + (_lifespan_seconds | int) < 86398 or (_lifespan_seconds | int) > 86402 until: r_get_resource_claim is success delay: 1 retries: 10 @@ -104,16 +109,21 @@ name: test-ready-01 namespace: "{{ poolboy_test_namespace }}" register: r_get_resource_claim + vars: + _lifespan_seconds: >- + {{ + ( + r_get_resource_claim.resources[0].status.lifespan.end | to_datetime("%Y-%m-%dT%H:%M:%SZ") - + r_get_resource_claim.resources[0].status.lifespan.firstReady | to_datetime("%Y-%m-%dT%H:%M:%SZ") + ).total_seconds() | int + }} failed_when: >- r_get_resource_claim.resources[0].status.ready != true or r_get_resource_claim.resources[0].status.lifespan.firstReady is undefined or - ( - r_get_resource_claim.resources[0].status.lifespan.end | to_datetime("%Y-%m-%dT%H:%M:%SZ") - - r_get_resource_claim.resources[0].status.lifespan.firstReady | to_datetime("%Y-%m-%dT%H:%M:%SZ") - ).total_seconds() != 24 * 60 * 60 + (_lifespan_seconds | int) < 86398 or (_lifespan_seconds | int) > 86402 until: r_get_resource_claim is success - delay: 1 - retries: 10 + delay: 2 + retries: 45 - name: Delete ResourceClaim test-ready-01 kubernetes.core.k8s: diff --git a/test/roles/poolboy_test_simple/tasks/test-requester-01.yaml b/test/roles/poolboy_test_simple/tasks/test-requester-01.yaml index e4c548c..fd7178e 100644 --- a/test/roles/poolboy_test_simple/tasks/test-requester-01.yaml +++ b/test/roles/poolboy_test_simple/tasks/test-requester-01.yaml @@ -92,7 +92,7 @@ spec: resources: - provider: - apiVersion: poolboy.dev.local/v1 + apiVersion: "{{ poolboy_domain }}/v1" kind: ResourceProvider name: test-requester-01 namespace: poolboy-dev diff --git a/test/roles/poolboy_test_simple/tasks/test-vars-03.yaml b/test/roles/poolboy_test_simple/tasks/test-vars-03.yaml index 3129de4..ef4939f 100644 --- a/test/roles/poolboy_test_simple/tasks/test-vars-03.yaml +++ b/test/roles/poolboy_test_simple/tasks/test-vars-03.yaml @@ -190,8 +190,8 @@ failed_when: >- r_get_resource_claim.resources[0].status.resources[0].validationError is defined until: r_get_resource_claim is 
success - delay: 1 - retries: 10 + delay: 2 + retries: 35 - name: Verify update of ResourceClaimTest test-vars-03-a kubernetes.core.k8s_info: diff --git a/test/roles/poolboy_test_simple/tasks/test.yaml b/test/roles/poolboy_test_simple/tasks/test.yaml index eaf494e..27e6f37 100644 --- a/test/roles/poolboy_test_simple/tasks/test.yaml +++ b/test/roles/poolboy_test_simple/tasks/test.yaml @@ -525,6 +525,25 @@ retries: 10 delay: 1 +- name: Wait for all pool handles to be ready + kubernetes.core.k8s_info: + api_version: "{{ poolboy_domain }}/v1" + kind: ResourceHandle + namespace: "{{ poolboy_namespace }}" + label_selectors: + - "{{ poolboy_domain }}/resource-pool-name=test" + - "{{ poolboy_domain }}/resource-pool-namespace={{ poolboy_namespace }}" + register: r_get_pool_handles_ready + vars: + ready_handles: >- + {{ r_get_pool_handles_ready.resources | selectattr('status.ready', 'defined') | list }} + failed_when: >- + r_get_pool_handles_ready.resources | length != 3 or + ready_handles | length != 3 + until: r_get_pool_handles_ready is success + retries: 30 + delay: 2 + - name: Create ResourceClaim test-pool-match kubernetes.core.k8s: state: present @@ -774,7 +793,7 @@ __test_resource.spec.vars.desired_state | default('') != 'stopped' until: r_get_test_templated_1 is success delay: 5 - retries: 10 + retries: 30 - name: Delete resource claim test-templated-1 kubernetes.core.k8s: @@ -987,8 +1006,8 @@ __test_lifespan_3.status.lifespan.end is undefined or 23 != (__test_lifespan_3.status.lifespan.end | to_datetime('%Y-%m-%dT%H:%M:%S%z') - __test_lifespan_3.status.lifespan.start | to_datetime('%Y-%m-%dT%H:%M:%S%z')).total_seconds() until: r_get_test_lifespan_3 is success - delay: 5 - retries: 10 + delay: 2 + retries: 25 - name: Create test-disable-creation ResourceProvider kubernetes.core.k8s: @@ -1174,10 +1193,10 @@ spec: resources: - provider: - apiVersion: poolboy.dev.local/v1 + apiVersion: "{{ poolboy_domain }}/v1" kind: ResourceProvider name: test-base - namespace: "{{ poolboy_namespace}}" + namespace: "{{ poolboy_namespace }}" - name: Update ResourceClaimTest to set provision_vars for test-linked-2 kubernetes.core.k8s:
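Note on the locking pattern used by the worker tasks earlier in this patch: every manage_*/delete_* task wraps its body in "with distributed_lock(lock_key, timeout=60) as acquired:" and calls self.retry() when the lock is not acquired, so at most one worker mutates a given ResourceClaim, ResourceHandle, or ResourcePool at a time. The patch ships its own implementation in operator/distributed_lock.py (not shown in these hunks); the sketch below only illustrates how such a Redis-backed, token-guarded lock context manager is commonly written, assuming a plain redis client against a placeholder localhost:6379, and is not the code introduced here.

import uuid
from contextlib import contextmanager

import redis

# Illustrative only: operator/distributed_lock.py in this patch may differ.
_client = redis.Redis(host="localhost", port=6379, decode_responses=True)

# Release only if we still hold the lock (token match), atomically.
_RELEASE_LUA = """
if redis.call('get', KEYS[1]) == ARGV[1] then
    return redis.call('del', KEYS[1])
end
return 0
"""


@contextmanager
def distributed_lock(key: str, timeout: int = 60):
    """Yield True if the lock was acquired, False otherwise.

    SET NX EX gives mutual exclusion with an automatic expiry so a crashed
    worker cannot hold the lock forever; the random token prevents one worker
    from releasing a lock that another worker has since re-acquired.
    """
    token = uuid.uuid4().hex
    acquired = bool(_client.set(f"lock:{key}", token, nx=True, ex=timeout))
    try:
        yield acquired
    finally:
        if acquired:
            _client.eval(_RELEASE_LUA, 1, f"lock:{key}", token)


if __name__ == "__main__":
    with distributed_lock("resource_claim:1234", timeout=60) as acquired:
        if not acquired:
            print("lock busy; a Celery task would call self.retry() here")
        else:
            print("lock held; safe to run ResourceClaim.manage()")

The expiry (ex=timeout) bounds how long a crashed worker can block others, which matches the 60-second timeout the tasks pass to distributed_lock.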