diff --git a/build-template.yaml b/build-template.yaml index 3f0c168..8c193eb 100644 --- a/build-template.yaml +++ b/build-template.yaml @@ -28,6 +28,8 @@ objects: metadata: name: ${NAME} spec: + successfulBuildsHistoryLimit: 2 + failedBuildsHistoryLimit: 1 output: to: kind: ImageStreamTag diff --git a/helm/helm-vars-dev-standalone.yaml b/helm/helm-vars-dev-standalone.yaml new file mode 100644 index 0000000..78214f8 --- /dev/null +++ b/helm/helm-vars-dev-standalone.yaml @@ -0,0 +1,32 @@ +# Standalone mode for development +# Usage: helm template poolboy-dev helm -f helm/helm-vars-dev-standalone.yaml | oc apply -f - + +clusterDomain: apps-crc.testing + +# Standalone mode - single operator pod handles everything +operatorMode: standalone + +namespace: + name: poolboy-dev + create: false + +image: + repository: image-registry.openshift-image-registry.svc:5000/poolboy-dev/poolboy + tagOverride: latest + pullPolicy: Always + +# =========================================== +# Disable distributed components +# =========================================== +redis: + enabled: false + +worker: + enabled: false + +scheduler: + enabled: false + +flower: + enabled: false + diff --git a/helm/helm-vars-dev.yaml b/helm/helm-vars-dev.yaml new file mode 100644 index 0000000..cfa9651 --- /dev/null +++ b/helm/helm-vars-dev.yaml @@ -0,0 +1,73 @@ +# Development environment values for Poolboy +# Usage: helm template poolboy-dev helm -f helm/helm-vars-dev.yaml | oc apply -f - + +clusterDomain: apps-crc.testing + +# Use standalone mode for testing watches without Celery overhead +operatorMode: standalone + +namespace: + name: poolboy-dev + create: false + +image: + repository: image-registry.openshift-image-registry.svc:5000/poolboy-dev/poolboy + tagOverride: latest + pullPolicy: Always + +# =========================================== +# Redis Configuration +# =========================================== +redis: + enabled: true + +# =========================================== +# Worker Configuration +# =========================================== +worker: + enabled: true + replicas: 1 + hpa: + enabled: false + +# =========================================== +# Scheduler Configuration +# =========================================== +scheduler: + enabled: true + +# =========================================== +# Celery Flower - Task Monitoring UI +# =========================================== +flower: + enabled: true + route: + enabled: true + +# =========================================== +# Worker Settings (simplified) +# =========================================== +useWorkers: + lockRetryCountdown: 3 + errorRetryCountdown: 30 + partitions: + resourcePool: 2 + resourceHandle: 4 + resourceClaim: 4 + +# =========================================== +# Scheduled Tasks (safety net - watches handle real-time events) +# =========================================== +schedules: + maintain-all-pools: + enabled: true + schedule: + seconds: 30 # No PoolWatch yet, keep frequent + maintain-all-handles: + enabled: true + schedule: + seconds: 60 # Needs polling for lifespan.end triggers + maintain-all-claims: + enabled: true + schedule: + seconds: 60 # Same as old daemon interval diff --git a/helm/templates/_helpers.tpl b/helm/templates/_helpers.tpl index 78fe298..93a50ed 100644 --- a/helm/templates/_helpers.tpl +++ b/helm/templates/_helpers.tpl @@ -106,3 +106,21 @@ Define the image to deploy {{- printf "%s:v%s" .Values.image.repository .Chart.AppVersion -}} {{- end -}} {{- end -}} + +{{/* +Determine if operator is running in standalone 
mode. +Backward compatibility mapping: + - 'all-in-one' -> standalone (true) + - 'standalone' -> standalone (true) + - 'manager', 'resource-handler', 'resource-watch' -> distributed (false) + - 'distributed' -> distributed (false) + - any other value -> distributed (false) +*/}} +{{- define "poolboy.isStandalone" -}} +{{- $mode := .Values.operatorMode | default "distributed" -}} +{{- if or (eq $mode "standalone") (eq $mode "all-in-one") -}} +true +{{- else -}} +false +{{- end -}} +{{- end -}} diff --git a/helm/templates/deployment.yaml b/helm/templates/deployment.yaml index cab8c4c..ff4ace2 100644 --- a/helm/templates/deployment.yaml +++ b/helm/templates/deployment.yaml @@ -6,51 +6,43 @@ metadata: namespace: {{ include "poolboy.namespaceName" . }} labels: {{- include "poolboy.labels" . | nindent 4 }} + app.kubernetes.io/component: operator spec: replicas: {{ .Values.replicaCount }} selector: matchLabels: {{- include "poolboy.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: operator strategy: type: Recreate template: metadata: labels: {{- include "poolboy.selectorLabels" . | nindent 8 }} + app.kubernetes.io/component: operator spec: containers: - name: manager env: + - name: KOPF_OPERATORS + value: main.py - name: CLUSTER_DOMAIN value: "{{ .Values.clusterDomain }}" - - name: MANAGE_CLAIMS_INTERVAL - value: "{{ .Values.manageClaimsInterval }}" - - name: MANAGE_HANDLES_INTERVAL - value: "{{ .Values.manageHandlesInterval }}" - - name: MANAGE_POOLS_INTERVAL - value: "{{ .Values.managePoolsInterval }}" + - name: IS_STANDALONE + value: {{ include "poolboy.isStandalone" . | quote }} - name: OPERATOR_DOMAIN value: {{ include "poolboy.operatorDomain" . }} - - name: OPERATOR_MODE - value: "{{ .Values.operatorMode }}" - {{- if eq .Values.operatorMode "manager" }} - - name: RESOURCE_HANDLER_COUNT - value: "{{ .Values.resourceHandlerCount }}" - {{- if .Values.resourceHandlerResources }} - - name: RESOURCE_HANDLER_RESOURCES - value: {{ .Values.resourceHandlerResources | toJson | quote }} - {{- end }} - {{- if .Values.resourceWatchResources }} - - name: RESOURCE_WATCH_RESOURCES - value: {{ .Values.resourceWatchResources | toJson | quote }} - {{- end }} - {{- end }} {{- if .Values.enablePrometheusMetrics}} - name: METRICS_ENABLED value: "true" {{- end }} - name: RESOURCE_REFRESH_INTERVAL value: "{{ .Values.resourceRefreshInterval }}" + {{- if .Values.worker.enabled }} + envFrom: + - configMapRef: + name: {{ include "poolboy.name" . }}-useworkers-cm + {{- end }} image: "{{ include "poolboy.image" . }}" imagePullPolicy: {{ .Values.image.pullPolicy }} resources: @@ -64,7 +56,7 @@ spec: - name: kopf containerPort: 8080 - name: metrics - containerPort: 9091 + containerPort: 9090 {{- with .Values.imagePullSecrets }} imagePullSecrets: {{- toYaml . | nindent 8 }} diff --git a/helm/templates/metrics-credentials.yaml b/helm/templates/metrics-credentials.yaml index 9172b6d..4f7f9c8 100644 --- a/helm/templates/metrics-credentials.yaml +++ b/helm/templates/metrics-credentials.yaml @@ -1,19 +1,19 @@ {{- if .Values.enablePrometheusMetrics -}} -apiVersion: secretgenerator.mittwald.de/v1alpha1 -kind: StringSecret +{{- $secretName := printf "%s-metrics-credentials" (include "poolboy.name" .) }} +{{- $existingSecret := lookup "v1" "Secret" (include "poolboy.namespaceName" .) $secretName }} +apiVersion: v1 +kind: Secret metadata: - name: {{ include "poolboy.name" . }}-metrics-credentials + name: {{ $secretName }} namespace: {{ include "poolboy.namespaceName" . }} labels: {{- include "poolboy.labels" . 
| nindent 4 }} - annotations: - secret-generator.v1.mittwald.de/type: basic-auth -spec: - forceRegenerate: false - data: - metrics_username: {{ .Values.metrics.username }} - fields: - - fieldName: metrics_password - encoding: "hex" - length: "32" +type: Opaque +data: + metrics_username: {{ .Values.metrics.username | b64enc }} + {{- if and $existingSecret $existingSecret.data.metrics_password }} + metrics_password: {{ $existingSecret.data.metrics_password }} + {{- else }} + metrics_password: {{ randAlphaNum 32 | b64enc }} + {{- end }} {{- end }} diff --git a/helm/templates/service-monitor.yaml b/helm/templates/service-monitor.yaml index 6273228..25cb9ca 100644 --- a/helm/templates/service-monitor.yaml +++ b/helm/templates/service-monitor.yaml @@ -10,6 +10,7 @@ spec: selector: matchLabels: {{- include "poolboy.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: operator namespaceSelector: matchNames: - {{ include "poolboy.namespaceName" . }} diff --git a/helm/templates/service.yaml b/helm/templates/service.yaml index 1424f83..4b08454 100644 --- a/helm/templates/service.yaml +++ b/helm/templates/service.yaml @@ -6,6 +6,7 @@ metadata: namespace: {{ include "poolboy.namespaceName" . }} labels: {{- include "poolboy.labels" . | nindent 4 }} + app.kubernetes.io/component: operator spec: type: {{ .Values.service.type }} {{- with .Values.service.ports }} @@ -14,5 +15,6 @@ spec: {{- end }} selector: {{- include "poolboy.selectorLabels" . | nindent 4 }} + app.kubernetes.io/component: operator sessionAffinity: None {{- end -}} diff --git a/helm/templates/worker/flower-credentials.yaml b/helm/templates/worker/flower-credentials.yaml new file mode 100644 index 0000000..8357a59 --- /dev/null +++ b/helm/templates/worker/flower-credentials.yaml @@ -0,0 +1,20 @@ +{{- if and .Values.flower.enabled .Values.flower.auth.enabled }} +{{- $secretName := printf "%s-flower-credentials" (include "poolboy.name" .) }} +{{- $existingSecret := lookup "v1" "Secret" (include "poolboy.namespaceName" .) $secretName }} +apiVersion: v1 +kind: Secret +metadata: + name: {{ $secretName }} + namespace: {{ include "poolboy.namespaceName" . }} + labels: + {{- include "poolboy.labels" . | nindent 4 }} + app.kubernetes.io/component: flower +type: Opaque +data: + username: {{ .Values.flower.auth.username | default "admin" | b64enc }} + {{- if and $existingSecret $existingSecret.data.password }} + password: {{ $existingSecret.data.password }} + {{- else }} + password: {{ randAlphaNum 32 | b64enc }} + {{- end }} +{{- end }} diff --git a/helm/templates/worker/flower-deployment.yaml b/helm/templates/worker/flower-deployment.yaml new file mode 100644 index 0000000..2729b1e --- /dev/null +++ b/helm/templates/worker/flower-deployment.yaml @@ -0,0 +1,101 @@ +{{- if .Values.flower.enabled }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "poolboy.name" . }}-flower + namespace: {{ include "poolboy.namespaceName" . }} + labels: + {{- include "poolboy.labels" . | nindent 4 }} + app.kubernetes.io/component: flower + annotations: + reloader.stakater.com/auto: "true" +spec: + replicas: {{ .Values.flower.replicas | default 1 }} + selector: + matchLabels: + {{- include "poolboy.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: flower + template: + metadata: + labels: + {{- include "poolboy.selectorLabels" . 
| nindent 8 }} + app.kubernetes.io/component: flower + spec: + containers: + - name: flower + image: "{{ .Values.flower.image.repository }}:{{ .Values.flower.image.tag }}" + imagePullPolicy: {{ .Values.flower.image.pullPolicy | default "IfNotPresent" }} + command: ["celery"] + args: + - "--broker={{ printf "redis://%s-redis:6379/0" (include "poolboy.name" .) }}" + - "--result-backend={{ printf "redis://%s-redis:6379/1" (include "poolboy.name" .) }}" + - "flower" + - "--port={{ .Values.flower.port | default 5555 }}" + {{- if .Values.flower.auth.enabled }} + - "--basic-auth=$(FLOWER_USERNAME):$(FLOWER_PASSWORD)" + {{- end }} + {{- if .Values.flower.config.maxTasks }} + - "--max-tasks={{ .Values.flower.config.maxTasks }}" + {{- end }} + {{- if .Values.flower.config.purgeOfflineWorkers }} + - "--purge_offline_workers={{ .Values.flower.config.purgeOfflineWorkers }}" + {{- end }} + {{- if .Values.flower.persistence.enabled }} + - "--persistent=true" + - "--db=/data/flower.db" + {{- end }} + ports: + - name: flower + containerPort: {{ .Values.flower.port | default 5555 }} + envFrom: + {{- with .Values.flower.extraEnvFrom }} + {{- toYaml . | nindent 12 }} + {{- end }} + env: + {{- range $key, $value := .Values.flower.extraEnvVars }} + - name: {{ $key }} + value: {{ $value | quote }} + {{- end }} + {{- if .Values.flower.auth.enabled }} + - name: FLOWER_USERNAME + valueFrom: + secretKeyRef: + name: {{ include "poolboy.name" . }}-flower-credentials + key: username + - name: FLOWER_PASSWORD + valueFrom: + secretKeyRef: + name: {{ include "poolboy.name" . }}-flower-credentials + key: password + {{- end }} + livenessProbe: + httpGet: + path: /healthcheck + port: flower + initialDelaySeconds: 10 + periodSeconds: 30 + readinessProbe: + httpGet: + path: /healthcheck + port: flower + initialDelaySeconds: 5 + periodSeconds: 10 + resources: + {{- toYaml .Values.flower.resources | nindent 12 }} + volumeMounts: + - name: flower-data + mountPath: /data + volumes: + - name: flower-data + {{- if .Values.flower.persistence.enabled }} + persistentVolumeClaim: + claimName: {{ include "poolboy.name" . }}-flower-pvc + {{- else }} + emptyDir: {} + {{- end }} + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "poolboy.serviceAccountName" . }} +{{- end }} diff --git a/helm/templates/worker/flower-pvc.yaml b/helm/templates/worker/flower-pvc.yaml new file mode 100644 index 0000000..4351cd4 --- /dev/null +++ b/helm/templates/worker/flower-pvc.yaml @@ -0,0 +1,19 @@ +{{- if and .Values.flower.enabled .Values.flower.persistence.enabled }} +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ include "poolboy.name" . }}-flower-pvc + namespace: {{ include "poolboy.namespaceName" . }} + labels: + {{- include "poolboy.labels" . | nindent 4 }} + app.kubernetes.io/component: flower +spec: + accessModes: + - ReadWriteOnce + {{- if .Values.flower.persistence.storageClass }} + storageClassName: {{ .Values.flower.persistence.storageClass }} + {{- end }} + resources: + requests: + storage: {{ .Values.flower.persistence.size }} +{{- end }} diff --git a/helm/templates/worker/flower-route.yaml b/helm/templates/worker/flower-route.yaml new file mode 100644 index 0000000..85ee023 --- /dev/null +++ b/helm/templates/worker/flower-route.yaml @@ -0,0 +1,21 @@ +{{- if and .Values.flower.enabled .Values.flower.route.enabled }} +apiVersion: route.openshift.io/v1 +kind: Route +metadata: + name: {{ include "poolboy.name" . 
}}-flower + namespace: {{ include "poolboy.namespaceName" . }} + labels: + {{- include "poolboy.labels" . | nindent 4 }} + app.kubernetes.io/component: flower +spec: + to: + kind: Service + name: {{ include "poolboy.name" . }}-flower + weight: 100 + port: + targetPort: flower + tls: + termination: edge + insecureEdgeTerminationPolicy: Redirect + wildcardPolicy: None +{{- end }} diff --git a/helm/templates/worker/flower-service.yaml b/helm/templates/worker/flower-service.yaml new file mode 100644 index 0000000..eaead65 --- /dev/null +++ b/helm/templates/worker/flower-service.yaml @@ -0,0 +1,21 @@ +{{- if .Values.flower.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "poolboy.name" . }}-flower + namespace: {{ include "poolboy.namespaceName" . }} + labels: + {{- include "poolboy.labels" . | nindent 4 }} + app.kubernetes.io/component: flower +spec: + type: ClusterIP + ports: + - name: flower + port: {{ .Values.flower.port | default 5555 }} + protocol: TCP + targetPort: flower + selector: + {{- include "poolboy.selectorLabels" . | nindent 4 }} + app.kubernetes.io/component: flower +{{- end }} + diff --git a/helm/templates/worker/redis-deployment.yaml b/helm/templates/worker/redis-deployment.yaml new file mode 100644 index 0000000..aed3309 --- /dev/null +++ b/helm/templates/worker/redis-deployment.yaml @@ -0,0 +1,58 @@ +{{- if or .Values.redis.enabled (eq .Values.operatorMode "distributed") }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "poolboy.name" . }}-redis + namespace: {{ include "poolboy.namespaceName" . }} + labels: + {{- include "poolboy.labels" . | nindent 4 }} + app.kubernetes.io/component: redis +spec: + replicas: 1 + selector: + matchLabels: + {{- include "poolboy.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: redis + strategy: + type: Recreate + template: + metadata: + labels: + {{- include "poolboy.selectorLabels" . | nindent 8 }} + app.kubernetes.io/component: redis + spec: + containers: + - name: redis + image: "{{ .Values.redis.image.repository }}:{{ .Values.redis.image.tag }}" + imagePullPolicy: {{ .Values.redis.image.pullPolicy }} + ports: + - containerPort: 6379 + name: redis + command: + - redis-server + - --appendonly + - "yes" + resources: + {{- toYaml .Values.redis.resources | nindent 12 }} + {{- if .Values.redis.persistence.enabled }} + volumeMounts: + - name: redis-data + mountPath: /data + {{- end }} + livenessProbe: + tcpSocket: + port: 6379 + initialDelaySeconds: 5 + periodSeconds: 10 + readinessProbe: + tcpSocket: + port: 6379 + initialDelaySeconds: 5 + periodSeconds: 5 + {{- if .Values.redis.persistence.enabled }} + volumes: + - name: redis-data + persistentVolumeClaim: + claimName: {{ include "poolboy.name" . }}-redis-pvc + {{- end }} +{{- end }} diff --git a/helm/templates/worker/redis-pvc.yaml b/helm/templates/worker/redis-pvc.yaml new file mode 100644 index 0000000..2973ba1 --- /dev/null +++ b/helm/templates/worker/redis-pvc.yaml @@ -0,0 +1,20 @@ +{{- if and (or .Values.redis.enabled (eq .Values.operatorMode "distributed")) .Values.redis.persistence.enabled }} +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ include "poolboy.name" . }}-redis-pvc + namespace: {{ include "poolboy.namespaceName" . }} + labels: + {{- include "poolboy.labels" . 
| nindent 4 }} + app.kubernetes.io/component: redis +spec: + accessModes: + - ReadWriteOnce + {{- if .Values.redis.persistence.storageClass }} + storageClassName: {{ .Values.redis.persistence.storageClass }} + {{- end }} + resources: + requests: + storage: {{ .Values.redis.persistence.size }} +{{- end }} + diff --git a/helm/templates/worker/redis-service.yaml b/helm/templates/worker/redis-service.yaml new file mode 100644 index 0000000..ceda039 --- /dev/null +++ b/helm/templates/worker/redis-service.yaml @@ -0,0 +1,20 @@ +{{- if or .Values.redis.enabled (eq .Values.operatorMode "distributed") }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "poolboy.name" . }}-redis + namespace: {{ include "poolboy.namespaceName" . }} + labels: + {{- include "poolboy.labels" . | nindent 4 }} + app.kubernetes.io/component: redis +spec: + type: ClusterIP + ports: + - port: 6379 + targetPort: 6379 + protocol: TCP + name: redis + selector: + {{- include "poolboy.selectorLabels" . | nindent 4 }} + app.kubernetes.io/component: redis +{{- end }} diff --git a/helm/templates/worker/scheduler-cm.yaml b/helm/templates/worker/scheduler-cm.yaml new file mode 100644 index 0000000..39b6d10 --- /dev/null +++ b/helm/templates/worker/scheduler-cm.yaml @@ -0,0 +1,16 @@ +{{- /* Scheduler ConfigMap - auto-enabled when operatorMode is 'distributed' */ -}} +{{- if or .Values.scheduler.enabled (eq .Values.operatorMode "distributed") }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "poolboy.name" . }}-scheduler-cm + namespace: {{ include "poolboy.namespaceName" . }} + labels: + {{- include "poolboy.labels" . | nindent 4 }} + app.kubernetes.io/component: scheduler-cm +data: + schedule_config.yaml: | + schedules: + {{- toYaml .Values.schedules | nindent 6 }} +{{- end }} + diff --git a/helm/templates/worker/scheduler-deployment.yaml b/helm/templates/worker/scheduler-deployment.yaml new file mode 100644 index 0000000..11aa836 --- /dev/null +++ b/helm/templates/worker/scheduler-deployment.yaml @@ -0,0 +1,92 @@ +{{- /* Scheduler deployment - auto-enabled when operatorMode is 'distributed' */ -}} +{{- if or .Values.scheduler.enabled (eq .Values.operatorMode "distributed") }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "poolboy.name" . }}-scheduler + namespace: {{ include "poolboy.namespaceName" . }} + labels: + {{- include "poolboy.labels" . | nindent 4 }} + app.kubernetes.io/component: scheduler + annotations: + reloader.stakater.com/auto: "true" +spec: + replicas: 1 + selector: + matchLabels: + {{- include "poolboy.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: scheduler + strategy: + type: Recreate + template: + metadata: + labels: + {{- include "poolboy.selectorLabels" . | nindent 8 }} + app.kubernetes.io/component: scheduler + spec: + containers: + - name: scheduler + image: "{{ include "poolboy.image" . }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + workingDir: /opt/app-root/operator + command: ["celery"] + args: + - "-A" + - "processor.app" + - "beat" + - "--loglevel={{ .Values.scheduler.config.logging.level | lower }}" + - "--schedule=/tmp/celerybeat-schedule" + envFrom: + - configMapRef: + name: {{ include "poolboy.name" . }}-worker-cm + {{- with .Values.scheduler.extraEnvFrom }} + {{- toYaml . 
| nindent 12 }} + {{- end }} + env: + - name: PYTHONPATH + value: /opt/app-root/operator + {{- range $key, $value := .Values.scheduler.extraEnvVars }} + - name: {{ $key }} + value: {{ $value | quote }} + {{- end }} + - name: CELERY_SCHEDULER_ENABLED + value: "true" + - name: CELERY_SCHEDULE_CONFIG + value: "/etc/poolboy/schedule_config.yaml" + - name: PROMETHEUS_MULTIPROC_DIR + value: "" + - name: WORKER_METRICS_ENABLED + value: "false" + resources: + {{- toYaml .Values.scheduler.resources | nindent 12 }} + volumeMounts: + - name: schedule-data + mountPath: /tmp + - name: scheduler-cm + mountPath: /etc/poolboy + readOnly: true + volumes: + - name: schedule-data + emptyDir: {} + - name: scheduler-cm + configMap: + name: {{ include "poolboy.name" . }}-scheduler-cm + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "poolboy.serviceAccountName" . }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} +{{- end }} + diff --git a/helm/templates/worker/useworkers-cm.yaml b/helm/templates/worker/useworkers-cm.yaml new file mode 100644 index 0000000..5911eda --- /dev/null +++ b/helm/templates/worker/useworkers-cm.yaml @@ -0,0 +1,33 @@ +{{- /* UseWorkers ConfigMap - auto-enabled when not standalone */ -}} +{{- if or .Values.worker.enabled (ne (include "poolboy.isStandalone" .) "true") }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "poolboy.name" . }}-useworkers-cm + namespace: {{ include "poolboy.namespaceName" . }} + labels: + {{- include "poolboy.labels" . | nindent 4 }} + app.kubernetes.io/component: useworkers-cm +data: + # Redis URL (base URL, components append DB number: /0, /1, /2, /3) + REDIS_URL: {{ printf "redis://%s-redis:6379" (include "poolboy.name" .) | quote }} + # Celery config for sending tasks from operator + CELERY_BROKER_URL: {{ printf "redis://%s-redis:6379/0" (include "poolboy.name" .) | quote }} + CELERY_RESULT_BACKEND: {{ printf "redis://%s-redis:6379/1" (include "poolboy.name" .) | quote }} + # Lock retry delay (seconds) - used when resource is locked by another task + WORKERS_LOCK_RETRY_COUNTDOWN: {{ .Values.useWorkers.lockRetryCountdown | default 3 | quote }} + # Error retry delay (seconds) - used when task fails with an error + WORKERS_ERROR_RETRY_COUNTDOWN: {{ .Values.useWorkers.errorRetryCountdown | default 30 | quote }} + # Partitions for Celery queue routing (load distribution) + {{- with .Values.useWorkers.partitions }} + {{- if .resourcePool }} + PARTITION_RESOURCE_POOL: {{ .resourcePool | quote }} + {{- end }} + {{- if .resourceHandle }} + PARTITION_RESOURCE_HANDLE: {{ .resourceHandle | quote }} + {{- end }} + {{- if .resourceClaim }} + PARTITION_RESOURCE_CLAIM: {{ .resourceClaim | quote }} + {{- end }} + {{- end }} +{{- end }} diff --git a/helm/templates/worker/worker-cm.yaml b/helm/templates/worker/worker-cm.yaml new file mode 100644 index 0000000..b192274 --- /dev/null +++ b/helm/templates/worker/worker-cm.yaml @@ -0,0 +1,43 @@ +{{- /* Worker ConfigMap - auto-enabled when operatorMode is 'distributed' */ -}} +{{- if or .Values.worker.enabled (eq .Values.operatorMode "distributed") }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "poolboy.name" . }}-worker-cm + namespace: {{ include "poolboy.namespaceName" . 
}} + labels: + {{- include "poolboy.labels" . | nindent 4 }} + app.kubernetes.io/component: worker-cm +data: + # Redis URL (base URL, components append DB number: /0, /1, /2, /3) + REDIS_URL: {{ printf "redis://%s-redis:6379" (include "poolboy.name" .) | quote }} + CELERY_BROKER_URL: {{ printf "redis://%s-redis:6379/0" (include "poolboy.name" .) | quote }} + CELERY_RESULT_BACKEND: {{ printf "redis://%s-redis:6379/1" (include "poolboy.name" .) | quote }} + # Celery configuration (from worker.config) + {{- range $key, $value := .Values.worker.config }} + CELERY_{{ $key | upper }}: {{ $value | quote }} + {{- end }} + # Partition configuration (from useWorkers) + {{- with .Values.useWorkers.partitions }} + {{- if .resourcePool }} + PARTITION_RESOURCE_POOL: {{ .resourcePool | quote }} + {{- end }} + {{- if .resourceHandle }} + PARTITION_RESOURCE_HANDLE: {{ .resourceHandle | quote }} + {{- end }} + {{- if .resourceClaim }} + PARTITION_RESOURCE_CLAIM: {{ .resourceClaim | quote }} + {{- end }} + {{- end }} + # Operator configuration + WORKER: "true" + CLUSTER_DOMAIN: {{ .Values.clusterDomain | quote }} + OPERATOR_DOMAIN: {{ include "poolboy.operatorDomain" . | quote }} + RESOURCE_REFRESH_INTERVAL: {{ .Values.resourceRefreshInterval | quote }} + # Metrics configuration + {{- if .Values.worker.metrics.enabled }} + PROMETHEUS_MULTIPROC_DIR: "/tmp/prometheus_metrics" + WORKER_METRICS_ENABLED: "true" + WORKER_METRICS_PORT: {{ .Values.worker.metrics.port | default 9090 | quote }} + {{- end }} +{{- end }} diff --git a/helm/templates/worker/worker-deployment.yaml b/helm/templates/worker/worker-deployment.yaml new file mode 100644 index 0000000..7fd0b4b --- /dev/null +++ b/helm/templates/worker/worker-deployment.yaml @@ -0,0 +1,94 @@ +{{- /* Worker deployment - auto-enabled when operatorMode is 'distributed' */ -}} +{{- if or .Values.worker.enabled (eq .Values.operatorMode "distributed") }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ include "poolboy.name" . }}-worker + namespace: {{ include "poolboy.namespaceName" . }} + labels: + {{- include "poolboy.labels" . | nindent 4 }} + app.kubernetes.io/component: worker + annotations: + reloader.stakater.com/auto: "true" +spec: + {{- if not .Values.worker.hpa.enabled }} + replicas: {{ .Values.worker.replicas }} + {{- end }} + selector: + matchLabels: + {{- include "poolboy.selectorLabels" . | nindent 6 }} + app.kubernetes.io/component: worker + template: + metadata: + labels: + {{- include "poolboy.selectorLabels" . | nindent 8 }} + app.kubernetes.io/component: worker + spec: + containers: + - name: worker + image: "{{ include "poolboy.image" . }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + workingDir: /opt/app-root/operator + command: ["celery"] + args: + - "-A" + - "processor.app" + - "worker" + {{- with .Values.worker.args }} + - "--loglevel={{ .loglevel | default "info" }}" + - "--concurrency={{ .concurrency | default 4 }}" + - "--pool={{ .pool | default "prefork" }}" + - "--max-tasks-per-child={{ .maxTasksPerChild | default 100 }}" + {{- end }} + {{- if .Values.worker.metrics.enabled }} + ports: + - name: worker-metrics + containerPort: {{ .Values.worker.metrics.port | default 9090 }} + {{- end }} + envFrom: + - configMapRef: + name: {{ include "poolboy.name" . }}-worker-cm + {{- with .Values.worker.extraEnvFrom }} + {{- toYaml . 
| nindent 12 }} + {{- end }} + env: + - name: PYTHONPATH + value: /opt/app-root/operator + {{- range $key, $value := .Values.worker.extraEnvVars }} + - name: {{ $key }} + value: {{ $value | quote }} + {{- end }} + - name: OPERATOR_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + {{- if .Values.worker.metrics.enabled }} + volumeMounts: + - name: prometheus-metrics + mountPath: /tmp/prometheus_metrics + {{- end }} + resources: + {{- toYaml .Values.worker.resources | nindent 12 }} + {{- if .Values.worker.metrics.enabled }} + volumes: + - name: prometheus-metrics + emptyDir: {} + {{- end }} + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "poolboy.serviceAccountName" . }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} +{{- end }} diff --git a/helm/templates/worker/worker-hpa.yaml b/helm/templates/worker/worker-hpa.yaml new file mode 100644 index 0000000..056de98 --- /dev/null +++ b/helm/templates/worker/worker-hpa.yaml @@ -0,0 +1,51 @@ +{{- /* Worker HPA - auto-enabled when operatorMode is 'distributed' */ -}} +{{- if and (or .Values.worker.enabled (eq .Values.operatorMode "distributed")) .Values.worker.hpa.enabled }} +apiVersion: autoscaling/v2 +kind: HorizontalPodAutoscaler +metadata: + name: {{ include "poolboy.name" . }}-worker + namespace: {{ include "poolboy.namespaceName" . }} + labels: + {{- include "poolboy.labels" . | nindent 4 }} + app.kubernetes.io/component: worker +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: {{ include "poolboy.name" . }}-worker + minReplicas: {{ .Values.worker.hpa.minReplicas }} + maxReplicas: {{ .Values.worker.hpa.maxReplicas }} + metrics: + - type: Resource + resource: + name: cpu + target: + type: Utilization + averageUtilization: {{ .Values.worker.hpa.targetCPUUtilizationPercentage }} + {{- if .Values.worker.hpa.targetMemoryUtilizationPercentage }} + - type: Resource + resource: + name: memory + target: + type: Utilization + averageUtilization: {{ .Values.worker.hpa.targetMemoryUtilizationPercentage }} + {{- end }} + behavior: + scaleDown: + stabilizationWindowSeconds: {{ .Values.worker.hpa.scaleDownStabilizationWindowSeconds | default 300 }} + policies: + - type: Pods + value: 1 + periodSeconds: 120 + scaleUp: + stabilizationWindowSeconds: 0 + policies: + - type: Pods + value: 2 + periodSeconds: 60 + - type: Percent + value: 100 + periodSeconds: 60 + selectPolicy: Max +{{- end }} + diff --git a/helm/templates/worker/worker-service-monitor.yaml b/helm/templates/worker/worker-service-monitor.yaml new file mode 100644 index 0000000..a9ec30f --- /dev/null +++ b/helm/templates/worker/worker-service-monitor.yaml @@ -0,0 +1,31 @@ +{{- /* Worker ServiceMonitor - auto-enabled when operatorMode is 'distributed' */ -}} +{{- if and (or .Values.worker.enabled (eq .Values.operatorMode "distributed")) .Values.worker.metrics.enabled .Values.enablePrometheusMetrics }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "poolboy.name" . }}-worker-metrics + namespace: {{ include "poolboy.namespaceName" . }} + labels: + {{- include "poolboy.labels" . | nindent 4 }} + app.kubernetes.io/component: worker +spec: + selector: + matchLabels: + {{- include "poolboy.selectorLabels" . 
| nindent 6 }} + app.kubernetes.io/component: worker + namespaceSelector: + matchNames: + - {{ include "poolboy.namespaceName" . }} + endpoints: + - port: worker-metrics + interval: "30s" + path: {{ .Values.worker.metrics.path | default "/metrics" }} + basicAuth: + username: + name: {{ include "poolboy.name" . }}-metrics-credentials + key: metrics_username + password: + name: {{ include "poolboy.name" . }}-metrics-credentials + key: metrics_password +{{- end }} + diff --git a/helm/templates/worker/worker-service.yaml b/helm/templates/worker/worker-service.yaml new file mode 100644 index 0000000..be2f409 --- /dev/null +++ b/helm/templates/worker/worker-service.yaml @@ -0,0 +1,22 @@ +{{- /* Worker Service - auto-enabled when operatorMode is 'distributed' */ -}} +{{- if and (or .Values.worker.enabled (eq .Values.operatorMode "distributed")) .Values.worker.metrics.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "poolboy.name" . }}-worker + namespace: {{ include "poolboy.namespaceName" . }} + labels: + {{- include "poolboy.labels" . | nindent 4 }} + app.kubernetes.io/component: worker +spec: + type: ClusterIP + ports: + - name: worker-metrics + port: {{ .Values.worker.metrics.port | default 9090 }} + protocol: TCP + targetPort: worker-metrics + selector: + {{- include "poolboy.selectorLabels" . | nindent 4 }} + app.kubernetes.io/component: worker +{{- end }} + diff --git a/helm/values.yaml b/helm/values.yaml index af477de..653d789 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -20,27 +20,11 @@ operatorDomain: # If not set and create is true, a name is generated using the operatorDomain template name: -operatorMode: manager +# Operator mode: 'standalone' (single operator) or 'distributed' (with Celery workers) +# Backward compatibility: 'all-in-one' maps to 'standalone', 'manager' maps to 'distributed' +operatorMode: distributed -manageClaimsInterval: 60 -manageHandlesInterval: 60 -managePoolsInterval: 10 resourceRefreshInterval: 600 -resourceHandlerCount: 1 -resourceHandlerResources: - limits: - cpu: 1000m - memory: 256Mi - requests: - cpu: 100m - memory: 128Mi -resourceWatchResources: - limits: - cpu: 1000m - memory: 256Mi - requests: - cpu: 100m - memory: 128Mi anarchy: # Control whether anarchy integration should be created @@ -74,9 +58,9 @@ service: type: ClusterIP ports: - name: metrics - port: 9091 + port: 9090 protocol: TCP - targetPort: 9091 + targetPort: 9090 resources: {} # We usually recommend not to specify default resources and to leave this as a conscious @@ -104,3 +88,190 @@ enablePrometheusMetrics: true # Metrics Credentials metrics: username: metrics + +# =========================================== +# Worker Feature Flags +# Enable/disable async processing per resource type +# =========================================== +useWorkers: + # Retry delay (seconds) when a task cannot acquire the distributed lock. + # Lower values = faster retry but more Redis load. + # Higher values = slower retry but less Redis load. + lockRetryCountdown: 3 + + # Retry delay (seconds) when a task fails due to a real error (not lock contention). + # Tasks will retry up to 5 times with this delay between attempts. + # After 5 retries, the task fails permanently. 
+ errorRetryCountdown: 30 + + # Partitions for Celery queue routing (load distribution across workers) + # Higher values = more parallelism but more queues to manage + partitions: + resourcePool: 2 + resourceHandle: 4 + resourceClaim: 4 + +# =========================================== +# Redis Configuration +# =========================================== +redis: + enabled: true + image: + repository: redis + tag: 7-alpine + pullPolicy: IfNotPresent + persistence: + enabled: true + size: 4Gi + storageClass: "" + resources: + requests: + memory: "256Mi" + cpu: "100m" + limits: + memory: "512Mi" + cpu: "400m" + +# =========================================== +# Worker Configuration +# =========================================== +worker: + enabled: false + replicas: 2 + + # Prometheus metrics endpoint + metrics: + enabled: true + port: 9090 + path: /metrics + + resources: + requests: + memory: "256Mi" + cpu: "100m" + limits: + memory: "512Mi" + cpu: "500m" + + hpa: + enabled: true + minReplicas: 2 + maxReplicas: 10 + targetCPUUtilizationPercentage: 70 + targetMemoryUtilizationPercentage: 80 + scaleDownStabilizationWindowSeconds: 300 + + extraEnvVars: {} + extraEnvFrom: [] + + # Command-line args for celery worker + args: + loglevel: info + concurrency: 4 + pool: prefork + maxTasksPerChild: 100 + + # Celery configuration (CELERY_ prefix added automatically) + # Keys are snake_case matching Celery config names + config: + result_expires: 3600 + result_extended: true + task_ack_late: true + task_default_retry_delay: 60 + task_default_retry_delay_max: 600 + task_reject_on_worker_lost: true + task_soft_time_limit: 1740 + task_time_limit: 1800 + worker_prefetch_multiplier: 1 + worker_send_task_events: true + task_send_sent_event: true + +# =========================================== +# Scheduler Configuration (Celery Beat) +# =========================================== +scheduler: + enabled: false + resources: + requests: + memory: "64Mi" + cpu: "25m" + limits: + memory: "128Mi" + cpu: "100m" + extraEnvVars: {} + extraEnvFrom: [] + config: + logging: + level: INFO + +# =========================================== +# Celery Flower Configuration +# =========================================== +flower: + enabled: false + replicas: 1 + + image: + repository: mher/flower + tag: "2.0" + pullPolicy: IfNotPresent + + # Web UI port + port: 5555 + + # Basic authentication + auth: + enabled: true + username: admin + + resources: + requests: + memory: "64Mi" + cpu: "25m" + limits: + memory: "256Mi" + cpu: "200m" + + # Flower behavior settings + config: + # Maximum number of tasks to keep in memory + maxTasks: 10000 + # Remove offline workers after N seconds (0 = keep forever) + purgeOfflineWorkers: 0 + + # Persistent storage for task history + persistence: + enabled: false + size: 1Gi + storageClass: "" + + # Additional environment variables + extraEnvVars: {} + extraEnvFrom: [] + + # OpenShift Route for external access + route: + enabled: false + +# =========================================== +# Task Schedules +# Supports: seconds (interval) or cron (expression) +# With ClaimWatch/HandleWatch, these are now safety nets (not primary) +# =========================================== +schedules: + maintain-all-pools: + enabled: false + schedule: + seconds: 30 # No PoolWatch yet, keep frequent + maintain-all-handles: + enabled: false + schedule: + seconds: 60 # Needs polling for lifespan.end triggers + maintain-all-claims: + enabled: false + schedule: + seconds: 60 # Same as old daemon interval + cleanup-stale-handles: 
+ enabled: false + schedule: + cron: "0 * * * *" diff --git a/operator/cache.py b/operator/cache.py new file mode 100644 index 0000000..9ac732b --- /dev/null +++ b/operator/cache.py @@ -0,0 +1,268 @@ +""" +Unified cache system for Poolboy. + +Provides a single interface for caching with automatic backend selection: +- MemoryBackend: Used in standalone mode (single process) +- RedisBackend: Used in distributed mode (shared across workers) + +Usage: + from cache import Cache, CacheTag + + # Cache an object + instance.cache_set(CacheTag.HANDLE, name, ttl=300) + + # Retrieve from cache + instance = cls.cache_get(CacheTag.HANDLE, name) + + # Delete from cache + cls.cache_delete(CacheTag.HANDLE, name) +""" + +import fnmatch +import json +import logging +import time +from enum import Enum +from typing import Any, Optional, Protocol + +import redis +from poolboy import Poolboy + +logger = logging.getLogger(__name__) + + +class CacheTag(Enum): + """Tags for cache key namespacing.""" + + CLAIM = "claim" + HANDLE = "handle" + HANDLE_BOUND = "handle_bound" + HANDLE_UNBOUND = "handle_unbound" + POOL = "pool" + PROVIDER = "provider" + WATCH = "watch" + WATCH_RESOURCE = "watch_resource" + + +class CacheBackend(Protocol): + """Protocol defining cache backend interface.""" + + def delete(self, key: str) -> None: ... + def delete_pattern(self, pattern: str) -> int: ... + def exists(self, key: str) -> bool: ... + def get(self, key: str) -> Optional[Any]: ... + def keys(self, pattern: str) -> list[str]: ... + def set(self, key: str, value: Any, ttl: int) -> None: ... + + +class MemoryBackend: + """In-memory cache backend for standalone mode.""" + + def __init__(self): + self._cache: dict[str, tuple[Any, float]] = {} + + def _cleanup_expired(self) -> None: + """Remove expired entries.""" + now = time.time() + expired = [k for k, (_, exp) in self._cache.items() if exp <= now] + for k in expired: + del self._cache[k] + + def delete(self, key: str) -> None: + """Delete a key from the cache.""" + self._cache.pop(key, None) + + def delete_pattern(self, pattern: str) -> int: + """Delete all keys matching pattern. Returns count of deleted keys.""" + keys_to_delete = [k for k in self._cache.keys() if fnmatch.fnmatch(k, pattern)] + for k in keys_to_delete: + del self._cache[k] + return len(keys_to_delete) + + def exists(self, key: str) -> bool: + """Check if key exists and is not expired.""" + if key not in self._cache: + return False + _, expires_at = self._cache[key] + if expires_at <= time.time(): + del self._cache[key] + return False + return True + + def get(self, key: str) -> Optional[Any]: + """Get value. Returns Python object directly.""" + if not self.exists(key): + return None + value, _ = self._cache[key] + return value + + def keys(self, pattern: str) -> list[str]: + """Get all keys matching pattern.""" + self._cleanup_expired() + return [k for k in self._cache.keys() if fnmatch.fnmatch(k, pattern)] + + def set(self, key: str, value: Any, ttl: int) -> None: + """Set value with TTL in seconds. 
Stores Python object directly.""" + self._cleanup_expired() + expires_at = time.time() + ttl + self._cache[key] = (value, expires_at) + + +class RedisBackend: + """Redis cache backend for distributed mode.""" + + def __init__(self, url: str): + self._client = redis.from_url(url, decode_responses=True) + + def delete(self, key: str) -> None: + """Delete a key from Redis.""" + try: + self._client.delete(key) + except Exception as e: + logger.warning(f"Redis delete failed for {key}: {e}") + + def delete_pattern(self, pattern: str) -> int: + """Delete all keys matching pattern. Returns count of deleted keys.""" + try: + keys = self._client.keys(pattern) + if keys: + return self._client.delete(*keys) + return 0 + except Exception as e: + logger.warning(f"Redis delete_pattern failed for {pattern}: {e}") + return 0 + + def exists(self, key: str) -> bool: + """Check if key exists in Redis.""" + try: + return bool(self._client.exists(key)) + except Exception as e: + logger.warning(f"Redis exists check failed for {key}: {e}") + return False + + def get(self, key: str) -> Optional[Any]: + """Get value. Returns deserialized dict.""" + try: + data = self._client.get(key) + if data: + return json.loads(data) + return None + except Exception as e: + logger.warning(f"Redis get failed for {key}: {e}") + return None + + def keys(self, pattern: str) -> list[str]: + """Get all keys matching pattern.""" + try: + return self._client.keys(pattern) + except Exception as e: + logger.warning(f"Redis keys failed for {pattern}: {e}") + return [] + + def set(self, key: str, value: Any, ttl: int) -> None: + """Set value with TTL in seconds. Serializes using 'definition' property if available.""" + try: + if hasattr(value, "definition"): + data = json.dumps(value.definition) + else: + data = json.dumps(value) + self._client.setex(key, ttl, data) + except Exception as e: + logger.warning(f"Redis set failed for {key}: {e}") + + +class CacheManager: + """Unified cache interface with automatic backend selection.""" + + _backend: Optional[CacheBackend] = None + _initialized: bool = False + + @classmethod + def _ensure_initialized(cls) -> None: + """Lazy initialization of backend.""" + if cls._initialized: + return + cls.initialize() + + @classmethod + def _make_key(cls, tag: CacheTag, identifier: str) -> str: + """Build cache key from tag and identifier.""" + return f"poolboy:{tag.value}:{identifier}" + + @classmethod + def delete(cls, tag: CacheTag, identifier: str) -> None: + """Delete a cached value.""" + cls._ensure_initialized() + key = cls._make_key(tag, identifier) + cls._backend.delete(key) + + @classmethod + def delete_by_tag(cls, tag: CacheTag) -> int: + """Delete all cached values for a tag. 
Returns count of deleted keys.""" + cls._ensure_initialized() + pattern = f"poolboy:{tag.value}:*" + return cls._backend.delete_pattern(pattern) + + @classmethod + def exists(cls, tag: CacheTag, identifier: str) -> bool: + """Check if a value exists in the cache.""" + cls._ensure_initialized() + key = cls._make_key(tag, identifier) + return cls._backend.exists(key) + + @classmethod + def get(cls, tag: CacheTag, identifier: str) -> Optional[Any]: + """Get a value from the cache.""" + cls._ensure_initialized() + key = cls._make_key(tag, identifier) + return cls._backend.get(key) + + @classmethod + def get_keys_by_tag(cls, tag: CacheTag) -> list[str]: + """Get all identifiers for a given tag.""" + cls._ensure_initialized() + pattern = f"poolboy:{tag.value}:*" + prefix = f"poolboy:{tag.value}:" + keys = cls._backend.keys(pattern) + return [k[len(prefix) :] for k in keys] + + @classmethod + def initialize(cls, standalone: Optional[bool] = None) -> None: + """ + Initialize the cache backend. + + Args: + standalone: Force standalone mode. If None, uses Poolboy.is_standalone. + """ + if cls._initialized: + return + + if standalone is None: + standalone = Poolboy.is_standalone + + if standalone: + logger.info("Cache: Using MemoryBackend (standalone mode)") + cls._backend = MemoryBackend() + else: + redis_url = f"{Poolboy.redis_url}/3" + logger.info(f"Cache: Using RedisBackend ({redis_url})") + try: + cls._backend = RedisBackend(redis_url) + except Exception as e: + logger.warning( + f"Redis connection failed, falling back to MemoryBackend: {e}" + ) + cls._backend = MemoryBackend() + + cls._initialized = True + + @classmethod + def set(cls, tag: CacheTag, identifier: str, value: Any, ttl: int = 60) -> None: + """Set a value in the cache with TTL in seconds.""" + cls._ensure_initialized() + key = cls._make_key(tag, identifier) + cls._backend.set(key, value, ttl) + + +# Module-level singleton +Cache = CacheManager diff --git a/operator/claimwatch.py b/operator/claimwatch.py new file mode 100644 index 0000000..0593e05 --- /dev/null +++ b/operator/claimwatch.py @@ -0,0 +1,287 @@ +""" +ClaimWatch - Event-driven watch for ResourceClaims. + +Follows the proven pattern from ResourceWatch: +- Uses kubernetes_asyncio.watch.Watch() for event stream +- Handles 410 Expired and connection errors with automatic restart +- Works in both standalone and distributed modes +- Replaces the per-resource daemon with a single efficient watch + +Key difference from daemons: +- Daemons: Loop every 60s per resource (N coroutines for N resources) +- ClaimWatch: Single watch, event-driven processing (~instant latency) +""" + +import asyncio +import logging +from datetime import datetime, timezone +from typing import Mapping + +import kubernetes_asyncio +from poolboy import Poolboy +from resourceclaim import ResourceClaim + +logger = logging.getLogger("claim_watch") + + +class ClaimWatchRestartError(Exception): + """Raised when watch needs to restart (e.g., 410 Expired).""" + + pass + + +class ClaimWatchFailedError(Exception): + """Raised when watch encounters an unrecoverable error.""" + + pass + + +class ClaimWatch: + """Watch ResourceClaims for changes that require processing. + + This replaces the per-resource daemon with a single event-driven watch. 
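+    Events are de-duplicated by caching the last seen resourceVersion of each claim.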
+ When a claim changes, we check if it needs processing and either: + - Process directly (standalone mode) + - Dispatch to Celery workers (distributed mode) + """ + + # Singleton instance + _instance = None + _lock = asyncio.Lock() + + @classmethod + async def start(cls): + """Start the singleton ClaimWatch instance.""" + async with cls._lock: + if cls._instance is None: + cls._instance = cls() + await cls._instance.initialize() + return cls._instance + + @classmethod + async def stop_all(cls): + """Stop the singleton ClaimWatch instance.""" + async with cls._lock: + if cls._instance is not None: + await cls._instance.shutdown() + cls._instance = None + + def __init__(self): + self.task = None + # Cache last seen resourceVersion per claim for change detection + self._rv_cache: dict[str, str] = {} + + async def initialize(self): + """Start the watch loop as a background task.""" + logger.info("Starting ClaimWatch") + self.task = asyncio.create_task(self._watch_loop()) + + async def shutdown(self): + """Stop the watch loop.""" + if self.task: + self.task.cancel() + try: + await self.task + except asyncio.CancelledError: + pass + self.task = None + logger.info("ClaimWatch stopped") + + async def _watch_loop(self): + """Main watch loop with automatic restart on errors.""" + while True: + watch_start = datetime.now(timezone.utc) + try: + await self._watch() + except asyncio.CancelledError: + logger.debug("ClaimWatch cancelled") + return + except ClaimWatchRestartError as e: + logger.debug(f"ClaimWatch restart: {e}") + # Avoid tight restart loops + duration = (datetime.now(timezone.utc) - watch_start).total_seconds() + if duration < 10: + await asyncio.sleep(10 - duration) + except ClaimWatchFailedError as e: + logger.warning(f"ClaimWatch failed: {e}") + duration = (datetime.now(timezone.utc) - watch_start).total_seconds() + if duration < 60: + await asyncio.sleep(60 - duration) + except Exception: + logger.exception("ClaimWatch exception") + duration = (datetime.now(timezone.utc) - watch_start).total_seconds() + if duration < 60: + await asyncio.sleep(60 - duration) + logger.debug("Restarting ClaimWatch") + + async def _watch(self): + """Stream events from Kubernetes API.""" + watch = None + try: + watch = kubernetes_asyncio.watch.Watch() + # Watch all ResourceClaims cluster-wide + method = Poolboy.custom_objects_api.list_cluster_custom_object + kwargs = { + "group": ResourceClaim.api_group, + "version": ResourceClaim.api_version, + "plural": ResourceClaim.plural, + } + + async for event in watch.stream(method, **kwargs): + if not isinstance(event, Mapping): + raise ClaimWatchFailedError(f"Unknown event: {event}") + + event_type = event["type"] + event_obj = event["object"] + + if not isinstance(event_obj, Mapping): + event_obj = Poolboy.api_client.sanitize_for_serialization(event_obj) + + if event_type == "ERROR": + if event_obj.get("kind") == "Status": + reason = event_obj.get("reason", "") + if reason in ("Expired", "Gone"): + raise ClaimWatchRestartError(reason.lower()) + raise ClaimWatchFailedError( + f"{reason} {event_obj.get('message', '')}" + ) + raise ClaimWatchFailedError(f"Unknown error: {event}") + + try: + await self._handle_event(event_type, event_obj) + except Exception: + name = event_obj.get("metadata", {}).get("name", "unknown") + ns = event_obj.get("metadata", {}).get("namespace", "unknown") + logger.exception(f"Error handling event for {ns}/{name}") + + except kubernetes_asyncio.client.exceptions.ApiException as e: + if e.status == 410: + raise ClaimWatchRestartError("410 
Expired") + raise + finally: + if watch: + await watch.close() + + async def _handle_event(self, event_type: str, claim: Mapping) -> None: + """Handle a single claim event.""" + metadata = claim.get("metadata", {}) + name = metadata.get("name") + namespace = metadata.get("namespace") + uid = metadata.get("uid") + rv = metadata.get("resourceVersion") + labels = metadata.get("labels", {}) + + if not name or not namespace: + return + + cache_key = f"{namespace}/{name}" + + # Handle deletion + if event_type == "DELETED": + self._rv_cache.pop(cache_key, None) + return + + # Check if should be ignored + if Poolboy.ignore_label in labels: + return + + # Check if we've already processed this version + if self._rv_cache.get(cache_key) == rv: + return + + # Check if claim needs processing + if not self._needs_processing(claim): + # Update cache even if we don't process (to avoid recheck) + self._rv_cache[cache_key] = rv + return + + # Process the claim + await self._process_claim(claim) + self._rv_cache[cache_key] = rv + + def _needs_processing(self, claim: Mapping) -> bool: + """Check if claim needs processing based on its state. + + Returns True if: + - Claim has a handle and might need reconciliation + - Lifespan start time has been reached + - Status indicates processing needed + """ + status = claim.get("status", {}) + spec = claim.get("spec", {}) + + # If claim has a handle, it might need processing + if "resourceHandle" in status: + return True + + # Check if lifespan start is in the future + lifespan_start = spec.get("lifespan", {}).get("start") + if lifespan_start: + try: + start_dt = datetime.strptime(lifespan_start, "%Y-%m-%dT%H:%M:%S%z") + if start_dt > datetime.now(timezone.utc): + # Future start - don't process yet + return False + except (ValueError, TypeError): + pass + + # If detached, check lifespan end + if status.get("resourceHandle", {}).get("detached", False): + lifespan_end = status.get("lifespan", {}).get("end") + if lifespan_end: + try: + end_dt = datetime.strptime(lifespan_end, "%Y-%m-%dT%H:%M:%S%z") + if end_dt < datetime.now(timezone.utc): + return True # Past lifespan end, needs delete + except (ValueError, TypeError): + pass + return False # Detached, no processing needed + + # Default: process it + return True + + async def _process_claim(self, claim: Mapping) -> None: + """Process a claim - works in both standalone and distributed modes. + + IMPORTANT: ClaimWatch only processes claims that ALREADY have a handle. + Initial binding (no handle) is done by Kopf on.create handler to avoid + race conditions where both would try to create a handle simultaneously. 
+ + Like ResourceWatch, this method works in both modes: + - Standalone: calls resource_claim.manage() directly + - Distributed: dispatches to Celery workers + """ + metadata = claim.get("metadata", {}) + name = metadata.get("name") + namespace = metadata.get("namespace") + status = claim.get("status", {}) + has_handle = "resourceHandle" in status + + # Only process claims that already have a handle + # Initial binding is done by Kopf on.create handler + if not has_handle: + logger.debug( + f"ClaimWatch skipping {namespace}/{name} - no handle yet" + ) + return + + # In distributed mode, dispatch to Celery + if not Poolboy.is_standalone: + from tasks.resourceclaim import dispatch_manage_claim + + dispatch_manage_claim( + definition=claim, + name=name, + namespace=namespace, + ) + logger.debug(f"ClaimWatch dispatched {namespace}/{name} to worker") + else: + # In standalone mode, Kopf handlers also process create/update/delete. + # ClaimWatch provides backup processing for time-based operations. + # + # IMPORTANT: Use register_definition which updates from the event data. + # This ensures we have the latest data from Kubernetes. + resource_claim = await ResourceClaim.register_definition(claim) + if not resource_claim.ignore: + await resource_claim.manage(logger=logger) + logger.debug(f"ClaimWatch processed {namespace}/{name} directly") diff --git a/operator/distributed_lock.py b/operator/distributed_lock.py new file mode 100644 index 0000000..22f4dc0 --- /dev/null +++ b/operator/distributed_lock.py @@ -0,0 +1,267 @@ +""" +Distributed locking for Poolboy using Redis. + +Provides process-safe locking for Celery tasks and other components. +Uses token-based locking to prevent accidental unlock by other processes. +""" + +import logging +import time +import uuid +from contextlib import contextmanager +from typing import Optional + +import redis +from metrics import TimerDecoratorMeta +from poolboy import Poolboy + +logger = logging.getLogger(__name__) + + +class DistributedLockError(Exception): + """Exception raised when distributed lock operations fail.""" + pass + + +class DistributedLock(metaclass=TimerDecoratorMeta): + """ + A distributed lock implementation using Redis. + + Features: + - Token-based ownership (prevents accidental unlock) + - Automatic expiration to prevent deadlocks + - Configurable timeout and retry behavior + - Context manager support + - Lock extension support + + Example: + lock = DistributedLock("resource_pool:default:my-pool") + with lock: + # Critical section + pass + + # Or with acquire check: + lock = DistributedLock("resource_pool:default:my-pool", blocking=False) + if lock.acquire(): + try: + # Critical section + pass + finally: + lock.release() + """ + + _client: Optional[redis.Redis] = None + + def __init__( + self, + key: str, + timeout: int = 300, + blocking: bool = True, + blocking_timeout: float = 10.0, + retry_interval: float = 0.1, + ): + """ + Initialize the distributed lock. 
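+        Constructing the lock does not acquire it; call acquire() or use the
+        instance as a context manager.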
+ + Args: + key: Unique identifier (prefixed with "poolboy:lock:") + timeout: Lock expiration time (seconds) + blocking: If True, wait for lock acquisition + blocking_timeout: Max time to wait if blocking (seconds) + retry_interval: Time between acquisition attempts (seconds) + """ + # Lazy init if on_startup() wasn't called (e.g., outside worker context) + if self._client is None: + self.on_startup() + self.client = self._client + self.key = f"poolboy:lock:{key}" + self.timeout = timeout + self.blocking = blocking + self.blocking_timeout = blocking_timeout + self.retry_interval = retry_interval + self.token = str(uuid.uuid4()) + self._acquired = False + + def __enter__(self): + """Context manager entry.""" + if not self.acquire(): + raise DistributedLockError( + f"Could not acquire lock '{self.key}' within {self.blocking_timeout}s" + ) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Context manager exit.""" + self.release() + + def _try_acquire(self) -> bool: + """Single attempt to acquire the lock.""" + try: + result = self.client.set( + self.key, + self.token, + nx=True, + ex=self.timeout, + ) + if result: + self._acquired = True + logger.debug(f"Acquired lock: {self.key}") + return True + return False + except redis.RedisError as e: + logger.error(f"Error acquiring lock {self.key}: {e}") + raise DistributedLockError(f"Failed to acquire lock: {e}") + + def acquire(self, blocking: Optional[bool] = None, timeout: Optional[float] = None) -> bool: + """ + Acquire the distributed lock. + + Args: + blocking: Override instance blocking setting + timeout: Override instance blocking_timeout setting + + Returns: + True if lock was acquired, False otherwise + """ + should_block = blocking if blocking is not None else self.blocking + wait_timeout = timeout if timeout is not None else self.blocking_timeout + + if not should_block: + return self._try_acquire() + + start_time = time.time() + while (time.time() - start_time) < wait_timeout: + if self._try_acquire(): + return True + time.sleep(self.retry_interval) + + logger.warning(f"Failed to acquire lock {self.key} within {wait_timeout}s") + return False + + def extend(self, additional_time: Optional[int] = None) -> bool: + """ + Extend the lock expiration time. + + Args: + additional_time: Additional seconds. Defaults to original timeout. + + Returns: + True if extended, False otherwise + """ + if not self._acquired: + return False + + extension = additional_time or self.timeout + + try: + current_token = self.client.get(self.key) + if current_token == self.token: + if self.client.expire(self.key, extension): + logger.debug(f"Extended lock {self.key} by {extension}s") + return True + logger.warning(f"Cannot extend lock {self.key} - not owned or expired") + self._acquired = False + return False + except redis.RedisError as e: + logger.error(f"Error extending lock {self.key}: {e}") + raise DistributedLockError(f"Failed to extend lock: {e}") + + def is_locked(self) -> bool: + """Check if the lock is currently held (by any process).""" + try: + return bool(self.client.exists(self.key)) + except redis.RedisError as e: + raise DistributedLockError(f"Failed to check lock: {e}") + + @classmethod + def on_cleanup(cls) -> None: + """Close Redis client. 
Called from worker_process_shutdown signal.""" + if cls._client is not None: + try: + cls._client.close() + logger.info("DistributedLock Redis client closed") + except redis.RedisError as e: + logger.warning(f"Error closing Redis client: {e}") + finally: + cls._client = None + + @classmethod + def on_startup(cls) -> None: + """Initialize Redis client. Called from worker_process_init signal.""" + if cls._client is None: + redis_url = f"{Poolboy.redis_url}/2" + cls._client = redis.from_url(redis_url, decode_responses=True) + + @property + def owned(self) -> bool: + """Check if this instance owns the lock.""" + return self._acquired + + def release(self) -> bool: + """ + Release the distributed lock. + + Returns: + True if released, False if not owned by this instance + """ + if not self._acquired: + return False + + try: + current_token = self.client.get(self.key) + if current_token == self.token: + self.client.delete(self.key) + self._acquired = False + logger.debug(f"Released lock: {self.key}") + return True + else: + logger.warning(f"Cannot release lock {self.key} - token mismatch") + self._acquired = False + return False + except redis.RedisError as e: + logger.error(f"Error releasing lock {self.key}: {e}") + raise DistributedLockError(f"Failed to release lock: {e}") + + +@contextmanager +def distributed_lock( + key: str, + timeout: int = 300, + blocking: bool = False, + blocking_timeout: float = 10.0, +): + """ + Resilient context manager for distributed locking. + + Unlike DistributedLock class, this wrapper: + - Never raises DistributedLockError on acquire failure + - Yields acquired: bool + - Always handles cleanup properly + + Example: + with distributed_lock("resource_pool:ns:name") as acquired: + if not acquired: + raise self.retry(countdown=5) + # Critical section + + Note: For advanced usage (e.g., lock.extend()), use DistributedLock class directly. + """ + lock = DistributedLock( + key=key, + timeout=timeout, + blocking=blocking, + blocking_timeout=blocking_timeout, + ) + + acquired = False + try: + acquired = lock.acquire() + yield acquired + except DistributedLockError: + yield False + finally: + if acquired: + try: + lock.release() + except DistributedLockError: + pass diff --git a/operator/handlewatch.py b/operator/handlewatch.py new file mode 100644 index 0000000..a7c55a3 --- /dev/null +++ b/operator/handlewatch.py @@ -0,0 +1,270 @@ +""" +HandleWatch - Event-driven watch for ResourceHandles. + +Follows the proven pattern from ResourceWatch: +- Uses kubernetes_asyncio.watch.Watch() for event stream +- Handles 410 Expired and connection errors with automatic restart +- Works in both standalone and distributed modes +- Replaces the per-resource daemon with a single efficient watch + +Key difference from daemons: +- Daemons: Loop every 60s per resource (N coroutines for N resources) +- HandleWatch: Single watch, event-driven processing (~instant latency) +""" + +import asyncio +import logging +from datetime import datetime, timezone +from typing import Mapping + +import kubernetes_asyncio +from poolboy import Poolboy +from resourcehandle import ResourceHandle + +logger = logging.getLogger("handle_watch") + + +class HandleWatchRestartError(Exception): + """Raised when watch needs to restart (e.g., 410 Expired).""" + + pass + + +class HandleWatchFailedError(Exception): + """Raised when watch encounters an unrecoverable error.""" + + pass + + +class HandleWatch: + """Watch ResourceHandles for changes that require processing. 
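+    A single instance is started via HandleWatch.start() and stopped via
+    HandleWatch.stop_all(), e.g.:
+
+        await HandleWatch.start()      # begins the background watch task
+        ...
+        await HandleWatch.stop_all()   # cancels the watch task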
+ + This replaces the per-resource daemon with a single event-driven watch. + When a handle changes, we check if it needs processing and either: + - Process directly (standalone mode) + - Dispatch to Celery workers (distributed mode) + """ + + # Singleton instance + _instance = None + _lock = asyncio.Lock() + + @classmethod + async def start(cls): + """Start the singleton HandleWatch instance.""" + async with cls._lock: + if cls._instance is None: + cls._instance = cls() + await cls._instance.initialize() + return cls._instance + + @classmethod + async def stop_all(cls): + """Stop the singleton HandleWatch instance.""" + async with cls._lock: + if cls._instance is not None: + await cls._instance.shutdown() + cls._instance = None + + def __init__(self): + self.task = None + # Cache last seen resourceVersion per handle for change detection + self._rv_cache: dict[str, str] = {} + + async def initialize(self): + """Start the watch loop as a background task.""" + logger.info("Starting HandleWatch") + self.task = asyncio.create_task(self._watch_loop()) + + async def shutdown(self): + """Stop the watch loop.""" + if self.task: + self.task.cancel() + try: + await self.task + except asyncio.CancelledError: + pass + self.task = None + logger.info("HandleWatch stopped") + + async def _watch_loop(self): + """Main watch loop with automatic restart on errors.""" + while True: + watch_start = datetime.now(timezone.utc) + try: + await self._watch() + except asyncio.CancelledError: + logger.debug("HandleWatch cancelled") + return + except HandleWatchRestartError as e: + logger.debug(f"HandleWatch restart: {e}") + # Avoid tight restart loops + duration = (datetime.now(timezone.utc) - watch_start).total_seconds() + if duration < 10: + await asyncio.sleep(10 - duration) + except HandleWatchFailedError as e: + logger.warning(f"HandleWatch failed: {e}") + duration = (datetime.now(timezone.utc) - watch_start).total_seconds() + if duration < 60: + await asyncio.sleep(60 - duration) + except Exception: + logger.exception("HandleWatch exception") + duration = (datetime.now(timezone.utc) - watch_start).total_seconds() + if duration < 60: + await asyncio.sleep(60 - duration) + logger.debug("Restarting HandleWatch") + + async def _watch(self): + """Stream events from Kubernetes API.""" + watch = None + try: + watch = kubernetes_asyncio.watch.Watch() + # Watch ResourceHandles in operator namespace + method = Poolboy.custom_objects_api.list_namespaced_custom_object + kwargs = { + "group": ResourceHandle.api_group, + "version": ResourceHandle.api_version, + "plural": ResourceHandle.plural, + "namespace": Poolboy.namespace, + } + + async for event in watch.stream(method, **kwargs): + if not isinstance(event, Mapping): + raise HandleWatchFailedError(f"Unknown event: {event}") + + event_type = event["type"] + event_obj = event["object"] + + if not isinstance(event_obj, Mapping): + event_obj = Poolboy.api_client.sanitize_for_serialization(event_obj) + + if event_type == "ERROR": + if event_obj.get("kind") == "Status": + reason = event_obj.get("reason", "") + if reason in ("Expired", "Gone"): + raise HandleWatchRestartError(reason.lower()) + raise HandleWatchFailedError( + f"{reason} {event_obj.get('message', '')}" + ) + raise HandleWatchFailedError(f"Unknown error: {event}") + + try: + await self._handle_event(event_type, event_obj) + except Exception: + name = event_obj.get("metadata", {}).get("name", "unknown") + logger.exception(f"Error handling event for {name}") + + except 
kubernetes_asyncio.client.exceptions.ApiException as e: + if e.status == 410: + raise HandleWatchRestartError("410 Expired") + raise + finally: + if watch: + await watch.close() + + async def _handle_event(self, event_type: str, handle: Mapping) -> None: + """Handle a single handle event.""" + metadata = handle.get("metadata", {}) + name = metadata.get("name") + namespace = metadata.get("namespace") + uid = metadata.get("uid") + rv = metadata.get("resourceVersion") + labels = metadata.get("labels", {}) + + if not name: + return + + cache_key = name + + # Handle deletion + if event_type == "DELETED": + self._rv_cache.pop(cache_key, None) + return + + # Check if should be ignored + if Poolboy.ignore_label in labels: + return + + # Check if we've already processed this version + if self._rv_cache.get(cache_key) == rv: + return + + # Check if handle needs processing + if not self._needs_processing(handle): + # Update cache even if we don't process (to avoid recheck) + self._rv_cache[cache_key] = rv + return + + # Process the handle + await self._process_handle(handle) + self._rv_cache[cache_key] = rv + + def _needs_processing(self, handle: Mapping) -> bool: + """Check if handle needs processing based on its state. + + Returns True if handle might need reconciliation. + """ + spec = handle.get("spec", {}) + status = handle.get("status", {}) + + # Check if past lifespan end + lifespan_end = spec.get("lifespan", {}).get("end") + if lifespan_end: + try: + end_dt = datetime.strptime(lifespan_end, "%Y-%m-%dT%H:%M:%S%z") + if end_dt < datetime.now(timezone.utc): + return True # Past lifespan end, needs delete + except (ValueError, TypeError): + pass + + # Check if bound to claim that might not exist + if "resourceClaim" in spec: + return True + + # Check if has resources that might need management + if spec.get("resources"): + return True + + return True # Default: process it + + async def _process_handle(self, handle: Mapping) -> None: + """Process a handle - works in both standalone and distributed modes. + + IMPORTANT: HandleWatch only processes handles that have been initialized. + Initial setup (no status.resources) is done by Kopf on.create handler + to avoid race conditions where both would try to create resources. + + Like ResourceWatch, this method works in both modes: + - Standalone: calls resource_handle.manage() directly + - Distributed: dispatches to Celery workers + """ + metadata = handle.get("metadata", {}) + name = metadata.get("name") + namespace = metadata.get("namespace") + status = handle.get("status", {}) + + # Only process handles that have been initialized (have status.resources) + # Initial setup is done by Kopf on.create handler + if "resources" not in status: + logger.debug( + f"HandleWatch skipping {name} - not initialized yet" + ) + return + + # In distributed mode, dispatch to Celery + if not Poolboy.is_standalone: + from tasks.resourcehandle import dispatch_manage_handle + + dispatch_manage_handle( + definition=handle, + name=name, + namespace=namespace, + ) + logger.debug(f"HandleWatch dispatched {name} to worker") + else: + # In standalone mode, Kopf handlers also process create/update/delete. + # HandleWatch provides backup processing for time-based operations. 
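+            #
+            # IMPORTANT: Use register_definition which updates from the event data.
+            # This ensures we have the latest data from Kubernetes.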
+ resource_handle = await ResourceHandle.register_definition(handle) + if not resource_handle.ignore: + await resource_handle.manage(logger=logger) + logger.debug(f"HandleWatch processed {name} directly") diff --git a/operator/kopfobject.py b/operator/kopfobject.py index 1e042b6..079a277 100644 --- a/operator/kopfobject.py +++ b/operator/kopfobject.py @@ -1,9 +1,10 @@ import asyncio from datetime import datetime -from typing import List, Mapping +from typing import List, Mapping, Optional, Self import kopf import kubernetes_asyncio +from cache import Cache, CacheTag from metrics.timer_decorator import TimerDecoratorMeta from poolboy import Poolboy @@ -49,6 +50,30 @@ def __str__(self) -> str: def api_group_version(self): return f"{self.api_group}/{self.api_version}" + @classmethod + def cache_delete(cls, tag: CacheTag, key: str) -> None: + """Delete object from cache.""" + Cache.delete(tag, key) + + @classmethod + def cache_get(cls, tag: CacheTag, key: str) -> Optional[Self]: + """ + Get object from cache, reconstructing if needed. + + MemoryBackend returns the Python object directly. + RedisBackend returns a dict that needs reconstruction via from_definition(). + """ + cached = Cache.get(tag, key) + if cached is None: + return None + if isinstance(cached, cls): + return cached + return cls.from_definition(cached) + + def cache_set(self, tag: CacheTag, key: str, ttl: int = 300) -> None: + """Store object in cache with TTL in seconds.""" + Cache.set(tag, key, self, ttl) + @property def creation_datetime(self): return datetime.strptime(self.creation_timestamp, "%Y-%m-%dT%H:%H:%S%z") @@ -57,6 +82,16 @@ def creation_datetime(self): def creation_timestamp(self) -> str: return self.meta['creationTimestamp'] + @property + def definition(self) -> Mapping: + return { + 'apiVersion': self.api_group_version, + 'kind': self.kind, + 'metadata': dict(self.meta), + 'spec': dict(self.spec) if self.spec else {}, + 'status': dict(self.status) if self.status else {}, + } + @property def deletion_timestamp(self) -> str|None: return self.meta.get('deletionTimestamp') @@ -74,6 +109,10 @@ def reference(self) -> Mapping: "namespace": self.namespace, } + @property + def resource_version(self) -> str: + return self.meta.get('resourceVersion', '') + def refresh(self, annotations: kopf.Annotations, labels: kopf.Labels, diff --git a/operator/main.py b/operator/main.py new file mode 100755 index 0000000..eb90653 --- /dev/null +++ b/operator/main.py @@ -0,0 +1,537 @@ +#!/usr/bin/env python3 +import asyncio +import logging +import re +from typing import Mapping + +import kopf +from cache import Cache +from configure_kopf_logging import configure_kopf_logging +from infinite_relative_backoff import InfiniteRelativeBackoff +from metrics import MetricsService +from poolboy import Poolboy +from resourceclaim import ResourceClaim +from resourcehandle import ResourceHandle +from resourcepool import ResourcePool +from resourceprovider import ResourceProvider +from resourcewatch import ResourceWatch + + +@kopf.on.startup() +async def startup(logger: kopf.ObjectLogger, settings: kopf.OperatorSettings, **_): + # Store last handled configuration in status + settings.persistence.diffbase_storage = kopf.StatusDiffBaseStorage( + field='status.diffBase', + ) + + # Never give up from network errors + settings.networking.error_backoffs = InfiniteRelativeBackoff() + + # Simplified finalizer - always use base domain + settings.persistence.finalizer = Poolboy.operator_domain + + # Support deprecated finalizers for migration (covers /handler 
and /handler-N patterns) + settings.persistence.deprecated_finalizer = re.compile( + re.escape(Poolboy.operator_domain) + '/handler(-\\d+)?$' + ) + + # Store progress in status. + settings.persistence.progress_storage = kopf.StatusProgressStorage(field='status.kopf.progress') + + # Only create events for warnings and errors + settings.posting.level = logging.WARNING + + # Disable scanning for CustomResourceDefinitions updates + settings.scanning.disabled = True + + # Configure logging + configure_kopf_logging() + # Initialize cache before any preload operations + Cache.initialize(standalone=Poolboy.is_standalone) + + await Poolboy.on_startup(logger=logger) + + if Poolboy.metrics_enabled: + # Start metrics service (sync but non-blocking - runs in daemon thread) + MetricsService.start(port=Poolboy.metrics_port) + + # Preload configuration from ResourceProviders + await ResourceProvider.preload(logger=logger) + + # Preload ResourceHandles in standalone mode (distributed mode uses workers) + if Poolboy.is_standalone: + await ResourceHandle.preload(logger=logger) + +@kopf.on.cleanup() +async def cleanup(logger: kopf.ObjectLogger, **_): + await ResourceWatch.stop_all() + await Poolboy.on_cleanup() + MetricsService.stop() + +@kopf.on.event(Poolboy.operator_domain, Poolboy.operator_version, 'resourceproviders') +async def resource_provider_event(event: Mapping, logger: kopf.ObjectLogger, **_) -> None: + definition = event['object'] + if event['type'] == 'DELETED': + await ResourceProvider.unregister(name=definition['metadata']['name'], logger=logger) + else: + await ResourceProvider.register(definition=definition, logger=logger) + +# Simplified label selector - just ignore resources with ignore label +label_selector = f"!{Poolboy.ignore_label}" + +# Resource event handlers - always registered in both standalone and distributed modes +# In distributed mode, handlers dispatch to Celery workers + +@kopf.on.create( + ResourceClaim.api_group, ResourceClaim.api_version, ResourceClaim.plural, + label_selector=label_selector, + id='resource_claim_create', +) +@kopf.on.resume( + ResourceClaim.api_group, ResourceClaim.api_version, ResourceClaim.plural, + label_selector=label_selector, + id='resource_claim_resume', +) +@kopf.on.update( + ResourceClaim.api_group, ResourceClaim.api_version, ResourceClaim.plural, + label_selector=label_selector, + id='resource_claim_update', +) +async def resource_claim_event( + annotations: kopf.Annotations, + labels: kopf.Labels, + logger: kopf.ObjectLogger, + meta: kopf.Meta, + name: str, + namespace: str, + spec: kopf.Spec, + status: kopf.Status, + uid: str, + **_ +): + resource_claim = await ResourceClaim.register( + annotations = annotations, + labels = labels, + meta = meta, + name = name, + namespace = namespace, + spec = spec, + status = status, + uid = uid, + ) + + # IMPORTANT: Only dispatch to worker if claim already has a handle. + # Initial binding requires in-memory cache which workers don't have. + # This ensures pool handles are correctly reused. 
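+    # Claims without a handle fall through to the else branch and are managed
+    # in-process, where the operator's in-memory cache performs the binding.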
+ if not Poolboy.is_standalone and resource_claim.has_resource_handle: + from tasks.resourceclaim import dispatch_manage_claim + dispatch_manage_claim( + definition=resource_claim.definition, + name=resource_claim.name, + namespace=resource_claim.namespace, + ) + else: + await resource_claim.manage(logger=logger) + +@kopf.on.delete( + ResourceClaim.api_group, ResourceClaim.api_version, ResourceClaim.plural, + label_selector=label_selector, +) +async def resource_claim_delete( + annotations: kopf.Annotations, + labels: kopf.Labels, + logger: kopf.ObjectLogger, + meta: kopf.Meta, + name: str, + namespace: str, + spec: kopf.Spec, + status: kopf.Status, + uid: str, + **_ +): + resource_claim = ResourceClaim( + annotations = annotations, + labels = labels, + meta = meta, + name = name, + namespace = namespace, + spec = spec, + status = status, + uid = uid, + ) + + # Delegate to worker if not standalone + if not Poolboy.is_standalone: + from tasks.resourceclaim import dispatch_delete_claim + dispatch_delete_claim( + definition=resource_claim.definition, + name=resource_claim.name, + namespace=resource_claim.namespace, + ) + logger.info(f"Dispatched delete_claim for {name} in {namespace}") + else: + await resource_claim.handle_delete(logger=logger) + await ResourceClaim.unregister(name=name, namespace=namespace) + +@kopf.daemon( + ResourceClaim.api_group, ResourceClaim.api_version, ResourceClaim.plural, + cancellation_timeout = 1, + initial_delay = Poolboy.manage_handles_interval, + label_selector=label_selector, +) +async def resource_claim_daemon( + annotations: kopf.Annotations, + labels: kopf.Labels, + logger: kopf.ObjectLogger, + meta: kopf.Meta, + name: str, + namespace: str, + spec: kopf.Spec, + status: kopf.Status, + stopped: kopf.DaemonStopped, + uid: str, + **_ +): + resource_claim = await ResourceClaim.register( + annotations = annotations, + labels = labels, + meta = meta, + name = name, + namespace = namespace, + spec = spec, + status = status, + uid = uid, + ) + try: + while not stopped: + description = str(resource_claim) + resource_claim = await resource_claim.refetch() + if not resource_claim: + logger.info(f"{description} found deleted in daemon") + return + if not resource_claim.ignore: + # In distributed mode, dispatch to worker (if claim has handle) + # Claims without handle need operator for binding (cache-dependent) + if not Poolboy.is_standalone and resource_claim.has_resource_handle: + from tasks.resourceclaim import dispatch_manage_claim + dispatch_manage_claim( + definition=resource_claim.definition, + name=resource_claim.name, + namespace=resource_claim.namespace, + ) + else: + await resource_claim.manage(logger=logger) + await asyncio.sleep(Poolboy.manage_claims_interval) + except asyncio.CancelledError: + pass + +@kopf.on.create( + ResourceHandle.api_group, ResourceHandle.api_version, ResourceHandle.plural, + id='resource_handle_create', + label_selector=label_selector, +) +@kopf.on.resume( + ResourceHandle.api_group, ResourceHandle.api_version, ResourceHandle.plural, + id='resource_handle_resume', + label_selector=label_selector, +) +@kopf.on.update( + ResourceHandle.api_group, ResourceHandle.api_version, ResourceHandle.plural, + id='resource_handle_update', + label_selector=label_selector, +) +async def resource_handle_event( + annotations: kopf.Annotations, + labels: kopf.Labels, + logger: kopf.ObjectLogger, + meta: kopf.Meta, + name: str, + namespace: str, + spec: kopf.Spec, + status: kopf.Status, + uid: str, + **_ +): + resource_handle = await 
ResourceHandle.register( + annotations = annotations, + labels = labels, + meta = meta, + name = name, + namespace = namespace, + spec = spec, + status = status, + uid = uid, + ) + if resource_handle.ignore: + return + if not Poolboy.is_standalone: + from tasks.resourcehandle import dispatch_manage_handle + dispatch_manage_handle( + definition=resource_handle.definition, + name=resource_handle.name, + namespace=resource_handle.namespace, + ) + else: + await resource_handle.manage(logger=logger) + +@kopf.on.delete( + ResourceHandle.api_group, ResourceHandle.api_version, ResourceHandle.plural, + label_selector=label_selector, +) +async def resource_handle_delete( + annotations: kopf.Annotations, + labels: kopf.Labels, + logger: kopf.ObjectLogger, + meta: kopf.Meta, + name: str, + namespace: str, + spec: kopf.Spec, + status: kopf.Status, + uid: str, + **_ +): + await ResourceHandle.unregister(name) + resource_handle = ResourceHandle( + annotations = annotations, + labels = labels, + meta = meta, + name = name, + namespace = namespace, + spec = spec, + status = status, + uid = uid, + ) + if resource_handle.ignore: + return + if not Poolboy.is_standalone: + from tasks.resourcehandle import dispatch_delete_handle + dispatch_delete_handle( + definition=resource_handle.definition, + name=resource_handle.name, + namespace=resource_handle.namespace, + ) + else: + await resource_handle.handle_delete(logger=logger) + +@kopf.daemon( + ResourceHandle.api_group, ResourceHandle.api_version, ResourceHandle.plural, + cancellation_timeout = 1, + initial_delay = Poolboy.manage_handles_interval, + label_selector=label_selector, +) +async def resource_handle_daemon( + annotations: kopf.Annotations, + labels: kopf.Labels, + logger: kopf.ObjectLogger, + meta: kopf.Meta, + name: str, + namespace: str, + spec: kopf.Spec, + status: kopf.Status, + stopped: kopf.DaemonStopped, + uid: str, + **_ +): + resource_handle = await ResourceHandle.register( + annotations = annotations, + labels = labels, + meta = meta, + name = name, + namespace = namespace, + spec = spec, + status = status, + uid = uid, + ) + try: + while not stopped: + description = str(resource_handle) + resource_handle = await resource_handle.refetch() + if not resource_handle: + logger.info(f"{description} found deleted in daemon") + return + if not resource_handle.ignore: + if not Poolboy.is_standalone: + from tasks.resourcehandle import dispatch_manage_handle + dispatch_manage_handle( + definition=resource_handle.definition, + name=resource_handle.name, + namespace=resource_handle.namespace, + ) + else: + await resource_handle.manage(logger=logger) + await asyncio.sleep(Poolboy.manage_handles_interval) + except asyncio.CancelledError: + pass + +@kopf.on.create( + ResourcePool.api_group, ResourcePool.api_version, ResourcePool.plural, + id='resource_pool_create', + label_selector=label_selector, +) +@kopf.on.resume( + ResourcePool.api_group, ResourcePool.api_version, ResourcePool.plural, + id='resource_pool_resume', + label_selector=label_selector, +) +@kopf.on.update( + ResourcePool.api_group, ResourcePool.api_version, ResourcePool.plural, + id='resource_pool_update', + label_selector=label_selector, +) +async def resource_pool_event( + annotations: kopf.Annotations, + labels: kopf.Labels, + logger: kopf.ObjectLogger, + meta: kopf.Meta, + name: str, + namespace: str, + spec: kopf.Spec, + status: kopf.Status, + uid: str, + **_ +): + resource_pool = await ResourcePool.register( + annotations = annotations, + labels = labels, + meta = meta, + name = name, + 
namespace = namespace, + spec = spec, + status = status, + uid = uid, + ) + if not Poolboy.is_standalone: + from tasks.resourcepool import dispatch_manage_pool + dispatch_manage_pool( + definition=resource_pool.definition, + name=resource_pool.name, + namespace=resource_pool.namespace, + ) + else: + await resource_pool.manage(logger=logger) + +@kopf.on.delete( + Poolboy.operator_domain, Poolboy.operator_version, 'resourcepools', + label_selector=label_selector, +) +async def resource_pool_delete( + annotations: kopf.Annotations, + labels: kopf.Labels, + logger: kopf.ObjectLogger, + meta: kopf.Meta, + name: str, + namespace: str, + spec: kopf.Spec, + status: kopf.Status, + uid: str, + **_ +): + await ResourcePool.unregister(name) + resource_pool = ResourcePool( + annotations = annotations, + labels = labels, + meta = meta, + name = name, + namespace = namespace, + spec = spec, + status = status, + uid = uid, + ) + if not Poolboy.is_standalone: + from tasks.resourcepool import dispatch_delete_pool_handles + dispatch_delete_pool_handles( + definition=resource_pool.definition, + name=resource_pool.name, + namespace=resource_pool.namespace, + ) + else: + await resource_pool.handle_delete(logger=logger) + +@kopf.daemon(Poolboy.operator_domain, Poolboy.operator_version, 'resourcepools', + cancellation_timeout = 1, + initial_delay = Poolboy.manage_pools_interval, + label_selector=label_selector, +) +async def resource_pool_daemon( + annotations: kopf.Annotations, + labels: kopf.Labels, + logger: kopf.ObjectLogger, + meta: kopf.Meta, + name: str, + namespace: str, + spec: kopf.Spec, + status: kopf.Status, + stopped: kopf.DaemonStopped, + uid: str, + **_ +): + resource_pool = await ResourcePool.register( + annotations = annotations, + labels = labels, + meta = meta, + name = name, + namespace = namespace, + spec = spec, + status = status, + uid = uid, + ) + if resource_pool.ignore: + return + try: + while not stopped: + description = str(resource_pool) + resource_pool = await resource_pool.refetch() + if not resource_pool: + logger.info(f"{description} found deleted in daemon") + return + + if not resource_pool.ignore: + if not Poolboy.is_standalone: + from tasks.resourcepool import dispatch_manage_pool + dispatch_manage_pool( + definition=resource_pool.definition, + name=resource_pool.name, + namespace=resource_pool.namespace, + ) + else: + await resource_pool.manage(logger=logger) + + await asyncio.sleep(Poolboy.manage_pools_interval) + except asyncio.CancelledError: + pass + +# ResourceWatch handlers - always start watch directly (no more create_pod) +@kopf.on.create( + Poolboy.operator_domain, Poolboy.operator_version, 'resourcewatches', + id='resource_watch_create', + label_selector=label_selector, +) +@kopf.on.resume( + Poolboy.operator_domain, Poolboy.operator_version, 'resourcewatches', + id='resource_watch_resume', + label_selector=label_selector, +) +async def resource_watch_create_or_resume( + annotations: kopf.Annotations, + labels: kopf.Labels, + logger: kopf.ObjectLogger, + meta: kopf.Meta, + name: str, + namespace: str, + spec: kopf.Spec, + status: kopf.Status, + uid: str, + **_ +): + resource_watch = await ResourceWatch.register( + annotations = annotations, + labels = labels, + meta = meta, + name = name, + namespace = namespace, + spec = spec, + status = status, + uid = uid, + ) + # Always start watch directly (no more create_pod for manager mode) + await resource_watch.start(logger=logger) diff --git a/operator/metrics/__init__.py b/operator/metrics/__init__.py index 
53571c5..ab5dbcc 100644 --- a/operator/metrics/__init__.py +++ b/operator/metrics/__init__.py @@ -1,3 +1,5 @@ +"""Prometheus metrics module for Poolboy operator and workers.""" + from .app_metrics import AppMetrics from .metrics_service import MetricsService from .timer_decorator import TimerDecoratorMeta, async_timer, sync_timer diff --git a/operator/metrics/app_metrics.py b/operator/metrics/app_metrics.py index fb1e30b..c0ee755 100644 --- a/operator/metrics/app_metrics.py +++ b/operator/metrics/app_metrics.py @@ -1,25 +1,25 @@ +"""Prometheus metrics definitions for Poolboy.""" + from __future__ import annotations -from aioprometheus import REGISTRY, Counter, Histogram +from prometheus_client import REGISTRY, Counter, Histogram class AppMetrics: + """Central registry for all application metrics.""" + registry = REGISTRY process_time = Histogram( "poolboy_process_time_seconds", "Execution time of processes in the app", - { - "method": "The method name", - "status": "The status of the request", - "app": "The application name", - "cluster_domain": "The cluster name", - }, - registry=registry, + ["method", "status", "app", "cluster_domain"], + registry=REGISTRY, ) invalid_resource_counter = Counter( "poolboy_invalid_resource_count", "Counts the number of resources in invalid states", - registry=registry, + ["resource_type", "cluster_domain"], + registry=REGISTRY, ) diff --git a/operator/metrics/metrics_service.py b/operator/metrics/metrics_service.py index 2f2ba46..ea0279d 100644 --- a/operator/metrics/metrics_service.py +++ b/operator/metrics/metrics_service.py @@ -1,26 +1,82 @@ +"""Prometheus metrics HTTP server with multiprocess support.""" + from __future__ import annotations import logging +import os +from pathlib import Path +from threading import Thread +from wsgiref.simple_server import WSGIRequestHandler, make_server -from aioprometheus.service import Service - -from .app_metrics import AppMetrics +from prometheus_client import REGISTRY, CollectorRegistry, multiprocess +from prometheus_client.exposition import ThreadingWSGIServer, make_wsgi_app +from prometheus_client.multiprocess import MultiProcessCollector logger = logging.getLogger(__name__) class MetricsService: - service = Service(registry=AppMetrics.registry) + """HTTP server that exposes Prometheus metrics on /metrics endpoint.""" + + _server = None + _thread = None + _multiproc_dir: Path | None = None @classmethod - async def start(cls, addr="0.0.0.0", port=8000) -> None: - # Reduce logging level for aiohttp to avoid spamming the logs - logging.getLogger("aiohttp").setLevel(logging.ERROR) + def _get_registry(cls) -> CollectorRegistry: + """Return the appropriate collector registry based on environment.""" + multiproc_dir = os.environ.get("PROMETHEUS_MULTIPROC_DIR") + + if multiproc_dir: + cls._multiproc_dir = Path(multiproc_dir) + cls._multiproc_dir.mkdir(parents=True, exist_ok=True) - await cls.service.start(addr=addr, port=port, metrics_url="/metrics") - logger.info(f"Serving metrics on: {cls.service.metrics_url}") + registry = CollectorRegistry() + MultiProcessCollector(registry) + logger.info(f"Using multiprocess registry: {multiproc_dir}") + return registry + + logger.info("Using single-process registry") + return REGISTRY @classmethod - async def stop(cls) -> None: - logger.info("Stopping metrics service") - await cls.service.stop() + def start(cls, port: int = 9090, addr: str = "0.0.0.0") -> None: + """Start the metrics server in a background daemon thread.""" + registry = cls._get_registry() + + app = 
make_wsgi_app(registry) + cls._server = make_server( + addr, port, app, ThreadingWSGIServer, + handler_class=_SilentHandler + ) + + cls._thread = Thread(target=cls._server.serve_forever, daemon=True) + cls._thread.start() + logger.info(f"Metrics server started on {addr}:{port}") + + @classmethod + def stop(cls) -> None: + """Stop the metrics server and cleanup resources.""" + if cls._server: + cls._server.shutdown() + logger.info("Metrics server stopped") + + cls._cleanup_multiproc() + + @classmethod + def _cleanup_multiproc(cls) -> None: + """Mark current process as dead for multiprocess cleanup.""" + if cls._multiproc_dir and cls._multiproc_dir.exists(): + try: + pid = os.getpid() + multiprocess.mark_process_dead(pid) + logger.debug(f"Marked process {pid} as dead") + except Exception as e: + logger.warning(f"Error cleaning up multiproc: {e}") + + +class _SilentHandler(WSGIRequestHandler): + """WSGI request handler that suppresses access logs.""" + + def log_message(self, format, *args): + pass diff --git a/operator/metrics/timer_decorator.py b/operator/metrics/timer_decorator.py index 1e6b486..be2c50e 100644 --- a/operator/metrics/timer_decorator.py +++ b/operator/metrics/timer_decorator.py @@ -1,3 +1,5 @@ +"""Timer decorators for automatic method execution timing.""" + import inspect import os import time @@ -5,9 +7,11 @@ from .app_metrics import AppMetrics -cluster_domain = os.environ.get('CLUSTER_DOMAIN') +CLUSTER_DOMAIN = os.environ.get('CLUSTER_DOMAIN', 'unknown') + def async_timer(app: str): + """Decorator that records execution time of async functions.""" def decorator(func): @wraps(func) async def wrapper(*args, **kwargs): @@ -15,55 +19,60 @@ async def wrapper(*args, **kwargs): status = 'success' try: result = await func(*args, **kwargs) - status = 'success' + return result except Exception as e: status = 'error' raise e finally: duration = time.time() - start_time - method_name = func.__name__ - labels = {'method': method_name, - 'status': status, - 'app': app, - 'cluster_domain': cluster_domain - } - AppMetrics.process_time.observe(labels, duration) - - return result + AppMetrics.process_time.labels( + method=func.__name__, + status=status, + app=app, + cluster_domain=CLUSTER_DOMAIN, + ).observe(duration) return wrapper return decorator def sync_timer(app: str): + """Decorator that records execution time of sync functions.""" def decorator(func): @wraps(func) def wrapper(*args, **kwargs): start_time = time.time() + status = 'success' try: result = func(*args, **kwargs) - status = 'success' + return result except Exception as e: status = 'error' raise e finally: duration = time.time() - start_time - method_name = func.__name__ - labels = {'method': method_name, - 'status': status, - 'app': app, - 'cluster_domain': cluster_domain - } - AppMetrics.process_time.observe(labels, duration) - - return result + AppMetrics.process_time.labels( + method=func.__name__, + status=status, + app=app, + cluster_domain=CLUSTER_DOMAIN, + ).observe(duration) return wrapper return decorator class TimerDecoratorMeta(type): + """Metaclass that applies timer decorators to all public methods.""" + def __new__(cls, name, bases, dct): for attr_name, attr_value in dct.items(): - if isinstance(attr_value, classmethod): + if isinstance(attr_value, staticmethod): + original_method = attr_value.__func__ + if inspect.iscoroutinefunction(original_method): + decorated_method = async_timer(name)(original_method) + else: + decorated_method = sync_timer(name)(original_method) + dct[attr_name] = 
staticmethod(decorated_method) + elif isinstance(attr_value, classmethod): original_method = attr_value.__func__ if inspect.iscoroutinefunction(original_method): decorated_method = async_timer(name)(original_method) diff --git a/operator/operator.py b/operator/operator.py deleted file mode 100755 index 6cf0104..0000000 --- a/operator/operator.py +++ /dev/null @@ -1,514 +0,0 @@ -#!/usr/bin/env python3 -import asyncio -import logging -import re -from typing import Mapping - -import kopf -from configure_kopf_logging import configure_kopf_logging -from infinite_relative_backoff import InfiniteRelativeBackoff -from metrics import MetricsService -from poolboy import Poolboy -from resourceclaim import ResourceClaim -from resourcehandle import ResourceHandle -from resourcepool import ResourcePool -from resourceprovider import ResourceProvider -from resourcewatch import ResourceWatch - - -@kopf.on.startup() -async def startup(logger: kopf.ObjectLogger, settings: kopf.OperatorSettings, **_): - # Store last handled configuration in status - settings.persistence.diffbase_storage = kopf.StatusDiffBaseStorage( - field='status.diffBase', - ) - - # Never give up from network errors - settings.networking.error_backoffs = InfiniteRelativeBackoff() - - # Set finalizer based on operator mode - settings.persistence.finalizer = ( - f"{Poolboy.operator_domain}/handler" if Poolboy.operator_mode_resource_handler else - f"{Poolboy.operator_domain}/watch-{Poolboy.resource_watch_name}" if Poolboy.operator_mode_resource_watch else - Poolboy.operator_domain - ) - - # Support deprecated resource handler finalizer - if Poolboy.operator_mode_resource_handler: - settings.persistence.deprecated_finalizer = re.compile(re.escape(Poolboy.operator_domain) + '/handler-\d+$') - - # Store progress in status. 
- settings.persistence.progress_storage = kopf.StatusProgressStorage(field='status.kopf.progress') - - # Only create events for warnings and errors - settings.posting.level = logging.WARNING - - # Disable scanning for CustomResourceDefinitions updates - settings.scanning.disabled = True - - # Configure logging - configure_kopf_logging() - - await Poolboy.on_startup(logger=logger) - - if Poolboy.metrics_enabled: - # Start metrics service - await MetricsService.start(port=Poolboy.metrics_port) - - # Preload configuration from ResourceProviders - await ResourceProvider.preload(logger=logger) - - # Preload for matching ResourceClaim templates - if Poolboy.operator_mode_all_in_one or Poolboy.operator_mode_resource_handler: - await ResourceHandle.preload(logger=logger) - if Poolboy.operator_mode_resource_handler: - ResourceHandle.start_watch_other() - -@kopf.on.cleanup() -async def cleanup(logger: kopf.ObjectLogger, **_): - if Poolboy.operator_mode_resource_handler: - ResourceHandle.stop_watch_other() - await ResourceWatch.stop_all() - await Poolboy.on_cleanup() - await MetricsService.stop() - -@kopf.on.event(Poolboy.operator_domain, Poolboy.operator_version, 'resourceproviders') -async def resource_provider_event(event: Mapping, logger: kopf.ObjectLogger, **_) -> None: - definition = event['object'] - if event['type'] == 'DELETED': - await ResourceProvider.unregister(name=definition['metadata']['name'], logger=logger) - else: - await ResourceProvider.register(definition=definition, logger=logger) - -label_selector = f"!{Poolboy.ignore_label}" -if Poolboy.operator_mode_resource_handler: - label_selector += f",{Poolboy.resource_handler_idx_label}={Poolboy.resource_handler_idx}" - -if Poolboy.operator_mode_manager: - # In manager mode just label ResourceClaims, ResourceHandles, and ResourcePools - # to assign the correct handler. - @kopf.on.event( - ResourceClaim.api_group, ResourceClaim.api_version, ResourceClaim.plural, - label_selector=label_selector, - ) - async def label_resource_claim( - event: Mapping, - logger: kopf.ObjectLogger, - **_ - ) -> None: - definition = event['object'] - if event['type'] == 'DELETED': - return - resource_claim = ResourceClaim.from_definition(definition) - await resource_claim.assign_resource_handler() - - @kopf.on.event( - ResourceHandle.api_group, ResourceHandle.api_version, ResourceHandle.plural, - label_selector=label_selector, - ) - async def label_resource_handle( - event: Mapping, - logger: kopf.ObjectLogger, - **_ - ) -> None: - definition = event['object'] - if event['type'] == 'DELETED': - return - resource_handle = ResourceHandle.from_definition(definition) - await resource_handle.assign_resource_handler() - - @kopf.on.event( - ResourcePool.api_group, ResourcePool.api_version, ResourcePool.plural, - label_selector=label_selector, - ) - async def label_resource_pool( - event: Mapping, - logger: kopf.ObjectLogger, - **_ - ) -> None: - definition = event['object'] - if event['type'] == 'DELETED': - return - resource_pool = ResourcePool.from_definition(definition) - await resource_pool.assign_resource_handler() - -if( - Poolboy.operator_mode_all_in_one or - Poolboy.operator_mode_resource_handler -): - # Resources are handled in either all-in-one or resource-handler mode. - # The difference is only if labels are used to select which resources to handle. 
- - @kopf.on.create( - ResourceClaim.api_group, ResourceClaim.api_version, ResourceClaim.plural, - label_selector=label_selector, - id='resource_claim_create', - ) - @kopf.on.resume( - ResourceClaim.api_group, ResourceClaim.api_version, ResourceClaim.plural, - label_selector=label_selector, - id='resource_claim_resume', - ) - @kopf.on.update( - ResourceClaim.api_group, ResourceClaim.api_version, ResourceClaim.plural, - label_selector=label_selector, - id='resource_claim_update', - ) - async def resource_claim_event( - annotations: kopf.Annotations, - labels: kopf.Labels, - logger: kopf.ObjectLogger, - meta: kopf.Meta, - name: str, - namespace: str, - spec: kopf.Spec, - status: kopf.Status, - uid: str, - **_ - ): - resource_claim = await ResourceClaim.register( - annotations = annotations, - labels = labels, - meta = meta, - name = name, - namespace = namespace, - spec = spec, - status = status, - uid = uid, - ) - await resource_claim.manage(logger=logger) - - @kopf.on.delete( - ResourceClaim.api_group, ResourceClaim.api_version, ResourceClaim.plural, - label_selector=label_selector, - ) - async def resource_claim_delete( - annotations: kopf.Annotations, - labels: kopf.Labels, - logger: kopf.ObjectLogger, - meta: kopf.Meta, - name: str, - namespace: str, - spec: kopf.Spec, - status: kopf.Status, - uid: str, - **_ - ): - resource_claim = ResourceClaim( - annotations = annotations, - labels = labels, - meta = meta, - name = name, - namespace = namespace, - spec = spec, - status = status, - uid = uid, - ) - await resource_claim.handle_delete(logger=logger) - await ResourceClaim.unregister(name=name, namespace=namespace) - - @kopf.daemon( - ResourceClaim.api_group, ResourceClaim.api_version, ResourceClaim.plural, - cancellation_timeout = 1, - initial_delay = Poolboy.manage_handles_interval, - label_selector=label_selector, - ) - async def resource_claim_daemon( - annotations: kopf.Annotations, - labels: kopf.Labels, - logger: kopf.ObjectLogger, - meta: kopf.Meta, - name: str, - namespace: str, - spec: kopf.Spec, - status: kopf.Status, - stopped: kopf.DaemonStopped, - uid: str, - **_ - ): - resource_claim = await ResourceClaim.register( - annotations = annotations, - labels = labels, - meta = meta, - name = name, - namespace = namespace, - spec = spec, - status = status, - uid = uid, - ) - try: - while not stopped: - description = str(resource_claim) - resource_claim = await resource_claim.refetch() - if not resource_claim: - logger.info(f"{description} found deleted in daemon") - return - if not resource_claim.ignore: - await resource_claim.manage(logger=logger) - await asyncio.sleep(Poolboy.manage_claims_interval) - except asyncio.CancelledError: - pass - - @kopf.on.create( - ResourceHandle.api_group, ResourceHandle.api_version, ResourceHandle.plural, - id='resource_handle_create', - label_selector=label_selector, - ) - @kopf.on.resume( - ResourceHandle.api_group, ResourceHandle.api_version, ResourceHandle.plural, - id='resource_handle_resume', - label_selector=label_selector, - ) - @kopf.on.update( - ResourceHandle.api_group, ResourceHandle.api_version, ResourceHandle.plural, - id='resource_handle_update', - label_selector=label_selector, - ) - async def resource_handle_event( - annotations: kopf.Annotations, - labels: kopf.Labels, - logger: kopf.ObjectLogger, - meta: kopf.Meta, - name: str, - namespace: str, - spec: kopf.Spec, - status: kopf.Status, - uid: str, - **_ - ): - resource_handle = await ResourceHandle.register( - annotations = annotations, - labels = labels, - meta = meta, - name 
= name, - namespace = namespace, - spec = spec, - status = status, - uid = uid, - ) - if resource_handle.ignore: - return - await resource_handle.manage(logger=logger) - - @kopf.on.delete( - ResourceHandle.api_group, ResourceHandle.api_version, ResourceHandle.plural, - label_selector=label_selector, - ) - async def resource_handle_delete( - annotations: kopf.Annotations, - labels: kopf.Labels, - logger: kopf.ObjectLogger, - meta: kopf.Meta, - name: str, - namespace: str, - spec: kopf.Spec, - status: kopf.Status, - uid: str, - **_ - ): - await ResourceHandle.unregister(name) - resource_handle = ResourceHandle( - annotations = annotations, - labels = labels, - meta = meta, - name = name, - namespace = namespace, - spec = spec, - status = status, - uid = uid, - ) - if resource_handle.ignore: - return - await resource_handle.handle_delete(logger=logger) - - @kopf.daemon( - ResourceHandle.api_group, ResourceHandle.api_version, ResourceHandle.plural, - cancellation_timeout = 1, - initial_delay = Poolboy.manage_handles_interval, - label_selector=label_selector, - ) - async def resource_handle_daemon( - annotations: kopf.Annotations, - labels: kopf.Labels, - logger: kopf.ObjectLogger, - meta: kopf.Meta, - name: str, - namespace: str, - spec: kopf.Spec, - status: kopf.Status, - stopped: kopf.DaemonStopped, - uid: str, - **_ - ): - resource_handle = await ResourceHandle.register( - annotations = annotations, - labels = labels, - meta = meta, - name = name, - namespace = namespace, - spec = spec, - status = status, - uid = uid, - ) - try: - while not stopped: - description = str(resource_handle) - resource_handle = await resource_handle.refetch() - if not resource_handle: - logger.info(f"{description} found deleted in daemon") - return - if not resource_handle.ignore: - await resource_handle.manage(logger=logger) - await asyncio.sleep(Poolboy.manage_handles_interval) - except asyncio.CancelledError: - pass - - @kopf.on.create( - ResourcePool.api_group, ResourcePool.api_version, ResourcePool.plural, - id='resource_pool_create', - label_selector=label_selector, - ) - @kopf.on.resume( - ResourcePool.api_group, ResourcePool.api_version, ResourcePool.plural, - id='resource_pool_resume', - label_selector=label_selector, - ) - @kopf.on.update( - ResourcePool.api_group, ResourcePool.api_version, ResourcePool.plural, - id='resource_pool_update', - label_selector=label_selector, - ) - async def resource_pool_event( - annotations: kopf.Annotations, - labels: kopf.Labels, - logger: kopf.ObjectLogger, - meta: kopf.Meta, - name: str, - namespace: str, - spec: kopf.Spec, - status: kopf.Status, - uid: str, - **_ - ): - resource_pool = await ResourcePool.register( - annotations = annotations, - labels = labels, - meta = meta, - name = name, - namespace = namespace, - spec = spec, - status = status, - uid = uid, - ) - await resource_pool.manage(logger=logger) - - @kopf.on.delete( - Poolboy.operator_domain, Poolboy.operator_version, 'resourcepools', - label_selector=label_selector, - ) - async def resource_pool_delete( - annotations: kopf.Annotations, - labels: kopf.Labels, - logger: kopf.ObjectLogger, - meta: kopf.Meta, - name: str, - namespace: str, - spec: kopf.Spec, - status: kopf.Status, - uid: str, - **_ - ): - await ResourcePool.unregister(name) - resource_pool = ResourcePool( - annotations = annotations, - labels = labels, - meta = meta, - name = name, - namespace = namespace, - spec = spec, - status = status, - uid = uid, - ) - await resource_pool.handle_delete(logger=logger) - - 
@kopf.daemon(Poolboy.operator_domain, Poolboy.operator_version, 'resourcepools', - cancellation_timeout = 1, - initial_delay = Poolboy.manage_pools_interval, - label_selector=label_selector, - ) - async def resource_pool_daemon( - annotations: kopf.Annotations, - labels: kopf.Labels, - logger: kopf.ObjectLogger, - meta: kopf.Meta, - name: str, - namespace: str, - spec: kopf.Spec, - status: kopf.Status, - stopped: kopf.DaemonStopped, - uid: str, - **_ - ): - resource_pool = await ResourcePool.register( - annotations = annotations, - labels = labels, - meta = meta, - name = name, - namespace = namespace, - spec = spec, - status = status, - uid = uid, - ) - if resource_pool.ignore: - return - try: - while not stopped: - await resource_pool.manage(logger=logger) - await asyncio.sleep(Poolboy.manage_pools_interval) - except asyncio.CancelledError: - pass - -if ( - Poolboy.operator_mode_all_in_one or - Poolboy.operator_mode_resource_watch or - Poolboy.operator_mode_manager -): - @kopf.on.create( - Poolboy.operator_domain, Poolboy.operator_version, 'resourcewatches', - id='resource_watch_create', - label_selector=label_selector, - ) - @kopf.on.resume( - Poolboy.operator_domain, Poolboy.operator_version, 'resourcewatches', - id='resource_watch_resume', - label_selector=label_selector, - ) - async def resource_watch_create_or_resume( - annotations: kopf.Annotations, - labels: kopf.Labels, - logger: kopf.ObjectLogger, - meta: kopf.Meta, - name: str, - namespace: str, - spec: kopf.Spec, - status: kopf.Status, - uid: str, - **_ - ): - if (not Poolboy.operator_mode_resource_watch or - Poolboy.resource_watch_name == name - ): - resource_watch = await ResourceWatch.register( - annotations = annotations, - labels = labels, - meta = meta, - name = name, - namespace = namespace, - spec = spec, - status = status, - uid = uid, - ) - if Poolboy.operator_mode_manager: - await resource_watch.create_pod(logger=logger) - else: - await resource_watch.start(logger=logger) diff --git a/operator/poolboy.py b/operator/poolboy.py index b1fddb3..c2f5bfe 100644 --- a/operator/poolboy.py +++ b/operator/poolboy.py @@ -1,29 +1,23 @@ import os -from copy import deepcopy -from uuid import UUID import kopf import kubernetes_asyncio -import yaml class Poolboy(): metrics_enabled = os.environ.get('METRICS_ENABLED', 'true').lower() == 'true' - metrics_port = int(os.environ.get('METRICS_PORT', 9091)) + metrics_port = int(os.environ.get('METRICS_PORT', 9090)) manage_claims_interval = int(os.environ.get('MANAGE_CLAIMS_INTERVAL', 60)) manage_handles_interval = int(os.environ.get('MANAGE_HANDLES_INTERVAL', 60)) manage_pools_interval = int(os.environ.get('MANAGE_POOLS_INTERVAL', 10)) - operator_mode = os.environ.get('OPERATOR_MODE', 'all-in-one') - operator_mode_all_in_one = operator_mode == 'all-in-one' - operator_mode_manager = operator_mode == 'manager' - operator_mode_resource_handler = operator_mode == 'resource-handler' - operator_mode_resource_watch = operator_mode == 'resource-watch' + + # Operator mode: standalone (local) or distributed (Celery workers) + # IS_STANDALONE is set by Helm based on operatorMode value + is_standalone = os.environ.get('IS_STANDALONE', 'false').lower() == 'true' + operator_domain = os.environ.get('OPERATOR_DOMAIN', 'poolboy.gpte.redhat.com') operator_version = os.environ.get('OPERATOR_VERSION', 'v1') operator_api_version = f"{operator_domain}/{operator_version}" - resource_watch_name = os.environ.get('WATCH_NAME') - resource_handler_count = int(os.environ.get('RESOURCE_HANDLER_COUNT', 1)) - 
resource_handler_idx = int(os.environ.get('RESOURCE_HANDLER_IDX', 0)) resource_refresh_interval = int(os.environ.get('RESOURCE_REFRESH_INTERVAL', 600)) resource_handle_deleted_annotation = f"{operator_domain}/resource-handle-deleted" resource_claim_name_annotation = f"{operator_domain}/resource-claim-name" @@ -45,9 +39,17 @@ class Poolboy(): resource_requester_user_annotation = f"{operator_domain}/resource-requester-user" resource_requester_preferred_username_annotation = f"{operator_domain}/resource-requester-preferred-username" ignore_label = f"{operator_domain}/ignore" + is_worker = os.environ.get('WORKER', 'false').lower() == 'true' + + # TODO: Remove after all production clusters migrated (used for cleanup only) resource_handler_idx_label = f"{operator_domain}/resource-handler-idx" - resource_handler_resources = yaml.safe_load(os.environ['RESOURCE_HANDLER_RESOURCES']) if 'RESOURCE_HANDLER_RESOURCES' in os.environ else None - resource_watch_resources = yaml.safe_load(os.environ['RESOURCE_WATCH_RESOURCES']) if 'RESOURCE_WATCH_RESOURCES' in os.environ else None + + # Worker retry config (used by Celery tasks) + workers_error_retry_countdown = int(os.environ.get('WORKERS_ERROR_RETRY_COUNTDOWN', '30')) + workers_lock_retry_countdown = int(os.environ.get('WORKERS_LOCK_RETRY_COUNTDOWN', '3')) + + # Redis URL for distributed locking (used by main operator to send tasks) + redis_url = os.environ.get('REDIS_URL') @classmethod async def on_cleanup(cls): @@ -55,6 +57,12 @@ async def on_cleanup(cls): @classmethod async def on_startup(cls, logger: kopf.ObjectLogger): + # Log operator mode on startup + mode = "standalone" if cls.is_standalone else "distributed" + logger.info(f"Poolboy starting in {mode} mode") + if not cls.is_standalone: + logger.info("Distributed mode: delegating to Celery workers") + if os.path.exists('/run/secrets/kubernetes.io/serviceaccount'): kubernetes_asyncio.config.load_incluster_config() with open('/run/secrets/kubernetes.io/serviceaccount/namespace', encoding='utf-8') as f: @@ -74,83 +82,15 @@ async def on_startup(cls, logger: kopf.ObjectLogger): cls.core_v1_api = kubernetes_asyncio.client.CoreV1Api(cls.api_client) cls.custom_objects_api = kubernetes_asyncio.client.CustomObjectsApi(cls.api_client) - if cls.operator_mode == 'manager': - await cls.assign_resource_handlers(logger=logger) - await cls.start_resource_handlers(logger=logger) - elif cls.operator_mode == 'all-in-one': - await cls.clear_resource_handler_assignments(logger=logger) - - @classmethod - async def assign_resource_handlers(cls, logger: kopf.ObjectLogger): - """Label ResourceHandles and ResourcePools to match to appropriate handlers. 
- Clear any extraneous finalizers.""" - for plural in ('resourcehandles', 'resourcepools'): - _continue = None - while True: - obj_list = await Poolboy.custom_objects_api.list_namespaced_custom_object( - group=Poolboy.operator_domain, - namespace=Poolboy.namespace, - plural=plural, - version=Poolboy.operator_version, - _continue = _continue, - limit = 50, - ) - for item in obj_list.get('items', []): - kind = item['kind'] - name = item['metadata']['name'] - patch = [] - resource_handler_idx = int(UUID(item['metadata']['uid'])) % cls.resource_handler_count - if resource_handler_idx != int(item['metadata'].get('labels', {}).get(cls.resource_handler_idx_label, '-1')): - if 'labels' in item['metadata']: - patch.append({ - "op": "add", - "path": "/metadata/labels", - "value": { - cls.resource_handler_idx_label: str(resource_handler_idx) - } - }) - else: - patch.append({ - "op": "add", - "path": f"/metadata/labels/{cls.resource_handler_idx_label.replace('/', '~1')}", - "value": str(resource_handler_idx), - }) - if 'finalizers' in item['metadata']: - clean_finalizers = [ - entry for entry in item['metadata']['finalizers'] - if entry == f"{Poolboy.operator_domain}/resource-handler-{resource_handler_idx}" - or not entry.startswith(f"{Poolboy.operator_domain}/resource-handler-") - ] - if clean_finalizers != item['metadata']['finalizers']: - patch.append({ - "op": "replace", - "path": "/metadata/finalizers", - "value": clean_finalizers, - }) - if patch: - logger.info( - f"Patching {kind} {name} to assign resource handler" - ) - try: - await Poolboy.custom_objects_api.patch_namespaced_custom_object( - group=Poolboy.operator_domain, - name=item['metadata']['name'], - namespace=Poolboy.namespace, - plural=plural, - version=Poolboy.operator_version, - body=patch, - _content_type = 'application/json-patch+json', - ) - except: - logger.exception("Patch failed.") - - _continue = obj_list['metadata'].get('continue') - if not _continue: - break + # Always run migration cleanup on startup + # TODO: Remove after all production clusters migrated + await cls.clear_resource_handler_assignments(logger=logger) + # TODO: Remove after all production clusters migrated @classmethod async def clear_resource_handler_assignments(cls, logger: kopf.ObjectLogger): - """Remove labels and finalizers applied to run in manager mode.""" + """Remove labels and finalizers from legacy manager mode. 
Keep for migration.""" + handler_finalizer = f"{cls.operator_domain}/handler" for plural in ('resourcehandles', 'resourcepools'): _continue = None while True: @@ -172,9 +112,11 @@ async def clear_resource_handler_assignments(cls, logger: kopf.ObjectLogger): "path": f"/metadata/labels/{cls.resource_handler_idx_label.replace('/', '~1')}", }) if 'finalizers' in item['metadata']: + # Clean both /resource-handler-* AND /handler patterns clean_finalizers = [ entry for entry in item['metadata']['finalizers'] if not entry.startswith(f"{Poolboy.operator_domain}/resource-handler-") + and not entry.startswith(handler_finalizer) # covers /handler and /handler-N ] if clean_finalizers != item['metadata']['finalizers']: patch.append({ @@ -202,81 +144,3 @@ async def clear_resource_handler_assignments(cls, logger: kopf.ObjectLogger): _continue = obj_list['metadata'].get('continue') if not _continue: break - - @classmethod - async def start_resource_handlers(cls, logger: kopf.ObjectLogger): - cls.manager_pod = await cls.core_v1_api.read_namespaced_pod( - name=os.environ['HOSTNAME'], - namespace=cls.namespace, - ) - logger.info(f"Manager running in pod {cls.manager_pod.metadata.name}") - for idx in range(Poolboy.resource_handler_count): - replicaset = kubernetes_asyncio.client.V1ReplicaSet( - api_version="apps/v1", - kind="ReplicaSet", - metadata=kubernetes_asyncio.client.V1ObjectMeta( - name=f"{cls.manager_pod.metadata.name}-handler-{idx}", - namespace=cls.namespace, - owner_references=[ - kubernetes_asyncio.client.V1OwnerReference( - api_version=cls.manager_pod.api_version, - controller=True, - kind=cls.manager_pod.kind, - name=cls.manager_pod.metadata.name, - uid=cls.manager_pod.metadata.uid, - ) - ] - ), - ) - replicaset.spec = kubernetes_asyncio.client.V1ReplicaSetSpec( - replicas=1, - selector=kubernetes_asyncio.client.V1LabelSelector( - match_labels={ - "app.kubernetes.io/name": cls.manager_pod.metadata.name, - "app.kubernetes.io/instance": f"handler-{idx}", - }, - ), - template=kubernetes_asyncio.client.V1PodTemplateSpec( - metadata=kubernetes_asyncio.client.V1ObjectMeta( - labels={ - "app.kubernetes.io/name": cls.manager_pod.metadata.name, - "app.kubernetes.io/instance": f"handler-{idx}", - }, - ), - spec=deepcopy(cls.manager_pod.spec), - ), - ) - - replicaset.spec.template.spec.containers[0].env = [ - env_var - for env_var in cls.manager_pod.spec.containers[0].env - if env_var.name not in { - 'OPERATOR_MODE', - 'RESOURCE_HANDLER_RESOURCES', - 'RESOURCE_WATCH_RESOURCES', - } - ] - replicaset.spec.template.spec.containers[0].env.append( - kubernetes_asyncio.client.V1EnvVar( - name='OPERATOR_MODE', - value='resource-handler', - ) - ) - replicaset.spec.template.spec.containers[0].env.append( - kubernetes_asyncio.client.V1EnvVar( - name='RESOURCE_HANDLER_IDX', - value=str(idx), - ) - ) - replicaset.spec.template.spec.node_name = None - if cls.resource_handler_resources: - replicaset.spec.template.spec.containers[0].resources = kubernetes_asyncio.client.V1ResourceRequirements( - limits=cls.resource_handler_resources.get('limits'), - requests=cls.resource_handler_resources.get('requests'), - ) - - replicaset = await cls.apps_v1_api.create_namespaced_replica_set( - namespace=cls.namespace, - body=replicaset, - ) - logger.info(f"Created ReplicaSet {replicaset.metadata.name}") diff --git a/operator/processor/__init__.py b/operator/processor/__init__.py new file mode 100644 index 0000000..7bcbafa --- /dev/null +++ b/operator/processor/__init__.py @@ -0,0 +1,11 @@ +""" +Celery processor module for Poolboy. 
+ +Imports should be done directly from submodules to avoid circular imports: + from processor.app import app, WorkerState + +This __init__.py intentionally does NOT import from .app to prevent +circular import issues when tasks import processor components. +""" + +__all__ = ["app", "config"] diff --git a/operator/processor/app.py b/operator/processor/app.py new file mode 100644 index 0000000..7448168 --- /dev/null +++ b/operator/processor/app.py @@ -0,0 +1,425 @@ +""" +Worker application for Poolboy. + +Single class that manages: +- Celery app creation and configuration +- Signal handlers (worker lifecycle, task context) +- Task routing to partitioned queues +- Async bridge for running async code in sync tasks + +Note: Celery is an implementation detail, not exposed in public API. +""" + +import asyncio +import os +from contextvars import ContextVar +from functools import lru_cache +from typing import TypeVar + +import aiohttp +from celery import Celery, signals +from celery.utils.log import get_task_logger +from kombu import Queue +from metrics import TimerDecoratorMeta + +from .config import WorkerConfig + +logger = get_task_logger(__name__) +T = TypeVar("T") + + +# ============================================================================= +# TaskRouter - Convention-based routing +# ============================================================================= + + +class TaskRouter: + """ + Route tasks to queues based on module naming convention. + + Convention: + - Task module: tasks.{module}.{task_name} + - Resource type: derived from module (resourcepool -> resource_pool) + - Entity name: module without 'resource' prefix (resourcepool -> pool) + - Kwargs: {entity}_name, {entity}_namespace + + Examples: + tasks.resourcepool.create_handles -> queue: resource_pool_0 + (uses pool_name, pool_namespace from kwargs) + tasks.resourceclaim.bind -> queue: resource_claim_2 + (uses claim_name, claim_namespace from kwargs) + tasks.cleanup.delete_old -> queue: cleanup + (no partitioning if PARTITION_CLEANUP not set) + + Configuration: + Partitioning is controlled via environment variables: + - PARTITION_RESOURCE_POOL=4 -> 4 partitions for resource_pool + - PARTITION_RESOURCE_CLAIM=8 -> 8 partitions for resource_claim + - (not set) -> no partitioning, uses simple queue name + """ + + def __call__( + self, name: str, args: tuple, kwargs: dict, options: dict, task=None, **kw + ) -> dict | None: + """Make router callable for Celery's task_routes.""" + return self.route(name, kwargs) + + def get_entity_from_module(self, module: str) -> str: + """ + Extract entity name from module name. 
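        The literal "resource" prefix is eight characters long, so module[8:]
        drops it whenever the module name is longer than the prefix; any other
        module name is returned unchanged.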
+ + Examples: + resourcepool -> pool + resourceclaim -> claim + cleanup -> cleanup + """ + if module.startswith("resource") and len(module) > 8: + return module[8:] # resourcepool -> pool + return module + + def get_partitions(self, resource_type: str) -> int: + """Get number of partitions for a resource type.""" + env_key = f"PARTITION_{resource_type.upper()}" + value = os.environ.get(env_key) + return int(value) if value else 0 + + def get_queue_name( + self, resource_type: str, resource_name: str, namespace: str, partitions: int + ) -> str: + """Calculate partitioned queue name using consistent hashing.""" + import hashlib + + resource_key = f"{namespace}/{resource_name}" + hash_value = int(hashlib.md5(resource_key.encode()).hexdigest(), 16) + partition_index = hash_value % partitions + return f"{resource_type}_{partition_index}" + + def get_resource_type(self, module: str) -> str: + """ + Convert module name to resource type. + + Examples: + resourcepool -> resource_pool + resourceclaim -> resource_claim + cleanup -> cleanup + """ + if module.startswith("resource") and len(module) > 8: + return f"resource_{module[8:]}" + return module + + def parse_task_name(self, name: str) -> tuple[str, str] | None: + """Parse task name to extract module.""" + parts = name.split(".") + if len(parts) >= 3 and parts[0] == "tasks": + return parts[1], parts[2] + return None + + def route(self, name: str, kwargs: dict) -> dict | None: + """Route a task to appropriate queue based on convention.""" + parsed = self.parse_task_name(name) + if not parsed: + return None + + module, _ = parsed + resource_type = self.get_resource_type(module) + partitions = self.get_partitions(resource_type) + + # No partitioning configured - use default queue + if not partitions: + return {"queue": "default"} + + # Get resource identifier from kwargs using convention + # Fallback to generic 'name' and 'namespace' if entity-specific not found + entity = self.get_entity_from_module(module) + resource_name = kwargs.get(f"{entity}_name") or kwargs.get("name") + namespace = kwargs.get(f"{entity}_namespace") or kwargs.get( + "namespace", "default" + ) + + if resource_name: + queue = self.get_queue_name( + resource_type, resource_name, namespace, partitions + ) + return {"queue": queue} + + # No resource identifier - use default queue + return {"queue": "default"} + + +# ============================================================================= +# WorkerState - Process-level state management +# ============================================================================= + +# Task context for distributed tracing +task_context: ContextVar[str | None] = ContextVar("task_context", default=None) + + +class WorkerState: + """ + Manages worker process state. + + Uses class-level attributes (like Poolboy) instead of module globals. 
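    Typical task-side usage is a single call into the worker's event loop
    (sketch; handle_claim is a hypothetical coroutine, not defined here):

        result = WorkerState.run_async(handle_claim(name, namespace))
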
+ Provides clear initialization and cleanup lifecycle with resilience: + - Lazy initialization as fallback + - Max connection age to prevent stale connections + - Automatic reconnect on error + """ + + loop: asyncio.AbstractEventLoop | None = None + k8s_initialized: bool = False + initialized_at: float = 0 + MAX_CONNECTION_AGE: int = 300 # 5 minutes + + @classmethod + def cleanup(cls, log): + """Cleanup resources when worker process shuts down.""" + # Mark as not initialized first to prevent new tasks from starting + cls.k8s_initialized = False + + # Cleanup distributed lock Redis client + from distributed_lock import DistributedLock + + DistributedLock.on_cleanup() + + if cls.loop and not cls.loop.is_closed(): + from poolboy import Poolboy + + cls.loop.run_until_complete(Poolboy.on_cleanup()) + cls.loop.close() + log.info("Worker state cleaned up") + + cls.loop = None + cls.initialized_at = 0 + + @classmethod + def initialize(cls, log): + """Initialize event loop and K8s client for this worker process.""" + import time + + # Initialize distributed lock Redis client + from distributed_lock import DistributedLock + + DistributedLock.on_startup() + + cls.loop = asyncio.new_event_loop() + asyncio.set_event_loop(cls.loop) + + from poolboy import Poolboy + + cls.loop.run_until_complete(Poolboy.on_startup(logger=log)) + cls.k8s_initialized = True + cls.initialized_at = time.time() + + @classmethod + def _is_connection_stale(cls) -> bool: + """Check if connection has exceeded max age.""" + import time + + if cls.initialized_at == 0: + return True + elapsed = time.time() - cls.initialized_at + return elapsed > cls.MAX_CONNECTION_AGE + + @classmethod + def _ensure_initialized(cls): + """Ensure connection is initialized and fresh (lazy init + max age).""" + not_ready = not cls.k8s_initialized or cls.loop is None or cls.loop.is_closed() + if not_ready: + logger.warning("WorkerState not initialized, lazy init...") + cls.initialize(logger) + elif cls._is_connection_stale(): + logger.info("K8s connection stale, refreshing...") + cls.cleanup(logger) + cls.initialize(logger) + + @classmethod + def run_async(cls, coro): + """ + Execute async code in the worker's event loop. + + Features: + - Lazy initialization if not ready + - Automatic refresh if connection is stale + - Automatic reconnect on error + """ + cls._ensure_initialized() + + try: + return cls.loop.run_until_complete(coro) + except aiohttp.ClientError as e: + # Connection error - cleanup stale connection, let Celery retry + logger.warning(f"K8s connection error, cleaning up: {e}") + cls.cleanup(logger) + raise # Celery will retry with fresh connection + # Note: K8sApiException (404, 409, etc.) are API errors, not connection + # errors - they propagate normally for task logic to handle + + +# ============================================================================= +# WorkerApp +# ============================================================================= + + +class WorkerApp(metaclass=TimerDecoratorMeta): + """ + Worker application factory for Poolboy. + + Responsibilities: + - Create and configure worker app from WorkerConfig dataclass + - Setup task queues from environment variables + - Configure task routing via TaskRouter (convention-based) + - Connect signal handlers for worker lifecycle + """ + + def __init__(self, config: WorkerConfig | None = None): + """ + Initialize worker application. + + Args: + config: WorkerConfig instance. If None, creates from env vars. 
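
        Example (sketch; mirrors the module-level singleton created at the
        bottom of this file):

            worker = WorkerApp()        # WorkerConfig() built from env vars
            celery_app = worker.app     # configured Celery instance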
+ """ + self.config = config or WorkerConfig() + self.router = TaskRouter() + self.app = Celery("poolboy") + + self._configure_app() + self._configure_queues() + self._connect_signals() + self._setup_autodiscover() + + def _configure_app(self): + """Apply configuration from dataclass.""" + self.app.config_from_object(self.config.to_celery_config()) + + def _configure_queues(self): + """Configure task queues and routing.""" + queue_names = self._get_all_queues() + self.app.conf.task_queues = [Queue(q) for q in queue_names] + self.app.conf.task_default_queue = "default" + self.app.conf.task_routes = (self.router,) + + def _get_all_queues(self) -> list[str]: + """Generate queue names (default + partitioned).""" + queues = ["default"] + + # Partitioned queues (e.g., 'resource_pool_0', 'resource_pool_1') + config = self._get_partition_config() + for resource_type, partition_count in config.items(): + for i in range(partition_count): + queues.append(f"{resource_type}_{i}") + + return queues + + @staticmethod + @lru_cache(maxsize=1) + def _get_partition_config() -> dict[str, int]: + """Get partition configuration from environment variables.""" + resource_types = [ + "cleanup", + "resource_claim", + "resource_handle", + "resource_pool", + "resource_provider", + "resource_watch", + ] + config = {} + for resource_type in resource_types: + env_key = f"PARTITION_{resource_type.upper()}" + value = os.environ.get(env_key) + if value: + config[resource_type] = int(value) + return config + + def _connect_signals(self): + """Connect all signal handlers to the Celery app.""" + signals.worker_init.connect(self._on_worker_init) + signals.worker_shutdown.connect(self._on_worker_shutdown) + signals.worker_process_init.connect(self._on_worker_process_init) + shutdown_signal = signals.worker_process_shutdown + shutdown_signal.connect(self._on_worker_process_shutdown) + signals.task_prerun.connect(self._on_task_prerun) + signals.task_postrun.connect(self._on_task_postrun) + + @staticmethod + def _on_worker_init(**kwargs): + """Initialize metrics server when main worker process starts.""" + if os.environ.get("WORKER_METRICS_ENABLED", "true").lower() != "true": + return + + from metrics import MetricsService + + port = int(os.environ.get("WORKER_METRICS_PORT", "9090")) + MetricsService.start(port=port) + logger.info(f"Worker metrics server started on port {port}") + + @staticmethod + def _on_worker_shutdown(**kwargs): + """Stop metrics server and cleanup when worker shuts down.""" + from metrics import MetricsService + + if MetricsService._server is not None: + MetricsService.stop() + logger.info("Worker metrics server stopped") + + @staticmethod + def _on_worker_process_init(**kwargs): + """Initialize event loop and K8s client when worker process starts.""" + from cache import Cache + + Cache.initialize(standalone=False) + WorkerState.initialize(logger) + + @staticmethod + def _on_worker_process_shutdown(**kwargs): + """Cleanup when worker process shuts down.""" + WorkerState.cleanup(logger) + + @staticmethod + def _on_task_prerun(task_id=None, **kwargs): + """Set task context before execution.""" + if task_id: + task_context.set(task_id) + + @staticmethod + def _on_task_postrun(task_id=None, **kwargs): + """Clear task context after execution.""" + if task_id: + task_context.set(None) + + def _setup_autodiscover(self): + """Configure task autodiscovery.""" + self.app.autodiscover_tasks(["tasks"]) + + +# ============================================================================= +# Module-level exports +# 
============================================================================= + +# Create singleton and export app +worker_app = WorkerApp() +app = worker_app.app + + +# ============================================================================= +# Beat Schedule Setup (after all tasks are discovered) +# ============================================================================= + + +@app.on_after_finalize.connect +def setup_periodic_tasks(sender, **kwargs): + """ + Configure Celery Beat schedule after app is fully initialized. + + This runs after all tasks have been discovered and registered, + avoiding circular import issues. + """ + enabled = os.environ.get("CELERY_SCHEDULER_ENABLED", "false") + if enabled.lower() != "true": + return + + # Import tasks to trigger @register_schedule decorators + import tasks # noqa: F401 + from scheduler.scheduler import setup_beat_schedule + + sender.conf.beat_schedule = setup_beat_schedule() + logger.info("Beat schedule configured") diff --git a/operator/processor/config.py b/operator/processor/config.py new file mode 100644 index 0000000..5b269e0 --- /dev/null +++ b/operator/processor/config.py @@ -0,0 +1,98 @@ +""" +Worker configuration dataclass for Poolboy. + +Reads all configuration from environment variables with sensible defaults. +This allows Helm to configure workers via ConfigMaps. +""" + +import os +from dataclasses import dataclass, field, fields + + +def _env_bool(key: str, default: bool = False) -> bool: + """Read boolean from environment variable.""" + return os.environ.get(key, str(default)).lower() == 'true' + + +def _env_int(key: str, default: int) -> int: + """Read integer from environment variable.""" + return int(os.environ.get(key, default)) + + +def _env_str(key: str, default: str) -> str: + """Read string from environment variable.""" + return os.environ.get(key, default) + + +@dataclass +class WorkerConfig: + """ + Worker configuration loaded from environment variables. + + Known fields are defined with explicit types. Additional CELERY_* env vars + are loaded dynamically into _extras and included in to_celery_config(). 
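
    Example (illustrative values set in the worker environment):

        CELERY_BROKER_URL=redis://redis:6379/0    -> broker_url field
        CELERY_WORKER_MAX_TASKS_PER_CHILD=100     -> no matching field, parsed
                                                     to int and kept in _extras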
+ """ + + # Hardcoded (never change) + accept_content: list[str] = field(default_factory=lambda: ['json']) + broker_connection_retry_on_startup: bool = True + result_serializer: str = 'json' + task_serializer: str = 'json' + # Configurable via env vars + broker_url: str = _env_str( + 'CELERY_BROKER_URL', 'redis://localhost:6379/0') + result_backend: str = _env_str( + 'CELERY_RESULT_BACKEND', 'redis://localhost:6379/1') + result_expires: int = _env_int('CELERY_RESULT_EXPIRES', 3600) + result_extended: bool = _env_bool('CELERY_RESULT_EXTENDED', True) + task_ack_late: bool = _env_bool('CELERY_TASK_ACK_LATE', True) + task_default_retry_delay: int = _env_int( + 'CELERY_TASK_DEFAULT_RETRY_DELAY', 60) + task_default_retry_delay_max: int = _env_int( + 'CELERY_TASK_DEFAULT_RETRY_DELAY_MAX', 600) + task_reject_on_worker_lost: bool = _env_bool( + 'CELERY_TASK_REJECT_ON_WORKER_LOST', True) + task_soft_time_limit: int = _env_int('CELERY_TASK_SOFT_TIME_LIMIT', 1740) + task_time_limit: int = _env_int('CELERY_TASK_TIME_LIMIT', 1800) + worker_prefetch_multiplier: int = _env_int( + 'CELERY_WORKER_PREFETCH_MULTIPLIER', 1) + worker_send_task_events: bool = _env_bool( + 'CELERY_WORKER_SEND_TASK_EVENTS', True) + task_send_sent_event: bool = _env_bool( + 'CELERY_TASK_SEND_SENT_EVENT', True) + # Dynamic extras (populated in __post_init__) + _extras: dict = field(default_factory=dict, init=False, repr=False) + + def __post_init__(self): + """Load additional CELERY_* env vars not defined as fields.""" + known = {f.name for f in fields(self) if not f.name.startswith('_')} + for key, value in os.environ.items(): + if key.startswith('CELERY_'): + field_name = key[7:].lower() # Remove CELERY_ prefix + if field_name not in known: + self._extras[field_name] = self._parse_value(value) + + @staticmethod + def _parse_value(value: str): + """Parse string value to appropriate type using heuristics.""" + if value.lower() in ('true', 'false'): + return value.lower() == 'true' + try: + return int(value) + except ValueError: + pass + try: + return float(value) + except ValueError: + pass + return value + + def to_celery_config(self) -> dict: + """Convert to Celery configuration dict including extras.""" + config = { + f.name: getattr(self, f.name) + for f in fields(self) + if not f.name.startswith('_') + } + config.update(self._extras) + return config diff --git a/operator/resourceclaim.py b/operator/resourceclaim.py index e92d6d7..d1330b5 100644 --- a/operator/resourceclaim.py +++ b/operator/resourceclaim.py @@ -1,21 +1,18 @@ import asyncio - from copy import deepcopy from datetime import datetime, timezone from typing import List, Mapping, TypeVar -from uuid import UUID import kopf import kubernetes_asyncio - +import resourcehandle +import resourceprovider +from cache import Cache, CacheTag from deep_merge import deep_merge from kopfobject import KopfObject from poolboy import Poolboy from poolboy_templating import recursive_process_template_strings -import resourcehandle -import resourceprovider - ResourceClaimT = TypeVar('ResourceClaimT', bound='ResourceClaim') ResourceHandleT = TypeVar('ResourceHandleT', bound='ResourceHandle') ResourceProviderT = TypeVar('ResourceProviderT', bound='ResourceProvider') @@ -43,7 +40,7 @@ def prune_k8s_resource(resource: Mapping) -> Mapping: ret["status"] = { key: value for key, value in resource['status'].items() - if not key in {'diffBase'} + if key not in {'diffBase'} } return ret @@ -53,14 +50,14 @@ class ResourceClaim(KopfObject): kind = "ResourceClaim" plural = "resourceclaims" - 
instances = {} class_lock = asyncio.Lock() @classmethod def __register_definition(cls, definition: Mapping) -> ResourceClaimT: name = definition['metadata']['name'] namespace = definition['metadata']['namespace'] - resource_claim = cls.instances.get((namespace, name)) + cache_key = f"{namespace}/{name}" + resource_claim = cls.cache_get(CacheTag.CLAIM, cache_key) if resource_claim: resource_claim.refresh_from_definition(definition=definition) else: @@ -74,14 +71,17 @@ def __register_definition(cls, definition: Mapping) -> ResourceClaimT: status = definition.get('status', {}), uid = definition['metadata']['uid'], ) - cls.instances[(namespace, name)] = resource_claim + resource_claim.cache_set(CacheTag.CLAIM, cache_key, ttl=300) return resource_claim @classmethod async def get(cls, name: str, namespace: str, use_cache: bool=True) -> ResourceClaimT: async with cls.class_lock: - if use_cache and (namespace, name) in cls.instances: - return cls.instances[(namespace, name)] + cache_key = f"{namespace}/{name}" + if use_cache: + cached = cls.cache_get(CacheTag.CLAIM, cache_key) + if cached: + return cached definition = await Poolboy.custom_objects_api.get_namespaced_custom_object( group=cls.api_group, name=name, @@ -106,7 +106,8 @@ async def register( uid: str, ) -> ResourceClaimT: async with cls.class_lock: - resource_claim = cls.instances.get((namespace, name)) + cache_key = f"{namespace}/{name}" + resource_claim = cls.cache_get(CacheTag.CLAIM, cache_key) if resource_claim: resource_claim.refresh( annotations = annotations, @@ -127,7 +128,7 @@ async def register( status = status, uid = uid, ) - cls.instances[(namespace, name)] = resource_claim + resource_claim.cache_set(CacheTag.CLAIM, cache_key, ttl=300) return resource_claim @classmethod @@ -141,7 +142,11 @@ async def register_definition( @classmethod async def unregister(cls, name: str, namespace: str) -> ResourceClaimT|None: async with cls.class_lock: - return cls.instances.pop((namespace, name), None) + cache_key = f"{namespace}/{name}" + resource_claim = cls.cache_get(CacheTag.CLAIM, cache_key) + if resource_claim: + Cache.delete(CacheTag.CLAIM, cache_key) + return resource_claim @property def approval_state(self) -> str|None: @@ -181,11 +186,11 @@ def has_spec_resources(self) -> bool: def have_resource_providers(self) -> bool: """Return whether this ResourceClaim has ResourceProviders assigned for all resources.""" if not self.status \ - or not 'resources' in self.status \ + or 'resources' not in self.status \ or len(self.spec.get('resources', [])) > len(self.status.get('resources', [])): return False for resource in self.status.get('resources', []): - if not 'provider' in resource: + if 'provider' not in resource: return False return True @@ -299,11 +304,6 @@ def resource_handle_namespace(self): return None return self.status.get('resourceHandle', {}).get('namespace') - @property - def resource_handler_idx(self) -> int: - """Label value used to select which resource handler pod should manage this ResourceClaim.""" - return int(UUID(self.uid)) % Poolboy.resource_handler_count - @property def resource_pool_name(self): if not self.annotations: @@ -346,34 +346,6 @@ def validation_failed(self) -> bool: return True return False - async def assign_resource_handler(self): - """Apply label to indicate resource handler should manage this ResourceClaim. 
- Do not change label on items which are deleting.""" - if ( - self.deletion_timestamp is None and - self.labels.get(Poolboy.resource_handler_idx_label) != str(self.resource_handler_idx) - ): - try: - patch = [{ - "op": "test", - "path": "/metadata/deletionTimestamp", - "value": None, - }] - patch.append({ - "op": "add", - "path": f"/metadata/labels/{Poolboy.resource_handler_idx_label.replace('/', '~1')}", - "value": str(self.resource_handler_idx), - } if self.labels else { - "op": "add", - "path": f"/metadata/labels", - "value": { - Poolboy.resource_handler_idx_label: str(self.resource_handler_idx), - } - }) - await self.json_patch(patch) - except kubernetes_asyncio.client.exceptions.ApiException as exception: - pass - async def bind_resource_handle(self, logger: kopf.ObjectLogger, resource_claim_resources: List[Mapping], @@ -473,7 +445,7 @@ def check_auto_detach(self, logger, resource_handle, resource_provider): def get_resource_state_from_status(self, resource_index): if not self.status \ - or not 'resources' in self.status \ + or 'resources' not in self.status \ or resource_index >= len(self.status['resources']): return None return self.status['resources'][resource_index].get('state') @@ -505,7 +477,7 @@ async def update_status_from_handle(self, # Adjust requested end if unchanged from default if not self.requested_lifespan_end_datetime \ or lifespan_default_timedelta == self.requested_lifespan_end_datetime - self.lifespan_start_datetime: - logger.info(f"Resetting default lifespan end on first ready") + logger.info("Resetting default lifespan end on first ready") await self.set_requested_lifespan_end( datetime.now(timezone.utc) + lifespan_default_timedelta ) @@ -671,7 +643,7 @@ async def assign_resource_providers(self, logger) -> None: elif 'template' in resource: provider = resourceprovider.ResourceProvider.find_provider_by_template_match(resource['template']) else: - raise kopf.TemporaryError(f"ResourceClaim spec.resources require either an explicit provider or a resource template to match.", delay=600) + raise kopf.TemporaryError("ResourceClaim spec.resources require either an explicit provider or a resource template to match.", delay=600) providers.append(provider) await self.merge_patch_status({ @@ -780,7 +752,7 @@ async def manage(self, logger) -> None: f"{self} has both spec.provider and spec.resources!", delay = 600 ) - if not 'provider' in self.status: + if 'provider' not in self.status: await self.merge_patch_status({ "provider": { "name": self.resource_provider_name_from_spec @@ -818,7 +790,7 @@ async def manage(self, logger) -> None: }) if resource_provider.approval_required: - if not 'approval' in self.status: + if 'approval' not in self.status: await self.merge_patch_status({ "approval": { "message": resource_provider.approval_pending_message, @@ -959,7 +931,7 @@ async def __manage_resource_handle(self, set_lifespan_end_timestamp = set_lifespan_end.strftime('%FT%TZ') - if not 'lifespan' in resource_handle.spec: + if 'lifespan' not in resource_handle.spec: logger.info(f"Setting lifespan end for {resource_handle} to {set_lifespan_end_timestamp}") patch.append({ "op": "add", diff --git a/operator/resourcehandle.py b/operator/resourcehandle.py index e0e5974..2e75e44 100644 --- a/operator/resourcehandle.py +++ b/operator/resourcehandle.py @@ -1,31 +1,28 @@ import asyncio -import logging - from copy import deepcopy from datetime import datetime, timedelta, timezone from typing import Any, List, Mapping, TypeVar -from uuid import UUID import jinja2 import jsonpointer import kopf 
import kubernetes_asyncio -import pytimeparse - import poolboy_k8s +import pytimeparse import resourceclaim import resourcepool import resourceprovider import resourcewatch - +from cache import Cache, CacheTag from kopfobject import KopfObject from poolboy import Poolboy from poolboy_templating import recursive_process_template_strings, timedelta_to_str -ResourceClaimT = TypeVar('ResourceClaimT', bound='ResourceClaim') -ResourceHandleT = TypeVar('ResourceHandleT', bound='ResourceHandle') -ResourcePoolT = TypeVar('ResourcePoolT', bound='ResourcePool') -ResourceProviderT = TypeVar('ResourceProviderT', bound='ResourceProvider') +ResourceClaimT = TypeVar("ResourceClaimT", bound="ResourceClaim") +ResourceHandleT = TypeVar("ResourceHandleT", bound="ResourceHandle") +ResourcePoolT = TypeVar("ResourcePoolT", bound="ResourcePool") +ResourceProviderT = TypeVar("ResourceProviderT", bound="ResourceProvider") + class ResourceHandleMatch: def __init__(self, resource_handle): @@ -35,7 +32,7 @@ def __init__(self, resource_handle): self.template_difference_count = 0 def __lt__(self, cmp): - '''Compare matches by preference''' + """Compare matches by preference""" if self.resource_count_difference < cmp.resource_count_difference: return True if self.resource_count_difference > cmp.resource_count_difference: @@ -51,26 +48,21 @@ def __lt__(self, cmp): if self.template_difference_count > cmp.template_difference_count: return False - # Prefer healthy resources to unknown health state - if self.resource_handle.is_healthy and cmp.resource_handle.is_healthy is None: + if self.resource_handle.is_healthy and cmp.resource_handle.is_healthy is False: return True - if self.resource_handle.is_healthy is None and cmp.resource_handle.is_healthy: + if self.resource_handle.is_healthy is False and cmp.resource_handle.is_healthy: return False - # Prefer ready resources to unready or unknown readiness state - if self.resource_handle.is_ready and not cmp.resource_handle.is_ready: + if self.resource_handle.is_ready and cmp.resource_handle.is_ready is False: return True - if not self.resource_handle.is_ready and cmp.resource_handle.is_ready: + if self.resource_handle.is_ready is False and cmp.resource_handle.is_ready: return False - # Prefer unknown readiness state to known unready state - if self.resource_handle.is_ready is None and cmp.resource_handle.is_ready is False: - return True - if not self.resource_handle.is_ready is False and cmp.resource_handle.is_ready is None: - return False + return ( + self.resource_handle.creation_timestamp + < cmp.resource_handle.creation_timestamp + ) - # Prefer older matches - return self.resource_handle.creation_timestamp < cmp.resource_handle.creation_timestamp class ResourceHandle(KopfObject): api_group = Poolboy.operator_domain @@ -78,28 +70,24 @@ class ResourceHandle(KopfObject): kind = "ResourceHandle" plural = "resourcehandles" - all_instances = {} - bound_instances = {} - unbound_instances = {} class_lock = asyncio.Lock() - watch_other_task = None @classmethod def __register_definition(cls, definition: Mapping) -> ResourceHandleT: - name = definition['metadata']['name'] - resource_handle = cls.all_instances.get(name) + name = definition["metadata"]["name"] + resource_handle = cls.cache_get(CacheTag.HANDLE, name) if resource_handle: resource_handle.refresh_from_definition(definition=definition) else: resource_handle = cls( - annotations = definition['metadata'].get('annotations', {}), - labels = definition['metadata'].get('labels', {}), - meta = definition['metadata'], - name = name, - 
namespace = Poolboy.namespace, - spec = definition['spec'], - status = definition.get('status', {}), - uid = definition['metadata']['uid'], + annotations=definition["metadata"].get("annotations", {}), + labels=definition["metadata"].get("labels", {}), + meta=definition["metadata"], + name=name, + namespace=Poolboy.namespace, + spec=definition["spec"], + status=definition.get("status", {}), + uid=definition["metadata"]["uid"], ) resource_handle.__register() return resource_handle @@ -110,10 +98,11 @@ async def bind_handle_to_claim( logger: kopf.ObjectLogger, resource_claim: ResourceClaimT, resource_claim_resources: List[Mapping], - ) -> ResourceHandleT|None: + ) -> ResourceHandleT | None: async with cls.class_lock: # Check if there is already an assigned claim - resource_handle = cls.bound_instances.get((resource_claim.namespace, resource_claim.name)) + bound_key = f"{resource_claim.namespace}/{resource_claim.name}" + resource_handle = cls.cache_get(CacheTag.HANDLE_BOUND, bound_key) if resource_handle: if await resource_handle.refetch(): logger.warning(f"Rebinding {resource_handle} to {resource_claim}") @@ -124,19 +113,27 @@ async def bind_handle_to_claim( # Loop through unbound instances to find best match matches = [] - for resource_handle in cls.unbound_instances.values(): + for name in Cache.get_keys_by_tag(CacheTag.HANDLE_UNBOUND): + resource_handle = cls.cache_get(CacheTag.HANDLE_UNBOUND, name) + if not resource_handle: + continue # Skip unhealthy if resource_handle.is_healthy is False: continue # Honor explicit pool requests - if resource_claim.resource_pool_name \ - and resource_claim.resource_pool_name != resource_handle.resource_pool_name: + if ( + resource_claim.resource_pool_name + and resource_claim.resource_pool_name + != resource_handle.resource_pool_name + ): continue # Do not bind to handles that are near end of lifespan - if resource_handle.has_lifespan_end \ - and resource_handle.timedelta_to_lifespan_end.total_seconds() < 120: + if ( + resource_handle.has_lifespan_end + and resource_handle.timedelta_to_lifespan_end.total_seconds() < 120 + ): continue handle_resources = resource_handle.resources @@ -146,28 +143,32 @@ async def bind_handle_to_claim( continue match = ResourceHandleMatch(resource_handle) - match.resource_count_difference = len(resource_claim_resources) - len(handle_resources) + match.resource_count_difference = len(resource_claim_resources) - len( + handle_resources + ) for i, handle_resource in enumerate(handle_resources): claim_resource = resource_claim_resources[i] # ResourceProvider must match - provider_name = claim_status_resources[i]['provider']['name'] - if provider_name != handle_resource['provider']['name']: + provider_name = claim_status_resources[i]["provider"]["name"] + if provider_name != handle_resource["provider"]["name"]: match = None break # Check resource name match - claim_resource_name = claim_resource.get('name') - handle_resource_name = handle_resource.get('name') + claim_resource_name = claim_resource.get("name") + handle_resource_name = handle_resource.get("name") if claim_resource_name != handle_resource_name: match.resource_name_difference_count += 1 # Use provider to check if templates match and get list of allowed differences - provider = await resourceprovider.ResourceProvider.get(provider_name) + provider = await resourceprovider.ResourceProvider.get( + provider_name + ) diff_patch = provider.check_template_match( - handle_resource_template = handle_resource.get('template', {}), - claim_resource_template = 
claim_resource.get('template', {}), + handle_resource_template=handle_resource.get("template", {}), + claim_resource_template=claim_resource.get("template", {}), ) if diff_patch is None: match = None @@ -192,58 +193,79 @@ async def bind_handle_to_claim( "kind": "ResourceClaim", "name": resource_claim.name, "namespace": resource_claim.namespace, - } + }, } ] # Update ResourceProvider to match ResourceClaim if resource_claim.has_resource_provider: - patch.append({ - "op": "add", - "path": "/spec/provider", - "value": resource_claim.spec['provider'], - }) + patch.append( + { + "op": "add", + "path": "/spec/provider", + "value": resource_claim.spec["provider"], + } + ) # Set resource names and add any additional resources to handle - for resource_index, claim_resource in enumerate(resource_claim_resources): - resource_name = resource_claim_resources[resource_index].get('name') + for resource_index, claim_resource in enumerate( + resource_claim_resources + ): + resource_name = resource_claim_resources[resource_index].get("name") if resource_index < len(matched_resource_handle.resources): - handle_resource = matched_resource_handle.resources[resource_index] - if resource_name != handle_resource.get('name'): - patch.append({ - "op": "add", - "path": f"/spec/resources/{resource_index}/name", - "value": resource_name, - }) + handle_resource = matched_resource_handle.resources[ + resource_index + ] + if resource_name != handle_resource.get("name"): + patch.append( + { + "op": "add", + "path": f"/spec/resources/{resource_index}/name", + "value": resource_name, + } + ) else: patch_value = { - "provider": resource_claim_resources[resource_index]['provider'], + "provider": resource_claim_resources[resource_index][ + "provider" + ], } if resource_name: - patch_value['name'] = resource_name - patch.append({ - "op": "add", - "path": f"/spec/resources/{resource_index}", - "value": patch_value, - }) + patch_value["name"] = resource_name + patch.append( + { + "op": "add", + "path": f"/spec/resources/{resource_index}", + "value": patch_value, + } + ) # Set lifespan end from default on claim bind - lifespan_default = matched_resource_handle.get_lifespan_default(resource_claim) + lifespan_default = matched_resource_handle.get_lifespan_default( + resource_claim + ) if lifespan_default: - patch.append({ - "op": "add", - "path": "/spec/lifespan/end", - "value": ( - datetime.now(timezone.utc) + matched_resource_handle.get_lifespan_default_timedelta(resource_claim) - ).strftime('%FT%TZ'), - }) + patch.append( + { + "op": "add", + "path": "/spec/lifespan/end", + "value": ( + datetime.now(timezone.utc) + + matched_resource_handle.get_lifespan_default_timedelta( + resource_claim + ) + ).strftime("%FT%TZ"), + } + ) try: await matched_resource_handle.json_patch(patch) matched_resource_handle.__register() except kubernetes_asyncio.client.exceptions.ApiException as exception: if exception.status == 404: - logger.warning(f"Attempt to bind deleted {matched_resource_handle} to {resource_claim}") + logger.warning( + f"Attempt to bind deleted {matched_resource_handle} to {resource_claim}" + ) matched_resource_handle.__unregister() matched_resource_handle = None else: @@ -256,7 +278,9 @@ async def bind_handle_to_claim( return None if matched_resource_handle.is_from_resource_pool: - resource_pool = await resourcepool.ResourcePool.get(matched_resource_handle.resource_pool_name) + resource_pool = await resourcepool.ResourcePool.get( + matched_resource_handle.resource_pool_name + ) if resource_pool: await 
resource_pool.manage(logger=logger) else: @@ -267,30 +291,31 @@ async def bind_handle_to_claim( return matched_resource_handle @classmethod - async def create_for_claim(cls, + async def create_for_claim( + cls, logger: kopf.ObjectLogger, resource_claim: ResourceClaimT, resource_claim_resources: List[Mapping], ): definition = { - 'apiVersion': Poolboy.operator_api_version, - 'kind': 'ResourceHandle', - 'metadata': { - 'finalizers': [ Poolboy.operator_domain ], - 'generateName': 'guid-', - 'labels': { + "apiVersion": Poolboy.operator_api_version, + "kind": "ResourceHandle", + "metadata": { + "finalizers": [Poolboy.operator_domain], + "generateName": "guid-", + "labels": { Poolboy.resource_claim_name_label: resource_claim.name, Poolboy.resource_claim_namespace_label: resource_claim.namespace, - } + }, }, - 'spec': { - 'resourceClaim': { - 'apiVersion': Poolboy.operator_api_version, - 'kind': 'ResourceClaim', - 'name': resource_claim.name, - 'namespace': resource_claim.namespace + "spec": { + "resourceClaim": { + "apiVersion": Poolboy.operator_api_version, + "kind": "ResourceClaim", + "name": resource_claim.name, + "namespace": resource_claim.namespace, }, - } + }, } resources = [] @@ -301,45 +326,70 @@ async def create_for_claim(cls, lifespan_relative_maximum_timedelta = None if resource_claim.has_resource_provider: resource_provider = await resource_claim.get_resource_provider() - definition['spec']['resources'] = resource_claim_resources - definition['spec']['provider'] = resource_claim.spec['provider'] - lifespan_default_timedelta = resource_provider.get_lifespan_default_timedelta(resource_claim) + definition["spec"]["resources"] = resource_claim_resources + definition["spec"]["provider"] = resource_claim.spec["provider"] + lifespan_default_timedelta = ( + resource_provider.get_lifespan_default_timedelta(resource_claim) + ) lifespan_maximum = resource_provider.lifespan_maximum - lifespan_maximum_timedelta = resource_provider.get_lifespan_maximum_timedelta(resource_claim) + lifespan_maximum_timedelta = ( + resource_provider.get_lifespan_maximum_timedelta(resource_claim) + ) lifespan_relative_maximum = resource_provider.lifespan_relative_maximum - lifespan_relative_maximum_timedelta = resource_provider.get_lifespan_maximum_timedelta(resource_claim) + lifespan_relative_maximum_timedelta = ( + resource_provider.get_lifespan_maximum_timedelta(resource_claim) + ) else: - resource_providers = await resource_claim.get_resource_providers(resource_claim_resources) + resource_providers = await resource_claim.get_resource_providers( + resource_claim_resources + ) for i, claim_resource in enumerate(resource_claim_resources): provider = resource_providers[i] - provider_lifespan_default_timedelta = provider.get_lifespan_default_timedelta(resource_claim) + provider_lifespan_default_timedelta = ( + provider.get_lifespan_default_timedelta(resource_claim) + ) if provider_lifespan_default_timedelta: - if not lifespan_default_timedelta \ - or provider_lifespan_default_timedelta < lifespan_default_timedelta: + if ( + not lifespan_default_timedelta + or provider_lifespan_default_timedelta + < lifespan_default_timedelta + ): lifespan_default_timedelta = provider_lifespan_default_timedelta - provider_lifespan_maximum_timedelta = provider.get_lifespan_maximum_timedelta(resource_claim) + provider_lifespan_maximum_timedelta = ( + provider.get_lifespan_maximum_timedelta(resource_claim) + ) if provider_lifespan_maximum_timedelta: - if not lifespan_maximum_timedelta \ - or provider_lifespan_maximum_timedelta < 
lifespan_maximum_timedelta: + if ( + not lifespan_maximum_timedelta + or provider_lifespan_maximum_timedelta + < lifespan_maximum_timedelta + ): lifespan_maximum = provider.lifespan_maximum lifespan_maximum_timedelta = provider_lifespan_maximum_timedelta - provider_lifespan_relative_maximum_timedelta = provider.get_lifespan_relative_maximum_timedelta(resource_claim) + provider_lifespan_relative_maximum_timedelta = ( + provider.get_lifespan_relative_maximum_timedelta(resource_claim) + ) if provider_lifespan_relative_maximum_timedelta: - if not lifespan_relative_maximum_timedelta \ - or provider_lifespan_relative_maximum_timedelta < lifespan_relative_maximum_timedelta: + if ( + not lifespan_relative_maximum_timedelta + or provider_lifespan_relative_maximum_timedelta + < lifespan_relative_maximum_timedelta + ): lifespan_relative_maximum = provider.lifespan_relative_maximum - lifespan_relative_maximum_timedelta = provider_lifespan_relative_maximum_timedelta + lifespan_relative_maximum_timedelta = ( + provider_lifespan_relative_maximum_timedelta + ) resources_item = {"provider": provider.as_reference()} - if 'name' in claim_resource: - resources_item['name'] = claim_resource['name'] - if 'template' in claim_resource: - resources_item['template'] = claim_resource['template'] + if "name" in claim_resource: + resources_item["name"] = claim_resource["name"] + if "template" in claim_resource: + resources_item["template"] = claim_resource["template"] resources.append(resources_item) - definition['spec']['resources'] = resources + definition["spec"]["resources"] = resources lifespan_end_datetime = None lifespan_start_datetime = datetime.now(timezone.utc) @@ -349,21 +399,31 @@ async def create_for_claim(cls, elif lifespan_default_timedelta: lifespan_end_datetime = lifespan_start_datetime + lifespan_default_timedelta elif lifespan_relative_maximum_timedelta: - lifespan_end_datetime = lifespan_start_datetime + lifespan_relative_maximum_timedelta + lifespan_end_datetime = ( + lifespan_start_datetime + lifespan_relative_maximum_timedelta + ) elif lifespan_maximum_timedelta: lifespan_end_datetime = lifespan_start_datetime + lifespan_maximum_timedelta if lifespan_end_datetime: - if lifespan_relative_maximum_timedelta \ - and lifespan_end_datetime > lifespan_start_datetime + lifespan_relative_maximum_timedelta: + if ( + lifespan_relative_maximum_timedelta + and lifespan_end_datetime + > lifespan_start_datetime + lifespan_relative_maximum_timedelta + ): logger.warning( f"Requested lifespan end {resource_claim.requested_lifespan_end_timestamp} " f"for ResourceClaim {resource_claim.name} in {resource_claim.namespace} " f"exceeds relativeMaximum for ResourceProviders" ) - lifespan_end = lifespan_start_datetime + lifespan_relative_maximum_timedelta - if lifespan_maximum_timedelta \ - and lifespan_end_datetime > lifespan_start_datetime + lifespan_maximum_timedelta: + lifespan_end = ( + lifespan_start_datetime + lifespan_relative_maximum_timedelta + ) + if ( + lifespan_maximum_timedelta + and lifespan_end_datetime + > lifespan_start_datetime + lifespan_maximum_timedelta + ): logger.warning( f"Requested lifespan end {resource_claim.requested_lifespan_end_timestamp} " f"for ResourceClaim {resource_claim.name} in {resource_claim.namespace} " @@ -372,31 +432,33 @@ async def create_for_claim(cls, lifespan_end = lifespan_start_datetime + lifespan_maximum_timedelta if lifespan_default_timedelta: - definition['spec'].setdefault('lifespan', {})['default'] = timedelta_to_str(lifespan_default_timedelta) + 
definition["spec"].setdefault("lifespan", {})["default"] = timedelta_to_str( + lifespan_default_timedelta + ) if lifespan_end_datetime: - definition['spec'].setdefault('lifespan', {})['end'] = lifespan_end_datetime.strftime('%FT%TZ') + definition["spec"].setdefault("lifespan", {})["end"] = ( + lifespan_end_datetime.strftime("%FT%TZ") + ) if lifespan_maximum: - definition['spec'].setdefault('lifespan', {})['maximum'] = lifespan_maximum + definition["spec"].setdefault("lifespan", {})["maximum"] = lifespan_maximum if lifespan_relative_maximum: - definition['spec'].setdefault('lifespan', {})['relativeMaximum'] = lifespan_relative_maximum + definition["spec"].setdefault("lifespan", {})["relativeMaximum"] = ( + lifespan_relative_maximum + ) definition = await Poolboy.custom_objects_api.create_namespaced_custom_object( - body = definition, - group = Poolboy.operator_domain, - namespace = Poolboy.namespace, - plural = 'resourcehandles', - version = Poolboy.operator_version, + body=definition, + group=Poolboy.operator_domain, + namespace=Poolboy.namespace, + plural="resourcehandles", + version=Poolboy.operator_version, ) resource_handle = cls.from_definition(definition) - if ( - Poolboy.operator_mode_all_in_one or ( - Poolboy.operator_mode_resource_handler and - Poolboy.resource_handler_idx == resource_handle.resource_handler_idx - ) - ): + # Register in standalone mode (no handler partitioning) + if Poolboy.is_standalone: resource_handle.__register() logger.info( f"Created ResourceHandle {resource_handle.name} for " @@ -423,56 +485,68 @@ async def create_for_pool( "spec": { "resourcePool": resource_pool.reference, "vars": resource_pool.vars, - } + }, } if resource_pool.has_resource_provider: - definition['spec']['provider'] = resource_pool.spec['provider'] + definition["spec"]["provider"] = resource_pool.spec["provider"] resource_provider = await resource_pool.get_resource_provider() if resource_provider.has_lifespan: - definition['spec']['lifespan'] = {} + definition["spec"]["lifespan"] = {} if resource_provider.lifespan_default: - definition['spec']['lifespan']['default'] = resource_provider.lifespan_default + definition["spec"]["lifespan"]["default"] = ( + resource_provider.lifespan_default + ) if resource_provider.lifespan_maximum: - definition['spec']['lifespan']['maximum'] = resource_provider.lifespan_maximum + definition["spec"]["lifespan"]["maximum"] = ( + resource_provider.lifespan_maximum + ) if resource_provider.lifespan_relative_maximum: - definition['spec']['lifespan']['maximum'] = resource_provider.lifespan_relative_maximum + definition["spec"]["lifespan"]["maximum"] = ( + resource_provider.lifespan_relative_maximum + ) if resource_provider.lifespan_unclaimed: - definition['spec']['lifespan']['end'] = ( - datetime.now(timezone.utc) + resource_provider.lifespan_unclaimed_timedelta + definition["spec"]["lifespan"]["end"] = ( + datetime.now(timezone.utc) + + resource_provider.lifespan_unclaimed_timedelta ).strftime("%FT%TZ") else: - definition['spec']['resources'] = resource_pool.resources + definition["spec"]["resources"] = resource_pool.resources if resource_pool.has_lifespan: - definition['spec']['lifespan'] = {} + definition["spec"]["lifespan"] = {} if resource_pool.lifespan_default: - definition['spec']['lifespan']['default'] = resource_pool.lifespan_default + definition["spec"]["lifespan"]["default"] = ( + resource_pool.lifespan_default + ) if resource_pool.lifespan_maximum: - definition['spec']['lifespan']['maximum'] = resource_pool.lifespan_maximum + 
definition["spec"]["lifespan"]["maximum"] = ( + resource_pool.lifespan_maximum + ) if resource_pool.lifespan_relative_maximum: - definition['spec']['lifespan']['relativeMaximum'] = resource_pool.lifespan_relative_maximum + definition["spec"]["lifespan"]["relativeMaximum"] = ( + resource_pool.lifespan_relative_maximum + ) if resource_pool.lifespan_unclaimed: - definition['spec']['lifespan']['end'] = ( - datetime.now(timezone.utc) + resource_pool.lifespan_unclaimed_timedelta + definition["spec"]["lifespan"]["end"] = ( + datetime.now(timezone.utc) + + resource_pool.lifespan_unclaimed_timedelta ).strftime("%FT%TZ") definition = await Poolboy.custom_objects_api.create_namespaced_custom_object( - body = definition, - group = Poolboy.operator_domain, - namespace = Poolboy.namespace, - plural = "resourcehandles", - version = Poolboy.operator_version, + body=definition, + group=Poolboy.operator_domain, + namespace=Poolboy.namespace, + plural="resourcehandles", + version=Poolboy.operator_version, ) resource_handle = cls.from_definition(definition) - if ( - Poolboy.operator_mode_all_in_one or ( - Poolboy.operator_mode_resource_handler and - Poolboy.resource_handler_idx == resource_handle.resource_handler_idx - ) - ): + # Register in standalone mode (no handler partitioning) + if Poolboy.is_standalone: resource_handle.__register() - logger.info(f"Created ResourceHandle {resource_handle.name} for ResourcePool {resource_pool.name}") + logger.info( + f"Created ResourceHandle {resource_handle.name} for ResourcePool {resource_pool.name}" + ) return resource_handle @classmethod @@ -481,20 +555,7 @@ async def delete_unbound_handles_for_pool( logger: kopf.ObjectLogger, resource_pool: ResourcePoolT, ) -> List[ResourceHandleT]: - if Poolboy.operator_mode_all_in_one: - async with cls.class_lock: - resource_handles = [] - for resource_handle in list(cls.unbound_instances.values()): - if resource_handle.resource_pool_name == resource_pool.name \ - and resource_handle.resource_pool_namespace == resource_pool.namespace: - logger.info( - f"Deleting unbound ResourceHandle {resource_handle.name} " - f"for ResourcePool {resource_pool.name}" - ) - resource_handle.__unregister() - await resource_handle.delete() - return resource_handles - + """Delete all unbound handles for a pool.""" resource_handles = await cls.get_unbound_handles_for_pool( resource_pool=resource_pool, logger=logger, @@ -504,36 +565,36 @@ async def delete_unbound_handles_for_pool( f"Deleting unbound ResourceHandle {resource_handle.name} " f"for ResourcePool {resource_pool.name}" ) + resource_handle.__unregister() await resource_handle.delete() return resource_handles @classmethod - async def get(cls, name: str, ignore_deleting=True, use_cache=True) -> ResourceHandleT|None: + async def get( + cls, name: str, ignore_deleting=True, use_cache=True + ) -> ResourceHandleT | None: async with cls.class_lock: - if use_cache and name in cls.all_instances: - return cls.all_instances[name] + if use_cache: + cached = cls.cache_get(CacheTag.HANDLE, name) + if cached: + return cached definition = await Poolboy.custom_objects_api.get_namespaced_custom_object( group=Poolboy.operator_domain, name=name, namespace=Poolboy.namespace, - plural='resourcehandles', + plural="resourcehandles", version=Poolboy.operator_version, ) - if ignore_deleting and 'deletionTimestamp' in definition['metadata']: + if ignore_deleting and "deletionTimestamp" in definition["metadata"]: return None if use_cache: return cls.__register_definition(definition) return cls.from_definition(definition) 
@classmethod - def get_from_cache(cls, name: str) -> ResourceHandleT|None: - return cls.all_instances.get(name) - - @classmethod - def start_watch_other(cls) -> None: - logger = logging.getLogger('watch_other_handles') - cls.watch_other_task = asyncio.create_task(cls.watch_other(logger)) + def get_from_cache(cls, name: str) -> ResourceHandleT | None: + return cls.cache_get(CacheTag.HANDLE, name) @classmethod async def get_unbound_handles_for_pool( @@ -541,31 +602,49 @@ async def get_unbound_handles_for_pool( resource_pool: ResourcePoolT, logger: kopf.ObjectLogger, ) -> List[ResourceHandleT]: + """Get unbound handles for a pool.""" resource_handles = [] - if Poolboy.operator_mode_all_in_one: + + # In standalone mode, use cache (Memory or Redis) + # In distributed mode, fetch from K8s API to ensure completeness + # (cache may not be fully populated if operator just started) + if Poolboy.is_standalone: async with cls.class_lock: - for resource_handle in ResourceHandle.unbound_instances.values(): - if resource_handle.resource_pool_name == resource_pool.name \ - and resource_handle.resource_pool_namespace == resource_pool.namespace: + for name in Cache.get_keys_by_tag(CacheTag.HANDLE_UNBOUND): + resource_handle = cls.cache_get(CacheTag.HANDLE_UNBOUND, name) + if ( + resource_handle + and resource_handle.resource_pool_name == resource_pool.name + and resource_handle.resource_pool_namespace + == resource_pool.namespace + ): resource_handles.append(resource_handle) - return resource_handles + return resource_handles + # Distributed mode: fetch from K8s API and cache for other workers _continue = None while True: resource_handle_list = await Poolboy.custom_objects_api.list_namespaced_custom_object( group=Poolboy.operator_domain, label_selector=f"{Poolboy.resource_pool_name_label}={resource_pool.name},!{Poolboy.resource_claim_name_label}", namespace=Poolboy.namespace, - plural='resourcehandles', + plural="resourcehandles", version=Poolboy.operator_version, - _continue = _continue, - limit = 50, + _continue=_continue, + limit=50, ) - for definition in resource_handle_list['items']: + for definition in resource_handle_list["items"]: resource_handle = cls.from_definition(definition) if not resource_handle.is_bound: + # Cache for other workers + resource_handle.cache_set( + CacheTag.HANDLE, resource_handle.name, ttl=300 + ) + resource_handle.cache_set( + CacheTag.HANDLE_UNBOUND, resource_handle.name, ttl=300 + ) resource_handles.append(resource_handle) - _continue = resource_handle_list['metadata'].get('continue') + _continue = resource_handle_list["metadata"].get("continue") if not _continue: break return resource_handles @@ -575,17 +654,19 @@ async def preload(cls, logger: kopf.ObjectLogger) -> None: async with cls.class_lock: _continue = None while True: - resource_handle_list = await Poolboy.custom_objects_api.list_namespaced_custom_object( - group=Poolboy.operator_domain, - namespace=Poolboy.namespace, - plural='resourcehandles', - version=Poolboy.operator_version, - _continue = _continue, - limit = 50, + resource_handle_list = ( + await Poolboy.custom_objects_api.list_namespaced_custom_object( + group=Poolboy.operator_domain, + namespace=Poolboy.namespace, + plural="resourcehandles", + version=Poolboy.operator_version, + _continue=_continue, + limit=50, + ) ) - for definition in resource_handle_list['items']: + for definition in resource_handle_list["items"]: cls.__register_definition(definition=definition) - _continue = resource_handle_list['metadata'].get('continue') + _continue = 
resource_handle_list["metadata"].get("continue") if not _continue: break @@ -602,26 +683,26 @@ async def register( uid: str, ) -> ResourceHandleT: async with cls.class_lock: - resource_handle = cls.all_instances.get(name) + resource_handle = cls.cache_get(CacheTag.HANDLE, name) if resource_handle: resource_handle.refresh( - annotations = annotations, - labels = labels, - meta = meta, - spec = spec, - status = status, - uid = uid, + annotations=annotations, + labels=labels, + meta=meta, + spec=spec, + status=status, + uid=uid, ) else: resource_handle = cls( - annotations = annotations, - labels = labels, - meta = meta, - name = name, - namespace = namespace, - spec = spec, - status = status, - uid = uid, + annotations=annotations, + labels=labels, + meta=meta, + name=name, + namespace=namespace, + spec=spec, + status=status, + uid=uid, ) resource_handle.__register() return resource_handle @@ -632,101 +713,59 @@ async def register_definition(cls, definition: Mapping) -> ResourceHandleT: return cls.__register_definition(definition) @classmethod - async def stop_watch_other(cls) -> None: - if cls.watch_other_task is None: - return - cls.watch_other_task.cancel() - await cls.watch_other_task - - @classmethod - async def unregister(cls, name: str) -> ResourceHandleT|None: + async def unregister(cls, name: str) -> ResourceHandleT | None: async with cls.class_lock: - resource_handle = cls.all_instances.pop(name, None) + resource_handle = cls.cache_get(CacheTag.HANDLE, name) if resource_handle: resource_handle.__unregister() return resource_handle - - @classmethod - async def watch_other(cls, logger) -> None: - while True: - try: - # FIXME - clear stale cache entries - await cls.__watch_other(logger) - except kubernetes_asyncio.client.exceptions.ApiException as exception: - if exception.status != 410: - logger.exception("Error watching other resourcehandles") - await asyncio.sleep(10) - except: - logger.exception("Error watching other resourcehandles") - await asyncio.sleep(10) - - @classmethod - async def __watch_other(cls, logger) -> None: - watch = kubernetes_asyncio.watch.Watch() - async for event in watch.stream( - Poolboy.custom_objects_api.list_namespaced_custom_object, - group=cls.api_group, - label_selector=f"!{Poolboy.ignore_label},{Poolboy.resource_handler_idx_label}!={Poolboy.resource_handler_idx}", - namespace=Poolboy.namespace, - plural=cls.plural, - version=cls.api_version, - ): - event_obj = event['object'] - event_type = event['type'] - if event_type == 'DELETED': - await cls.unregister(event_obj['metadata']['name']) - else: - await cls.register_definition(event_obj) + return None def __str__(self) -> str: return f"ResourceHandle {self.name}" def __register(self) -> None: """ - Add ResourceHandle to register of bound or unbound instances. + Add ResourceHandle to cache of bound or unbound instances. This method must be called with the ResourceHandle.lock held. 
""" # Ensure deleting resource handles are not cached if self.is_deleting: self.__unregister() return - self.all_instances[self.name] = self + self.cache_set(CacheTag.HANDLE, self.name, ttl=300) if self.is_bound: - self.bound_instances[( - self.resource_claim_namespace, - self.resource_claim_name - )] = self - self.unbound_instances.pop(self.name, None) + bound_key = f"{self.resource_claim_namespace}/{self.resource_claim_name}" + Cache.set(CacheTag.HANDLE_BOUND, bound_key, self, ttl=300) + Cache.delete(CacheTag.HANDLE_UNBOUND, self.name) else: - self.unbound_instances[self.name] = self + self.cache_set(CacheTag.HANDLE_UNBOUND, self.name, ttl=300) def __unregister(self) -> None: - self.all_instances.pop(self.name, None) - self.unbound_instances.pop(self.name, None) + Cache.delete(CacheTag.HANDLE, self.name) + Cache.delete(CacheTag.HANDLE_UNBOUND, self.name) if self.is_bound: - self.bound_instances.pop( - (self.resource_claim_namespace, self.resource_claim_name), - None, - ) + bound_key = f"{self.resource_claim_namespace}/{self.resource_claim_name}" + Cache.delete(CacheTag.HANDLE_BOUND, bound_key) @property def guid(self) -> str: name = self.name - generate_name = self.meta.get('generateName') + generate_name = self.meta.get("generateName") if generate_name and name.startswith(generate_name): - return name[len(generate_name):] - elif name.startswith('guid-'): + return name[len(generate_name) :] + elif name.startswith("guid-"): return name[5:] return name[-5:] @property def has_lifespan_end(self) -> bool: - 'end' in self.spec.get('lifespan', {}) + return "end" in self.spec.get("lifespan", {}) @property def has_resource_provider(self) -> bool: """Return whether this ResourceHandle is managed by a ResourceProvider.""" - return 'provider' in self.spec + return "provider" in self.spec @property def ignore(self) -> bool: @@ -735,7 +774,7 @@ def ignore(self) -> bool: @property def is_bound(self) -> bool: - return 'resourceClaim' in self.spec + return "resourceClaim" in self.spec @property def is_deleting(self) -> bool: @@ -743,39 +782,36 @@ def is_deleting(self) -> bool: @property def is_from_resource_pool(self) -> bool: - return 'resourcePool' in self.spec + return "resourcePool" in self.spec @property - def is_healthy(self) -> bool|None: + def is_healthy(self) -> bool | None: """Return overall health of resources. - False if any resource has healthy False. - None if any non-waiting resource lacks a value for healthy. - True if all non-waiting resources are healthy.""" ret = True for resource in self.status_resources: - if resource.get('healthy') is False: + if resource.get("healthy") is False: return False - if( - resource.get('waitingFor') is not None and - resource.get('healthy') is None + if ( + resource.get("waitingFor") is not None + and resource.get("healthy") is None ): ret = None return ret @property - def is_ready(self) -> bool|None: + def is_ready(self) -> bool | None: """Return overall readiness of resources. - False if any resource has ready False. - None if any non-waiting resource lacks a value for ready. 
        - True if all non-waiting resources are ready."""
         ret = True
         for resource in self.status_resources:
-            if resource.get('ready') is False:
+            if resource.get("ready") is False:
                 return False
-            if(
-                resource.get('waitingFor') is not None and
-                resource.get('ready') is None
-            ):
+            if resource.get("waitingFor") is not None and resource.get("ready") is None:
                 ret = None
         return ret
 
@@ -787,71 +823,66 @@ def is_past_lifespan_end(self) -> bool:
         return dt < datetime.now(timezone.utc)
 
     @property
-    def is_ready(self) -> bool|None:
-        return self.status.get('ready')
+    def is_ready(self) -> bool | None:
+        return self.status.get("ready")
 
     @property
     def lifespan_end_datetime(self) -> Any:
         timestamp = self.lifespan_end_timestamp
         if timestamp:
-            return datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S%z')
+            return datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S%z")
 
     @property
-    def lifespan_end_timestamp(self) -> str|None:
-        lifespan = self.spec.get('lifespan')
+    def lifespan_end_timestamp(self) -> str | None:
+        lifespan = self.spec.get("lifespan")
         if lifespan:
-            return lifespan.get('end')
+            return lifespan.get("end")
 
     @property
     def parameter_values(self) -> Mapping:
-        return self.spec.get('provider', {}).get('parameterValues', {})
+        return self.spec.get("provider", {}).get("parameterValues", {})
 
     @property
-    def resource_claim_description(self) -> str|None:
-        if not 'resourceClaim' not in self.spec:
+    def resource_claim_description(self) -> str | None:
+        if "resourceClaim" not in self.spec:
             return None
         return f"ResourceClaim {self.resource_claim_name} in {self.resource_claim_namespace}"
 
     @property
-    def resource_claim_name(self) -> str|None:
-        return self.spec.get('resourceClaim', {}).get('name')
-
-    @property
-    def resource_claim_namespace(self) -> str|None:
-        return self.spec.get('resourceClaim', {}).get('namespace')
+    def resource_claim_name(self) -> str | None:
+        return self.spec.get("resourceClaim", {}).get("name")
 
     @property
-    def resource_handler_idx(self) -> int:
-        """Label value used to select which resource handler pod should manage this ResourceHandle."""
-        return int(UUID(self.uid)) % Poolboy.resource_handler_count
+    def resource_claim_namespace(self) -> str | None:
+        return self.spec.get("resourceClaim", {}).get("namespace")
 
     @property
-    def resource_pool_name(self) -> str|None:
-        if 'resourcePool' in self.spec:
-            return self.spec['resourcePool']['name']
+    def resource_pool_name(self) -> str | None:
+        if "resourcePool" in self.spec:
+            return self.spec["resourcePool"]["name"]
 
     @property
-    def resource_pool_namespace(self) -> str|None:
-        if 'resourcePool' in self.spec:
-            return self.spec['resourcePool'].get('namespace', Poolboy.namespace)
+    def resource_pool_namespace(self) -> str | None:
+        if "resourcePool" in self.spec:
+            return self.spec["resourcePool"].get("namespace", Poolboy.namespace)
 
     @property
-    def resource_provider_name(self) -> str|None:
-        return self.spec.get('provider', {}).get('name')
+    def resource_provider_name(self) -> str | None:
+        return self.spec.get("provider", {}).get("name")
 
     @property
     def resources(self) -> List[Mapping]:
         """Resources as listed in spec."""
-        return self.spec.get('resources', [])
+        return self.spec.get("resources", [])
 
     @property
     def status_resources(self) -> List[Mapping]:
         """Resources as listed in status."""
-        return self.status.get('resources', [])
+        return self.status.get("resources", [])
 
     @property
     def vars(self) -> Mapping:
-        return self.spec.get('vars', {})
+        return self.spec.get("vars", {})
 
     @property
     def timedelta_to_lifespan_end(self) -> Any:
@@ -860,34 
+891,38 @@ def timedelta_to_lifespan_end(self) -> Any: return dt - datetime.now(timezone.utc) def __lifespan_value(self, name, resource_claim): - value = self.spec.get('lifespan', {}).get(name) + value = self.spec.get("lifespan", {}).get(name) if not value: return value = recursive_process_template_strings( - template = value, - variables = { + template=value, + variables={ "resource_claim": resource_claim, "resource_handle": self, }, - template_variables = self.vars, + template_variables=self.vars, ) return value - def __lifespan_value_as_timedelta(self, + def __lifespan_value_as_timedelta( + self, name: str, resource_claim: ResourceClaimT, - ) -> timedelta|None: + ) -> timedelta | None: value = self.__lifespan_value(name, resource_claim) if not value: return None seconds = pytimeparse.parse(value) if seconds is None: - raise kopf.TemporaryError(f"Failed to parse {name} time interval: {value}", delay=60) + raise kopf.TemporaryError( + f"Failed to parse {name} time interval: {value}", delay=60 + ) return timedelta(seconds=seconds) - async def __manage_init_status_resources(self, + async def __manage_init_status_resources( + self, logger: kopf.ObjectLogger, ) -> None: """Initialize resources in status from spec.""" @@ -900,289 +935,315 @@ async def __manage_init_status_resources(self, entry = deepcopy(self.status_resources[idx]) else: entry = {} - if 'name' in resource and resource['name'] != entry.get('name'): - entry['name'] = resource['name'] + if "name" in resource and resource["name"] != entry.get("name"): + entry["name"] = resource["name"] set_resources.append(entry) patch = [] if not self.status: - patch.extend(({ - "op": "test", - "path": "/status", - "value": None, - }, { - "op": "add", - "path": "/status", - "value": {}, - })) - if 'resources' not in self.status: - patch.extend(({ - "op": "test", - "path": "/status/resources", - "value": None, - }, { - "op": "add", - "path": "/status/resources", - "value": set_resources, - })) + patch.extend( + ( + { + "op": "test", + "path": "/status", + "value": None, + }, + { + "op": "add", + "path": "/status", + "value": {}, + }, + ) + ) + if "resources" not in self.status: + patch.extend( + ( + { + "op": "test", + "path": "/status/resources", + "value": None, + }, + { + "op": "add", + "path": "/status/resources", + "value": set_resources, + }, + ) + ) else: - patch.extend(({ - "op": "test", - "path": "/status/resources", - "value": self.status_resources, - }, { - "op": "replace", - "path": "/status/resources", - "value": set_resources, - })) + patch.extend( + ( + { + "op": "test", + "path": "/status/resources", + "value": self.status_resources, + }, + { + "op": "replace", + "path": "/status/resources", + "value": set_resources, + }, + ) + ) if 0 == len(patch): return await self.json_patch_status(patch) return - except kubernetes_asyncio.client.exceptions.ApiException as exception: + except kubernetes_asyncio.client.exceptions.ApiException as e: if attempt > 2: - logger.exception(f"{self} failed status patch: {patch}") + logger.warning(f"{self} status patch failed ({e.status}): {patch}") raise attempt += 1 - async def __manage_check_delete(self, - logger: kopf.ObjectLogger, - resource_claim: ResourceClaimT + async def __manage_check_delete( + self, logger: kopf.ObjectLogger, resource_claim: ResourceClaimT ) -> bool: """Delete this ResourceHandle if it meets conditions which trigger delete. - Is past lifespan end. - Is bound to resource claim that has been deleted. 
""" if self.is_past_lifespan_end: - logger.info(f"Deleting {self} at end of lifespan ({self.lifespan_end_timestamp})") + logger.info( + f"Deleting {self} at end of lifespan ({self.lifespan_end_timestamp})" + ) await self.delete() return True if self.is_bound and not resource_claim: - logger.info(f"Propagating deletion of {self.resource_claim_description} to {self}") + logger.info( + f"Propagating deletion of {self.resource_claim_description} to {self}" + ) await self.delete() return True - async def __manage_update_spec_resources(self, + async def __manage_update_spec_resources( + self, logger: kopf.ObjectLogger, - resource_claim: ResourceClaimT|None, - resource_provider: ResourceProviderT|None, + resource_claim: ResourceClaimT | None, + resource_provider: ResourceProviderT | None, ): """Update this ResourecHandle's spec.resources by applying parameter values from ResourceProvider.""" if not resource_provider: return resources = await resource_provider.get_resources( - resource_claim = resource_claim, - resource_handle = self, + resource_claim=resource_claim, + resource_handle=self, ) - if not 'resources' in self.spec: - await self.json_patch([{ - "op": "add", - "path": "/spec/resources", - "value": resources, - }]) + if "resources" not in self.spec: + await self.json_patch( + [ + { + "op": "add", + "path": "/spec/resources", + "value": resources, + } + ] + ) return patch = [] for idx, resource in enumerate(resources): - if idx < len(self.spec['resources']): - current_provider = self.spec['resources'][idx]['provider']['name'] - updated_provider = resource['provider']['name'] + if idx < len(self.spec["resources"]): + current_provider = self.spec["resources"][idx]["provider"]["name"] + updated_provider = resource["provider"]["name"] if current_provider != updated_provider: logger.warning( f"Refusing update resources in {self} as it would change " f"ResourceProvider from {current_provider} to {updated_provider}" ) - current_template = self.spec['resources'][idx].get('template') - updated_template = resource.get('template') + current_template = self.spec["resources"][idx].get("template") + updated_template = resource.get("template") if current_template != updated_template: - patch.append({ - "op": "add", - "path": f"/spec/resources/{idx}/template", - "value": updated_template, - }) + patch.append( + { + "op": "add", + "path": f"/spec/resources/{idx}/template", + "value": updated_template, + } + ) else: - patch.append({ - "op": "add", - "path": f"/spec/resources/{idx}", - "value": resource - }) + patch.append( + {"op": "add", "path": f"/spec/resources/{idx}", "value": resource} + ) if patch: await self.json_patch(patch) logger.info(f"Updated resources for {self} from {resource_provider}") def get_lifespan_default(self, resource_claim=None): - return self.__lifespan_value('default', resource_claim=resource_claim) + return self.__lifespan_value("default", resource_claim=resource_claim) def get_lifespan_default_timedelta(self, resource_claim=None): - return self.__lifespan_value_as_timedelta('default', resource_claim=resource_claim) + return self.__lifespan_value_as_timedelta( + "default", resource_claim=resource_claim + ) def get_lifespan_maximum(self, resource_claim=None): - return self.__lifespan_value('maximum', resource_claim=resource_claim) + return self.__lifespan_value("maximum", resource_claim=resource_claim) def get_lifespan_maximum_timedelta(self, resource_claim=None): - return self.__lifespan_value_as_timedelta('maximum', resource_claim=resource_claim) + return 
self.__lifespan_value_as_timedelta( + "maximum", resource_claim=resource_claim + ) def get_lifespan_relative_maximum(self, resource_claim=None): - return self.__lifespan_value('relativeMaximum', resource_claim=resource_claim) + return self.__lifespan_value("relativeMaximum", resource_claim=resource_claim) def get_lifespan_relative_maximum_timedelta(self, resource_claim=None): - return self.__lifespan_value_as_timedelta('relativeMaximum', resource_claim=resource_claim) + return self.__lifespan_value_as_timedelta( + "relativeMaximum", resource_claim=resource_claim + ) def get_lifespan_end_maximum_datetime(self, resource_claim=None): - lifespan_start_datetime = resource_claim.lifespan_start_datetime if resource_claim else self.creation_datetime + lifespan_start_datetime = ( + resource_claim.lifespan_start_datetime + if resource_claim + else self.creation_datetime + ) - maximum_timedelta = self.get_lifespan_maximum_timedelta(resource_claim=resource_claim) + maximum_timedelta = self.get_lifespan_maximum_timedelta( + resource_claim=resource_claim + ) if maximum_timedelta: if resource_claim.lifespan_first_ready_timestamp: - maximum_end = resource_claim.lifespan_first_ready_datetime + maximum_timedelta + maximum_end = ( + resource_claim.lifespan_first_ready_datetime + maximum_timedelta + ) else: maximum_end = lifespan_start_datetime + maximum_timedelta else: maximum_end = None - relative_maximum_timedelta = self.get_lifespan_relative_maximum_timedelta(resource_claim=resource_claim) + relative_maximum_timedelta = self.get_lifespan_relative_maximum_timedelta( + resource_claim=resource_claim + ) if relative_maximum_timedelta: - relative_maximum_end = datetime.now(timezone.utc) + relative_maximum_timedelta + relative_maximum_end = ( + datetime.now(timezone.utc) + relative_maximum_timedelta + ) else: relative_maximum_end = None - if relative_maximum_end \ - and (not maximum_end or relative_maximum_end < maximum_end): + if relative_maximum_end and ( + not maximum_end or relative_maximum_end < maximum_end + ): return relative_maximum_end return maximum_end - def set_resource_healthy(self, resource_index: int, value: bool|None) -> None: + def set_resource_healthy(self, resource_index: int, value: bool | None) -> None: if value is None: - self.status['resources'][resource_index].pop('healthy', None) + self.status["resources"][resource_index].pop("healthy", None) else: - self.status['resources'][resource_index]['healthy'] = value + self.status["resources"][resource_index]["healthy"] = value - def set_resource_ready(self, resource_index: int, value: bool|None) -> None: + def set_resource_ready(self, resource_index: int, value: bool | None) -> None: if value is None: - self.status['resources'][resource_index].pop('ready', None) + self.status["resources"][resource_index].pop("ready", None) else: - self.status['resources'][resource_index]['ready'] = value + self.status["resources"][resource_index]["ready"] = value - def set_resource_state(self, resource_index: int, value: Mapping|None) -> None: + def set_resource_state(self, resource_index: int, value: Mapping | None) -> None: if value is None: - self.status['resources'][resource_index].pop('state', None) + self.status["resources"][resource_index].pop("state", None) else: - self.status['resources'][resource_index]['state'] = value - - async def assign_resource_handler(self): - """Apply label to indicate resource handler should manage this ResourceHandle. 
- Do not change label on items which are deleting.""" - if ( - self.deletion_timestamp is None and - self.labels.get(Poolboy.resource_handler_idx_label) != str(self.resource_handler_idx) - ): - try: - patch = [{ - "op": "test", - "path": "/metadata/deletionTimestamp", - "value": None, - }] - patch.append({ - "op": "add", - "path": f"/metadata/labels/{Poolboy.resource_handler_idx_label.replace('/', '~1')}", - "value": str(self.resource_handler_idx), - } if self.labels else { - "op": "add", - "path": f"/metadata/labels", - "value": { - Poolboy.resource_handler_idx_label: str(self.resource_handler_idx), - } - }) - await self.json_patch(patch) - except kubernetes_asyncio.client.exceptions.ApiException as exception: - pass + self.status["resources"][resource_index]["state"] = value - async def get_resource_claim(self, not_found_okay: bool) -> ResourceClaimT|None: + async def get_resource_claim(self, not_found_okay: bool) -> ResourceClaimT | None: if not self.is_bound: return None try: return await resourceclaim.ResourceClaim.get( - name = self.resource_claim_name, - namespace = self.resource_claim_namespace, - use_cache = Poolboy.operator_mode_all_in_one, + name=self.resource_claim_name, + namespace=self.resource_claim_namespace, + use_cache=Poolboy.is_standalone, ) except kubernetes_asyncio.client.exceptions.ApiException as e: if e.status == 404 and not_found_okay: return None raise - async def get_resource_pool(self) -> ResourcePoolT|None: + async def get_resource_pool(self) -> ResourcePoolT | None: if not self.is_from_resource_pool: return None return await resourcepool.ResourcePool.get(self.resource_pool_name) - async def get_resource_provider(self) -> ResourceProviderT|None: + async def get_resource_provider(self) -> ResourceProviderT | None: """Return ResourceProvider configured to manage ResourceHandle.""" if self.resource_provider_name: - return await resourceprovider.ResourceProvider.get(self.resource_provider_name) + return await resourceprovider.ResourceProvider.get( + self.resource_provider_name + ) async def get_resource_providers(self) -> List[ResourceProviderT]: """Return list of ResourceProviders for all managed resources.""" resource_providers = [] - for resource in self.spec.get('resources', []): + for resource in self.spec.get("resources", []): resource_providers.append( - await resourceprovider.ResourceProvider.get(resource['provider']['name']) + await resourceprovider.ResourceProvider.get( + resource["provider"]["name"] + ) ) return resource_providers async def get_resource_states(self) -> List[Mapping]: """Return list of states fom resources referenced by ResourceHandle.""" resource_states = [] - for idx in range(len(self.spec['resources'])): + for idx in range(len(self.spec["resources"])): reference = None - if idx < len(self.status.get('resources', [])): - reference = self.status['resources'][idx].get('reference') + if idx < len(self.status.get("resources", [])): + reference = self.status["resources"][idx].get("reference") if not reference: resource_states.append(None) continue resource_states.append( await resourcewatch.ResourceWatch.get_resource_from_any( - api_version=reference['apiVersion'], - kind=reference['kind'], - name=reference['name'], - namespace=reference.get('namespace'), + api_version=reference["apiVersion"], + kind=reference["kind"], + name=reference["name"], + namespace=reference.get("namespace"), not_found_okay=True, - use_cache=Poolboy.operator_mode_all_in_one, + use_cache=Poolboy.is_standalone, ) ) return resource_states async def handle_delete(self, 
logger: kopf.ObjectLogger) -> None: - for resource in self.status.get('resources', []): - reference = resource.get('reference') + for resource in self.status.get("resources", []): + reference = resource.get("reference") if reference: try: - resource_description = f"{reference['apiVersion']} {reference['kind']} " + ( - f"{reference['name']} in {reference['namespace']}" - if 'namespace' in reference else reference['name'] + resource_description = ( + f"{reference['apiVersion']} {reference['kind']} " + + ( + f"{reference['name']} in {reference['namespace']}" + if "namespace" in reference + else reference["name"] + ) + ) + logger.info( + f"Propagating delete of {self} to {resource_description}" ) - logger.info(f"Propagating delete of {self} to {resource_description}") # Annotate managed resource to indicate resource handle deletion. await poolboy_k8s.patch_object( - api_version = reference['apiVersion'], - kind = reference['kind'], - name = reference['name'], - namespace = reference.get('namespace'), - patch = [{ - "op": "add", - "path": f"/metadata/annotations/{Poolboy.resource_handle_deleted_annotation.replace('/', '~1')}", - "value": datetime.now(timezone.utc).strftime('%FT%TZ'), - }], + api_version=reference["apiVersion"], + kind=reference["kind"], + name=reference["name"], + namespace=reference.get("namespace"), + patch=[ + { + "op": "add", + "path": f"/metadata/annotations/{Poolboy.resource_handle_deleted_annotation.replace('/', '~1')}", + "value": datetime.now(timezone.utc).strftime("%FT%TZ"), + } + ], ) # Delete managed resource await poolboy_k8s.delete_object( - api_version = reference['apiVersion'], - kind = reference['kind'], - name = reference['name'], - namespace = reference.get('namespace'), + api_version=reference["apiVersion"], + kind=reference["kind"], + name=reference["name"], + namespace=reference.get("namespace"), ) except kubernetes_asyncio.client.exceptions.ApiException as e: if e.status != 404: @@ -1191,7 +1252,9 @@ async def handle_delete(self, logger: kopf.ObjectLogger) -> None: resource_claim = await self.get_resource_claim(not_found_okay=True) if resource_claim and not resource_claim.is_detached: await resource_claim.delete() - logger.info(f"Propagated delete of {self} to ResourceClaim {resource_claim}") + logger.info( + f"Propagated delete of {self} to ResourceClaim {resource_claim}" + ) if self.is_from_resource_pool: resource_pool = await resourcepool.ResourcePool.get(self.resource_pool_name) @@ -1208,7 +1271,9 @@ async def manage(self, logger: kopf.ObjectLogger) -> None: # Get ResourceClaim bound to this ResourceHandle if there is one. resource_claim = await self.get_resource_claim(not_found_okay=True) # Delete this ResourceHandle if it meets delete trigger conditions. 
- if await self.__manage_check_delete(logger=logger, resource_claim=resource_claim): + if await self.__manage_check_delete( + logger=logger, resource_claim=resource_claim + ): return # Get top-level ResourceProvider managing this ResourceHandle @@ -1226,6 +1291,7 @@ async def manage(self, logger: kopf.ObjectLogger) -> None: resource_providers = await self.get_resource_providers() resource_states = await self.get_resource_states() resources_to_create = [] + resources_updated = False patch = [] # Loop through management for each managed resource @@ -1235,12 +1301,14 @@ async def manage(self, logger: kopf.ObjectLogger) -> None: resource_provider = resource_providers[resource_index] if resource_provider.resource_requires_claim and not resource_claim: - if 'ResourceClaim' != status_resource.get('waitingFor'): - patch.append({ - "op": "add", - "path": f"/status/resources/{resource_index}/waitingFor", - "value": "ResourceClaim", - }) + if "ResourceClaim" != status_resource.get("waitingFor"): + patch.append( + { + "op": "add", + "path": f"/status/resources/{resource_index}/waitingFor", + "value": "ResourceClaim", + } + ) continue vars_ = deepcopy(self.vars) @@ -1250,8 +1318,9 @@ async def manage(self, logger: kopf.ObjectLogger) -> None: linked_resource_state = None for pn, pv in enumerate(resource_providers): if ( - pv.name == linked_provider.name and - self.resources[pn].get('name', pv.name) == linked_provider.resource_name + pv.name == linked_provider.name + and self.resources[pn].get("name", pv.name) + == linked_provider.resource_name ): linked_resource_provider = pv linked_resource_state = resource_states[pn] @@ -1265,12 +1334,12 @@ async def manage(self, logger: kopf.ObjectLogger) -> None: continue if not linked_provider.check_wait_for( - linked_resource_provider = linked_resource_provider, - linked_resource_state = linked_resource_state, - resource_claim = resource_claim, - resource_handle = self, - resource_provider = resource_provider, - resource_state = resource_state, + linked_resource_provider=linked_resource_provider, + linked_resource_state=linked_resource_state, + resource_claim=resource_claim, + resource_handle=self, + resource_provider=resource_provider, + resource_state=resource_state, ): wait_for_linked_provider = True break @@ -1278,105 +1347,127 @@ async def manage(self, logger: kopf.ObjectLogger) -> None: if linked_resource_state: for template_var in linked_provider.template_vars: vars_[template_var.name] = jsonpointer.resolve_pointer( - linked_resource_state, template_var.value_from, - default = jinja2.ChainableUndefined() + linked_resource_state, + template_var.value_from, + default=jinja2.ChainableUndefined(), ) if wait_for_linked_provider: - if 'Linked ResourceProvider' != status_resource.get('waitingFor'): - patch.append({ - "op": "add", - "path": f"/status/resources/{resource_index}/waitingFor", - "value": "Linked ResourceProvider", - }) + if "Linked ResourceProvider" != status_resource.get("waitingFor"): + patch.append( + { + "op": "add", + "path": f"/status/resources/{resource_index}/waitingFor", + "value": "Linked ResourceProvider", + } + ) continue - resource_definition = await resource_provider.resource_definition_from_template( - logger = logger, - resource_claim = resource_claim, - resource_handle = self, - resource_index = resource_index, - resource_states = resource_states, - vars_ = vars_, + resource_definition = ( + await resource_provider.resource_definition_from_template( + logger=logger, + resource_claim=resource_claim, + resource_handle=self, + 
resource_index=resource_index, + resource_states=resource_states, + vars_=vars_, + ) ) if not resource_definition: - if 'Resource Definition' != status_resource.get('waitingFor'): - patch.append({ - "op": "add", - "path": f"/status/resources/{resource_index}/waitingFor", - "value": "Resource Definition", - }) + if "Resource Definition" != status_resource.get("waitingFor"): + patch.append( + { + "op": "add", + "path": f"/status/resources/{resource_index}/waitingFor", + "value": "Resource Definition", + } + ) continue - resource_api_version = resource_definition['apiVersion'] - resource_kind = resource_definition['kind'] - resource_name = resource_definition['metadata']['name'] - resource_namespace = resource_definition['metadata'].get('namespace', None) + resource_api_version = resource_definition["apiVersion"] + resource_kind = resource_definition["kind"] + resource_name = resource_definition["metadata"]["name"] + resource_namespace = resource_definition["metadata"].get( + "namespace", None + ) reference = { - 'apiVersion': resource_api_version, - 'kind': resource_kind, - 'name': resource_name + "apiVersion": resource_api_version, + "kind": resource_kind, + "name": resource_name, } if resource_namespace: - reference['namespace'] = resource_namespace + reference["namespace"] = resource_namespace - if 'reference' not in status_resource: + if "reference" not in status_resource: # Add reference to status resources - status_resource['reference'] = reference - patch.append({ - "op": "add", - "path": f"/status/resources/{resource_index}/reference", - "value": reference, - }) + status_resource["reference"] = reference + patch.append( + { + "op": "add", + "path": f"/status/resources/{resource_index}/reference", + "value": reference, + } + ) # Remove waitingFor from status if present as we are preceeding to resource creation - if 'waitingFor' in status_resource: - patch.append({ - "op": "remove", - "path": f"/status/resources/{resource_index}/waitingFor", - }) - elif resource_api_version != status_resource['reference']['apiVersion']: + if "waitingFor" in status_resource: + patch.append( + { + "op": "remove", + "path": f"/status/resources/{resource_index}/waitingFor", + } + ) + elif resource_api_version != status_resource["reference"]["apiVersion"]: raise kopf.TemporaryError( f"ResourceHandle {self.name} would change from apiVersion " f"{status_resource['reference']['apiVersion']} to {resource_api_version}!", - delay=600 + delay=600, ) - elif resource_kind != status_resource['reference']['kind']: + elif resource_kind != status_resource["reference"]["kind"]: raise kopf.TemporaryError( f"ResourceHandle {self.name} would change from kind " f"{status_resource['reference']['kind']} to {resource_kind}!", - delay=600 + delay=600, ) else: # Maintain name and namespace - if resource_name != status_resource['reference']['name']: - resource_name = status_resource['reference']['name'] - resource_definition['metadata']['name'] = resource_name - if resource_namespace != status_resource['reference'].get('namespace'): - resource_namespace = status_resource['reference']['namespace'] - resource_definition['metadata']['namespace'] = resource_namespace - - resource_description = f"{resource_api_version} {resource_kind} {resource_name}" + if resource_name != status_resource["reference"]["name"]: + resource_name = status_resource["reference"]["name"] + resource_definition["metadata"]["name"] = resource_name + if resource_namespace != status_resource["reference"].get( + "namespace" + ): + resource_namespace = 
status_resource["reference"]["namespace"] + resource_definition["metadata"]["namespace"] = ( + resource_namespace + ) + + resource_description = ( + f"{resource_api_version} {resource_kind} {resource_name}" + ) if resource_namespace: resource_description += f" in {resource_namespace}" # Ensure there is a ResourceWatch for this resource. await resourcewatch.ResourceWatch.create_as_needed( - api_version = resource_api_version, - kind = resource_kind, - namespace = resource_namespace, + api_version=resource_api_version, + kind=resource_kind, + namespace=resource_namespace, ) if resource_state: updated_state = await resource_provider.update_resource( - logger = logger, - resource_definition = resource_definition, - resource_handle = self, - resource_state = resource_state, + logger=logger, + resource_definition=resource_definition, + resource_handle=self, + resource_state=resource_state, ) if updated_state: resource_states[resource_index] = updated_state - logger.info(f"Updated {resource_description} for ResourceHandle {self.name}") + resources_updated = True + logger.info( + f"Updated {resource_description} for ResourceHandle {self.name}" + ) else: resources_to_create.append((resource_index, resource_definition)) @@ -1389,15 +1480,21 @@ async def manage(self, logger: kopf.ObjectLogger) -> None: raise for resource_index, resource_definition in resources_to_create: - resource_api_version = resource_definition['apiVersion'] - resource_kind = resource_definition['kind'] - resource_name = resource_definition['metadata']['name'] - resource_namespace = resource_definition['metadata'].get('namespace', None) - resource_description = f"{resource_api_version} {resource_kind} {resource_name}" + resource_api_version = resource_definition["apiVersion"] + resource_kind = resource_definition["kind"] + resource_name = resource_definition["metadata"]["name"] + resource_namespace = resource_definition["metadata"].get( + "namespace", None + ) + resource_description = ( + f"{resource_api_version} {resource_kind} {resource_name}" + ) if resource_namespace: resource_description += f" in {resource_namespace}" try: - created_resource = await poolboy_k8s.create_object(resource_definition) + created_resource = await poolboy_k8s.create_object( + resource_definition + ) if created_resource: resource_states[resource_index] = created_resource logger.info(f"Created {resource_description} for {self}") @@ -1405,17 +1502,37 @@ async def manage(self, logger: kopf.ObjectLogger) -> None: if exception.status != 409: raise - if resource_claim: + # Update handle status with resource states immediately after changes. + # This is needed in worker context where ResourceWatch runs in the + # operator process and may not immediately sync with worker changes. 
+        if Poolboy.is_worker and (
+            resources_to_create or resources_updated or patch
+        ):
+            # Refetch to sync in-memory object with API after patches were applied
+            await self.refetch()
+            # Re-fetch claim to ensure we have the latest version
+            if self.is_bound:
+                resource_claim = await self.get_resource_claim(not_found_okay=True)
+            await self.update_status(
+                logger=logger,
+                resource_states=resource_states,
+                resource_claim=resource_claim,
+            )
+        elif resource_claim:
             await resource_claim.update_status_from_handle(
                 logger=logger,
                 resource_handle=self,
                 resource_states=resource_states,
             )
 
-    async def refetch(self) -> ResourceHandleT|None:
+    async def refetch(self) -> ResourceHandleT | None:
         try:
             definition = await Poolboy.custom_objects_api.get_namespaced_custom_object(
-                Poolboy.operator_domain, Poolboy.operator_version, Poolboy.namespace, 'resourcehandles', self.name
+                Poolboy.operator_domain,
+                Poolboy.operator_version,
+                Poolboy.namespace,
+                "resourcehandles",
+                self.name,
             )
             self.refresh_from_definition(definition)
             return self
@@ -1425,26 +1542,31 @@ async def refetch(self) -> ResourceHandleT|None:
                 return None
             raise
 
-    async def update_status(self,
+    async def update_status(
+        self,
         logger: kopf.ObjectLogger,
-        resource_states: List[Mapping|None],
-        resource_claim: ResourceClaimT|None=None,
+        resource_states: List[Mapping | None],
+        resource_claim: ResourceClaimT | None = None,
     ) -> None:
         """Update status from resources state."""
         status = self.status
         while len(self.resources) < len(resource_states):
-            logger.warning(f"{self} update status with resource states longer that list of resources, attempting refetch: {len(self.resources)} < {len(resource_states)}")
+            logger.warning(
+                f"{self} update status with resource states longer than the list of resources, attempting refetch: {len(self.resources)} < {len(resource_states)}"
+            )
             await asyncio.sleep(0.2)
             await self.refetch()
            if len(self.resources) < len(resource_states):
-                logger.error(f"{self} update status with resource states longer that list of resources after refetch: {len(self.resources)} < {len(resource_states)}")
+                logger.error(
+                    f"{self} update status with resource states longer than the list of resources after refetch: {len(self.resources)} < {len(resource_states)}"
+                )
                 return
 
         # Create consolidated information about resources
         resources = deepcopy(self.resources)
         for idx, state in enumerate(resource_states):
-            resources[idx]['state'] = state
+            resources[idx]["state"] = state
 
         patch = []
         have_healthy_resource = False
@@ -1452,15 +1574,19 @@ async def update_status(self,
         have_ready_resource = False
         all_resources_ready = True
 
-        status_resources = status.get('resources', [])
+        status_resources = status.get("resources", [])
 
         for idx, resource in enumerate(resources):
-            status_resource = status_resources[idx] if idx < len(status_resources) else {}
+            status_resource = (
+                status_resources[idx] if idx < len(status_resources) else {}
+            )
             resource_healthy = None
             resource_ready = False
-            state = resource.get('state')
+            state = resource.get("state")
             if state:
-                resource_provider = await resourceprovider.ResourceProvider.get(resource['provider']['name'])
+                resource_provider = await resourceprovider.ResourceProvider.get(
+                    resource["provider"]["name"]
+                )
                 resource_healthy = resource_provider.check_health(
                     logger=logger,
                     resource_handle=self,
@@ -1489,29 +1615,37 @@ async def update_status(self,
             elif all_resources_ready is True:
                 all_resources_ready = None
 
-            if resource_healthy is None and 'healthy' in status_resource:
-                patch.append({
-                    "op": "remove",
-                    "path": 
f"/status/resources/{idx}/healthy", - }) - elif resource_healthy != status_resource.get('healthy'): - patch.append({ - "op": "add", - "path": f"/status/resources/{idx}/healthy", - "value": resource_healthy, - }) - - if resource_ready is None and 'ready' in status_resource: - patch.append({ - "op": "remove", - "path": f"/status/resources/{idx}/ready", - }) - elif resource_ready != status_resource.get('ready'): - patch.append({ - "op": "add", - "path": f"/status/resources/{idx}/ready", - "value": resource_ready, - }) + if resource_healthy is None and "healthy" in status_resource: + patch.append( + { + "op": "remove", + "path": f"/status/resources/{idx}/healthy", + } + ) + elif resource_healthy != status_resource.get("healthy"): + patch.append( + { + "op": "add", + "path": f"/status/resources/{idx}/healthy", + "value": resource_healthy, + } + ) + + if resource_ready is None and "ready" in status_resource: + patch.append( + { + "op": "remove", + "path": f"/status/resources/{idx}/ready", + } + ) + elif resource_ready != status_resource.get("ready"): + patch.append( + { + "op": "add", + "path": f"/status/resources/{idx}/ready", + "value": resource_ready, + } + ) if all_resources_healthy and not have_healthy_resource: all_resources_healthy = None @@ -1519,30 +1653,38 @@ async def update_status(self, all_resources_ready = None if all_resources_healthy is None: - if 'healthy' in status: - patch.append({ - "op": "remove", + if "healthy" in status: + patch.append( + { + "op": "remove", + "path": "/status/healthy", + } + ) + elif all_resources_healthy != status.get("healthy"): + patch.append( + { + "op": "add", "path": "/status/healthy", - }) - elif all_resources_healthy != status.get('healthy'): - patch.append({ - "op": "add", - "path": "/status/healthy", - "value": all_resources_healthy, - }) + "value": all_resources_healthy, + } + ) if all_resources_ready is None: - if 'ready' in status: - patch.append({ - "op": "remove", + if "ready" in status: + patch.append( + { + "op": "remove", + "path": "/status/ready", + } + ) + elif all_resources_ready != status.get("ready"): + patch.append( + { + "op": "add", "path": "/status/ready", - }) - elif all_resources_ready != status.get('ready'): - patch.append({ - "op": "add", - "path": "/status/ready", - "value": all_resources_ready, - }) + "value": all_resources_ready, + } + ) if self.has_resource_provider: resource_provider = None @@ -1553,27 +1695,31 @@ async def update_status(self, resource_handle=self, resources=resources, ) - if status_summary != status.get('summary'): - patch.append({ - "op": "add", - "path": "/status/summary", - "value": status_summary, - }) - except kubernetes_asyncio.client.exceptions.ApiException: - logger.exception( - f"Failed to get ResourceProvider {self.resource_provider_name} for {self}" + if status_summary != status.get("summary"): + patch.append( + { + "op": "add", + "path": "/status/summary", + "value": status_summary, + } + ) + except kubernetes_asyncio.client.exceptions.ApiException as e: + logger.warning( + f"Failed to get ResourceProvider {self.resource_provider_name} " + f"for {self} ({e.status})" ) except Exception: logger.exception(f"Failed to generate status summary for {self}") if patch: + patch_attempt = 0 while True: try: await self.json_patch_status(patch) break - except kubernetes_asyncio.client.exceptions.ApiException: + except kubernetes_asyncio.client.exceptions.ApiException as e: patch_attempt += 1 if patch_attempt > 5: - logger.exception(f"Failed to patch status on {self}") + logger.warning(f"Failed to patch 
status on {self} ({e.status})") return await asyncio.sleep(0.2) diff --git a/operator/resourcepool.py b/operator/resourcepool.py index 2ef7b47..c816663 100644 --- a/operator/resourcepool.py +++ b/operator/resourcepool.py @@ -1,15 +1,13 @@ import asyncio - from datetime import timedelta from typing import List, Mapping, TypeVar -from uuid import UUID import kopf +import kubernetes_asyncio import pytimeparse - import resourcehandle import resourceprovider - +from cache import Cache, CacheTag from kopfobject import KopfObject from poolboy import Poolboy @@ -23,13 +21,12 @@ class ResourcePool(KopfObject): kind = "ResourcePool" plural = "resourcepools" - instances = {} class_lock = asyncio.Lock() @classmethod async def get(cls, name: str) -> ResourcePoolT: async with cls.class_lock: - return cls.instances.get(name) + return cls.cache_get(CacheTag.POOL, name) @classmethod async def register( @@ -44,7 +41,7 @@ async def register( uid: str, ) -> ResourcePoolT: async with cls.class_lock: - resource_pool = cls.instances.get(name) + resource_pool = cls.cache_get(CacheTag.POOL, name) if resource_pool: resource_pool.refresh( annotations = annotations, @@ -65,13 +62,16 @@ async def register( status = status, uid = uid, ) - resource_pool.__register() + resource_pool.cache_set(CacheTag.POOL, name, ttl=300) return resource_pool @classmethod async def unregister(cls, name: str) -> ResourcePoolT|None: async with cls.class_lock: - return cls.instances.pop(name, None) + resource_pool = cls.cache_get(CacheTag.POOL, name) + if resource_pool: + Cache.delete(CacheTag.POOL, name) + return resource_pool @property def delete_unhealthy_resource_handles(self) -> bool: @@ -127,11 +127,6 @@ def max_unready(self) -> int|None: def min_available(self) -> int: return self.spec.get('minAvailable', 0) - @property - def resource_handler_idx(self) -> int: - """Label value used to select which resource handler pod should manage this ResourcePool.""" - return int(UUID(self.uid)) % Poolboy.resource_handler_count - @property def resource_provider_name(self) -> str|None: return self.spec.get('provider', {}).get('name') @@ -145,38 +140,10 @@ def vars(self) -> Mapping: return self.spec.get('vars', {}) def __register(self) -> None: - self.instances[self.name] = self + self.cache_set(CacheTag.POOL, self.name, ttl=300) def __unregister(self) -> None: - self.instances.pop(self.name, None) - - async def assign_resource_handler(self): - """Apply label to indicate resource handler should manage this ResourcePool. 
- Do not change label on items which are deleting.""" - if ( - self.deletion_timestamp is None and - self.labels.get(Poolboy.resource_handler_idx_label) != str(self.resource_handler_idx) - ): - try: - patch = [{ - "op": "test", - "path": "/metadata/deletionTimestamp", - "value": None, - }] - patch.append({ - "op": "add", - "path": f"/metadata/labels/{Poolboy.resource_handler_idx_label.replace('/', '~1')}", - "value": str(self.resource_handler_idx), - } if self.labels else { - "op": "add", - "path": f"/metadata/labels", - "value": { - Poolboy.resource_handler_idx_label: str(self.resource_handler_idx), - } - }) - await self.json_patch(patch) - except kubernetes_asyncio.client.exceptions.ApiException as exception: - pass + Cache.delete(CacheTag.POOL, self.name) async def get_resource_provider(self) -> ResourceProviderT: """Return ResourceProvider configured to manage ResourceHandle.""" @@ -214,14 +181,14 @@ async def manage(self, logger: kopf.ObjectLogger): resource_handle_deficit = self.max_unready - unready_count if resource_handle_deficit > 0: - for i in range(resource_handle_deficit): - resource_handle = await resourcehandle.ResourceHandle.create_for_pool( - logger=logger, - resource_pool=self - ) - resource_handles_for_status.append({ - "name": resource_handle.name, - }) + for i in range(resource_handle_deficit): + resource_handle = await resourcehandle.ResourceHandle.create_for_pool( + logger=logger, + resource_pool=self + ) + resource_handles_for_status.append({ + "name": resource_handle.name, + }) patch = [] if not self.status: @@ -251,3 +218,21 @@ async def manage(self, logger: kopf.ObjectLogger): if patch: await self.json_patch_status(patch) + + async def refetch(self) -> ResourcePoolT | None: + """Fetch updated object from K8s API.""" + try: + definition = await Poolboy.custom_objects_api.get_namespaced_custom_object( + Poolboy.operator_domain, + Poolboy.operator_version, + Poolboy.namespace, + 'resourcepools', + self.name, + ) + self.refresh_from_definition(definition) + return self + except kubernetes_asyncio.client.exceptions.ApiException as e: + if e.status == 404: + await self.unregister(name=self.name) + return None + raise diff --git a/operator/resourceprovider.py b/operator/resourceprovider.py index e8cdd83..d4e4bd3 100644 --- a/operator/resourceprovider.py +++ b/operator/resourceprovider.py @@ -2,13 +2,14 @@ import re from copy import deepcopy from datetime import timedelta -from typing import List, Mapping, TypeVar +from typing import List, Mapping, Optional, TypeVar import jinja2 import jsonpointer import kopf import poolboy_k8s import pytimeparse +from cache import Cache, CacheTag from deep_merge import deep_merge from jsonpatch_from_diff import jsonpatch_from_diff from metrics.timer_decorator import TimerDecoratorMeta @@ -144,26 +145,41 @@ class _ValidationException(Exception): class ResourceProvider(metaclass=TimerDecoratorMeta): - instances = {} lock = asyncio.Lock() + @classmethod + def __cache_get(cls, name: str) -> Optional[ResourceProviderT]: + """Get ResourceProvider from cache.""" + cached = Cache.get(CacheTag.PROVIDER, name) + if cached is None: + return None + if isinstance(cached, cls): + return cached + # RedisBackend returns dict, reconstruct + return cls(definition=cached) + + def __cache_set(self, ttl: int = 300) -> None: + """Store ResourceProvider in cache.""" + # Store the definition dict for Redis compatibility + Cache.set(CacheTag.PROVIDER, self.name, self.definition, ttl) + @classmethod def __register_definition(cls, definition: Mapping) -> 
ResourceProviderT: name = definition['metadata']['name'] - resource_provider = cls.instances.get(name) + resource_provider = cls.__cache_get(name) if resource_provider: - resource_provider.definition = definition - self.__init_resource_template_validator() + resource_provider.__init__(definition=definition) else: resource_provider = cls(definition=definition) - cls.instances[name] = resource_provider + resource_provider.__cache_set(ttl=300) return resource_provider @classmethod def find_provider_by_template_match(cls, template: Mapping) -> ResourceProviderT: provider_matches = [] - for provider in cls.instances.values(): - if provider.is_match_for_template(template): + for name in Cache.get_keys_by_tag(CacheTag.PROVIDER): + provider = cls.__cache_get(name) + if provider and provider.is_match_for_template(template): provider_matches.append(provider) if len(provider_matches) == 0: raise kopf.TemporaryError("Unable to match template to ResourceProvider", delay=60) @@ -175,7 +191,7 @@ def find_provider_by_template_match(cls, template: Mapping) -> ResourceProviderT @classmethod async def get(cls, name: str) -> ResourceProviderT: async with cls.lock: - resource_provider = cls.instances.get(name) + resource_provider = cls.__cache_get(name) if resource_provider: return resource_provider definition = await Poolboy.custom_objects_api.get_namespaced_custom_object( @@ -210,9 +226,10 @@ async def preload(cls, logger: kopf.ObjectLogger) -> None: async def register(cls, definition: Mapping, logger: kopf.ObjectLogger) -> ResourceProviderT: async with cls.lock: name = definition['metadata']['name'] - resource_provider = cls.instances.get(name) + resource_provider = cls.__cache_get(name) if resource_provider: resource_provider.__init__(definition=definition) + resource_provider.__cache_set(ttl=300) logger.debug(f"Refreshed definition of ResourceProvider {name}") else: resource_provider = cls.__register_definition(definition=definition) @@ -222,11 +239,14 @@ async def register(cls, definition: Mapping, logger: kopf.ObjectLogger) -> Resou @classmethod async def unregister(cls, name: str, logger: kopf.ObjectLogger) -> ResourceProviderT|None: async with cls.lock: - if name in cls.instances: + resource_provider = cls.__cache_get(name) + if resource_provider: + Cache.delete(CacheTag.PROVIDER, name) logger.debug(f"Unregistered ResourceProvider {name}") - return cls.instances.pop(name) + return resource_provider def __init__(self, definition: Mapping) -> None: + self._definition = definition self.meta = definition['metadata'] self.spec = definition['spec'] self.__init_resource_template_validator() @@ -241,6 +261,11 @@ def __init_resource_template_validator(self) -> None: def __str__(self) -> str: return f"ResourceProvider {self.name}" + @property + def definition(self) -> Mapping: + """Return the full resource definition for cache serialization.""" + return self._definition + @property def approval_pending_message(self) -> bool: return self.spec.get('approval', {}).get('pendingMessage', 'Approval pending.') diff --git a/operator/resourcewatch.py b/operator/resourcewatch.py index 13be779..bdb8ec9 100644 --- a/operator/resourcewatch.py +++ b/operator/resourcewatch.py @@ -1,33 +1,33 @@ import asyncio -import inflection -import kopf -import kubernetes_asyncio import logging - -from copy import deepcopy -from datetime import datetime, timezone -from typing import Mapping, TypeVar - from base64 import urlsafe_b64encode +from datetime import datetime, timezone from hashlib import sha256 +from typing import Mapping, 
TypeVar +import inflection +import kopf +import kubernetes_asyncio import poolboy_k8s - +import resourcehandle +from cache import Cache, CacheTag from kopfobject import KopfObject from poolboy import Poolboy -import resourcehandle -import resourceprovider -logger = logging.getLogger('resource_watch') +logger = logging.getLogger("resource_watch") + class ResourceWatchFailedError(Exception): pass + class ResourceWatchRestartError(Exception): pass -ResourceHandleT = TypeVar('ResourceHandleT', bound='ResourceHandle') -ResourceWatchT = TypeVar('ResourceWatchT', bound='ResourceWatch') + +ResourceHandleT = TypeVar("ResourceHandleT", bound="ResourceHandle") +ResourceWatchT = TypeVar("ResourceWatchT", bound="ResourceWatch") + class ResourceWatch(KopfObject): api_group = Poolboy.operator_domain @@ -35,80 +35,80 @@ class ResourceWatch(KopfObject): kind = "ResourceWatch" plural = "resourcewatches" - instances = {} class_lock = asyncio.Lock() - class CacheEntry: - def __init__(self, resource: Mapping): - self.resource = resource - self.cache_datetime = datetime.now(timezone.utc) - - @property - def is_expired(self): - return (datetime.now(timezone.utc) - self.cache_datetime).total_seconds() > Poolboy.resource_refresh_interval - @classmethod - def __instance_key(cls, api_version: str, kind: str, namespace: str|None) -> str: + def __instance_key(cls, api_version: str, kind: str, namespace: str | None) -> str: """Return cache key used to identify ResourceWatch in instances dict""" - return "|".join((api_version, kind, namespace or '*')) + return "|".join((api_version, kind, namespace or "*")) @classmethod - def __make_name(cls, + def __make_name( + cls, api_version: str, kind: str, - namespace: str|None, + namespace: str | None, ): """Return unique name for ResourceWatch determined by watch target. 
This hash prevents race conditions when otherwise multiple watches might be created.""" - return (namespace or 'cluster') + '-' + urlsafe_b64encode( - sha256(':'.join((api_version,kind,namespace or '')).encode('utf-8')) - .digest() - ).decode('utf-8').replace('=', '').replace('-', '').replace('_', '').lower()[:12] + return ( + (namespace or "cluster") + + "-" + + urlsafe_b64encode( + sha256( + ":".join((api_version, kind, namespace or "")).encode("utf-8") + ).digest() + ) + .decode("utf-8") + .replace("=", "") + .replace("-", "") + .replace("_", "") + .lower()[:12] + ) @classmethod - def __get_instance(cls, + def __get_instance( + cls, api_version: str, kind: str, - namespace: str|None, + namespace: str | None, ): - """Return ResourceWatch from instances dict.""" - return cls.instances.get( - cls.__instance_key( - api_version=api_version, - kind=kind, - namespace=namespace - ) + """Return ResourceWatch from cache.""" + instance_key = cls.__instance_key( + api_version=api_version, kind=kind, namespace=namespace ) + return cls.cache_get(CacheTag.WATCH, instance_key) @classmethod def __register_definition(cls, definition: Mapping) -> ResourceWatchT: resource_watch = cls.__get_instance( - api_version=definition['spec']['apiVersion'], - kind=definition['spec']['kind'], - namespace=definition['spec'].get('namespace'), + api_version=definition["spec"]["apiVersion"], + kind=definition["spec"]["kind"], + namespace=definition["spec"].get("namespace"), ) if resource_watch: resource_watch.refresh_from_definition(definition=definition) else: resource_watch = cls( - annotations=definition['metadata'].get('annotations', {}), - labels=definition['metadata'].get('labels', {}), - meta=definition['metadata'], - name=definition['metadata']['name'], + annotations=definition["metadata"].get("annotations", {}), + labels=definition["metadata"].get("labels", {}), + meta=definition["metadata"], + name=definition["metadata"]["name"], namespace=Poolboy.namespace, - spec=definition['spec'], - status=definition.get('status', {}), - uid=definition['metadata']['uid'], + spec=definition["spec"], + status=definition.get("status", {}), + uid=definition["metadata"]["uid"], ) resource_watch.__register() return resource_watch @classmethod - async def create_as_needed(cls, + async def create_as_needed( + cls, api_version: str, kind: str, - namespace: str|None, - ) -> ResourceWatchT|None: + namespace: str | None, + ) -> ResourceWatchT | None: async with cls.class_lock: resource_watch = await cls.__get( api_version=api_version, @@ -125,7 +125,7 @@ async def create_as_needed(cls, ) definition = { - "apiVersion": '/'.join((cls.api_group, cls.api_version)), + "apiVersion": "/".join((cls.api_group, cls.api_version)), "kind": cls.kind, "metadata": { "name": name, @@ -133,18 +133,20 @@ async def create_as_needed(cls, "spec": { "apiVersion": api_version, "kind": kind, - } + }, } if namespace: - definition['spec']['namespace'] = namespace + definition["spec"]["namespace"] = namespace try: - definition = await Poolboy.custom_objects_api.create_namespaced_custom_object( - group = cls.api_group, - namespace = Poolboy.namespace, - plural = cls.plural, - version = cls.api_version, - body = definition, + definition = ( + await Poolboy.custom_objects_api.create_namespaced_custom_object( + group=cls.api_group, + namespace=Poolboy.namespace, + plural=cls.plural, + version=cls.api_version, + body=definition, + ) ) resource_watch = cls.from_definition(definition) logger.info(f"Created {resource_watch}") @@ -157,10 +159,11 @@ async def 
create_as_needed(cls, raise @classmethod - async def get(cls, + async def get( + cls, api_version: str, kind: str, - namespace: str|None, + namespace: str | None, ) -> ResourceWatchT: """Get ResourceWatch by watched resources""" async with cls.class_lock: @@ -171,10 +174,11 @@ async def get(cls, ) @classmethod - async def __get(cls, + async def __get( + cls, api_version: str, kind: str, - namespace: str|None, + namespace: str | None, ) -> ResourceWatchT: resource_watch = cls.__get_instance( api_version=api_version, @@ -192,11 +196,11 @@ async def __get(cls, try: list_object = await Poolboy.custom_objects_api.get_namespaced_custom_object( - group = cls.api_group, - name = name, - namespace = Poolboy.namespace, - plural = cls.plural, - version = cls.api_version, + group=cls.api_group, + name=name, + namespace=Poolboy.namespace, + plural=cls.plural, + version=cls.api_version, ) except kubernetes_asyncio.client.exceptions.ApiException as exception: if exception.status == 404: @@ -205,14 +209,15 @@ async def __get(cls, raise @classmethod - async def get_resource_from_any(cls, + async def get_resource_from_any( + cls, api_version: str, kind: str, name: str, - namespace: str|None, - not_found_okay: bool=False, - use_cache: bool=True, - ) -> Mapping|None: + namespace: str | None, + not_found_okay: bool = False, + use_cache: bool = True, + ) -> Mapping | None: # Try to get from other watch object watch = cls.__get_instance( api_version=api_version, @@ -252,29 +257,29 @@ async def register( ) -> ResourceWatchT: async with cls.class_lock: resource_watch = cls.__get_instance( - api_version=spec['apiVersion'], - kind=spec['kind'], - namespace=spec.get('namespace') + api_version=spec["apiVersion"], + kind=spec["kind"], + namespace=spec.get("namespace"), ) if resource_watch: resource_watch.refresh( - annotations = annotations, - labels = labels, - meta = meta, - spec = spec, - status = status, - uid = uid, + annotations=annotations, + labels=labels, + meta=meta, + spec=spec, + status=status, + uid=uid, ) else: resource_watch = cls( - annotations = annotations, - labels = labels, - meta = meta, - name = name, - namespace = namespace, - spec = spec, - status = status, - uid = uid, + annotations=annotations, + labels=labels, + meta=meta, + name=name, + namespace=namespace, + spec=spec, + status=status, + uid=uid, ) resource_watch.__register() return resource_watch @@ -284,21 +289,23 @@ async def stop_all(cls) -> None: """Stop all ResourceWatch tasks""" async with cls.class_lock: tasks = [] - for resource_watch in cls.instances.values(): - if resource_watch.task is not None: + for instance_key in Cache.get_keys_by_tag(CacheTag.WATCH): + resource_watch = cls.cache_get(CacheTag.WATCH, instance_key) + if resource_watch and resource_watch.task is not None: resource_watch.task.cancel() tasks.append(resource_watch.task) if tasks: await asyncio.gather(*tasks) - def __init__(self, - annotations: kopf.Annotations|Mapping, - labels: kopf.Labels|Mapping, - meta: kopf.Meta|Mapping, + def __init__( + self, + annotations: kopf.Annotations | Mapping, + labels: kopf.Labels | Mapping, + meta: kopf.Meta | Mapping, name: str, namespace: str, - spec: kopf.Spec|Mapping, - status: kopf.Status|Mapping, + spec: kopf.Spec | Mapping, + status: kopf.Status | Mapping, uid: str, ): super().__init__( @@ -311,140 +318,60 @@ def __init__(self, status=status, uid=uid, ) - # Cache to store fetched resources - self.cache = {} # Task for when watch is running self.task = None def __register(self) -> None: """ - Add ResourceWatch to register of 
instances. + Add ResourceWatch to cache. """ - self.instances[self.__self_instance_key] = self + self.cache_set(CacheTag.WATCH, self.__self_instance_key(), ttl=300) def __str__(self) -> str: return ( f"{self.kind} {self.name} ({self.watch_api_version} {self.watch_kind} in {self.watch_namespace})" - if self.watch_namespace else - f"{self.kind} {self.name} ({self.watch_api_version} {self.watch_kind})" + if self.watch_namespace + else f"{self.kind} {self.name} ({self.watch_api_version} {self.watch_kind})" ) def __self_instance_key(self) -> str: return self.__instance_key( - api_version=self.api_version, - kind=self.kind, - namespace=self.namespace, + api_version=self.watch_api_version, + kind=self.watch_kind, + namespace=self.watch_namespace, ) @property def name_hash(self) -> str: - return self.name.rsplit('-', 1)[1] + return self.name.rsplit("-", 1)[1] @property def watch_api_version(self) -> str: - return self.spec['apiVersion'] + return self.spec["apiVersion"] @property def watch_kind(self) -> str: - return self.spec['kind'] + return self.spec["kind"] @property - def watch_namespace(self) -> str|None: - return self.spec.get('namespace') - - def cache_clean(self): - self.cache = { - name: cache_entry - for name, cache_entry in self.cache.items() - if not cache_entry.is_expired - } - - async def create_pod(self, - logger: kopf.ObjectLogger, - ) -> None: - replicaset = kubernetes_asyncio.client.V1ReplicaSet( - api_version="apps/v1", - kind="ReplicaSet", - metadata=kubernetes_asyncio.client.V1ObjectMeta( - name=f"{Poolboy.manager_pod.metadata.name}-watch-{self.name_hash}", - namespace=Poolboy.namespace, - owner_references=[ - kubernetes_asyncio.client.V1OwnerReference( - api_version=Poolboy.manager_pod.api_version, - controller=True, - kind=Poolboy.manager_pod.kind, - name=Poolboy.manager_pod.metadata.name, - uid=Poolboy.manager_pod.metadata.uid, - ) - ] - ), - ) - replicaset.spec = kubernetes_asyncio.client.V1ReplicaSetSpec( - replicas=1, - selector=kubernetes_asyncio.client.V1LabelSelector( - match_labels={ - "app.kubernetes.io/name": Poolboy.manager_pod.metadata.name, - "app.kubernetes.io/instance": f"watch-{self.name_hash}", - }, - ), - template=kubernetes_asyncio.client.V1PodTemplateSpec( - metadata=kubernetes_asyncio.client.V1ObjectMeta( - labels={ - "app.kubernetes.io/name": Poolboy.manager_pod.metadata.name, - "app.kubernetes.io/instance": f"watch-{self.name_hash}", - }, - ), - spec=deepcopy(Poolboy.manager_pod.spec), - ), - ) + def watch_namespace(self) -> str | None: + return self.spec.get("namespace") - replicaset.spec.template.spec.containers[0].env = [ - env_var - for env_var in Poolboy.manager_pod.spec.containers[0].env - if env_var.name not in { - 'OPERATOR_MODE', - 'RESOURCE_HANDLER_COUNT', - 'RESOURCE_HANDLER_RESOURCES', - 'RESOURCE_WATCH_RESOURCES', - } - ] - replicaset.spec.template.spec.containers[0].env.append( - kubernetes_asyncio.client.V1EnvVar( - name='OPERATOR_MODE', - value='resource-watch', - ) - ) - replicaset.spec.template.spec.containers[0].env.append( - kubernetes_asyncio.client.V1EnvVar( - name='WATCH_NAME', - value=self.name, - ) - ) - replicaset.spec.template.spec.node_name = None - if Poolboy.resource_watch_resources: - replicaset.spec.template.spec.containers[0].resources = kubernetes_asyncio.client.V1ResourceRequirements( - limits=Poolboy.resource_watch_resources.get('limits'), - requests=Poolboy.resource_watch_resources.get('requests'), - ) + def __resource_cache_key(self, name: str) -> str: + """Build unique cache key for a watched resource.""" + return 
f"{self.name}:{name}" - replicaset = await Poolboy.apps_v1_api.create_namespaced_replica_set( - namespace=Poolboy.namespace, - body=replicaset, - ) - logger.info(f"Created ReplicaSet {replicaset.metadata.name} for {self}") - - async def get_resource(self, + async def get_resource( + self, name: str, - not_found_okay: bool=False, - use_cache: bool=True, - ) -> Mapping|None: + not_found_okay: bool = False, + use_cache: bool = True, + ) -> Mapping | None: + resource_cache_key = self.__resource_cache_key(name) if use_cache: - cache_entry = self.cache.get(name) - if cache_entry: - if cache_entry.is_expired: - self.cache.pop(name, None) - else: - return cache_entry.resource + cached = Cache.get(CacheTag.WATCH_RESOURCE, resource_cache_key) + if cached: + return cached try: resource = await poolboy_k8s.get_object( api_version=self.watch_api_version, @@ -458,7 +385,12 @@ async def get_resource(self, else: raise if use_cache and resource: - self.cache[name] = ResourceWatch.CacheEntry(resource) + Cache.set( + CacheTag.WATCH_RESOURCE, + resource_cache_key, + resource, + ttl=Poolboy.resource_refresh_interval, + ) return resource async def start(self, logger) -> None: @@ -467,23 +399,27 @@ async def start(self, logger) -> None: async def watch(self): try: - if '/' in self.watch_api_version: - group, version = self.watch_api_version.split('/') - plural = await poolboy_k8s.kind_to_plural(group=group, version=version, kind=self.watch_kind) + if "/" in self.watch_api_version: + group, version = self.watch_api_version.split("/") + plural = await poolboy_k8s.kind_to_plural( + group=group, version=version, kind=self.watch_kind + ) kwargs = {"group": group, "plural": plural, "version": version} if self.watch_namespace: method = Poolboy.custom_objects_api.list_namespaced_custom_object - kwargs['namespace'] = self.watch_namespace + kwargs["namespace"] = self.watch_namespace else: method = Poolboy.custom_objects_api.list_cluster_custom_object elif self.watch_namespace: method = getattr( - Poolboy.core_v1_api, "list_namespaced_" + inflection.underscore(self.watch_kind) + Poolboy.core_v1_api, + "list_namespaced_" + inflection.underscore(self.watch_kind), ) kwargs = {"namespace": self.watch_namespace} else: method = getattr( - Poolboy.core_v1_api, "list_" + inflection.underscore(self.watch_kind) + Poolboy.core_v1_api, + "list_" + inflection.underscore(self.watch_kind), ) kwargs = {} @@ -496,17 +432,23 @@ async def watch(self): return except ResourceWatchRestartError as e: logger.debug(f"{self} restart: {e}") - watch_duration = (datetime.now(timezone.utc) - watch_start_dt).total_seconds() + watch_duration = ( + datetime.now(timezone.utc) - watch_start_dt + ).total_seconds() if watch_duration < 10: await asyncio.sleep(10 - watch_duration) except ResourceWatchFailedError as e: logger.warning(f"{self} failed: {e}") - watch_duration = (datetime.now(timezone.utc) - watch_start_dt).total_seconds() + watch_duration = ( + datetime.now(timezone.utc) - watch_start_dt + ).total_seconds() if watch_duration < 60: await asyncio.sleep(60 - watch_duration) - except Exception as e: + except Exception: logger.exception(f"{self} exception") - watch_duration = (datetime.now(timezone.utc) - watch_start_dt).total_seconds() + watch_duration = ( + datetime.now(timezone.utc) - watch_start_dt + ).total_seconds() if watch_duration < 60: await asyncio.sleep(60 - watch_duration) logger.debug(f"Restarting {self}") @@ -516,23 +458,24 @@ async def watch(self): async def __watch(self, method, **kwargs): watch = None - self.cache_clean() try: watch = 
kubernetes_asyncio.watch.Watch() async for event in watch.stream(method, **kwargs): if not isinstance(event, Mapping): raise ResourceWatchFailedError(f"UNKNOWN EVENT: {event}") - event_obj = event['object'] - event_type = event['type'] + event_obj = event["object"] + event_type = event["type"] if not isinstance(event_obj, Mapping): event_obj = Poolboy.api_client.sanitize_for_serialization(event_obj) - if event_type == 'ERROR': - if event_obj['kind'] == 'Status': - if event_obj['reason'] in ('Expired', 'Gone'): - raise ResourceWatchRestartError(event_obj['reason'].lower()) + if event_type == "ERROR": + if event_obj["kind"] == "Status": + if event_obj["reason"] in ("Expired", "Gone"): + raise ResourceWatchRestartError(event_obj["reason"].lower()) else: - raise ResourceWatchFailedError(f"{event_obj['reason']} {event_obj['message']}") + raise ResourceWatchFailedError( + f"{event_obj['reason']} {event_obj['message']}" + ) else: raise ResourceWatchFailedError(f"UNKNOWN EVENT: {event}") try: @@ -549,38 +492,53 @@ async def __watch(self, method, **kwargs): await watch.close() async def __watch_event(self, event_type, event_obj): - event_obj_annotations = event_obj['metadata'].get('annotations') + event_obj_annotations = event_obj["metadata"].get("annotations") if not event_obj_annotations: return - if event_obj_annotations.get(Poolboy.resource_handle_deleted_annotation) is not None: + if ( + event_obj_annotations.get(Poolboy.resource_handle_deleted_annotation) + is not None + ): return - resource_handle_name = event_obj_annotations.get(Poolboy.resource_handle_name_annotation) - resource_index = int(event_obj_annotations.get(Poolboy.resource_index_annotation, 0)) - resource_name = event_obj['metadata']['name'] - resource_namespace = event_obj['metadata'].get('namespace') + resource_handle_name = event_obj_annotations.get( + Poolboy.resource_handle_name_annotation + ) + resource_index = int( + event_obj_annotations.get(Poolboy.resource_index_annotation, 0) + ) + resource_name = event_obj["metadata"]["name"] + resource_namespace = event_obj["metadata"].get("namespace") resource_description = ( f"{event_obj['apiVersion']} {event_obj['kind']} {resource_name} in {resource_namespace}" - if resource_namespace else - f"{event_obj['apiVersion']} {event_obj['kind']} {resource_name}" + if resource_namespace + else f"{event_obj['apiVersion']} {event_obj['kind']} {resource_name}" ) if not resource_handle_name: return - if event_type == 'DELETED': - self.cache.pop(resource_name, None) + resource_cache_key = self.__resource_cache_key(resource_name) + if event_type == "DELETED": + Cache.delete(CacheTag.WATCH_RESOURCE, resource_cache_key) else: - self.cache[resource_name] = ResourceWatch.CacheEntry(event_obj) + Cache.set( + CacheTag.WATCH_RESOURCE, + resource_cache_key, + event_obj, + ttl=Poolboy.resource_refresh_interval, + ) try: resource_handle = await resourcehandle.ResourceHandle.get( name=resource_handle_name, - use_cache=Poolboy.operator_mode_all_in_one, + use_cache=Poolboy.is_standalone, ) except kubernetes_asyncio.client.exceptions.ApiException as exception: if exception.status == 404: - logger.warning(f"ResourceHandle {resource_handle_name} not found for event on {resource_description}") + logger.warning( + f"ResourceHandle {resource_handle_name} not found for event on {resource_description}" + ) else: logger.exception( f"Failed to get ResourceHandle {resource_handle_name} for event on {resource_description}" @@ -600,30 +558,30 @@ async def __watch_event(self, event_type, event_obj): # Get full list of 
resources to update ResourceHandle status resource_states = [] - for (idx, resource) in enumerate(resource_handle.status_resources): + for idx, resource in enumerate(resource_handle.status_resources): if idx == resource_index: resource_states.append(event_obj) continue - reference = resource.get('reference') + reference = resource.get("reference") if reference: - if( - reference['apiVersion'] == self.watch_api_version and - reference['kind'] == self.watch_kind and - reference.get('namespace') == self.watch_namespace + if ( + reference["apiVersion"] == self.watch_api_version + and reference["kind"] == self.watch_kind + and reference.get("namespace") == self.watch_namespace ): resource_states.append( await self.get_resource( - name=reference['name'], + name=reference["name"], not_found_okay=True, ) ) else: resource_states.append( await self.get_resource_from_any( - api_version=reference['apiVersion'], - kind=reference['kind'], - name=reference['name'], - namespace=reference.get('namespace'), + api_version=reference["apiVersion"], + kind=reference["kind"], + name=reference["name"], + namespace=reference.get("namespace"), not_found_okay=True, ) ) @@ -636,7 +594,7 @@ async def __watch_event(self, event_type, event_obj): resource_claim=resource_claim, resource_states=resource_states, ) - except kubernetes_asyncio.client.exceptions.ApiException as exception: + except kubernetes_asyncio.client.exceptions.ApiException: logger.exception( f"Failed updating status on {resource_handle} from event on {resource_description}" ) diff --git a/operator/scheduler/__init__.py b/operator/scheduler/__init__.py new file mode 100644 index 0000000..2dbf4c2 --- /dev/null +++ b/operator/scheduler/__init__.py @@ -0,0 +1,3 @@ +""" +Scheduler module for Celery Beat configuration. +""" diff --git a/operator/scheduler/config_loader.py b/operator/scheduler/config_loader.py new file mode 100644 index 0000000..546fe94 --- /dev/null +++ b/operator/scheduler/config_loader.py @@ -0,0 +1,50 @@ +""" +Configuration loader for schedule overrides from ConfigMap. +""" + +import os +from typing import Any, Dict + +import yaml + + +def load_schedule_config() -> Dict[str, Any]: + """ + Load schedule configuration from YAML file. + + Returns: + Dictionary with schedule configuration. + """ + config_path = os.environ.get( + 'CELERY_SCHEDULE_CONFIG', + '/etc/poolboy/schedule_config.yaml' + ) + if os.path.exists(config_path): + try: + with open(config_path, 'r') as f: + return yaml.safe_load(f) or {} + except yaml.YAMLError as e: + print(f"Error parsing schedule config YAML: {e}") + return {} + return {} + + +def deep_merge(source: Dict, destination: Dict) -> Dict: + """ + Recursively merge two dictionaries. + + Args: + source: Source dictionary to merge from + destination: Destination dictionary to merge into + + Returns: + Merged dictionary + """ + result = destination.copy() + for key, value in source.items(): + if isinstance(value, dict) and key in result and isinstance(result[key], dict): + result[key] = deep_merge(value, result[key]) + else: + result[key] = value + return result + diff --git a/operator/scheduler/registry.py b/operator/scheduler/registry.py new file mode 100644 index 0000000..31db03f --- /dev/null +++ b/operator/scheduler/registry.py @@ -0,0 +1,103 @@ +""" +Beat Registry for declarative periodic task definition. 
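+
+Example (this mirrors the registration in tasks/resourcepool.py later in this patch):
+
+    @register_schedule(
+        task_name="maintain-all-pools",
+        seconds=30,
+        description="Periodic task to reconcile all ResourcePools",
+        owner="poolboy",
+    )
+    @app.task(name="tasks.resourcepool.maintain_all_pools")
+    def maintain_all_pools():
+        ...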
+""" + +import logging +from collections.abc import Callable +from dataclasses import dataclass, field +from typing import ClassVar, Optional + +from metrics import TimerDecoratorMeta + +logger = logging.getLogger(__name__) + +CRON_FIELDS_COUNT = 5 + + +@dataclass +class ScheduledTask: + """Represents a scheduled periodic task.""" + task_name: str + task_func: Callable + description: str + owner: str + cron: Optional[str] = None + seconds: Optional[int] = None + tags: list[str] = field(default_factory=list) + enabled: bool = False + + +class BeatRegistry(metaclass=TimerDecoratorMeta): + """Registry for periodic tasks.""" + _tasks: ClassVar[dict[str, ScheduledTask]] = {} + + @classmethod + def register( + cls, + task_name: str, + description: str, + owner: str, + cron: Optional[str] = None, + seconds: Optional[int] = None, + tags: Optional[list[str]] = None, + enabled: bool = False, + ): + """Decorator to register a periodic task.""" + def decorator(func: Callable) -> Callable: + scheduled_task = ScheduledTask( + task_name=task_name, + task_func=func, + cron=cron, + seconds=seconds, + description=description, + owner=owner, + tags=tags or [], + enabled=enabled, + ) + cls._tasks[task_name] = scheduled_task + logger.info(f"Registered periodic task: {task_name}") + return func + return decorator + + @classmethod + def get_task(cls, task_name: str) -> Optional[ScheduledTask]: + """Get a registered task by name.""" + return cls._tasks.get(task_name) + + @classmethod + def list_all(cls) -> dict[str, ScheduledTask]: + """List all registered tasks.""" + return cls._tasks.copy() + + @classmethod + def validate_registry(cls): + """Validate all registered tasks.""" + errors = [] + for name, task in cls._tasks.items(): + has_cron = task.cron is not None + has_seconds = task.seconds is not None + if not has_cron and not has_seconds: + errors.append(f"{name}: must have cron or seconds") + if has_cron and has_seconds: + errors.append(f"{name}: cannot have both cron and seconds") + if has_cron and not cls._is_valid_cron(task.cron): + errors.append(f"{name}: invalid cron '{task.cron}'") + if has_seconds and task.seconds <= 0: + errors.append(f"{name}: seconds must be positive") + if not task.description: + errors.append(f"{name}: missing description") + if not task.owner: + errors.append(f"{name}: missing owner") + if errors: + msg = f"Registry validation failed: {'; '.join(errors)}" + raise ValueError(msg) + logger.info(f"Registry validated: {len(cls._tasks)} tasks registered") + + @staticmethod + def _is_valid_cron(cron_expr: str) -> bool: + """Check if cron expression has valid number of fields.""" + parts = cron_expr.strip().split() + return len(parts) == CRON_FIELDS_COUNT + + +register_schedule = BeatRegistry.register diff --git a/operator/scheduler/scheduler.py b/operator/scheduler/scheduler.py new file mode 100644 index 0000000..ab1c961 --- /dev/null +++ b/operator/scheduler/scheduler.py @@ -0,0 +1,91 @@ +""" +Beat scheduler configuration builder. 
+""" + +from datetime import timedelta +from typing import Any + +from celery.schedules import crontab +from metrics import TimerDecoratorMeta + +from .config_loader import deep_merge, load_schedule_config +from .registry import CRON_FIELDS_COUNT, BeatRegistry + + +class BeatScheduler(metaclass=TimerDecoratorMeta): + """Manages beat schedule configuration from ConfigMap.""" + + def __init__(self): + raw_config = load_schedule_config() + self.schedules = raw_config.get('schedules', {}) + + def _build_task_config(self, task_name: str, registry_task) -> dict[str, Any]: + """Merge registry defaults with ConfigMap overrides.""" + if registry_task.seconds is not None: + default_schedule = {"seconds": registry_task.seconds} + else: + default_schedule = {"cron": registry_task.cron} + + config = { + "enabled": registry_task.enabled, + "schedule": default_schedule, + "options": {}, + } + + if task_name in self.schedules: + config = deep_merge(self.schedules[task_name], config) + + return config + + def _parse_cron(self, cron_str: str) -> crontab: + """Parse cron string to celery crontab object.""" + parts = cron_str.strip().split() + if len(parts) != CRON_FIELDS_COUNT: + raise ValueError(f"Invalid cron expression: {cron_str}") + + return crontab( + minute=parts[0], + hour=parts[1], + day_of_month=parts[2], + month_of_year=parts[3], + day_of_week=parts[4], + ) + + def _parse_schedule(self, schedule_config: dict) -> crontab | timedelta: + """Parse schedule config to celery schedule object.""" + if "seconds" in schedule_config: + return timedelta(seconds=schedule_config["seconds"]) + elif "cron" in schedule_config: + return self._parse_cron(schedule_config["cron"]) + else: + raise ValueError(f"Invalid schedule config: {schedule_config}") + + def build_schedule(self) -> dict[str, dict]: + """Build Celery beat_schedule from registry and ConfigMap.""" + BeatRegistry.validate_registry() + + beat_schedule = {} + + for task_name, registry_task in BeatRegistry.list_all().items(): + config = self._build_task_config(task_name, registry_task) + + if not config.get("enabled", False): + continue + + schedule_entry = { + "task": registry_task.task_func.name, + "schedule": self._parse_schedule(config["schedule"]), + } + + if "options" in config: + schedule_entry["options"] = config["options"] + + beat_schedule[task_name] = schedule_entry + + return beat_schedule + + +def setup_beat_schedule() -> dict[str, dict]: + """Setup function called from processor/app.py when scheduler is enabled.""" + scheduler = BeatScheduler() + return scheduler.build_schedule() diff --git a/operator/tasks/__init__.py b/operator/tasks/__init__.py new file mode 100644 index 0000000..f2d66d2 --- /dev/null +++ b/operator/tasks/__init__.py @@ -0,0 +1,35 @@ +""" +Celery tasks for Poolboy resource management. + +Task modules follow the same naming convention as the main Poolboy modules: +- resourcepool.py +- resourceclaim.py +- resourcehandle.py +- resourceprovider.py +- resourcewatch.py +- cleanup.py +""" + +from . import resourcepool + +# Placeholder imports for other task types (to be implemented in future phases) +try: + from . import resourceclaim +except ImportError: + pass +try: + from . import resourcehandle +except ImportError: + pass +try: + from . import resourceprovider +except ImportError: + pass +try: + from . import resourcewatch +except ImportError: + pass +try: + from . 
import cleanup +except ImportError: + pass diff --git a/operator/tasks/resourceclaim.py b/operator/tasks/resourceclaim.py new file mode 100644 index 0000000..209567e --- /dev/null +++ b/operator/tasks/resourceclaim.py @@ -0,0 +1,233 @@ +"""Celery tasks for ResourceClaim management.""" + +from celery.utils.log import get_task_logger +from distributed_lock import distributed_lock +from poolboy import Poolboy +from processor.app import WorkerState, app +from scheduler.registry import register_schedule + +logger = get_task_logger(__name__) + +BATCH_SIZE = 20 # claims per batch - distributes across workers + + +def _is_transient_exception(exc: Exception) -> bool: + """Check if exception is transient (expected retry scenario).""" + import kubernetes_asyncio + + if isinstance(exc, kubernetes_asyncio.client.exceptions.ApiException): + return True + + exc_class_name = type(exc).__name__ + exc_module = type(exc).__module__ + if exc_class_name == "TemporaryError" and "kopf" in exc_module: + return True + + return False + + +def _log_and_retry(task, name: str, namespace: str, exc: Exception, action: str): + """Log exception appropriately and retry the task.""" + countdown = Poolboy.workers_error_retry_countdown + + if _is_transient_exception(exc): + logger.warning(f"Claim {namespace}/{name} {action} error: {exc}") + else: + logger.error(f"Claim {namespace}/{name} {action} error: {exc}", exc_info=True) + + raise task.retry(exc=exc, countdown=countdown, max_retries=5) + + +async def _collect_claims_to_process() -> list: + """Collect all claims that need processing (not recently processed).""" + claims_to_process = [] + _continue = None + + while True: + # Note: Using cluster-wide listing since claims exist in user namespaces + claim_list = await Poolboy.custom_objects_api.list_cluster_custom_object( + group=Poolboy.operator_domain, + plural="resourceclaims", + version=Poolboy.operator_version, + _continue=_continue, + limit=50, + ) + + for item in claim_list.get("items", []): + # Skip ignored claims + if Poolboy.ignore_label in item["metadata"].get("labels", {}): + continue + + claims_to_process.append(item) + + _continue = claim_list["metadata"].get("continue") + if not _continue: + break + + return claims_to_process + + +async def _delete_claim(definition: dict) -> dict: + """Async wrapper for ResourceClaim.handle_delete(). + + Note: We do NOT refetch for delete operations. The claim may already + be deleted from K8s, but we still need to propagate the delete to + the ResourceHandle using the original definition. + """ + import resourceclaim + + claim = resourceclaim.ResourceClaim.from_definition(definition) + await claim.handle_delete(logger=logger) + await claim.unregister(name=claim.name, namespace=claim.namespace) + return {"status": "completed", "claim": claim.name, "namespace": claim.namespace} + + +def _dispatch_batch(claims: list) -> int: + """Dispatch a batch of claims as individual tasks. + + Note: Uses timestamp (truncated to minute) instead of resourceVersion for task_id. + This allows periodic reprocessing even when resourceVersion hasn't changed, + which is necessary for time-based triggers like lifespan.start. 
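+
+    For example, every dispatch for a given claim during the same minute reuses
+    the task_id "claim-sched-<uid>-<epoch minute>", which is what limits
+    scheduled dispatches to one per minute per claim.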
+ """ + import time + + ts_minute = int(time.time() // 60) # One dispatch allowed per minute per claim + + dispatched = 0 + for item in claims: + uid = item["metadata"]["uid"] + kwargs = { + "definition": item, + "name": item["metadata"]["name"], + "namespace": item["metadata"]["namespace"], + } + # Use timestamp instead of resourceVersion to allow periodic reprocessing + manage_claim.apply_async( + kwargs=kwargs, task_id=f"claim-sched-{uid}-{ts_minute}" + ) + dispatched += 1 + return dispatched + + +async def _manage_claim(definition: dict) -> dict: + """Async wrapper for ResourceClaim.manage().""" + import resourceclaim + + claim = resourceclaim.ResourceClaim.from_definition(definition) + # Refetch to get current state from K8s API (avoid stale data) + claim = await claim.refetch() + if not claim: + # Claim was deleted between dispatch and execution + return { + "status": "skipped", + "reason": "not_found", + "claim": definition["metadata"]["name"], + } + + # Register claim in cache to keep it fresh + await claim.register_definition(claim.definition) + + await claim.manage(logger=logger) + return {"status": "completed", "claim": claim.name, "namespace": claim.namespace} + + +@app.task(bind=True, acks_late=True) +def delete_claim(self, definition: dict, name: str, namespace: str): + """Execute ResourceClaim.handle_delete() in a worker.""" + uid = definition["metadata"]["uid"] + lock_key = f"resource_claim:{uid}" + + with distributed_lock(lock_key, timeout=60) as acquired: + if not acquired: + logger.debug(f"Claim {namespace}/{name} locked, retrying") + countdown = Poolboy.workers_lock_retry_countdown + raise self.retry(countdown=countdown, max_retries=None) + + try: + return WorkerState.run_async(_delete_claim(definition)) + except Exception as e: + _log_and_retry(self, name, namespace, e, "delete") + + +def dispatch_delete_claim(definition: dict, name: str, namespace: str): + """Dispatch delete_claim task with unique task_id.""" + uid = definition["metadata"]["uid"] + rv = definition["metadata"]["resourceVersion"] + kwargs = {"definition": definition, "name": name, "namespace": namespace} + delete_claim.apply_async( + kwargs=kwargs, + task_id=f"claim-delete-{uid}-{rv}", + ) + + +def dispatch_manage_claim(definition: dict, name: str, namespace: str): + """Dispatch manage_claim task. 
Always dispatches for operator events.""" + uid = definition["metadata"]["uid"] + rv = definition["metadata"]["resourceVersion"] + kwargs = {"definition": definition, "name": name, "namespace": namespace} + manage_claim.apply_async( + kwargs=kwargs, + task_id=f"claim-{uid}-{rv}", + ) + + +@app.task(bind=True, acks_late=True) +def manage_claim(self, definition: dict, name: str, namespace: str): + """Execute ResourceClaim.manage() in a worker.""" + uid = definition["metadata"]["uid"] + lock_key = f"resource_claim:{uid}" + + with distributed_lock(lock_key, timeout=60) as acquired: + if not acquired: + logger.debug(f"Claim {namespace}/{name} locked, retrying") + countdown = Poolboy.workers_lock_retry_countdown + raise self.retry(countdown=countdown, max_retries=None) + + try: + return WorkerState.run_async(_manage_claim(definition)) + except Exception as e: + _log_and_retry(self, name, namespace, e, "manage") + + +@register_schedule( + task_name="maintain-all-claims", + seconds=60, + description="Periodic task to reconcile all ResourceClaims", + owner="poolboy", +) +@app.task(name="tasks.resourceclaim.maintain_all_claims") +def maintain_all_claims(): + """Periodic task for Celery Beat - reconcile all claims using group for distribution.""" + from celery import group + + lock_key = "maintain_all_claims:global" + + with distributed_lock(lock_key, timeout=300) as acquired: + if not acquired: + return {"status": "skipped", "reason": "already_running"} + + # Collect all claims that need processing + claims = WorkerState.run_async(_collect_claims_to_process()) + + if not claims: + return {"status": "completed", "total": 0, "batches": 0} + + # Split into batches and dispatch using group (distributes across workers) + batches = [ + claims[i : i + BATCH_SIZE] for i in range(0, len(claims), BATCH_SIZE) + ] + + # Create group of batch tasks - Celery will distribute across available workers + batch_group = group(process_claim_batch.s(batch) for batch in batches) + batch_group.apply_async() + + logger.info( + f"Claim maintenance: {len(claims)} claims in {len(batches)} batches" + ) + return {"status": "dispatched", "total": len(claims), "batches": len(batches)} + + +@app.task(bind=True) +def process_claim_batch(self, claims: list): + """Process a batch of claims. 
Each batch runs on a different worker.""" + return _dispatch_batch(claims) diff --git a/operator/tasks/resourcehandle.py b/operator/tasks/resourcehandle.py new file mode 100644 index 0000000..9f2585c --- /dev/null +++ b/operator/tasks/resourcehandle.py @@ -0,0 +1,238 @@ +"""Celery tasks for ResourceHandle management.""" + +from celery.utils.log import get_task_logger +from distributed_lock import distributed_lock +from poolboy import Poolboy +from processor.app import WorkerState, app +from scheduler.registry import register_schedule + +logger = get_task_logger(__name__) + +BATCH_SIZE = 20 # handles per batch - distributes across workers + + +def _is_transient_exception(exc: Exception) -> bool: + """Check if exception is transient (expected retry scenario).""" + import kubernetes_asyncio + + # Check ApiException first (already imported) + if isinstance(exc, kubernetes_asyncio.client.exceptions.ApiException): + return True + + # Check kopf.TemporaryError by class name to avoid importing kopf + # This works because resourcehandle.py raises kopf.TemporaryError + exc_class_name = type(exc).__name__ + exc_module = type(exc).__module__ + if exc_class_name == "TemporaryError" and "kopf" in exc_module: + return True + + return False + + +def _log_and_retry(task, name: str, exc: Exception, action: str): + """Log exception appropriately and retry the task.""" + countdown = Poolboy.workers_error_retry_countdown + + if _is_transient_exception(exc): + # Expected transient errors - warning only, no traceback + logger.warning(f"Handle {name} {action} error: {exc}") + else: + # Unexpected error - log with traceback for debugging + logger.error(f"Handle {name} {action} error: {exc}", exc_info=True) + + raise task.retry(exc=exc, countdown=countdown, max_retries=5) + + +async def _collect_handles_to_process() -> list: + """Collect all handles that need processing (not recently processed).""" + handles_to_process = [] + _continue = None + + while True: + handle_list = await Poolboy.custom_objects_api.list_namespaced_custom_object( + group=Poolboy.operator_domain, + namespace=Poolboy.namespace, + plural="resourcehandles", + version=Poolboy.operator_version, + _continue=_continue, + limit=50, + ) + + for item in handle_list.get("items", []): + # Skip ignored handles + if Poolboy.ignore_label in item["metadata"].get("labels", {}): + continue + + handles_to_process.append(item) + + _continue = handle_list["metadata"].get("continue") + if not _continue: + break + + return handles_to_process + + +async def _delete_handle(definition: dict) -> dict: + """Async wrapper for ResourceHandle.handle_delete(). + + Note: We do NOT refetch for delete operations. The handle may already + be deleted from K8s, but we still need to propagate the delete to + child resources (ResourceClaimTest, etc.) using the original definition. + """ + import resourcehandle + + handle = resourcehandle.ResourceHandle.from_definition(definition) + await handle.handle_delete(logger=logger) + return {"status": "completed", "handle": handle.name} + + +def _dispatch_batch(handles: list) -> int: + """Dispatch a batch of handles as individual tasks. + + Note: Uses timestamp (truncated to minute) instead of resourceVersion for task_id. + This allows periodic reprocessing even when resourceVersion hasn't changed, + which is necessary for time-based triggers like lifespan.end. 
+ """ + import time + + ts_minute = int(time.time() // 60) # One dispatch allowed per minute per handle + + dispatched = 0 + for item in handles: + uid = item["metadata"]["uid"] + kwargs = { + "definition": item, + "name": item["metadata"]["name"], + "namespace": item["metadata"]["namespace"], + } + # Use timestamp instead of resourceVersion to allow periodic reprocessing + manage_handle.apply_async( + kwargs=kwargs, task_id=f"handle-sched-{uid}-{ts_minute}" + ) + dispatched += 1 + return dispatched + + +async def _manage_handle(definition: dict) -> dict: + """Async wrapper for ResourceHandle.manage().""" + import resourcehandle + + handle = resourcehandle.ResourceHandle.from_definition(definition) + # Refetch to get current state from K8s API (avoid stale data) + handle = await handle.refetch() + if not handle: + # Handle was deleted between dispatch and execution + return { + "status": "skipped", + "reason": "not_found", + "handle": definition["metadata"]["name"], + } + + # Register handle in cache for binding operations + # This ensures unbound handles are available for claim binding + await handle.register_definition(handle.definition) + + await handle.manage(logger=logger) + return {"status": "completed", "handle": handle.name} + + +@app.task(bind=True, acks_late=True) +def delete_handle(self, definition: dict, name: str, namespace: str): + """Execute ResourceHandle.handle_delete() in a worker.""" + uid = definition["metadata"]["uid"] + lock_key = f"resource_handle:{uid}" + + with distributed_lock(lock_key, timeout=60) as acquired: + if not acquired: + logger.debug(f"Handle {name} locked, retrying") + countdown = Poolboy.workers_lock_retry_countdown + raise self.retry(countdown=countdown, max_retries=None) + + try: + return WorkerState.run_async(_delete_handle(definition)) + except Exception as e: + _log_and_retry(self, name, e, "delete") + + +def dispatch_delete_handle(definition: dict, name: str, namespace: str): + """Dispatch delete_handle task with unique task_id.""" + uid = definition["metadata"]["uid"] + rv = definition["metadata"]["resourceVersion"] + kwargs = {"definition": definition, "name": name, "namespace": namespace} + delete_handle.apply_async( + kwargs=kwargs, + task_id=f"handle-delete-{uid}-{rv}", + ) + + +def dispatch_manage_handle(definition: dict, name: str, namespace: str): + """Dispatch manage_handle task. 
Always dispatches for operator events.""" + uid = definition["metadata"]["uid"] + rv = definition["metadata"]["resourceVersion"] + kwargs = {"definition": definition, "name": name, "namespace": namespace} + manage_handle.apply_async( + kwargs=kwargs, + task_id=f"handle-{uid}-{rv}", + ) + + +@app.task(bind=True, acks_late=True) +def manage_handle(self, definition: dict, name: str, namespace: str): + """Execute ResourceHandle.manage() in a worker.""" + uid = definition["metadata"]["uid"] + lock_key = f"resource_handle:{uid}" + + with distributed_lock(lock_key, timeout=60) as acquired: + if not acquired: + logger.debug(f"Handle {name} locked, retrying") + countdown = Poolboy.workers_lock_retry_countdown + raise self.retry(countdown=countdown, max_retries=None) + + try: + return WorkerState.run_async(_manage_handle(definition)) + except Exception as e: + _log_and_retry(self, name, e, "manage") + + +@register_schedule( + task_name="maintain-all-handles", + seconds=60, + description="Periodic task to reconcile all ResourceHandles", + owner="poolboy", +) +@app.task(name="tasks.resourcehandle.maintain_all_handles") +def maintain_all_handles(): + """Periodic task for Celery Beat - reconcile all handles using group for distribution.""" + from celery import group + + lock_key = "maintain_all_handles:global" + + with distributed_lock(lock_key, timeout=300) as acquired: + if not acquired: + return {"status": "skipped", "reason": "already_running"} + + # Collect all handles that need processing + handles = WorkerState.run_async(_collect_handles_to_process()) + + if not handles: + return {"status": "completed", "total": 0, "batches": 0} + + # Split into batches and dispatch using group (distributes across workers) + batches = [ + handles[i : i + BATCH_SIZE] for i in range(0, len(handles), BATCH_SIZE) + ] + + # Create group of batch tasks - Celery will distribute across available workers + batch_group = group(process_handle_batch.s(batch) for batch in batches) + batch_group.apply_async() + + logger.info( + f"Handle maintenance: {len(handles)} handles in {len(batches)} batches" + ) + return {"status": "dispatched", "total": len(handles), "batches": len(batches)} + + +@app.task(bind=True) +def process_handle_batch(self, handles: list): + """Process a batch of handles. Each batch runs on a different worker.""" + return _dispatch_batch(handles) diff --git a/operator/tasks/resourcepool.py b/operator/tasks/resourcepool.py new file mode 100644 index 0000000..5fe424c --- /dev/null +++ b/operator/tasks/resourcepool.py @@ -0,0 +1,150 @@ +"""Celery tasks for ResourcePool management.""" + +from celery.utils.log import get_task_logger +from distributed_lock import distributed_lock +from processor.app import WorkerState, app +from scheduler.registry import register_schedule + +logger = get_task_logger(__name__) + + +async def _delete_pool_handles(definition: dict) -> dict: + """Async wrapper for ResourcePool.handle_delete().""" + import resourcepool + + pool = resourcepool.ResourcePool.from_definition(definition) + await pool.handle_delete(logger=logger) + return {"status": "completed", "pool": pool.name} + + +async def _maintain_all_pools() -> dict: + """List all pools and dispatch manage_pool for each. + + Note: Uses timestamp (truncated to minute) instead of resourceVersion for task_id. + This allows periodic reprocessing even when resourceVersion hasn't changed. 
+ """ + import time + + from poolboy import Poolboy + + ts_minute = int(time.time() // 60) # One dispatch allowed per minute per pool + + pool_list = await Poolboy.custom_objects_api.list_namespaced_custom_object( + group=Poolboy.operator_domain, + namespace=Poolboy.namespace, + plural="resourcepools", + version=Poolboy.operator_version, + ) + + dispatched = 0 + for item in pool_list.get("items", []): + uid = item["metadata"]["uid"] + kwargs = { + "definition": item, + "name": item["metadata"]["name"], + "namespace": item["metadata"]["namespace"], + } + # Use timestamp instead of resourceVersion to allow periodic reprocessing + manage_pool.apply_async(kwargs=kwargs, task_id=f"pool-sched-{uid}-{ts_minute}") + dispatched += 1 + + return {"dispatched": dispatched} + + +async def _manage_pool(definition: dict) -> dict: + """Async wrapper for ResourcePool.manage().""" + import resourcepool + + pool = resourcepool.ResourcePool.from_definition(definition) + await pool.manage(logger=logger) + return {"status": "completed", "pool": pool.name} + + +@app.task(bind=True, acks_late=True) +def delete_pool_handles(self, definition: dict, name: str, namespace: str): + """Execute ResourcePool.handle_delete() in a worker.""" + from poolboy import Poolboy + + uid = definition["metadata"]["uid"] + lock_key = f"resource_pool:{uid}" + + with distributed_lock(lock_key, timeout=60) as acquired: + if not acquired: + logger.debug(f"Pool {namespace}/{name} locked, retrying") + countdown = Poolboy.workers_lock_retry_countdown + raise self.retry(countdown=countdown, max_retries=None) + + try: + return WorkerState.run_async(_delete_pool_handles(definition)) + except Exception as e: + logger.error(f"Pool {namespace}/{name} delete error: {e}") + raise self.retry( + exc=e, countdown=Poolboy.workers_error_retry_countdown, max_retries=5 + ) + + +def dispatch_delete_pool_handles(definition: dict, name: str, namespace: str): + """Dispatch delete_pool_handles task with unique task_id.""" + uid = definition["metadata"]["uid"] + rv = definition["metadata"]["resourceVersion"] + kwargs = {"definition": definition, "name": name, "namespace": namespace} + delete_pool_handles.apply_async( + kwargs=kwargs, + task_id=f"pool-delete-{uid}-{rv}", + ) + + +def dispatch_manage_pool(definition: dict, name: str, namespace: str): + """Dispatch manage_pool task. 
Always dispatches for operator events.""" + uid = definition["metadata"]["uid"] + rv = definition["metadata"]["resourceVersion"] + kwargs = {"definition": definition, "name": name, "namespace": namespace} + manage_pool.apply_async( + kwargs=kwargs, + task_id=f"pool-{uid}-{rv}", + ) + + +@register_schedule( + task_name="maintain-all-pools", + seconds=30, + description="Periodic task to reconcile all ResourcePools", + owner="poolboy", +) +@app.task(name="tasks.resourcepool.maintain_all_pools") +def maintain_all_pools(): + """Periodic task for Celery Beat - reconcile all pools.""" + lock_key = "maintain_all_pools:global" + + with distributed_lock(lock_key, timeout=300) as acquired: + if not acquired: + return {"status": "skipped", "reason": "already_running"} + + result = WorkerState.run_async(_maintain_all_pools()) + if result.get("dispatched", 0) > 0: + logger.info(f"Maintenance dispatched: {result['dispatched']}") + return result + + +@app.task(bind=True, acks_late=True) +def manage_pool(self, definition: dict, name: str, namespace: str): + """Execute ResourcePool.manage() in a worker.""" + from poolboy import Poolboy + + uid = definition["metadata"]["uid"] + lock_key = f"resource_pool:{uid}" + + with distributed_lock(lock_key, timeout=60) as acquired: + if not acquired: + logger.debug(f"Pool {namespace}/{name} locked, retrying") + countdown = Poolboy.workers_lock_retry_countdown + raise self.retry(countdown=countdown, max_retries=None) + + try: + result = WorkerState.run_async(_manage_pool(definition)) + return result + except Exception as e: + logger.error(f"Pool {namespace}/{name} error: {e}") + raise self.retry( + exc=e, countdown=Poolboy.workers_error_retry_countdown, max_retries=5 + ) diff --git a/requirements.txt b/requirements.txt index 6b992ea..9cb36c4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,19 +1,21 @@ -aioprometheus==23.12.0 +celery>=5.3.0 inflection==0.5.1 Jinja2==3.1.5 jmespath-community==1.1.2 jsonpointer==2.2 jsonschema==3.2.0 +kombu>=5.3.0 openapi-schema-validator==0.1.5 prometheus-client==0.11.0 pyasn1==0.4.8 pyasn1-modules==0.2.8 pydantic==1.10.13 pyOpenSSL==20.0.1 -python-dateutil==2.8.1 +python-dateutil>=2.8.2 python-string-utils==1.0.0 pytimeparse==1.1.8 PyYAML==6.0.1 +redis>=4.5.0 requests==2.32.0 str2bool==1.1 StringGenerator==0.4.4 diff --git a/test/ansible.cfg b/test/ansible.cfg index ed865bf..1943b71 100644 --- a/test/ansible.cfg +++ b/test/ansible.cfg @@ -1,2 +1,4 @@ [defaults] inventory = hosts +# Show task execution time +callbacks_enabled = profile_tasks diff --git a/test/dev-local.yaml b/test/dev-local.yaml new file mode 100644 index 0000000..2703305 --- /dev/null +++ b/test/dev-local.yaml @@ -0,0 +1,19 @@ +# Development environment variables for tests +# Usage: ansible-playbook playbook.yaml -e @dev-local.yaml +ansible_python_interpreter: "{{ ansible_playbook_python }}" + +# Operator namespace (where Poolboy is deployed) +poolboy_namespace: poolboy-dev + +# Test namespace (where test resources are created) +poolboy_test_namespace: poolboy-dev-test + +# Operator domain (CRD API group) +poolboy_domain: poolboy.gpte.redhat.com + +# Service account name +poolboy_service_account: poolboy + +# Cleanup test resources after tests +poolboy_test_cleanup: true + diff --git a/test/roles/poolboy_test_simple/tasks/test-02.yaml b/test/roles/poolboy_test_simple/tasks/test-02.yaml index 8677914..c2a8560 100644 --- a/test/roles/poolboy_test_simple/tasks/test-02.yaml +++ b/test/roles/poolboy_test_simple/tasks/test-02.yaml @@ -191,8 +191,8 @@ 
__state.status.lifespan.maximum != '2d' or __state.status.lifespan.relativeMaximum != '1d' until: r_get_resource_claim is success - delay: 1 - retries: 15 + delay: 2 + retries: 30 - name: Selt lifespan test-02 to end now vars: diff --git a/test/roles/poolboy_test_simple/tasks/test-approval-01.yaml b/test/roles/poolboy_test_simple/tasks/test-approval-01.yaml index 9a4cccc..9fd2896 100644 --- a/test/roles/poolboy_test_simple/tasks/test-approval-01.yaml +++ b/test/roles/poolboy_test_simple/tasks/test-approval-01.yaml @@ -93,8 +93,8 @@ r_get_resource_claim.resources[0].status.resourceHandle is undefined or r_get_resource_claim.resources[0].status.resources[0].state is undefined until: r_get_resource_claim is success - delay: 1 - retries: 10 + delay: 2 + retries: 30 - name: Delete ResourceClaim test-approval-01 kubernetes.core.k8s: diff --git a/test/roles/poolboy_test_simple/tasks/test-auto-delete-01.yaml b/test/roles/poolboy_test_simple/tasks/test-auto-delete-01.yaml index 316bc0f..ddae839 100644 --- a/test/roles/poolboy_test_simple/tasks/test-auto-delete-01.yaml +++ b/test/roles/poolboy_test_simple/tasks/test-auto-delete-01.yaml @@ -122,8 +122,8 @@ register: r_get_resource_claim failed_when: r_get_resource_claim.resources | length != 0 until: r_get_resource_claim is success - retries: 5 - delay: 1 + retries: 45 + delay: 2 - name: Verify delete of ResourceHandle for test-auto-delete-01-a kubernetes.core.k8s_info: diff --git a/test/roles/poolboy_test_simple/tasks/test-auto-detach-01.yaml b/test/roles/poolboy_test_simple/tasks/test-auto-detach-01.yaml index d07935f..6287c33 100644 --- a/test/roles/poolboy_test_simple/tasks/test-auto-detach-01.yaml +++ b/test/roles/poolboy_test_simple/tasks/test-auto-detach-01.yaml @@ -124,8 +124,8 @@ r_get_resource_claim.resources | length != 1 or not r_get_resource_claim.resources[0].status.resourceHandle.detached | default(False) | bool until: r_get_resource_claim is success - retries: 5 - delay: 1 + retries: 45 + delay: 2 - name: Verify delete of ResourceHandle for test-auto-detach-01-a kubernetes.core.k8s_info: diff --git a/test/roles/poolboy_test_simple/tasks/test-finalizers-01.yaml b/test/roles/poolboy_test_simple/tasks/test-finalizers-01.yaml index fe4d2e8..78f6c4d 100644 --- a/test/roles/poolboy_test_simple/tasks/test-finalizers-01.yaml +++ b/test/roles/poolboy_test_simple/tasks/test-finalizers-01.yaml @@ -68,8 +68,8 @@ failed_when: >- r_get_resource_claim.resources[0].status.resources[0].state is undefined until: r_get_resource_claim is success - delay: 1 - retries: 10 + delay: 2 + retries: 30 - name: Save facts from for ResourceClaim test-finalizers-01-a vars: @@ -109,7 +109,7 @@ failed_when: >- r_get_resource_handle.resources | length != 1 or r_get_resource_handle.resources[0].metadata.finalizers is undefined or - r_get_resource_handle.resources[0].metadata.finalizers != [poolboy_domain ~ '/handler'] + r_get_resource_handle.resources[0].metadata.finalizers != [poolboy_domain] - name: Set deprecated finalizer on ResourceHandle for test-finalizers-01-a kubernetes.core.k8s: diff --git a/test/roles/poolboy_test_simple/tasks/test-ignore-01.yaml b/test/roles/poolboy_test_simple/tasks/test-ignore-01.yaml index a00444f..9beea73 100644 --- a/test/roles/poolboy_test_simple/tasks/test-ignore-01.yaml +++ b/test/roles/poolboy_test_simple/tasks/test-ignore-01.yaml @@ -168,10 +168,10 @@ register: r_get_resource_claim_test failed_when: >- r_get_resource_claim_test.resources | length != 1 or - (now(true) - r_get_resource_claim_test.resources[0].spec.ts | 
to_datetime('%Y-%m-%dT%H:%M:%SZ')).total_seconds() > 10 + (now(true) - r_get_resource_claim_test.resources[0].spec.ts | to_datetime('%Y-%m-%dT%H:%M:%SZ')).total_seconds() > 15 until: r_get_resource_claim_test is success - delay: 1 - retries: 20 + delay: 2 + retries: 45 - name: Delete ResourceClaim test-ignore-01-a kubernetes.core.k8s: @@ -223,8 +223,10 @@ apiVersion: "{{ poolboy_domain }}/v1" kind: ResourceClaim metadata: - annotations: - poolboy.dev.local/resource-claim-init-timestamp: "1970-01-01T00:00:00Z" + annotations: >- + {{ { + poolboy_domain ~ "/resource-claim-init-timestamp": "1970-01-01T00:00:00Z" + } }} finalizers: - "{{ poolboy_domain }}" labels: >- @@ -266,7 +268,7 @@ "{{ poolboy_domain }}/test": "simple" }, "annotations": { - "poolboy.dev.local/resource-claim-init-timestamp": "1970-01-01T00:00:00Z" + "{{ poolboy_domain }}/resource-claim-init-timestamp": "1970-01-01T00:00:00Z" } }, "spec": { @@ -285,14 +287,14 @@ parameterValues: stringvar: one resourceHandle: - apiVersion: poolboy.dev.local/v1 + apiVersion: "{{ poolboy_domain }}/v1" kind: ResourceHandle name: guid-abcde namespace: poolboy-dev resources: - name: test-ignore-01 provider: - apiVersion: poolboy.dev.local/v1 + apiVersion: "{{ poolboy_domain }}/v1" kind: ResourceProvider name: test-ignore-01 namespace: poolboy-dev @@ -389,18 +391,20 @@ - name: Create ResourceHandle guid-abcde kubernetes.core.k8s: definition: - apiVersion: poolboy.dev.local/v1 + apiVersion: "{{ poolboy_domain }}/v1" kind: ResourceClaimTest metadata: - annotations: - poolboy.dev.local/resource-claim-name: test-ignore-01-b - poolboy.dev.local/resource-claim-namespace: "{{ poolboy_test_namespace }}" - poolboy.dev.local/resource-handle-name: guid-abcde - poolboy.dev.local/resource-handle-namespace: "{{ poolboy_namespace }}" - poolboy.dev.local/resource-handle-uid: 00000000-0000-0000-0000-000000000000 - poolboy.dev.local/resource-index: "0" - poolboy.dev.local/resource-provider-name: test-ignore-01 - poolboy.dev.local/resource-provider-namespace: "{{ poolboy_namespace }}" + annotations: >- + {{ { + poolboy_domain ~ "/resource-claim-name": "test-ignore-01-b", + poolboy_domain ~ "/resource-claim-namespace": poolboy_test_namespace, + poolboy_domain ~ "/resource-handle-name": "guid-abcde", + poolboy_domain ~ "/resource-handle-namespace": poolboy_namespace, + poolboy_domain ~ "/resource-handle-uid": "00000000-0000-0000-0000-000000000000", + poolboy_domain ~ "/resource-index": "0", + poolboy_domain ~ "/resource-provider-name": "test-ignore-01", + poolboy_domain ~ "/resource-provider-namespace": poolboy_namespace, + } }} name: test-ignore-01-abcde namespace: "{{ poolboy_test_namespace }}" spec: @@ -438,10 +442,10 @@ register: r_get_resource_claim_test failed_when: >- r_get_resource_claim_test.resources | length != 1 or - (now(true) - r_get_resource_claim_test.resources[0].spec.ts | to_datetime('%Y-%m-%dT%H:%M:%SZ')).total_seconds() > 10 + (now(true) - r_get_resource_claim_test.resources[0].spec.ts | to_datetime('%Y-%m-%dT%H:%M:%SZ')).total_seconds() > 15 until: r_get_resource_claim_test is success - delay: 1 - retries: 20 + delay: 2 + retries: 45 - name: Delete ResourceClaim test-ignore-01-b kubernetes.core.k8s: diff --git a/test/roles/poolboy_test_simple/tasks/test-lifespan-start-01.yaml b/test/roles/poolboy_test_simple/tasks/test-lifespan-start-01.yaml index 87eb9d1..c45bd4d 100644 --- a/test/roles/poolboy_test_simple/tasks/test-lifespan-start-01.yaml +++ b/test/roles/poolboy_test_simple/tasks/test-lifespan-start-01.yaml @@ -85,8 +85,8 @@ 
r_get_resource_claim.resources[0].status.lifespan.start is undefined or r_get_resource_claim.resources[0].status.resourceHandle.name is undefined or r_get_resource_claim.resources[0].status.resources[0].reference is undefined - delay: 1 - retries: 10 + delay: 2 + retries: 60 until: r_get_resource_claim is successful - name: Delete ResourceClaim test-lifespan-start-01 @@ -130,6 +130,6 @@ register: r_get_resource_claim_test failed_when: r_get_resource_claim_test.resources | length != 0 until: r_get_resource_claim_test is success - delay: 1 - retries: 10 + delay: 2 + retries: 30 ... diff --git a/test/roles/poolboy_test_simple/tasks/test-linked-01.yaml b/test/roles/poolboy_test_simple/tasks/test-linked-01.yaml index 7b40dfb..926531e 100644 --- a/test/roles/poolboy_test_simple/tasks/test-linked-01.yaml +++ b/test/roles/poolboy_test_simple/tasks/test-linked-01.yaml @@ -147,8 +147,8 @@ __resource_claim.status.resources[1].provider.name != 'test-linked-01-binder' or __resource_claim.status.resources[1].waitingFor != 'Linked ResourceProvider' until: r_get_resource_claim is success - delay: 1 - retries: 10 + delay: 2 + retries: 45 - name: Save facts from for ResourceClaim test-linked-01-a vars: @@ -226,8 +226,8 @@ __resource_claim.status.resources[1].state is undefined or __resource_claim.status.resources[1].waitingFor is defined until: r_get_resource_claim is success - delay: 1 - retries: 10 + delay: 2 + retries: 45 - name: Verify state of ResourceClaim test-linked-01-a binder vars: diff --git a/test/roles/poolboy_test_simple/tasks/test-linked-02.yaml b/test/roles/poolboy_test_simple/tasks/test-linked-02.yaml index 8fbedf8..8798707 100644 --- a/test/roles/poolboy_test_simple/tasks/test-linked-02.yaml +++ b/test/roles/poolboy_test_simple/tasks/test-linked-02.yaml @@ -122,8 +122,8 @@ __resource_claim.status.resources[0].provider.name != 'test-linked-02-base' or __resource_claim.status.resources[0].state is undefined until: r_get_resource_claim is success - delay: 1 - retries: 10 + delay: 2 + retries: 30 - name: Save facts from for ResourceClaim test-linked-02-a vars: @@ -149,8 +149,8 @@ register: r_get_resource_claim_test failed_when: r_get_resource_claim_test.resources | length != 1 until: r_get_resource_claim_test is success - delay: 1 - retries: 10 + delay: 2 + retries: 30 - name: Verify state of ResourceClaimTest for test-linked-02-a-base vars: @@ -185,8 +185,8 @@ r_get_resource_claim_test.resources[0].spec.stringvalue != 'TWO' or r_get_resource_claim_test.resources[0].spec.numbervalue != 20 until: r_get_resource_claim_test is success - delay: 1 - retries: 10 + delay: 2 + retries: 30 - name: Delete ResourceClaim test-linked-02-a kubernetes.core.k8s: @@ -205,8 +205,8 @@ register: r_get_resource_claim failed_when: r_get_resource_claim.resources | length != 0 until: r_get_resource_claim is success - retries: 5 - delay: 1 + retries: 10 + delay: 2 - name: Verify delete of ResourceHandle for test-linked-02-a kubernetes.core.k8s_info: @@ -217,8 +217,8 @@ register: r_get_resource_handle failed_when: r_get_resource_handle.resources | length != 0 until: r_get_resource_handle is success - retries: 5 - delay: 1 + retries: 10 + delay: 2 - name: Verify delete of ResourceClaimTest test-linked-02-a-base kubernetes.core.k8s_info: @@ -229,6 +229,6 @@ register: r_get_resource_claim_test failed_when: r_get_resource_claim_test.resources | length != 0 until: r_get_resource_claim_test is success - delay: 1 + delay: 2 retries: 10 ... 
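The longer delay/retries values across these test tasks track the patch's move to asynchronous reconciliation: claim, handle, and pool events are dispatched to Celery workers that serialize per-object work behind a distributed lock, so test assertions must tolerate a dispatch-plus-retry window instead of an inline status update. Below is a minimal sketch of that shared worker-side pattern, reusing names from operator/tasks/resourceclaim.py above; the _reconcile body is a placeholder, and the lock backend is whatever the patch's distributed_lock module provides (presumably the Redis dependency it adds to requirements.txt).

# Sketch only: condenses the pattern shared by manage_claim, manage_handle and manage_pool.
from distributed_lock import distributed_lock
from poolboy import Poolboy
from processor.app import WorkerState, app


async def _reconcile(definition: dict) -> dict:
    # Placeholder for the real ResourceClaim.manage() / ResourceHandle.manage() call.
    return {"status": "completed", "name": definition["metadata"]["name"]}


@app.task(bind=True, acks_late=True)
def reconcile_example(self, definition: dict):
    uid = definition["metadata"]["uid"]

    with distributed_lock(f"resource_claim:{uid}", timeout=60) as acquired:
        if not acquired:
            # Another worker holds this object; retry later without a retry limit.
            raise self.retry(
                countdown=Poolboy.workers_lock_retry_countdown, max_retries=None
            )
        try:
            # Run the async reconcile on the worker's event loop.
            return WorkerState.run_async(_reconcile(definition))
        except Exception as exc:
            # Errors back off and retry up to 5 times before the task fails.
            raise self.retry(
                exc=exc,
                countdown=Poolboy.workers_error_retry_countdown,
                max_retries=5,
            )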
diff --git a/test/roles/poolboy_test_simple/tasks/test-linked-03.yaml b/test/roles/poolboy_test_simple/tasks/test-linked-03.yaml
index 2a4ccb2..0d55bfb 100644
--- a/test/roles/poolboy_test_simple/tasks/test-linked-03.yaml
+++ b/test/roles/poolboy_test_simple/tasks/test-linked-03.yaml
@@ -208,8 +208,8 @@
     __resource_claim.status.resources[0].state.spec.numbervalue != 0 or
     __resource_claim.status.resources[0].state.spec.stringvalue != 'NO BASE'
   until: r_get_resource_claim is success
-  delay: 1
-  retries: 10
+  delay: 2
+  retries: 30
 
 - name: Create ResourceClaim test-linked-03-b
   kubernetes.core.k8s:
@@ -252,8 +252,8 @@
     __resource_claim.status.resources[1].state.spec.numbervalue != 10 or
     __resource_claim.status.resources[1].state.spec.stringvalue != 'ONE-A'
   until: r_get_resource_claim is success
-  delay: 1
-  retries: 10
+  delay: 2
+  retries: 30
 
 - name: Save facts from for ResourceClaim test-linked-03-b
   vars:
@@ -276,8 +276,8 @@
   register: r_get_resource_claim_test
   failed_when: r_get_resource_claim_test.resources | length != 1
   until: r_get_resource_claim_test is success
-  delay: 1
-  retries: 10
+  delay: 2
+  retries: 30
 
 - name: Verify creation of binder ResourceClaimTest for test-linked-03-b
   kubernetes.core.k8s_info:
@@ -288,8 +288,8 @@
   register: r_get_resource_claim_test
   failed_when: r_get_resource_claim_test.resources | length != 1
   until: r_get_resource_claim_test is success
-  delay: 1
-  retries: 10
+  delay: 2
+  retries: 30
 
 - name: Delete ResourceClaim test-linked-03-b
   kubernetes.core.k8s:
@@ -308,8 +308,8 @@
   register: r_get_resource_claim
   failed_when: r_get_resource_claim.resources | length != 0
   until: r_get_resource_claim is success
-  retries: 5
-  delay: 1
+  retries: 10
+  delay: 2
 
 - name: Verify delete of ResourceHandle for test-linked-03-b
   kubernetes.core.k8s_info:
@@ -320,8 +320,8 @@
   register: r_get_resource_handle
   failed_when: r_get_resource_handle.resources | length != 0
   until: r_get_resource_handle is success
-  retries: 5
-  delay: 1
+  retries: 10
+  delay: 2
 
 - name: Verify delete of base ResourceClaimTest test-linked-03-b
   kubernetes.core.k8s_info:
@@ -332,8 +332,8 @@
   register: r_get_resource_claim_test
   failed_when: r_get_resource_claim_test.resources | length != 0
   until: r_get_resource_claim_test is success
-  delay: 1
-  retries: 10
+  delay: 2
+  retries: 30
 
 - name: Verify delete of binder ResourceClaimTest test-linked-03-b
   kubernetes.core.k8s_info:
@@ -344,8 +344,8 @@
   register: r_get_resource_claim_test
   failed_when: r_get_resource_claim_test.resources | length != 0
   until: r_get_resource_claim_test is success
-  delay: 1
-  retries: 10
+  delay: 2
+  retries: 30
 
 - name: Create ResourceClaim test-linked-03-c
   kubernetes.core.k8s:
@@ -388,8 +388,8 @@
     __resource_claim.status.resources[1].state.spec.numbervalue != 200 or
     __resource_claim.status.resources[1].state.spec.stringvalue != 'TWO-B'
   until: r_get_resource_claim is success
-  delay: 1
-  retries: 10
+  delay: 2
+  retries: 30
 
 - name: Save facts from for ResourceClaim test-linked-03-c
   vars:
@@ -412,8 +412,8 @@
   register: r_get_resource_claim_test
   failed_when: r_get_resource_claim_test.resources | length != 1
   until: r_get_resource_claim_test is success
-  delay: 1
-  retries: 10
+  delay: 2
+  retries: 30
 
 - name: Verify creation of binder ResourceClaimTest for test-linked-03-c
   kubernetes.core.k8s_info:
@@ -424,8 +424,8 @@
   register: r_get_resource_claim_test
   failed_when: r_get_resource_claim_test.resources | length != 1
   until: r_get_resource_claim_test is success
-  delay: 1
-  retries: 10
+  delay: 2
+  retries: 30
 
 - name: Delete ResourceClaim test-linked-03-c
   kubernetes.core.k8s:
@@ -444,8 +444,8 @@
   register: r_get_resource_claim
   failed_when: r_get_resource_claim.resources | length != 0
   until: r_get_resource_claim is success
-  retries: 5
-  delay: 1
+  retries: 10
+  delay: 2
 
 - name: Verify delete of ResourceHandle for test-linked-03-c
   kubernetes.core.k8s_info:
@@ -456,8 +456,8 @@
   register: r_get_resource_handle
   failed_when: r_get_resource_handle.resources | length != 0
   until: r_get_resource_handle is success
-  retries: 5
-  delay: 1
+  retries: 10
+  delay: 2
 
 - name: Verify delete of base ResourceClaimTest test-linked-03-c
   kubernetes.core.k8s_info:
@@ -468,8 +468,8 @@
   register: r_get_resource_claim_test
   failed_when: r_get_resource_claim_test.resources | length != 0
   until: r_get_resource_claim_test is success
-  delay: 1
-  retries: 10
+  delay: 2
+  retries: 30
 
 - name: Verify delete of binder ResourceClaimTest test-linked-03-c
   kubernetes.core.k8s_info:
@@ -480,6 +480,6 @@
   register: r_get_resource_claim_test
   failed_when: r_get_resource_claim_test.resources | length != 0
   until: r_get_resource_claim_test is success
-  delay: 1
-  retries: 10
+  delay: 2
+  retries: 30
 ...
diff --git a/test/roles/poolboy_test_simple/tasks/test-pool-03.yaml b/test/roles/poolboy_test_simple/tasks/test-pool-03.yaml
index 3f45180..1efbf25 100644
--- a/test/roles/poolboy_test_simple/tasks/test-pool-03.yaml
+++ b/test/roles/poolboy_test_simple/tasks/test-pool-03.yaml
@@ -191,7 +191,7 @@
         resources:
         - name: test-pool-03
           provider:
-            apiVersion: poolboy.dev.local/v1
+            apiVersion: "{{ poolboy_domain }}/v1"
             kind: ResourceProvider
             name: test-pool-03
             namespace: poolboy-dev
diff --git a/test/roles/poolboy_test_simple/tasks/test-pool-04.yaml b/test/roles/poolboy_test_simple/tasks/test-pool-04.yaml
index 2b5a02d..ed24b75 100644
--- a/test/roles/poolboy_test_simple/tasks/test-pool-04.yaml
+++ b/test/roles/poolboy_test_simple/tasks/test-pool-04.yaml
@@ -120,8 +120,8 @@
     r_get_resource_claim.resources[0].status.resourceHandle is undefined or
     r_get_resource_claim.resources[0].status.resourceHandle.name == failed_resource_handle_name
   until: r_get_resource_claim is success
-  delay: 1
-  retries: 10
+  delay: 2
+  retries: 30
 
 - name: Update ResourcePool test-pool-04 to create healthy but unready ResourceHandles
   kubernetes.core.k8s:
diff --git a/test/roles/poolboy_test_simple/tasks/test-ready-01.yaml b/test/roles/poolboy_test_simple/tasks/test-ready-01.yaml
index ee50851..5e597ae 100644
--- a/test/roles/poolboy_test_simple/tasks/test-ready-01.yaml
+++ b/test/roles/poolboy_test_simple/tasks/test-ready-01.yaml
@@ -69,13 +69,18 @@
     name: test-ready-01
     namespace: "{{ poolboy_test_namespace }}"
   register: r_get_resource_claim
+  vars:
+    _lifespan_seconds: >-
+      {{
+        (
+          r_get_resource_claim.resources[0].status.lifespan.end | to_datetime("%Y-%m-%dT%H:%M:%SZ") -
+          r_get_resource_claim.resources[0].status.lifespan.start | to_datetime("%Y-%m-%dT%H:%M:%SZ")
+        ).total_seconds() | int
+      }}
   failed_when: >-
     r_get_resource_claim.resources[0].status.ready != false or
     r_get_resource_claim.resources[0].status.lifespan.firstReady is defined or
-    (
-      r_get_resource_claim.resources[0].status.lifespan.end | to_datetime("%Y-%m-%dT%H:%M:%SZ") -
-      r_get_resource_claim.resources[0].status.lifespan.start | to_datetime("%Y-%m-%dT%H:%M:%SZ")
-    ).total_seconds() != 24 * 60 * 60
+    (_lifespan_seconds | int) < 86398 or (_lifespan_seconds | int) > 86402
   until: r_get_resource_claim is success
   delay: 1
   retries: 10
@@ -104,16 +109,21 @@
     name: test-ready-01
     namespace: "{{ poolboy_test_namespace }}"
   register: r_get_resource_claim
+  vars:
+    _lifespan_seconds: >-
+      {{
+        (
+          r_get_resource_claim.resources[0].status.lifespan.end | to_datetime("%Y-%m-%dT%H:%M:%SZ") -
+          r_get_resource_claim.resources[0].status.lifespan.firstReady | to_datetime("%Y-%m-%dT%H:%M:%SZ")
+        ).total_seconds() | int
+      }}
   failed_when: >-
     r_get_resource_claim.resources[0].status.ready != true or
     r_get_resource_claim.resources[0].status.lifespan.firstReady is undefined or
-    (
-      r_get_resource_claim.resources[0].status.lifespan.end | to_datetime("%Y-%m-%dT%H:%M:%SZ") -
-      r_get_resource_claim.resources[0].status.lifespan.firstReady | to_datetime("%Y-%m-%dT%H:%M:%SZ")
-    ).total_seconds() != 24 * 60 * 60
+    (_lifespan_seconds | int) < 86398 or (_lifespan_seconds | int) > 86402
   until: r_get_resource_claim is success
-  delay: 1
-  retries: 10
+  delay: 2
+  retries: 45
 
 - name: Delete ResourceClaim test-ready-01
   kubernetes.core.k8s:
diff --git a/test/roles/poolboy_test_simple/tasks/test-requester-01.yaml b/test/roles/poolboy_test_simple/tasks/test-requester-01.yaml
index e4c548c..fd7178e 100644
--- a/test/roles/poolboy_test_simple/tasks/test-requester-01.yaml
+++ b/test/roles/poolboy_test_simple/tasks/test-requester-01.yaml
@@ -92,7 +92,7 @@
       spec:
        resources:
        - provider:
-            apiVersion: poolboy.dev.local/v1
+            apiVersion: "{{ poolboy_domain }}/v1"
            kind: ResourceProvider
            name: test-requester-01
            namespace: poolboy-dev
diff --git a/test/roles/poolboy_test_simple/tasks/test-vars-03.yaml b/test/roles/poolboy_test_simple/tasks/test-vars-03.yaml
index 3129de4..ef4939f 100644
--- a/test/roles/poolboy_test_simple/tasks/test-vars-03.yaml
+++ b/test/roles/poolboy_test_simple/tasks/test-vars-03.yaml
@@ -190,8 +190,8 @@
   failed_when: >-
     r_get_resource_claim.resources[0].status.resources[0].validationError is defined
   until: r_get_resource_claim is success
-  delay: 1
-  retries: 10
+  delay: 2
+  retries: 35
 
 - name: Verify update of ResourceClaimTest test-vars-03-a
   kubernetes.core.k8s_info:
diff --git a/test/roles/poolboy_test_simple/tasks/test.yaml b/test/roles/poolboy_test_simple/tasks/test.yaml
index c07367f..27e6f37 100644
--- a/test/roles/poolboy_test_simple/tasks/test.yaml
+++ b/test/roles/poolboy_test_simple/tasks/test.yaml
@@ -525,6 +525,25 @@
   retries: 10
   delay: 1
 
+- name: Wait for all pool handles to be ready
+  kubernetes.core.k8s_info:
+    api_version: "{{ poolboy_domain }}/v1"
+    kind: ResourceHandle
+    namespace: "{{ poolboy_namespace }}"
+    label_selectors:
+    - "{{ poolboy_domain }}/resource-pool-name=test"
+    - "{{ poolboy_domain }}/resource-pool-namespace={{ poolboy_namespace }}"
+  register: r_get_pool_handles_ready
+  vars:
+    ready_handles: >-
+      {{ r_get_pool_handles_ready.resources | selectattr('status.ready', 'defined') | list }}
+  failed_when: >-
+    r_get_pool_handles_ready.resources | length != 3 or
+    ready_handles | length != 3
+  until: r_get_pool_handles_ready is success
+  retries: 30
+  delay: 2
+
 - name: Create ResourceClaim test-pool-match
   kubernetes.core.k8s:
     state: present
@@ -586,8 +605,8 @@
     claim.status.resourceHandle.name != first_pool_resource_handle_name or
     claim.status.resources[0].state.spec.vars.desired_state != 'started'
   until: r_get_test_pool_match_claim is success
-  retries: 10
-  delay: 1
+  retries: 30
+  delay: 3
 
 - name: Create ResourceClaim test-pool-explicit
   kubernetes.core.k8s:
@@ -658,8 +677,8 @@
     claim.status.resources[0].state.spec.vars.name != 'test-pool' or
     claim.status.resources[0].state.spec.vars.number != 23
   until: r_get_test_pool_explicit_claim is success
-  retries: 10
-  delay: 1
+  retries: 30
+  delay: 2
 
 - name: Delete test resource pool
   kubernetes.core.k8s:
@@ -716,7 +735,7 @@
   failed_when: r_verify_test_pool_handle_deletion.resources | length != 0
   until: r_verify_test_pool_handle_deletion is success
   delay: 2
-  retries: 10
+  retries: 30
 
 - name: Create test-templated ResourceProvider
   kubernetes.core.k8s:
@@ -774,7 +793,7 @@
     __test_resource.spec.vars.desired_state | default('') != 'stopped'
   until: r_get_test_templated_1 is success
   delay: 5
-  retries: 10
+  retries: 30
 
 - name: Delete resource claim test-templated-1
   kubernetes.core.k8s:
@@ -840,7 +859,7 @@
     r_get_test_lifespan_1.resources | length > 0
   until: r_get_test_lifespan_1 is success
   delay: 5
-  retries: 10
+  retries: 20
 
 - name: Create ResourceClaim for test-lifespan-2
   kubernetes.core.k8s:
@@ -924,7 +943,7 @@
     r_get_test_lifespan_2.resources | length > 0
   until: r_get_test_lifespan_2 is success
   delay: 5
-  retries: 10
+  retries: 30
 
 - name: Create test-lifespan resource pool
   kubernetes.core.k8s:
@@ -987,8 +1006,8 @@
     __test_lifespan_3.status.lifespan.end is undefined or
     23 != (__test_lifespan_3.status.lifespan.end | to_datetime('%Y-%m-%dT%H:%M:%S%z') - __test_lifespan_3.status.lifespan.start | to_datetime('%Y-%m-%dT%H:%M:%S%z')).total_seconds()
   until: r_get_test_lifespan_3 is success
-  delay: 5
-  retries: 10
+  delay: 2
+  retries: 25
 
 - name: Create test-disable-creation ResourceProvider
   kubernetes.core.k8s:
@@ -1160,7 +1179,7 @@
     not claim.status.resources[1].state is defined or
     not claim.status.resources[1].state.spec.vars.test_value == 'foo'
   until: r_test_linked_1_claim is success
-  retries: 5
+  retries: 30
   delay: 2
 - name: Create ResourceHandle for test-linked-2
   kubernetes.core.k8s:
@@ -1174,10 +1193,10 @@
      spec:
        resources:
        - provider:
-            apiVersion: poolboy.dev.local/v1
+            apiVersion: "{{ poolboy_domain }}/v1"
            kind: ResourceProvider
            name: test-base
-            namespace: "{{ poolboy_namespace}}"
+            namespace: "{{ poolboy_namespace }}"
 
 - name: Update ResourceClaimTest to set provision_vars for test-linked-2
   kubernetes.core.k8s: