From 3853a1c6c4ed71576f9708eadf5d2f1204368400 Mon Sep 17 00:00:00 2001 From: Erez Freiberger Date: Wed, 21 Jan 2026 22:42:21 +0100 Subject: [PATCH 1/6] feat: add KWOK mode for compute-domain-dra-plugin Add new kwok-compute-domain-dra-plugin component that creates ResourceSlices for compute domain channels on KWOK simulated nodes. Key changes: - New internal/kwok-compute-domain-dra-plugin package with Node controller - Controller watches nodes with label type=kwok or annotation kwok.x-k8s.io/node=fake - Creates ResourceSlice named kwok-{nodeName}-compute-domain-channel for each KWOK node - ResourceSlice contains channel-0 device for compute domain allocation - New cmd/kwok-compute-domain-dra-plugin entrypoint - Updated Dockerfile and Makefile to build new component --- Dockerfile | 10 ++ Makefile | 2 +- cmd/kwok-compute-domain-dra-plugin/main.go | 15 ++ .../kwok-compute-domain-dra-plugin/app.go | 90 ++++++++++ .../controllers/node/reconciler.go | 161 ++++++++++++++++++ 5 files changed, 277 insertions(+), 1 deletion(-) create mode 100644 cmd/kwok-compute-domain-dra-plugin/main.go create mode 100644 internal/kwok-compute-domain-dra-plugin/app.go create mode 100644 internal/kwok-compute-domain-dra-plugin/controllers/node/reconciler.go diff --git a/Dockerfile b/Dockerfile index 3052852..03f5339 100644 --- a/Dockerfile +++ b/Dockerfile @@ -65,6 +65,12 @@ COPY ./internal/status-updater/ ./internal/status-updater/ COPY ./internal/kwok-dra-plugin/ ./internal/kwok-dra-plugin/ RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH COMPONENTS=kwok-dra-plugin +FROM common-builder AS kwok-compute-domain-dra-plugin-builder +COPY ./cmd/kwok-compute-domain-dra-plugin/ ./cmd/kwok-compute-domain-dra-plugin/ +COPY ./pkg/compute-domain/ ./pkg/compute-domain/ +COPY ./internal/kwok-compute-domain-dra-plugin/ ./internal/kwok-compute-domain-dra-plugin/ +RUN --mount=type=cache,target=/root/.cache/go-build make build OS=$TARGETOS ARCH=$TARGETARCH
COMPONENTS=kwok-compute-domain-dra-plugin + FROM common-builder AS preloader-builder COPY ./cmd/preloader/ ./cmd/preloader/ RUN make build-preloader @@ -114,6 +120,10 @@ FROM ubuntu AS kwok-dra-plugin COPY --from=kwok-dra-plugin-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/kwok-dra-plugin /bin/ ENTRYPOINT ["/bin/kwok-dra-plugin"] +FROM ubuntu AS kwok-compute-domain-dra-plugin +COPY --from=kwok-compute-domain-dra-plugin-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/kwok-compute-domain-dra-plugin /bin/ +ENTRYPOINT ["/bin/kwok-compute-domain-dra-plugin"] + FROM ubuntu AS compute-domain-controller COPY --from=compute-domain-controller-builder /go/src/github.com/run-ai/fake-gpu-operator/bin/compute-domain-controller /bin/ ENTRYPOINT ["/bin/compute-domain-controller"] diff --git a/Makefile b/Makefile index 1b1d579..e755114 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ BUILD_DIR=$(shell pwd)/bin -COMPONENTS?=device-plugin dra-plugin-gpu status-updater kwok-gpu-device-plugin kwok-dra-plugin status-exporter status-exporter-kwok topology-server mig-faker compute-domain-controller compute-domain-dra-plugin +COMPONENTS?=device-plugin dra-plugin-gpu status-updater kwok-gpu-device-plugin kwok-dra-plugin kwok-compute-domain-dra-plugin status-exporter status-exporter-kwok topology-server mig-faker compute-domain-controller compute-domain-dra-plugin DOCKER_REPO_BASE=ghcr.io/run-ai/fake-gpu-operator DOCKER_TAG?=0.0.0-dev diff --git a/cmd/kwok-compute-domain-dra-plugin/main.go b/cmd/kwok-compute-domain-dra-plugin/main.go new file mode 100644 index 0000000..4d924f5 --- /dev/null +++ b/cmd/kwok-compute-domain-dra-plugin/main.go @@ -0,0 +1,15 @@ +package main + +import ( + "github.com/run-ai/fake-gpu-operator/internal/common/app" + "github.com/run-ai/fake-gpu-operator/internal/common/config" + kwokcomputedomaindraplugin "github.com/run-ai/fake-gpu-operator/internal/kwok-compute-domain-dra-plugin" +) + +func main() { + requiredEnvVars := 
[]string{kwokcomputedomaindraplugin.EnvFakeGpuOperatorNamespace} + config.ValidateConfig(requiredEnvVars) + + appRunner := app.NewAppRunner(&kwokcomputedomaindraplugin.KWOKComputeDomainDraPluginApp{}) + appRunner.Run() +} diff --git a/internal/kwok-compute-domain-dra-plugin/app.go b/internal/kwok-compute-domain-dra-plugin/app.go new file mode 100644 index 0000000..5fea0c6 --- /dev/null +++ b/internal/kwok-compute-domain-dra-plugin/app.go @@ -0,0 +1,90 @@ +package kwokcomputedomaindraplugin + +import ( + "context" + "log" + + "github.com/spf13/viper" + corev1 "k8s.io/api/core/v1" + resourceapi "k8s.io/api/resource/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes" + "k8s.io/klog/v2" + ctrl "sigs.k8s.io/controller-runtime" + + nodecontroller "github.com/run-ai/fake-gpu-operator/internal/kwok-compute-domain-dra-plugin/controllers/node" +) + +const ( + EnvFakeGpuOperatorNamespace = "FAKE_GPU_OPERATOR_NAMESPACE" +) + +type KWOKComputeDomainDraPluginAppConfiguration struct { + FakeGpuOperatorNamespace string `mapstructure:"FAKE_GPU_OPERATOR_NAMESPACE" validate:"required"` +} + +type KWOKComputeDomainDraPluginApp struct { + mgr ctrl.Manager + stopCh chan struct{} +} + +func (app *KWOKComputeDomainDraPluginApp) Run() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + go func() { + <-app.stopCh + cancel() + }() + + if err := app.mgr.Start(ctx); err != nil { + log.Fatalf("Failed to start manager: %v", err) + } +} + +func (app *KWOKComputeDomainDraPluginApp) Init(stopCh chan struct{}) { + app.stopCh = stopCh + + ctrl.SetLogger(klog.NewKlogr()) + + cfg, err := ctrl.GetConfig() + if err != nil { + log.Fatalf("Failed to get config: %v", err) + } + cfg.QPS = 100 + cfg.Burst = 200 + + scheme := runtime.NewScheme() + if err := corev1.AddToScheme(scheme); err != nil { + log.Fatalf("Failed to add corev1 to scheme: %v", err) + } + if err := resourceapi.AddToScheme(scheme); err != nil { + log.Fatalf("Failed to add resource.k8s.io to 
scheme: %v", err) + } + + namespace := viper.GetString(EnvFakeGpuOperatorNamespace) + app.mgr, err = ctrl.NewManager(cfg, ctrl.Options{ + Scheme: scheme, + }) + if err != nil { + log.Fatalf("Failed to create manager: %v", err) + } + + kubeClient, err := kubernetes.NewForConfig(cfg) + if err != nil { + log.Fatalf("Failed to create kubernetes client: %v", err) + } + + if err := nodecontroller.SetupWithManager(app.mgr, kubeClient, namespace); err != nil { + log.Fatalf("Failed to setup Node controller: %v", err) + } +} + +func (app *KWOKComputeDomainDraPluginApp) Name() string { + return "KWOKComputeDomainDraPlugin" +} + +func (app *KWOKComputeDomainDraPluginApp) GetConfig() interface{} { + var config KWOKComputeDomainDraPluginAppConfiguration + return config +} diff --git a/internal/kwok-compute-domain-dra-plugin/controllers/node/reconciler.go b/internal/kwok-compute-domain-dra-plugin/controllers/node/reconciler.go new file mode 100644 index 0000000..76ae072 --- /dev/null +++ b/internal/kwok-compute-domain-dra-plugin/controllers/node/reconciler.go @@ -0,0 +1,161 @@ +package node + +import ( + "context" + "fmt" + "log" + + corev1 "k8s.io/api/core/v1" + resourceapi "k8s.io/api/resource/v1" + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/kubernetes" + "k8s.io/utils/ptr" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/predicate" + + "github.com/run-ai/fake-gpu-operator/internal/common/constants" + "github.com/run-ai/fake-gpu-operator/pkg/compute-domain/consts" +) + +type NodeReconciler struct { + client.Client + Scheme *runtime.Scheme + kubeClient kubernetes.Interface + namespace string +} + +func (r *NodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + var node corev1.Node + if err := r.Get(ctx, req.NamespacedName, &node); err != nil { + if client.IgnoreNotFound(err) 
!= nil { + return ctrl.Result{}, err + } + if err := r.deleteResourceSlice(ctx, req.Name); err != nil { + log.Printf("Failed to delete ResourceSlice for node %s: %v", req.Name, err) + return ctrl.Result{}, err + } + return ctrl.Result{}, nil + } + + if err := r.createOrUpdateResourceSlice(ctx, &node); err != nil { + log.Printf("Failed to create/update ResourceSlice for node %s: %v", node.Name, err) + return ctrl.Result{}, err + } + + return ctrl.Result{}, nil +} + +func (r *NodeReconciler) resourceSliceName(nodeName string) string { + return fmt.Sprintf("kwok-%s-compute-domain-channel", nodeName) +} + +func (r *NodeReconciler) createOrUpdateResourceSlice(ctx context.Context, node *corev1.Node) error { + devices := r.enumerateComputeDomainDevices() + + resourceSlice := &resourceapi.ResourceSlice{ + ObjectMeta: metav1.ObjectMeta{ + Name: r.resourceSliceName(node.Name), + }, + Spec: resourceapi.ResourceSliceSpec{ + Driver: consts.ComputeDomainDriverName, + NodeName: ptr.To(node.Name), + Pool: resourceapi.ResourcePool{ + Name: node.Name, + ResourceSliceCount: 1, + }, + Devices: devices, + }, + } + + existing, err := r.kubeClient.ResourceV1().ResourceSlices().Get(ctx, resourceSlice.Name, metav1.GetOptions{}) + if err != nil { + if errors.IsNotFound(err) { + _, err = r.kubeClient.ResourceV1().ResourceSlices().Create(ctx, resourceSlice, metav1.CreateOptions{}) + if err != nil { + return fmt.Errorf("failed to create ResourceSlice for node %s: %w", node.Name, err) + } + log.Printf("Created ResourceSlice for KWOK node %s with %d devices\n", node.Name, len(devices)) + return nil + } + return fmt.Errorf("failed to get ResourceSlice for node %s: %w", node.Name, err) + } + + resourceSlice.ResourceVersion = existing.ResourceVersion + _, err = r.kubeClient.ResourceV1().ResourceSlices().Update(ctx, resourceSlice, metav1.UpdateOptions{}) + if err != nil { + return fmt.Errorf("failed to update ResourceSlice for node %s: %w", node.Name, err) + } + log.Printf("Updated ResourceSlice for 
KWOK node %s with %d devices\n", node.Name, len(devices)) + return nil +} + +func (r *NodeReconciler) deleteResourceSlice(ctx context.Context, nodeName string) error { + err := r.kubeClient.ResourceV1().ResourceSlices().Delete(ctx, r.resourceSliceName(nodeName), metav1.DeleteOptions{}) + if err != nil && !errors.IsNotFound(err) { + return fmt.Errorf("failed to delete ResourceSlice for node %s: %w", nodeName, err) + } + log.Printf("Deleted ResourceSlice for KWOK node %s\n", nodeName) + return nil +} + +func (r *NodeReconciler) enumerateComputeDomainDevices() []resourceapi.Device { + devices := make([]resourceapi.Device, 0, 1) + device := r.newChannelDevice(0) + devices = append(devices, device) + return devices +} + +func (r *NodeReconciler) newChannelDevice(channelID int) resourceapi.Device { + return resourceapi.Device{ + Name: fmt.Sprintf("channel-%d", channelID), + Attributes: map[resourceapi.QualifiedName]resourceapi.DeviceAttribute{ + "compute-domain.nvidia.com/type": { + StringValue: ptr.To("channel"), + }, + "compute-domain.nvidia.com/id": { + IntValue: ptr.To(int64(channelID)), + }, + }, + } +} + +func (r *NodeReconciler) SetupWithManager(mgr ctrl.Manager) error { + kwokNodePredicate := predicate.NewPredicateFuncs(func(obj client.Object) bool { + node, ok := obj.(*corev1.Node) + if !ok { + return false + } + return isKWOKNode(node) + }) + + return ctrl.NewControllerManagedBy(mgr). + For(&corev1.Node{}). + WithEventFilter(kwokNodePredicate). 
+ Complete(r) +} + +func isKWOKNode(node *corev1.Node) bool { + if node == nil || node.Labels == nil { + return false + } + return node.Labels["type"] == "kwok" || node.Annotations[constants.AnnotationKwokNode] == "fake" +} + +func SetupWithManager(mgr ctrl.Manager, kubeClient kubernetes.Interface, namespace string) error { + reconciler := &NodeReconciler{ + Client: mgr.GetClient(), + Scheme: mgr.GetScheme(), + kubeClient: kubeClient, + namespace: namespace, + } + + if err := reconciler.SetupWithManager(mgr); err != nil { + return fmt.Errorf("failed to setup Node reconciler: %w", err) + } + + log.Println("Node reconciler setup complete for KWOK Compute Domain DRA plugin") + return nil +} From 84cf9bac3125a05c1cef82449f94bc86909ad542 Mon Sep 17 00:00:00 2001 From: Erez Freiberger Date: Wed, 21 Jan 2026 22:43:05 +0100 Subject: [PATCH 2/6] feat(helm): add kwokComputeDomainDraPlugin values Add new Helm values block for KWOK compute-domain DRA plugin: - kwokComputeDomainDraPlugin.enabled (default: false) - image configuration - resource requests/limits --- deploy/fake-gpu-operator/values.yaml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/deploy/fake-gpu-operator/values.yaml b/deploy/fake-gpu-operator/values.yaml index 9226dc2..6d313d4 100644 --- a/deploy/fake-gpu-operator/values.yaml +++ b/deploy/fake-gpu-operator/values.yaml @@ -207,3 +207,17 @@ computeDomainDraPlugin: limits: cpu: "200m" memory: "400Mi" + +kwokComputeDomainDraPlugin: + enabled: false + image: + pullPolicy: Always + repository: ghcr.io/run-ai/fake-gpu-operator/kwok-compute-domain-dra-plugin + tag: "" + resources: + requests: + cpu: "100m" + memory: "200Mi" + limits: + cpu: "200m" + memory: "400Mi" From cc2ee1c61a61d1c1373894078eec724e58c0117c Mon Sep 17 00:00:00 2001 From: Erez Freiberger Date: Wed, 21 Jan 2026 22:44:06 +0100 Subject: [PATCH 3/6] feat(helm): add KWOK compute-domain-dra-plugin deployment templates Add Helm templates for KWOK compute-domain DRA plugin deployment: - 
deployment.yaml: single replica Deployment with no host volumes - serviceaccount.yaml: dedicated service account - clusterrole.yaml: permissions for nodes and resourceslices - clusterrolebinding.yaml: binds role to service account All templates gated by kwokComputeDomainDraPlugin.enabled flag. --- .../clusterrole.yaml | 26 +++++++++++++++ .../clusterrolebinding.yaml | 14 ++++++++ .../deployment.yaml | 33 +++++++++++++++++++ .../serviceaccount.yaml | 8 +++++ 4 files changed, 81 insertions(+) create mode 100644 deploy/fake-gpu-operator/templates/kwok-compute-domain-dra-plugin/clusterrole.yaml create mode 100644 deploy/fake-gpu-operator/templates/kwok-compute-domain-dra-plugin/clusterrolebinding.yaml create mode 100644 deploy/fake-gpu-operator/templates/kwok-compute-domain-dra-plugin/deployment.yaml create mode 100644 deploy/fake-gpu-operator/templates/kwok-compute-domain-dra-plugin/serviceaccount.yaml diff --git a/deploy/fake-gpu-operator/templates/kwok-compute-domain-dra-plugin/clusterrole.yaml b/deploy/fake-gpu-operator/templates/kwok-compute-domain-dra-plugin/clusterrole.yaml new file mode 100644 index 0000000..1fd0fab --- /dev/null +++ b/deploy/fake-gpu-operator/templates/kwok-compute-domain-dra-plugin/clusterrole.yaml @@ -0,0 +1,26 @@ +{{- if .Values.kwokComputeDomainDraPlugin.enabled }} +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kwok-compute-domain-dra-plugin +rules: + - apiGroups: + - "" + resources: + - nodes + verbs: + - get + - list + - watch + - apiGroups: + - resource.k8s.io + resources: + - resourceslices + verbs: + - get + - list + - watch + - create + - update + - delete +{{- end }} diff --git a/deploy/fake-gpu-operator/templates/kwok-compute-domain-dra-plugin/clusterrolebinding.yaml b/deploy/fake-gpu-operator/templates/kwok-compute-domain-dra-plugin/clusterrolebinding.yaml new file mode 100644 index 0000000..6991697 --- /dev/null +++ 
b/deploy/fake-gpu-operator/templates/kwok-compute-domain-dra-plugin/clusterrolebinding.yaml @@ -0,0 +1,14 @@ +{{- if .Values.kwokComputeDomainDraPlugin.enabled }} +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: kwok-compute-domain-dra-plugin +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kwok-compute-domain-dra-plugin +subjects: + - kind: ServiceAccount + name: kwok-compute-domain-dra-plugin + namespace: "{{ .Release.Namespace }}" +{{- end }} diff --git a/deploy/fake-gpu-operator/templates/kwok-compute-domain-dra-plugin/deployment.yaml b/deploy/fake-gpu-operator/templates/kwok-compute-domain-dra-plugin/deployment.yaml new file mode 100644 index 0000000..8f28a82 --- /dev/null +++ b/deploy/fake-gpu-operator/templates/kwok-compute-domain-dra-plugin/deployment.yaml @@ -0,0 +1,33 @@ +{{- if .Values.kwokComputeDomainDraPlugin.enabled }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kwok-compute-domain-dra-plugin + labels: + app: kwok-compute-domain-dra-plugin +spec: + selector: + matchLabels: + app: kwok-compute-domain-dra-plugin + component: kwok-compute-domain-dra-plugin + replicas: 1 + template: + metadata: + labels: + app: kwok-compute-domain-dra-plugin + component: kwok-compute-domain-dra-plugin + spec: + containers: + - name: kwok-compute-domain-dra-plugin + image: "{{ .Values.kwokComputeDomainDraPlugin.image.repository }}:{{ .Values.kwokComputeDomainDraPlugin.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: "{{ .Values.kwokComputeDomainDraPlugin.image.pullPolicy }}" + resources: + {{- toYaml .Values.kwokComputeDomainDraPlugin.resources | nindent 12 }} + env: + - name: FAKE_GPU_OPERATOR_NAMESPACE + value: "{{ .Release.Namespace }}" + restartPolicy: Always + serviceAccountName: kwok-compute-domain-dra-plugin + imagePullSecrets: + - name: gcr-secret +{{- end }} diff --git a/deploy/fake-gpu-operator/templates/kwok-compute-domain-dra-plugin/serviceaccount.yaml 
b/deploy/fake-gpu-operator/templates/kwok-compute-domain-dra-plugin/serviceaccount.yaml new file mode 100644 index 0000000..309af94 --- /dev/null +++ b/deploy/fake-gpu-operator/templates/kwok-compute-domain-dra-plugin/serviceaccount.yaml @@ -0,0 +1,8 @@ +{{- if .Values.kwokComputeDomainDraPlugin.enabled }} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: kwok-compute-domain-dra-plugin + labels: + app: kwok-compute-domain-dra-plugin +{{- end }} From ea0b0cbb97c12ecc7d6a0a3600814f9acf77fad7 Mon Sep 17 00:00:00 2001 From: Erez Freiberger Date: Wed, 21 Jan 2026 22:44:56 +0100 Subject: [PATCH 4/6] test: enable KWOK compute-domain-dra-plugin in integration tests Update integration test harness: - Enable kwokComputeDomainDraPlugin in values.yaml - Load kwok-compute-domain-dra-plugin image into kind cluster - Pass image tag to helm install - Wait for kwok-compute-domain-dra-plugin deployment readiness --- test/integration/setup.sh | 8 ++++++-- test/integration/values.yaml | 5 +++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/test/integration/setup.sh b/test/integration/setup.sh index ae90901..56a6773 100755 --- a/test/integration/setup.sh +++ b/test/integration/setup.sh @@ -53,7 +53,7 @@ if [[ "${SKIP_SETUP}" != "true" ]]; then echo "Loading images into kind cluster..." DOCKER_REPO_BASE="${DOCKER_REPO_BASE:-ghcr.io/run-ai/fake-gpu-operator}" - for component in dra-plugin-gpu status-updater status-exporter topology-server kwok-dra-plugin compute-domain-controller compute-domain-dra-plugin; do + for component in dra-plugin-gpu status-updater status-exporter topology-server kwok-dra-plugin kwok-compute-domain-dra-plugin compute-domain-controller compute-domain-dra-plugin; do IMAGE="${DOCKER_REPO_BASE}/${component}:${DOCKER_TAG}" echo "Loading ${IMAGE}..." 
kind load docker-image \ @@ -83,7 +83,8 @@ if [[ "${SKIP_SETUP}" != "true" ]]; then --set topologyServer.image.tag="${DOCKER_TAG}" \ --set kwokDraPlugin.image.tag="${DOCKER_TAG}" \ --set computeDomainController.image.tag="${DOCKER_TAG}" \ - --set computeDomainDraPlugin.image.tag="${DOCKER_TAG}" + --set computeDomainDraPlugin.image.tag="${DOCKER_TAG}" \ + --set kwokComputeDomainDraPlugin.image.tag="${DOCKER_TAG}" echo "Waiting for status-updater pod to be ready..." kubectl wait --for=condition=Ready pod -l app=status-updater -n gpu-operator --timeout=120s @@ -103,6 +104,9 @@ if [[ "${SKIP_SETUP}" != "true" ]]; then echo "Waiting for compute-domain-dra-plugin daemonset to be ready..." kubectl wait --for=condition=Ready pod -l app=compute-domain-dra-plugin -n gpu-operator --timeout=120s + echo "Waiting for kwok-compute-domain-dra-plugin deployment to be ready..." + kubectl wait --for=condition=Ready pod -l app=kwok-compute-domain-dra-plugin -n gpu-operator --timeout=120s + # Install KWOK controller for simulated nodes echo "Installing KWOK controller..." 
KWOK_VERSION="${KWOK_VERSION:-v0.7.0}" diff --git a/test/integration/values.yaml b/test/integration/values.yaml index 4892cc6..8a32522 100644 --- a/test/integration/values.yaml +++ b/test/integration/values.yaml @@ -62,6 +62,11 @@ computeDomainDraPlugin: image: pullPolicy: Never +kwokComputeDomainDraPlugin: + enabled: true + image: + pullPolicy: Never + # GPU topology configuration # Status-updater will use this to create topology ConfigMaps topology: From e8eb57bce99a1140f727d8a3f312718d66706b45 Mon Sep 17 00:00:00 2001 From: Erez Freiberger Date: Wed, 21 Jan 2026 22:46:01 +0100 Subject: [PATCH 5/6] test: add KWOK compute-domain integration tests Add integration tests for compute-domain on KWOK nodes: - Test manifest: compute-domain-kwok-pod.yaml with ComputeDomain CR and Pod targeting KWOK node with nodeSelector and tolerations - Test: ResourceSlice created for KWOK nodes with compute-domain channels - Test: Pod scheduled on KWOK node can allocate compute-domain claim - Test: ComputeDomain status updated to Ready with KWOK node listed --- test/integration/compute_domain_test.go | 115 ++++++++++++++++++ .../manifests/compute-domain-kwok-pod.yaml | 47 +++++++ 2 files changed, 162 insertions(+) create mode 100644 test/integration/manifests/compute-domain-kwok-pod.yaml diff --git a/test/integration/compute_domain_test.go b/test/integration/compute_domain_test.go index 8d2d149..cafefb6 100644 --- a/test/integration/compute_domain_test.go +++ b/test/integration/compute_domain_test.go @@ -246,6 +246,121 @@ metadata: }) }) +var _ = Describe("KWOK Compute Domain Integration Tests", func() { + var testNamespaces []string + + AfterEach(func() { + for _, ns := range testNamespaces { + deleteNamespace(ns) + } + + for _, ns := range testNamespaces { + Eventually(func() error { + _, err := kubeClient.CoreV1().Namespaces().Get(context.Background(), ns, metav1.GetOptions{}) + if err != nil { + return err + } + return nil + }).WithTimeout(60*time.Second).ShouldNot(Succeed(), 
"Namespace %s should be deleted", ns) + } + + testNamespaces = nil + }) + + Describe("KWOK Compute Domain ResourceSlice", func() { + It("should create ResourceSlice for KWOK nodes with compute domain channels", func() { + kwokNodeName := "kwok-gpu-node-1" + resourceSliceName := fmt.Sprintf("kwok-%s-compute-domain-channel", kwokNodeName) + + Eventually(func() error { + _, err := kubeClient.ResourceV1().ResourceSlices().Get( + context.Background(), resourceSliceName, metav1.GetOptions{}) + return err + }).WithTimeout(60*time.Second).Should(Succeed(), "ResourceSlice should exist for KWOK node") + + resourceSlice, err := kubeClient.ResourceV1().ResourceSlices().Get( + context.Background(), resourceSliceName, metav1.GetOptions{}) + Expect(err).NotTo(HaveOccurred()) + + Expect(resourceSlice.Spec.Driver).To(Equal(consts.ComputeDomainDriverName)) + Expect(resourceSlice.Spec.NodeName).NotTo(BeNil()) + Expect(*resourceSlice.Spec.NodeName).To(Equal(kwokNodeName)) + Expect(resourceSlice.Spec.Pool.Name).To(Equal(kwokNodeName)) + Expect(resourceSlice.Spec.Devices).NotTo(BeEmpty()) + Expect(resourceSlice.Spec.Devices[0].Name).To(Equal("channel-0")) + }) + }) + + Describe("KWOK Compute Domain with Pod", func() { + It("should allocate ComputeDomain to pod on KWOK node and update status", func() { + manifestPath := "manifests/compute-domain-kwok-pod.yaml" + namespace := "compute-domain-test-kwok" + computeDomainName := "kwok-domain" + podName := "kwok-pod" + + setupTest(manifestPath, namespace, &testNamespaces) + + Eventually(func() error { + _, err := nvidiaClient.ResourceV1beta1().ComputeDomains(namespace).Get( + context.Background(), computeDomainName, metav1.GetOptions{}) + return err + }).WithTimeout(30 * time.Second).Should(Succeed()) + + Eventually(func() error { + _, err := kubeClient.ResourceV1().ResourceClaimTemplates(namespace).Get( + context.Background(), computeDomainName, metav1.GetOptions{}) + return err + }).WithTimeout(30 * time.Second).Should(Succeed()) + + 
Eventually(func() error { + _, err := kubeClient.CoreV1().Pods(namespace).Get( + context.Background(), podName, metav1.GetOptions{}) + return err + }).WithTimeout(30 * time.Second).Should(Succeed()) + + var claimName string + Eventually(func() error { + claimNames, err := getResourceClaimNameFromPod(namespace, podName) + if err != nil { + return err + } + if len(claimNames) == 0 { + return fmt.Errorf("no ResourceClaims found for pod") + } + claimName = claimNames[0] + return nil + }).WithTimeout(30 * time.Second).Should(Succeed()) + + Eventually(func() error { + return waitForResourceClaimAllocated(namespace, claimName, 30*time.Second) + }).WithTimeout(60 * time.Second).Should(Succeed()) + + waitForPodReady(namespace, podName, podReadyTimeout) + + Eventually(func() error { + cd, err := nvidiaClient.ResourceV1beta1().ComputeDomains(namespace).Get( + context.Background(), computeDomainName, metav1.GetOptions{}) + if err != nil { + return err + } + if len(cd.Status.Nodes) == 0 { + return fmt.Errorf("no nodes in ComputeDomain status") + } + if cd.Status.Status != "Ready" { + return fmt.Errorf("ComputeDomain status is %s, expected Ready", cd.Status.Status) + } + return nil + }).WithTimeout(30 * time.Second).Should(Succeed()) + + cd, err := nvidiaClient.ResourceV1beta1().ComputeDomains(namespace).Get( + context.Background(), computeDomainName, metav1.GetOptions{}) + Expect(err).NotTo(HaveOccurred()) + Expect(cd.Status.Nodes).To(HaveLen(1)) + Expect(cd.Status.Nodes[0].Name).To(HavePrefix("kwok-gpu-node-")) + }) + }) +}) + // Helper function to apply a manifest from string func applyManifestFromString(manifest string) { cmd := exec.Command("kubectl", "apply", "-f", "-") diff --git a/test/integration/manifests/compute-domain-kwok-pod.yaml b/test/integration/manifests/compute-domain-kwok-pod.yaml new file mode 100644 index 0000000..c3a127e --- /dev/null +++ b/test/integration/manifests/compute-domain-kwok-pod.yaml @@ -0,0 +1,47 @@ +--- +apiVersion: v1 +kind: Namespace 
+metadata: + name: compute-domain-test-kwok + +--- +apiVersion: resource.nvidia.com/v1beta1 +kind: ComputeDomain +metadata: + name: kwok-domain + namespace: compute-domain-test-kwok +spec: + numNodes: 1 + channel: + allocationMode: Single + resourceClaimTemplate: + name: kwok-domain + +--- +apiVersion: v1 +kind: Pod +metadata: + namespace: compute-domain-test-kwok + name: kwok-pod + labels: + app: kwok-pod +spec: + nodeSelector: + type: kwok + tolerations: + - key: kwok.x-k8s.io/node + operator: Equal + value: fake + effect: NoSchedule + terminationGracePeriodSeconds: 0 + containers: + - name: ctr0 + image: busybox + command: ["sh", "-c"] + args: ["printenv; sleep 9999 & wait"] + resources: + claims: + - name: compute-domain + resourceClaims: + - name: compute-domain + resourceClaimTemplateName: kwok-domain From 2017adf4dcc25ff27114436aab93368febf6622a Mon Sep 17 00:00:00 2001 From: Erez Freiberger Date: Mon, 2 Feb 2026 10:16:06 +0100 Subject: [PATCH 6/6] refactor(kwok-compute-domain-dra-plugin): klog replace log with klog for structured logging --- .../controllers/node/reconciler.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/internal/kwok-compute-domain-dra-plugin/controllers/node/reconciler.go b/internal/kwok-compute-domain-dra-plugin/controllers/node/reconciler.go index 76ae072..7937090 100644 --- a/internal/kwok-compute-domain-dra-plugin/controllers/node/reconciler.go +++ b/internal/kwok-compute-domain-dra-plugin/controllers/node/reconciler.go @@ -3,7 +3,6 @@ package node import ( "context" "fmt" - "log" corev1 "k8s.io/api/core/v1" resourceapi "k8s.io/api/resource/v1" @@ -11,6 +10,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/kubernetes" + "k8s.io/klog/v2" "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -34,14 +34,14 @@ func (r *NodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl. 
return ctrl.Result{}, err } if err := r.deleteResourceSlice(ctx, req.Name); err != nil { - log.Printf("Failed to delete ResourceSlice for node %s: %v", req.Name, err) + klog.ErrorS(err, "Failed to delete ResourceSlice for node", "node", req.Name) return ctrl.Result{}, err } return ctrl.Result{}, nil } if err := r.createOrUpdateResourceSlice(ctx, &node); err != nil { - log.Printf("Failed to create/update ResourceSlice for node %s: %v", node.Name, err) + klog.ErrorS(err, "Failed to create/update ResourceSlice for node", "node", node.Name) return ctrl.Result{}, err } @@ -77,7 +77,7 @@ func (r *NodeReconciler) createOrUpdateResourceSlice(ctx context.Context, node * if err != nil { return fmt.Errorf("failed to create ResourceSlice for node %s: %w", node.Name, err) } - log.Printf("Created ResourceSlice for KWOK node %s with %d devices\n", node.Name, len(devices)) + klog.InfoS("Created ResourceSlice for KWOK node", "node", node.Name, "deviceCount", len(devices)) return nil } return fmt.Errorf("failed to get ResourceSlice for node %s: %w", node.Name, err) @@ -88,7 +88,7 @@ func (r *NodeReconciler) createOrUpdateResourceSlice(ctx context.Context, node * if err != nil { return fmt.Errorf("failed to update ResourceSlice for node %s: %w", node.Name, err) } - log.Printf("Updated ResourceSlice for KWOK node %s with %d devices\n", node.Name, len(devices)) + klog.InfoS("Updated ResourceSlice for KWOK node", "node", node.Name, "deviceCount", len(devices)) return nil } @@ -97,7 +97,7 @@ func (r *NodeReconciler) deleteResourceSlice(ctx context.Context, nodeName strin if err != nil && !errors.IsNotFound(err) { return fmt.Errorf("failed to delete ResourceSlice for node %s: %w", nodeName, err) } - log.Printf("Deleted ResourceSlice for KWOK node %s\n", nodeName) + klog.InfoS("Deleted ResourceSlice for KWOK node", "node", nodeName) return nil } @@ -156,6 +156,6 @@ func SetupWithManager(mgr ctrl.Manager, kubeClient kubernetes.Interface, namespa return fmt.Errorf("failed to setup Node 
reconciler: %w", err) } - log.Println("Node reconciler setup complete for KWOK Compute Domain DRA plugin") + klog.InfoS("Node reconciler setup complete for KWOK Compute Domain DRA plugin") return nil }