From 2a78f0d575713bf00e5f6a226ef53741cab834b6 Mon Sep 17 00:00:00 2001 From: Jonathan Rosenboim Date: Mon, 2 Mar 2026 12:53:55 +0200 Subject: [PATCH 01/25] Add Kai Scheduler VPA objects --- pkg/apis/kai/v1/global.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pkg/apis/kai/v1/global.go b/pkg/apis/kai/v1/global.go index c355d84c3..407028f76 100644 --- a/pkg/apis/kai/v1/global.go +++ b/pkg/apis/kai/v1/global.go @@ -13,6 +13,10 @@ import ( // GlobalConfig defines the global configuration of the system type GlobalConfig struct { + // VPA defines the default Vertical Pod Autoscaler configuration for all services + // +kubebuilder:validation:Optional + VPA *common.VPASpec `json:"vpa,omitempty"` + // Openshift configures the operator to install on Openshift // +kubebuilder:validation:Optional Openshift *bool `json:"openshift,omitempty"` @@ -103,6 +107,11 @@ func (g *GlobalConfig) SetDefaultWhereNeeded() { } g.RequireDefaultPodAntiAffinityTerm = common.SetDefault(g.RequireDefaultPodAntiAffinityTerm, ptr.To(false)) + + if g.VPA == nil { + g.VPA = &common.VPASpec{} + } + g.VPA.SetDefaultsWhereNeeded() } func (g *GlobalConfig) GetSecurityContext() *v1.SecurityContext { From 83845b152f51db4c047f3c0666ffc8cda74e88ae Mon Sep 17 00:00:00 2001 From: Jonathan Rosenboim Date: Mon, 2 Mar 2026 12:54:06 +0200 Subject: [PATCH 02/25] Add VPA Spec definition --- pkg/apis/kai/v1/common/vpa.go | 37 +++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 pkg/apis/kai/v1/common/vpa.go diff --git a/pkg/apis/kai/v1/common/vpa.go b/pkg/apis/kai/v1/common/vpa.go new file mode 100644 index 000000000..e4ba3ee9a --- /dev/null +++ b/pkg/apis/kai/v1/common/vpa.go @@ -0,0 +1,37 @@ +// Copyright 2025 NVIDIA CORPORATION +// SPDX-License-Identifier: Apache-2.0 + +// +kubebuilder:object:generate:=true +package common + +import ( + vpav1 "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1" + "k8s.io/utils/ptr" +) + +// VPASpec defines 
Vertical Pod Autoscaler configuration +type VPASpec struct { + // Enabled specifies if VPA should be enabled + // +kubebuilder:validation:Optional + Enabled *bool `json:"enabled,omitempty"` + + // UpdatePolicy controls when and how VPA applies changes to pod resources + // +kubebuilder:validation:Optional + UpdatePolicy *vpav1.PodUpdatePolicy `json:"updatePolicy,omitempty"` + + // ResourcePolicy controls how VPA computes recommended resources for containers + // +kubebuilder:validation:Optional + ResourcePolicy *vpav1.PodResourcePolicy `json:"resourcePolicy,omitempty"` +} + +func (v *VPASpec) SetDefaultsWhereNeeded() { + if v.Enabled == nil { + v.Enabled = ptr.To(false) + } + if v.UpdatePolicy == nil { + mode := vpav1.UpdateModeInPlaceOrRecreate + v.UpdatePolicy = &vpav1.PodUpdatePolicy{ + UpdateMode: &mode, + } + } +} From 7755f13a383704028a968db0b2f6294acb28bc72 Mon Sep 17 00:00:00 2001 From: Jonathan Rosenboim Date: Mon, 2 Mar 2026 12:59:21 +0200 Subject: [PATCH 03/25] Add VPA config to the scheduler's components --- pkg/apis/kai/v1/admission/admission.go | 10 +++++++++- pkg/apis/kai/v1/binder/binder.go | 10 +++++++++- pkg/apis/kai/v1/config_types.go | 14 +++++++------- .../v1/node_scale_adjuster/node_scale_adjuster.go | 10 +++++++++- .../pod_group_controller/pod_group_controller.go | 10 +++++++++- pkg/apis/kai/v1/pod_grouper/pod_grouper.go | 10 +++++++++- .../kai/v1/queue_controller/queue_controller.go | 10 +++++++++- pkg/apis/kai/v1/scheduler/scheduler.go | 10 +++++++++- 8 files changed, 70 insertions(+), 14 deletions(-) diff --git a/pkg/apis/kai/v1/admission/admission.go b/pkg/apis/kai/v1/admission/admission.go index 61f67a393..0cea4f5fc 100644 --- a/pkg/apis/kai/v1/admission/admission.go +++ b/pkg/apis/kai/v1/admission/admission.go @@ -48,9 +48,13 @@ type Admission struct { // set to empty string to disable // +kubebuilder:validation:Optional GPUPodRuntimeClassName *string `json:"gpuPodRuntimeClassName,omitempty"` + + // VPA specifies Vertical Pod Autoscaler 
configuration for the admission service + // +kubebuilder:validation:Optional + VPA *common.VPASpec `json:"vpa,omitempty"` } -func (b *Admission) SetDefaultsWhereNeeded(replicaCount *int32) { +func (b *Admission) SetDefaultsWhereNeeded(replicaCount *int32, globalVPA *common.VPASpec) { b.Service = common.SetDefault(b.Service, &common.Service{}) b.Service.SetDefaultsWhereNeeded(imageName) @@ -68,6 +72,10 @@ func (b *Admission) SetDefaultsWhereNeeded(replicaCount *int32) { b.MutatingWebhookConfigurationName = common.SetDefault(b.MutatingWebhookConfigurationName, ptr.To(defaultMutatingWebhookName)) b.GPUPodRuntimeClassName = common.SetDefault(b.GPUPodRuntimeClassName, ptr.To(constants.DefaultRuntimeClassName)) + + if b.VPA == nil { + b.VPA = globalVPA + } } // Webhook defines configuration for the admission webhook diff --git a/pkg/apis/kai/v1/binder/binder.go b/pkg/apis/kai/v1/binder/binder.go index 5763c87ab..4839cbaef 100644 --- a/pkg/apis/kai/v1/binder/binder.go +++ b/pkg/apis/kai/v1/binder/binder.go @@ -47,9 +47,13 @@ type Binder struct { // leave empty if unsure to let the operator auto detect using ClusterPolicy (nvidia gpu-operator only) // +kubebuilder:validation:Optional CDIEnabled *bool `json:"cdiEnabled,omitempty"` + + // VPA specifies Vertical Pod Autoscaler configuration for the binder + // +kubebuilder:validation:Optional + VPA *common.VPASpec `json:"vpa,omitempty"` } -func (b *Binder) SetDefaultsWhereNeeded(replicaCount *int32) { +func (b *Binder) SetDefaultsWhereNeeded(replicaCount *int32, globalVPA *common.VPASpec) { b.Service = common.SetDefault(b.Service, &common.Service{}) b.Service.Resources = common.SetDefault(b.Service.Resources, &common.Resources{}) if b.Service.Resources.Requests == nil { @@ -81,6 +85,10 @@ func (b *Binder) SetDefaultsWhereNeeded(replicaCount *int32) { b.ProbePort = common.SetDefault(b.ProbePort, ptr.To(8081)) b.MetricsPort = common.SetDefault(b.MetricsPort, ptr.To(8080)) + + if b.VPA == nil { + b.VPA = globalVPA + } } type 
ResourceReservation struct { diff --git a/pkg/apis/kai/v1/config_types.go b/pkg/apis/kai/v1/config_types.go index ac5cd35a1..ce35e2411 100644 --- a/pkg/apis/kai/v1/config_types.go +++ b/pkg/apis/kai/v1/config_types.go @@ -97,25 +97,25 @@ func (c *ConfigSpec) SetDefaultsWhereNeeded() { c.Global.SetDefaultWhereNeeded() c.QueueController = common.SetDefault(c.QueueController, &queue_controller.QueueController{}) - c.QueueController.SetDefaultsWhereNeeded(c.Global.ReplicaCount) + c.QueueController.SetDefaultsWhereNeeded(c.Global.ReplicaCount, c.Global.VPA) c.Binder = common.SetDefault(c.Binder, &binder.Binder{}) - c.Binder.SetDefaultsWhereNeeded(c.Global.ReplicaCount) + c.Binder.SetDefaultsWhereNeeded(c.Global.ReplicaCount, c.Global.VPA) c.PodGrouper = common.SetDefault(c.PodGrouper, &pod_grouper.PodGrouper{}) - c.PodGrouper.SetDefaultsWhereNeeded(c.Global.ReplicaCount) + c.PodGrouper.SetDefaultsWhereNeeded(c.Global.ReplicaCount, c.Global.VPA) c.Scheduler = common.SetDefault(c.Scheduler, &scheduler.Scheduler{}) - c.Scheduler.SetDefaultsWhereNeeded(c.Global.ReplicaCount) + c.Scheduler.SetDefaultsWhereNeeded(c.Global.ReplicaCount, c.Global.VPA) c.PodGroupController = common.SetDefault(c.PodGroupController, &pod_group_controller.PodGroupController{}) - c.PodGroupController.SetDefaultsWhereNeeded(c.Global.ReplicaCount) + c.PodGroupController.SetDefaultsWhereNeeded(c.Global.ReplicaCount, c.Global.VPA) c.Admission = common.SetDefault(c.Admission, &admission.Admission{}) - c.Admission.SetDefaultsWhereNeeded(c.Global.ReplicaCount) + c.Admission.SetDefaultsWhereNeeded(c.Global.ReplicaCount, c.Global.VPA) c.NodeScaleAdjuster = common.SetDefault(c.NodeScaleAdjuster, &node_scale_adjuster.NodeScaleAdjuster{}) - c.NodeScaleAdjuster.SetDefaultsWhereNeeded() + c.NodeScaleAdjuster.SetDefaultsWhereNeeded(c.Global.VPA) c.Prometheus = common.SetDefault(c.Prometheus, &prometheus.Prometheus{}) c.Prometheus.SetDefaultsWhereNeeded() diff --git 
a/pkg/apis/kai/v1/node_scale_adjuster/node_scale_adjuster.go b/pkg/apis/kai/v1/node_scale_adjuster/node_scale_adjuster.go index a8acc61fe..e83d8f357 100644 --- a/pkg/apis/kai/v1/node_scale_adjuster/node_scale_adjuster.go +++ b/pkg/apis/kai/v1/node_scale_adjuster/node_scale_adjuster.go @@ -22,6 +22,10 @@ type NodeScaleAdjuster struct { // Args specifies the CLI arguments for node-scale-adjuster // +kubebuilder:validation:Optional Args *Args `json:"args,omitempty"` + + // VPA specifies Vertical Pod Autoscaler configuration for the node-scale-adjuster + // +kubebuilder:validation:Optional + VPA *common.VPASpec `json:"vpa,omitempty"` } // Args specifies the CLI arguments for node-scale-adjuster @@ -54,10 +58,14 @@ func (args *Args) SetDefaultsWhereNeeded() { } // SetDefaultsWhereNeeded sets default for unset fields -func (nsa *NodeScaleAdjuster) SetDefaultsWhereNeeded() { +func (nsa *NodeScaleAdjuster) SetDefaultsWhereNeeded(globalVPA *common.VPASpec) { nsa.Service = common.SetDefault(nsa.Service, &common.Service{}) nsa.Service.SetDefaultsWhereNeeded(imageName) nsa.Args = common.SetDefault(nsa.Args, &Args{}) nsa.Args.SetDefaultsWhereNeeded() + + if nsa.VPA == nil { + nsa.VPA = globalVPA + } } diff --git a/pkg/apis/kai/v1/pod_group_controller/pod_group_controller.go b/pkg/apis/kai/v1/pod_group_controller/pod_group_controller.go index ef77f3623..f60418f67 100644 --- a/pkg/apis/kai/v1/pod_group_controller/pod_group_controller.go +++ b/pkg/apis/kai/v1/pod_group_controller/pod_group_controller.go @@ -35,9 +35,13 @@ type PodGroupController struct { // Replicas specifies the number podgroup controller replicas // +kubebuilder:validation:Optional Replicas *int32 `json:"replicas,omitempty"` + + // VPA specifies Vertical Pod Autoscaler configuration for the pod group controller + // +kubebuilder:validation:Optional + VPA *common.VPASpec `json:"vpa,omitempty"` } -func (pg *PodGroupController) SetDefaultsWhereNeeded(replicaCount *int32) { +func (pg *PodGroupController) 
SetDefaultsWhereNeeded(replicaCount *int32, globalVPA *common.VPASpec) { pg.Service = common.SetDefault(pg.Service, &common.Service{}) pg.Service.SetDefaultsWhereNeeded(imageName) @@ -61,6 +65,10 @@ func (pg *PodGroupController) SetDefaultsWhereNeeded(replicaCount *int32) { pg.Webhooks = common.SetDefault(pg.Webhooks, &PodGroupControllerWebhooks{}) pg.Webhooks.SetDefaultsWhereNeeded() + + if pg.VPA == nil { + pg.VPA = globalVPA + } } type Service struct { diff --git a/pkg/apis/kai/v1/pod_grouper/pod_grouper.go b/pkg/apis/kai/v1/pod_grouper/pod_grouper.go index 33d7c0a6b..a47dbfcab 100644 --- a/pkg/apis/kai/v1/pod_grouper/pod_grouper.go +++ b/pkg/apis/kai/v1/pod_grouper/pod_grouper.go @@ -33,6 +33,10 @@ type PodGrouper struct { // Replicas specifies the number of replicas of the pod-grouper controller // +kubebuilder:validation:Optional Replicas *int32 `json:"replicas,omitempty"` + + // VPA specifies Vertical Pod Autoscaler configuration for the pod-grouper + // +kubebuilder:validation:Optional + VPA *common.VPASpec `json:"vpa,omitempty"` } // Args defines command line arguments for the pod-grouper @@ -50,7 +54,7 @@ type Args struct { DefaultPrioritiesConfigMapNamespace *string `json:"defaultPrioritiesConfigMapNamespace,omitempty"` } -func (pg *PodGrouper) SetDefaultsWhereNeeded(replicaCount *int32) { +func (pg *PodGrouper) SetDefaultsWhereNeeded(replicaCount *int32, globalVPA *common.VPASpec) { pg.Service = common.SetDefault(pg.Service, &common.Service{}) pg.Service.SetDefaultsWhereNeeded(imageName) @@ -70,4 +74,8 @@ func (pg *PodGrouper) SetDefaultsWhereNeeded(replicaCount *int32) { pg.Args = common.SetDefault(pg.Args, &Args{}) pg.Replicas = common.SetDefault(pg.Replicas, ptr.To(ptr.Deref(replicaCount, 1))) pg.K8sClientConfig = common.SetDefault(pg.K8sClientConfig, &common.K8sClientConfig{}) + + if pg.VPA == nil { + pg.VPA = globalVPA + } } diff --git a/pkg/apis/kai/v1/queue_controller/queue_controller.go b/pkg/apis/kai/v1/queue_controller/queue_controller.go 
index 62e974023..9ac0deabd 100644 --- a/pkg/apis/kai/v1/queue_controller/queue_controller.go +++ b/pkg/apis/kai/v1/queue_controller/queue_controller.go @@ -42,9 +42,13 @@ type QueueController struct { // QueueLabelToDefaultMetricValue maps queue label keys to default metric values when the label is absent // +kubebuilder:validation:Optional QueueLabelToDefaultMetricValue *string `json:"queueLabelToDefaultMetricValue,omitempty"` + + // VPA specifies Vertical Pod Autoscaler configuration for the queue controller + // +kubebuilder:validation:Optional + VPA *common.VPASpec `json:"vpa,omitempty"` } -func (q *QueueController) SetDefaultsWhereNeeded(replicaCount *int32) { +func (q *QueueController) SetDefaultsWhereNeeded(replicaCount *int32, globalVPA *common.VPASpec) { q.Service = common.SetDefault(q.Service, &common.Service{}) q.Service.SetDefaultsWhereNeeded(imageName) @@ -68,6 +72,10 @@ func (q *QueueController) SetDefaultsWhereNeeded(replicaCount *int32) { q.Webhooks = common.SetDefault(q.Webhooks, &QueueControllerWebhooks{}) q.Webhooks.SetDefaultsWhereNeeded() + + if q.VPA == nil { + q.VPA = globalVPA + } } type Service struct { diff --git a/pkg/apis/kai/v1/scheduler/scheduler.go b/pkg/apis/kai/v1/scheduler/scheduler.go index edb862e32..e7d8beae0 100644 --- a/pkg/apis/kai/v1/scheduler/scheduler.go +++ b/pkg/apis/kai/v1/scheduler/scheduler.go @@ -30,9 +30,13 @@ type Scheduler struct { // Replicas specifies the number of replicas of the scheduler service // +kubebuilder:validation:Optional Replicas *int32 `json:"replicas,omitempty"` + + // VPA specifies Vertical Pod Autoscaler configuration for the scheduler + // +kubebuilder:validation:Optional + VPA *common.VPASpec `json:"vpa,omitempty"` } -func (s *Scheduler) SetDefaultsWhereNeeded(replicaCount *int32) { +func (s *Scheduler) SetDefaultsWhereNeeded(replicaCount *int32, globalVPA *common.VPASpec) { s.Service = common.SetDefault(s.Service, &common.Service{}) s.Service.Resources = common.SetDefault(s.Service.Resources, 
&common.Resources{}) @@ -63,6 +67,10 @@ func (s *Scheduler) SetDefaultsWhereNeeded(replicaCount *int32) { s.SchedulerService.SetDefaultsWhereNeeded() s.Replicas = common.SetDefault(s.Replicas, ptr.To(ptr.Deref(replicaCount, 1))) + + if s.VPA == nil { + s.VPA = globalVPA + } } // Service defines configuration for the scheduler service From 884bfbe921e3409cad12ed6bc5d7b703ad95f8ff Mon Sep 17 00:00:00 2001 From: Jonathan Rosenboim Date: Mon, 2 Mar 2026 12:59:51 +0200 Subject: [PATCH 04/25] Add construction of the VPA object --- pkg/operator/operands/common/vpa.go | 59 +++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 pkg/operator/operands/common/vpa.go diff --git a/pkg/operator/operands/common/vpa.go b/pkg/operator/operands/common/vpa.go new file mode 100644 index 000000000..ffa4db08b --- /dev/null +++ b/pkg/operator/operands/common/vpa.go @@ -0,0 +1,59 @@ +// Copyright 2025 NVIDIA CORPORATION +// SPDX-License-Identifier: Apache-2.0 + +package common + +import ( + appsv1 "k8s.io/api/apps/v1" + autoscalingv1 "k8s.io/api/autoscaling/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + vpav1 "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + + kaicommon "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1/common" +) + +// BuildVPA creates a VerticalPodAutoscaler targeting the named resource of the given kind. +// Returns nil if VPA is not enabled. 
+func BuildVPA(vpaSpec *kaicommon.VPASpec, targetName, namespace, targetKind string) client.Object { + if vpaSpec == nil || vpaSpec.Enabled == nil || !*vpaSpec.Enabled { + return nil + } + + return &vpav1.VerticalPodAutoscaler{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "autoscaling.k8s.io/v1", + Kind: "VerticalPodAutoscaler", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: targetName, + Namespace: namespace, + }, + Spec: vpav1.VerticalPodAutoscalerSpec{ + TargetRef: &autoscalingv1.CrossVersionObjectReference{ + APIVersion: "apps/v1", + Kind: targetKind, + Name: targetName, + }, + UpdatePolicy: vpaSpec.UpdatePolicy, + ResourcePolicy: vpaSpec.ResourcePolicy, + }, + } +} + +// BuildVPAFromObjects finds the first Deployment or DaemonSet in objects and builds a VPA +// targeting it. Returns nil if VPA is not enabled or no workload is found. +func BuildVPAFromObjects(vpaSpec *kaicommon.VPASpec, objects []client.Object, namespace string) client.Object { + if vpaSpec == nil || vpaSpec.Enabled == nil || !*vpaSpec.Enabled { + return nil + } + for _, obj := range objects { + switch o := obj.(type) { + case *appsv1.Deployment: + return BuildVPA(vpaSpec, o.Name, namespace, "Deployment") + case *appsv1.DaemonSet: + return BuildVPA(vpaSpec, o.Name, namespace, "DaemonSet") + } + } + return nil +} From 0889d229c46e84dc9467a2c18d91f1a05f63652d Mon Sep 17 00:00:00 2001 From: Jonathan Rosenboim Date: Mon, 2 Mar 2026 13:06:52 +0200 Subject: [PATCH 05/25] Build the VPA object for the scheduler's components --- cmd/operator/app/app.go | 2 + pkg/operator/controller/config_controller.go | 1 + pkg/operator/operands/admission/admission.go | 4 ++ pkg/operator/operands/binder/binder.go | 4 ++ .../operands/known_types/known_types.go | 1 + .../known_types/verticalpodautoscalers.go | 58 +++++++++++++++++++ .../node_scale_adjuster.go | 4 ++ .../pod_group_controller.go | 4 ++ .../operands/pod_grouper/pod_grouper.go | 4 ++ .../queue_controller/queue_controller.go | 4 ++ 
pkg/operator/operands/scheduler/scheduler.go | 4 ++ 11 files changed, 90 insertions(+) create mode 100644 pkg/operator/operands/known_types/verticalpodautoscalers.go diff --git a/cmd/operator/app/app.go b/cmd/operator/app/app.go index 3717d3863..332fce33b 100644 --- a/cmd/operator/app/app.go +++ b/cmd/operator/app/app.go @@ -8,6 +8,7 @@ import ( nvidiav1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1" monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + vpav1 "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1" "github.com/NVIDIA/KAI-scheduler/cmd/operator/config" kaiv1 "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1" @@ -45,6 +46,7 @@ func init() { utilruntime.Must(kaiv1alpha1.AddToScheme(scheme)) utilruntime.Must(nvidiav1.AddToScheme(scheme)) utilruntime.Must(monitoringv1.AddToScheme(scheme)) + utilruntime.Must(vpav1.AddToScheme(scheme)) // +kubebuilder:scaffold:scheme } diff --git a/pkg/operator/controller/config_controller.go b/pkg/operator/controller/config_controller.go index 9dce04dc1..ab0ac22bb 100644 --- a/pkg/operator/controller/config_controller.go +++ b/pkg/operator/controller/config_controller.go @@ -83,6 +83,7 @@ func (r *ConfigReconciler) SetOperands(ops []operands.Operand) { // +kubebuilder:rbac:groups="nvidia.com",resources=clusterpolicies,verbs=get;list;watch // +kubebuilder:rbac:groups="monitoring.coreos.com",resources=prometheuses;servicemonitors,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups="scheduling.run.ai",resources=queues,verbs=get;list;watch +// +kubebuilder:rbac:groups="autoscaling.k8s.io",resources=verticalpodautoscalers,verbs=get;list;watch;create;update;patch;delete // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. 
diff --git a/pkg/operator/operands/admission/admission.go b/pkg/operator/operands/admission/admission.go index c3694d6af..c38229683 100644 --- a/pkg/operator/operands/admission/admission.go +++ b/pkg/operator/operands/admission/admission.go @@ -62,6 +62,10 @@ func (a *Admission) DesiredState( objects = append(objects, newResources...) } + if vpa := common.BuildVPAFromObjects(kaiConfig.Spec.Admission.VPA, objects, kaiConfig.Spec.Namespace); vpa != nil { + objects = append(objects, vpa) + } + a.lastDesiredState = objects return objects, nil } diff --git a/pkg/operator/operands/binder/binder.go b/pkg/operator/operands/binder/binder.go index 0d9898dd0..2c28ef156 100644 --- a/pkg/operator/operands/binder/binder.go +++ b/pkg/operator/operands/binder/binder.go @@ -47,6 +47,10 @@ func (b *Binder) DesiredState( objects = append(objects, newResources...) } + if vpa := common.BuildVPAFromObjects(kaiConfig.Spec.Binder.VPA, objects, kaiConfig.Spec.Namespace); vpa != nil { + objects = append(objects, vpa) + } + b.lastDesiredState = objects return objects, nil } diff --git a/pkg/operator/operands/known_types/known_types.go b/pkg/operator/operands/known_types/known_types.go index 9e69c180f..8a04444b7 100644 --- a/pkg/operator/operands/known_types/known_types.go +++ b/pkg/operator/operands/known_types/known_types.go @@ -47,6 +47,7 @@ func init() { registerValidatingWebhookConfigurations() registerCustomResourceDefinitions() registerPrometheus() + registerVerticalPodAutoscalers() } func SetupKAIConfigOwned(fn *Collectable) { diff --git a/pkg/operator/operands/known_types/verticalpodautoscalers.go b/pkg/operator/operands/known_types/verticalpodautoscalers.go new file mode 100644 index 000000000..3a6363b49 --- /dev/null +++ b/pkg/operator/operands/known_types/verticalpodautoscalers.go @@ -0,0 +1,58 @@ +// Copyright 2025 NVIDIA CORPORATION +// SPDX-License-Identifier: Apache-2.0 + +package known_types + +import ( + "context" + + vpav1 
"k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/manager" +) + +func vpaIndexer(object client.Object) []string { + vpa := object.(*vpav1.VerticalPodAutoscaler) + owner := metav1.GetControllerOf(vpa) + if !checkOwnerType(owner) { + return nil + } + return []string{getOwnerKey(owner)} +} + +func registerVerticalPodAutoscalers() { + collectable := &Collectable{ + Collect: getCurrentVPAState, + InitWithManager: func(ctx context.Context, mgr manager.Manager) error { + return mgr.GetFieldIndexer().IndexField(ctx, &vpav1.VerticalPodAutoscaler{}, CollectableOwnerKey, vpaIndexer) + }, + InitWithBuilder: func(builder *builder.Builder) *builder.Builder { + return builder.Owns(&vpav1.VerticalPodAutoscaler{}) + }, + InitWithFakeClientBuilder: func(fakeClientBuilder *fake.ClientBuilder) { + fakeClientBuilder.WithIndex(&vpav1.VerticalPodAutoscaler{}, CollectableOwnerKey, vpaIndexer) + }, + } + SetupKAIConfigOwned(collectable) + SetupSchedulingShardOwned(collectable) +} + +func getCurrentVPAState(ctx context.Context, runtimeClient client.Client, reconciler client.Object) (map[string]client.Object, error) { + result := map[string]client.Object{} + vpas := &vpav1.VerticalPodAutoscalerList{} + reconcilerKey := getReconcilerKey(reconciler) + + err := runtimeClient.List(ctx, vpas, client.MatchingFields{CollectableOwnerKey: reconcilerKey}) + if err != nil { + return nil, err + } + + for _, vpa := range vpas.Items { + result[GetKey(vpa.GroupVersionKind(), vpa.Namespace, vpa.Name)] = &vpa + } + + return result, nil +} diff --git a/pkg/operator/operands/node_scale_adjuster/node_scale_adjuster.go b/pkg/operator/operands/node_scale_adjuster/node_scale_adjuster.go index 9726ceb50..f22c7c475 100644 --- 
a/pkg/operator/operands/node_scale_adjuster/node_scale_adjuster.go +++ b/pkg/operator/operands/node_scale_adjuster/node_scale_adjuster.go @@ -42,6 +42,10 @@ func (nsa *NodeScaleAdjuster) DesiredState( objects = append(objects, obj) } + if vpa := common.BuildVPAFromObjects(kaiConfig.Spec.NodeScaleAdjuster.VPA, objects, kaiConfig.Spec.Namespace); vpa != nil { + objects = append(objects, vpa) + } + nsa.lastDesiredState = objects return objects, nil } diff --git a/pkg/operator/operands/pod_group_controller/pod_group_controller.go b/pkg/operator/operands/pod_group_controller/pod_group_controller.go index 1f24e066c..8f79b8d12 100644 --- a/pkg/operator/operands/pod_group_controller/pod_group_controller.go +++ b/pkg/operator/operands/pod_group_controller/pod_group_controller.go @@ -62,6 +62,10 @@ func (p *PodGroupController) DesiredState( objects = append(objects, obj...) } + if vpa := common.BuildVPAFromObjects(kaiConfig.Spec.PodGroupController.VPA, objects, kaiConfig.Spec.Namespace); vpa != nil { + objects = append(objects, vpa) + } + p.lastDesiredState = objects return objects, nil } diff --git a/pkg/operator/operands/pod_grouper/pod_grouper.go b/pkg/operator/operands/pod_grouper/pod_grouper.go index 5c677fa59..b1f9902c6 100644 --- a/pkg/operator/operands/pod_grouper/pod_grouper.go +++ b/pkg/operator/operands/pod_grouper/pod_grouper.go @@ -45,6 +45,10 @@ func (p *PodGrouper) DesiredState( objects = append(objects, obj) } + if vpa := common.BuildVPAFromObjects(kaiConfig.Spec.PodGrouper.VPA, objects, kaiConfig.Spec.Namespace); vpa != nil { + objects = append(objects, vpa) + } + p.lastDesiredState = objects return objects, nil } diff --git a/pkg/operator/operands/queue_controller/queue_controller.go b/pkg/operator/operands/queue_controller/queue_controller.go index 93753262d..35fd3782c 100644 --- a/pkg/operator/operands/queue_controller/queue_controller.go +++ b/pkg/operator/operands/queue_controller/queue_controller.go @@ -63,6 +63,10 @@ func (q *QueueController) 
DesiredState( objects = append(objects, obj...) } + if vpa := common.BuildVPAFromObjects(kaiConfig.Spec.QueueController.VPA, objects, kaiConfig.Spec.Namespace); vpa != nil { + objects = append(objects, vpa) + } + q.lastDesiredState = objects return objects, nil } diff --git a/pkg/operator/operands/scheduler/scheduler.go b/pkg/operator/operands/scheduler/scheduler.go index 2f916f903..8edbfd8b0 100644 --- a/pkg/operator/operands/scheduler/scheduler.go +++ b/pkg/operator/operands/scheduler/scheduler.go @@ -68,6 +68,10 @@ func (s *SchedulerForShard) DesiredState( objects = append(objects, object) } + if vpa := common.BuildVPAFromObjects(kaiConfig.Spec.Scheduler.VPA, objects, kaiConfig.Spec.Namespace); vpa != nil { + objects = append(objects, vpa) + } + s.lastDesiredState = objects return s.lastDesiredState, nil From 9846906f1e7181ae5d31c6b03880494e3d152895 Mon Sep 17 00:00:00 2001 From: Jonathan Rosenboim Date: Mon, 2 Mar 2026 13:14:57 +0200 Subject: [PATCH 06/25] Add VPA package to dependencies --- go.mod | 19 ++++++++++--------- go.sum | 38 ++++++++++++++++++++------------------ 2 files changed, 30 insertions(+), 27 deletions(-) diff --git a/go.mod b/go.mod index 6eb595069..838e4248f 100644 --- a/go.mod +++ b/go.mod @@ -46,6 +46,7 @@ require ( k8s.io/apiextensions-apiserver v0.34.3 k8s.io/apimachinery v0.34.3 k8s.io/apiserver v0.34.3 + k8s.io/autoscaler/vertical-pod-autoscaler v1.5.1 k8s.io/cli-runtime v0.34.1 k8s.io/client-go v0.34.3 k8s.io/cluster-bootstrap v0.34.1 @@ -98,7 +99,7 @@ require ( github.com/cyphar/filepath-securejoin v0.6.0 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/distribution/reference v0.6.0 // indirect - github.com/emicklei/go-restful/v3 v3.12.2 // indirect + github.com/emicklei/go-restful/v3 v3.13.0 // indirect github.com/evanphx/json-patch/v5 v5.9.11 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect github.com/fsnotify/fsnotify v1.9.0 // indirect @@ -106,7 +107,7 @@ require ( 
github.com/gabriel-vasile/mimetype v1.4.7 // indirect github.com/gin-contrib/sse v0.1.0 // indirect github.com/go-logr/zapr v1.3.0 // indirect - github.com/go-openapi/jsonpointer v0.21.1 // indirect + github.com/go-openapi/jsonpointer v0.21.2 // indirect github.com/go-openapi/jsonreference v0.21.0 // indirect github.com/go-openapi/swag v0.23.1 // indirect github.com/go-playground/locales v0.14.1 // indirect @@ -148,7 +149,7 @@ require ( github.com/pelletier/go-toml/v2 v2.2.3 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/client_model v0.6.2 // indirect - github.com/prometheus/procfs v0.16.1 // indirect + github.com/prometheus/procfs v0.17.0 // indirect github.com/robfig/cron/v3 v3.0.1 // indirect github.com/sirupsen/logrus v1.9.3 // indirect github.com/spf13/cobra v1.10.1 // indirect @@ -159,12 +160,12 @@ require ( go.opentelemetry.io/auto/sdk v1.1.0 // indirect go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.60.0 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.59.0 // indirect - go.opentelemetry.io/otel v1.35.0 // indirect + go.opentelemetry.io/otel v1.37.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.34.0 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.34.0 // indirect - go.opentelemetry.io/otel/metric v1.35.0 // indirect + go.opentelemetry.io/otel/metric v1.37.0 // indirect go.opentelemetry.io/otel/sdk v1.35.0 // indirect - go.opentelemetry.io/otel/trace v1.35.0 // indirect + go.opentelemetry.io/otel/trace v1.37.0 // indirect go.opentelemetry.io/proto/otlp v1.5.0 // indirect go.uber.org/automaxprocs v1.6.0 // indirect go.yaml.in/yaml/v2 v2.4.3 // indirect @@ -177,19 +178,19 @@ require ( golang.org/x/sys v0.38.0 // indirect golang.org/x/term v0.37.0 // indirect golang.org/x/text v0.31.0 // indirect - golang.org/x/time v0.11.0 // indirect + golang.org/x/time v0.12.0 // indirect 
golang.org/x/tools v0.38.0 // indirect google.golang.org/genproto v0.0.0-20250303144028-a0af3efb3deb // indirect google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb // indirect google.golang.org/genproto/googleapis/rpc v0.0.0-20250313205543-e70fdf4c4cb4 // indirect google.golang.org/protobuf v1.36.8 // indirect - gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect + gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect k8s.io/cloud-provider v0.34.1 // indirect k8s.io/controller-manager v0.34.1 // indirect k8s.io/cri-api v0.34.1 // indirect k8s.io/csi-translation-lib v0.34.1 // indirect - k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b // indirect + k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3 // indirect k8s.io/kubelet v0.34.1 // indirect knative.dev/networking v0.0.0-20250117155906-67d1c274ba6a // indirect sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect diff --git a/go.sum b/go.sum index f0df2a691..b4c6c785d 100644 --- a/go.sum +++ b/go.sum @@ -71,8 +71,8 @@ github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5Qvfr github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E= github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto= -github.com/emicklei/go-restful/v3 v3.12.2 h1:DhwDP0vY3k8ZzE0RunuJy8GhNpPL6zqLkDf9B/a0/xU= -github.com/emicklei/go-restful/v3 v3.12.2/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/emicklei/go-restful/v3 v3.13.0 h1:C4Bl2xDndpU6nJ4bc1jXd+uTmYPVUwkD6bFY/oTyCes= +github.com/emicklei/go-restful/v3 v3.13.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod 
h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= @@ -107,8 +107,8 @@ github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ= github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg= -github.com/go-openapi/jsonpointer v0.21.1 h1:whnzv/pNXtK2FbX/W9yJfRmE2gsmkfahjMKB0fZvcic= -github.com/go-openapi/jsonpointer v0.21.1/go.mod h1:50I1STOfbY1ycR8jGz8DaMeLCdXiI6aDteEdRNNzpdk= +github.com/go-openapi/jsonpointer v0.21.2 h1:AqQaNADVwq/VnkCmQg6ogE+M3FOsKTytwges0JdwVuA= +github.com/go-openapi/jsonpointer v0.21.2/go.mod h1:50I1STOfbY1ycR8jGz8DaMeLCdXiI6aDteEdRNNzpdk= github.com/go-openapi/jsonreference v0.21.0 h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF2+tg1TRrwQ= github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDslNPMo06cago5JNLkm4= github.com/go-openapi/swag v0.23.1 h1:lpsStH0n2ittzTnbaSloVZLuB5+fvSY/+hnagBjSNZU= @@ -257,8 +257,8 @@ github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNw github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs= github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA= -github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg= -github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is= +github.com/prometheus/procfs v0.17.0 h1:FuLQ+05u4ZI+SS/w9+BWEM2TXiHKsUQ9TADiRH7DuK0= +github.com/prometheus/procfs v0.17.0/go.mod h1:oPQLaDAMRbA+u8H5Pbfq+dl3VDAvHxMUOVhe0wYB2zw= github.com/prometheus/statsd_exporter v0.22.7 h1:7Pji/i2GuhK6Lu7DHrtTkFmNBCudCPT1pX2CziuyQR0= 
github.com/prometheus/statsd_exporter v0.22.7/go.mod h1:N/TevpjkIh9ccs6nuzY3jQn9dFqnUakOjnEuMPJJJnI= github.com/ray-project/kuberay/ray-operator v1.4.2 h1:A4tGzbIky8sInAUxZBdBb+rrpZ7fbqoxdsOtm559Zqg= @@ -314,20 +314,20 @@ go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.6 go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.60.0/go.mod h1:rg+RlpR5dKwaS95IyyZqj5Wd4E13lk/msnTS0Xl9lJM= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.59.0 h1:CV7UdSGJt/Ao6Gp4CXckLxVRRsRgDHoI8XjbL3PDl8s= go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.59.0/go.mod h1:FRmFuRJfag1IZ2dPkHnEoSFVgTVPUd2qf5Vi69hLb8I= -go.opentelemetry.io/otel v1.35.0 h1:xKWKPxrxB6OtMCbmMY021CqC45J+3Onta9MqjhnusiQ= -go.opentelemetry.io/otel v1.35.0/go.mod h1:UEqy8Zp11hpkUrL73gSlELM0DupHoiq72dR+Zqel/+Y= +go.opentelemetry.io/otel v1.37.0 h1:9zhNfelUvx0KBfu/gb+ZgeAfAgtWrfHJZcAqFC228wQ= +go.opentelemetry.io/otel v1.37.0/go.mod h1:ehE/umFRLnuLa/vSccNq9oS1ErUlkkK71gMcN34UG8I= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.34.0 h1:OeNbIYk/2C15ckl7glBlOBp5+WlYsOElzTNmiPW/x60= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.34.0/go.mod h1:7Bept48yIeqxP2OZ9/AqIpYS94h2or0aB4FypJTc8ZM= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.34.0 h1:tgJ0uaNS4c98WRNUEx5U3aDlrDOI5Rs+1Vifcw4DJ8U= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.34.0/go.mod h1:U7HYyW0zt/a9x5J1Kjs+r1f/d4ZHnYFclhYY2+YbeoE= -go.opentelemetry.io/otel/metric v1.35.0 h1:0znxYu2SNyuMSQT4Y9WDWej0VpcsxkuklLa4/siN90M= -go.opentelemetry.io/otel/metric v1.35.0/go.mod h1:nKVFgxBZ2fReX6IlyW28MgZojkoAkJGaE8CpgeAU3oE= +go.opentelemetry.io/otel/metric v1.37.0 h1:mvwbQS5m0tbmqML4NqK+e3aDiO02vsf/WgbsdpcPoZE= +go.opentelemetry.io/otel/metric v1.37.0/go.mod h1:04wGrZurHYKOc+RKeye86GwKiTb9FKm1WHtO+4EVr2E= go.opentelemetry.io/otel/sdk v1.35.0 h1:iPctf8iprVySXSKJffSS79eOjl9pvxV9ZqOWT0QejKY= go.opentelemetry.io/otel/sdk v1.35.0/go.mod 
h1:+ga1bZliga3DxJ3CQGg3updiaAJoNECOgJREo9KHGQg= go.opentelemetry.io/otel/sdk/metric v1.34.0 h1:5CeK9ujjbFVL5c1PhLuStg1wxA7vQv7ce1EK0Gyvahk= go.opentelemetry.io/otel/sdk/metric v1.34.0/go.mod h1:jQ/r8Ze28zRKoNRdkjCZxfs6YvBTG1+YIqyFVFYec5w= -go.opentelemetry.io/otel/trace v1.35.0 h1:dPpEfJu1sDIqruz7BHFG3c7528f6ddfSWfFDVt/xgMs= -go.opentelemetry.io/otel/trace v1.35.0/go.mod h1:WUk7DtFp1Aw2MkvqGdwiXYDZZNvA/1J8o6xRXLrIkyc= +go.opentelemetry.io/otel/trace v1.37.0 h1:HLdcFNbRQBE2imdSEgm/kwqmQj1Or1l/7bW6mxVK7z4= +go.opentelemetry.io/otel/trace v1.37.0/go.mod h1:TlgrlQ+PtQO5XFerSPUYG0JSgGyryXewPGyayAWSBS0= go.opentelemetry.io/proto/otlp v1.5.0 h1:xJvq7gMzB31/d406fB8U5CBdyQGw4P399D1aQWU/3i4= go.opentelemetry.io/proto/otlp v1.5.0/go.mod h1:keN8WnHxOy8PG0rQZjJJ5A2ebUoafqWp0eVQ4yIXvJ4= go.uber.org/atomic v1.4.0/go.mod h1:gD2HeocX3+yG+ygLZcrzQJaqmWj9AIm7n08wl/qW/PE= @@ -403,8 +403,8 @@ golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM= golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM= -golang.org/x/time v0.11.0 h1:/bpjEDfN9tkoN/ryeYHnv5hcMlc8ncjMcM4XBk5NWV0= -golang.org/x/time v0.11.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= +golang.org/x/time v0.12.0 h1:ScB/8o8olJvc+CQPWrK3fPZNfh7qgwCrY0zJmoEQLSE= +golang.org/x/time v0.12.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= @@ -446,8 +446,8 @@ google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXn gopkg.in/check.v1 
v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= -gopkg.in/evanphx/json-patch.v4 v4.12.0 h1:n6jtcsulIzXPJaxegRbvFNNrZDjbij7ny3gmSPG+6V4= -gopkg.in/evanphx/json-patch.v4 v4.12.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= +gopkg.in/evanphx/json-patch.v4 v4.13.0 h1:czT3CmqEaQ1aanPc5SdlgQrrEIb8w/wwCvWWnfEbYzo= +gopkg.in/evanphx/json-patch.v4 v4.13.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= gopkg.in/h2non/gock.v1 v1.1.2 h1:jBbHXgGBK/AoPVfJh5x4r/WxIrElvbLel8TCZkkZJoY= gopkg.in/h2non/gock.v1 v1.1.2/go.mod h1:n7UGz/ckNChHiK05rDoiC4MYSunEC/lyaUm2WWaDva0= gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= @@ -470,6 +470,8 @@ k8s.io/apimachinery v0.34.3 h1:/TB+SFEiQvN9HPldtlWOTp0hWbJ+fjU+wkxysf/aQnE= k8s.io/apimachinery v0.34.3/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw= k8s.io/apiserver v0.34.3 h1:uGH1qpDvSiYG4HVFqc6A3L4CKiX+aBWDrrsxHYK0Bdo= k8s.io/apiserver v0.34.3/go.mod h1:QPnnahMO5C2m3lm6fPW3+JmyQbvHZQ8uudAu/493P2w= +k8s.io/autoscaler/vertical-pod-autoscaler v1.5.1 h1:LlVtM3IKqIVHz1ZXC3ahe/mAtDWb7Eob0tyTzqFULqg= +k8s.io/autoscaler/vertical-pod-autoscaler v1.5.1/go.mod h1:znhUnV0Yn+CkZu3TZ2HVqd8GFRMkPj/CXszX1gdBjTU= k8s.io/cli-runtime v0.34.1 h1:btlgAgTrYd4sk8vJTRG6zVtqBKt9ZMDeQZo2PIzbL7M= k8s.io/cli-runtime v0.34.1/go.mod h1:aVA65c+f0MZiMUPbseU/M9l1Wo2byeaGwUuQEQVVveE= k8s.io/client-go v0.34.3 h1:wtYtpzy/OPNYf7WyNBTj3iUA0XaBHVqhv4Iv3tbrF5A= @@ -502,8 +504,8 @@ k8s.io/kube-aggregator v0.34.1 h1:WNLV0dVNoFKmuyvdWLd92iDSyD/TSTjqwaPj0U9XAEU= k8s.io/kube-aggregator v0.34.1/go.mod h1:RU8j+5ERfp0h+gIvWtxRPfsa5nK7rboDm8RST8BJfYQ= k8s.io/kube-controller-manager v0.34.1 h1:hrPRR4toT+xABAxzGpnldTL1RocYXyVhx6A5Einb9wU= k8s.io/kube-controller-manager v0.34.1/go.mod 
h1:+7jKjj5i7NLGM6zPHbdMh7qHaWFOBsF/oeUDdS70DSg= -k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b h1:MloQ9/bdJyIu9lb1PzujOPolHyvO06MXG5TUIj2mNAA= -k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b/go.mod h1:UZ2yyWbFTpuhSbFhv24aGNOdoRdJZgsIObGBUaYVsts= +k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3 h1:liMHz39T5dJO1aOKHLvwaCjDbf07wVh6yaUlTpunnkE= +k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3/go.mod h1:UZ2yyWbFTpuhSbFhv24aGNOdoRdJZgsIObGBUaYVsts= k8s.io/kube-proxy v0.34.1 h1:cIriNCJY5XmRhXCCyQiazyqi47lbwcBQf0H76fVOpkw= k8s.io/kube-proxy v0.34.1/go.mod h1:syed9c5+gUVFMo6p24SnlTHzsp+BMd4ACcTw2dbArw0= k8s.io/kube-scheduler v0.34.1 h1:S5td6VZwC3lCqERXclerDXhJ26zYc6JroY0s03+PqJ8= From 8101c2f94cfd2f09534848e5d881244683709264 Mon Sep 17 00:00:00 2001 From: Jonathan Rosenboim Date: Mon, 2 Mar 2026 13:22:45 +0200 Subject: [PATCH 07/25] Run make generate --- .../kai/v1/admission/zz_generated.deepcopy.go | 5 +++ .../kai/v1/binder/zz_generated.deepcopy.go | 5 +++ .../kai/v1/common/zz_generated.deepcopy.go | 31 +++++++++++++++++++ .../zz_generated.deepcopy.go | 5 +++ .../zz_generated.deepcopy.go | 5 +++ .../v1/pod_grouper/zz_generated.deepcopy.go | 5 +++ .../queue_controller/zz_generated.deepcopy.go | 5 +++ .../kai/v1/scheduler/zz_generated.deepcopy.go | 5 +++ pkg/apis/kai/v1/zz_generated.deepcopy.go | 6 ++++ 9 files changed, 72 insertions(+) diff --git a/pkg/apis/kai/v1/admission/zz_generated.deepcopy.go b/pkg/apis/kai/v1/admission/zz_generated.deepcopy.go index 1a8dc1d96..0fdb5f3b0 100644 --- a/pkg/apis/kai/v1/admission/zz_generated.deepcopy.go +++ b/pkg/apis/kai/v1/admission/zz_generated.deepcopy.go @@ -56,6 +56,11 @@ func (in *Admission) DeepCopyInto(out *Admission) { *out = new(string) **out = **in } + if in.VPA != nil { + in, out := &in.VPA, &out.VPA + *out = new(common.VPASpec) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Admission. 
diff --git a/pkg/apis/kai/v1/binder/zz_generated.deepcopy.go b/pkg/apis/kai/v1/binder/zz_generated.deepcopy.go index 966241a10..645752049 100644 --- a/pkg/apis/kai/v1/binder/zz_generated.deepcopy.go +++ b/pkg/apis/kai/v1/binder/zz_generated.deepcopy.go @@ -56,6 +56,11 @@ func (in *Binder) DeepCopyInto(out *Binder) { *out = new(bool) **out = **in } + if in.VPA != nil { + in, out := &in.VPA, &out.VPA + *out = new(common.VPASpec) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Binder. diff --git a/pkg/apis/kai/v1/common/zz_generated.deepcopy.go b/pkg/apis/kai/v1/common/zz_generated.deepcopy.go index 2f75d2df5..3e0b6143c 100644 --- a/pkg/apis/kai/v1/common/zz_generated.deepcopy.go +++ b/pkg/apis/kai/v1/common/zz_generated.deepcopy.go @@ -11,6 +11,7 @@ package common import ( "k8s.io/api/core/v1" + autoscaling_k8s_iov1 "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1" ) // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. @@ -146,3 +147,33 @@ func (in *Service) DeepCopy() *Service { in.DeepCopyInto(out) return out } + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *VPASpec) DeepCopyInto(out *VPASpec) { + *out = *in + if in.Enabled != nil { + in, out := &in.Enabled, &out.Enabled + *out = new(bool) + **out = **in + } + if in.UpdatePolicy != nil { + in, out := &in.UpdatePolicy, &out.UpdatePolicy + *out = new(autoscaling_k8s_iov1.PodUpdatePolicy) + (*in).DeepCopyInto(*out) + } + if in.ResourcePolicy != nil { + in, out := &in.ResourcePolicy, &out.ResourcePolicy + *out = new(autoscaling_k8s_iov1.PodResourcePolicy) + (*in).DeepCopyInto(*out) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new VPASpec. 
+func (in *VPASpec) DeepCopy() *VPASpec { + if in == nil { + return nil + } + out := new(VPASpec) + in.DeepCopyInto(out) + return out +} diff --git a/pkg/apis/kai/v1/node_scale_adjuster/zz_generated.deepcopy.go b/pkg/apis/kai/v1/node_scale_adjuster/zz_generated.deepcopy.go index b07990aa4..f31ad55ae 100644 --- a/pkg/apis/kai/v1/node_scale_adjuster/zz_generated.deepcopy.go +++ b/pkg/apis/kai/v1/node_scale_adjuster/zz_generated.deepcopy.go @@ -61,6 +61,11 @@ func (in *NodeScaleAdjuster) DeepCopyInto(out *NodeScaleAdjuster) { *out = new(Args) (*in).DeepCopyInto(*out) } + if in.VPA != nil { + in, out := &in.VPA, &out.VPA + *out = new(common.VPASpec) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeScaleAdjuster. diff --git a/pkg/apis/kai/v1/pod_group_controller/zz_generated.deepcopy.go b/pkg/apis/kai/v1/pod_group_controller/zz_generated.deepcopy.go index dceee04c2..95205e292 100644 --- a/pkg/apis/kai/v1/pod_group_controller/zz_generated.deepcopy.go +++ b/pkg/apis/kai/v1/pod_group_controller/zz_generated.deepcopy.go @@ -41,6 +41,11 @@ func (in *PodGroupController) DeepCopyInto(out *PodGroupController) { *out = new(int32) **out = **in } + if in.VPA != nil { + in, out := &in.VPA, &out.VPA + *out = new(common.VPASpec) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodGroupController. 
diff --git a/pkg/apis/kai/v1/pod_grouper/zz_generated.deepcopy.go b/pkg/apis/kai/v1/pod_grouper/zz_generated.deepcopy.go index 8f9c83509..30800b7f2 100644 --- a/pkg/apis/kai/v1/pod_grouper/zz_generated.deepcopy.go +++ b/pkg/apis/kai/v1/pod_grouper/zz_generated.deepcopy.go @@ -71,6 +71,11 @@ func (in *PodGrouper) DeepCopyInto(out *PodGrouper) { *out = new(int32) **out = **in } + if in.VPA != nil { + in, out := &in.VPA, &out.VPA + *out = new(common.VPASpec) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new PodGrouper. diff --git a/pkg/apis/kai/v1/queue_controller/zz_generated.deepcopy.go b/pkg/apis/kai/v1/queue_controller/zz_generated.deepcopy.go index e133778f6..737e9971b 100644 --- a/pkg/apis/kai/v1/queue_controller/zz_generated.deepcopy.go +++ b/pkg/apis/kai/v1/queue_controller/zz_generated.deepcopy.go @@ -81,6 +81,11 @@ func (in *QueueController) DeepCopyInto(out *QueueController) { *out = new(string) **out = **in } + if in.VPA != nil { + in, out := &in.VPA, &out.VPA + *out = new(common.VPASpec) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new QueueController. diff --git a/pkg/apis/kai/v1/scheduler/zz_generated.deepcopy.go b/pkg/apis/kai/v1/scheduler/zz_generated.deepcopy.go index 90e2dee4b..854101b10 100644 --- a/pkg/apis/kai/v1/scheduler/zz_generated.deepcopy.go +++ b/pkg/apis/kai/v1/scheduler/zz_generated.deepcopy.go @@ -37,6 +37,11 @@ func (in *Scheduler) DeepCopyInto(out *Scheduler) { *out = new(int32) **out = **in } + if in.VPA != nil { + in, out := &in.VPA, &out.VPA + *out = new(common.VPASpec) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Scheduler. 
diff --git a/pkg/apis/kai/v1/zz_generated.deepcopy.go b/pkg/apis/kai/v1/zz_generated.deepcopy.go index 45b78c48c..519b01ef9 100644 --- a/pkg/apis/kai/v1/zz_generated.deepcopy.go +++ b/pkg/apis/kai/v1/zz_generated.deepcopy.go @@ -12,6 +12,7 @@ package v1 import ( "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1/admission" "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1/binder" + "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1/common" "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1/node_scale_adjuster" "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1/pod_group_controller" "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1/pod_grouper" @@ -192,6 +193,11 @@ func (in *ConfigStatus) DeepCopy() *ConfigStatus { // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *GlobalConfig) DeepCopyInto(out *GlobalConfig) { *out = *in + if in.VPA != nil { + in, out := &in.VPA, &out.VPA + *out = new(common.VPASpec) + (*in).DeepCopyInto(*out) + } if in.Openshift != nil { in, out := &in.Openshift, &out.Openshift *out = new(bool) From 47bc5050616c2cb018450468659024180518e045 Mon Sep 17 00:00:00 2001 From: Jonathan Rosenboim Date: Mon, 2 Mar 2026 13:30:27 +0200 Subject: [PATCH 08/25] Update tests to account for VPA --- pkg/apis/kai/v1/admission/admission_test.go | 4 ++-- pkg/apis/kai/v1/binder/binder_test.go | 10 +++++----- .../v1/node_scale_adjuster/node_scale_adjuster_test.go | 2 +- .../pod_group_controller/pod_group_controller_test.go | 4 ++-- pkg/apis/kai/v1/pod_grouper/pod_grouper_test.go | 4 ++-- .../kai/v1/queue_controller/queue_controller_test.go | 4 ++-- pkg/apis/kai/v1/scheduler/scheduler_test.go | 10 +++++----- pkg/operator/operands/deployable/deployable_test.go | 2 ++ 8 files changed, 21 insertions(+), 19 deletions(-) diff --git a/pkg/apis/kai/v1/admission/admission_test.go b/pkg/apis/kai/v1/admission/admission_test.go index e8dc15cf0..8c5aa85d9 100644 --- a/pkg/apis/kai/v1/admission/admission_test.go +++ 
b/pkg/apis/kai/v1/admission/admission_test.go @@ -22,7 +22,7 @@ var _ = Describe("Admission", func() { Admission := &Admission{} var replicaCount int32 replicaCount = 1 - Admission.SetDefaultsWhereNeeded(&replicaCount) + Admission.SetDefaultsWhereNeeded(&replicaCount, nil) Expect(*Admission.Service.Enabled).To(Equal(true)) Expect(*Admission.Service.Image.Name).To(Equal("admission")) Expect(*Admission.Replicas).To(Equal(int32(1))) @@ -32,7 +32,7 @@ var _ = Describe("Admission", func() { Admission := &Admission{} var replicaCount int32 replicaCount = 3 - Admission.SetDefaultsWhereNeeded(&replicaCount) + Admission.SetDefaultsWhereNeeded(&replicaCount, nil) Expect(*Admission.Replicas).To(Equal(int32(3))) }) }) diff --git a/pkg/apis/kai/v1/binder/binder_test.go b/pkg/apis/kai/v1/binder/binder_test.go index 8a18b65c4..b43d0fa60 100644 --- a/pkg/apis/kai/v1/binder/binder_test.go +++ b/pkg/apis/kai/v1/binder/binder_test.go @@ -23,7 +23,7 @@ func TestBinder(t *testing.T) { var _ = Describe("Binder", func() { It("Set Defaults", func(ctx context.Context) { binder := &Binder{} - binder.SetDefaultsWhereNeeded(nil) + binder.SetDefaultsWhereNeeded(nil, nil) Expect(*binder.Service.Enabled).To(Equal(true)) Expect(*binder.Service.Image.Name).To(Equal("binder")) Expect(binder.Service.Resources.Requests[v1.ResourceCPU]).To(Equal(resource.MustParse("50m"))) @@ -35,14 +35,14 @@ var _ = Describe("Binder", func() { binder := &Binder{} var replicaCount int32 replicaCount = 3 - binder.SetDefaultsWhereNeeded(&replicaCount) + binder.SetDefaultsWhereNeeded(&replicaCount, nil) Expect(*binder.Replicas).To(Equal(int32(3))) }) Context("ResourceReservation PodResources configuration", func() { It("should not set default PodResources when not configured", func(ctx context.Context) { binder := &Binder{} - binder.SetDefaultsWhereNeeded(nil) + binder.SetDefaultsWhereNeeded(nil, nil) // PodResources should be nil when not configured Expect(binder.ResourceReservation.PodResources).To(BeNil()) @@ -64,7 
+64,7 @@ var _ = Describe("Binder", func() { PodResources: podResources, }, } - binder.SetDefaultsWhereNeeded(nil) + binder.SetDefaultsWhereNeeded(nil, nil) // Configured values should be preserved Expect(binder.ResourceReservation.PodResources).NotTo(BeNil()) @@ -88,7 +88,7 @@ var _ = Describe("Binder", func() { PodResources: podResources, }, } - binder.SetDefaultsWhereNeeded(nil) + binder.SetDefaultsWhereNeeded(nil, nil) // Only CPU should be set Expect(binder.ResourceReservation.PodResources).NotTo(BeNil()) diff --git a/pkg/apis/kai/v1/node_scale_adjuster/node_scale_adjuster_test.go b/pkg/apis/kai/v1/node_scale_adjuster/node_scale_adjuster_test.go index 1f08bc6f0..317da60fd 100644 --- a/pkg/apis/kai/v1/node_scale_adjuster/node_scale_adjuster_test.go +++ b/pkg/apis/kai/v1/node_scale_adjuster/node_scale_adjuster_test.go @@ -19,7 +19,7 @@ func TestNodeScaleAdjuster(t *testing.T) { var _ = Describe("NodeScaleAdjuster", func() { It("Set Defaults", func(ctx context.Context) { adjuster := &NodeScaleAdjuster{} - adjuster.SetDefaultsWhereNeeded() + adjuster.SetDefaultsWhereNeeded(nil) Expect(*adjuster.Service.Enabled).To(Equal(true)) Expect(*adjuster.Service.Image.Name).To(Equal(imageName)) }) diff --git a/pkg/apis/kai/v1/pod_group_controller/pod_group_controller_test.go b/pkg/apis/kai/v1/pod_group_controller/pod_group_controller_test.go index 48b1c37dc..24e9c2fad 100644 --- a/pkg/apis/kai/v1/pod_group_controller/pod_group_controller_test.go +++ b/pkg/apis/kai/v1/pod_group_controller/pod_group_controller_test.go @@ -19,7 +19,7 @@ func TestPodGroupController(t *testing.T) { var _ = Describe("PodGroupController", func() { It("Set Defaults", func(ctx context.Context) { podGroupController := &PodGroupController{} - podGroupController.SetDefaultsWhereNeeded(nil) + podGroupController.SetDefaultsWhereNeeded(nil, nil) Expect(*podGroupController.Service.Enabled).To(Equal(true)) Expect(*podGroupController.Service.Image.Name).To(Equal(imageName)) }) @@ -27,7 +27,7 @@ var _ = 
Describe("PodGroupController", func() { podGroupController := &PodGroupController{} var replicaCount int32 replicaCount = 3 - podGroupController.SetDefaultsWhereNeeded(&replicaCount) + podGroupController.SetDefaultsWhereNeeded(&replicaCount, nil) Expect(*podGroupController.Replicas).To(Equal(int32(3))) }) }) diff --git a/pkg/apis/kai/v1/pod_grouper/pod_grouper_test.go b/pkg/apis/kai/v1/pod_grouper/pod_grouper_test.go index 7471d03e9..1d50ae9dd 100644 --- a/pkg/apis/kai/v1/pod_grouper/pod_grouper_test.go +++ b/pkg/apis/kai/v1/pod_grouper/pod_grouper_test.go @@ -21,7 +21,7 @@ var _ = Describe("PodGrouper", func() { podGrouper := &PodGrouper{} var replicaCount int32 replicaCount = 1 - podGrouper.SetDefaultsWhereNeeded(&replicaCount) + podGrouper.SetDefaultsWhereNeeded(&replicaCount, nil) Expect(*podGrouper.Service.Enabled).To(Equal(true)) Expect(*podGrouper.Service.Image.Name).To(Equal("podgrouper")) Expect(*podGrouper.Replicas).To(Equal(int32(1))) @@ -30,7 +30,7 @@ var _ = Describe("PodGrouper", func() { podGrouper := &PodGrouper{} var replicaCount int32 replicaCount = 3 - podGrouper.SetDefaultsWhereNeeded(&replicaCount) + podGrouper.SetDefaultsWhereNeeded(&replicaCount, nil) Expect(*podGrouper.Replicas).To(Equal(int32(3))) }) }) diff --git a/pkg/apis/kai/v1/queue_controller/queue_controller_test.go b/pkg/apis/kai/v1/queue_controller/queue_controller_test.go index dc0f2dbcc..1912ac629 100644 --- a/pkg/apis/kai/v1/queue_controller/queue_controller_test.go +++ b/pkg/apis/kai/v1/queue_controller/queue_controller_test.go @@ -21,7 +21,7 @@ var _ = Describe("QueueController", func() { queueController := &QueueController{} var replicaCount int32 replicaCount = 1 - queueController.SetDefaultsWhereNeeded(&replicaCount) + queueController.SetDefaultsWhereNeeded(&replicaCount, nil) Expect(*queueController.Service.Enabled).To(Equal(true)) Expect(*queueController.Service.Image.Name).To(Equal("queuecontroller")) Expect(*queueController.Replicas).To(Equal(int32(1))) @@ -30,7 +30,7 
@@ var _ = Describe("QueueController", func() { queueController := &QueueController{} var replicaCount int32 replicaCount = 3 - queueController.SetDefaultsWhereNeeded(&replicaCount) + queueController.SetDefaultsWhereNeeded(&replicaCount, nil) Expect(*queueController.Replicas).To(Equal(int32(3))) }) }) diff --git a/pkg/apis/kai/v1/scheduler/scheduler_test.go b/pkg/apis/kai/v1/scheduler/scheduler_test.go index 1efb33c29..d744b61e9 100644 --- a/pkg/apis/kai/v1/scheduler/scheduler_test.go +++ b/pkg/apis/kai/v1/scheduler/scheduler_test.go @@ -23,7 +23,7 @@ var _ = Describe("Scheduler", func() { It("Set Defaults when Service is nil", func(ctx context.Context) { scheduler := &Scheduler{} var replicaCount int32 = 1 - scheduler.SetDefaultsWhereNeeded(&replicaCount) + scheduler.SetDefaultsWhereNeeded(&replicaCount, nil) Expect(scheduler.Service).NotTo(BeNil()) Expect(*scheduler.Service.Enabled).To(Equal(true)) Expect(*scheduler.Service.Image.Name).To(Equal("scheduler")) @@ -36,14 +36,14 @@ var _ = Describe("Scheduler", func() { It("Set Defaults with GOGC unset", func(ctx context.Context) { scheduler := &Scheduler{} var replicaCount int32 = 2 - scheduler.SetDefaultsWhereNeeded(&replicaCount) + scheduler.SetDefaultsWhereNeeded(&replicaCount, nil) Expect(*scheduler.GOGC).To(Equal(400)) }) It("Set Defaults with SchedulerService unset", func(ctx context.Context) { scheduler := &Scheduler{} var replicaCount int32 = 3 - scheduler.SetDefaultsWhereNeeded(&replicaCount) + scheduler.SetDefaultsWhereNeeded(&replicaCount, nil) Expect(scheduler.SchedulerService).NotTo(BeNil()) Expect(*scheduler.SchedulerService.Type).To(Equal(v1.ServiceTypeClusterIP)) Expect(*scheduler.SchedulerService.Port).To(Equal(8080)) @@ -53,14 +53,14 @@ var _ = Describe("Scheduler", func() { It("Replicas set to replicaCount value", func(ctx context.Context) { scheduler := &Scheduler{} var replicaCount int32 = 4 - scheduler.SetDefaultsWhereNeeded(&replicaCount) + scheduler.SetDefaultsWhereNeeded(&replicaCount, nil) 
Expect(*scheduler.Replicas).To(Equal(int32(4))) }) It("Replicas default to 1 when replicaCount is nil", func(ctx context.Context) { scheduler := &Scheduler{} var replicaCount *int32 - scheduler.SetDefaultsWhereNeeded(replicaCount) + scheduler.SetDefaultsWhereNeeded(replicaCount, nil) Expect(*scheduler.Replicas).To(Equal(int32(1))) }) }) diff --git a/pkg/operator/operands/deployable/deployable_test.go b/pkg/operator/operands/deployable/deployable_test.go index b48589ae8..0ac33543d 100644 --- a/pkg/operator/operands/deployable/deployable_test.go +++ b/pkg/operator/operands/deployable/deployable_test.go @@ -17,6 +17,7 @@ import ( "github.com/NVIDIA/KAI-scheduler/pkg/operator/operands/known_types" monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + vpav1 "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1" appsv1 "k8s.io/api/apps/v1" v1 "k8s.io/api/core/v1" apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" @@ -53,6 +54,7 @@ var _ = Describe("Deployable", func() { Expect(kaiv1.AddToScheme(testScheme)).To(Succeed()) Expect(apiextensionsv1.AddToScheme(testScheme)).To(Succeed()) Expect(monitoringv1.AddToScheme(testScheme)).To(Succeed()) + Expect(vpav1.AddToScheme(testScheme)).To(Succeed()) fakeClientBuilder = fake.NewClientBuilder(). WithScheme(testScheme). 
From 97bd33f79a4643eb06aed4e6697c96025cb00668 Mon Sep 17 00:00:00 2001 From: Jonathan Rosenboim Date: Mon, 2 Mar 2026 15:14:06 +0200 Subject: [PATCH 09/25] Add option to setup vpa as part of the e2e script --- hack/setup-e2e-cluster.sh | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/hack/setup-e2e-cluster.sh b/hack/setup-e2e-cluster.sh index 5332f5aa2..4d46cc2ae 100755 --- a/hack/setup-e2e-cluster.sh +++ b/hack/setup-e2e-cluster.sh @@ -18,6 +18,7 @@ KIND_CONFIG=${REPO_ROOT}/hack/e2e-kind-config.yaml # Parse named parameters TEST_THIRD_PARTY_INTEGRATIONS=${TEST_THIRD_PARTY_INTEGRATIONS:-"false"} LOCAL_IMAGES_BUILD=${LOCAL_IMAGES_BUILD:-"false"} +INSTALL_VPA=${INSTALL_VPA:-"false"} while [[ $# -gt 0 ]]; do case $1 in @@ -29,10 +30,15 @@ while [[ $# -gt 0 ]]; do LOCAL_IMAGES_BUILD="true" shift ;; + --install-vpa) + INSTALL_VPA="true" + shift + ;; -h|--help) - echo "Usage: $0 [--test-third-party-integrations] [--local-images-build]" + echo "Usage: $0 [--test-third-party-integrations] [--local-images-build] [--install-vpa]" echo " --test-third-party-integrations: Install third party operators for compatibility testing" echo " --local-images-build: Build and use local images instead of pulling from registry" + echo " --install-vpa: Install Vertical Pod Autoscaler and metrics-server" exit 0 ;; *) @@ -67,6 +73,23 @@ helm install prometheus prometheus-community/kube-prometheus-stack --namespace m --set "prometheus.enabled=false" \ --wait +# Install VPA and its prerequisites +if [ "$INSTALL_VPA" = "true" ]; then + echo "Installing metrics-server (required by VPA recommender)..." 
+ kubectl apply -f https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml + # kind uses self-signed kubelet certs, so metrics-server needs --kubelet-insecure-tls + kubectl patch deployment metrics-server -n kube-system --type=json \ + -p '[{"op":"add","path":"/spec/template/spec/containers/0/args/-","value":"--kubelet-insecure-tls"}]' + kubectl wait --for=condition=available --timeout=120s deployment/metrics-server -n kube-system + + echo "Installing Vertical Pod Autoscaler..." + VPA_TMPDIR=$(mktemp -d) + git clone https://github.com/kubernetes/autoscaler.git "$VPA_TMPDIR/autoscaler" + (cd "$VPA_TMPDIR/autoscaler/vertical-pod-autoscaler" && git checkout vertical-pod-autoscaler-1.5.1 && ./hack/vpa-up.sh) + rm -rf "$VPA_TMPDIR" + echo "VPA installation complete." +fi + # Install third party operators to check the compatibility with the kai-scheduler if [ "$TEST_THIRD_PARTY_INTEGRATIONS" = "true" ]; then ${REPO_ROOT}/hack/third_party_integrations/deploy_ray.sh From 2e41494e8192133e8325945c66b27f85ade2a395 Mon Sep 17 00:00:00 2001 From: Jonathan Rosenboim Date: Mon, 2 Mar 2026 15:54:41 +0200 Subject: [PATCH 10/25] Add kai scheduler CRDs updates --- .../crds/kai.scheduler_configs.yaml | 1058 ++++++++++++++++- .../crds/kai.scheduler_schedulingshards.yaml | 2 +- .../crds/kai.scheduler_topologies.yaml | 17 +- .../crds/scheduling.run.ai_bindrequests.yaml | 2 +- .../crds/scheduling.run.ai_podgroups.yaml | 2 +- .../crds/scheduling.run.ai_queues.yaml | 2 +- 6 files changed, 1062 insertions(+), 21 deletions(-) diff --git a/deployments/kai-scheduler/crds/kai.scheduler_configs.yaml b/deployments/kai-scheduler/crds/kai.scheduler_configs.yaml index 5f0bb574c..8d4450006 100644 --- a/deployments/kai-scheduler/crds/kai.scheduler_configs.yaml +++ b/deployments/kai-scheduler/crds/kai.scheduler_configs.yaml @@ -9,7 +9,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: 
v0.16.1 + controller-gen.kubebuilder.io/version: v0.17.3 name: configs.kai.scheduler spec: group: kai.scheduler @@ -1094,6 +1094,138 @@ spec: description: ValidatingWebhookConfigurationName is the name of the ValidatingWebhookConfiguration for the admission service type: string + vpa: + description: VPA specifies Vertical Pod Autoscaler configuration + for the admission service + properties: + enabled: + description: Enabled specifies if VPA should be enabled + type: boolean + resourcePolicy: + description: ResourcePolicy controls how VPA computes recommended + resources for containers + properties: + containerPolicies: + description: Per-container resource policies. + items: + description: |- + ContainerResourcePolicy controls how autoscaler computes the recommended + resources for a specific container. + properties: + containerName: + description: |- + Name of the container or DefaultContainerResourcePolicy, in which + case the policy is used by the containers that don't have their own + policy specified. + type: string + controlledResources: + description: |- + Specifies the type of recommendations that will be computed + (and possibly applied) by VPA. + If not specified, the default of [ResourceCPU, ResourceMemory] will be used. + items: + description: ResourceName is the name identifying + various resources in a ResourceList. + type: string + type: array + controlledValues: + description: |- + Specifies which resource values should be controlled. + The default is "RequestsAndLimits". + enum: + - RequestsAndLimits + - RequestsOnly + type: string + maxAllowed: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Specifies the maximum amount of resources that will be recommended + for the container. The default is no maximum. 
+ type: object + minAllowed: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Specifies the minimal amount of resources that will be recommended + for the container. The default is no minimum. + type: object + mode: + description: Whether autoscaler is enabled for the + container. The default is "Auto". + enum: + - Auto + - "Off" + type: string + type: object + type: array + type: object + updatePolicy: + description: UpdatePolicy controls when and how VPA applies + changes to pod resources + properties: + evictionRequirements: + description: |- + EvictionRequirements is a list of EvictionRequirements that need to + evaluate to true in order for a Pod to be evicted. If more than one + EvictionRequirement is specified, all of them need to be fulfilled to allow eviction. + items: + description: |- + EvictionRequirement defines a single condition which needs to be true in + order to evict a Pod + properties: + changeRequirement: + description: EvictionChangeRequirement refers to + the relationship between the new target recommendation + for a Pod and its current requests, what kind + of change is necessary for the Pod to be evicted + enum: + - TargetHigherThanRequests + - TargetLowerThanRequests + type: string + resources: + description: |- + Resources is a list of one or more resources that the condition applies + to. If more than one resource is given, the EvictionRequirement is fulfilled + if at least one resource meets `changeRequirement`. + items: + description: ResourceName is the name identifying + various resources in a ResourceList. 
+ type: string + type: array + required: + - changeRequirement + - resources + type: object + type: array + minReplicas: + description: |- + Minimal number of replicas which need to be alive for Updater to attempt + pod eviction (pending other checks like PDB). Only positive values are + allowed. Overrides global '--min-replicas' flag. + format: int32 + type: integer + updateMode: + description: |- + Controls when autoscaler applies changes to the pod resources. + The default is 'Auto'. + enum: + - "Off" + - Initial + - Recreate + - InPlaceOrRecreate + - Auto + type: string + type: object + type: object webhook: description: Webhook defines configuration for the admission service properties: @@ -2265,6 +2397,138 @@ spec: description: VolumeBindingTimeoutSeconds specifies the timeout for volume binding in seconds type: integer + vpa: + description: VPA specifies Vertical Pod Autoscaler configuration + for the binder + properties: + enabled: + description: Enabled specifies if VPA should be enabled + type: boolean + resourcePolicy: + description: ResourcePolicy controls how VPA computes recommended + resources for containers + properties: + containerPolicies: + description: Per-container resource policies. + items: + description: |- + ContainerResourcePolicy controls how autoscaler computes the recommended + resources for a specific container. + properties: + containerName: + description: |- + Name of the container or DefaultContainerResourcePolicy, in which + case the policy is used by the containers that don't have their own + policy specified. + type: string + controlledResources: + description: |- + Specifies the type of recommendations that will be computed + (and possibly applied) by VPA. + If not specified, the default of [ResourceCPU, ResourceMemory] will be used. + items: + description: ResourceName is the name identifying + various resources in a ResourceList. 
+ type: string + type: array + controlledValues: + description: |- + Specifies which resource values should be controlled. + The default is "RequestsAndLimits". + enum: + - RequestsAndLimits + - RequestsOnly + type: string + maxAllowed: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Specifies the maximum amount of resources that will be recommended + for the container. The default is no maximum. + type: object + minAllowed: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Specifies the minimal amount of resources that will be recommended + for the container. The default is no minimum. + type: object + mode: + description: Whether autoscaler is enabled for the + container. The default is "Auto". + enum: + - Auto + - "Off" + type: string + type: object + type: array + type: object + updatePolicy: + description: UpdatePolicy controls when and how VPA applies + changes to pod resources + properties: + evictionRequirements: + description: |- + EvictionRequirements is a list of EvictionRequirements that need to + evaluate to true in order for a Pod to be evicted. If more than one + EvictionRequirement is specified, all of them need to be fulfilled to allow eviction. 
+ items: + description: |- + EvictionRequirement defines a single condition which needs to be true in + order to evict a Pod + properties: + changeRequirement: + description: EvictionChangeRequirement refers to + the relationship between the new target recommendation + for a Pod and its current requests, what kind + of change is necessary for the Pod to be evicted + enum: + - TargetHigherThanRequests + - TargetLowerThanRequests + type: string + resources: + description: |- + Resources is a list of one or more resources that the condition applies + to. If more than one resource is given, the EvictionRequirement is fulfilled + if at least one resource meets `changeRequirement`. + items: + description: ResourceName is the name identifying + various resources in a ResourceList. + type: string + type: array + required: + - changeRequirement + - resources + type: object + type: array + minReplicas: + description: |- + Minimal number of replicas which need to be alive for Updater to attempt + pod eviction (pending other checks like PDB). Only positive values are + allowed. Overrides global '--min-replicas' flag. + format: int32 + type: integer + updateMode: + description: |- + Controls when autoscaler applies changes to the pod resources. + The default is 'Auto'. + enum: + - "Off" + - Initial + - Recreate + - InPlaceOrRecreate + - Auto + type: string + type: object + type: object type: object global: description: Global defined global configuration of the system @@ -3512,6 +3776,138 @@ spec: type: string type: object type: array + vpa: + description: VPA defines the default Vertical Pod Autoscaler configuration + for all services + properties: + enabled: + description: Enabled specifies if VPA should be enabled + type: boolean + resourcePolicy: + description: ResourcePolicy controls how VPA computes recommended + resources for containers + properties: + containerPolicies: + description: Per-container resource policies. 
+ items: + description: |- + ContainerResourcePolicy controls how autoscaler computes the recommended + resources for a specific container. + properties: + containerName: + description: |- + Name of the container or DefaultContainerResourcePolicy, in which + case the policy is used by the containers that don't have their own + policy specified. + type: string + controlledResources: + description: |- + Specifies the type of recommendations that will be computed + (and possibly applied) by VPA. + If not specified, the default of [ResourceCPU, ResourceMemory] will be used. + items: + description: ResourceName is the name identifying + various resources in a ResourceList. + type: string + type: array + controlledValues: + description: |- + Specifies which resource values should be controlled. + The default is "RequestsAndLimits". + enum: + - RequestsAndLimits + - RequestsOnly + type: string + maxAllowed: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Specifies the maximum amount of resources that will be recommended + for the container. The default is no maximum. + type: object + minAllowed: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Specifies the minimal amount of resources that will be recommended + for the container. The default is no minimum. + type: object + mode: + description: Whether autoscaler is enabled for the + container. The default is "Auto". 
+ enum: + - Auto + - "Off" + type: string + type: object + type: array + type: object + updatePolicy: + description: UpdatePolicy controls when and how VPA applies + changes to pod resources + properties: + evictionRequirements: + description: |- + EvictionRequirements is a list of EvictionRequirements that need to + evaluate to true in order for a Pod to be evicted. If more than one + EvictionRequirement is specified, all of them need to be fulfilled to allow eviction. + items: + description: |- + EvictionRequirement defines a single condition which needs to be true in + order to evict a Pod + properties: + changeRequirement: + description: EvictionChangeRequirement refers to + the relationship between the new target recommendation + for a Pod and its current requests, what kind + of change is necessary for the Pod to be evicted + enum: + - TargetHigherThanRequests + - TargetLowerThanRequests + type: string + resources: + description: |- + Resources is a list of one or more resources that the condition applies + to. If more than one resource is given, the EvictionRequirement is fulfilled + if at least one resource meets `changeRequirement`. + items: + description: ResourceName is the name identifying + various resources in a ResourceList. + type: string + type: array + required: + - changeRequirement + - resources + type: object + type: array + minReplicas: + description: |- + Minimal number of replicas which need to be alive for Updater to attempt + pod eviction (pending other checks like PDB). Only positive values are + allowed. Overrides global '--min-replicas' flag. + format: int32 + type: integer + updateMode: + description: |- + Controls when autoscaler applies changes to the pod resources. + The default is 'Auto'. 
+ enum: + - "Off" + - Initial + - Recreate + - InPlaceOrRecreate + - Auto + type: string + type: object + type: object type: object namespace: description: Namespace is the namespace to create the operands in @@ -4573,6 +4969,138 @@ spec: type: object type: object type: object + vpa: + description: VPA specifies Vertical Pod Autoscaler configuration + for the node-scale-adjuster + properties: + enabled: + description: Enabled specifies if VPA should be enabled + type: boolean + resourcePolicy: + description: ResourcePolicy controls how VPA computes recommended + resources for containers + properties: + containerPolicies: + description: Per-container resource policies. + items: + description: |- + ContainerResourcePolicy controls how autoscaler computes the recommended + resources for a specific container. + properties: + containerName: + description: |- + Name of the container or DefaultContainerResourcePolicy, in which + case the policy is used by the containers that don't have their own + policy specified. + type: string + controlledResources: + description: |- + Specifies the type of recommendations that will be computed + (and possibly applied) by VPA. + If not specified, the default of [ResourceCPU, ResourceMemory] will be used. + items: + description: ResourceName is the name identifying + various resources in a ResourceList. + type: string + type: array + controlledValues: + description: |- + Specifies which resource values should be controlled. + The default is "RequestsAndLimits". + enum: + - RequestsAndLimits + - RequestsOnly + type: string + maxAllowed: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Specifies the maximum amount of resources that will be recommended + for the container. The default is no maximum. 
+ type: object + minAllowed: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Specifies the minimal amount of resources that will be recommended + for the container. The default is no minimum. + type: object + mode: + description: Whether autoscaler is enabled for the + container. The default is "Auto". + enum: + - Auto + - "Off" + type: string + type: object + type: array + type: object + updatePolicy: + description: UpdatePolicy controls when and how VPA applies + changes to pod resources + properties: + evictionRequirements: + description: |- + EvictionRequirements is a list of EvictionRequirements that need to + evaluate to true in order for a Pod to be evicted. If more than one + EvictionRequirement is specified, all of them need to be fulfilled to allow eviction. + items: + description: |- + EvictionRequirement defines a single condition which needs to be true in + order to evict a Pod + properties: + changeRequirement: + description: EvictionChangeRequirement refers to + the relationship between the new target recommendation + for a Pod and its current requests, what kind + of change is necessary for the Pod to be evicted + enum: + - TargetHigherThanRequests + - TargetLowerThanRequests + type: string + resources: + description: |- + Resources is a list of one or more resources that the condition applies + to. If more than one resource is given, the EvictionRequirement is fulfilled + if at least one resource meets `changeRequirement`. + items: + description: ResourceName is the name identifying + various resources in a ResourceList. 
+ type: string + type: array + required: + - changeRequirement + - resources + type: object + type: array + minReplicas: + description: |- + Minimal number of replicas which need to be alive for Updater to attempt + pod eviction (pending other checks like PDB). Only positive values are + allowed. Overrides global '--min-replicas' flag. + format: int32 + type: integer + updateMode: + description: |- + Controls when autoscaler applies changes to the pod resources. + The default is 'Auto'. + enum: + - "Off" + - Initial + - Recreate + - InPlaceOrRecreate + - Auto + type: string + type: object + type: object type: object podGroupController: description: PodGroupController specifies configuration for the pod-group-controller @@ -5623,6 +6151,138 @@ spec: type: object type: object type: object + vpa: + description: VPA specifies Vertical Pod Autoscaler configuration + for the pod group controller + properties: + enabled: + description: Enabled specifies if VPA should be enabled + type: boolean + resourcePolicy: + description: ResourcePolicy controls how VPA computes recommended + resources for containers + properties: + containerPolicies: + description: Per-container resource policies. + items: + description: |- + ContainerResourcePolicy controls how autoscaler computes the recommended + resources for a specific container. + properties: + containerName: + description: |- + Name of the container or DefaultContainerResourcePolicy, in which + case the policy is used by the containers that don't have their own + policy specified. + type: string + controlledResources: + description: |- + Specifies the type of recommendations that will be computed + (and possibly applied) by VPA. + If not specified, the default of [ResourceCPU, ResourceMemory] will be used. + items: + description: ResourceName is the name identifying + various resources in a ResourceList. + type: string + type: array + controlledValues: + description: |- + Specifies which resource values should be controlled. 
+ The default is "RequestsAndLimits". + enum: + - RequestsAndLimits + - RequestsOnly + type: string + maxAllowed: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Specifies the maximum amount of resources that will be recommended + for the container. The default is no maximum. + type: object + minAllowed: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Specifies the minimal amount of resources that will be recommended + for the container. The default is no minimum. + type: object + mode: + description: Whether autoscaler is enabled for the + container. The default is "Auto". + enum: + - Auto + - "Off" + type: string + type: object + type: array + type: object + updatePolicy: + description: UpdatePolicy controls when and how VPA applies + changes to pod resources + properties: + evictionRequirements: + description: |- + EvictionRequirements is a list of EvictionRequirements that need to + evaluate to true in order for a Pod to be evicted. If more than one + EvictionRequirement is specified, all of them need to be fulfilled to allow eviction. 
+ items: + description: |- + EvictionRequirement defines a single condition which needs to be true in + order to evict a Pod + properties: + changeRequirement: + description: EvictionChangeRequirement refers to + the relationship between the new target recommendation + for a Pod and its current requests, what kind + of change is necessary for the Pod to be evicted + enum: + - TargetHigherThanRequests + - TargetLowerThanRequests + type: string + resources: + description: |- + Resources is a list of one or more resources that the condition applies + to. If more than one resource is given, the EvictionRequirement is fulfilled + if at least one resource meets `changeRequirement`. + items: + description: ResourceName is the name identifying + various resources in a ResourceList. + type: string + type: array + required: + - changeRequirement + - resources + type: object + type: array + minReplicas: + description: |- + Minimal number of replicas which need to be alive for Updater to attempt + pod eviction (pending other checks like PDB). Only positive values are + allowed. Overrides global '--min-replicas' flag. + format: int32 + type: integer + updateMode: + description: |- + Controls when autoscaler applies changes to the pod resources. + The default is 'Auto'. + enum: + - "Off" + - Initial + - Recreate + - InPlaceOrRecreate + - Auto + type: string + type: object + type: object webhooks: description: Webhooks describes the configuration of the podgroup controller webhooks @@ -6697,6 +7357,138 @@ spec: type: object type: object type: object + vpa: + description: VPA specifies Vertical Pod Autoscaler configuration + for the pod-grouper + properties: + enabled: + description: Enabled specifies if VPA should be enabled + type: boolean + resourcePolicy: + description: ResourcePolicy controls how VPA computes recommended + resources for containers + properties: + containerPolicies: + description: Per-container resource policies. 
+ items: + description: |- + ContainerResourcePolicy controls how autoscaler computes the recommended + resources for a specific container. + properties: + containerName: + description: |- + Name of the container or DefaultContainerResourcePolicy, in which + case the policy is used by the containers that don't have their own + policy specified. + type: string + controlledResources: + description: |- + Specifies the type of recommendations that will be computed + (and possibly applied) by VPA. + If not specified, the default of [ResourceCPU, ResourceMemory] will be used. + items: + description: ResourceName is the name identifying + various resources in a ResourceList. + type: string + type: array + controlledValues: + description: |- + Specifies which resource values should be controlled. + The default is "RequestsAndLimits". + enum: + - RequestsAndLimits + - RequestsOnly + type: string + maxAllowed: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Specifies the maximum amount of resources that will be recommended + for the container. The default is no maximum. + type: object + minAllowed: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Specifies the minimal amount of resources that will be recommended + for the container. The default is no minimum. + type: object + mode: + description: Whether autoscaler is enabled for the + container. The default is "Auto". 
+ enum: + - Auto + - "Off" + type: string + type: object + type: array + type: object + updatePolicy: + description: UpdatePolicy controls when and how VPA applies + changes to pod resources + properties: + evictionRequirements: + description: |- + EvictionRequirements is a list of EvictionRequirements that need to + evaluate to true in order for a Pod to be evicted. If more than one + EvictionRequirement is specified, all of them need to be fulfilled to allow eviction. + items: + description: |- + EvictionRequirement defines a single condition which needs to be true in + order to evict a Pod + properties: + changeRequirement: + description: EvictionChangeRequirement refers to + the relationship between the new target recommendation + for a Pod and its current requests, what kind + of change is necessary for the Pod to be evicted + enum: + - TargetHigherThanRequests + - TargetLowerThanRequests + type: string + resources: + description: |- + Resources is a list of one or more resources that the condition applies + to. If more than one resource is given, the EvictionRequirement is fulfilled + if at least one resource meets `changeRequirement`. + items: + description: ResourceName is the name identifying + various resources in a ResourceList. + type: string + type: array + required: + - changeRequirement + - resources + type: object + type: array + minReplicas: + description: |- + Minimal number of replicas which need to be alive for Updater to attempt + pod eviction (pending other checks like PDB). Only positive values are + allowed. Overrides global '--min-replicas' flag. + format: int32 + type: integer + updateMode: + description: |- + Controls when autoscaler applies changes to the pod resources. + The default is 'Auto'. 
+ enum: + - "Off" + - Initial + - Recreate + - InPlaceOrRecreate + - Auto + type: string + type: object + type: object type: object prometheus: description: Prometheus specifies configuration for Prometheus monitoring @@ -7855,6 +8647,138 @@ spec: type: object type: object type: object + vpa: + description: VPA specifies Vertical Pod Autoscaler configuration + for the queue controller + properties: + enabled: + description: Enabled specifies if VPA should be enabled + type: boolean + resourcePolicy: + description: ResourcePolicy controls how VPA computes recommended + resources for containers + properties: + containerPolicies: + description: Per-container resource policies. + items: + description: |- + ContainerResourcePolicy controls how autoscaler computes the recommended + resources for a specific container. + properties: + containerName: + description: |- + Name of the container or DefaultContainerResourcePolicy, in which + case the policy is used by the containers that don't have their own + policy specified. + type: string + controlledResources: + description: |- + Specifies the type of recommendations that will be computed + (and possibly applied) by VPA. + If not specified, the default of [ResourceCPU, ResourceMemory] will be used. + items: + description: ResourceName is the name identifying + various resources in a ResourceList. + type: string + type: array + controlledValues: + description: |- + Specifies which resource values should be controlled. + The default is "RequestsAndLimits". + enum: + - RequestsAndLimits + - RequestsOnly + type: string + maxAllowed: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Specifies the maximum amount of resources that will be recommended + for the container. The default is no maximum. 
+ type: object + minAllowed: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Specifies the minimal amount of resources that will be recommended + for the container. The default is no minimum. + type: object + mode: + description: Whether autoscaler is enabled for the + container. The default is "Auto". + enum: + - Auto + - "Off" + type: string + type: object + type: array + type: object + updatePolicy: + description: UpdatePolicy controls when and how VPA applies + changes to pod resources + properties: + evictionRequirements: + description: |- + EvictionRequirements is a list of EvictionRequirements that need to + evaluate to true in order for a Pod to be evicted. If more than one + EvictionRequirement is specified, all of them need to be fulfilled to allow eviction. + items: + description: |- + EvictionRequirement defines a single condition which needs to be true in + order to evict a Pod + properties: + changeRequirement: + description: EvictionChangeRequirement refers to + the relationship between the new target recommendation + for a Pod and its current requests, what kind + of change is necessary for the Pod to be evicted + enum: + - TargetHigherThanRequests + - TargetLowerThanRequests + type: string + resources: + description: |- + Resources is a list of one or more resources that the condition applies + to. If more than one resource is given, the EvictionRequirement is fulfilled + if at least one resource meets `changeRequirement`. + items: + description: ResourceName is the name identifying + various resources in a ResourceList. 
+ type: string + type: array + required: + - changeRequirement + - resources + type: object + type: array + minReplicas: + description: |- + Minimal number of replicas which need to be alive for Updater to attempt + pod eviction (pending other checks like PDB). Only positive values are + allowed. Overrides global '--min-replicas' flag. + format: int32 + type: integer + updateMode: + description: |- + Controls when autoscaler applies changes to the pod resources. + The default is 'Auto'. + enum: + - "Off" + - Initial + - Recreate + - InPlaceOrRecreate + - Auto + type: string + type: object + type: object webhooks: description: Webhooks describes the configuration of the queue controller webhooks @@ -8915,6 +9839,138 @@ spec: type: object type: object type: object + vpa: + description: VPA specifies Vertical Pod Autoscaler configuration + for the scheduler + properties: + enabled: + description: Enabled specifies if VPA should be enabled + type: boolean + resourcePolicy: + description: ResourcePolicy controls how VPA computes recommended + resources for containers + properties: + containerPolicies: + description: Per-container resource policies. + items: + description: |- + ContainerResourcePolicy controls how autoscaler computes the recommended + resources for a specific container. + properties: + containerName: + description: |- + Name of the container or DefaultContainerResourcePolicy, in which + case the policy is used by the containers that don't have their own + policy specified. + type: string + controlledResources: + description: |- + Specifies the type of recommendations that will be computed + (and possibly applied) by VPA. + If not specified, the default of [ResourceCPU, ResourceMemory] will be used. + items: + description: ResourceName is the name identifying + various resources in a ResourceList. + type: string + type: array + controlledValues: + description: |- + Specifies which resource values should be controlled. + The default is "RequestsAndLimits". 
+ enum: + - RequestsAndLimits + - RequestsOnly + type: string + maxAllowed: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Specifies the maximum amount of resources that will be recommended + for the container. The default is no maximum. + type: object + minAllowed: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Specifies the minimal amount of resources that will be recommended + for the container. The default is no minimum. + type: object + mode: + description: Whether autoscaler is enabled for the + container. The default is "Auto". + enum: + - Auto + - "Off" + type: string + type: object + type: array + type: object + updatePolicy: + description: UpdatePolicy controls when and how VPA applies + changes to pod resources + properties: + evictionRequirements: + description: |- + EvictionRequirements is a list of EvictionRequirements that need to + evaluate to true in order for a Pod to be evicted. If more than one + EvictionRequirement is specified, all of them need to be fulfilled to allow eviction. + items: + description: |- + EvictionRequirement defines a single condition which needs to be true in + order to evict a Pod + properties: + changeRequirement: + description: EvictionChangeRequirement refers to + the relationship between the new target recommendation + for a Pod and its current requests, what kind + of change is necessary for the Pod to be evicted + enum: + - TargetHigherThanRequests + - TargetLowerThanRequests + type: string + resources: + description: |- + Resources is a list of one or more resources that the condition applies + to. 
If more than one resource is given, the EvictionRequirement is fulfilled + if at least one resource meets `changeRequirement`. + items: + description: ResourceName is the name identifying + various resources in a ResourceList. + type: string + type: array + required: + - changeRequirement + - resources + type: object + type: array + minReplicas: + description: |- + Minimal number of replicas which need to be alive for Updater to attempt + pod eviction (pending other checks like PDB). Only positive values are + allowed. Overrides global '--min-replicas' flag. + format: int32 + type: integer + updateMode: + description: |- + Controls when autoscaler applies changes to the pod resources. + The default is 'Auto'. + enum: + - "Off" + - Initial + - Recreate + - InPlaceOrRecreate + - Auto + type: string + type: object + type: object type: object type: object status: diff --git a/deployments/kai-scheduler/crds/kai.scheduler_schedulingshards.yaml b/deployments/kai-scheduler/crds/kai.scheduler_schedulingshards.yaml index d034b7cef..130ccad3d 100644 --- a/deployments/kai-scheduler/crds/kai.scheduler_schedulingshards.yaml +++ b/deployments/kai-scheduler/crds/kai.scheduler_schedulingshards.yaml @@ -9,7 +9,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.1 + controller-gen.kubebuilder.io/version: v0.17.3 name: schedulingshards.kai.scheduler spec: group: kai.scheduler diff --git a/deployments/kai-scheduler/crds/kai.scheduler_topologies.yaml b/deployments/kai-scheduler/crds/kai.scheduler_topologies.yaml index 993ea19eb..8d46a3d6b 100644 --- a/deployments/kai-scheduler/crds/kai.scheduler_topologies.yaml +++ b/deployments/kai-scheduler/crds/kai.scheduler_topologies.yaml @@ -1,18 +1,3 @@ -# Copyright The Kubernetes Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - # Copyright 2025 NVIDIA CORPORATION # SPDX-License-Identifier: Apache-2.0 # @@ -24,7 +9,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.1 + controller-gen.kubebuilder.io/version: v0.17.3 name: topologies.kai.scheduler spec: group: kai.scheduler diff --git a/deployments/kai-scheduler/crds/scheduling.run.ai_bindrequests.yaml b/deployments/kai-scheduler/crds/scheduling.run.ai_bindrequests.yaml index 53be0a220..91e301296 100644 --- a/deployments/kai-scheduler/crds/scheduling.run.ai_bindrequests.yaml +++ b/deployments/kai-scheduler/crds/scheduling.run.ai_bindrequests.yaml @@ -9,7 +9,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.1 + controller-gen.kubebuilder.io/version: v0.17.3 name: bindrequests.scheduling.run.ai spec: group: scheduling.run.ai diff --git a/deployments/kai-scheduler/crds/scheduling.run.ai_podgroups.yaml b/deployments/kai-scheduler/crds/scheduling.run.ai_podgroups.yaml index 58bbaacef..81df93a3d 100644 --- a/deployments/kai-scheduler/crds/scheduling.run.ai_podgroups.yaml +++ b/deployments/kai-scheduler/crds/scheduling.run.ai_podgroups.yaml @@ -9,7 +9,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.1 + controller-gen.kubebuilder.io/version: v0.17.3 name: podgroups.scheduling.run.ai spec: group: scheduling.run.ai diff --git 
a/deployments/kai-scheduler/crds/scheduling.run.ai_queues.yaml b/deployments/kai-scheduler/crds/scheduling.run.ai_queues.yaml index 2b74d682b..18ac93868 100644 --- a/deployments/kai-scheduler/crds/scheduling.run.ai_queues.yaml +++ b/deployments/kai-scheduler/crds/scheduling.run.ai_queues.yaml @@ -9,7 +9,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.1 + controller-gen.kubebuilder.io/version: v0.17.3 name: queues.scheduling.run.ai spec: group: scheduling.run.ai From df799db3b01648db5c78cdd496664992d78456e9 Mon Sep 17 00:00:00 2001 From: Jonathan Rosenboim Date: Mon, 2 Mar 2026 16:08:50 +0200 Subject: [PATCH 11/25] Add VPA Field Inherit to allow detection of changes in existing VPA objects --- pkg/operator/controller/config_controller.go | 3 +++ .../controller/schedulingshard_controller.go | 3 +++ .../known_types/verticalpodautoscalers.go | 25 +++++++++++++++++++ 3 files changed, 31 insertions(+) diff --git a/pkg/operator/controller/config_controller.go b/pkg/operator/controller/config_controller.go index ab0ac22bb..d0319320c 100644 --- a/pkg/operator/controller/config_controller.go +++ b/pkg/operator/controller/config_controller.go @@ -23,6 +23,7 @@ import ( admissionv1 "k8s.io/api/admissionregistration/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" + vpav1 "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/handler" @@ -148,6 +149,8 @@ func (r *ConfigReconciler) SetupWithManager(mgr ctrl.Manager) error { known_types.ValidatingWebhookConfigurationFieldInherit) r.deployable.RegisterFieldsInheritFromClusterObjects(&admissionv1.MutatingWebhookConfiguration{}, known_types.MutatingWebhookConfigurationFieldInherit) + r.deployable.RegisterFieldsInheritFromClusterObjects(&vpav1.VerticalPodAutoscaler{}, + 
known_types.VPAFieldInherit) r.StatusReconciler = status_reconciler.New(r.Client, r.deployable) builder := ctrl.NewControllerManagedBy(mgr). diff --git a/pkg/operator/controller/schedulingshard_controller.go b/pkg/operator/controller/schedulingshard_controller.go index 59b9ddec3..e18dd02fb 100644 --- a/pkg/operator/controller/schedulingshard_controller.go +++ b/pkg/operator/controller/schedulingshard_controller.go @@ -23,6 +23,7 @@ import ( "golang.org/x/exp/slices" admissionv1 "k8s.io/api/admissionregistration/v1" "k8s.io/apimachinery/pkg/runtime" + vpav1 "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/handler" @@ -91,6 +92,8 @@ func (r *SchedulingShardReconciler) Reconcile(ctx context.Context, req ctrl.Requ known_types.ValidatingWebhookConfigurationFieldInherit) r.deployablePerShard[shard.Name].RegisterFieldsInheritFromClusterObjects(&admissionv1.MutatingWebhookConfiguration{}, known_types.MutatingWebhookConfigurationFieldInherit) + r.deployablePerShard[shard.Name].RegisterFieldsInheritFromClusterObjects(&vpav1.VerticalPodAutoscaler{}, + known_types.VPAFieldInherit) r.statusReconcilers[shard.Name] = status_reconciler.New(r.Client, r.deployablePerShard[shard.Name]) deployable := r.deployablePerShard[shard.Name] diff --git a/pkg/operator/operands/known_types/verticalpodautoscalers.go b/pkg/operator/operands/known_types/verticalpodautoscalers.go index 3a6363b49..aec5e5c33 100644 --- a/pkg/operator/operands/known_types/verticalpodautoscalers.go +++ b/pkg/operator/operands/known_types/verticalpodautoscalers.go @@ -40,6 +40,31 @@ func registerVerticalPodAutoscalers() { SetupSchedulingShardOwned(collectable) } +// VPAFieldInherit copies server-managed metadata fields from the current cluster +// object into the desired object so reflect.DeepEqual won't trigger false updates. 
+func VPAFieldInherit(current, desired client.Object) { + if current == nil { + return + } + desired.SetResourceVersion(current.GetResourceVersion()) + desired.SetUID(current.GetUID()) + desired.SetCreationTimestamp(current.GetCreationTimestamp()) + desired.SetGeneration(current.GetGeneration()) + desired.SetOwnerReferences(current.GetOwnerReferences()) + desired.SetManagedFields(current.GetManagedFields()) + desired.SetAnnotations(mergeAnnotations(desired.GetAnnotations(), current.GetAnnotations())) + + currentVPA, ok := current.(*vpav1.VerticalPodAutoscaler) + if !ok { + return + } + desiredVPA, ok := desired.(*vpav1.VerticalPodAutoscaler) + if !ok { + return + } + desiredVPA.Status = currentVPA.Status +} + func getCurrentVPAState(ctx context.Context, runtimeClient client.Client, reconciler client.Object) (map[string]client.Object, error) { result := map[string]client.Object{} vpas := &vpav1.VerticalPodAutoscalerList{} From 1c966b5074f734aa32c247ca40493fe66c8cc9b6 Mon Sep 17 00:00:00 2001 From: Jonathan Rosenboim Date: Mon, 2 Mar 2026 17:13:15 +0200 Subject: [PATCH 12/25] Run make --- .../crds/kai.scheduler_topologies.yaml | 15 +++++++++ .../templates/rbac/admission.yaml | 18 +++++------ .../kai-scheduler/templates/rbac/binder.yaml | 24 +++++++------- .../templates/rbac/nodescaleadjuster.yaml | 31 ++++++------------ .../templates/rbac/operator.yaml | 12 +++++++ .../templates/rbac/podgroupcontroller.yaml | 22 ++++++------- .../templates/rbac/podgrouper.yaml | 25 +++++++-------- .../templates/rbac/queuecontroller.yaml | 8 ++--- .../templates/rbac/scheduler.yaml | 32 ++++++------------- 9 files changed, 95 insertions(+), 92 deletions(-) diff --git a/deployments/kai-scheduler/crds/kai.scheduler_topologies.yaml b/deployments/kai-scheduler/crds/kai.scheduler_topologies.yaml index 8d46a3d6b..364db292c 100644 --- a/deployments/kai-scheduler/crds/kai.scheduler_topologies.yaml +++ b/deployments/kai-scheduler/crds/kai.scheduler_topologies.yaml @@ -1,3 +1,18 @@ +# Copyright 
The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + # Copyright 2025 NVIDIA CORPORATION # SPDX-License-Identifier: Apache-2.0 # diff --git a/deployments/kai-scheduler/templates/rbac/admission.yaml b/deployments/kai-scheduler/templates/rbac/admission.yaml index 88d0a08b1..72122bba3 100644 --- a/deployments/kai-scheduler/templates/rbac/admission.yaml +++ b/deployments/kai-scheduler/templates/rbac/admission.yaml @@ -11,9 +11,9 @@ metadata: name: kai-admission rules: - apiGroups: - - coordination.k8s.io + - "" resources: - - leases + - configmaps verbs: - create - delete @@ -25,20 +25,20 @@ rules: - apiGroups: - "" resources: - - configmaps + - events verbs: - create - - delete - - get - - list - patch - update - - watch - apiGroups: - - "" + - coordination.k8s.io resources: - - events + - leases verbs: - create + - delete + - get + - list - patch - update + - watch diff --git a/deployments/kai-scheduler/templates/rbac/binder.yaml b/deployments/kai-scheduler/templates/rbac/binder.yaml index 4a4968f82..b4550f970 100644 --- a/deployments/kai-scheduler/templates/rbac/binder.yaml +++ b/deployments/kai-scheduler/templates/rbac/binder.yaml @@ -10,18 +10,6 @@ kind: ClusterRole metadata: name: kai-binder rules: -- apiGroups: - - coordination.k8s.io - resources: - - leases - verbs: - - create - - delete - - get - - list - - patch - - update - - watch - apiGroups: - "" resources: @@ -75,6 +63,18 @@ rules: - patch - update - watch +- apiGroups: + - 
coordination.k8s.io + resources: + - leases + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - resource.k8s.io resources: diff --git a/deployments/kai-scheduler/templates/rbac/nodescaleadjuster.yaml b/deployments/kai-scheduler/templates/rbac/nodescaleadjuster.yaml index af2000e3d..515676fe9 100644 --- a/deployments/kai-scheduler/templates/rbac/nodescaleadjuster.yaml +++ b/deployments/kai-scheduler/templates/rbac/nodescaleadjuster.yaml @@ -13,6 +13,7 @@ rules: - apiGroups: - "" resources: + - configmaps - pods verbs: - create @@ -23,43 +24,31 @@ rules: - update - watch - apiGroups: - - coordination.k8s.io + - "" resources: - - leases + - events + - pods/finalizers verbs: - create - - delete - - get - - list - patch - update - - watch - apiGroups: - "" resources: - - configmaps + - pods/status verbs: - - create - - delete - get - - list - patch - update - - watch - apiGroups: - - "" + - coordination.k8s.io resources: - - events - - pods/finalizers + - leases verbs: - create - - patch - - update -- apiGroups: - - "" - resources: - - pods/status - verbs: + - delete - get + - list - patch - update + - watch diff --git a/deployments/kai-scheduler/templates/rbac/operator.yaml b/deployments/kai-scheduler/templates/rbac/operator.yaml index 59b0fcb59..95e2ce8c6 100644 --- a/deployments/kai-scheduler/templates/rbac/operator.yaml +++ b/deployments/kai-scheduler/templates/rbac/operator.yaml @@ -92,6 +92,18 @@ rules: - patch - update - watch +- apiGroups: + - autoscaling.k8s.io + resources: + - verticalpodautoscalers + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - coordination.k8s.io resources: diff --git a/deployments/kai-scheduler/templates/rbac/podgroupcontroller.yaml b/deployments/kai-scheduler/templates/rbac/podgroupcontroller.yaml index 9b091f5a5..4af54ebe2 100644 --- a/deployments/kai-scheduler/templates/rbac/podgroupcontroller.yaml +++ 
b/deployments/kai-scheduler/templates/rbac/podgroupcontroller.yaml @@ -13,29 +13,29 @@ rules: - apiGroups: - "" resources: - - nodes - - pods - - pods/status + - configmaps verbs: + - create + - delete - get - list + - patch + - update - watch - apiGroups: - - coordination.k8s.io + - "" resources: - - leases + - nodes + - pods + - pods/status verbs: - - create - - delete - get - list - - patch - - update - watch - apiGroups: - - "" + - coordination.k8s.io resources: - - configmaps + - leases verbs: - create - delete diff --git a/deployments/kai-scheduler/templates/rbac/podgrouper.yaml b/deployments/kai-scheduler/templates/rbac/podgrouper.yaml index 1818196d2..f7e9f823f 100644 --- a/deployments/kai-scheduler/templates/rbac/podgrouper.yaml +++ b/deployments/kai-scheduler/templates/rbac/podgrouper.yaml @@ -14,10 +14,13 @@ rules: - "" resources: - configmaps - - namespaces verbs: + - create + - delete - get - list + - patch + - update - watch - apiGroups: - "" @@ -31,6 +34,14 @@ rules: - patch - update - watch +- apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - list + - watch - apiGroups: - "" resources: @@ -121,18 +132,6 @@ rules: - patch - update - watch -- apiGroups: - - "" - resources: - - configmaps - verbs: - - create - - delete - - get - - list - - patch - - update - - watch - apiGroups: - egx.nvidia.io resources: diff --git a/deployments/kai-scheduler/templates/rbac/queuecontroller.yaml b/deployments/kai-scheduler/templates/rbac/queuecontroller.yaml index e7fe08fc6..a911cd9a0 100644 --- a/deployments/kai-scheduler/templates/rbac/queuecontroller.yaml +++ b/deployments/kai-scheduler/templates/rbac/queuecontroller.yaml @@ -11,9 +11,9 @@ metadata: name: queuecontroller rules: - apiGroups: - - coordination.k8s.io + - "" resources: - - leases + - configmaps verbs: - create - delete @@ -23,9 +23,9 @@ rules: - update - watch - apiGroups: - - "" + - coordination.k8s.io resources: - - configmaps + - leases verbs: - create - delete diff --git 
a/deployments/kai-scheduler/templates/rbac/scheduler.yaml b/deployments/kai-scheduler/templates/rbac/scheduler.yaml index 67f94054f..43bdbe69b 100644 --- a/deployments/kai-scheduler/templates/rbac/scheduler.yaml +++ b/deployments/kai-scheduler/templates/rbac/scheduler.yaml @@ -14,17 +14,6 @@ rules: - "" resources: - configmaps - - namespaces - - nodes - - persistentvolumeclaims - - persistentvolumes - verbs: - - get - - list - - watch -- apiGroups: - - "" - resources: - events - pods/status verbs: @@ -38,29 +27,29 @@ rules: - apiGroups: - "" resources: - - pods + - namespaces + - nodes + - persistentvolumeclaims + - persistentvolumes verbs: - - delete - get - list - - patch - - update - watch - apiGroups: - "" resources: - - pods/finalizers + - pods verbs: - - create - delete - get - list - patch - update + - watch - apiGroups: - - coordination.k8s.io + - "" resources: - - leases + - pods/finalizers verbs: - create - delete @@ -68,11 +57,10 @@ rules: - list - patch - update - - watch - apiGroups: - - "" + - coordination.k8s.io resources: - - configmaps + - leases verbs: - create - delete From b8b141011295b410a00af3c4a793bb838f10b3f7 Mon Sep 17 00:00:00 2001 From: Jonathan Rosenboim Date: Mon, 2 Mar 2026 18:51:30 +0200 Subject: [PATCH 13/25] Support a case where VPA isn't installed --- .../known_types/verticalpodautoscalers.go | 29 ++++++++++++++++--- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/pkg/operator/operands/known_types/verticalpodautoscalers.go b/pkg/operator/operands/known_types/verticalpodautoscalers.go index aec5e5c33..8e4f3bbda 100644 --- a/pkg/operator/operands/known_types/verticalpodautoscalers.go +++ b/pkg/operator/operands/known_types/verticalpodautoscalers.go @@ -6,11 +6,13 @@ package known_types import ( "context" + "github.com/NVIDIA/KAI-scheduler/pkg/operator/operands/common" vpav1 "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 
"sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/manager" ) @@ -24,13 +26,23 @@ func vpaIndexer(object client.Object) []string { } func registerVerticalPodAutoscalers() { + var vpaAvailable bool collectable := &Collectable{ Collect: getCurrentVPAState, InitWithManager: func(ctx context.Context, mgr manager.Manager) error { - return mgr.GetFieldIndexer().IndexField(ctx, &vpav1.VerticalPodAutoscaler{}, CollectableOwnerKey, vpaIndexer) + err := mgr.GetFieldIndexer().IndexField(ctx, &vpav1.VerticalPodAutoscaler{}, CollectableOwnerKey, vpaIndexer) + if err != nil { + log.FromContext(ctx).Info("VPA CRD not available, skipping field indexer registration") + return nil + } + vpaAvailable = true + return nil }, - InitWithBuilder: func(builder *builder.Builder) *builder.Builder { - return builder.Owns(&vpav1.VerticalPodAutoscaler{}) + InitWithBuilder: func(b *builder.Builder) *builder.Builder { + if !vpaAvailable { + return b + } + return b.Owns(&vpav1.VerticalPodAutoscaler{}) }, InitWithFakeClientBuilder: func(fakeClientBuilder *fake.ClientBuilder) { fakeClientBuilder.WithIndex(&vpav1.VerticalPodAutoscaler{}, CollectableOwnerKey, vpaIndexer) @@ -67,10 +79,19 @@ func VPAFieldInherit(current, desired client.Object) { func getCurrentVPAState(ctx context.Context, runtimeClient client.Client, reconciler client.Object) (map[string]client.Object, error) { result := map[string]client.Object{} + + hasVPACRD, err := common.CheckCRDsAvailable(ctx, runtimeClient, "verticalpodautoscalers.autoscaling.k8s.io") + if err != nil { + return nil, err + } + if !hasVPACRD { + return result, nil + } + vpas := &vpav1.VerticalPodAutoscalerList{} reconcilerKey := getReconcilerKey(reconciler) - err := runtimeClient.List(ctx, vpas, client.MatchingFields{CollectableOwnerKey: reconcilerKey}) + err = runtimeClient.List(ctx, vpas, 
client.MatchingFields{CollectableOwnerKey: reconcilerKey}) if err != nil { return nil, err } From 6d0cc1c38a3414ff225cff11762de66eb22d799a Mon Sep 17 00:00:00 2001 From: Jonathan Rosenboim Date: Mon, 2 Mar 2026 18:57:23 +0200 Subject: [PATCH 14/25] Fix import order --- pkg/operator/operands/known_types/verticalpodautoscalers.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pkg/operator/operands/known_types/verticalpodautoscalers.go b/pkg/operator/operands/known_types/verticalpodautoscalers.go index 8e4f3bbda..c36347a2b 100644 --- a/pkg/operator/operands/known_types/verticalpodautoscalers.go +++ b/pkg/operator/operands/known_types/verticalpodautoscalers.go @@ -6,14 +6,15 @@ package known_types import ( "context" - "github.com/NVIDIA/KAI-scheduler/pkg/operator/operands/common" - vpav1 "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + vpav1 "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1" "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/manager" + + "github.com/NVIDIA/KAI-scheduler/pkg/operator/operands/common" ) func vpaIndexer(object client.Object) []string { From a731841a2413ada74e27ab5a704812a33e4c3534 Mon Sep 17 00:00:00 2001 From: Jonathan Rosenboim Date: Wed, 4 Mar 2026 10:39:31 +0200 Subject: [PATCH 15/25] Edit yaml --- .../kai-scheduler/crds/kai.scheduler_schedulingshards.yaml | 2 +- deployments/kai-scheduler/crds/kai.scheduler_topologies.yaml | 2 +- deployments/kai-scheduler/crds/scheduling.run.ai_podgroups.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/deployments/kai-scheduler/crds/kai.scheduler_schedulingshards.yaml b/deployments/kai-scheduler/crds/kai.scheduler_schedulingshards.yaml index 130ccad3d..d034b7cef 100644 --- 
a/deployments/kai-scheduler/crds/kai.scheduler_schedulingshards.yaml +++ b/deployments/kai-scheduler/crds/kai.scheduler_schedulingshards.yaml @@ -9,7 +9,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.17.3 + controller-gen.kubebuilder.io/version: v0.16.1 name: schedulingshards.kai.scheduler spec: group: kai.scheduler diff --git a/deployments/kai-scheduler/crds/kai.scheduler_topologies.yaml b/deployments/kai-scheduler/crds/kai.scheduler_topologies.yaml index 364db292c..993ea19eb 100644 --- a/deployments/kai-scheduler/crds/kai.scheduler_topologies.yaml +++ b/deployments/kai-scheduler/crds/kai.scheduler_topologies.yaml @@ -24,7 +24,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.17.3 + controller-gen.kubebuilder.io/version: v0.16.1 name: topologies.kai.scheduler spec: group: kai.scheduler diff --git a/deployments/kai-scheduler/crds/scheduling.run.ai_podgroups.yaml b/deployments/kai-scheduler/crds/scheduling.run.ai_podgroups.yaml index 81df93a3d..58bbaacef 100644 --- a/deployments/kai-scheduler/crds/scheduling.run.ai_podgroups.yaml +++ b/deployments/kai-scheduler/crds/scheduling.run.ai_podgroups.yaml @@ -9,7 +9,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.17.3 + controller-gen.kubebuilder.io/version: v0.16.1 name: podgroups.scheduling.run.ai spec: group: scheduling.run.ai From 6126f7b3e8d8d5f512269eae7b3a56caf05d23ab Mon Sep 17 00:00:00 2001 From: Jonathan Rosenboim Date: Wed, 4 Mar 2026 12:15:24 +0200 Subject: [PATCH 16/25] Run fmt fix --- pkg/operator/operands/deployable/deployable_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/operator/operands/deployable/deployable_test.go b/pkg/operator/operands/deployable/deployable_test.go index 0ac33543d..2136849a2 100644 --- 
a/pkg/operator/operands/deployable/deployable_test.go +++ b/pkg/operator/operands/deployable/deployable_test.go @@ -17,11 +17,11 @@ import ( "github.com/NVIDIA/KAI-scheduler/pkg/operator/operands/known_types" monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" - vpav1 "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1" appsv1 "k8s.io/api/apps/v1" v1 "k8s.io/api/core/v1" apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + vpav1 "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1" "k8s.io/client-go/kubernetes/scheme" "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" From d8603ca0a94fb760b20d7f63c09e7cd620475227 Mon Sep 17 00:00:00 2001 From: Jonathan Rosenboim Date: Wed, 4 Mar 2026 12:17:09 +0200 Subject: [PATCH 17/25] Edit versions --- deployments/kai-scheduler/crds/kai.scheduler_configs.yaml | 2 +- .../kai-scheduler/crds/scheduling.run.ai_bindrequests.yaml | 2 +- deployments/kai-scheduler/crds/scheduling.run.ai_queues.yaml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/deployments/kai-scheduler/crds/kai.scheduler_configs.yaml b/deployments/kai-scheduler/crds/kai.scheduler_configs.yaml index 8d4450006..808483ed9 100644 --- a/deployments/kai-scheduler/crds/kai.scheduler_configs.yaml +++ b/deployments/kai-scheduler/crds/kai.scheduler_configs.yaml @@ -9,7 +9,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.17.3 + controller-gen.kubebuilder.io/version: v0.16.1 name: configs.kai.scheduler spec: group: kai.scheduler diff --git a/deployments/kai-scheduler/crds/scheduling.run.ai_bindrequests.yaml b/deployments/kai-scheduler/crds/scheduling.run.ai_bindrequests.yaml index 91e301296..53be0a220 100644 --- a/deployments/kai-scheduler/crds/scheduling.run.ai_bindrequests.yaml +++ 
b/deployments/kai-scheduler/crds/scheduling.run.ai_bindrequests.yaml @@ -9,7 +9,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.17.3 + controller-gen.kubebuilder.io/version: v0.16.1 name: bindrequests.scheduling.run.ai spec: group: scheduling.run.ai diff --git a/deployments/kai-scheduler/crds/scheduling.run.ai_queues.yaml b/deployments/kai-scheduler/crds/scheduling.run.ai_queues.yaml index 18ac93868..2b74d682b 100644 --- a/deployments/kai-scheduler/crds/scheduling.run.ai_queues.yaml +++ b/deployments/kai-scheduler/crds/scheduling.run.ai_queues.yaml @@ -9,7 +9,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.17.3 + controller-gen.kubebuilder.io/version: v0.16.1 name: queues.scheduling.run.ai spec: group: scheduling.run.ai From 02c64fc81342ecc5787e39d32ac7aa9166010495 Mon Sep 17 00:00:00 2001 From: Jonathan Rosenboim Date: Wed, 4 Mar 2026 17:34:49 +0200 Subject: [PATCH 18/25] Run make validate --- .../templates/rbac/admission.yaml | 18 +++++------ .../kai-scheduler/templates/rbac/binder.yaml | 24 +++++++------- .../templates/rbac/nodescaleadjuster.yaml | 31 ++++++++++++------ .../templates/rbac/podgroupcontroller.yaml | 22 ++++++------- .../templates/rbac/podgrouper.yaml | 25 ++++++++------- .../templates/rbac/queuecontroller.yaml | 8 ++--- .../templates/rbac/scheduler.yaml | 32 +++++++++++++------ 7 files changed, 92 insertions(+), 68 deletions(-) diff --git a/deployments/kai-scheduler/templates/rbac/admission.yaml b/deployments/kai-scheduler/templates/rbac/admission.yaml index 72122bba3..88d0a08b1 100644 --- a/deployments/kai-scheduler/templates/rbac/admission.yaml +++ b/deployments/kai-scheduler/templates/rbac/admission.yaml @@ -11,9 +11,9 @@ metadata: name: kai-admission rules: - apiGroups: - - "" + - coordination.k8s.io resources: - - configmaps + - leases verbs: - create - delete @@ 
-25,20 +25,20 @@ rules: - apiGroups: - "" resources: - - events + - configmaps verbs: - create + - delete + - get + - list - patch - update + - watch - apiGroups: - - coordination.k8s.io + - "" resources: - - leases + - events verbs: - create - - delete - - get - - list - patch - update - - watch diff --git a/deployments/kai-scheduler/templates/rbac/binder.yaml b/deployments/kai-scheduler/templates/rbac/binder.yaml index b4550f970..4a4968f82 100644 --- a/deployments/kai-scheduler/templates/rbac/binder.yaml +++ b/deployments/kai-scheduler/templates/rbac/binder.yaml @@ -10,6 +10,18 @@ kind: ClusterRole metadata: name: kai-binder rules: +- apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - "" resources: @@ -63,18 +75,6 @@ rules: - patch - update - watch -- apiGroups: - - coordination.k8s.io - resources: - - leases - verbs: - - create - - delete - - get - - list - - patch - - update - - watch - apiGroups: - resource.k8s.io resources: diff --git a/deployments/kai-scheduler/templates/rbac/nodescaleadjuster.yaml b/deployments/kai-scheduler/templates/rbac/nodescaleadjuster.yaml index 515676fe9..af2000e3d 100644 --- a/deployments/kai-scheduler/templates/rbac/nodescaleadjuster.yaml +++ b/deployments/kai-scheduler/templates/rbac/nodescaleadjuster.yaml @@ -13,7 +13,6 @@ rules: - apiGroups: - "" resources: - - configmaps - pods verbs: - create @@ -24,31 +23,43 @@ rules: - update - watch - apiGroups: - - "" + - coordination.k8s.io resources: - - events - - pods/finalizers + - leases verbs: - create + - delete + - get + - list - patch - update + - watch - apiGroups: - "" resources: - - pods/status + - configmaps verbs: + - create + - delete - get + - list - patch - update + - watch - apiGroups: - - coordination.k8s.io + - "" resources: - - leases + - events + - pods/finalizers verbs: - create - - delete + - patch + - update +- apiGroups: + - "" + resources: + - pods/status + 
verbs: - get - - list - patch - update - - watch diff --git a/deployments/kai-scheduler/templates/rbac/podgroupcontroller.yaml b/deployments/kai-scheduler/templates/rbac/podgroupcontroller.yaml index 4af54ebe2..9b091f5a5 100644 --- a/deployments/kai-scheduler/templates/rbac/podgroupcontroller.yaml +++ b/deployments/kai-scheduler/templates/rbac/podgroupcontroller.yaml @@ -13,29 +13,29 @@ rules: - apiGroups: - "" resources: - - configmaps + - nodes + - pods + - pods/status verbs: - - create - - delete - get - list - - patch - - update - watch - apiGroups: - - "" + - coordination.k8s.io resources: - - nodes - - pods - - pods/status + - leases verbs: + - create + - delete - get - list + - patch + - update - watch - apiGroups: - - coordination.k8s.io + - "" resources: - - leases + - configmaps verbs: - create - delete diff --git a/deployments/kai-scheduler/templates/rbac/podgrouper.yaml b/deployments/kai-scheduler/templates/rbac/podgrouper.yaml index f7e9f823f..1818196d2 100644 --- a/deployments/kai-scheduler/templates/rbac/podgrouper.yaml +++ b/deployments/kai-scheduler/templates/rbac/podgrouper.yaml @@ -14,13 +14,10 @@ rules: - "" resources: - configmaps + - namespaces verbs: - - create - - delete - get - list - - patch - - update - watch - apiGroups: - "" @@ -34,14 +31,6 @@ rules: - patch - update - watch -- apiGroups: - - "" - resources: - - namespaces - verbs: - - get - - list - - watch - apiGroups: - "" resources: @@ -132,6 +121,18 @@ rules: - patch - update - watch +- apiGroups: + - "" + resources: + - configmaps + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - egx.nvidia.io resources: diff --git a/deployments/kai-scheduler/templates/rbac/queuecontroller.yaml b/deployments/kai-scheduler/templates/rbac/queuecontroller.yaml index a911cd9a0..e7fe08fc6 100644 --- a/deployments/kai-scheduler/templates/rbac/queuecontroller.yaml +++ b/deployments/kai-scheduler/templates/rbac/queuecontroller.yaml @@ -11,9 +11,9 @@ metadata: 
name: queuecontroller rules: - apiGroups: - - "" + - coordination.k8s.io resources: - - configmaps + - leases verbs: - create - delete @@ -23,9 +23,9 @@ rules: - update - watch - apiGroups: - - coordination.k8s.io + - "" resources: - - leases + - configmaps verbs: - create - delete diff --git a/deployments/kai-scheduler/templates/rbac/scheduler.yaml b/deployments/kai-scheduler/templates/rbac/scheduler.yaml index 43bdbe69b..67f94054f 100644 --- a/deployments/kai-scheduler/templates/rbac/scheduler.yaml +++ b/deployments/kai-scheduler/templates/rbac/scheduler.yaml @@ -14,26 +14,26 @@ rules: - "" resources: - configmaps - - events - - pods/status + - namespaces + - nodes + - persistentvolumeclaims + - persistentvolumes verbs: - - create - - delete - get - list - - patch - - update - watch - apiGroups: - "" resources: - - namespaces - - nodes - - persistentvolumeclaims - - persistentvolumes + - events + - pods/status verbs: + - create + - delete - get - list + - patch + - update - watch - apiGroups: - "" @@ -69,6 +69,18 @@ rules: - patch - update - watch +- apiGroups: + - "" + resources: + - configmaps + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - kai.scheduler resources: From 74e5604292ea1a6b20996f486d372baa6bf2b647 Mon Sep 17 00:00:00 2001 From: Jonathan Rosenboim Date: Wed, 4 Mar 2026 18:09:39 +0200 Subject: [PATCH 19/25] Edit changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index cd897599d..9360b8ac7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,11 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). 
## [Unreleased] +## [v0.14.0] - 2026-03-02 +### Added +- Added support for VPA configuration for the different components of the KAI Scheduler [#1119](https://github.com/NVIDIA/KAI-Scheduler/pull/1119) [jrosenboimnvidia](https://github.com/jrosenboimnvidia) +- Users who have VPA installed on their cluster can now use it to vertically autoscale the KAI Scheduler components + ## [v0.13.0] - 2026-03-02 ### Added - Added `global.nodeSelector` propagation from Helm values to Config CR, ensuring operator-created sub-component deployments (admission, binder, scheduler, pod-grouper, etc.) receive the configured nodeSelector [#1102](https://github.com/NVIDIA/KAI-Scheduler/pull/1102) [yuanchen8911](https://github.com/yuanchen8911) From 8018fa0d876722780a41fcaa8ca868ba7584bf49 Mon Sep 17 00:00:00 2001 From: Jonathan Rosenboim Date: Wed, 4 Mar 2026 18:20:52 +0200 Subject: [PATCH 20/25] Add VPA unittests --- pkg/apis/kai/v1/common/vpa_test.go | 50 +++++++++ pkg/operator/operands/common/vpa_test.go | 99 +++++++++++++++++ .../operands/known_types/known_types_test.go | 101 ++++++++++++++++++ 3 files changed, 250 insertions(+) create mode 100644 pkg/apis/kai/v1/common/vpa_test.go create mode 100644 pkg/operator/operands/common/vpa_test.go diff --git a/pkg/apis/kai/v1/common/vpa_test.go b/pkg/apis/kai/v1/common/vpa_test.go new file mode 100644 index 000000000..111106e43 --- /dev/null +++ b/pkg/apis/kai/v1/common/vpa_test.go @@ -0,0 +1,50 @@ +// Copyright 2025 NVIDIA CORPORATION +// SPDX-License-Identifier: Apache-2.0 + +package common + +import ( + . "github.com/onsi/ginkgo/v2" + .
"github.com/onsi/gomega" + vpav1 "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1" + "k8s.io/utils/ptr" +) + +var _ = Describe("VPASpec", func() { + Describe("SetDefaultsWhereNeeded", func() { + It("should set Enabled to false and UpdatePolicy to InPlaceOrRecreate when all fields are nil", func() { + vpa := &VPASpec{} + vpa.SetDefaultsWhereNeeded() + + Expect(vpa.Enabled).To(Equal(ptr.To(false))) + expectedMode := vpav1.UpdateModeInPlaceOrRecreate + Expect(vpa.UpdatePolicy).To(Equal(&vpav1.PodUpdatePolicy{ + UpdateMode: &expectedMode, + })) + }) + + It("should not override Enabled when already set", func() { + vpa := &VPASpec{Enabled: ptr.To(true)} + vpa.SetDefaultsWhereNeeded() + + Expect(*vpa.Enabled).To(BeTrue()) + }) + + It("should not override UpdatePolicy when already set", func() { + mode := vpav1.UpdateModeOff + vpa := &VPASpec{ + UpdatePolicy: &vpav1.PodUpdatePolicy{UpdateMode: &mode}, + } + vpa.SetDefaultsWhereNeeded() + + Expect(*vpa.UpdatePolicy.UpdateMode).To(Equal(vpav1.UpdateModeOff)) + }) + + It("should not set ResourcePolicy", func() { + vpa := &VPASpec{} + vpa.SetDefaultsWhereNeeded() + + Expect(vpa.ResourcePolicy).To(BeNil()) + }) + }) +}) diff --git a/pkg/operator/operands/common/vpa_test.go b/pkg/operator/operands/common/vpa_test.go new file mode 100644 index 000000000..acf678303 --- /dev/null +++ b/pkg/operator/operands/common/vpa_test.go @@ -0,0 +1,99 @@ +// Copyright 2025 NVIDIA CORPORATION +// SPDX-License-Identifier: Apache-2.0 + +package common + +import ( + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + appsv1 "k8s.io/api/apps/v1" + autoscalingv1 "k8s.io/api/autoscaling/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + vpav1 "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1" + "k8s.io/utils/ptr" + "sigs.k8s.io/controller-runtime/pkg/client" + + kaicommon "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1/common" +) + +var _ = Describe("BuildVPA", func() { + It("should return nil when vpaSpec is nil", func() { + Expect(BuildVPA(nil, "name", "ns", "Deployment")).To(BeNil()) + }) + + It("should return nil when Enabled is nil", func() { + Expect(BuildVPA(&kaicommon.VPASpec{}, "name", "ns", "Deployment")).To(BeNil()) + }) + + It("should return nil when Enabled is false", func() { + spec := &kaicommon.VPASpec{Enabled: ptr.To(false)} + Expect(BuildVPA(spec, "name", "ns", "Deployment")).To(BeNil()) + }) + + It("should build a VPA targeting the given resource when enabled", func() { + mode := vpav1.UpdateModeAuto + spec := &kaicommon.VPASpec{ + Enabled: ptr.To(true), + UpdatePolicy: &vpav1.PodUpdatePolicy{UpdateMode: &mode}, + } + + result := BuildVPA(spec, "my-deploy", "my-ns", "Deployment") + Expect(result).ToNot(BeNil()) + + vpa := result.(*vpav1.VerticalPodAutoscaler) + Expect(vpa.Name).To(Equal("my-deploy")) + Expect(vpa.Namespace).To(Equal("my-ns")) + Expect(vpa.Spec.TargetRef).To(Equal(&autoscalingv1.CrossVersionObjectReference{ + APIVersion: "apps/v1", + Kind: "Deployment", + Name: "my-deploy", + })) + Expect(*vpa.Spec.UpdatePolicy.UpdateMode).To(Equal(vpav1.UpdateModeAuto)) + }) +}) + +var _ = Describe("BuildVPAFromObjects", func() { + It("should return nil when vpaSpec is nil", func() { + Expect(BuildVPAFromObjects(nil, nil, "ns")).To(BeNil()) + }) + + It("should return nil when disabled", func() { + spec := &kaicommon.VPASpec{Enabled: ptr.To(false)} + Expect(BuildVPAFromObjects(spec, nil, "ns")).To(BeNil()) + }) + + It("should return nil when no Deployment or DaemonSet found", func() { + spec := 
&kaicommon.VPASpec{Enabled: ptr.To(true)} + objects := []client.Object{ + &metav1.PartialObjectMetadata{ObjectMeta: metav1.ObjectMeta{Name: "svc"}}, + } + Expect(BuildVPAFromObjects(spec, objects, "ns")).To(BeNil()) + }) + + It("should build VPA from the first Deployment", func() { + spec := &kaicommon.VPASpec{Enabled: ptr.To(true)} + objects := []client.Object{ + &appsv1.Deployment{ObjectMeta: metav1.ObjectMeta{Name: "dep-1"}}, + &appsv1.Deployment{ObjectMeta: metav1.ObjectMeta{Name: "dep-2"}}, + } + + result := BuildVPAFromObjects(spec, objects, "ns") + Expect(result).ToNot(BeNil()) + vpa := result.(*vpav1.VerticalPodAutoscaler) + Expect(vpa.Name).To(Equal("dep-1")) + Expect(vpa.Spec.TargetRef.Kind).To(Equal("Deployment")) + }) + + It("should build VPA from a DaemonSet", func() { + spec := &kaicommon.VPASpec{Enabled: ptr.To(true)} + objects := []client.Object{ + &appsv1.DaemonSet{ObjectMeta: metav1.ObjectMeta{Name: "ds-1"}}, + } + + result := BuildVPAFromObjects(spec, objects, "ns") + Expect(result).ToNot(BeNil()) + vpa := result.(*vpav1.VerticalPodAutoscaler) + Expect(vpa.Name).To(Equal("ds-1")) + Expect(vpa.Spec.TargetRef.Kind).To(Equal("DaemonSet")) + }) +}) diff --git a/pkg/operator/operands/known_types/known_types_test.go b/pkg/operator/operands/known_types/known_types_test.go index c8fc95fb4..75639c99a 100644 --- a/pkg/operator/operands/known_types/known_types_test.go +++ b/pkg/operator/operands/known_types/known_types_test.go @@ -8,10 +8,12 @@ import ( . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" + vpav1 "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1" kaiv1 "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" ) func TestKnownTypes(t *testing.T) { @@ -44,3 +46,102 @@ var _ = Describe("KnownTypes", func() { }) }) }) + +var _ = Describe("vpaIndexer", func() { + It("should return owner key for VPA owned by KAI", func() { + vpa := &vpav1.VerticalPodAutoscaler{ + ObjectMeta: metav1.ObjectMeta{ + Name: "my-vpa", + Namespace: "ns", + OwnerReferences: []metav1.OwnerReference{ + { + APIVersion: kaiv1.GroupVersion.String(), + Kind: "Config", + Name: SingletonInstanceName, + UID: types.UID("uid-123"), + Controller: ptrBool(true), + }, + }, + }, + } + + keys := vpaIndexer(vpa) + Expect(keys).To(HaveLen(1)) + }) + + It("should return nil for VPA not owned by KAI", func() { + vpa := &vpav1.VerticalPodAutoscaler{ + ObjectMeta: metav1.ObjectMeta{ + Name: "my-vpa", + }, + } + + keys := vpaIndexer(vpa) + Expect(keys).To(BeNil()) + }) +}) + +var _ = Describe("VPAFieldInherit", func() { + It("should be a no-op when current is nil", func() { + desired := &vpav1.VerticalPodAutoscaler{ + ObjectMeta: metav1.ObjectMeta{Name: "vpa"}, + } + VPAFieldInherit(nil, desired) + Expect(desired.GetResourceVersion()).To(BeEmpty()) + }) + + It("should copy metadata and status from current to desired", func() { + current := &vpav1.VerticalPodAutoscaler{ + ObjectMeta: metav1.ObjectMeta{ + Name: "vpa", + ResourceVersion: "42", + UID: types.UID("abc"), + Generation: 3, + Annotations: map[string]string{"server-added": "val"}, + }, + Status: vpav1.VerticalPodAutoscalerStatus{ + Recommendation: &vpav1.RecommendedPodResources{}, + }, + } + desired := &vpav1.VerticalPodAutoscaler{ + ObjectMeta: metav1.ObjectMeta{ + Name: "vpa", + Annotations: map[string]string{"user-set": "keep"}, + }, + } + + VPAFieldInherit(current, desired) + + 
Expect(desired.GetResourceVersion()).To(Equal("42")) + Expect(desired.GetUID()).To(Equal(types.UID("abc"))) + Expect(desired.GetGeneration()).To(Equal(int64(3))) + Expect(desired.GetAnnotations()).To(HaveKeyWithValue("user-set", "keep")) + Expect(desired.GetAnnotations()).To(HaveKeyWithValue("server-added", "val")) + Expect(desired.Status.Recommendation).ToNot(BeNil()) + }) +}) + +var _ = Describe("mergeAnnotations", func() { + It("should return current annotations when desired is nil", func() { + result := mergeAnnotations(nil, map[string]string{"a": "1"}) + Expect(result).To(Equal(map[string]string{"a": "1"})) + }) + + It("should not override desired annotations with current", func() { + result := mergeAnnotations( + map[string]string{"key": "desired"}, + map[string]string{"key": "current"}, + ) + Expect(result["key"]).To(Equal("desired")) + }) + + It("should merge non-overlapping annotations", func() { + result := mergeAnnotations( + map[string]string{"a": "1"}, + map[string]string{"b": "2"}, + ) + Expect(result).To(Equal(map[string]string{"a": "1", "b": "2"})) + }) +}) + +func ptrBool(b bool) *bool { return &b } From 82676fe4289328d6b95ca0269544f68e27f0f157 Mon Sep 17 00:00:00 2001 From: Jonathan Rosenboim Date: Wed, 4 Mar 2026 19:19:30 +0200 Subject: [PATCH 21/25] Add vpa to helm chart defaults --- .../kai-scheduler/templates/kai-config.yaml | 4 ++++ deployments/kai-scheduler/values.yaml | 14 ++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/deployments/kai-scheduler/templates/kai-config.yaml b/deployments/kai-scheduler/templates/kai-config.yaml index 4b1dc4ea6..7be32eb5f 100644 --- a/deployments/kai-scheduler/templates/kai-config.yaml +++ b/deployments/kai-scheduler/templates/kai-config.yaml @@ -43,6 +43,10 @@ spec: imagesPullSecret: {{ index .Values.global.imagePullSecrets 0 | default "" }} {{- end }} replicaCount: {{ .Values.operator.replicaCount | default 1 }} + {{- if .Values.global.vpa }} + vpa: + {{- toYaml .Values.global.vpa | nindent 6 }} 
+ {{- end }} binder: service: diff --git a/deployments/kai-scheduler/values.yaml b/deployments/kai-scheduler/values.yaml index ac485a9ac..29035a929 100644 --- a/deployments/kai-scheduler/values.yaml +++ b/deployments/kai-scheduler/values.yaml @@ -16,6 +16,20 @@ global: tolerations: [] namespaceLabelSelector: {} podLabelSelector: {} + vpa: + enabled: false + updatePolicy: + updateMode: InPlaceOrRecreate + minReplicas: 1 + resourcePolicy: + containerPolicies: + - containerName: "*" + minAllowed: + cpu: 50m + memory: 500Mi + maxAllowed: + cpu: 2 + memory: 5Gi resourceReservation: namespace: kai-resource-reservation serviceAccount: kai-resource-reservation From 133a4f866fb5ef4a57816eacfca7ea6755cb1455 Mon Sep 17 00:00:00 2001 From: Jonathan Rosenboim Date: Thu, 5 Mar 2026 17:25:44 +0200 Subject: [PATCH 22/25] Add custom metric server --- hack/setup-e2e-cluster.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hack/setup-e2e-cluster.sh b/hack/setup-e2e-cluster.sh index 4d46cc2ae..5d6f5248d 100755 --- a/hack/setup-e2e-cluster.sh +++ b/hack/setup-e2e-cluster.sh @@ -76,7 +76,7 @@ helm install prometheus prometheus-community/kube-prometheus-stack --namespace m # Install VPA and its prerequisites if [ "$INSTALL_VPA" = "true" ]; then echo "Installing metrics-server (required by VPA recommender)..." 
- kubectl apply -f https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml + kubectl apply -f https://github.com/kubernetes-sigs/metrics-server/releases/download/v0.8.1/components.yaml # kind uses self-signed kubelet certs, so metrics-server needs --kubelet-insecure-tls kubectl patch deployment metrics-server -n kube-system --type=json \ -p '[{"op":"add","path":"/spec/template/spec/containers/0/args/-","value":"--kubelet-insecure-tls"}]' From ab27a0dfe4d37c73b9c96a5da833db741df2310a Mon Sep 17 00:00:00 2001 From: Jonathan Rosenboim Date: Thu, 5 Mar 2026 17:32:06 +0200 Subject: [PATCH 23/25] Add proper default if policy is left blank --- pkg/apis/kai/v1/common/vpa.go | 7 ++++--- pkg/apis/kai/v1/common/vpa_test.go | 10 ++++++++++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/pkg/apis/kai/v1/common/vpa.go b/pkg/apis/kai/v1/common/vpa.go index e4ba3ee9a..c21a1db02 100644 --- a/pkg/apis/kai/v1/common/vpa.go +++ b/pkg/apis/kai/v1/common/vpa.go @@ -29,9 +29,10 @@ func (v *VPASpec) SetDefaultsWhereNeeded() { v.Enabled = ptr.To(false) } if v.UpdatePolicy == nil { + v.UpdatePolicy = &vpav1.PodUpdatePolicy{} + } + if v.UpdatePolicy.UpdateMode == nil { mode := vpav1.UpdateModeInPlaceOrRecreate - v.UpdatePolicy = &vpav1.PodUpdatePolicy{ - UpdateMode: &mode, - } + v.UpdatePolicy.UpdateMode = &mode } } diff --git a/pkg/apis/kai/v1/common/vpa_test.go b/pkg/apis/kai/v1/common/vpa_test.go index 111106e43..961be98ab 100644 --- a/pkg/apis/kai/v1/common/vpa_test.go +++ b/pkg/apis/kai/v1/common/vpa_test.go @@ -40,6 +40,16 @@ var _ = Describe("VPASpec", func() { Expect(*vpa.UpdatePolicy.UpdateMode).To(Equal(vpav1.UpdateModeOff)) }) + It("should set UpdateMode to InPlaceOrRecreate when UpdatePolicy is set but UpdateMode is nil", func() { + vpa := &VPASpec{ + UpdatePolicy: &vpav1.PodUpdatePolicy{}, + } + vpa.SetDefaultsWhereNeeded() + + Expect(vpa.UpdatePolicy.UpdateMode).NotTo(BeNil()) + 
Expect(*vpa.UpdatePolicy.UpdateMode).To(Equal(vpav1.UpdateModeInPlaceOrRecreate)) + }) + It("should not set ResourcePolicy", func() { vpa := &VPASpec{} vpa.SetDefaultsWhereNeeded() From e1f95285654a82a8edfa81a3be9a4a53c7731b93 Mon Sep 17 00:00:00 2001 From: Jonathan Rosenboim Date: Thu, 5 Mar 2026 17:38:10 +0200 Subject: [PATCH 24/25] Apply setDefaultsWhereNeeded in case of partial config --- pkg/apis/kai/v1/scheduler/scheduler.go | 3 ++ pkg/apis/kai/v1/scheduler/scheduler_test.go | 37 +++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/pkg/apis/kai/v1/scheduler/scheduler.go b/pkg/apis/kai/v1/scheduler/scheduler.go index e7d8beae0..78e34a24b 100644 --- a/pkg/apis/kai/v1/scheduler/scheduler.go +++ b/pkg/apis/kai/v1/scheduler/scheduler.go @@ -71,6 +71,9 @@ func (s *Scheduler) SetDefaultsWhereNeeded(replicaCount *int32, globalVPA *commo if s.VPA == nil { s.VPA = globalVPA } + if s.VPA != nil { + s.VPA.SetDefaultsWhereNeeded() + } } // Service defines configuration for the scheduler service diff --git a/pkg/apis/kai/v1/scheduler/scheduler_test.go b/pkg/apis/kai/v1/scheduler/scheduler_test.go index d744b61e9..62b917896 100644 --- a/pkg/apis/kai/v1/scheduler/scheduler_test.go +++ b/pkg/apis/kai/v1/scheduler/scheduler_test.go @@ -12,6 +12,10 @@ import ( v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" + vpav1 "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1" + "k8s.io/utils/ptr" + + kaicommon "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1/common" ) func TestScheduler(t *testing.T) { @@ -63,4 +67,37 @@ var _ = Describe("Scheduler", func() { scheduler.SetDefaultsWhereNeeded(replicaCount, nil) Expect(*scheduler.Replicas).To(Equal(int32(1))) }) + + It("inherits globalVPA when VPA is nil", func(ctx context.Context) { + scheduler := &Scheduler{} + mode := vpav1.UpdateModeOff + globalVPA := &kaicommon.VPASpec{ + Enabled: ptr.To(true), + UpdatePolicy: &vpav1.PodUpdatePolicy{UpdateMode: &mode}, + } + 
scheduler.SetDefaultsWhereNeeded(ptr.To(int32(1)), globalVPA) + + Expect(scheduler.VPA).To(Equal(globalVPA)) + Expect(*scheduler.VPA.UpdatePolicy.UpdateMode).To(Equal(vpav1.UpdateModeOff)) + }) + + It("applies defaults to local VPA when UpdateMode is nil", func(ctx context.Context) { + scheduler := &Scheduler{ + VPA: &kaicommon.VPASpec{ + Enabled: ptr.To(true), + UpdatePolicy: &vpav1.PodUpdatePolicy{}, + }, + } + scheduler.SetDefaultsWhereNeeded(ptr.To(int32(1)), nil) + + Expect(scheduler.VPA.UpdatePolicy.UpdateMode).NotTo(BeNil()) + Expect(*scheduler.VPA.UpdatePolicy.UpdateMode).To(Equal(vpav1.UpdateModeInPlaceOrRecreate)) + }) + + It("does not call SetDefaultsWhereNeeded when VPA remains nil", func(ctx context.Context) { + scheduler := &Scheduler{} + scheduler.SetDefaultsWhereNeeded(ptr.To(int32(1)), nil) + + Expect(scheduler.VPA).To(BeNil()) + }) }) From 20c4154b63d3c5c1d9c4627237ec47411bdf0ebc Mon Sep 17 00:00:00 2001 From: Jonathan Rosenboim Date: Thu, 5 Mar 2026 17:39:23 +0200 Subject: [PATCH 25/25] Add error to the log --- pkg/operator/operands/known_types/verticalpodautoscalers.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/operator/operands/known_types/verticalpodautoscalers.go b/pkg/operator/operands/known_types/verticalpodautoscalers.go index c36347a2b..b3a13b58b 100644 --- a/pkg/operator/operands/known_types/verticalpodautoscalers.go +++ b/pkg/operator/operands/known_types/verticalpodautoscalers.go @@ -33,7 +33,7 @@ func registerVerticalPodAutoscalers() { InitWithManager: func(ctx context.Context, mgr manager.Manager) error { err := mgr.GetFieldIndexer().IndexField(ctx, &vpav1.VerticalPodAutoscaler{}, CollectableOwnerKey, vpaIndexer) if err != nil { - log.FromContext(ctx).Info("VPA CRD not available, skipping field indexer registration") + log.FromContext(ctx).Info("VPA CRD not available, skipping field indexer registration", "error", err) return nil } vpaAvailable = true