Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
2a78f0d
Add Kai Scheduler VPA objects
JRosenboimNVIDIA Mar 2, 2026
83845b1
Add VPA Spec definition
JRosenboimNVIDIA Mar 2, 2026
7755f13
Add VPA config to the scheduler's components
JRosenboimNVIDIA Mar 2, 2026
884bfbe
Add construction of the VPA object
JRosenboimNVIDIA Mar 2, 2026
0889d22
Build the VPA object for the scheduler's components
JRosenboimNVIDIA Mar 2, 2026
9846906
Add VPA package to dependencies
JRosenboimNVIDIA Mar 2, 2026
8101c2f
Run make generate
JRosenboimNVIDIA Mar 2, 2026
47bc505
Update tests to account for VPA
JRosenboimNVIDIA Mar 2, 2026
dc6b906
Merge branch 'main' into run-37138-enable-vpa-for-kai-scheduler
JRosenboimNVIDIA Mar 2, 2026
97bd33f
Add option to setup vpa as part of the e2e script
JRosenboimNVIDIA Mar 2, 2026
2e41494
Add kai scheduler CRDs updates
JRosenboimNVIDIA Mar 2, 2026
df799db
Add VPA Field Inherit to allow detection of changes in existing VPA o…
JRosenboimNVIDIA Mar 2, 2026
1c966b5
Run make
JRosenboimNVIDIA Mar 2, 2026
b8b1410
Support a case where VPA isn't installed
JRosenboimNVIDIA Mar 2, 2026
6d0cc1c
Fix import order
JRosenboimNVIDIA Mar 2, 2026
05087ea
Merge branch 'main' into run-37138-enable-vpa-for-kai-scheduler
JRosenboimNVIDIA Mar 4, 2026
a731841
Edit yaml
JRosenboimNVIDIA Mar 4, 2026
6126f7b
Run fmt fix
JRosenboimNVIDIA Mar 4, 2026
d8603ca
Edit versions
JRosenboimNVIDIA Mar 4, 2026
02c64fc
Run make validate
JRosenboimNVIDIA Mar 4, 2026
fa16e70
Merge branch 'main' into run-37138-enable-vpa-for-kai-scheduler
JRosenboimNVIDIA Mar 4, 2026
74e5604
Edit changelog
JRosenboimNVIDIA Mar 4, 2026
8018fa0
Add VPA unittests
JRosenboimNVIDIA Mar 4, 2026
82676fe
Add vpa to helm chart defaults
JRosenboimNVIDIA Mar 4, 2026
cc0f71b
Merge branch 'main' into run-37138-enable-vpa-for-kai-scheduler
JRosenboimNVIDIA Mar 5, 2026
133a4f8
Add custom metric server
JRosenboimNVIDIA Mar 5, 2026
ab27a0d
Add proper default if policy is left blank
JRosenboimNVIDIA Mar 5, 2026
e1f9528
Apply setDefaultsWhereNeeded in case of partial config
JRosenboimNVIDIA Mar 5, 2026
20c4154
Add error to the log
JRosenboimNVIDIA Mar 5, 2026
7c384d1
Merge branch 'main' into run-37138-enable-vpa-for-kai-scheduler
JRosenboimNVIDIA Mar 5, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,13 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).

## [Unreleased]

## [v0.15.0] - 2026-03-05
### Added
- Added support for VPA configuration for the different components of the KAI Scheduler. [#1119](https://github.com/NVIDIA/KAI-Scheduler/pull/1119) (jrosenboimnvidia)
- Users who have VPA installed in their cluster can now use it to vertically autoscale the KAI Scheduler components


## [v0.14.0] - 2026-03-02
### Fixed

- Updated resource enumeration logic to exclude resources with count of 0. [#1120](https://github.com/NVIDIA/KAI-Scheduler/issues/1120)
Expand Down
2 changes: 2 additions & 0 deletions cmd/operator/app/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (

nvidiav1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1"
monitoringv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
vpav1 "k8s.io/autoscaler/vertical-pod-autoscaler/pkg/apis/autoscaling.k8s.io/v1"

"github.com/NVIDIA/KAI-scheduler/cmd/operator/config"
kaiv1 "github.com/NVIDIA/KAI-scheduler/pkg/apis/kai/v1"
Expand Down Expand Up @@ -45,6 +46,7 @@ func init() {
utilruntime.Must(kaiv1alpha1.AddToScheme(scheme))
utilruntime.Must(nvidiav1.AddToScheme(scheme))
utilruntime.Must(monitoringv1.AddToScheme(scheme))
utilruntime.Must(vpav1.AddToScheme(scheme))
// +kubebuilder:scaffold:scheme
}

Expand Down
1,056 changes: 1,056 additions & 0 deletions deployments/kai-scheduler/crds/kai.scheduler_configs.yaml

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions deployments/kai-scheduler/templates/kai-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ spec:
imagesPullSecret: {{ index .Values.global.imagePullSecrets 0 | default "" }}
{{- end }}
replicaCount: {{ .Values.operator.replicaCount | default 1 }}
{{- if .Values.global.vpa }}
vpa:
{{- toYaml .Values.global.vpa | nindent 6 }}
{{- end }}

binder:
service:
Expand Down
12 changes: 12 additions & 0 deletions deployments/kai-scheduler/templates/rbac/operator.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,18 @@ rules:
- patch
- update
- watch
- apiGroups:
- autoscaling.k8s.io
resources:
- verticalpodautoscalers
verbs:
- create
- delete
- get
- list
- patch
- update
- watch
- apiGroups:
- coordination.k8s.io
resources:
Expand Down
14 changes: 14 additions & 0 deletions deployments/kai-scheduler/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,20 @@ global:
tolerations: []
namespaceLabelSelector: {}
podLabelSelector: {}
vpa:
enabled: false
updatePolicy:
updateMode: InPlaceOrRecreate
minReplicas: 1
resourcePolicy:
containerPolicies:
- containerName: "*"
minAllowed:
cpu: 50m
memory: 500Mi
maxAllowed:
cpu: 2
memory: 5Gi
resourceReservation:
namespace: kai-resource-reservation
serviceAccount: kai-resource-reservation
Expand Down
13 changes: 7 additions & 6 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ require (
k8s.io/apiextensions-apiserver v0.34.3
k8s.io/apimachinery v0.34.3
k8s.io/apiserver v0.34.3
k8s.io/autoscaler/vertical-pod-autoscaler v1.5.1
k8s.io/cli-runtime v0.34.1
k8s.io/client-go v0.34.3
k8s.io/cluster-bootstrap v0.34.1
Expand Down Expand Up @@ -98,15 +99,15 @@ require (
github.com/cyphar/filepath-securejoin v0.6.0 // indirect
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
github.com/distribution/reference v0.6.0 // indirect
github.com/emicklei/go-restful/v3 v3.12.2 // indirect
github.com/emicklei/go-restful/v3 v3.13.0 // indirect
github.com/evanphx/json-patch/v5 v5.9.11 // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
github.com/fsnotify/fsnotify v1.9.0 // indirect
github.com/fxamacker/cbor/v2 v2.9.0 // indirect
github.com/gabriel-vasile/mimetype v1.4.7 // indirect
github.com/gin-contrib/sse v0.1.0 // indirect
github.com/go-logr/zapr v1.3.0 // indirect
github.com/go-openapi/jsonpointer v0.21.1 // indirect
github.com/go-openapi/jsonpointer v0.21.2 // indirect
github.com/go-openapi/jsonreference v0.21.0 // indirect
github.com/go-openapi/swag v0.23.1 // indirect
github.com/go-playground/locales v0.14.1 // indirect
Expand Down Expand Up @@ -148,7 +149,7 @@ require (
github.com/pelletier/go-toml/v2 v2.2.3 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/client_model v0.6.2 // indirect
github.com/prometheus/procfs v0.16.1 // indirect
github.com/prometheus/procfs v0.17.0 // indirect
github.com/robfig/cron/v3 v3.0.1 // indirect
github.com/sirupsen/logrus v1.9.3 // indirect
github.com/spf13/cobra v1.10.1 // indirect
Expand Down Expand Up @@ -177,19 +178,19 @@ require (
golang.org/x/sys v0.40.0 // indirect
golang.org/x/term v0.37.0 // indirect
golang.org/x/text v0.31.0 // indirect
golang.org/x/time v0.11.0 // indirect
golang.org/x/time v0.12.0 // indirect
golang.org/x/tools v0.38.0 // indirect
google.golang.org/genproto v0.0.0-20250303144028-a0af3efb3deb // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20250313205543-e70fdf4c4cb4 // indirect
google.golang.org/protobuf v1.36.8 // indirect
gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect
gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect
gopkg.in/inf.v0 v0.9.1 // indirect
k8s.io/cloud-provider v0.34.1 // indirect
k8s.io/controller-manager v0.34.1 // indirect
k8s.io/cri-api v0.34.1 // indirect
k8s.io/csi-translation-lib v0.34.1 // indirect
k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b // indirect
k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3 // indirect
k8s.io/kubelet v0.34.1 // indirect
knative.dev/networking v0.0.0-20250117155906-67d1c274ba6a // indirect
sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect
Expand Down
26 changes: 14 additions & 12 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,8 @@ github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5Qvfr
github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E=
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
github.com/emicklei/go-restful/v3 v3.12.2 h1:DhwDP0vY3k8ZzE0RunuJy8GhNpPL6zqLkDf9B/a0/xU=
github.com/emicklei/go-restful/v3 v3.12.2/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc=
github.com/emicklei/go-restful/v3 v3.13.0 h1:C4Bl2xDndpU6nJ4bc1jXd+uTmYPVUwkD6bFY/oTyCes=
github.com/emicklei/go-restful/v3 v3.13.0/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc=
github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98=
Expand Down Expand Up @@ -107,8 +107,8 @@ github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
github.com/go-logr/zapr v1.3.0 h1:XGdV8XW8zdwFiwOA2Dryh1gj2KRQyOOoNmBy4EplIcQ=
github.com/go-logr/zapr v1.3.0/go.mod h1:YKepepNBd1u/oyhd/yQmtjVXmm9uML4IXUgMOwR8/Gg=
github.com/go-openapi/jsonpointer v0.21.1 h1:whnzv/pNXtK2FbX/W9yJfRmE2gsmkfahjMKB0fZvcic=
github.com/go-openapi/jsonpointer v0.21.1/go.mod h1:50I1STOfbY1ycR8jGz8DaMeLCdXiI6aDteEdRNNzpdk=
github.com/go-openapi/jsonpointer v0.21.2 h1:AqQaNADVwq/VnkCmQg6ogE+M3FOsKTytwges0JdwVuA=
github.com/go-openapi/jsonpointer v0.21.2/go.mod h1:50I1STOfbY1ycR8jGz8DaMeLCdXiI6aDteEdRNNzpdk=
github.com/go-openapi/jsonreference v0.21.0 h1:Rs+Y7hSXT83Jacb7kFyjn4ijOuVGSvOdF2+tg1TRrwQ=
github.com/go-openapi/jsonreference v0.21.0/go.mod h1:LmZmgsrTkVg9LG4EaHeY8cBDslNPMo06cago5JNLkm4=
github.com/go-openapi/swag v0.23.1 h1:lpsStH0n2ittzTnbaSloVZLuB5+fvSY/+hnagBjSNZU=
Expand Down Expand Up @@ -257,8 +257,8 @@ github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNw
github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE=
github.com/prometheus/common v0.66.1 h1:h5E0h5/Y8niHc5DlaLlWLArTQI7tMrsfQjHV+d9ZoGs=
github.com/prometheus/common v0.66.1/go.mod h1:gcaUsgf3KfRSwHY4dIMXLPV0K/Wg1oZ8+SbZk/HH/dA=
github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzMyRg=
github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is=
github.com/prometheus/procfs v0.17.0 h1:FuLQ+05u4ZI+SS/w9+BWEM2TXiHKsUQ9TADiRH7DuK0=
github.com/prometheus/procfs v0.17.0/go.mod h1:oPQLaDAMRbA+u8H5Pbfq+dl3VDAvHxMUOVhe0wYB2zw=
github.com/prometheus/statsd_exporter v0.22.7 h1:7Pji/i2GuhK6Lu7DHrtTkFmNBCudCPT1pX2CziuyQR0=
github.com/prometheus/statsd_exporter v0.22.7/go.mod h1:N/TevpjkIh9ccs6nuzY3jQn9dFqnUakOjnEuMPJJJnI=
github.com/ray-project/kuberay/ray-operator v1.4.2 h1:A4tGzbIky8sInAUxZBdBb+rrpZ7fbqoxdsOtm559Zqg=
Expand Down Expand Up @@ -403,8 +403,8 @@ golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM=
golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM=
golang.org/x/time v0.11.0 h1:/bpjEDfN9tkoN/ryeYHnv5hcMlc8ncjMcM4XBk5NWV0=
golang.org/x/time v0.11.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg=
golang.org/x/time v0.12.0 h1:ScB/8o8olJvc+CQPWrK3fPZNfh7qgwCrY0zJmoEQLSE=
golang.org/x/time v0.12.0/go.mod h1:CDIdPxbZBQxdj6cxyCIdrNogrJKMJ7pr37NYpMcMDSg=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY=
Expand Down Expand Up @@ -446,8 +446,8 @@ google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXn
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/evanphx/json-patch.v4 v4.12.0 h1:n6jtcsulIzXPJaxegRbvFNNrZDjbij7ny3gmSPG+6V4=
gopkg.in/evanphx/json-patch.v4 v4.12.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M=
gopkg.in/evanphx/json-patch.v4 v4.13.0 h1:czT3CmqEaQ1aanPc5SdlgQrrEIb8w/wwCvWWnfEbYzo=
gopkg.in/evanphx/json-patch.v4 v4.13.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M=
gopkg.in/h2non/gock.v1 v1.1.2 h1:jBbHXgGBK/AoPVfJh5x4r/WxIrElvbLel8TCZkkZJoY=
gopkg.in/h2non/gock.v1 v1.1.2/go.mod h1:n7UGz/ckNChHiK05rDoiC4MYSunEC/lyaUm2WWaDva0=
gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc=
Expand All @@ -470,6 +470,8 @@ k8s.io/apimachinery v0.34.3 h1:/TB+SFEiQvN9HPldtlWOTp0hWbJ+fjU+wkxysf/aQnE=
k8s.io/apimachinery v0.34.3/go.mod h1:/GwIlEcWuTX9zKIg2mbw0LRFIsXwrfoVxn+ef0X13lw=
k8s.io/apiserver v0.34.3 h1:uGH1qpDvSiYG4HVFqc6A3L4CKiX+aBWDrrsxHYK0Bdo=
k8s.io/apiserver v0.34.3/go.mod h1:QPnnahMO5C2m3lm6fPW3+JmyQbvHZQ8uudAu/493P2w=
k8s.io/autoscaler/vertical-pod-autoscaler v1.5.1 h1:LlVtM3IKqIVHz1ZXC3ahe/mAtDWb7Eob0tyTzqFULqg=
k8s.io/autoscaler/vertical-pod-autoscaler v1.5.1/go.mod h1:znhUnV0Yn+CkZu3TZ2HVqd8GFRMkPj/CXszX1gdBjTU=
k8s.io/cli-runtime v0.34.1 h1:btlgAgTrYd4sk8vJTRG6zVtqBKt9ZMDeQZo2PIzbL7M=
k8s.io/cli-runtime v0.34.1/go.mod h1:aVA65c+f0MZiMUPbseU/M9l1Wo2byeaGwUuQEQVVveE=
k8s.io/client-go v0.34.3 h1:wtYtpzy/OPNYf7WyNBTj3iUA0XaBHVqhv4Iv3tbrF5A=
Expand Down Expand Up @@ -502,8 +504,8 @@ k8s.io/kube-aggregator v0.34.1 h1:WNLV0dVNoFKmuyvdWLd92iDSyD/TSTjqwaPj0U9XAEU=
k8s.io/kube-aggregator v0.34.1/go.mod h1:RU8j+5ERfp0h+gIvWtxRPfsa5nK7rboDm8RST8BJfYQ=
k8s.io/kube-controller-manager v0.34.1 h1:hrPRR4toT+xABAxzGpnldTL1RocYXyVhx6A5Einb9wU=
k8s.io/kube-controller-manager v0.34.1/go.mod h1:+7jKjj5i7NLGM6zPHbdMh7qHaWFOBsF/oeUDdS70DSg=
k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b h1:MloQ9/bdJyIu9lb1PzujOPolHyvO06MXG5TUIj2mNAA=
k8s.io/kube-openapi v0.0.0-20250710124328-f3f2b991d03b/go.mod h1:UZ2yyWbFTpuhSbFhv24aGNOdoRdJZgsIObGBUaYVsts=
k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3 h1:liMHz39T5dJO1aOKHLvwaCjDbf07wVh6yaUlTpunnkE=
k8s.io/kube-openapi v0.0.0-20250814151709-d7b6acb124c3/go.mod h1:UZ2yyWbFTpuhSbFhv24aGNOdoRdJZgsIObGBUaYVsts=
k8s.io/kube-proxy v0.34.1 h1:cIriNCJY5XmRhXCCyQiazyqi47lbwcBQf0H76fVOpkw=
k8s.io/kube-proxy v0.34.1/go.mod h1:syed9c5+gUVFMo6p24SnlTHzsp+BMd4ACcTw2dbArw0=
k8s.io/kube-scheduler v0.34.1 h1:S5td6VZwC3lCqERXclerDXhJ26zYc6JroY0s03+PqJ8=
Expand Down
25 changes: 24 additions & 1 deletion hack/setup-e2e-cluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ KIND_CONFIG=${REPO_ROOT}/hack/e2e-kind-config.yaml
# Parse named parameters
TEST_THIRD_PARTY_INTEGRATIONS=${TEST_THIRD_PARTY_INTEGRATIONS:-"false"}
LOCAL_IMAGES_BUILD=${LOCAL_IMAGES_BUILD:-"false"}
INSTALL_VPA=${INSTALL_VPA:-"false"}

while [[ $# -gt 0 ]]; do
case $1 in
Expand All @@ -29,10 +30,15 @@ while [[ $# -gt 0 ]]; do
LOCAL_IMAGES_BUILD="true"
shift
;;
--install-vpa)
INSTALL_VPA="true"
shift
;;
-h|--help)
echo "Usage: $0 [--test-third-party-integrations] [--local-images-build]"
echo "Usage: $0 [--test-third-party-integrations] [--local-images-build] [--install-vpa]"
echo " --test-third-party-integrations: Install third party operators for compatibility testing"
echo " --local-images-build: Build and use local images instead of pulling from registry"
echo " --install-vpa: Install Vertical Pod Autoscaler and metrics-server"
exit 0
;;
*)
Expand Down Expand Up @@ -67,6 +73,23 @@ helm install prometheus prometheus-community/kube-prometheus-stack --namespace m
--set "prometheus.enabled=false" \
--wait

# Install VPA and its prerequisites (only when INSTALL_VPA is "true",
# e.g. via the --install-vpa flag).
if [ "$INSTALL_VPA" = "true" ]; then
  echo "Installing metrics-server (required by VPA recommender)..."
  kubectl apply -f https://github.com/kubernetes-sigs/metrics-server/releases/download/v0.8.1/components.yaml
  # kind uses self-signed kubelet certs, so metrics-server needs --kubelet-insecure-tls
  kubectl patch deployment metrics-server -n kube-system --type=json \
    -p '[{"op":"add","path":"/spec/template/spec/containers/0/args/-","value":"--kubelet-insecure-tls"}]'
  kubectl wait --for=condition=available --timeout=120s deployment/metrics-server -n kube-system

  echo "Installing Vertical Pod Autoscaler..."
  VPA_TMPDIR=$(mktemp -d)
  # Shallow-clone only the pinned release tag: the full autoscaler history is
  # large and nothing but the tagged tree is used here.
  # The &&/|| chain records the first failing step's exit code without
  # aborting, so the temporary checkout is always removed below.
  vpa_status=0
  git clone --depth 1 --branch vertical-pod-autoscaler-1.5.1 \
    https://github.com/kubernetes/autoscaler.git "$VPA_TMPDIR/autoscaler" \
    && (cd "$VPA_TMPDIR/autoscaler/vertical-pod-autoscaler" && ./hack/vpa-up.sh) \
    || vpa_status=$?
  rm -rf "$VPA_TMPDIR"
  if [ "$vpa_status" -ne 0 ]; then
    # Do not report success (or continue the setup) after a failed install.
    echo "VPA installation failed (exit code ${vpa_status})." >&2
    exit "$vpa_status"
  fi
  echo "VPA installation complete."
fi

# Install third party operators to check the compatibility with the kai-scheduler
if [ "$TEST_THIRD_PARTY_INTEGRATIONS" = "true" ]; then
${REPO_ROOT}/hack/third_party_integrations/deploy_ray.sh
Expand Down
10 changes: 9 additions & 1 deletion pkg/apis/kai/v1/admission/admission.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,13 @@ type Admission struct {
// set to empty string to disable
// +kubebuilder:validation:Optional
GPUPodRuntimeClassName *string `json:"gpuPodRuntimeClassName,omitempty"`

// VPA specifies Vertical Pod Autoscaler configuration for the admission service
// +kubebuilder:validation:Optional
VPA *common.VPASpec `json:"vpa,omitempty"`
}

func (b *Admission) SetDefaultsWhereNeeded(replicaCount *int32) {
func (b *Admission) SetDefaultsWhereNeeded(replicaCount *int32, globalVPA *common.VPASpec) {
b.Service = common.SetDefault(b.Service, &common.Service{})
b.Service.SetDefaultsWhereNeeded(imageName)

Expand All @@ -68,6 +72,10 @@ func (b *Admission) SetDefaultsWhereNeeded(replicaCount *int32) {
b.MutatingWebhookConfigurationName = common.SetDefault(b.MutatingWebhookConfigurationName, ptr.To(defaultMutatingWebhookName))

b.GPUPodRuntimeClassName = common.SetDefault(b.GPUPodRuntimeClassName, ptr.To(constants.DefaultRuntimeClassName))

if b.VPA == nil {
b.VPA = globalVPA
}
}

// Webhook defines configuration for the admission webhook
Expand Down
4 changes: 2 additions & 2 deletions pkg/apis/kai/v1/admission/admission_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ var _ = Describe("Admission", func() {
Admission := &Admission{}
var replicaCount int32
replicaCount = 1
Admission.SetDefaultsWhereNeeded(&replicaCount)
Admission.SetDefaultsWhereNeeded(&replicaCount, nil)
Expect(*Admission.Service.Enabled).To(Equal(true))
Expect(*Admission.Service.Image.Name).To(Equal("admission"))
Expect(*Admission.Replicas).To(Equal(int32(1)))
Expand All @@ -32,7 +32,7 @@ var _ = Describe("Admission", func() {
Admission := &Admission{}
var replicaCount int32
replicaCount = 3
Admission.SetDefaultsWhereNeeded(&replicaCount)
Admission.SetDefaultsWhereNeeded(&replicaCount, nil)
Expect(*Admission.Replicas).To(Equal(int32(3)))
})
})
5 changes: 5 additions & 0 deletions pkg/apis/kai/v1/admission/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 9 additions & 1 deletion pkg/apis/kai/v1/binder/binder.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,13 @@ type Binder struct {
// leave empty if unsure to let the operator auto detect using ClusterPolicy (nvidia gpu-operator only)
// +kubebuilder:validation:Optional
CDIEnabled *bool `json:"cdiEnabled,omitempty"`

// VPA specifies Vertical Pod Autoscaler configuration for the binder
// +kubebuilder:validation:Optional
VPA *common.VPASpec `json:"vpa,omitempty"`
}

func (b *Binder) SetDefaultsWhereNeeded(replicaCount *int32) {
func (b *Binder) SetDefaultsWhereNeeded(replicaCount *int32, globalVPA *common.VPASpec) {
b.Service = common.SetDefault(b.Service, &common.Service{})
b.Service.Resources = common.SetDefault(b.Service.Resources, &common.Resources{})
if b.Service.Resources.Requests == nil {
Expand Down Expand Up @@ -81,6 +85,10 @@ func (b *Binder) SetDefaultsWhereNeeded(replicaCount *int32) {

b.ProbePort = common.SetDefault(b.ProbePort, ptr.To(8081))
b.MetricsPort = common.SetDefault(b.MetricsPort, ptr.To(8080))

if b.VPA == nil {
b.VPA = globalVPA
}
}

type ResourceReservation struct {
Expand Down
10 changes: 5 additions & 5 deletions pkg/apis/kai/v1/binder/binder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ func TestBinder(t *testing.T) {
var _ = Describe("Binder", func() {
It("Set Defaults", func(ctx context.Context) {
binder := &Binder{}
binder.SetDefaultsWhereNeeded(nil)
binder.SetDefaultsWhereNeeded(nil, nil)
Expect(*binder.Service.Enabled).To(Equal(true))
Expect(*binder.Service.Image.Name).To(Equal("binder"))
Expect(binder.Service.Resources.Requests[v1.ResourceCPU]).To(Equal(resource.MustParse("50m")))
Expand All @@ -35,14 +35,14 @@ var _ = Describe("Binder", func() {
binder := &Binder{}
var replicaCount int32
replicaCount = 3
binder.SetDefaultsWhereNeeded(&replicaCount)
binder.SetDefaultsWhereNeeded(&replicaCount, nil)
Expect(*binder.Replicas).To(Equal(int32(3)))
})

Context("ResourceReservation PodResources configuration", func() {
It("should not set default PodResources when not configured", func(ctx context.Context) {
binder := &Binder{}
binder.SetDefaultsWhereNeeded(nil)
binder.SetDefaultsWhereNeeded(nil, nil)

// PodResources should be nil when not configured
Expect(binder.ResourceReservation.PodResources).To(BeNil())
Expand All @@ -64,7 +64,7 @@ var _ = Describe("Binder", func() {
PodResources: podResources,
},
}
binder.SetDefaultsWhereNeeded(nil)
binder.SetDefaultsWhereNeeded(nil, nil)

// Configured values should be preserved
Expect(binder.ResourceReservation.PodResources).NotTo(BeNil())
Expand All @@ -88,7 +88,7 @@ var _ = Describe("Binder", func() {
PodResources: podResources,
},
}
binder.SetDefaultsWhereNeeded(nil)
binder.SetDefaultsWhereNeeded(nil, nil)

// Only CPU should be set
Expect(binder.ResourceReservation.PodResources).NotTo(BeNil())
Expand Down
Loading