From 9a206738ee42b823ecafd1449d1178b6cccb5191 Mon Sep 17 00:00:00 2001
From: Kevin Hannon
Date: Fri, 3 Apr 2026 16:06:06 -0400
Subject: [PATCH] add kueue components as an option

---
 recipes/checks/kueue/health-check.yaml | 70 ++++++++++++++++++++++++++
 recipes/components/kueue/values.yaml   | 21 ++++++++
 recipes/registry.yaml                  | 18 +++++++
 3 files changed, 109 insertions(+)
 create mode 100644 recipes/checks/kueue/health-check.yaml
 create mode 100644 recipes/components/kueue/values.yaml

diff --git a/recipes/checks/kueue/health-check.yaml b/recipes/checks/kueue/health-check.yaml
new file mode 100644
index 00000000..7e035cb8
--- /dev/null
+++ b/recipes/checks/kueue/health-check.yaml
@@ -0,0 +1,70 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Kueue Health Check
+#
+# Validates that Kueue is running and healthy in the kueue-system namespace.
+# Checks that the kueue-controller-manager deployment has at least one
+# available replica and that no pods in the namespace are stuck in Pending,
+# Failed, or Unknown phases.
+apiVersion: chainsaw.kyverno.io/v1alpha1
+kind: Test
+metadata:
+  name: kueue-health-check
+spec:
+  timeouts:
+    assert: 5m
+  steps:
+    - name: validate-deployment-exists
+      try:
+        # Guard against vacuous pass on empty namespace: verify the
+        # kueue-controller-manager deployment exists and has at least one available replica.
+        - assert:
+            resource:
+              apiVersion: apps/v1
+              kind: Deployment
+              metadata:
+                name: kueue-controller-manager
+                namespace: kueue-system
+              status:
+                (availableReplicas > `0`): true
+    - name: validate-all-pods-healthy
+      try:
+        # Assert no pods are in unhealthy phases.
+        # Pods must be Running (long-lived) or Succeeded (completed jobs).
+        # This catches Pending (init containers, scheduling), Failed, and Unknown.
+        - error:
+            resource:
+              apiVersion: v1
+              kind: Pod
+              metadata:
+                namespace: kueue-system
+              status:
+                phase: Pending
+        - error:
+            resource:
+              apiVersion: v1
+              kind: Pod
+              metadata:
+                namespace: kueue-system
+              status:
+                phase: Failed
+        - error:
+            resource:
+              apiVersion: v1
+              kind: Pod
+              metadata:
+                namespace: kueue-system
+              status:
+                phase: Unknown
diff --git a/recipes/components/kueue/values.yaml b/recipes/components/kueue/values.yaml
new file mode 100644
index 00000000..0ec02cbf
--- /dev/null
+++ b/recipes/components/kueue/values.yaml
@@ -0,0 +1,21 @@
+# Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Kueue Helm values
+# Kubernetes-native job queueing system with quota management,
+# fair sharing, priority-based preemption, and multi-tenancy support.
+
+controllerManager:
+  tolerations:
+    - operator: Exists
diff --git a/recipes/registry.yaml b/recipes/registry.yaml
index cc86eb28..cb0c98dc 100644
--- a/recipes/registry.yaml
+++ b/recipes/registry.yaml
@@ -425,6 +425,24 @@ components:
         tolerationPaths:
           - operator.tolerations

+  - name: kueue
+    displayName: kueue
+    valueOverrideKeys:
+      - kueue
+    healthCheck:
+      assertFile: checks/kueue/health-check.yaml
+    helm:
+      defaultRepository: oci://registry.k8s.io/kueue/charts
+      defaultChart: kueue
+      defaultVersion: v0.17.0
+    defaultNamespace: kueue-system
+    nodeScheduling:
+      system:
+        nodeSelectorPaths:
+          - controllerManager.nodeSelector
+        tolerationPaths:
+          - controllerManager.tolerations
+
   - name: kubeflow-trainer
     displayName: kubeflow-trainer
     valueOverrideKeys: